diff options
Diffstat (limited to '')
-rw-r--r-- | src/lib/libcrypto/aes/asm/bsaes-x86_64.pl | 3123 |
1 files changed, 0 insertions, 3123 deletions
diff --git a/src/lib/libcrypto/aes/asm/bsaes-x86_64.pl b/src/lib/libcrypto/aes/asm/bsaes-x86_64.pl deleted file mode 100644 index c44a338114..0000000000 --- a/src/lib/libcrypto/aes/asm/bsaes-x86_64.pl +++ /dev/null | |||
@@ -1,3123 +0,0 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | |||
3 | ################################################################### | ||
4 | ### AES-128 [originally in CTR mode] ### | ||
5 | ### bitsliced implementation for Intel Core 2 processors ### | ||
6 | ### requires support of SSE extensions up to SSSE3 ### | ||
7 | ### Author: Emilia Käsper and Peter Schwabe ### | ||
8 | ### Date: 2009-03-19 ### | ||
9 | ### Public domain ### | ||
10 | ### ### | ||
11 | ### See http://homes.esat.kuleuven.be/~ekasper/#software for ### | ||
12 | ### further information. ### | ||
13 | ################################################################### | ||
14 | # | ||
15 | # September 2011. | ||
16 | # | ||
17 | # Started as transliteration to "perlasm" the original code has | ||
18 | # undergone following changes: | ||
19 | # | ||
20 | # - code was made position-independent; | ||
21 | # - rounds were folded into a loop resulting in >5x size reduction | ||
22 | # from 12.5KB to 2.2KB; | ||
23 | # - above was possible thanks to mixcolumns() modification that | ||
24 | # allowed to feed its output back to aesenc[last], this was | ||
25 | # achieved at cost of two additional inter-registers moves; | ||
26 | # - some instruction reordering and interleaving; | ||
27 | # - this module doesn't implement key setup subroutine, instead it | ||
28 | # relies on conversion of "conventional" key schedule as returned | ||
29 | # by AES_set_encrypt_key (see discussion below); | ||
30 | # - first and last round keys are treated differently, which allowed | ||
31 | # to skip one shiftrows(), reduce bit-sliced key schedule and | ||
32 | # speed-up conversion by 22%; | ||
33 | # - support for 192- and 256-bit keys was added; | ||
34 | # | ||
35 | # Resulting performance in CPU cycles spent to encrypt one byte out | ||
36 | # of 4096-byte buffer with 128-bit key is: | ||
37 | # | ||
38 | # Emilia's this(*) difference | ||
39 | # | ||
40 | # Core 2 9.30 8.69 +7% | ||
41 | # Nehalem(**) 7.63 6.98 +9% | ||
42 | # Atom 17.1 17.4 -2%(***) | ||
43 | # | ||
44 | # (*) Comparison is not completely fair, because "this" is ECB, | ||
45 | # i.e. no extra processing such as counter values calculation | ||
46 | # and xor-ing input as in Emilia's CTR implementation is | ||
47 | # performed. However, the CTR calculations stand for not more | ||
48 | # than 1% of total time, so comparison is *rather* fair. | ||
49 | # | ||
50 | # (**) Results were collected on Westmere, which is considered to | ||
51 | # be equivalent to Nehalem for this code. | ||
52 | # | ||
53 | # (***) Slowdown on Atom is rather strange per se, because original | ||
54 | # implementation has a number of 9+-bytes instructions, which | ||
55 | # are bad for Atom front-end, and which I eliminated completely. | ||
56 | # In attempt to address deterioration sbox() was tested in FP | ||
57 | # SIMD "domain" (movaps instead of movdqa, xorps instead of | ||
58 | # pxor, etc.). While it resulted in nominal 4% improvement on | ||
59 | # Atom, it hurted Westmere by more than 2x factor. | ||
60 | # | ||
61 | # As for key schedule conversion subroutine. Interface to OpenSSL | ||
62 | # relies on per-invocation on-the-fly conversion. This naturally | ||
63 | # has impact on performance, especially for short inputs. Conversion | ||
64 | # time in CPU cycles and its ratio to CPU cycles spent in 8x block | ||
65 | # function is: | ||
66 | # | ||
67 | # conversion conversion/8x block | ||
68 | # Core 2 240 0.22 | ||
69 | # Nehalem 180 0.20 | ||
70 | # Atom 430 0.19 | ||
71 | # | ||
72 | # The ratio values mean that 128-byte blocks will be processed | ||
73 | # 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%, | ||
74 | # etc. Then keep in mind that input sizes not divisible by 128 are | ||
75 | # *effectively* slower, especially shortest ones, e.g. consecutive | ||
76 | # 144-byte blocks are processed 44% slower than one would expect, | ||
77 | # 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings" | ||
78 | # it's still faster than ["hyper-threading-safe" code path in] | ||
79 | # aes-x86_64.pl on all lengths above 64 bytes... | ||
80 | # | ||
81 | # October 2011. | ||
82 | # | ||
83 | # Add decryption procedure. Performance in CPU cycles spent to decrypt | ||
84 | # one byte out of 4096-byte buffer with 128-bit key is: | ||
85 | # | ||
86 | # Core 2 9.83 | ||
87 | # Nehalem 7.74 | ||
88 | # Atom 19.0 | ||
89 | # | ||
90 | # November 2011. | ||
91 | # | ||
92 | # Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is | ||
93 | # suboptimal, but XTS is meant to be used with larger blocks... | ||
94 | # | ||
95 | # <appro@openssl.org> | ||
96 | |||
97 | $flavour = shift; | ||
98 | $output = shift; | ||
99 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } | ||
100 | |||
101 | $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); | ||
102 | |||
103 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
104 | ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | ||
105 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | ||
106 | die "can't locate x86_64-xlate.pl"; | ||
107 | |||
108 | open OUT,"| \"$^X\" $xlate $flavour $output"; | ||
109 | *STDOUT=*OUT; | ||
110 | |||
111 | my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx"); | ||
112 | my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15) | ||
113 | my $ecb=0; # suppress unreferenced ECB subroutines, spare some space... | ||
114 | |||
115 | { | ||
116 | my ($key,$rounds,$const)=("%rax","%r10d","%r11"); | ||
117 | |||
118 | sub Sbox { | ||
119 | # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb | ||
120 | # output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb | ||
121 | my @b=@_[0..7]; | ||
122 | my @t=@_[8..11]; | ||
123 | my @s=@_[12..15]; | ||
124 | &InBasisChange (@b); | ||
125 | &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s); | ||
126 | &OutBasisChange (@b[7,1,4,2,6,5,0,3]); | ||
127 | } | ||
128 | |||
129 | sub InBasisChange { | ||
130 | # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb | ||
131 | # output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb | ||
132 | my @b=@_[0..7]; | ||
133 | $code.=<<___; | ||
134 | pxor @b[6], @b[5] | ||
135 | pxor @b[1], @b[2] | ||
136 | pxor @b[0], @b[3] | ||
137 | pxor @b[2], @b[6] | ||
138 | pxor @b[0], @b[5] | ||
139 | |||
140 | pxor @b[3], @b[6] | ||
141 | pxor @b[7], @b[3] | ||
142 | pxor @b[5], @b[7] | ||
143 | pxor @b[4], @b[3] | ||
144 | pxor @b[5], @b[4] | ||
145 | pxor @b[1], @b[3] | ||
146 | |||
147 | pxor @b[7], @b[2] | ||
148 | pxor @b[5], @b[1] | ||
149 | ___ | ||
150 | } | ||
151 | |||
152 | sub OutBasisChange { | ||
153 | # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb | ||
154 | # output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb | ||
155 | my @b=@_[0..7]; | ||
156 | $code.=<<___; | ||
157 | pxor @b[6], @b[0] | ||
158 | pxor @b[4], @b[1] | ||
159 | pxor @b[0], @b[2] | ||
160 | pxor @b[6], @b[4] | ||
161 | pxor @b[1], @b[6] | ||
162 | |||
163 | pxor @b[5], @b[1] | ||
164 | pxor @b[3], @b[5] | ||
165 | pxor @b[7], @b[3] | ||
166 | pxor @b[5], @b[7] | ||
167 | pxor @b[5], @b[2] | ||
168 | |||
169 | pxor @b[7], @b[4] | ||
170 | ___ | ||
171 | } | ||
172 | |||
173 | sub InvSbox { | ||
174 | # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb | ||
175 | # output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb | ||
176 | my @b=@_[0..7]; | ||
177 | my @t=@_[8..11]; | ||
178 | my @s=@_[12..15]; | ||
179 | &InvInBasisChange (@b); | ||
180 | &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s); | ||
181 | &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]); | ||
182 | } | ||
183 | |||
184 | sub InvInBasisChange { # OutBasisChange in reverse | ||
185 | my @b=@_[5,1,2,6,3,7,0,4]; | ||
186 | $code.=<<___ | ||
187 | pxor @b[7], @b[4] | ||
188 | |||
189 | pxor @b[5], @b[7] | ||
190 | pxor @b[5], @b[2] | ||
191 | pxor @b[7], @b[3] | ||
192 | pxor @b[3], @b[5] | ||
193 | pxor @b[5], @b[1] | ||
194 | |||
195 | pxor @b[1], @b[6] | ||
196 | pxor @b[0], @b[2] | ||
197 | pxor @b[6], @b[4] | ||
198 | pxor @b[6], @b[0] | ||
199 | pxor @b[4], @b[1] | ||
200 | ___ | ||
201 | } | ||
202 | |||
203 | sub InvOutBasisChange { # InBasisChange in reverse | ||
204 | my @b=@_[2,5,7,3,6,1,0,4]; | ||
205 | $code.=<<___; | ||
206 | pxor @b[5], @b[1] | ||
207 | pxor @b[7], @b[2] | ||
208 | |||
209 | pxor @b[1], @b[3] | ||
210 | pxor @b[5], @b[4] | ||
211 | pxor @b[5], @b[7] | ||
212 | pxor @b[4], @b[3] | ||
213 | pxor @b[0], @b[5] | ||
214 | pxor @b[7], @b[3] | ||
215 | pxor @b[2], @b[6] | ||
216 | pxor @b[1], @b[2] | ||
217 | pxor @b[3], @b[6] | ||
218 | |||
219 | pxor @b[0], @b[3] | ||
220 | pxor @b[6], @b[5] | ||
221 | ___ | ||
222 | } | ||
223 | |||
224 | sub Mul_GF4 { | ||
225 | #;************************************************************* | ||
226 | #;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) * | ||
227 | #;************************************************************* | ||
228 | my ($x0,$x1,$y0,$y1,$t0)=@_; | ||
229 | $code.=<<___; | ||
230 | movdqa $y0, $t0 | ||
231 | pxor $y1, $t0 | ||
232 | pand $x0, $t0 | ||
233 | pxor $x1, $x0 | ||
234 | pand $y0, $x1 | ||
235 | pand $y1, $x0 | ||
236 | pxor $x1, $x0 | ||
237 | pxor $t0, $x1 | ||
238 | ___ | ||
239 | } | ||
240 | |||
241 | sub Mul_GF4_N { # not used, see next subroutine | ||
242 | # multiply and scale by N | ||
243 | my ($x0,$x1,$y0,$y1,$t0)=@_; | ||
244 | $code.=<<___; | ||
245 | movdqa $y0, $t0 | ||
246 | pxor $y1, $t0 | ||
247 | pand $x0, $t0 | ||
248 | pxor $x1, $x0 | ||
249 | pand $y0, $x1 | ||
250 | pand $y1, $x0 | ||
251 | pxor $x0, $x1 | ||
252 | pxor $t0, $x0 | ||
253 | ___ | ||
254 | } | ||
255 | |||
256 | sub Mul_GF4_N_GF4 { | ||
257 | # interleaved Mul_GF4_N and Mul_GF4 | ||
258 | my ($x0,$x1,$y0,$y1,$t0, | ||
259 | $x2,$x3,$y2,$y3,$t1)=@_; | ||
260 | $code.=<<___; | ||
261 | movdqa $y0, $t0 | ||
262 | movdqa $y2, $t1 | ||
263 | pxor $y1, $t0 | ||
264 | pxor $y3, $t1 | ||
265 | pand $x0, $t0 | ||
266 | pand $x2, $t1 | ||
267 | pxor $x1, $x0 | ||
268 | pxor $x3, $x2 | ||
269 | pand $y0, $x1 | ||
270 | pand $y2, $x3 | ||
271 | pand $y1, $x0 | ||
272 | pand $y3, $x2 | ||
273 | pxor $x0, $x1 | ||
274 | pxor $x3, $x2 | ||
275 | pxor $t0, $x0 | ||
276 | pxor $t1, $x3 | ||
277 | ___ | ||
278 | } | ||
279 | sub Mul_GF16_2 { | ||
280 | my @x=@_[0..7]; | ||
281 | my @y=@_[8..11]; | ||
282 | my @t=@_[12..15]; | ||
283 | $code.=<<___; | ||
284 | movdqa @x[0], @t[0] | ||
285 | movdqa @x[1], @t[1] | ||
286 | ___ | ||
287 | &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]); | ||
288 | $code.=<<___; | ||
289 | pxor @x[2], @t[0] | ||
290 | pxor @x[3], @t[1] | ||
291 | pxor @y[2], @y[0] | ||
292 | pxor @y[3], @y[1] | ||
293 | ___ | ||
294 | Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], | ||
295 | @x[2], @x[3], @y[2], @y[3], @t[2]); | ||
296 | $code.=<<___; | ||
297 | pxor @t[0], @x[0] | ||
298 | pxor @t[0], @x[2] | ||
299 | pxor @t[1], @x[1] | ||
300 | pxor @t[1], @x[3] | ||
301 | |||
302 | movdqa @x[4], @t[0] | ||
303 | movdqa @x[5], @t[1] | ||
304 | pxor @x[6], @t[0] | ||
305 | pxor @x[7], @t[1] | ||
306 | ___ | ||
307 | &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], | ||
308 | @x[6], @x[7], @y[2], @y[3], @t[2]); | ||
309 | $code.=<<___; | ||
310 | pxor @y[2], @y[0] | ||
311 | pxor @y[3], @y[1] | ||
312 | ___ | ||
313 | &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]); | ||
314 | $code.=<<___; | ||
315 | pxor @t[0], @x[4] | ||
316 | pxor @t[0], @x[6] | ||
317 | pxor @t[1], @x[5] | ||
318 | pxor @t[1], @x[7] | ||
319 | ___ | ||
320 | } | ||
321 | sub Inv_GF256 { | ||
322 | #;******************************************************************** | ||
323 | #;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) * | ||
324 | #;******************************************************************** | ||
325 | my @x=@_[0..7]; | ||
326 | my @t=@_[8..11]; | ||
327 | my @s=@_[12..15]; | ||
328 | # direct optimizations from hardware | ||
329 | $code.=<<___; | ||
330 | movdqa @x[4], @t[3] | ||
331 | movdqa @x[5], @t[2] | ||
332 | movdqa @x[1], @t[1] | ||
333 | movdqa @x[7], @s[1] | ||
334 | movdqa @x[0], @s[0] | ||
335 | |||
336 | pxor @x[6], @t[3] | ||
337 | pxor @x[7], @t[2] | ||
338 | pxor @x[3], @t[1] | ||
339 | movdqa @t[3], @s[2] | ||
340 | pxor @x[6], @s[1] | ||
341 | movdqa @t[2], @t[0] | ||
342 | pxor @x[2], @s[0] | ||
343 | movdqa @t[3], @s[3] | ||
344 | |||
345 | por @t[1], @t[2] | ||
346 | por @s[0], @t[3] | ||
347 | pxor @t[0], @s[3] | ||
348 | pand @s[0], @s[2] | ||
349 | pxor @t[1], @s[0] | ||
350 | pand @t[1], @t[0] | ||
351 | pand @s[0], @s[3] | ||
352 | movdqa @x[3], @s[0] | ||
353 | pxor @x[2], @s[0] | ||
354 | pand @s[0], @s[1] | ||
355 | pxor @s[1], @t[3] | ||
356 | pxor @s[1], @t[2] | ||
357 | movdqa @x[4], @s[1] | ||
358 | movdqa @x[1], @s[0] | ||
359 | pxor @x[5], @s[1] | ||
360 | pxor @x[0], @s[0] | ||
361 | movdqa @s[1], @t[1] | ||
362 | pand @s[0], @s[1] | ||
363 | por @s[0], @t[1] | ||
364 | pxor @s[1], @t[0] | ||
365 | pxor @s[3], @t[3] | ||
366 | pxor @s[2], @t[2] | ||
367 | pxor @s[3], @t[1] | ||
368 | movdqa @x[7], @s[0] | ||
369 | pxor @s[2], @t[0] | ||
370 | movdqa @x[6], @s[1] | ||
371 | pxor @s[2], @t[1] | ||
372 | movdqa @x[5], @s[2] | ||
373 | pand @x[3], @s[0] | ||
374 | movdqa @x[4], @s[3] | ||
375 | pand @x[2], @s[1] | ||
376 | pand @x[1], @s[2] | ||
377 | por @x[0], @s[3] | ||
378 | pxor @s[0], @t[3] | ||
379 | pxor @s[1], @t[2] | ||
380 | pxor @s[2], @t[1] | ||
381 | pxor @s[3], @t[0] | ||
382 | |||
383 | #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3 | ||
384 | |||
385 | # new smaller inversion | ||
386 | |||
387 | movdqa @t[3], @s[0] | ||
388 | pand @t[1], @t[3] | ||
389 | pxor @t[2], @s[0] | ||
390 | |||
391 | movdqa @t[0], @s[2] | ||
392 | movdqa @s[0], @s[3] | ||
393 | pxor @t[3], @s[2] | ||
394 | pand @s[2], @s[3] | ||
395 | |||
396 | movdqa @t[1], @s[1] | ||
397 | pxor @t[2], @s[3] | ||
398 | pxor @t[0], @s[1] | ||
399 | |||
400 | pxor @t[2], @t[3] | ||
401 | |||
402 | pand @t[3], @s[1] | ||
403 | |||
404 | movdqa @s[2], @t[2] | ||
405 | pxor @t[0], @s[1] | ||
406 | |||
407 | pxor @s[1], @t[2] | ||
408 | pxor @s[1], @t[1] | ||
409 | |||
410 | pand @t[0], @t[2] | ||
411 | |||
412 | pxor @t[2], @s[2] | ||
413 | pxor @t[2], @t[1] | ||
414 | |||
415 | pand @s[3], @s[2] | ||
416 | |||
417 | pxor @s[0], @s[2] | ||
418 | ___ | ||
419 | # output in s3, s2, s1, t1 | ||
420 | |||
421 | # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3 | ||
422 | |||
423 | # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3 | ||
424 | &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]); | ||
425 | |||
426 | ### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb | ||
427 | } | ||
428 | |||
429 | # AES linear components | ||
430 | |||
431 | sub ShiftRows { | ||
432 | my @x=@_[0..7]; | ||
433 | my $mask=pop; | ||
434 | $code.=<<___; | ||
435 | pxor 0x00($key),@x[0] | ||
436 | pxor 0x10($key),@x[1] | ||
437 | pshufb $mask,@x[0] | ||
438 | pxor 0x20($key),@x[2] | ||
439 | pshufb $mask,@x[1] | ||
440 | pxor 0x30($key),@x[3] | ||
441 | pshufb $mask,@x[2] | ||
442 | pxor 0x40($key),@x[4] | ||
443 | pshufb $mask,@x[3] | ||
444 | pxor 0x50($key),@x[5] | ||
445 | pshufb $mask,@x[4] | ||
446 | pxor 0x60($key),@x[6] | ||
447 | pshufb $mask,@x[5] | ||
448 | pxor 0x70($key),@x[7] | ||
449 | pshufb $mask,@x[6] | ||
450 | lea 0x80($key),$key | ||
451 | pshufb $mask,@x[7] | ||
452 | ___ | ||
453 | } | ||
454 | |||
455 | sub MixColumns { | ||
456 | # modified to emit output in order suitable for feeding back to aesenc[last] | ||
457 | my @x=@_[0..7]; | ||
458 | my @t=@_[8..15]; | ||
459 | my $inv=@_[16]; # optional | ||
460 | $code.=<<___; | ||
461 | pshufd \$0x93, @x[0], @t[0] # x0 <<< 32 | ||
462 | pshufd \$0x93, @x[1], @t[1] | ||
463 | pxor @t[0], @x[0] # x0 ^ (x0 <<< 32) | ||
464 | pshufd \$0x93, @x[2], @t[2] | ||
465 | pxor @t[1], @x[1] | ||
466 | pshufd \$0x93, @x[3], @t[3] | ||
467 | pxor @t[2], @x[2] | ||
468 | pshufd \$0x93, @x[4], @t[4] | ||
469 | pxor @t[3], @x[3] | ||
470 | pshufd \$0x93, @x[5], @t[5] | ||
471 | pxor @t[4], @x[4] | ||
472 | pshufd \$0x93, @x[6], @t[6] | ||
473 | pxor @t[5], @x[5] | ||
474 | pshufd \$0x93, @x[7], @t[7] | ||
475 | pxor @t[6], @x[6] | ||
476 | pxor @t[7], @x[7] | ||
477 | |||
478 | pxor @x[0], @t[1] | ||
479 | pxor @x[7], @t[0] | ||
480 | pxor @x[7], @t[1] | ||
481 | pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64) | ||
482 | pxor @x[1], @t[2] | ||
483 | pshufd \$0x4E, @x[1], @x[1] | ||
484 | pxor @x[4], @t[5] | ||
485 | pxor @t[0], @x[0] | ||
486 | pxor @x[5], @t[6] | ||
487 | pxor @t[1], @x[1] | ||
488 | pxor @x[3], @t[4] | ||
489 | pshufd \$0x4E, @x[4], @t[0] | ||
490 | pxor @x[6], @t[7] | ||
491 | pshufd \$0x4E, @x[5], @t[1] | ||
492 | pxor @x[2], @t[3] | ||
493 | pshufd \$0x4E, @x[3], @x[4] | ||
494 | pxor @x[7], @t[3] | ||
495 | pshufd \$0x4E, @x[7], @x[5] | ||
496 | pxor @x[7], @t[4] | ||
497 | pshufd \$0x4E, @x[6], @x[3] | ||
498 | pxor @t[4], @t[0] | ||
499 | pshufd \$0x4E, @x[2], @x[6] | ||
500 | pxor @t[5], @t[1] | ||
501 | ___ | ||
502 | $code.=<<___ if (!$inv); | ||
503 | pxor @t[3], @x[4] | ||
504 | pxor @t[7], @x[5] | ||
505 | pxor @t[6], @x[3] | ||
506 | movdqa @t[0], @x[2] | ||
507 | pxor @t[2], @x[6] | ||
508 | movdqa @t[1], @x[7] | ||
509 | ___ | ||
510 | $code.=<<___ if ($inv); | ||
511 | pxor @x[4], @t[3] | ||
512 | pxor @t[7], @x[5] | ||
513 | pxor @x[3], @t[6] | ||
514 | movdqa @t[0], @x[3] | ||
515 | pxor @t[2], @x[6] | ||
516 | movdqa @t[6], @x[2] | ||
517 | movdqa @t[1], @x[7] | ||
518 | movdqa @x[6], @x[4] | ||
519 | movdqa @t[3], @x[6] | ||
520 | ___ | ||
521 | } | ||
522 | |||
523 | sub InvMixColumns_orig { | ||
524 | my @x=@_[0..7]; | ||
525 | my @t=@_[8..15]; | ||
526 | |||
527 | $code.=<<___; | ||
528 | # multiplication by 0x0e | ||
529 | pshufd \$0x93, @x[7], @t[7] | ||
530 | movdqa @x[2], @t[2] | ||
531 | pxor @x[5], @x[7] # 7 5 | ||
532 | pxor @x[5], @x[2] # 2 5 | ||
533 | pshufd \$0x93, @x[0], @t[0] | ||
534 | movdqa @x[5], @t[5] | ||
535 | pxor @x[0], @x[5] # 5 0 [1] | ||
536 | pxor @x[1], @x[0] # 0 1 | ||
537 | pshufd \$0x93, @x[1], @t[1] | ||
538 | pxor @x[2], @x[1] # 1 25 | ||
539 | pxor @x[6], @x[0] # 01 6 [2] | ||
540 | pxor @x[3], @x[1] # 125 3 [4] | ||
541 | pshufd \$0x93, @x[3], @t[3] | ||
542 | pxor @x[0], @x[2] # 25 016 [3] | ||
543 | pxor @x[7], @x[3] # 3 75 | ||
544 | pxor @x[6], @x[7] # 75 6 [0] | ||
545 | pshufd \$0x93, @x[6], @t[6] | ||
546 | movdqa @x[4], @t[4] | ||
547 | pxor @x[4], @x[6] # 6 4 | ||
548 | pxor @x[3], @x[4] # 4 375 [6] | ||
549 | pxor @x[7], @x[3] # 375 756=36 | ||
550 | pxor @t[5], @x[6] # 64 5 [7] | ||
551 | pxor @t[2], @x[3] # 36 2 | ||
552 | pxor @t[4], @x[3] # 362 4 [5] | ||
553 | pshufd \$0x93, @t[5], @t[5] | ||
554 | ___ | ||
555 | my @y = @x[7,5,0,2,1,3,4,6]; | ||
556 | $code.=<<___; | ||
557 | # multiplication by 0x0b | ||
558 | pxor @y[0], @y[1] | ||
559 | pxor @t[0], @y[0] | ||
560 | pxor @t[1], @y[1] | ||
561 | pshufd \$0x93, @t[2], @t[2] | ||
562 | pxor @t[5], @y[0] | ||
563 | pxor @t[6], @y[1] | ||
564 | pxor @t[7], @y[0] | ||
565 | pshufd \$0x93, @t[4], @t[4] | ||
566 | pxor @t[6], @t[7] # clobber t[7] | ||
567 | pxor @y[0], @y[1] | ||
568 | |||
569 | pxor @t[0], @y[3] | ||
570 | pshufd \$0x93, @t[0], @t[0] | ||
571 | pxor @t[1], @y[2] | ||
572 | pxor @t[1], @y[4] | ||
573 | pxor @t[2], @y[2] | ||
574 | pshufd \$0x93, @t[1], @t[1] | ||
575 | pxor @t[2], @y[3] | ||
576 | pxor @t[2], @y[5] | ||
577 | pxor @t[7], @y[2] | ||
578 | pshufd \$0x93, @t[2], @t[2] | ||
579 | pxor @t[3], @y[3] | ||
580 | pxor @t[3], @y[6] | ||
581 | pxor @t[3], @y[4] | ||
582 | pshufd \$0x93, @t[3], @t[3] | ||
583 | pxor @t[4], @y[7] | ||
584 | pxor @t[4], @y[5] | ||
585 | pxor @t[7], @y[7] | ||
586 | pxor @t[5], @y[3] | ||
587 | pxor @t[4], @y[4] | ||
588 | pxor @t[5], @t[7] # clobber t[7] even more | ||
589 | |||
590 | pxor @t[7], @y[5] | ||
591 | pshufd \$0x93, @t[4], @t[4] | ||
592 | pxor @t[7], @y[6] | ||
593 | pxor @t[7], @y[4] | ||
594 | |||
595 | pxor @t[5], @t[7] | ||
596 | pshufd \$0x93, @t[5], @t[5] | ||
597 | pxor @t[6], @t[7] # restore t[7] | ||
598 | |||
599 | # multiplication by 0x0d | ||
600 | pxor @y[7], @y[4] | ||
601 | pxor @t[4], @y[7] | ||
602 | pshufd \$0x93, @t[6], @t[6] | ||
603 | pxor @t[0], @y[2] | ||
604 | pxor @t[5], @y[7] | ||
605 | pxor @t[2], @y[2] | ||
606 | pshufd \$0x93, @t[7], @t[7] | ||
607 | |||
608 | pxor @y[1], @y[3] | ||
609 | pxor @t[1], @y[1] | ||
610 | pxor @t[0], @y[0] | ||
611 | pxor @t[0], @y[3] | ||
612 | pxor @t[5], @y[1] | ||
613 | pxor @t[5], @y[0] | ||
614 | pxor @t[7], @y[1] | ||
615 | pshufd \$0x93, @t[0], @t[0] | ||
616 | pxor @t[6], @y[0] | ||
617 | pxor @y[1], @y[3] | ||
618 | pxor @t[1], @y[4] | ||
619 | pshufd \$0x93, @t[1], @t[1] | ||
620 | |||
621 | pxor @t[7], @y[7] | ||
622 | pxor @t[2], @y[4] | ||
623 | pxor @t[2], @y[5] | ||
624 | pshufd \$0x93, @t[2], @t[2] | ||
625 | pxor @t[6], @y[2] | ||
626 | pxor @t[3], @t[6] # clobber t[6] | ||
627 | pxor @y[7], @y[4] | ||
628 | pxor @t[6], @y[3] | ||
629 | |||
630 | pxor @t[6], @y[6] | ||
631 | pxor @t[5], @y[5] | ||
632 | pxor @t[4], @y[6] | ||
633 | pshufd \$0x93, @t[4], @t[4] | ||
634 | pxor @t[6], @y[5] | ||
635 | pxor @t[7], @y[6] | ||
636 | pxor @t[3], @t[6] # restore t[6] | ||
637 | |||
638 | pshufd \$0x93, @t[5], @t[5] | ||
639 | pshufd \$0x93, @t[6], @t[6] | ||
640 | pshufd \$0x93, @t[7], @t[7] | ||
641 | pshufd \$0x93, @t[3], @t[3] | ||
642 | |||
643 | # multiplication by 0x09 | ||
644 | pxor @y[1], @y[4] | ||
645 | pxor @y[1], @t[1] # t[1]=y[1] | ||
646 | pxor @t[5], @t[0] # clobber t[0] | ||
647 | pxor @t[5], @t[1] | ||
648 | pxor @t[0], @y[3] | ||
649 | pxor @y[0], @t[0] # t[0]=y[0] | ||
650 | pxor @t[6], @t[1] | ||
651 | pxor @t[7], @t[6] # clobber t[6] | ||
652 | pxor @t[1], @y[4] | ||
653 | pxor @t[4], @y[7] | ||
654 | pxor @y[4], @t[4] # t[4]=y[4] | ||
655 | pxor @t[3], @y[6] | ||
656 | pxor @y[3], @t[3] # t[3]=y[3] | ||
657 | pxor @t[2], @y[5] | ||
658 | pxor @y[2], @t[2] # t[2]=y[2] | ||
659 | pxor @t[7], @t[3] | ||
660 | pxor @y[5], @t[5] # t[5]=y[5] | ||
661 | pxor @t[6], @t[2] | ||
662 | pxor @t[6], @t[5] | ||
663 | pxor @y[6], @t[6] # t[6]=y[6] | ||
664 | pxor @y[7], @t[7] # t[7]=y[7] | ||
665 | |||
666 | movdqa @t[0],@XMM[0] | ||
667 | movdqa @t[1],@XMM[1] | ||
668 | movdqa @t[2],@XMM[2] | ||
669 | movdqa @t[3],@XMM[3] | ||
670 | movdqa @t[4],@XMM[4] | ||
671 | movdqa @t[5],@XMM[5] | ||
672 | movdqa @t[6],@XMM[6] | ||
673 | movdqa @t[7],@XMM[7] | ||
674 | ___ | ||
675 | } | ||
676 | |||
677 | sub InvMixColumns { | ||
678 | my @x=@_[0..7]; | ||
679 | my @t=@_[8..15]; | ||
680 | |||
681 | # Thanks to Jussi Kivilinna for providing pointer to | ||
682 | # | ||
683 | # | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 | | ||
684 | # | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 | | ||
685 | # | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 | | ||
686 | # | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 | | ||
687 | |||
688 | $code.=<<___; | ||
689 | # multiplication by 0x05-0x00-0x04-0x00 | ||
690 | pshufd \$0x4E, @x[0], @t[0] | ||
691 | pshufd \$0x4E, @x[6], @t[6] | ||
692 | pxor @x[0], @t[0] | ||
693 | pshufd \$0x4E, @x[7], @t[7] | ||
694 | pxor @x[6], @t[6] | ||
695 | pshufd \$0x4E, @x[1], @t[1] | ||
696 | pxor @x[7], @t[7] | ||
697 | pshufd \$0x4E, @x[2], @t[2] | ||
698 | pxor @x[1], @t[1] | ||
699 | pshufd \$0x4E, @x[3], @t[3] | ||
700 | pxor @x[2], @t[2] | ||
701 | pxor @t[6], @x[0] | ||
702 | pxor @t[6], @x[1] | ||
703 | pshufd \$0x4E, @x[4], @t[4] | ||
704 | pxor @x[3], @t[3] | ||
705 | pxor @t[0], @x[2] | ||
706 | pxor @t[1], @x[3] | ||
707 | pshufd \$0x4E, @x[5], @t[5] | ||
708 | pxor @x[4], @t[4] | ||
709 | pxor @t[7], @x[1] | ||
710 | pxor @t[2], @x[4] | ||
711 | pxor @x[5], @t[5] | ||
712 | |||
713 | pxor @t[7], @x[2] | ||
714 | pxor @t[6], @x[3] | ||
715 | pxor @t[6], @x[4] | ||
716 | pxor @t[3], @x[5] | ||
717 | pxor @t[4], @x[6] | ||
718 | pxor @t[7], @x[4] | ||
719 | pxor @t[7], @x[5] | ||
720 | pxor @t[5], @x[7] | ||
721 | ___ | ||
722 | &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6 | ||
723 | } | ||
724 | |||
725 | sub aesenc { # not used | ||
726 | my @b=@_[0..7]; | ||
727 | my @t=@_[8..15]; | ||
728 | $code.=<<___; | ||
729 | movdqa 0x30($const),@t[0] # .LSR | ||
730 | ___ | ||
731 | &ShiftRows (@b,@t[0]); | ||
732 | &Sbox (@b,@t); | ||
733 | &MixColumns (@b[0,1,4,6,3,7,2,5],@t); | ||
734 | } | ||
735 | |||
736 | sub aesenclast { # not used | ||
737 | my @b=@_[0..7]; | ||
738 | my @t=@_[8..15]; | ||
739 | $code.=<<___; | ||
740 | movdqa 0x40($const),@t[0] # .LSRM0 | ||
741 | ___ | ||
742 | &ShiftRows (@b,@t[0]); | ||
743 | &Sbox (@b,@t); | ||
744 | $code.=<<___ | ||
745 | pxor 0x00($key),@b[0] | ||
746 | pxor 0x10($key),@b[1] | ||
747 | pxor 0x20($key),@b[4] | ||
748 | pxor 0x30($key),@b[6] | ||
749 | pxor 0x40($key),@b[3] | ||
750 | pxor 0x50($key),@b[7] | ||
751 | pxor 0x60($key),@b[2] | ||
752 | pxor 0x70($key),@b[5] | ||
753 | ___ | ||
754 | } | ||
755 | |||
756 | sub swapmove { | ||
757 | my ($a,$b,$n,$mask,$t)=@_; | ||
758 | $code.=<<___; | ||
759 | movdqa $b,$t | ||
760 | psrlq \$$n,$b | ||
761 | pxor $a,$b | ||
762 | pand $mask,$b | ||
763 | pxor $b,$a | ||
764 | psllq \$$n,$b | ||
765 | pxor $t,$b | ||
766 | ___ | ||
767 | } | ||
768 | sub swapmove2x { | ||
769 | my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_; | ||
770 | $code.=<<___; | ||
771 | movdqa $b0,$t0 | ||
772 | psrlq \$$n,$b0 | ||
773 | movdqa $b1,$t1 | ||
774 | psrlq \$$n,$b1 | ||
775 | pxor $a0,$b0 | ||
776 | pxor $a1,$b1 | ||
777 | pand $mask,$b0 | ||
778 | pand $mask,$b1 | ||
779 | pxor $b0,$a0 | ||
780 | psllq \$$n,$b0 | ||
781 | pxor $b1,$a1 | ||
782 | psllq \$$n,$b1 | ||
783 | pxor $t0,$b0 | ||
784 | pxor $t1,$b1 | ||
785 | ___ | ||
786 | } | ||
787 | |||
788 | sub bitslice { | ||
789 | my @x=reverse(@_[0..7]); | ||
790 | my ($t0,$t1,$t2,$t3)=@_[8..11]; | ||
791 | $code.=<<___; | ||
792 | movdqa 0x00($const),$t0 # .LBS0 | ||
793 | movdqa 0x10($const),$t1 # .LBS1 | ||
794 | ___ | ||
795 | &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3); | ||
796 | &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3); | ||
797 | $code.=<<___; | ||
798 | movdqa 0x20($const),$t0 # .LBS2 | ||
799 | ___ | ||
800 | &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3); | ||
801 | &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3); | ||
802 | |||
803 | &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3); | ||
804 | &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3); | ||
805 | } | ||
806 | |||
807 | $code.=<<___; | ||
808 | .text | ||
809 | |||
810 | .extern asm_AES_encrypt | ||
811 | .extern asm_AES_decrypt | ||
812 | |||
813 | .type _bsaes_encrypt8,\@abi-omnipotent | ||
814 | .align 64 | ||
815 | _bsaes_encrypt8: | ||
816 | _CET_ENDBR | ||
817 | lea .LBS0(%rip), $const # constants table | ||
818 | |||
819 | movdqa ($key), @XMM[9] # round 0 key | ||
820 | lea 0x10($key), $key | ||
821 | movdqa 0x50($const), @XMM[8] # .LM0SR | ||
822 | pxor @XMM[9], @XMM[0] # xor with round0 key | ||
823 | pxor @XMM[9], @XMM[1] | ||
824 | pshufb @XMM[8], @XMM[0] | ||
825 | pxor @XMM[9], @XMM[2] | ||
826 | pshufb @XMM[8], @XMM[1] | ||
827 | pxor @XMM[9], @XMM[3] | ||
828 | pshufb @XMM[8], @XMM[2] | ||
829 | pxor @XMM[9], @XMM[4] | ||
830 | pshufb @XMM[8], @XMM[3] | ||
831 | pxor @XMM[9], @XMM[5] | ||
832 | pshufb @XMM[8], @XMM[4] | ||
833 | pxor @XMM[9], @XMM[6] | ||
834 | pshufb @XMM[8], @XMM[5] | ||
835 | pxor @XMM[9], @XMM[7] | ||
836 | pshufb @XMM[8], @XMM[6] | ||
837 | pshufb @XMM[8], @XMM[7] | ||
838 | _bsaes_encrypt8_bitslice: | ||
839 | ___ | ||
840 | &bitslice (@XMM[0..7, 8..11]); | ||
841 | $code.=<<___; | ||
842 | dec $rounds | ||
843 | jmp .Lenc_sbox | ||
844 | .align 16 | ||
845 | .Lenc_loop: | ||
846 | ___ | ||
847 | &ShiftRows (@XMM[0..7, 8]); | ||
848 | $code.=".Lenc_sbox:\n"; | ||
849 | &Sbox (@XMM[0..7, 8..15]); | ||
850 | $code.=<<___; | ||
851 | dec $rounds | ||
852 | jl .Lenc_done | ||
853 | ___ | ||
854 | &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]); | ||
855 | $code.=<<___; | ||
856 | movdqa 0x30($const), @XMM[8] # .LSR | ||
857 | jnz .Lenc_loop | ||
858 | movdqa 0x40($const), @XMM[8] # .LSRM0 | ||
859 | jmp .Lenc_loop | ||
860 | .align 16 | ||
861 | .Lenc_done: | ||
862 | ___ | ||
863 | # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb | ||
864 | &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]); | ||
865 | $code.=<<___; | ||
866 | movdqa ($key), @XMM[8] # last round key | ||
867 | pxor @XMM[8], @XMM[4] | ||
868 | pxor @XMM[8], @XMM[6] | ||
869 | pxor @XMM[8], @XMM[3] | ||
870 | pxor @XMM[8], @XMM[7] | ||
871 | pxor @XMM[8], @XMM[2] | ||
872 | pxor @XMM[8], @XMM[5] | ||
873 | pxor @XMM[8], @XMM[0] | ||
874 | pxor @XMM[8], @XMM[1] | ||
875 | ret | ||
876 | .size _bsaes_encrypt8,.-_bsaes_encrypt8 | ||
877 | |||
878 | .type _bsaes_decrypt8,\@abi-omnipotent | ||
879 | .align 64 | ||
880 | _bsaes_decrypt8: | ||
881 | _CET_ENDBR | ||
882 | lea .LBS0(%rip), $const # constants table | ||
883 | |||
884 | movdqa ($key), @XMM[9] # round 0 key | ||
885 | lea 0x10($key), $key | ||
886 | movdqa -0x30($const), @XMM[8] # .LM0ISR | ||
887 | pxor @XMM[9], @XMM[0] # xor with round0 key | ||
888 | pxor @XMM[9], @XMM[1] | ||
889 | pshufb @XMM[8], @XMM[0] | ||
890 | pxor @XMM[9], @XMM[2] | ||
891 | pshufb @XMM[8], @XMM[1] | ||
892 | pxor @XMM[9], @XMM[3] | ||
893 | pshufb @XMM[8], @XMM[2] | ||
894 | pxor @XMM[9], @XMM[4] | ||
895 | pshufb @XMM[8], @XMM[3] | ||
896 | pxor @XMM[9], @XMM[5] | ||
897 | pshufb @XMM[8], @XMM[4] | ||
898 | pxor @XMM[9], @XMM[6] | ||
899 | pshufb @XMM[8], @XMM[5] | ||
900 | pxor @XMM[9], @XMM[7] | ||
901 | pshufb @XMM[8], @XMM[6] | ||
902 | pshufb @XMM[8], @XMM[7] | ||
903 | ___ | ||
904 | &bitslice (@XMM[0..7, 8..11]); | ||
905 | $code.=<<___; | ||
906 | dec $rounds | ||
907 | jmp .Ldec_sbox | ||
908 | .align 16 | ||
909 | .Ldec_loop: | ||
910 | ___ | ||
911 | &ShiftRows (@XMM[0..7, 8]); | ||
912 | $code.=".Ldec_sbox:\n"; | ||
913 | &InvSbox (@XMM[0..7, 8..15]); | ||
914 | $code.=<<___; | ||
915 | dec $rounds | ||
916 | jl .Ldec_done | ||
917 | ___ | ||
918 | &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]); | ||
919 | $code.=<<___; | ||
920 | movdqa -0x10($const), @XMM[8] # .LISR | ||
921 | jnz .Ldec_loop | ||
922 | movdqa -0x20($const), @XMM[8] # .LISRM0 | ||
923 | jmp .Ldec_loop | ||
924 | .align 16 | ||
925 | .Ldec_done: | ||
926 | ___ | ||
927 | &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]); | ||
928 | $code.=<<___; | ||
929 | movdqa ($key), @XMM[8] # last round key | ||
930 | pxor @XMM[8], @XMM[6] | ||
931 | pxor @XMM[8], @XMM[4] | ||
932 | pxor @XMM[8], @XMM[2] | ||
933 | pxor @XMM[8], @XMM[7] | ||
934 | pxor @XMM[8], @XMM[3] | ||
935 | pxor @XMM[8], @XMM[5] | ||
936 | pxor @XMM[8], @XMM[0] | ||
937 | pxor @XMM[8], @XMM[1] | ||
938 | ret | ||
939 | .size _bsaes_decrypt8,.-_bsaes_decrypt8 | ||
940 | ___ | ||
941 | } | ||
942 | { | ||
943 | my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11"); | ||
944 | |||
945 | sub bitslice_key { | ||
946 | my @x=reverse(@_[0..7]); | ||
947 | my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12]; | ||
948 | |||
949 | &swapmove (@x[0,1],1,$bs0,$t2,$t3); | ||
950 | $code.=<<___; | ||
951 | #&swapmove(@x[2,3],1,$t0,$t2,$t3); | ||
952 | movdqa @x[0], @x[2] | ||
953 | movdqa @x[1], @x[3] | ||
954 | ___ | ||
955 | #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3); | ||
956 | |||
957 | &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3); | ||
958 | $code.=<<___; | ||
959 | #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3); | ||
960 | movdqa @x[0], @x[4] | ||
961 | movdqa @x[2], @x[6] | ||
962 | movdqa @x[1], @x[5] | ||
963 | movdqa @x[3], @x[7] | ||
964 | ___ | ||
965 | &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3); | ||
966 | &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3); | ||
967 | } | ||
968 | |||
969 | $code.=<<___; | ||
970 | .type _bsaes_key_convert,\@abi-omnipotent | ||
971 | .align 16 | ||
972 | _bsaes_key_convert: | ||
973 | _CET_ENDBR | ||
974 | lea .Lmasks(%rip), $const | ||
975 | movdqu ($inp), %xmm7 # load round 0 key | ||
976 | lea 0x10($inp), $inp | ||
977 | movdqa 0x00($const), %xmm0 # 0x01... | ||
978 | movdqa 0x10($const), %xmm1 # 0x02... | ||
979 | movdqa 0x20($const), %xmm2 # 0x04... | ||
980 | movdqa 0x30($const), %xmm3 # 0x08... | ||
981 | movdqa 0x40($const), %xmm4 # .LM0 | ||
982 | pcmpeqd %xmm5, %xmm5 # .LNOT | ||
983 | |||
984 | movdqu ($inp), %xmm6 # load round 1 key | ||
985 | movdqa %xmm7, ($out) # save round 0 key | ||
986 | lea 0x10($out), $out | ||
987 | dec $rounds | ||
988 | jmp .Lkey_loop | ||
989 | .align 16 | ||
990 | .Lkey_loop: | ||
991 | pshufb %xmm4, %xmm6 # .LM0 | ||
992 | |||
993 | movdqa %xmm0, %xmm8 | ||
994 | movdqa %xmm1, %xmm9 | ||
995 | |||
996 | pand %xmm6, %xmm8 | ||
997 | pand %xmm6, %xmm9 | ||
998 | movdqa %xmm2, %xmm10 | ||
999 | pcmpeqb %xmm0, %xmm8 | ||
1000 | psllq \$4, %xmm0 # 0x10... | ||
1001 | movdqa %xmm3, %xmm11 | ||
1002 | pcmpeqb %xmm1, %xmm9 | ||
1003 | psllq \$4, %xmm1 # 0x20... | ||
1004 | |||
1005 | pand %xmm6, %xmm10 | ||
1006 | pand %xmm6, %xmm11 | ||
1007 | movdqa %xmm0, %xmm12 | ||
1008 | pcmpeqb %xmm2, %xmm10 | ||
1009 | psllq \$4, %xmm2 # 0x40... | ||
1010 | movdqa %xmm1, %xmm13 | ||
1011 | pcmpeqb %xmm3, %xmm11 | ||
1012 | psllq \$4, %xmm3 # 0x80... | ||
1013 | |||
1014 | movdqa %xmm2, %xmm14 | ||
1015 | movdqa %xmm3, %xmm15 | ||
1016 | pxor %xmm5, %xmm8 # "pnot" | ||
1017 | pxor %xmm5, %xmm9 | ||
1018 | |||
1019 | pand %xmm6, %xmm12 | ||
1020 | pand %xmm6, %xmm13 | ||
1021 | movdqa %xmm8, 0x00($out) # write bit-sliced round key | ||
1022 | pcmpeqb %xmm0, %xmm12 | ||
1023 | psrlq \$4, %xmm0 # 0x01... | ||
1024 | movdqa %xmm9, 0x10($out) | ||
1025 | pcmpeqb %xmm1, %xmm13 | ||
1026 | psrlq \$4, %xmm1 # 0x02... | ||
1027 | lea 0x10($inp), $inp | ||
1028 | |||
1029 | pand %xmm6, %xmm14 | ||
1030 | pand %xmm6, %xmm15 | ||
1031 | movdqa %xmm10, 0x20($out) | ||
1032 | pcmpeqb %xmm2, %xmm14 | ||
1033 | psrlq \$4, %xmm2 # 0x04... | ||
1034 | movdqa %xmm11, 0x30($out) | ||
1035 | pcmpeqb %xmm3, %xmm15 | ||
1036 | psrlq \$4, %xmm3 # 0x08... | ||
1037 | movdqu ($inp), %xmm6 # load next round key | ||
1038 | |||
1039 | pxor %xmm5, %xmm13 # "pnot" | ||
1040 | pxor %xmm5, %xmm14 | ||
1041 | movdqa %xmm12, 0x40($out) | ||
1042 | movdqa %xmm13, 0x50($out) | ||
1043 | movdqa %xmm14, 0x60($out) | ||
1044 | movdqa %xmm15, 0x70($out) | ||
1045 | lea 0x80($out),$out | ||
1046 | dec $rounds | ||
1047 | jnz .Lkey_loop | ||
1048 | |||
1049 | movdqa 0x50($const), %xmm7 # .L63 | ||
1050 | #movdqa %xmm6, ($out) # don't save last round key | ||
1051 | ret | ||
1052 | .size _bsaes_key_convert,.-_bsaes_key_convert | ||
1053 | ___ | ||
1054 | } | ||
1055 | |||
1056 | if (0 && !$win64) { # following four functions are unsupported interface | ||
1057 | # used for benchmarking... | ||
1058 | $code.=<<___; | ||
1059 | .globl bsaes_enc_key_convert | ||
1060 | .type bsaes_enc_key_convert,\@function,2 | ||
1061 | .align 16 | ||
1062 | bsaes_enc_key_convert: | ||
1063 | _CET_ENDBR | ||
1064 | mov 240($inp),%r10d # pass rounds | ||
1065 | mov $inp,%rcx # pass key | ||
1066 | mov $out,%rax # pass key schedule | ||
1067 | call _bsaes_key_convert | ||
1068 | pxor %xmm6,%xmm7 # fix up last round key | ||
1069 | movdqa %xmm7,(%rax) # save last round key | ||
1070 | ret | ||
1071 | .size bsaes_enc_key_convert,.-bsaes_enc_key_convert | ||
1072 | |||
1073 | .globl bsaes_encrypt_128 | ||
1074 | .type bsaes_encrypt_128,\@function,4 | ||
1075 | .align 16 | ||
1076 | bsaes_encrypt_128: | ||
1077 | .Lenc128_loop: | ||
1078 | _CET_ENDBR | ||
1079 | movdqu 0x00($inp), @XMM[0] # load input | ||
1080 | movdqu 0x10($inp), @XMM[1] | ||
1081 | movdqu 0x20($inp), @XMM[2] | ||
1082 | movdqu 0x30($inp), @XMM[3] | ||
1083 | movdqu 0x40($inp), @XMM[4] | ||
1084 | movdqu 0x50($inp), @XMM[5] | ||
1085 | movdqu 0x60($inp), @XMM[6] | ||
1086 | movdqu 0x70($inp), @XMM[7] | ||
1087 | mov $key, %rax # pass the $key | ||
1088 | lea 0x80($inp), $inp | ||
1089 | mov \$10,%r10d | ||
1090 | |||
1091 | call _bsaes_encrypt8 | ||
1092 | |||
1093 | movdqu @XMM[0], 0x00($out) # write output | ||
1094 | movdqu @XMM[1], 0x10($out) | ||
1095 | movdqu @XMM[4], 0x20($out) | ||
1096 | movdqu @XMM[6], 0x30($out) | ||
1097 | movdqu @XMM[3], 0x40($out) | ||
1098 | movdqu @XMM[7], 0x50($out) | ||
1099 | movdqu @XMM[2], 0x60($out) | ||
1100 | movdqu @XMM[5], 0x70($out) | ||
1101 | lea 0x80($out), $out | ||
1102 | sub \$0x80,$len | ||
1103 | ja .Lenc128_loop | ||
1104 | ret | ||
1105 | .size bsaes_encrypt_128,.-bsaes_encrypt_128 | ||
1106 | |||
1107 | .globl bsaes_dec_key_convert | ||
1108 | .type bsaes_dec_key_convert,\@function,2 | ||
1109 | .align 16 | ||
1110 | bsaes_dec_key_convert: | ||
1111 | _CET_ENDBR | ||
1112 | mov 240($inp),%r10d # pass rounds | ||
1113 | mov $inp,%rcx # pass key | ||
1114 | mov $out,%rax # pass key schedule | ||
1115 | call _bsaes_key_convert | ||
1116 | pxor ($out),%xmm7 # fix up round 0 key | ||
1117 | movdqa %xmm6,(%rax) # save last round key | ||
1118 | movdqa %xmm7,($out) | ||
1119 | ret | ||
1120 | .size bsaes_dec_key_convert,.-bsaes_dec_key_convert | ||
1121 | |||
1122 | .globl bsaes_decrypt_128 | ||
1123 | .type bsaes_decrypt_128,\@function,4 | ||
1124 | .align 16 | ||
1125 | bsaes_decrypt_128: | ||
1126 | _CET_ENDBR | ||
1127 | .Ldec128_loop: | ||
1128 | movdqu 0x00($inp), @XMM[0] # load input | ||
1129 | movdqu 0x10($inp), @XMM[1] | ||
1130 | movdqu 0x20($inp), @XMM[2] | ||
1131 | movdqu 0x30($inp), @XMM[3] | ||
1132 | movdqu 0x40($inp), @XMM[4] | ||
1133 | movdqu 0x50($inp), @XMM[5] | ||
1134 | movdqu 0x60($inp), @XMM[6] | ||
1135 | movdqu 0x70($inp), @XMM[7] | ||
1136 | mov $key, %rax # pass the $key | ||
1137 | lea 0x80($inp), $inp | ||
1138 | mov \$10,%r10d | ||
1139 | |||
1140 | call _bsaes_decrypt8 | ||
1141 | |||
1142 | movdqu @XMM[0], 0x00($out) # write output | ||
1143 | movdqu @XMM[1], 0x10($out) | ||
1144 | movdqu @XMM[6], 0x20($out) | ||
1145 | movdqu @XMM[4], 0x30($out) | ||
1146 | movdqu @XMM[2], 0x40($out) | ||
1147 | movdqu @XMM[7], 0x50($out) | ||
1148 | movdqu @XMM[3], 0x60($out) | ||
1149 | movdqu @XMM[5], 0x70($out) | ||
1150 | lea 0x80($out), $out | ||
1151 | sub \$0x80,$len | ||
1152 | ja .Ldec128_loop | ||
1153 | ret | ||
1154 | .size bsaes_decrypt_128,.-bsaes_decrypt_128 | ||
1155 | ___ | ||
1156 | } | ||
1157 | { | ||
1158 | ###################################################################### | ||
1159 | # | ||
1160 | # OpenSSL interface | ||
1161 | # | ||
1162 | my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d") | ||
1163 | : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d"); | ||
1164 | my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15"); | ||
1165 | |||
1166 | if ($ecb) { | ||
1167 | $code.=<<___; | ||
1168 | .globl bsaes_ecb_encrypt_blocks | ||
1169 | .type bsaes_ecb_encrypt_blocks,\@abi-omnipotent | ||
1170 | .align 16 | ||
1171 | bsaes_ecb_encrypt_blocks: | ||
1172 | _CET_ENDBR | ||
1173 | mov %rsp, %rax | ||
1174 | .Lecb_enc_prologue: | ||
1175 | push %rbp | ||
1176 | push %rbx | ||
1177 | push %r12 | ||
1178 | push %r13 | ||
1179 | push %r14 | ||
1180 | push %r15 | ||
1181 | lea -0x48(%rsp),%rsp | ||
1182 | ___ | ||
1183 | $code.=<<___ if ($win64); | ||
1184 | lea -0xa0(%rsp), %rsp | ||
1185 | movaps %xmm6, 0x40(%rsp) | ||
1186 | movaps %xmm7, 0x50(%rsp) | ||
1187 | movaps %xmm8, 0x60(%rsp) | ||
1188 | movaps %xmm9, 0x70(%rsp) | ||
1189 | movaps %xmm10, 0x80(%rsp) | ||
1190 | movaps %xmm11, 0x90(%rsp) | ||
1191 | movaps %xmm12, 0xa0(%rsp) | ||
1192 | movaps %xmm13, 0xb0(%rsp) | ||
1193 | movaps %xmm14, 0xc0(%rsp) | ||
1194 | movaps %xmm15, 0xd0(%rsp) | ||
1195 | .Lecb_enc_body: | ||
1196 | ___ | ||
1197 | $code.=<<___; | ||
1198 | mov %rsp,%rbp # backup %rsp | ||
1199 | mov 240($arg4),%eax # rounds | ||
1200 | mov $arg1,$inp # backup arguments | ||
1201 | mov $arg2,$out | ||
1202 | mov $arg3,$len | ||
1203 | mov $arg4,$key | ||
1204 | cmp \$8,$arg3 | ||
1205 | jb .Lecb_enc_short | ||
1206 | |||
1207 | mov %eax,%ebx # backup rounds | ||
1208 | shl \$7,%rax # 128 bytes per inner round key | ||
1209 | sub \$`128-32`,%rax # size of bit-sliced key schedule | ||
1210 | sub %rax,%rsp | ||
1211 | mov %rsp,%rax # pass key schedule | ||
1212 | mov $key,%rcx # pass key | ||
1213 | mov %ebx,%r10d # pass rounds | ||
1214 | call _bsaes_key_convert | ||
1215 | pxor %xmm6,%xmm7 # fix up last round key | ||
1216 | movdqa %xmm7,(%rax) # save last round key | ||
1217 | |||
1218 | sub \$8,$len | ||
1219 | .Lecb_enc_loop: | ||
1220 | movdqu 0x00($inp), @XMM[0] # load input | ||
1221 | movdqu 0x10($inp), @XMM[1] | ||
1222 | movdqu 0x20($inp), @XMM[2] | ||
1223 | movdqu 0x30($inp), @XMM[3] | ||
1224 | movdqu 0x40($inp), @XMM[4] | ||
1225 | movdqu 0x50($inp), @XMM[5] | ||
1226 | mov %rsp, %rax # pass key schedule | ||
1227 | movdqu 0x60($inp), @XMM[6] | ||
1228 | mov %ebx,%r10d # pass rounds | ||
1229 | movdqu 0x70($inp), @XMM[7] | ||
1230 | lea 0x80($inp), $inp | ||
1231 | |||
1232 | call _bsaes_encrypt8 | ||
1233 | |||
1234 | movdqu @XMM[0], 0x00($out) # write output | ||
1235 | movdqu @XMM[1], 0x10($out) | ||
1236 | movdqu @XMM[4], 0x20($out) | ||
1237 | movdqu @XMM[6], 0x30($out) | ||
1238 | movdqu @XMM[3], 0x40($out) | ||
1239 | movdqu @XMM[7], 0x50($out) | ||
1240 | movdqu @XMM[2], 0x60($out) | ||
1241 | movdqu @XMM[5], 0x70($out) | ||
1242 | lea 0x80($out), $out | ||
1243 | sub \$8,$len | ||
1244 | jnc .Lecb_enc_loop | ||
1245 | |||
1246 | add \$8,$len | ||
1247 | jz .Lecb_enc_done | ||
1248 | |||
1249 | movdqu 0x00($inp), @XMM[0] # load input | ||
1250 | mov %rsp, %rax # pass key schedule | ||
1251 | mov %ebx,%r10d # pass rounds | ||
1252 | cmp \$2,$len | ||
1253 | jb .Lecb_enc_one | ||
1254 | movdqu 0x10($inp), @XMM[1] | ||
1255 | je .Lecb_enc_two | ||
1256 | movdqu 0x20($inp), @XMM[2] | ||
1257 | cmp \$4,$len | ||
1258 | jb .Lecb_enc_three | ||
1259 | movdqu 0x30($inp), @XMM[3] | ||
1260 | je .Lecb_enc_four | ||
1261 | movdqu 0x40($inp), @XMM[4] | ||
1262 | cmp \$6,$len | ||
1263 | jb .Lecb_enc_five | ||
1264 | movdqu 0x50($inp), @XMM[5] | ||
1265 | je .Lecb_enc_six | ||
1266 | movdqu 0x60($inp), @XMM[6] | ||
1267 | call _bsaes_encrypt8 | ||
1268 | movdqu @XMM[0], 0x00($out) # write output | ||
1269 | movdqu @XMM[1], 0x10($out) | ||
1270 | movdqu @XMM[4], 0x20($out) | ||
1271 | movdqu @XMM[6], 0x30($out) | ||
1272 | movdqu @XMM[3], 0x40($out) | ||
1273 | movdqu @XMM[7], 0x50($out) | ||
1274 | movdqu @XMM[2], 0x60($out) | ||
1275 | jmp .Lecb_enc_done | ||
1276 | .align 16 | ||
1277 | .Lecb_enc_six: | ||
1278 | call _bsaes_encrypt8 | ||
1279 | movdqu @XMM[0], 0x00($out) # write output | ||
1280 | movdqu @XMM[1], 0x10($out) | ||
1281 | movdqu @XMM[4], 0x20($out) | ||
1282 | movdqu @XMM[6], 0x30($out) | ||
1283 | movdqu @XMM[3], 0x40($out) | ||
1284 | movdqu @XMM[7], 0x50($out) | ||
1285 | jmp .Lecb_enc_done | ||
1286 | .align 16 | ||
1287 | .Lecb_enc_five: | ||
1288 | call _bsaes_encrypt8 | ||
1289 | movdqu @XMM[0], 0x00($out) # write output | ||
1290 | movdqu @XMM[1], 0x10($out) | ||
1291 | movdqu @XMM[4], 0x20($out) | ||
1292 | movdqu @XMM[6], 0x30($out) | ||
1293 | movdqu @XMM[3], 0x40($out) | ||
1294 | jmp .Lecb_enc_done | ||
1295 | .align 16 | ||
1296 | .Lecb_enc_four: | ||
1297 | call _bsaes_encrypt8 | ||
1298 | movdqu @XMM[0], 0x00($out) # write output | ||
1299 | movdqu @XMM[1], 0x10($out) | ||
1300 | movdqu @XMM[4], 0x20($out) | ||
1301 | movdqu @XMM[6], 0x30($out) | ||
1302 | jmp .Lecb_enc_done | ||
1303 | .align 16 | ||
1304 | .Lecb_enc_three: | ||
1305 | call _bsaes_encrypt8 | ||
1306 | movdqu @XMM[0], 0x00($out) # write output | ||
1307 | movdqu @XMM[1], 0x10($out) | ||
1308 | movdqu @XMM[4], 0x20($out) | ||
1309 | jmp .Lecb_enc_done | ||
1310 | .align 16 | ||
1311 | .Lecb_enc_two: | ||
1312 | call _bsaes_encrypt8 | ||
1313 | movdqu @XMM[0], 0x00($out) # write output | ||
1314 | movdqu @XMM[1], 0x10($out) | ||
1315 | jmp .Lecb_enc_done | ||
1316 | .align 16 | ||
1317 | .Lecb_enc_one: | ||
1318 | call _bsaes_encrypt8 | ||
1319 | movdqu @XMM[0], 0x00($out) # write output | ||
1320 | jmp .Lecb_enc_done | ||
1321 | .align 16 | ||
1322 | .Lecb_enc_short: | ||
1323 | lea ($inp), $arg1 | ||
1324 | lea ($out), $arg2 | ||
1325 | lea ($key), $arg3 | ||
1326 | call asm_AES_encrypt | ||
1327 | lea 16($inp), $inp | ||
1328 | lea 16($out), $out | ||
1329 | dec $len | ||
1330 | jnz .Lecb_enc_short | ||
1331 | |||
1332 | .Lecb_enc_done: | ||
1333 | lea (%rsp),%rax | ||
1334 | pxor %xmm0, %xmm0 | ||
1335 | .Lecb_enc_bzero: # wipe key schedule [if any] | ||
1336 | movdqa %xmm0, 0x00(%rax) | ||
1337 | movdqa %xmm0, 0x10(%rax) | ||
1338 | lea 0x20(%rax), %rax | ||
1339 | cmp %rax, %rbp | ||
1340 | jb .Lecb_enc_bzero | ||
1341 | |||
1342 | lea (%rbp),%rsp # restore %rsp | ||
1343 | ___ | ||
1344 | $code.=<<___ if ($win64); | ||
1345 | movaps 0x40(%rbp), %xmm6 | ||
1346 | movaps 0x50(%rbp), %xmm7 | ||
1347 | movaps 0x60(%rbp), %xmm8 | ||
1348 | movaps 0x70(%rbp), %xmm9 | ||
1349 | movaps 0x80(%rbp), %xmm10 | ||
1350 | movaps 0x90(%rbp), %xmm11 | ||
1351 | movaps 0xa0(%rbp), %xmm12 | ||
1352 | movaps 0xb0(%rbp), %xmm13 | ||
1353 | movaps 0xc0(%rbp), %xmm14 | ||
1354 | movaps 0xd0(%rbp), %xmm15 | ||
1355 | lea 0xa0(%rbp), %rsp | ||
1356 | ___ | ||
1357 | $code.=<<___; | ||
1358 | mov 0x48(%rsp), %r15 | ||
1359 | mov 0x50(%rsp), %r14 | ||
1360 | mov 0x58(%rsp), %r13 | ||
1361 | mov 0x60(%rsp), %r12 | ||
1362 | mov 0x68(%rsp), %rbx | ||
1363 | mov 0x70(%rsp), %rax | ||
1364 | lea 0x78(%rsp), %rsp | ||
1365 | mov %rax, %rbp | ||
1366 | .Lecb_enc_epilogue: | ||
1367 | ret | ||
1368 | .size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks | ||
1369 | |||
1370 | .globl bsaes_ecb_decrypt_blocks | ||
1371 | .type bsaes_ecb_decrypt_blocks,\@abi-omnipotent | ||
1372 | .align 16 | ||
1373 | bsaes_ecb_decrypt_blocks: | ||
1374 | _CET_ENDBR | ||
1375 | mov %rsp, %rax | ||
1376 | .Lecb_dec_prologue: | ||
1377 | push %rbp | ||
1378 | push %rbx | ||
1379 | push %r12 | ||
1380 | push %r13 | ||
1381 | push %r14 | ||
1382 | push %r15 | ||
1383 | lea -0x48(%rsp),%rsp | ||
1384 | ___ | ||
1385 | $code.=<<___ if ($win64); | ||
1386 | lea -0xa0(%rsp), %rsp | ||
1387 | movaps %xmm6, 0x40(%rsp) | ||
1388 | movaps %xmm7, 0x50(%rsp) | ||
1389 | movaps %xmm8, 0x60(%rsp) | ||
1390 | movaps %xmm9, 0x70(%rsp) | ||
1391 | movaps %xmm10, 0x80(%rsp) | ||
1392 | movaps %xmm11, 0x90(%rsp) | ||
1393 | movaps %xmm12, 0xa0(%rsp) | ||
1394 | movaps %xmm13, 0xb0(%rsp) | ||
1395 | movaps %xmm14, 0xc0(%rsp) | ||
1396 | movaps %xmm15, 0xd0(%rsp) | ||
1397 | .Lecb_dec_body: | ||
1398 | ___ | ||
1399 | $code.=<<___; | ||
1400 | mov %rsp,%rbp # backup %rsp | ||
1401 | mov 240($arg4),%eax # rounds | ||
1402 | mov $arg1,$inp # backup arguments | ||
1403 | mov $arg2,$out | ||
1404 | mov $arg3,$len | ||
1405 | mov $arg4,$key | ||
1406 | cmp \$8,$arg3 | ||
1407 | jb .Lecb_dec_short | ||
1408 | |||
1409 | mov %eax,%ebx # backup rounds | ||
1410 | shl \$7,%rax # 128 bytes per inner round key | ||
1411 | sub \$`128-32`,%rax # size of bit-sliced key schedule | ||
1412 | sub %rax,%rsp | ||
1413 | mov %rsp,%rax # pass key schedule | ||
1414 | mov $key,%rcx # pass key | ||
1415 | mov %ebx,%r10d # pass rounds | ||
1416 | call _bsaes_key_convert | ||
1417 | pxor (%rsp),%xmm7 # fix up 0 round key | ||
1418 | movdqa %xmm6,(%rax) # save last round key | ||
1419 | movdqa %xmm7,(%rsp) | ||
1420 | |||
1421 | sub \$8,$len | ||
1422 | .Lecb_dec_loop: | ||
1423 | movdqu 0x00($inp), @XMM[0] # load input | ||
1424 | movdqu 0x10($inp), @XMM[1] | ||
1425 | movdqu 0x20($inp), @XMM[2] | ||
1426 | movdqu 0x30($inp), @XMM[3] | ||
1427 | movdqu 0x40($inp), @XMM[4] | ||
1428 | movdqu 0x50($inp), @XMM[5] | ||
1429 | mov %rsp, %rax # pass key schedule | ||
1430 | movdqu 0x60($inp), @XMM[6] | ||
1431 | mov %ebx,%r10d # pass rounds | ||
1432 | movdqu 0x70($inp), @XMM[7] | ||
1433 | lea 0x80($inp), $inp | ||
1434 | |||
1435 | call _bsaes_decrypt8 | ||
1436 | |||
1437 | movdqu @XMM[0], 0x00($out) # write output | ||
1438 | movdqu @XMM[1], 0x10($out) | ||
1439 | movdqu @XMM[6], 0x20($out) | ||
1440 | movdqu @XMM[4], 0x30($out) | ||
1441 | movdqu @XMM[2], 0x40($out) | ||
1442 | movdqu @XMM[7], 0x50($out) | ||
1443 | movdqu @XMM[3], 0x60($out) | ||
1444 | movdqu @XMM[5], 0x70($out) | ||
1445 | lea 0x80($out), $out | ||
1446 | sub \$8,$len | ||
1447 | jnc .Lecb_dec_loop | ||
1448 | |||
1449 | add \$8,$len | ||
1450 | jz .Lecb_dec_done | ||
1451 | |||
1452 | movdqu 0x00($inp), @XMM[0] # load input | ||
1453 | mov %rsp, %rax # pass key schedule | ||
1454 | mov %ebx,%r10d # pass rounds | ||
1455 | cmp \$2,$len | ||
1456 | jb .Lecb_dec_one | ||
1457 | movdqu 0x10($inp), @XMM[1] | ||
1458 | je .Lecb_dec_two | ||
1459 | movdqu 0x20($inp), @XMM[2] | ||
1460 | cmp \$4,$len | ||
1461 | jb .Lecb_dec_three | ||
1462 | movdqu 0x30($inp), @XMM[3] | ||
1463 | je .Lecb_dec_four | ||
1464 | movdqu 0x40($inp), @XMM[4] | ||
1465 | cmp \$6,$len | ||
1466 | jb .Lecb_dec_five | ||
1467 | movdqu 0x50($inp), @XMM[5] | ||
1468 | je .Lecb_dec_six | ||
1469 | movdqu 0x60($inp), @XMM[6] | ||
1470 | call _bsaes_decrypt8 | ||
1471 | movdqu @XMM[0], 0x00($out) # write output | ||
1472 | movdqu @XMM[1], 0x10($out) | ||
1473 | movdqu @XMM[6], 0x20($out) | ||
1474 | movdqu @XMM[4], 0x30($out) | ||
1475 | movdqu @XMM[2], 0x40($out) | ||
1476 | movdqu @XMM[7], 0x50($out) | ||
1477 | movdqu @XMM[3], 0x60($out) | ||
1478 | jmp .Lecb_dec_done | ||
1479 | .align 16 | ||
1480 | .Lecb_dec_six: | ||
1481 | call _bsaes_decrypt8 | ||
1482 | movdqu @XMM[0], 0x00($out) # write output | ||
1483 | movdqu @XMM[1], 0x10($out) | ||
1484 | movdqu @XMM[6], 0x20($out) | ||
1485 | movdqu @XMM[4], 0x30($out) | ||
1486 | movdqu @XMM[2], 0x40($out) | ||
1487 | movdqu @XMM[7], 0x50($out) | ||
1488 | jmp .Lecb_dec_done | ||
1489 | .align 16 | ||
1490 | .Lecb_dec_five: | ||
1491 | call _bsaes_decrypt8 | ||
1492 | movdqu @XMM[0], 0x00($out) # write output | ||
1493 | movdqu @XMM[1], 0x10($out) | ||
1494 | movdqu @XMM[6], 0x20($out) | ||
1495 | movdqu @XMM[4], 0x30($out) | ||
1496 | movdqu @XMM[2], 0x40($out) | ||
1497 | jmp .Lecb_dec_done | ||
1498 | .align 16 | ||
1499 | .Lecb_dec_four: | ||
1500 | call _bsaes_decrypt8 | ||
1501 | movdqu @XMM[0], 0x00($out) # write output | ||
1502 | movdqu @XMM[1], 0x10($out) | ||
1503 | movdqu @XMM[6], 0x20($out) | ||
1504 | movdqu @XMM[4], 0x30($out) | ||
1505 | jmp .Lecb_dec_done | ||
1506 | .align 16 | ||
1507 | .Lecb_dec_three: | ||
1508 | call _bsaes_decrypt8 | ||
1509 | movdqu @XMM[0], 0x00($out) # write output | ||
1510 | movdqu @XMM[1], 0x10($out) | ||
1511 | movdqu @XMM[6], 0x20($out) | ||
1512 | jmp .Lecb_dec_done | ||
1513 | .align 16 | ||
1514 | .Lecb_dec_two: | ||
1515 | call _bsaes_decrypt8 | ||
1516 | movdqu @XMM[0], 0x00($out) # write output | ||
1517 | movdqu @XMM[1], 0x10($out) | ||
1518 | jmp .Lecb_dec_done | ||
1519 | .align 16 | ||
1520 | .Lecb_dec_one: | ||
1521 | call _bsaes_decrypt8 | ||
1522 | movdqu @XMM[0], 0x00($out) # write output | ||
1523 | jmp .Lecb_dec_done | ||
1524 | .align 16 | ||
1525 | .Lecb_dec_short: | ||
1526 | lea ($inp), $arg1 | ||
1527 | lea ($out), $arg2 | ||
1528 | lea ($key), $arg3 | ||
1529 | call asm_AES_decrypt | ||
1530 | lea 16($inp), $inp | ||
1531 | lea 16($out), $out | ||
1532 | dec $len | ||
1533 | jnz .Lecb_dec_short | ||
1534 | |||
1535 | .Lecb_dec_done: | ||
1536 | lea (%rsp),%rax | ||
1537 | pxor %xmm0, %xmm0 | ||
1538 | .Lecb_dec_bzero: # wipe key schedule [if any] | ||
1539 | movdqa %xmm0, 0x00(%rax) | ||
1540 | movdqa %xmm0, 0x10(%rax) | ||
1541 | lea 0x20(%rax), %rax | ||
1542 | cmp %rax, %rbp | ||
1543 | jb .Lecb_dec_bzero | ||
1544 | |||
1545 | lea (%rbp),%rsp # restore %rsp | ||
1546 | ___ | ||
1547 | $code.=<<___ if ($win64); | ||
1548 | movaps 0x40(%rbp), %xmm6 | ||
1549 | movaps 0x50(%rbp), %xmm7 | ||
1550 | movaps 0x60(%rbp), %xmm8 | ||
1551 | movaps 0x70(%rbp), %xmm9 | ||
1552 | movaps 0x80(%rbp), %xmm10 | ||
1553 | movaps 0x90(%rbp), %xmm11 | ||
1554 | movaps 0xa0(%rbp), %xmm12 | ||
1555 | movaps 0xb0(%rbp), %xmm13 | ||
1556 | movaps 0xc0(%rbp), %xmm14 | ||
1557 | movaps 0xd0(%rbp), %xmm15 | ||
1558 | lea 0xa0(%rbp), %rsp | ||
1559 | ___ | ||
1560 | $code.=<<___; | ||
1561 | mov 0x48(%rsp), %r15 | ||
1562 | mov 0x50(%rsp), %r14 | ||
1563 | mov 0x58(%rsp), %r13 | ||
1564 | mov 0x60(%rsp), %r12 | ||
1565 | mov 0x68(%rsp), %rbx | ||
1566 | mov 0x70(%rsp), %rax | ||
1567 | lea 0x78(%rsp), %rsp | ||
1568 | mov %rax, %rbp | ||
1569 | .Lecb_dec_epilogue: | ||
1570 | ret | ||
1571 | .size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks | ||
1572 | ___ | ||
1573 | } | ||
1574 | $code.=<<___; | ||
1575 | .extern asm_AES_cbc_encrypt | ||
1576 | .globl bsaes_cbc_encrypt | ||
1577 | .type bsaes_cbc_encrypt,\@abi-omnipotent | ||
1578 | .align 16 | ||
1579 | bsaes_cbc_encrypt: | ||
1580 | _CET_ENDBR | ||
1581 | ___ | ||
1582 | $code.=<<___ if ($win64); | ||
1583 | mov 48(%rsp),$arg6 # pull direction flag | ||
1584 | ___ | ||
1585 | $code.=<<___; | ||
1586 | cmp \$0,$arg6 | ||
1587 | jne asm_AES_cbc_encrypt | ||
1588 | cmp \$128,$arg3 | ||
1589 | jb asm_AES_cbc_encrypt | ||
1590 | |||
1591 | mov %rsp, %rax | ||
1592 | .Lcbc_dec_prologue: | ||
1593 | push %rbp | ||
1594 | push %rbx | ||
1595 | push %r12 | ||
1596 | push %r13 | ||
1597 | push %r14 | ||
1598 | push %r15 | ||
1599 | lea -0x48(%rsp), %rsp | ||
1600 | ___ | ||
1601 | $code.=<<___ if ($win64); | ||
1602 | mov 0xa0(%rsp),$arg5 # pull ivp | ||
1603 | lea -0xa0(%rsp), %rsp | ||
1604 | movaps %xmm6, 0x40(%rsp) | ||
1605 | movaps %xmm7, 0x50(%rsp) | ||
1606 | movaps %xmm8, 0x60(%rsp) | ||
1607 | movaps %xmm9, 0x70(%rsp) | ||
1608 | movaps %xmm10, 0x80(%rsp) | ||
1609 | movaps %xmm11, 0x90(%rsp) | ||
1610 | movaps %xmm12, 0xa0(%rsp) | ||
1611 | movaps %xmm13, 0xb0(%rsp) | ||
1612 | movaps %xmm14, 0xc0(%rsp) | ||
1613 | movaps %xmm15, 0xd0(%rsp) | ||
1614 | .Lcbc_dec_body: | ||
1615 | ___ | ||
1616 | $code.=<<___; | ||
1617 | mov %rsp, %rbp # backup %rsp | ||
1618 | mov 240($arg4), %eax # rounds | ||
1619 | mov $arg1, $inp # backup arguments | ||
1620 | mov $arg2, $out | ||
1621 | mov $arg3, $len | ||
1622 | mov $arg4, $key | ||
1623 | mov $arg5, %rbx | ||
1624 | shr \$4, $len # bytes to blocks | ||
1625 | |||
1626 | mov %eax, %edx # rounds | ||
1627 | shl \$7, %rax # 128 bytes per inner round key | ||
1628 | sub \$`128-32`, %rax # size of bit-sliced key schedule | ||
1629 | sub %rax, %rsp | ||
1630 | |||
1631 | mov %rsp, %rax # pass key schedule | ||
1632 | mov $key, %rcx # pass key | ||
1633 | mov %edx, %r10d # pass rounds | ||
1634 | call _bsaes_key_convert | ||
1635 | pxor (%rsp),%xmm7 # fix up 0 round key | ||
1636 | movdqa %xmm6,(%rax) # save last round key | ||
1637 | movdqa %xmm7,(%rsp) | ||
1638 | |||
1639 | movdqu (%rbx), @XMM[15] # load IV | ||
1640 | sub \$8,$len | ||
1641 | .Lcbc_dec_loop: | ||
1642 | movdqu 0x00($inp), @XMM[0] # load input | ||
1643 | movdqu 0x10($inp), @XMM[1] | ||
1644 | movdqu 0x20($inp), @XMM[2] | ||
1645 | movdqu 0x30($inp), @XMM[3] | ||
1646 | movdqu 0x40($inp), @XMM[4] | ||
1647 | movdqu 0x50($inp), @XMM[5] | ||
1648 | mov %rsp, %rax # pass key schedule | ||
1649 | movdqu 0x60($inp), @XMM[6] | ||
1650 | mov %edx,%r10d # pass rounds | ||
1651 | movdqu 0x70($inp), @XMM[7] | ||
1652 | movdqa @XMM[15], 0x20(%rbp) # put aside IV | ||
1653 | |||
1654 | call _bsaes_decrypt8 | ||
1655 | |||
1656 | pxor 0x20(%rbp), @XMM[0] # ^= IV | ||
1657 | movdqu 0x00($inp), @XMM[8] # re-load input | ||
1658 | movdqu 0x10($inp), @XMM[9] | ||
1659 | pxor @XMM[8], @XMM[1] | ||
1660 | movdqu 0x20($inp), @XMM[10] | ||
1661 | pxor @XMM[9], @XMM[6] | ||
1662 | movdqu 0x30($inp), @XMM[11] | ||
1663 | pxor @XMM[10], @XMM[4] | ||
1664 | movdqu 0x40($inp), @XMM[12] | ||
1665 | pxor @XMM[11], @XMM[2] | ||
1666 | movdqu 0x50($inp), @XMM[13] | ||
1667 | pxor @XMM[12], @XMM[7] | ||
1668 | movdqu 0x60($inp), @XMM[14] | ||
1669 | pxor @XMM[13], @XMM[3] | ||
1670 | movdqu 0x70($inp), @XMM[15] # IV | ||
1671 | pxor @XMM[14], @XMM[5] | ||
1672 | movdqu @XMM[0], 0x00($out) # write output | ||
1673 | lea 0x80($inp), $inp | ||
1674 | movdqu @XMM[1], 0x10($out) | ||
1675 | movdqu @XMM[6], 0x20($out) | ||
1676 | movdqu @XMM[4], 0x30($out) | ||
1677 | movdqu @XMM[2], 0x40($out) | ||
1678 | movdqu @XMM[7], 0x50($out) | ||
1679 | movdqu @XMM[3], 0x60($out) | ||
1680 | movdqu @XMM[5], 0x70($out) | ||
1681 | lea 0x80($out), $out | ||
1682 | sub \$8,$len | ||
1683 | jnc .Lcbc_dec_loop | ||
1684 | |||
1685 | add \$8,$len | ||
1686 | jz .Lcbc_dec_done | ||
1687 | |||
1688 | movdqu 0x00($inp), @XMM[0] # load input | ||
1689 | mov %rsp, %rax # pass key schedule | ||
1690 | mov %edx, %r10d # pass rounds | ||
1691 | cmp \$2,$len | ||
1692 | jb .Lcbc_dec_one | ||
1693 | movdqu 0x10($inp), @XMM[1] | ||
1694 | je .Lcbc_dec_two | ||
1695 | movdqu 0x20($inp), @XMM[2] | ||
1696 | cmp \$4,$len | ||
1697 | jb .Lcbc_dec_three | ||
1698 | movdqu 0x30($inp), @XMM[3] | ||
1699 | je .Lcbc_dec_four | ||
1700 | movdqu 0x40($inp), @XMM[4] | ||
1701 | cmp \$6,$len | ||
1702 | jb .Lcbc_dec_five | ||
1703 | movdqu 0x50($inp), @XMM[5] | ||
1704 | je .Lcbc_dec_six | ||
1705 | movdqu 0x60($inp), @XMM[6] | ||
1706 | movdqa @XMM[15], 0x20(%rbp) # put aside IV | ||
1707 | call _bsaes_decrypt8 | ||
1708 | pxor 0x20(%rbp), @XMM[0] # ^= IV | ||
1709 | movdqu 0x00($inp), @XMM[8] # re-load input | ||
1710 | movdqu 0x10($inp), @XMM[9] | ||
1711 | pxor @XMM[8], @XMM[1] | ||
1712 | movdqu 0x20($inp), @XMM[10] | ||
1713 | pxor @XMM[9], @XMM[6] | ||
1714 | movdqu 0x30($inp), @XMM[11] | ||
1715 | pxor @XMM[10], @XMM[4] | ||
1716 | movdqu 0x40($inp), @XMM[12] | ||
1717 | pxor @XMM[11], @XMM[2] | ||
1718 | movdqu 0x50($inp), @XMM[13] | ||
1719 | pxor @XMM[12], @XMM[7] | ||
1720 | movdqu 0x60($inp), @XMM[15] # IV | ||
1721 | pxor @XMM[13], @XMM[3] | ||
1722 | movdqu @XMM[0], 0x00($out) # write output | ||
1723 | movdqu @XMM[1], 0x10($out) | ||
1724 | movdqu @XMM[6], 0x20($out) | ||
1725 | movdqu @XMM[4], 0x30($out) | ||
1726 | movdqu @XMM[2], 0x40($out) | ||
1727 | movdqu @XMM[7], 0x50($out) | ||
1728 | movdqu @XMM[3], 0x60($out) | ||
1729 | jmp .Lcbc_dec_done | ||
1730 | .align 16 | ||
1731 | .Lcbc_dec_six: | ||
1732 | movdqa @XMM[15], 0x20(%rbp) # put aside IV | ||
1733 | call _bsaes_decrypt8 | ||
1734 | pxor 0x20(%rbp), @XMM[0] # ^= IV | ||
1735 | movdqu 0x00($inp), @XMM[8] # re-load input | ||
1736 | movdqu 0x10($inp), @XMM[9] | ||
1737 | pxor @XMM[8], @XMM[1] | ||
1738 | movdqu 0x20($inp), @XMM[10] | ||
1739 | pxor @XMM[9], @XMM[6] | ||
1740 | movdqu 0x30($inp), @XMM[11] | ||
1741 | pxor @XMM[10], @XMM[4] | ||
1742 | movdqu 0x40($inp), @XMM[12] | ||
1743 | pxor @XMM[11], @XMM[2] | ||
1744 | movdqu 0x50($inp), @XMM[15] # IV | ||
1745 | pxor @XMM[12], @XMM[7] | ||
1746 | movdqu @XMM[0], 0x00($out) # write output | ||
1747 | movdqu @XMM[1], 0x10($out) | ||
1748 | movdqu @XMM[6], 0x20($out) | ||
1749 | movdqu @XMM[4], 0x30($out) | ||
1750 | movdqu @XMM[2], 0x40($out) | ||
1751 | movdqu @XMM[7], 0x50($out) | ||
1752 | jmp .Lcbc_dec_done | ||
1753 | .align 16 | ||
1754 | .Lcbc_dec_five: | ||
1755 | movdqa @XMM[15], 0x20(%rbp) # put aside IV | ||
1756 | call _bsaes_decrypt8 | ||
1757 | pxor 0x20(%rbp), @XMM[0] # ^= IV | ||
1758 | movdqu 0x00($inp), @XMM[8] # re-load input | ||
1759 | movdqu 0x10($inp), @XMM[9] | ||
1760 | pxor @XMM[8], @XMM[1] | ||
1761 | movdqu 0x20($inp), @XMM[10] | ||
1762 | pxor @XMM[9], @XMM[6] | ||
1763 | movdqu 0x30($inp), @XMM[11] | ||
1764 | pxor @XMM[10], @XMM[4] | ||
1765 | movdqu 0x40($inp), @XMM[15] # IV | ||
1766 | pxor @XMM[11], @XMM[2] | ||
1767 | movdqu @XMM[0], 0x00($out) # write output | ||
1768 | movdqu @XMM[1], 0x10($out) | ||
1769 | movdqu @XMM[6], 0x20($out) | ||
1770 | movdqu @XMM[4], 0x30($out) | ||
1771 | movdqu @XMM[2], 0x40($out) | ||
1772 | jmp .Lcbc_dec_done | ||
1773 | .align 16 | ||
1774 | .Lcbc_dec_four: | ||
1775 | movdqa @XMM[15], 0x20(%rbp) # put aside IV | ||
1776 | call _bsaes_decrypt8 | ||
1777 | pxor 0x20(%rbp), @XMM[0] # ^= IV | ||
1778 | movdqu 0x00($inp), @XMM[8] # re-load input | ||
1779 | movdqu 0x10($inp), @XMM[9] | ||
1780 | pxor @XMM[8], @XMM[1] | ||
1781 | movdqu 0x20($inp), @XMM[10] | ||
1782 | pxor @XMM[9], @XMM[6] | ||
1783 | movdqu 0x30($inp), @XMM[15] # IV | ||
1784 | pxor @XMM[10], @XMM[4] | ||
1785 | movdqu @XMM[0], 0x00($out) # write output | ||
1786 | movdqu @XMM[1], 0x10($out) | ||
1787 | movdqu @XMM[6], 0x20($out) | ||
1788 | movdqu @XMM[4], 0x30($out) | ||
1789 | jmp .Lcbc_dec_done | ||
1790 | .align 16 | ||
1791 | .Lcbc_dec_three: | ||
1792 | movdqa @XMM[15], 0x20(%rbp) # put aside IV | ||
1793 | call _bsaes_decrypt8 | ||
1794 | pxor 0x20(%rbp), @XMM[0] # ^= IV | ||
1795 | movdqu 0x00($inp), @XMM[8] # re-load input | ||
1796 | movdqu 0x10($inp), @XMM[9] | ||
1797 | pxor @XMM[8], @XMM[1] | ||
1798 | movdqu 0x20($inp), @XMM[15] # IV | ||
1799 | pxor @XMM[9], @XMM[6] | ||
1800 | movdqu @XMM[0], 0x00($out) # write output | ||
1801 | movdqu @XMM[1], 0x10($out) | ||
1802 | movdqu @XMM[6], 0x20($out) | ||
1803 | jmp .Lcbc_dec_done | ||
1804 | .align 16 | ||
1805 | .Lcbc_dec_two: | ||
1806 | movdqa @XMM[15], 0x20(%rbp) # put aside IV | ||
1807 | call _bsaes_decrypt8 | ||
1808 | pxor 0x20(%rbp), @XMM[0] # ^= IV | ||
1809 | movdqu 0x00($inp), @XMM[8] # re-load input | ||
1810 | movdqu 0x10($inp), @XMM[15] # IV | ||
1811 | pxor @XMM[8], @XMM[1] | ||
1812 | movdqu @XMM[0], 0x00($out) # write output | ||
1813 | movdqu @XMM[1], 0x10($out) | ||
1814 | jmp .Lcbc_dec_done | ||
1815 | .align 16 | ||
1816 | .Lcbc_dec_one: | ||
1817 | lea ($inp), $arg1 | ||
1818 | lea 0x20(%rbp), $arg2 # buffer output | ||
1819 | lea ($key), $arg3 | ||
1820 | call asm_AES_decrypt # doesn't touch %xmm | ||
1821 | pxor 0x20(%rbp), @XMM[15] # ^= IV | ||
1822 | movdqu @XMM[15], ($out) # write output | ||
1823 | movdqa @XMM[0], @XMM[15] # IV | ||
1824 | |||
1825 | .Lcbc_dec_done: | ||
1826 | movdqu @XMM[15], (%rbx) # return IV | ||
1827 | lea (%rsp), %rax | ||
1828 | pxor %xmm0, %xmm0 | ||
1829 | .Lcbc_dec_bzero: # wipe key schedule [if any] | ||
1830 | movdqa %xmm0, 0x00(%rax) | ||
1831 | movdqa %xmm0, 0x10(%rax) | ||
1832 | lea 0x20(%rax), %rax | ||
1833 | cmp %rax, %rbp | ||
1834 | ja .Lcbc_dec_bzero | ||
1835 | |||
1836 | lea (%rbp),%rsp # restore %rsp | ||
1837 | ___ | ||
1838 | $code.=<<___ if ($win64); | ||
1839 | movaps 0x40(%rbp), %xmm6 | ||
1840 | movaps 0x50(%rbp), %xmm7 | ||
1841 | movaps 0x60(%rbp), %xmm8 | ||
1842 | movaps 0x70(%rbp), %xmm9 | ||
1843 | movaps 0x80(%rbp), %xmm10 | ||
1844 | movaps 0x90(%rbp), %xmm11 | ||
1845 | movaps 0xa0(%rbp), %xmm12 | ||
1846 | movaps 0xb0(%rbp), %xmm13 | ||
1847 | movaps 0xc0(%rbp), %xmm14 | ||
1848 | movaps 0xd0(%rbp), %xmm15 | ||
1849 | lea 0xa0(%rbp), %rsp | ||
1850 | ___ | ||
1851 | $code.=<<___; | ||
1852 | mov 0x48(%rsp), %r15 | ||
1853 | mov 0x50(%rsp), %r14 | ||
1854 | mov 0x58(%rsp), %r13 | ||
1855 | mov 0x60(%rsp), %r12 | ||
1856 | mov 0x68(%rsp), %rbx | ||
1857 | mov 0x70(%rsp), %rax | ||
1858 | lea 0x78(%rsp), %rsp | ||
1859 | mov %rax, %rbp | ||
1860 | .Lcbc_dec_epilogue: | ||
1861 | ret | ||
1862 | .size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt | ||
1863 | |||
1864 | .globl bsaes_ctr32_encrypt_blocks | ||
1865 | .type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent | ||
1866 | .align 16 | ||
1867 | bsaes_ctr32_encrypt_blocks: | ||
1868 | _CET_ENDBR | ||
1869 | mov %rsp, %rax | ||
1870 | .Lctr_enc_prologue: | ||
1871 | push %rbp | ||
1872 | push %rbx | ||
1873 | push %r12 | ||
1874 | push %r13 | ||
1875 | push %r14 | ||
1876 | push %r15 | ||
1877 | lea -0x48(%rsp), %rsp | ||
1878 | ___ | ||
# Win64 only: the 5th argument arrives on the stack, and xmm6-xmm15 are
# non-volatile, so spill them before the body clobbers them.
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lctr_enc_body:
___
# Body of bsaes_ctr32_encrypt_blocks: 8 blocks at a time through the
# bit-sliced kernel; <8 blocks fall back to one-block asm_AES_encrypt.
# Only the low 32 bits of the counter are incremented (CTR32 semantics).
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	movdqu	($arg5), %xmm0		# load counter
	mov	240($arg4), %eax	# rounds
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key
	movdqa	%xmm0, 0x20(%rbp)	# copy counter
	cmp	\$8, $arg3
	jb	.Lctr_enc_short

	mov	%eax, %ebx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%ebx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	%xmm6,%xmm7		# fix up last round key
	movdqa	%xmm7,(%rax)		# save last round key

	movdqa	(%rsp), @XMM[9]		# load round0 key
	lea	.LADD1(%rip), %r11
	movdqa	0x20(%rbp), @XMM[0]	# counter copy
	movdqa	-0x20(%r11), @XMM[8]	# .LSWPUP
	pshufb	@XMM[8], @XMM[9]	# byte swap upper part
	pshufb	@XMM[8], @XMM[0]
	movdqa	@XMM[9], (%rsp)		# save adjusted round0 key
	jmp	.Lctr_enc_loop
.align	16
.Lctr_enc_loop:
	movdqa	@XMM[0], 0x20(%rbp)	# save counter
	movdqa	@XMM[0], @XMM[1]	# prepare 8 counter values
	movdqa	@XMM[0], @XMM[2]
	paddd	0x00(%r11), @XMM[1]	# .LADD1
	movdqa	@XMM[0], @XMM[3]
	paddd	0x10(%r11), @XMM[2]	# .LADD2
	movdqa	@XMM[0], @XMM[4]
	paddd	0x20(%r11), @XMM[3]	# .LADD3
	movdqa	@XMM[0], @XMM[5]
	paddd	0x30(%r11), @XMM[4]	# .LADD4
	movdqa	@XMM[0], @XMM[6]
	paddd	0x40(%r11), @XMM[5]	# .LADD5
	movdqa	@XMM[0], @XMM[7]
	paddd	0x50(%r11), @XMM[6]	# .LADD6
	paddd	0x60(%r11), @XMM[7]	# .LADD7

	# Borrow prologue from _bsaes_encrypt8 to use the opportunity
	# to flip byte order in 32-bit counter
	movdqa	(%rsp), @XMM[9]		# round 0 key
	lea	0x10(%rsp), %rax	# pass key schedule
	movdqa	-0x10(%r11), @XMM[8]	# .LSWPUPM0SR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	pshufb	@XMM[8], @XMM[0]
	pxor	@XMM[9], @XMM[2]
	pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[3]
	pshufb	@XMM[8], @XMM[2]
	pxor	@XMM[9], @XMM[4]
	pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[5]
	pshufb	@XMM[8], @XMM[4]
	pxor	@XMM[9], @XMM[6]
	pshufb	@XMM[8], @XMM[5]
	pxor	@XMM[9], @XMM[7]
	pshufb	@XMM[8], @XMM[6]
	lea	.LBS0(%rip), %r11	# constants table
	pshufb	@XMM[8], @XMM[7]
	mov	%ebx,%r10d		# pass rounds

	call	_bsaes_encrypt8_bitslice

	sub	\$8,$len
	jc	.Lctr_enc_loop_done

	movdqu	0x00($inp), @XMM[8]	# load input
	movdqu	0x10($inp), @XMM[9]
	movdqu	0x20($inp), @XMM[10]
	movdqu	0x30($inp), @XMM[11]
	movdqu	0x40($inp), @XMM[12]
	movdqu	0x50($inp), @XMM[13]
	movdqu	0x60($inp), @XMM[14]
	movdqu	0x70($inp), @XMM[15]
	lea	0x80($inp),$inp
	pxor	@XMM[0], @XMM[8]
	movdqa	0x20(%rbp), @XMM[0]	# load counter
	pxor	@XMM[9], @XMM[1]
	movdqu	@XMM[8], 0x00($out)	# write output
	pxor	@XMM[10], @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	@XMM[11], @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	@XMM[12], @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	@XMM[13], @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	pxor	@XMM[14], @XMM[2]
	movdqu	@XMM[7], 0x50($out)
	pxor	@XMM[15], @XMM[5]
	movdqu	@XMM[2], 0x60($out)
	lea	.LADD1(%rip), %r11
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	paddd	0x70(%r11), @XMM[0]	# .LADD8
	jnz	.Lctr_enc_loop		# flags still from "sub \$8,$len": only lea/SSE ops in between

	jmp	.Lctr_enc_done
.align	16
.Lctr_enc_loop_done:
	add	\$8, $len		# 1..7 blocks remain
	movdqu	0x00($inp), @XMM[8]	# load input
	pxor	@XMM[8], @XMM[0]
	movdqu	@XMM[0], 0x00($out)	# write output
	cmp	\$2,$len
	jb	.Lctr_enc_done
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[9], @XMM[1]
	movdqu	@XMM[1], 0x10($out)
	je	.Lctr_enc_done
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[10], @XMM[4]
	movdqu	@XMM[4], 0x20($out)
	cmp	\$4,$len
	jb	.Lctr_enc_done
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[11], @XMM[6]
	movdqu	@XMM[6], 0x30($out)
	je	.Lctr_enc_done
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[12], @XMM[3]
	movdqu	@XMM[3], 0x40($out)
	cmp	\$6,$len
	jb	.Lctr_enc_done
	movdqu	0x50($inp), @XMM[13]
	pxor	@XMM[13], @XMM[7]
	movdqu	@XMM[7], 0x50($out)
	je	.Lctr_enc_done
	movdqu	0x60($inp), @XMM[14]
	pxor	@XMM[14], @XMM[2]
	movdqu	@XMM[2], 0x60($out)
	jmp	.Lctr_enc_done

.align	16
.Lctr_enc_short:			# <8 blocks: one block per iteration via table-based AES
	lea	0x20(%rbp), $arg1
	lea	0x30(%rbp), $arg2
	lea	($key), $arg3
	call	asm_AES_encrypt
	movdqu	($inp), @XMM[1]
	lea	16($inp), $inp
	mov	0x2c(%rbp), %eax	# load 32-bit counter
	bswap	%eax
	pxor	0x30(%rbp), @XMM[1]
	inc	%eax			# increment
	movdqu	@XMM[1], ($out)
	bswap	%eax
	lea	16($out), $out
	mov	%eax, 0x2c(%rsp)	# save 32-bit counter; %rsp==%rbp on this path
	dec	$len
	jnz	.Lctr_enc_short

.Lctr_enc_done:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lctr_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lctr_enc_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
# Win64 only: restore non-volatile XMM registers spilled by the prologue.
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
# Common epilogue: restore callee-saved GPRs and return.
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax	# saved %rbp
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lctr_enc_epilogue:
	ret
.size	bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
___
######################################################################
# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
#	const AES_KEY *key1, const AES_KEY *key2,
#	const unsigned char iv[16]);
#
# Registers used by the XTS tweak arithmetic below.
my ($twmask,$twres,$twtmp)=@XMM[13..15];
# The 6th argument register name was declared as a 32-bit register
# (e.g. "%r9d"); strip the trailing "d" to get the 64-bit form, since
# $arg6 holds a pointer (the IV) in the XTS entry points.
$arg6=~s/d$//;
# bsaes_xts_encrypt entry: save callee-saved GPRs.
$code.=<<___;
.globl	bsaes_xts_encrypt
.type	bsaes_xts_encrypt,\@abi-omnipotent
.align	16
bsaes_xts_encrypt:
	_CET_ENDBR
	mov	%rsp, %rax
.Lxts_enc_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp), %rsp
___
# Win64 only: 5th/6th args come from the stack; spill non-volatile XMMs.
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull key2
	mov	0xa8(%rsp),$arg6	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lxts_enc_body:
___
# XTS-encrypt setup: encrypt the IV with key2 to get the initial tweak,
# convert key1 into the bit-sliced schedule, reserve stack for 8 tweaks,
# and enter the 8-block main loop (or the short path for <8 blocks).
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key

	lea	($arg6), $arg1
	lea	0x20(%rbp), $arg2
	lea	($arg5), $arg3
	call	asm_AES_encrypt		# generate initial tweak

	mov	240($key), %eax		# rounds
	mov	$len, %rbx		# backup $len

	mov	%eax, %edx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%edx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	%xmm6, %xmm7		# fix up last round key
	movdqa	%xmm7, (%rax)		# save last round key

	and	\$-16, $len		# whole blocks only; remainder handled by ciphertext stealing
	sub	\$0x80, %rsp		# place for tweak[8]
	movdqa	0x20(%rbp), @XMM[7]	# initial tweak

	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits

	sub	\$0x80, $len
	jc	.Lxts_enc_short
	jmp	.Lxts_enc_loop

.align	16
.Lxts_enc_loop:
___
# Emit tweak[0..6] generation for the 8-block encrypt loop: each step
# saves the current tweak, doubles it in GF(2^128) (shift + conditional
# xor of the reduction polynomial via $twmask), and interleaves the
# input loads / input^tweak xors of the previous iterations.
for ($i=0;$i<7;$i++) {
$code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
$code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
___
$code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
}
# Finish loading/xoring blocks 6 and 7, run the bit-sliced kernel, xor
# the outputs with the saved tweaks, store, and compute the next tweak.
# Note _bsaes_encrypt8 returns blocks in order 0,1,4,6,3,7,2,5.
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqu	0x70($inp), @XMM[8+7]
	lea	0x80($inp), $inp
	movdqa	@XMM[7], 0x70(%rsp)	# save tweak[7]
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	pxor	@XMM[8+7], @XMM[7]
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	pxor	0x60(%rsp), @XMM[2]
	movdqu	@XMM[7], 0x50($out)
	pxor	0x70(%rsp), @XMM[5]
	movdqu	@XMM[2], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]

	sub	\$0x80,$len
	jnc	.Lxts_enc_loop

.Lxts_enc_short:
	add	\$0x80, $len		# 0x10..0x70 bytes remain
	jz	.Lxts_enc_done
___
# Same tweak-generation unroll as the main loop, but after each input
# load compare $len and dispatch to the matching .Lxts_enc_$i tail
# (handles 1..7 remaining whole blocks).
for ($i=0;$i<7;$i++) {
$code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
$code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
	cmp	\$`0x10*$i`,$len
	je	.Lxts_enc_$i
___
$code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
}
# Tail handlers for 7 down to 1 remaining blocks, the ciphertext-stealing
# path for a non-multiple-of-16 length, and the key-schedule wipe. Each
# .Lxts_enc_N processes N blocks through _bsaes_encrypt8 (output order
# 0,1,4,6,3,7,2,5) and reloads tweak[N] for the next step; .Lxts_enc_1
# uses the one-block table-based cipher instead.
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]	# 7 blocks
	pxor	@XMM[8+5], @XMM[5]
	movdqa	@XMM[7], 0x70(%rsp)
	lea	0x70($inp), $inp
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	pxor	0x60(%rsp), @XMM[2]
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	lea	0x70($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_6:
	pxor	@XMM[8+4], @XMM[4]
	lea	0x60($inp), $inp
	pxor	@XMM[8+5], @XMM[5]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	lea	0x60($out), $out

	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_5:
	pxor	@XMM[8+3], @XMM[3]
	lea	0x50($inp), $inp
	pxor	@XMM[8+4], @XMM[4]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	lea	0x50($out), $out

	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_4:
	pxor	@XMM[8+2], @XMM[2]
	lea	0x40($inp), $inp
	pxor	@XMM[8+3], @XMM[3]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	lea	0x40($out), $out

	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_3:
	pxor	@XMM[8+1], @XMM[1]
	lea	0x30($inp), $inp
	pxor	@XMM[8+2], @XMM[2]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	lea	0x30($out), $out

	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_2:
	pxor	@XMM[8+0], @XMM[0]
	lea	0x20($inp), $inp
	pxor	@XMM[8+1], @XMM[1]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	lea	0x20($out), $out

	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_1:
	pxor	@XMM[0], @XMM[8]	# single block: table-based cipher, not worth bit-slicing
	lea	0x10($inp), $inp
	movdqa	@XMM[8], 0x20(%rbp)
	lea	0x20(%rbp), $arg1
	lea	0x20(%rbp), $arg2
	lea	($key), $arg3
	call	asm_AES_encrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
	#pxor	@XMM[8], @XMM[0]
	#lea	0x80(%rsp), %rax	# pass key schedule
	#mov	%edx, %r10d		# pass rounds
	#call	_bsaes_encrypt8
	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	movdqu	@XMM[0], 0x00($out)	# write output
	lea	0x10($out), $out

	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak

.Lxts_enc_done:
	and	\$15, %ebx		# tail bytes for ciphertext stealing
	jz	.Lxts_enc_ret
	mov	$out, %rdx

.Lxts_enc_steal:			# swap tail input bytes with last ciphertext block bytes
	movzb	($inp), %eax
	movzb	-16(%rdx), %ecx
	lea	1($inp), $inp
	mov	%al, -16(%rdx)
	mov	%cl, 0(%rdx)
	lea	1(%rdx), %rdx
	sub	\$1,%ebx
	jnz	.Lxts_enc_steal

	movdqu	-16($out), @XMM[0]	# re-encrypt stolen block with last tweak
	lea	0x20(%rbp), $arg1
	pxor	@XMM[7], @XMM[0]
	lea	0x20(%rbp), $arg2
	movdqa	@XMM[0], 0x20(%rbp)
	lea	($key), $arg3
	call	asm_AES_encrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[7]
	movdqu	@XMM[7], -16($out)

.Lxts_enc_ret:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lxts_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lxts_enc_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
# Win64 only: restore non-volatile XMM registers spilled by the prologue.
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
# xts_encrypt epilogue (restore GPRs), then bsaes_xts_decrypt entry.
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax	# saved %rbp
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lxts_enc_epilogue:
	ret
.size	bsaes_xts_encrypt,.-bsaes_xts_encrypt

.globl	bsaes_xts_decrypt
.type	bsaes_xts_decrypt,\@abi-omnipotent
.align	16
bsaes_xts_decrypt:
	_CET_ENDBR
	mov	%rsp, %rax
.Lxts_dec_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp), %rsp
___
# Win64 only: 5th/6th args come from the stack; spill non-volatile XMMs.
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull key2
	mov	0xa8(%rsp),$arg6	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lxts_dec_body:
___
# XTS-decrypt setup. The initial tweak is still produced by ENcrypting
# the IV with key2 (per the XTS spec). Unlike the encrypt path, the
# decrypt key schedule swaps first/last round keys. If the total length
# is not a multiple of 16, one whole block is held back so the
# ciphertext-stealing tail can use the second-to-last tweak.
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key

	lea	($arg6), $arg1
	lea	0x20(%rbp), $arg2
	lea	($arg5), $arg3
	call	asm_AES_encrypt		# generate initial tweak

	mov	240($key), %eax		# rounds
	mov	$len, %rbx		# backup $len

	mov	%eax, %edx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%edx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	(%rsp), %xmm7		# fix up round 0 key
	movdqa	%xmm6, (%rax)		# save last round key
	movdqa	%xmm7, (%rsp)

	xor	%eax, %eax		# if ($len%16) len-=16;
	and	\$-16, $len
	test	\$15, %ebx
	setnz	%al
	shl	\$4, %rax
	sub	%rax, $len

	sub	\$0x80, %rsp		# place for tweak[8]
	movdqa	0x20(%rbp), @XMM[7]	# initial tweak

	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits

	sub	\$0x80, $len
	jc	.Lxts_dec_short
	jmp	.Lxts_dec_loop

.align	16
.Lxts_dec_loop:
___
# Emit tweak[0..6] generation for the 8-block decrypt loop (identical
# GF(2^128) doubling to the encrypt side), interleaved with input loads
# and input^tweak xors from the previous iterations.
for ($i=0;$i<7;$i++) {
$code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
$code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
___
$code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
}
# 8-block decrypt loop body. Note _bsaes_decrypt8 returns blocks in
# order 0,1,6,4,2,7,3,5 — different from the encrypt kernel — hence the
# different register/tweak pairing below.
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqu	0x70($inp), @XMM[8+7]
	lea	0x80($inp), $inp
	movdqa	@XMM[7], 0x70(%rsp)	# save tweak[7]
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	pxor	@XMM[8+7], @XMM[7]
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[2], 0x40($out)
	pxor	0x60(%rsp), @XMM[3]
	movdqu	@XMM[7], 0x50($out)
	pxor	0x70(%rsp), @XMM[5]
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]

	sub	\$0x80,$len
	jnc	.Lxts_dec_loop

.Lxts_dec_short:
	add	\$0x80, $len		# 0x10..0x70 bytes remain
	jz	.Lxts_dec_done
___
# Same tweak-generation unroll, with per-count dispatch to the matching
# .Lxts_dec_$i tail (handles 1..7 remaining whole blocks).
for ($i=0;$i<7;$i++) {
$code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
$code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
	cmp	\$`0x10*$i`,$len
	je	.Lxts_dec_$i
___
$code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
}
2655 | $code.=<<___; | ||
2656 | movdqu 0x60($inp), @XMM[8+6] | ||
2657 | pxor @XMM[8+5], @XMM[5] | ||
2658 | movdqa @XMM[7], 0x70(%rsp) | ||
2659 | lea 0x70($inp), $inp | ||
2660 | pxor @XMM[8+6], @XMM[6] | ||
2661 | lea 0x80(%rsp), %rax # pass key schedule | ||
2662 | mov %edx, %r10d # pass rounds | ||
2663 | |||
2664 | call _bsaes_decrypt8 | ||
2665 | |||
2666 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
2667 | pxor 0x10(%rsp), @XMM[1] | ||
2668 | movdqu @XMM[0], 0x00($out) # write output | ||
2669 | pxor 0x20(%rsp), @XMM[6] | ||
2670 | movdqu @XMM[1], 0x10($out) | ||
2671 | pxor 0x30(%rsp), @XMM[4] | ||
2672 | movdqu @XMM[6], 0x20($out) | ||
2673 | pxor 0x40(%rsp), @XMM[2] | ||
2674 | movdqu @XMM[4], 0x30($out) | ||
2675 | pxor 0x50(%rsp), @XMM[7] | ||
2676 | movdqu @XMM[2], 0x40($out) | ||
2677 | pxor 0x60(%rsp), @XMM[3] | ||
2678 | movdqu @XMM[7], 0x50($out) | ||
2679 | movdqu @XMM[3], 0x60($out) | ||
2680 | lea 0x70($out), $out | ||
2681 | |||
2682 | movdqa 0x70(%rsp), @XMM[7] # next iteration tweak | ||
2683 | jmp .Lxts_dec_done | ||
2684 | .align 16 | ||
2685 | .Lxts_dec_6: | ||
2686 | pxor @XMM[8+4], @XMM[4] | ||
2687 | lea 0x60($inp), $inp | ||
2688 | pxor @XMM[8+5], @XMM[5] | ||
2689 | lea 0x80(%rsp), %rax # pass key schedule | ||
2690 | mov %edx, %r10d # pass rounds | ||
2691 | |||
2692 | call _bsaes_decrypt8 | ||
2693 | |||
2694 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
2695 | pxor 0x10(%rsp), @XMM[1] | ||
2696 | movdqu @XMM[0], 0x00($out) # write output | ||
2697 | pxor 0x20(%rsp), @XMM[6] | ||
2698 | movdqu @XMM[1], 0x10($out) | ||
2699 | pxor 0x30(%rsp), @XMM[4] | ||
2700 | movdqu @XMM[6], 0x20($out) | ||
2701 | pxor 0x40(%rsp), @XMM[2] | ||
2702 | movdqu @XMM[4], 0x30($out) | ||
2703 | pxor 0x50(%rsp), @XMM[7] | ||
2704 | movdqu @XMM[2], 0x40($out) | ||
2705 | movdqu @XMM[7], 0x50($out) | ||
2706 | lea 0x60($out), $out | ||
2707 | |||
2708 | movdqa 0x60(%rsp), @XMM[7] # next iteration tweak | ||
2709 | jmp .Lxts_dec_done | ||
2710 | .align 16 | ||
2711 | .Lxts_dec_5: | ||
2712 | pxor @XMM[8+3], @XMM[3] | ||
2713 | lea 0x50($inp), $inp | ||
2714 | pxor @XMM[8+4], @XMM[4] | ||
2715 | lea 0x80(%rsp), %rax # pass key schedule | ||
2716 | mov %edx, %r10d # pass rounds | ||
2717 | |||
2718 | call _bsaes_decrypt8 | ||
2719 | |||
2720 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
2721 | pxor 0x10(%rsp), @XMM[1] | ||
2722 | movdqu @XMM[0], 0x00($out) # write output | ||
2723 | pxor 0x20(%rsp), @XMM[6] | ||
2724 | movdqu @XMM[1], 0x10($out) | ||
2725 | pxor 0x30(%rsp), @XMM[4] | ||
2726 | movdqu @XMM[6], 0x20($out) | ||
2727 | pxor 0x40(%rsp), @XMM[2] | ||
2728 | movdqu @XMM[4], 0x30($out) | ||
2729 | movdqu @XMM[2], 0x40($out) | ||
2730 | lea 0x50($out), $out | ||
2731 | |||
2732 | movdqa 0x50(%rsp), @XMM[7] # next iteration tweak | ||
2733 | jmp .Lxts_dec_done | ||
2734 | .align 16 | ||
2735 | .Lxts_dec_4: | ||
2736 | pxor @XMM[8+2], @XMM[2] | ||
2737 | lea 0x40($inp), $inp | ||
2738 | pxor @XMM[8+3], @XMM[3] | ||
2739 | lea 0x80(%rsp), %rax # pass key schedule | ||
2740 | mov %edx, %r10d # pass rounds | ||
2741 | |||
2742 | call _bsaes_decrypt8 | ||
2743 | |||
2744 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
2745 | pxor 0x10(%rsp), @XMM[1] | ||
2746 | movdqu @XMM[0], 0x00($out) # write output | ||
2747 | pxor 0x20(%rsp), @XMM[6] | ||
2748 | movdqu @XMM[1], 0x10($out) | ||
2749 | pxor 0x30(%rsp), @XMM[4] | ||
2750 | movdqu @XMM[6], 0x20($out) | ||
2751 | movdqu @XMM[4], 0x30($out) | ||
2752 | lea 0x40($out), $out | ||
2753 | |||
2754 | movdqa 0x40(%rsp), @XMM[7] # next iteration tweak | ||
2755 | jmp .Lxts_dec_done | ||
2756 | .align 16 | ||
2757 | .Lxts_dec_3: | ||
2758 | pxor @XMM[8+1], @XMM[1] | ||
2759 | lea 0x30($inp), $inp | ||
2760 | pxor @XMM[8+2], @XMM[2] | ||
2761 | lea 0x80(%rsp), %rax # pass key schedule | ||
2762 | mov %edx, %r10d # pass rounds | ||
2763 | |||
2764 | call _bsaes_decrypt8 | ||
2765 | |||
2766 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
2767 | pxor 0x10(%rsp), @XMM[1] | ||
2768 | movdqu @XMM[0], 0x00($out) # write output | ||
2769 | pxor 0x20(%rsp), @XMM[6] | ||
2770 | movdqu @XMM[1], 0x10($out) | ||
2771 | movdqu @XMM[6], 0x20($out) | ||
2772 | lea 0x30($out), $out | ||
2773 | |||
2774 | movdqa 0x30(%rsp), @XMM[7] # next iteration tweak | ||
2775 | jmp .Lxts_dec_done | ||
2776 | .align 16 | ||
2777 | .Lxts_dec_2: | ||
2778 | pxor @XMM[8+0], @XMM[0] | ||
2779 | lea 0x20($inp), $inp | ||
2780 | pxor @XMM[8+1], @XMM[1] | ||
2781 | lea 0x80(%rsp), %rax # pass key schedule | ||
2782 | mov %edx, %r10d # pass rounds | ||
2783 | |||
2784 | call _bsaes_decrypt8 | ||
2785 | |||
2786 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
2787 | pxor 0x10(%rsp), @XMM[1] | ||
2788 | movdqu @XMM[0], 0x00($out) # write output | ||
2789 | movdqu @XMM[1], 0x10($out) | ||
2790 | lea 0x20($out), $out | ||
2791 | |||
2792 | movdqa 0x20(%rsp), @XMM[7] # next iteration tweak | ||
2793 | jmp .Lxts_dec_done | ||
2794 | .align 16 | ||
2795 | .Lxts_dec_1: | ||
2796 | pxor @XMM[0], @XMM[8] | ||
2797 | lea 0x10($inp), $inp | ||
2798 | movdqa @XMM[8], 0x20(%rbp) | ||
2799 | lea 0x20(%rbp), $arg1 | ||
2800 | lea 0x20(%rbp), $arg2 | ||
2801 | lea ($key), $arg3 | ||
2802 | call asm_AES_decrypt # doesn't touch %xmm | ||
2803 | pxor 0x20(%rbp), @XMM[0] # ^= tweak[] | ||
2804 | #pxor @XMM[8], @XMM[0] | ||
2805 | #lea 0x80(%rsp), %rax # pass key schedule | ||
2806 | #mov %edx, %r10d # pass rounds | ||
2807 | #call _bsaes_decrypt8 | ||
2808 | #pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
2809 | movdqu @XMM[0], 0x00($out) # write output | ||
2810 | lea 0x10($out), $out | ||
2811 | |||
2812 | movdqa 0x10(%rsp), @XMM[7] # next iteration tweak | ||
2813 | |||
2814 | .Lxts_dec_done: | ||
2815 | and \$15, %ebx | ||
2816 | jz .Lxts_dec_ret | ||
2817 | |||
2818 | pxor $twtmp, $twtmp | ||
2819 | movdqa .Lxts_magic(%rip), $twmask | ||
2820 | pcmpgtd @XMM[7], $twtmp | ||
2821 | pshufd \$0x13, $twtmp, $twres | ||
2822 | movdqa @XMM[7], @XMM[6] | ||
2823 | paddq @XMM[7], @XMM[7] # psllq 1,$tweak | ||
2824 | pand $twmask, $twres # isolate carry and residue | ||
2825 | movdqu ($inp), @XMM[0] | ||
2826 | pxor $twres, @XMM[7] | ||
2827 | |||
2828 | lea 0x20(%rbp), $arg1 | ||
2829 | pxor @XMM[7], @XMM[0] | ||
2830 | lea 0x20(%rbp), $arg2 | ||
2831 | movdqa @XMM[0], 0x20(%rbp) | ||
2832 | lea ($key), $arg3 | ||
2833 | call asm_AES_decrypt # doesn't touch %xmm | ||
2834 | pxor 0x20(%rbp), @XMM[7] | ||
2835 | mov $out, %rdx | ||
2836 | movdqu @XMM[7], ($out) | ||
2837 | |||
2838 | .Lxts_dec_steal: | ||
2839 | movzb 16($inp), %eax | ||
2840 | movzb (%rdx), %ecx | ||
2841 | lea 1($inp), $inp | ||
2842 | mov %al, (%rdx) | ||
2843 | mov %cl, 16(%rdx) | ||
2844 | lea 1(%rdx), %rdx | ||
2845 | sub \$1,%ebx | ||
2846 | jnz .Lxts_dec_steal | ||
2847 | |||
2848 | movdqu ($out), @XMM[0] | ||
2849 | lea 0x20(%rbp), $arg1 | ||
2850 | pxor @XMM[6], @XMM[0] | ||
2851 | lea 0x20(%rbp), $arg2 | ||
2852 | movdqa @XMM[0], 0x20(%rbp) | ||
2853 | lea ($key), $arg3 | ||
2854 | call asm_AES_decrypt # doesn't touch %xmm | ||
2855 | pxor 0x20(%rbp), @XMM[6] | ||
2856 | movdqu @XMM[6], ($out) | ||
2857 | |||
2858 | .Lxts_dec_ret: | ||
2859 | lea (%rsp), %rax | ||
2860 | pxor %xmm0, %xmm0 | ||
2861 | .Lxts_dec_bzero: # wipe key schedule [if any] | ||
2862 | movdqa %xmm0, 0x00(%rax) | ||
2863 | movdqa %xmm0, 0x10(%rax) | ||
2864 | lea 0x20(%rax), %rax | ||
2865 | cmp %rax, %rbp | ||
2866 | ja .Lxts_dec_bzero | ||
2867 | |||
2868 | lea (%rbp),%rsp # restore %rsp | ||
2869 | ___ | ||
# Win64 only: reload the callee-saved %xmm6-%xmm15 registers that the
# function prologue spilled into the frame at 0x40(%rbp)..0xd0(%rbp),
# then move %rsp past the xmm save area.  System V treats all %xmm
# registers as caller-saved, so nothing is emitted for other targets.
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
# Common epilogue: restore the callee-saved general-purpose registers
# saved by the prologue (note %rbp comes back via %rax and is written
# last, after %rsp has been rewound past the register save area).
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lxts_dec_epilogue:
	ret
.size	bsaes_xts_decrypt,.-bsaes_xts_decrypt
___
2896 | } | ||
# Read-only tables shared by all bsaes_* entry points: pshufb shuffle
# masks for (Inv)ShiftRows and the bit-slice/un-slice transforms, CTR
# counter-increment vectors .LADD1..8, the XTS GF(2^128) reduction
# constant 0x87 (.Lxts_magic), per-bit extraction masks (.Lmasks) and
# the AES affine constant 0x63 (.L63).
$code.=<<___;
.section .rodata
.type	_bsaes_const,\@object
.align	64
_bsaes_const:
.LM0ISR:	# InvShiftRows constants
	.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
.LISRM0:
	.quad	0x01040b0e0205080f, 0x0306090c00070a0d
.LISR:
	.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
.LBS0:		# bit-slice constants
	.quad	0x5555555555555555, 0x5555555555555555
.LBS1:
	.quad	0x3333333333333333, 0x3333333333333333
.LBS2:
	.quad	0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
.LSR:		# shiftrows constants
	.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
.LSRM0:
	.quad	0x0304090e00050a0f, 0x01060b0c0207080d
.LM0SR:
	.quad	0x0a0e02060f03070b, 0x0004080c05090d01
.LSWPUP:	# byte-swap upper dword
	.quad	0x0706050403020100, 0x0c0d0e0f0b0a0908
.LSWPUPM0SR:
	.quad	0x0a0d02060c03070b, 0x0004080f05090e01
.LADD1:		# counter increment constants
	.quad	0x0000000000000000, 0x0000000100000000
.LADD2:
	.quad	0x0000000000000000, 0x0000000200000000
.LADD3:
	.quad	0x0000000000000000, 0x0000000300000000
.LADD4:
	.quad	0x0000000000000000, 0x0000000400000000
.LADD5:
	.quad	0x0000000000000000, 0x0000000500000000
.LADD6:
	.quad	0x0000000000000000, 0x0000000600000000
.LADD7:
	.quad	0x0000000000000000, 0x0000000700000000
.LADD8:
	.quad	0x0000000000000000, 0x0000000800000000
.Lxts_magic:
	.long	0x87,0,1,0
.Lmasks:
	.quad	0x0101010101010101, 0x0101010101010101
	.quad	0x0202020202020202, 0x0202020202020202
	.quad	0x0404040404040404, 0x0404040404040404
	.quad	0x0808080808080808, 0x0808080808080808
.LM0:
	.quad	0x02060a0e03070b0f, 0x0004080c0105090d
.L63:
	.quad	0x6363636363636363, 0x6363636363636363
.align	64
.size	_bsaes_const,.-_bsaes_const
.text
___
2955 | |||
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# Win64 only: emit one shared structured-exception handler plus the
# .pdata/.xdata unwind tables that register it for every bsaes_* entry
# point.  The handler inspects context->Rip against the [prologue,
# epilogue) range recorded in HandlerData[] and, when the fault lies
# inside the function body, copies the saved %xmm and GP registers out
# of our stack frame back into the CONTEXT before chaining to
# RtlVirtualUnwind.
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	_CET_ENDBR
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	mov	160($context),%rax	# pull context->Rbp

	lea	0x40(%rax),%rsi		# %xmm save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq
	lea	0xa0(%rax),%rax		# adjust stack pointer

	mov	0x70(%rax),%rbp
	mov	0x68(%rax),%rbx
	mov	0x60(%rax),%r12
	mov	0x58(%rax),%r13
	mov	0x50(%rax),%r14
	mov	0x48(%rax),%r15
	lea	0x78(%rax),%rax		# adjust stack pointer
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lin_prologue:
	mov	%rax,152($context)	# restore context->Rsp

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
___
# The ECB entry points are only generated when $ecb is set elsewhere in
# this file, so their unwind-table entries are conditional to match.
$code.=<<___ if ($ecb);
	.rva	.Lecb_enc_prologue
	.rva	.Lecb_enc_epilogue
	.rva	.Lecb_enc_info

	.rva	.Lecb_dec_prologue
	.rva	.Lecb_dec_epilogue
	.rva	.Lecb_dec_info
___
# One .pdata triple (begin, end, unwind-info) per unconditional entry
# point: CBC decrypt, CTR encrypt, XTS encrypt and XTS decrypt.
$code.=<<___;
	.rva	.Lcbc_dec_prologue
	.rva	.Lcbc_dec_epilogue
	.rva	.Lcbc_dec_info

	.rva	.Lctr_enc_prologue
	.rva	.Lctr_enc_epilogue
	.rva	.Lctr_enc_info

	.rva	.Lxts_enc_prologue
	.rva	.Lxts_enc_epilogue
	.rva	.Lxts_enc_info

	.rva	.Lxts_dec_prologue
	.rva	.Lxts_dec_epilogue
	.rva	.Lxts_dec_info

.section	.xdata
.align	8
___
# .xdata records: flags byte 9 = UNW_FLAG_EHANDLER|UNW_FLAG_UHANDLER,
# followed by the handler address and the HandlerData[] body/epilogue
# labels that se_handler compares context->Rip against.
$code.=<<___ if ($ecb);
.Lecb_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lecb_enc_body,.Lecb_enc_epilogue	# HandlerData[]
.Lecb_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lecb_dec_body,.Lecb_dec_epilogue	# HandlerData[]
___
$code.=<<___;
.Lcbc_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lcbc_dec_body,.Lcbc_dec_epilogue	# HandlerData[]
.Lctr_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lctr_enc_body,.Lctr_enc_epilogue	# HandlerData[]
.Lxts_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lxts_enc_body,.Lxts_enc_epilogue	# HandlerData[]
.Lxts_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
___
}
3118 | |||
# Resolve compile-time arithmetic: every backtick-quoted expression left
# in the accumulated text (e.g. \$`1232/8` above) is replaced by its
# eval()ed value before the assembly is emitted.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

# Emit the generated assembly on STDOUT (the build system pipes this
# through the assembler or captures it into a .S file).
print $code;

# Check the close: write errors on a buffered stream (disk full, broken
# pipe) only surface at close time, and a silently truncated assembly
# file must fail the build rather than be assembled.
close STDOUT or die "error closing STDOUT: $!";