diff options
Diffstat (limited to 'src/lib/libcrypto/aes/asm/aesni-x86_64.pl')
-rw-r--r-- | src/lib/libcrypto/aes/asm/aesni-x86_64.pl | 3080 |
1 files changed, 0 insertions, 3080 deletions
diff --git a/src/lib/libcrypto/aes/asm/aesni-x86_64.pl b/src/lib/libcrypto/aes/asm/aesni-x86_64.pl deleted file mode 100644 index 441524036a..0000000000 --- a/src/lib/libcrypto/aes/asm/aesni-x86_64.pl +++ /dev/null | |||
@@ -1,3080 +0,0 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | # | ||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | # | ||
10 | # This module implements support for Intel AES-NI extension. In | ||
11 | # OpenSSL context it's used with Intel engine, but can also be used as | ||
12 | # drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for | ||
13 | # details]. | ||
14 | # | ||
15 | # Performance. | ||
16 | # | ||
17 | # Given aes(enc|dec) instructions' latency asymptotic performance for | ||
18 | # non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte | ||
19 | # processed with 128-bit key. And given their throughput asymptotic | ||
20 | # performance for parallelizable modes is 1.25 cycles per byte. Being | ||
21 | # asymptotic limit it's not something you commonly achieve in reality, | ||
22 | # but how close does one get? Below are results collected for | ||
23 | # different modes and block sizes. Pairs of numbers are for en-/ | |
24 | # decryption. | ||
25 | # | ||
26 | # 16-byte 64-byte 256-byte 1-KB 8-KB | ||
27 | # ECB 4.25/4.25 1.38/1.38 1.28/1.28 1.26/1.26 1.26/1.26 | ||
28 | # CTR 5.42/5.42 1.92/1.92 1.44/1.44 1.28/1.28 1.26/1.26 | ||
29 | # CBC 4.38/4.43 4.15/1.43 4.07/1.32 4.07/1.29 4.06/1.28 | ||
30 | # CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07 | ||
31 | # OFB 5.42/5.42 4.64/4.64 4.44/4.44 4.39/4.39 4.38/4.38 | ||
32 | # CFB 5.73/5.85 5.56/5.62 5.48/5.56 5.47/5.55 5.47/5.55 | ||
33 | # | ||
34 | # ECB, CTR, CBC and CCM results are free from EVP overhead. This means | ||
35 | # that otherwise used 'openssl speed -evp aes-128-??? -engine aesni | ||
36 | # [-decrypt]' will exhibit 10-15% worse results for smaller blocks. | ||
37 | # The results were collected with specially crafted speed.c benchmark | ||
38 | # in order to compare them with results reported in "Intel Advanced | ||
39 | # Encryption Standard (AES) New Instruction Set" White Paper Revision | ||
40 | # 3.0 dated May 2010. All above results are consistently better. This | ||
41 | # module also provides better performance for block sizes smaller than | ||
42 | # 128 bytes in points *not* represented in the above table. | ||
43 | # | ||
44 | # Looking at the results for 8-KB buffer. | ||
45 | # | ||
46 | # CFB and OFB results are far from the limit, because implementation | ||
47 | # uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on | ||
48 | # single-block aesni_encrypt, which is not the most optimal way to go. | ||
49 | # CBC encrypt result is unexpectedly high and there is no documented | ||
50 | # explanation for it. Seemingly there is a small penalty for feeding | ||
51 | # the result back to AES unit the way it's done in CBC mode. There is | ||
52 | # nothing one can do and the result appears optimal. CCM result is | ||
53 | # identical to CBC, because CBC-MAC is essentially CBC encrypt without | ||
54 | # saving output. CCM CTR "stays invisible," because it's neatly | ||
55 | # interleaved with CBC-MAC. This provides ~30% improvement over | ||
56 | # "straightforward" CCM implementation with CTR and CBC-MAC performed | |
57 | # disjointly. Parallelizable modes practically achieve the theoretical | ||
58 | # limit. | ||
59 | # | ||
60 | # Looking at how results vary with buffer size. | ||
61 | # | ||
62 | # Curves are practically saturated at 1-KB buffer size. In most cases | ||
63 | # "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one. | ||
64 | # CTR curve doesn't follow this pattern and is "slowest" changing one | ||
65 | # with "256-byte" result being 87% of "8-KB." This is because overhead | ||
66 | # in CTR mode is most computationally intensive. Small-block CCM | ||
67 | # decrypt is slower than encrypt, because first CTR and last CBC-MAC | ||
68 | # iterations can't be interleaved. | ||
69 | # | ||
70 | # Results for 192- and 256-bit keys. | ||
71 | # | ||
72 | # EVP-free results were observed to scale perfectly with number of | ||
73 | # rounds for larger block sizes, i.e. 192-bit result being 10/12 times | ||
74 | # lower and 256-bit one - 10/14. Well, in CBC encrypt case differences | ||
75 | # are a tad smaller, because the above mentioned penalty biases all | ||
76 | # results by same constant value. In similar way function call | ||
77 | # overhead affects small-block performance, as well as OFB and CFB | ||
78 | # results. Differences are not large, most common coefficients are | ||
79 | # 10/11.7 and 10/13.4 (as opposed to 10/12.0 and 10/14.0), but one | |
80 | # can observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)... | |
81 | |||
82 | # January 2011 | ||
83 | # | ||
84 | # While Westmere processor features 6 cycles latency for aes[enc|dec] | ||
85 | # instructions, which can be scheduled every second cycle, Sandy | ||
86 | # Bridge spends 8 cycles per instruction, but it can schedule them | ||
87 | # every cycle. This means that code targeting Westmere would perform | ||
88 | # suboptimally on Sandy Bridge. Therefore this update. | ||
89 | # | ||
90 | # In addition, non-parallelizable CBC encrypt (as well as CCM) is | ||
91 | # optimized. Relative improvement might appear modest, 8% on Westmere, | ||
92 | # but in absolute terms it's 3.77 cycles per byte encrypted with | ||
93 | # 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers | ||
94 | # should be compared to asymptotic limits of 3.75 for Westmere and | ||
95 | # 5.00 for Sandy Bridge. Actually, the fact that they get this close | ||
96 | # to asymptotic limits is quite amazing. Indeed, the limit is | ||
97 | # calculated as latency times number of rounds, 10 for 128-bit key, | ||
98 | # and divided by 16, the number of bytes in block, or in other words | ||
99 | # it accounts *solely* for aesenc instructions. But there are extra | ||
100 | # instructions, and numbers so close to the asymptotic limits mean | ||
101 | # that it's as if it takes as little as *one* additional cycle to | ||
102 | # execute all of them. How is it possible? It is possible thanks to | ||
103 | # out-of-order execution logic, which manages to overlap post- | ||
104 | # processing of previous block, things like saving the output, with | ||
105 | # actual encryption of current block, as well as pre-processing of | ||
106 | # current block, things like fetching input and xor-ing it with | ||
107 | # 0-round element of the key schedule, with actual encryption of | ||
108 | # previous block. Keep this in mind... | ||
109 | # | ||
110 | # For parallelizable modes, such as ECB, CBC decrypt, CTR, higher | ||
111 | # performance is achieved by interleaving instructions working on | ||
112 | # independent blocks. In which case asymptotic limit for such modes | ||
113 | # can be obtained by dividing above mentioned numbers by AES | ||
114 | # instructions' interleave factor. Westmere can execute at most 3 | ||
115 | # instructions at a time, meaning that optimal interleave factor is 3, | ||
116 | # and that's where the "magic" number of 1.25 comes from. "Optimal | |
117 | # interleave factor" means that increase of interleave factor does | ||
118 | # not improve performance. The formula has proven to reflect reality | ||
119 | # pretty well on Westmere... Sandy Bridge on the other hand can | ||
120 | # execute up to 8 AES instructions at a time, so how does varying | ||
121 | # interleave factor affect the performance? Here is table for ECB | ||
122 | # (numbers are cycles per byte processed with 128-bit key): | ||
123 | # | ||
124 | # instruction interleave factor 3x 6x 8x | ||
125 | # theoretical asymptotic limit 1.67 0.83 0.625 | ||
126 | # measured performance for 8KB block 1.05 0.86 0.84 | ||
127 | # | ||
128 | # "as if" interleave factor 4.7x 5.8x 6.0x | ||
129 | # | ||
130 | # Further data for other parallelizable modes: | ||
131 | # | ||
132 | # CBC decrypt 1.16 0.93 0.93 | ||
133 | # CTR 1.14 0.91 n/a | ||
134 | # | ||
135 | # Well, given 3x column it's probably inappropriate to call the limit | ||
136 | # asymptotic, if it can be surpassed, isn't it? What happens there? | ||
137 | # Rewind to CBC paragraph for the answer. Yes, out-of-order execution | ||
138 | # magic is responsible for this. Processor overlaps not only the | ||
139 | # additional instructions with AES ones, but even AES instructions | ||
140 | # processing adjacent triplets of independent blocks. In the 6x case | ||
141 | # additional instructions still claim disproportionally small amount | ||
142 | # of additional cycles, but in 8x case number of instructions must be | ||
143 | # a tad too high for out-of-order logic to cope with, and AES unit | ||
144 | # remains underutilized... As you can see 8x interleave is hardly | ||
145 | # justifiable, so there is no need to feel bad that 32-bit aesni-x86.pl | |
146 | # utilizes 6x interleave because of limited register bank capacity. | |
147 | # | ||
148 | # Higher interleave factors do have negative impact on Westmere | ||
149 | # performance. While for ECB mode it's negligible ~1.5%, other | ||
150 | # parallelizables perform ~5% worse, which is outweighed by ~25% | ||
151 | # improvement on Sandy Bridge. To balance regression on Westmere | ||
152 | # CTR mode was implemented with 6x aesenc interleave factor. | ||
153 | |||
154 | # April 2011 | ||
155 | # | ||
156 | # Add aesni_xts_[en|de]crypt. Westmere spends 1.33 cycles processing | ||
157 | # one byte out of 8KB with 128-bit key, Sandy Bridge - 0.97. Just like | ||
158 | # in CTR mode AES instruction interleave factor was chosen to be 6x. | ||
159 | |||
160 | $PREFIX="aesni"; # if $PREFIX is set to "AES", the script | |
161 | # generates drop-in replacement for | |
162 | # crypto/aes/asm/aes-x86_64.pl:-) | |
163 | |
164 | $flavour = shift; # 1st argument: flavour, passed on to x86_64-xlate.pl | |
165 | $output = shift; # 2nd argument: output file name | |
166 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } # a first argument containing a dot is taken as the output file | |
167 | |
168 | $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); # Win64 is selected by flavour or by a .asm output name | |
169 | |
170 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; # $dir = directory part of this script's own path | |
171 | ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | |
172 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | |
173 | die "can't locate x86_64-xlate.pl"; | |
174 | |
175 | open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!"; # fail loudly instead of printing into an unopened handle | |
176 | *STDOUT=*OUT; # everything printed to STDOUT from here on is piped through the translator | |
177 | |||
178 | $movkey = $PREFIX eq "aesni" ? "movups" : "movups"; | |
179 | @_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # first four argument registers, Win64 order | |
180 | ("%rdi","%rsi","%rdx","%rcx"); # first four argument registers, Unix order | |
181 | |
182 | $code=".text\n"; | |
183 | |
184 | $rounds="%eax"; # round counter; input to and changed by aesni_[en|de]cryptN !!! | |
185 | # this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ... | |
186 | $inp="%rdi"; | |
187 | $out="%rsi"; | |
188 | $len="%rdx"; | |
189 | $key="%rcx"; # key schedule pointer; input to and changed by aesni_[en|de]cryptN !!! | |
190 | $ivp="%r8"; # 5th argument register: cbc, ctr, ... (ecb tests %r8d as its 5th argument instead) | |
191 | |
192 | $rnds_="%r10d"; # backup copy for $rounds | |
193 | $key_="%r11"; # backup copy for $key | |
194 | |
195 | # %xmm register layout: two round-key registers, then up to eight data blocks | |
196 | $rndkey0="%xmm0"; $rndkey1="%xmm1"; | |
197 | $inout0="%xmm2"; $inout1="%xmm3"; | |
198 | $inout2="%xmm4"; $inout3="%xmm5"; | |
199 | $inout4="%xmm6"; $inout5="%xmm7"; | |
200 | $inout6="%xmm8"; $inout7="%xmm9"; | |
201 | |
202 | $in2="%xmm6"; $in1="%xmm7"; # used in CBC decrypt, CTR, ...; NB these four names alias $inout4-$inout7 (same %xmm6-%xmm9) | |
203 | $in0="%xmm8"; $iv="%xmm9"; | |
204 | |||
205 | # Inline version of internal aesni_[en|de]crypt1: aesni_generate1($p,$key,$rounds[,$inout[,$ivec]]) appends a single-block aes$p sequence to $code; $p is "enc" or "dec" at the visible call sites. | |
206 | # $inout defaults to $inout0. If $ivec is given, round key 0 is xor-ed into $ivec and $ivec into $inout; otherwise round key 0 goes straight into $inout. The emitted code advances $key and counts $rounds down; every call bumps $sn so the .Loop_ label stays unique. | |
207 | # Why folded loop? Because aes[enc|dec] is slow enough to accommodate | |
208 | # cycles which take care of loop variables... | |
209 | { my $sn; | |
210 | sub aesni_generate1 { | |
211 | my ($p,$key,$rounds,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout)); | |
212 | ++$sn; | |
213 | $code.=<<___; | |
214 | $movkey ($key),$rndkey0 | |
215 | $movkey 16($key),$rndkey1 | |
216 | ___ | |
217 | $code.=<<___ if (defined($ivec)); | |
218 | xorps $rndkey0,$ivec | |
219 | lea 32($key),$key | |
220 | xorps $ivec,$inout | |
221 | ___ | |
222 | $code.=<<___ if (!defined($ivec)); | |
223 | lea 32($key),$key | |
224 | xorps $rndkey0,$inout | |
225 | ___ | |
226 | $code.=<<___; | |
227 | .Loop_${p}1_$sn: | |
228 | aes${p} $rndkey1,$inout | |
229 | dec $rounds | |
230 | $movkey ($key),$rndkey1 | |
231 | lea 16($key),$key | |
232 | jnz .Loop_${p}1_$sn # loop body is 16 bytes | |
233 | aes${p}last $rndkey1,$inout | |
234 | ___ | |
235 | }} | |
236 | # void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key); | |
237 | # Each entry point loads one 16-byte block from inp, reads the round count from offset 240 of key, runs the inline aes(enc|dec) sequence and stores the block to out. Registers come from @_4args, so the Win64/Unix choice is made when this script runs. | |
238 | { my ($inp,$out,$key) = @_4args; | |
239 | |
240 | $code.=<<___; | |
241 | .globl ${PREFIX}_encrypt | |
242 | .type ${PREFIX}_encrypt,\@abi-omnipotent | |
243 | .align 16 | |
244 | ${PREFIX}_encrypt: | |
245 | _CET_ENDBR | |
246 | movups ($inp),$inout0 # load input | |
247 | mov 240($key),$rounds # key->rounds | |
248 | ___ | |
249 | &aesni_generate1("enc",$key,$rounds); | |
250 | $code.=<<___; | |
251 | movups $inout0,($out) # output | |
252 | ret | |
253 | .size ${PREFIX}_encrypt,.-${PREFIX}_encrypt | |
254 | |
255 | .globl ${PREFIX}_decrypt | |
256 | .type ${PREFIX}_decrypt,\@abi-omnipotent | |
257 | .align 16 | |
258 | ${PREFIX}_decrypt: | |
259 | _CET_ENDBR | |
260 | movups ($inp),$inout0 # load input | |
261 | mov 240($key),$rounds # key->rounds | |
262 | ___ | |
263 | &aesni_generate1("dec",$key,$rounds); | |
264 | $code.=<<___; | |
265 | movups $inout0,($out) # output | |
266 | ret | |
267 | .size ${PREFIX}_decrypt, .-${PREFIX}_decrypt | |
268 | ___ | |
269 | } | |
270 | |||
271 | # _aesni_[en|de]cryptN are private interfaces, N denotes interleave | |
272 | # factor. Why were 3x subroutines originally used in loops? Even though | |
273 | # aes[enc|dec] latency was originally 6, it could be scheduled only | |
274 | # every *2nd* cycle. Thus 3x interleave was the one providing optimal | |
275 | # utilization, i.e. when subroutine's throughput is virtually same as | |
276 | # of non-interleaved subroutine [for number of input blocks up to 3]. | |
277 | # This is why it makes no sense to implement 2x subroutine. | |
278 | # aes[enc|dec] latency in next processor generation is 8, but the | |
279 | # instructions can be scheduled every cycle. Optimal interleave for | |
280 | # new processor is therefore 8x... | |
281 | sub aesni_generate3 { | |
282 | my $dir=shift; | |
283 | # Emits _aesni_${dir}rypt3 ($dir is "enc" or "dec" at the call sites). As already mentioned it takes in $key and $rounds, which are *not* | |
284 | # preserved ($rounds is halved and counted down, $key walks the key schedule). $inout[0-2] is cipher/clear text, processed in place... | |
285 | $code.=<<___; | |
286 | .type _aesni_${dir}rypt3,\@abi-omnipotent | |
287 | .align 16 | |
288 | _aesni_${dir}rypt3: | |
289 | _CET_ENDBR | |
290 | $movkey ($key),$rndkey0 | |
291 | shr \$1,$rounds | |
292 | $movkey 16($key),$rndkey1 | |
293 | lea 32($key),$key | |
294 | xorps $rndkey0,$inout0 | |
295 | xorps $rndkey0,$inout1 | |
296 | xorps $rndkey0,$inout2 | |
297 | $movkey ($key),$rndkey0 | |
298 | |
299 | .L${dir}_loop3: | |
300 | aes${dir} $rndkey1,$inout0 | |
301 | aes${dir} $rndkey1,$inout1 | |
302 | dec $rounds | |
303 | aes${dir} $rndkey1,$inout2 | |
304 | $movkey 16($key),$rndkey1 | |
305 | aes${dir} $rndkey0,$inout0 | |
306 | aes${dir} $rndkey0,$inout1 | |
307 | lea 32($key),$key | |
308 | aes${dir} $rndkey0,$inout2 | |
309 | $movkey ($key),$rndkey0 | |
310 | jnz .L${dir}_loop3 | |
311 | |
312 | aes${dir} $rndkey1,$inout0 | |
313 | aes${dir} $rndkey1,$inout1 | |
314 | aes${dir} $rndkey1,$inout2 | |
315 | aes${dir}last $rndkey0,$inout0 | |
316 | aes${dir}last $rndkey0,$inout1 | |
317 | aes${dir}last $rndkey0,$inout2 | |
318 | ret | |
319 | .size _aesni_${dir}rypt3,.-_aesni_${dir}rypt3 | |
320 | ___ | |
321 | } | |
322 | # 4x interleave is implemented to improve small block performance, | |
323 | # most notably [and naturally] 4 blocks by ~30%. One can argue that one | |
324 | # should have implemented 5x as well, but improvement would be <20%, | |
325 | # so it's not worth it... | |
326 | sub aesni_generate4 { | |
327 | my $dir=shift; | |
328 | # Emits _aesni_${dir}rypt4 ($dir is "enc" or "dec" at the call sites). As already mentioned it takes in $key and $rounds, which are *not* | |
329 | # preserved ($rounds is halved and counted down, $key walks the key schedule). $inout[0-3] is cipher/clear text, processed in place... | |
330 | $code.=<<___; | |
331 | .type _aesni_${dir}rypt4,\@abi-omnipotent | |
332 | .align 16 | |
333 | _aesni_${dir}rypt4: | |
334 | _CET_ENDBR | |
335 | $movkey ($key),$rndkey0 | |
336 | shr \$1,$rounds | |
337 | $movkey 16($key),$rndkey1 | |
338 | lea 32($key),$key | |
339 | xorps $rndkey0,$inout0 | |
340 | xorps $rndkey0,$inout1 | |
341 | xorps $rndkey0,$inout2 | |
342 | xorps $rndkey0,$inout3 | |
343 | $movkey ($key),$rndkey0 | |
344 | |
345 | .L${dir}_loop4: | |
346 | aes${dir} $rndkey1,$inout0 | |
347 | aes${dir} $rndkey1,$inout1 | |
348 | dec $rounds | |
349 | aes${dir} $rndkey1,$inout2 | |
350 | aes${dir} $rndkey1,$inout3 | |
351 | $movkey 16($key),$rndkey1 | |
352 | aes${dir} $rndkey0,$inout0 | |
353 | aes${dir} $rndkey0,$inout1 | |
354 | lea 32($key),$key | |
355 | aes${dir} $rndkey0,$inout2 | |
356 | aes${dir} $rndkey0,$inout3 | |
357 | $movkey ($key),$rndkey0 | |
358 | jnz .L${dir}_loop4 | |
359 | |
360 | aes${dir} $rndkey1,$inout0 | |
361 | aes${dir} $rndkey1,$inout1 | |
362 | aes${dir} $rndkey1,$inout2 | |
363 | aes${dir} $rndkey1,$inout3 | |
364 | aes${dir}last $rndkey0,$inout0 | |
365 | aes${dir}last $rndkey0,$inout1 | |
366 | aes${dir}last $rndkey0,$inout2 | |
367 | aes${dir}last $rndkey0,$inout3 | |
368 | ret | |
369 | .size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4 | |
370 | ___ | |
371 | } | |
372 | sub aesni_generate6 { | |
373 | my $dir=shift; | |
374 | # Emits _aesni_${dir}rypt6 ($dir is "enc" or "dec" at the call sites). As already mentioned it takes in $key and $rounds, which are *not* | |
375 | # preserved. $inout[0-5] is cipher/clear text... The round-0 xor of blocks 1-5 is interleaved with the first aes${dir} round, which is why the loop is entered at .L${dir}_loop6_enter. | |
376 | $code.=<<___; | |
377 | .type _aesni_${dir}rypt6,\@abi-omnipotent | |
378 | .align 16 | |
379 | _aesni_${dir}rypt6: | |
380 | _CET_ENDBR | |
381 | $movkey ($key),$rndkey0 | |
382 | shr \$1,$rounds | |
383 | $movkey 16($key),$rndkey1 | |
384 | lea 32($key),$key | |
385 | xorps $rndkey0,$inout0 | |
386 | pxor $rndkey0,$inout1 | |
387 | aes${dir} $rndkey1,$inout0 | |
388 | pxor $rndkey0,$inout2 | |
389 | aes${dir} $rndkey1,$inout1 | |
390 | pxor $rndkey0,$inout3 | |
391 | aes${dir} $rndkey1,$inout2 | |
392 | pxor $rndkey0,$inout4 | |
393 | aes${dir} $rndkey1,$inout3 | |
394 | pxor $rndkey0,$inout5 | |
395 | dec $rounds | |
396 | aes${dir} $rndkey1,$inout4 | |
397 | $movkey ($key),$rndkey0 | |
398 | aes${dir} $rndkey1,$inout5 | |
399 | jmp .L${dir}_loop6_enter | |
400 | .align 16 | |
401 | .L${dir}_loop6: | |
402 | aes${dir} $rndkey1,$inout0 | |
403 | aes${dir} $rndkey1,$inout1 | |
404 | dec $rounds | |
405 | aes${dir} $rndkey1,$inout2 | |
406 | aes${dir} $rndkey1,$inout3 | |
407 | aes${dir} $rndkey1,$inout4 | |
408 | aes${dir} $rndkey1,$inout5 | |
409 | .L${dir}_loop6_enter: # happens to be 16-byte aligned | |
410 | $movkey 16($key),$rndkey1 | |
411 | aes${dir} $rndkey0,$inout0 | |
412 | aes${dir} $rndkey0,$inout1 | |
413 | lea 32($key),$key | |
414 | aes${dir} $rndkey0,$inout2 | |
415 | aes${dir} $rndkey0,$inout3 | |
416 | aes${dir} $rndkey0,$inout4 | |
417 | aes${dir} $rndkey0,$inout5 | |
418 | $movkey ($key),$rndkey0 | |
419 | jnz .L${dir}_loop6 | |
420 | |
421 | aes${dir} $rndkey1,$inout0 | |
422 | aes${dir} $rndkey1,$inout1 | |
423 | aes${dir} $rndkey1,$inout2 | |
424 | aes${dir} $rndkey1,$inout3 | |
425 | aes${dir} $rndkey1,$inout4 | |
426 | aes${dir} $rndkey1,$inout5 | |
427 | aes${dir}last $rndkey0,$inout0 | |
428 | aes${dir}last $rndkey0,$inout1 | |
429 | aes${dir}last $rndkey0,$inout2 | |
430 | aes${dir}last $rndkey0,$inout3 | |
431 | aes${dir}last $rndkey0,$inout4 | |
432 | aes${dir}last $rndkey0,$inout5 | |
433 | ret | |
434 | .size _aesni_${dir}rypt6,.-_aesni_${dir}rypt6 | |
435 | ___ | |
436 | } | |
437 | sub aesni_generate8 { | |
438 | my $dir=shift; | |
439 | # Emits _aesni_${dir}rypt8 ($dir is "enc" or "dec" at the call sites). As already mentioned it takes in $key and $rounds, which are *not* | |
440 | # preserved. $inout[0-7] is cipher/clear text... The round-0 xor is interleaved with the first aes${dir} round, and both round-key registers are reloaded before the jump to .L${dir}_loop8_enter. | |
441 | $code.=<<___; | |
442 | .type _aesni_${dir}rypt8,\@abi-omnipotent | |
443 | .align 16 | |
444 | _aesni_${dir}rypt8: | |
445 | _CET_ENDBR | |
446 | $movkey ($key),$rndkey0 | |
447 | shr \$1,$rounds | |
448 | $movkey 16($key),$rndkey1 | |
449 | lea 32($key),$key | |
450 | xorps $rndkey0,$inout0 | |
451 | xorps $rndkey0,$inout1 | |
452 | aes${dir} $rndkey1,$inout0 | |
453 | pxor $rndkey0,$inout2 | |
454 | aes${dir} $rndkey1,$inout1 | |
455 | pxor $rndkey0,$inout3 | |
456 | aes${dir} $rndkey1,$inout2 | |
457 | pxor $rndkey0,$inout4 | |
458 | aes${dir} $rndkey1,$inout3 | |
459 | pxor $rndkey0,$inout5 | |
460 | dec $rounds | |
461 | aes${dir} $rndkey1,$inout4 | |
462 | pxor $rndkey0,$inout6 | |
463 | aes${dir} $rndkey1,$inout5 | |
464 | pxor $rndkey0,$inout7 | |
465 | $movkey ($key),$rndkey0 | |
466 | aes${dir} $rndkey1,$inout6 | |
467 | aes${dir} $rndkey1,$inout7 | |
468 | $movkey 16($key),$rndkey1 | |
469 | jmp .L${dir}_loop8_enter | |
470 | .align 16 | |
471 | .L${dir}_loop8: | |
472 | aes${dir} $rndkey1,$inout0 | |
473 | aes${dir} $rndkey1,$inout1 | |
474 | dec $rounds | |
475 | aes${dir} $rndkey1,$inout2 | |
476 | aes${dir} $rndkey1,$inout3 | |
477 | aes${dir} $rndkey1,$inout4 | |
478 | aes${dir} $rndkey1,$inout5 | |
479 | aes${dir} $rndkey1,$inout6 | |
480 | aes${dir} $rndkey1,$inout7 | |
481 | $movkey 16($key),$rndkey1 | |
482 | .L${dir}_loop8_enter: # happens to be 16-byte aligned | |
483 | aes${dir} $rndkey0,$inout0 | |
484 | aes${dir} $rndkey0,$inout1 | |
485 | lea 32($key),$key | |
486 | aes${dir} $rndkey0,$inout2 | |
487 | aes${dir} $rndkey0,$inout3 | |
488 | aes${dir} $rndkey0,$inout4 | |
489 | aes${dir} $rndkey0,$inout5 | |
490 | aes${dir} $rndkey0,$inout6 | |
491 | aes${dir} $rndkey0,$inout7 | |
492 | $movkey ($key),$rndkey0 | |
493 | jnz .L${dir}_loop8 | |
494 | |
495 | aes${dir} $rndkey1,$inout0 | |
496 | aes${dir} $rndkey1,$inout1 | |
497 | aes${dir} $rndkey1,$inout2 | |
498 | aes${dir} $rndkey1,$inout3 | |
499 | aes${dir} $rndkey1,$inout4 | |
500 | aes${dir} $rndkey1,$inout5 | |
501 | aes${dir} $rndkey1,$inout6 | |
502 | aes${dir} $rndkey1,$inout7 | |
503 | aes${dir}last $rndkey0,$inout0 | |
504 | aes${dir}last $rndkey0,$inout1 | |
505 | aes${dir}last $rndkey0,$inout2 | |
506 | aes${dir}last $rndkey0,$inout3 | |
507 | aes${dir}last $rndkey0,$inout4 | |
508 | aes${dir}last $rndkey0,$inout5 | |
509 | aes${dir}last $rndkey0,$inout6 | |
510 | aes${dir}last $rndkey0,$inout7 | |
511 | ret | |
512 | .size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8 | |
513 | ___ | |
514 | } | |
515 | { my $with_enc = ($PREFIX eq "aesni"); # encrypt flavours are emitted only under the "aesni" prefix | |
516 | for my $gen (\&aesni_generate3, \&aesni_generate4, | |
517 | \&aesni_generate6, \&aesni_generate8) { | |
518 | # emission order is kept: encN (if wanted), then decN, for N = 3, 4, 6, 8 | |
519 | $gen->("enc") if $with_enc; | |
520 | $gen->("dec"); | |
521 | } | |
522 | } | |
523 | |||
524 | if ($PREFIX eq "aesni") { | ||
525 | ######################################################################## | ||
526 | # void aesni_ecb_encrypt (const void *in, void *out, | ||
527 | # size_t length, const AES_KEY *key, | ||
528 | # int enc); | ||
529 | $code.=<<___; | ||
530 | .globl aesni_ecb_encrypt | ||
531 | .type aesni_ecb_encrypt,\@function,5 | ||
532 | .align 16 | ||
533 | aesni_ecb_encrypt: | ||
534 | _CET_ENDBR | ||
535 | and \$-16,$len | ||
536 | jz .Lecb_ret | ||
537 | |||
538 | mov 240($key),$rounds # key->rounds | ||
539 | $movkey ($key),$rndkey0 | ||
540 | mov $key,$key_ # backup $key | ||
541 | mov $rounds,$rnds_ # backup $rounds | ||
542 | test %r8d,%r8d # 5th argument | ||
543 | jz .Lecb_decrypt | ||
544 | #--------------------------- ECB ENCRYPT ------------------------------# | ||
545 | cmp \$0x80,$len | ||
546 | jb .Lecb_enc_tail | ||
547 | |||
548 | movdqu ($inp),$inout0 | ||
549 | movdqu 0x10($inp),$inout1 | ||
550 | movdqu 0x20($inp),$inout2 | ||
551 | movdqu 0x30($inp),$inout3 | ||
552 | movdqu 0x40($inp),$inout4 | ||
553 | movdqu 0x50($inp),$inout5 | ||
554 | movdqu 0x60($inp),$inout6 | ||
555 | movdqu 0x70($inp),$inout7 | ||
556 | lea 0x80($inp),$inp | ||
557 | sub \$0x80,$len | ||
558 | jmp .Lecb_enc_loop8_enter | ||
559 | .align 16 | ||
560 | .Lecb_enc_loop8: | ||
561 | movups $inout0,($out) | ||
562 | mov $key_,$key # restore $key | ||
563 | movdqu ($inp),$inout0 | ||
564 | mov $rnds_,$rounds # restore $rounds | ||
565 | movups $inout1,0x10($out) | ||
566 | movdqu 0x10($inp),$inout1 | ||
567 | movups $inout2,0x20($out) | ||
568 | movdqu 0x20($inp),$inout2 | ||
569 | movups $inout3,0x30($out) | ||
570 | movdqu 0x30($inp),$inout3 | ||
571 | movups $inout4,0x40($out) | ||
572 | movdqu 0x40($inp),$inout4 | ||
573 | movups $inout5,0x50($out) | ||
574 | movdqu 0x50($inp),$inout5 | ||
575 | movups $inout6,0x60($out) | ||
576 | movdqu 0x60($inp),$inout6 | ||
577 | movups $inout7,0x70($out) | ||
578 | lea 0x80($out),$out | ||
579 | movdqu 0x70($inp),$inout7 | ||
580 | lea 0x80($inp),$inp | ||
581 | .Lecb_enc_loop8_enter: | ||
582 | |||
583 | call _aesni_encrypt8 | ||
584 | |||
585 | sub \$0x80,$len | ||
586 | jnc .Lecb_enc_loop8 | ||
587 | |||
588 | movups $inout0,($out) | ||
589 | mov $key_,$key # restore $key | ||
590 | movups $inout1,0x10($out) | ||
591 | mov $rnds_,$rounds # restore $rounds | ||
592 | movups $inout2,0x20($out) | ||
593 | movups $inout3,0x30($out) | ||
594 | movups $inout4,0x40($out) | ||
595 | movups $inout5,0x50($out) | ||
596 | movups $inout6,0x60($out) | ||
597 | movups $inout7,0x70($out) | ||
598 | lea 0x80($out),$out | ||
599 | add \$0x80,$len | ||
600 | jz .Lecb_ret | ||
601 | |||
602 | .Lecb_enc_tail: | ||
603 | movups ($inp),$inout0 | ||
604 | cmp \$0x20,$len | ||
605 | jb .Lecb_enc_one | ||
606 | movups 0x10($inp),$inout1 | ||
607 | je .Lecb_enc_two | ||
608 | movups 0x20($inp),$inout2 | ||
609 | cmp \$0x40,$len | ||
610 | jb .Lecb_enc_three | ||
611 | movups 0x30($inp),$inout3 | ||
612 | je .Lecb_enc_four | ||
613 | movups 0x40($inp),$inout4 | ||
614 | cmp \$0x60,$len | ||
615 | jb .Lecb_enc_five | ||
616 | movups 0x50($inp),$inout5 | ||
617 | je .Lecb_enc_six | ||
618 | movdqu 0x60($inp),$inout6 | ||
619 | call _aesni_encrypt8 | ||
620 | movups $inout0,($out) | ||
621 | movups $inout1,0x10($out) | ||
622 | movups $inout2,0x20($out) | ||
623 | movups $inout3,0x30($out) | ||
624 | movups $inout4,0x40($out) | ||
625 | movups $inout5,0x50($out) | ||
626 | movups $inout6,0x60($out) | ||
627 | jmp .Lecb_ret | ||
628 | .align 16 | ||
629 | .Lecb_enc_one: | ||
630 | ___ | ||
631 | &aesni_generate1("enc",$key,$rounds); | ||
632 | $code.=<<___; | ||
633 | movups $inout0,($out) | ||
634 | jmp .Lecb_ret | ||
635 | .align 16 | ||
636 | .Lecb_enc_two: | ||
637 | xorps $inout2,$inout2 | ||
638 | call _aesni_encrypt3 | ||
639 | movups $inout0,($out) | ||
640 | movups $inout1,0x10($out) | ||
641 | jmp .Lecb_ret | ||
642 | .align 16 | ||
643 | .Lecb_enc_three: | ||
644 | call _aesni_encrypt3 | ||
645 | movups $inout0,($out) | ||
646 | movups $inout1,0x10($out) | ||
647 | movups $inout2,0x20($out) | ||
648 | jmp .Lecb_ret | ||
649 | .align 16 | ||
650 | .Lecb_enc_four: | ||
651 | call _aesni_encrypt4 | ||
652 | movups $inout0,($out) | ||
653 | movups $inout1,0x10($out) | ||
654 | movups $inout2,0x20($out) | ||
655 | movups $inout3,0x30($out) | ||
656 | jmp .Lecb_ret | ||
657 | .align 16 | ||
658 | .Lecb_enc_five: | ||
659 | xorps $inout5,$inout5 | ||
660 | call _aesni_encrypt6 | ||
661 | movups $inout0,($out) | ||
662 | movups $inout1,0x10($out) | ||
663 | movups $inout2,0x20($out) | ||
664 | movups $inout3,0x30($out) | ||
665 | movups $inout4,0x40($out) | ||
666 | jmp .Lecb_ret | ||
667 | .align 16 | ||
668 | .Lecb_enc_six: | ||
669 | call _aesni_encrypt6 | ||
670 | movups $inout0,($out) | ||
671 | movups $inout1,0x10($out) | ||
672 | movups $inout2,0x20($out) | ||
673 | movups $inout3,0x30($out) | ||
674 | movups $inout4,0x40($out) | ||
675 | movups $inout5,0x50($out) | ||
676 | jmp .Lecb_ret | ||
677 | #--------------------------- ECB DECRYPT ------------------------------# | ||
678 | .align 16 | ||
679 | .Lecb_decrypt: | ||
680 | cmp \$0x80,$len | ||
681 | jb .Lecb_dec_tail | ||
682 | |||
683 | movdqu ($inp),$inout0 | ||
684 | movdqu 0x10($inp),$inout1 | ||
685 | movdqu 0x20($inp),$inout2 | ||
686 | movdqu 0x30($inp),$inout3 | ||
687 | movdqu 0x40($inp),$inout4 | ||
688 | movdqu 0x50($inp),$inout5 | ||
689 | movdqu 0x60($inp),$inout6 | ||
690 | movdqu 0x70($inp),$inout7 | ||
691 | lea 0x80($inp),$inp | ||
692 | sub \$0x80,$len | ||
693 | jmp .Lecb_dec_loop8_enter | ||
694 | .align 16 | ||
695 | .Lecb_dec_loop8: | ||
696 | movups $inout0,($out) | ||
697 | mov $key_,$key # restore $key | ||
698 | movdqu ($inp),$inout0 | ||
699 | mov $rnds_,$rounds # restore $rounds | ||
700 | movups $inout1,0x10($out) | ||
701 | movdqu 0x10($inp),$inout1 | ||
702 | movups $inout2,0x20($out) | ||
703 | movdqu 0x20($inp),$inout2 | ||
704 | movups $inout3,0x30($out) | ||
705 | movdqu 0x30($inp),$inout3 | ||
706 | movups $inout4,0x40($out) | ||
707 | movdqu 0x40($inp),$inout4 | ||
708 | movups $inout5,0x50($out) | ||
709 | movdqu 0x50($inp),$inout5 | ||
710 | movups $inout6,0x60($out) | ||
711 | movdqu 0x60($inp),$inout6 | ||
712 | movups $inout7,0x70($out) | ||
713 | lea 0x80($out),$out | ||
714 | movdqu 0x70($inp),$inout7 | ||
715 | lea 0x80($inp),$inp | ||
716 | .Lecb_dec_loop8_enter: | ||
717 | |||
718 | call _aesni_decrypt8 | ||
719 | |||
720 | $movkey ($key_),$rndkey0 | ||
721 | sub \$0x80,$len | ||
722 | jnc .Lecb_dec_loop8 | ||
723 | |||
724 | movups $inout0,($out) | ||
725 | mov $key_,$key # restore $key | ||
726 | movups $inout1,0x10($out) | ||
727 | mov $rnds_,$rounds # restore $rounds | ||
728 | movups $inout2,0x20($out) | ||
729 | movups $inout3,0x30($out) | ||
730 | movups $inout4,0x40($out) | ||
731 | movups $inout5,0x50($out) | ||
732 | movups $inout6,0x60($out) | ||
733 | movups $inout7,0x70($out) | ||
734 | lea 0x80($out),$out | ||
735 | add \$0x80,$len | ||
736 | jz .Lecb_ret | ||
737 | |||
738 | .Lecb_dec_tail: | ||
739 | movups ($inp),$inout0 | ||
740 | cmp \$0x20,$len | ||
741 | jb .Lecb_dec_one | ||
742 | movups 0x10($inp),$inout1 | ||
743 | je .Lecb_dec_two | ||
744 | movups 0x20($inp),$inout2 | ||
745 | cmp \$0x40,$len | ||
746 | jb .Lecb_dec_three | ||
747 | movups 0x30($inp),$inout3 | ||
748 | je .Lecb_dec_four | ||
749 | movups 0x40($inp),$inout4 | ||
750 | cmp \$0x60,$len | ||
751 | jb .Lecb_dec_five | ||
752 | movups 0x50($inp),$inout5 | ||
753 | je .Lecb_dec_six | ||
754 | movups 0x60($inp),$inout6 | ||
755 | $movkey ($key),$rndkey0 | ||
756 | call _aesni_decrypt8 | ||
757 | movups $inout0,($out) | ||
758 | movups $inout1,0x10($out) | ||
759 | movups $inout2,0x20($out) | ||
760 | movups $inout3,0x30($out) | ||
761 | movups $inout4,0x40($out) | ||
762 | movups $inout5,0x50($out) | ||
763 | movups $inout6,0x60($out) | ||
764 | jmp .Lecb_ret | ||
765 | .align 16 | ||
766 | .Lecb_dec_one: | ||
767 | ___ | ||
768 | &aesni_generate1("dec",$key,$rounds); | ||
769 | $code.=<<___; | ||
770 | movups $inout0,($out) | ||
771 | jmp .Lecb_ret | ||
772 | .align 16 | ||
773 | .Lecb_dec_two: | ||
774 | xorps $inout2,$inout2 | ||
775 | call _aesni_decrypt3 | ||
776 | movups $inout0,($out) | ||
777 | movups $inout1,0x10($out) | ||
778 | jmp .Lecb_ret | ||
779 | .align 16 | ||
780 | .Lecb_dec_three: | ||
781 | call _aesni_decrypt3 | ||
782 | movups $inout0,($out) | ||
783 | movups $inout1,0x10($out) | ||
784 | movups $inout2,0x20($out) | ||
785 | jmp .Lecb_ret | ||
786 | .align 16 | ||
787 | .Lecb_dec_four: | ||
788 | call _aesni_decrypt4 | ||
789 | movups $inout0,($out) | ||
790 | movups $inout1,0x10($out) | ||
791 | movups $inout2,0x20($out) | ||
792 | movups $inout3,0x30($out) | ||
793 | jmp .Lecb_ret | ||
794 | .align 16 | ||
795 | .Lecb_dec_five: | ||
796 | xorps $inout5,$inout5 | ||
797 | call _aesni_decrypt6 | ||
798 | movups $inout0,($out) | ||
799 | movups $inout1,0x10($out) | ||
800 | movups $inout2,0x20($out) | ||
801 | movups $inout3,0x30($out) | ||
802 | movups $inout4,0x40($out) | ||
803 | jmp .Lecb_ret | ||
804 | .align 16 | ||
805 | .Lecb_dec_six: | ||
806 | call _aesni_decrypt6 | ||
807 | movups $inout0,($out) | ||
808 | movups $inout1,0x10($out) | ||
809 | movups $inout2,0x20($out) | ||
810 | movups $inout3,0x30($out) | ||
811 | movups $inout4,0x40($out) | ||
812 | movups $inout5,0x50($out) | ||
813 | |||
814 | .Lecb_ret: | ||
815 | ret | ||
816 | .size aesni_ecb_encrypt,.-aesni_ecb_encrypt | ||
817 | ___ | ||
818 | |||
819 | { | ||
820 | ###################################################################### | ||
821 | # void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out, | ||
822 | # size_t blocks, const AES_KEY *key, | ||
823 | # const char *ivec,char *cmac); | ||
824 | # | ||
825 | # Handles only complete blocks, operates on 64-bit counter and | ||
826 | # does not update *ivec! Nor does it finalize CMAC value | ||
827 | # (see engine/eng_aesni.c for details) | ||
828 | # | ||
829 | { | ||
# Emitter for aesni_ccm64_encrypt_blocks and aesni_ccm64_decrypt_blocks.
# Both routines keep two AES states in flight: $inout0 carries the counter
# block and $inout1 the running CMAC, and every round key is applied to both.
830 | my $cmac="%r9"; # 6th argument: CMAC block, loaded on entry and stored back on exit | ||
831 | |||
832 | my $increment="%xmm6"; | ||
833 | my $bswap_mask="%xmm7"; | ||
834 | |||
835 | $code.=<<___; | ||
836 | .globl aesni_ccm64_encrypt_blocks | ||
837 | .type aesni_ccm64_encrypt_blocks,\@function,6 | ||
838 | .align 16 | ||
839 | aesni_ccm64_encrypt_blocks: | ||
840 | _CET_ENDBR | ||
841 | ___ | ||
# Win64 only: carve out 0x58 bytes of stack and spill %xmm6-%xmm9; the
# matching epilogue further down reloads them before returning.
842 | $code.=<<___ if ($win64); | ||
843 | lea -0x58(%rsp),%rsp | ||
844 | movaps %xmm6,(%rsp) | ||
845 | movaps %xmm7,0x10(%rsp) | ||
846 | movaps %xmm8,0x20(%rsp) | ||
847 | movaps %xmm9,0x30(%rsp) | ||
848 | .Lccm64_enc_body: | ||
849 | ___ | ||
850 | $code.=<<___; | ||
851 | mov 240($key),$rounds # key->rounds | ||
852 | movdqu ($ivp),$iv | ||
853 | movdqa .Lincrement64(%rip),$increment | ||
854 | movdqa .Lbswap_mask(%rip),$bswap_mask | ||
855 | |||
# the round loop below applies two round keys per pass, hence the halved count
856 | shr \$1,$rounds | ||
857 | lea 0($key),$key_ | ||
858 | movdqu ($cmac),$inout1 | ||
859 | movdqa $iv,$inout0 | ||
860 | mov $rounds,$rnds_ | ||
# from here on $iv stays byte-swapped so that paddq can step the counter;
# a copy is swapped back into $inout0 for every following block
861 | pshufb $bswap_mask,$iv | ||
862 | jmp .Lccm64_enc_outer | ||
863 | .align 16 | ||
864 | .Lccm64_enc_outer: | ||
# per-block loop: round key 0 is xored into the counter directly and into
# the CMAC together with the input block (via $rndkey0 as a temporary)
865 | $movkey ($key_),$rndkey0 | ||
866 | mov $rnds_,$rounds | ||
867 | movups ($inp),$in0 # load inp | ||
868 | |||
869 | xorps $rndkey0,$inout0 # counter | ||
870 | $movkey 16($key_),$rndkey1 | ||
871 | xorps $in0,$rndkey0 | ||
872 | lea 32($key_),$key | ||
873 | xorps $rndkey0,$inout1 # cmac^=inp | ||
874 | $movkey ($key),$rndkey0 | ||
875 | |||
876 | .Lccm64_enc2_loop: | ||
877 | aesenc $rndkey1,$inout0 | ||
878 | dec $rounds | ||
879 | aesenc $rndkey1,$inout1 | ||
880 | $movkey 16($key),$rndkey1 | ||
881 | aesenc $rndkey0,$inout0 | ||
882 | lea 32($key),$key | ||
883 | aesenc $rndkey0,$inout1 | ||
884 | $movkey 0($key),$rndkey0 | ||
885 | jnz .Lccm64_enc2_loop | ||
886 | aesenc $rndkey1,$inout0 | ||
887 | aesenc $rndkey1,$inout1 | ||
888 | paddq $increment,$iv | ||
889 | aesenclast $rndkey0,$inout0 | ||
890 | aesenclast $rndkey0,$inout1 | ||
891 | |||
# the block counter is decremented here; the jnz at the bottom of the
# loop consumes the flags from this dec
892 | dec $len | ||
893 | lea 16($inp),$inp | ||
894 | xorps $inout0,$in0 # inp ^= E(iv) | ||
895 | movdqa $iv,$inout0 | ||
896 | movups $in0,($out) # save output | ||
897 | lea 16($out),$out | ||
898 | pshufb $bswap_mask,$inout0 | ||
899 | jnz .Lccm64_enc_outer | ||
900 | |||
901 | movups $inout1,($cmac) | ||
902 | ___ | ||
# Win64 only: reload the spilled registers and release the stack area.
903 | $code.=<<___ if ($win64); | ||
904 | movaps (%rsp),%xmm6 | ||
905 | movaps 0x10(%rsp),%xmm7 | ||
906 | movaps 0x20(%rsp),%xmm8 | ||
907 | movaps 0x30(%rsp),%xmm9 | ||
908 | lea 0x58(%rsp),%rsp | ||
909 | .Lccm64_enc_ret: | ||
910 | ___ | ||
911 | $code.=<<___; | ||
912 | ret | ||
913 | .size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks | ||
914 | ___ | ||
915 | ###################################################################### | ||
916 | $code.=<<___; | ||
917 | .globl aesni_ccm64_decrypt_blocks | ||
918 | .type aesni_ccm64_decrypt_blocks,\@function,6 | ||
919 | .align 16 | ||
920 | aesni_ccm64_decrypt_blocks: | ||
921 | _CET_ENDBR | ||
922 | ___ | ||
# Win64 only: same 0x58-byte spill area for %xmm6-%xmm9 as the encrypt side.
923 | $code.=<<___ if ($win64); | ||
924 | lea -0x58(%rsp),%rsp | ||
925 | movaps %xmm6,(%rsp) | ||
926 | movaps %xmm7,0x10(%rsp) | ||
927 | movaps %xmm8,0x20(%rsp) | ||
928 | movaps %xmm9,0x30(%rsp) | ||
929 | .Lccm64_dec_body: | ||
930 | ___ | ||
931 | $code.=<<___; | ||
932 | mov 240($key),$rounds # key->rounds | ||
933 | movups ($ivp),$iv | ||
934 | movdqu ($cmac),$inout1 | ||
935 | movdqa .Lincrement64(%rip),$increment | ||
936 | movdqa .Lbswap_mask(%rip),$bswap_mask | ||
937 | |||
938 | movaps $iv,$inout0 | ||
939 | mov $rounds,$rnds_ | ||
940 | mov $key,$key_ | ||
941 | pshufb $bswap_mask,$iv | ||
942 | ___ | ||
# Encrypt the first counter block up front.  The helper is defined outside
# this section; presumably it emits a single-block encryption of $inout0
# -- TODO confirm against aesni_generate1.
943 | &aesni_generate1("enc",$key,$rounds); | ||
944 | $code.=<<___; | ||
945 | movups ($inp),$in0 # load inp | ||
946 | paddq $increment,$iv | ||
947 | lea 16($inp),$inp | ||
948 | jmp .Lccm64_dec_outer | ||
949 | .align 16 | ||
950 | .Lccm64_dec_outer: | ||
# loop entry expects E(counter) in $inout0 and the matching input block in
# $in0; the output block is written first, then (unless it was the last
# one) the next counter is encrypted while that output is absorbed into
# the CMAC in the second lane
951 | xorps $inout0,$in0 # inp ^= E(iv) | ||
952 | movdqa $iv,$inout0 | ||
953 | mov $rnds_,$rounds | ||
954 | movups $in0,($out) # save output | ||
955 | lea 16($out),$out | ||
956 | pshufb $bswap_mask,$inout0 | ||
957 | |||
958 | sub \$1,$len | ||
959 | jz .Lccm64_dec_break | ||
960 | |||
# $rnds_ keeps the unhalved round count (the break path uses it as is),
# so the halving for the paired round loop is redone for every block
961 | $movkey ($key_),$rndkey0 | ||
962 | shr \$1,$rounds | ||
963 | $movkey 16($key_),$rndkey1 | ||
964 | xorps $rndkey0,$in0 | ||
965 | lea 32($key_),$key | ||
966 | xorps $rndkey0,$inout0 | ||
967 | xorps $in0,$inout1 # cmac^=out | ||
968 | $movkey ($key),$rndkey0 | ||
969 | |||
970 | .Lccm64_dec2_loop: | ||
971 | aesenc $rndkey1,$inout0 | ||
972 | dec $rounds | ||
973 | aesenc $rndkey1,$inout1 | ||
974 | $movkey 16($key),$rndkey1 | ||
975 | aesenc $rndkey0,$inout0 | ||
976 | lea 32($key),$key | ||
977 | aesenc $rndkey0,$inout1 | ||
978 | $movkey 0($key),$rndkey0 | ||
979 | jnz .Lccm64_dec2_loop | ||
980 | movups ($inp),$in0 # load inp | ||
981 | paddq $increment,$iv | ||
982 | aesenc $rndkey1,$inout0 | ||
983 | aesenc $rndkey1,$inout1 | ||
984 | lea 16($inp),$inp | ||
985 | aesenclast $rndkey0,$inout0 | ||
986 | aesenclast $rndkey0,$inout1 | ||
987 | jmp .Lccm64_dec_outer | ||
988 | |||
989 | .align 16 | ||
990 | .Lccm64_dec_break: | ||
991 | #xorps $in0,$inout1 # cmac^=out | ||
992 | ___ | ||
# Last block: the final output block still has to go through the CMAC lane.
# NOTE(review): passing $in0 as the extra argument presumably makes the
# helper fold it into $inout1, replacing the commented-out xorps above
# -- confirm against aesni_generate1.
993 | &aesni_generate1("enc",$key_,$rounds,$inout1,$in0); | ||
994 | $code.=<<___; | ||
995 | movups $inout1,($cmac) | ||
996 | ___ | ||
# Win64 only: reload the spilled registers and release the stack area.
997 | $code.=<<___ if ($win64); | ||
998 | movaps (%rsp),%xmm6 | ||
999 | movaps 0x10(%rsp),%xmm7 | ||
1000 | movaps 0x20(%rsp),%xmm8 | ||
1001 | movaps 0x30(%rsp),%xmm9 | ||
1002 | lea 0x58(%rsp),%rsp | ||
1003 | .Lccm64_dec_ret: | ||
1004 | ___ | ||
1005 | $code.=<<___; | ||
1006 | ret | ||
1007 | .size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks | ||
1008 | ___ | ||
1009 | } | ||
1010 | ###################################################################### | ||
1011 | # void aesni_ctr32_encrypt_blocks (const void *in, void *out, | ||
1012 | # size_t blocks, const AES_KEY *key, | ||
1013 | # const char *ivec); | ||
1014 | # | ||
1015 | # Handles only complete blocks, operates on 32-bit counter and | ||
1016 | # does not update *ivec! (see engine/eng_aesni.c for details) | ||
1017 | # | ||
1018 | { | ||
# Emitter for aesni_ctr32_encrypt_blocks.  The 32-bit counter is pulled out
# of the IV and expanded into two vectors of three consecutive counters
# each ($iv0/$iv1); the main loop then processes six blocks per pass.
#
# Frame: 0x20 bytes for the two counter vectors saved at 0x00/0x10(%rsp),
# plus 160 bytes on Win64 for spilling %xmm6-%xmm15.
1019 | my $frame_size = 0x20+($win64?160:0); | ||
1020 | my ($in0,$in1,$in2,$in3)=map("%xmm$_",(8..11)); | ||
1021 | my ($iv0,$iv1,$ivec)=("%xmm12","%xmm13","%xmm14"); | ||
1022 | my $bswap_mask="%xmm15"; | ||
1023 | |||
1024 | $code.=<<___; | ||
1025 | .globl aesni_ctr32_encrypt_blocks | ||
1026 | .type aesni_ctr32_encrypt_blocks,\@function,5 | ||
1027 | .align 16 | ||
1028 | aesni_ctr32_encrypt_blocks: | ||
1029 | _CET_ENDBR | ||
1030 | lea (%rsp),%rax | ||
1031 | push %rbp | ||
1032 | sub \$$frame_size,%rsp | ||
1033 | ___ | ||
# Win64 only: spill %xmm6-%xmm15 above the counter scratch area.
1034 | $code.=<<___ if ($win64); | ||
1035 | movaps %xmm6,0x20(%rsp) | ||
1036 | movaps %xmm7,0x30(%rsp) | ||
1037 | movaps %xmm8,0x40(%rsp) | ||
1038 | movaps %xmm9,0x50(%rsp) | ||
1039 | movaps %xmm10,0x60(%rsp) | ||
1040 | movaps %xmm11,0x70(%rsp) | ||
1041 | movaps %xmm12,0x80(%rsp) | ||
1042 | movaps %xmm13,0x90(%rsp) | ||
1043 | movaps %xmm14,0xa0(%rsp) | ||
1044 | movaps %xmm15,0xb0(%rsp) | ||
1045 | .Lctr32_body: | ||
1046 | ___ | ||
1047 | $code.=<<___; | ||
# %rax still holds the entry %rsp, so %rbp ends up pointing at the saved
# %rbp slot; the epilogue unwinds the frame through it
1048 | lea -8(%rax),%rbp | ||
# a single block needs no counter arithmetic, the IV is used as it is
1049 | cmp \$1,$len | ||
1050 | je .Lctr32_one_shortcut | ||
1051 | |||
1052 | movdqu ($ivp),$ivec | ||
1053 | movdqa .Lbswap_mask(%rip),$bswap_mask | ||
# $rounds doubles as a zero source for wiping the counter dword, and
# $rnds_ / $key_ as scratch for counter values, before all three take on
# their usual roles further down
1054 | xor $rounds,$rounds | ||
1055 | pextrd \$3,$ivec,$rnds_ # pull 32-bit counter | ||
1056 | pinsrd \$3,$rounds,$ivec # wipe 32-bit counter | ||
1057 | |||
1058 | mov 240($key),$rounds # key->rounds | ||
1059 | bswap $rnds_ | ||
1060 | pxor $iv0,$iv0 # vector of 3 32-bit counters | ||
1061 | pxor $iv1,$iv1 # vector of 3 32-bit counters | ||
1062 | pinsrd \$0,$rnds_,$iv0 | ||
1063 | lea 3($rnds_),$key_ | ||
1064 | pinsrd \$0,$key_,$iv1 | ||
1065 | inc $rnds_ | ||
1066 | pinsrd \$1,$rnds_,$iv0 | ||
1067 | inc $key_ | ||
1068 | pinsrd \$1,$key_,$iv1 | ||
1069 | inc $rnds_ | ||
1070 | pinsrd \$2,$rnds_,$iv0 | ||
1071 | inc $key_ | ||
1072 | pinsrd \$2,$key_,$iv1 | ||
# the host-order vectors go to the stack (they are what gets incremented
# later); the register copies are byte-swapped for use as counter blocks
1073 | movdqa $iv0,0x00(%rsp) | ||
1074 | pshufb $bswap_mask,$iv0 | ||
1075 | movdqa $iv1,0x10(%rsp) | ||
1076 | pshufb $bswap_mask,$iv1 | ||
1077 | |||
# pshufd with immediate N<<6 copies source dword N into the top dword and
# source dword 0 into the other three.
# NOTE(review): the later por merge with $ivec relies on dword 0 being
# zero after the byte swap, presumably because .Lbswap_mask reverses all
# 16 bytes -- confirm against the constant's definition
1078 | pshufd \$`3<<6`,$iv0,$inout0 # place counter to upper dword | ||
1079 | pshufd \$`2<<6`,$iv0,$inout1 | ||
1080 | pshufd \$`1<<6`,$iv0,$inout2 | ||
1081 | cmp \$6,$len | ||
1082 | jb .Lctr32_tail | ||
# the main loop consumes round keys in pairs, hence the halved count
1083 | shr \$1,$rounds | ||
1084 | mov $key,$key_ # backup $key | ||
1085 | mov $rounds,$rnds_ # backup $rounds | ||
1086 | sub \$6,$len | ||
1087 | jmp .Lctr32_loop6 | ||
1088 | |||
1089 | .align 16 | ||
1090 | .Lctr32_loop6: | ||
1091 | pshufd \$`3<<6`,$iv1,$inout3 | ||
1092 | por $ivec,$inout0 # merge counter-less ivec | ||
1093 | $movkey ($key_),$rndkey0 | ||
1094 | pshufd \$`2<<6`,$iv1,$inout4 | ||
1095 | por $ivec,$inout1 | ||
1096 | $movkey 16($key_),$rndkey1 | ||
1097 | pshufd \$`1<<6`,$iv1,$inout5 | ||
1098 | por $ivec,$inout2 | ||
1099 | por $ivec,$inout3 | ||
1100 | xorps $rndkey0,$inout0 | ||
1101 | por $ivec,$inout4 | ||
1102 | por $ivec,$inout5 | ||
1103 | |||
1104 | # inline _aesni_encrypt6 and interleave last rounds | ||
1105 | # with own code... | ||
1106 | |||
1107 | pxor $rndkey0,$inout1 | ||
1108 | aesenc $rndkey1,$inout0 | ||
1109 | lea 32($key_),$key | ||
1110 | pxor $rndkey0,$inout2 | ||
1111 | aesenc $rndkey1,$inout1 | ||
# all six counter blocks are built by now, so $iv1 and $iv0 are free to
# be reloaded for stepping the saved counter vectors
1112 | movdqa .Lincrement32(%rip),$iv1 | ||
1113 | pxor $rndkey0,$inout3 | ||
1114 | aesenc $rndkey1,$inout2 | ||
1115 | movdqa (%rsp),$iv0 | ||
1116 | pxor $rndkey0,$inout4 | ||
1117 | aesenc $rndkey1,$inout3 | ||
1118 | pxor $rndkey0,$inout5 | ||
1119 | $movkey ($key),$rndkey0 | ||
1120 | dec $rounds | ||
1121 | aesenc $rndkey1,$inout4 | ||
1122 | aesenc $rndkey1,$inout5 | ||
1123 | jmp .Lctr32_enc_loop6_enter | ||
1124 | .align 16 | ||
1125 | .Lctr32_enc_loop6: | ||
1126 | aesenc $rndkey1,$inout0 | ||
1127 | aesenc $rndkey1,$inout1 | ||
1128 | dec $rounds | ||
1129 | aesenc $rndkey1,$inout2 | ||
1130 | aesenc $rndkey1,$inout3 | ||
1131 | aesenc $rndkey1,$inout4 | ||
1132 | aesenc $rndkey1,$inout5 | ||
1133 | .Lctr32_enc_loop6_enter: | ||
1134 | $movkey 16($key),$rndkey1 | ||
1135 | aesenc $rndkey0,$inout0 | ||
1136 | aesenc $rndkey0,$inout1 | ||
1137 | lea 32($key),$key | ||
1138 | aesenc $rndkey0,$inout2 | ||
1139 | aesenc $rndkey0,$inout3 | ||
1140 | aesenc $rndkey0,$inout4 | ||
1141 | aesenc $rndkey0,$inout5 | ||
1142 | $movkey ($key),$rndkey0 | ||
1143 | jnz .Lctr32_enc_loop6 | ||
1144 | |||
# last two rounds, interleaved with advancing both counter vectors by
# the .Lincrement32 step and with loading the six input blocks
1145 | aesenc $rndkey1,$inout0 | ||
1146 | paddd $iv1,$iv0 # increment counter vector | ||
1147 | aesenc $rndkey1,$inout1 | ||
1148 | paddd 0x10(%rsp),$iv1 | ||
1149 | aesenc $rndkey1,$inout2 | ||
1150 | movdqa $iv0,0x00(%rsp) # save counter vector | ||
1151 | aesenc $rndkey1,$inout3 | ||
1152 | movdqa $iv1,0x10(%rsp) | ||
1153 | aesenc $rndkey1,$inout4 | ||
1154 | pshufb $bswap_mask,$iv0 # byte swap | ||
1155 | aesenc $rndkey1,$inout5 | ||
1156 | pshufb $bswap_mask,$iv1 | ||
1157 | |||
1158 | aesenclast $rndkey0,$inout0 | ||
1159 | movups ($inp),$in0 # load input | ||
1160 | aesenclast $rndkey0,$inout1 | ||
1161 | movups 0x10($inp),$in1 | ||
1162 | aesenclast $rndkey0,$inout2 | ||
1163 | movups 0x20($inp),$in2 | ||
1164 | aesenclast $rndkey0,$inout3 | ||
1165 | movups 0x30($inp),$in3 | ||
1166 | aesenclast $rndkey0,$inout4 | ||
# the round-key registers are no longer needed for this pass; they are
# borrowed to hold the 5th and 6th input blocks
1167 | movups 0x40($inp),$rndkey1 | ||
1168 | aesenclast $rndkey0,$inout5 | ||
1169 | movups 0x50($inp),$rndkey0 | ||
1170 | lea 0x60($inp),$inp | ||
1171 | |||
# xor keystream into the input and, in the same breath, rebuild the first
# three counter blocks of the next pass (also used by the tail code)
1172 | xorps $inout0,$in0 # xor | ||
1173 | pshufd \$`3<<6`,$iv0,$inout0 | ||
1174 | xorps $inout1,$in1 | ||
1175 | pshufd \$`2<<6`,$iv0,$inout1 | ||
1176 | movups $in0,($out) # store output | ||
1177 | xorps $inout2,$in2 | ||
1178 | pshufd \$`1<<6`,$iv0,$inout2 | ||
1179 | movups $in1,0x10($out) | ||
1180 | xorps $inout3,$in3 | ||
1181 | movups $in2,0x20($out) | ||
1182 | xorps $inout4,$rndkey1 | ||
1183 | movups $in3,0x30($out) | ||
1184 | xorps $inout5,$rndkey0 | ||
1185 | movups $rndkey1,0x40($out) | ||
1186 | movups $rndkey0,0x50($out) | ||
1187 | lea 0x60($out),$out | ||
1188 | mov $rnds_,$rounds | ||
# $len runs 6 blocks ahead, so a borrow here means fewer than 6 are left
1189 | sub \$6,$len | ||
1190 | jnc .Lctr32_loop6 | ||
1191 | |||
1192 | add \$6,$len | ||
1193 | jz .Lctr32_done | ||
1194 | mov $key_,$key # restore $key | ||
# NOTE(review): 2*n+1 undoes the earlier shr only when key->rounds is odd,
# which is presumably guaranteed by the key setup code -- confirm
1195 | lea 1($rounds,$rounds),$rounds # restore original value | ||
1196 | |||
1197 | .Lctr32_tail: | ||
# 1 to 5 blocks left; $inout0, $inout1 and $inout2 already hold counters
# without the IV part, the 4th and 5th are built on demand from $iv1
1198 | por $ivec,$inout0 | ||
1199 | movups ($inp),$in0 | ||
1200 | cmp \$2,$len | ||
1201 | jb .Lctr32_one | ||
1202 | |||
1203 | por $ivec,$inout1 | ||
1204 | movups 0x10($inp),$in1 | ||
1205 | je .Lctr32_two | ||
1206 | |||
1207 | pshufd \$`3<<6`,$iv1,$inout3 | ||
1208 | por $ivec,$inout2 | ||
1209 | movups 0x20($inp),$in2 | ||
1210 | cmp \$4,$len | ||
1211 | jb .Lctr32_three | ||
1212 | |||
1213 | pshufd \$`2<<6`,$iv1,$inout4 | ||
1214 | por $ivec,$inout3 | ||
1215 | movups 0x30($inp),$in3 | ||
1216 | je .Lctr32_four | ||
1217 | |||
# five blocks: run the 6-way routine with a zeroed sixth lane whose
# result is simply not stored
1218 | por $ivec,$inout4 | ||
1219 | xorps $inout5,$inout5 | ||
1220 | |||
1221 | call _aesni_encrypt6 | ||
1222 | |||
1223 | movups 0x40($inp),$rndkey1 | ||
1224 | xorps $inout0,$in0 | ||
1225 | xorps $inout1,$in1 | ||
1226 | movups $in0,($out) | ||
1227 | xorps $inout2,$in2 | ||
1228 | movups $in1,0x10($out) | ||
1229 | xorps $inout3,$in3 | ||
1230 | movups $in2,0x20($out) | ||
1231 | xorps $inout4,$rndkey1 | ||
1232 | movups $in3,0x30($out) | ||
1233 | movups $rndkey1,0x40($out) | ||
1234 | jmp .Lctr32_done | ||
1235 | |||
1236 | .align 16 | ||
1237 | .Lctr32_one_shortcut: | ||
# single-block call: load IV, input and round count directly, skipping
# the whole counter-vector setup
1238 | movups ($ivp),$inout0 | ||
1239 | movups ($inp),$in0 | ||
1240 | mov 240($key),$rounds # key->rounds | ||
1241 | .Lctr32_one: | ||
1242 | ___ | ||
1243 | &aesni_generate1("enc",$key,$rounds); | ||
1244 | $code.=<<___; | ||
1245 | xorps $inout0,$in0 | ||
1246 | movups $in0,($out) | ||
1247 | jmp .Lctr32_done | ||
1248 | |||
1249 | .align 16 | ||
1250 | .Lctr32_two: | ||
# two blocks: the 3-way routine is used with a zeroed third lane
1251 | xorps $inout2,$inout2 | ||
1252 | call _aesni_encrypt3 | ||
1253 | xorps $inout0,$in0 | ||
1254 | xorps $inout1,$in1 | ||
1255 | movups $in0,($out) | ||
1256 | movups $in1,0x10($out) | ||
1257 | jmp .Lctr32_done | ||
1258 | |||
1259 | .align 16 | ||
1260 | .Lctr32_three: | ||
1261 | call _aesni_encrypt3 | ||
1262 | xorps $inout0,$in0 | ||
1263 | xorps $inout1,$in1 | ||
1264 | movups $in0,($out) | ||
1265 | xorps $inout2,$in2 | ||
1266 | movups $in1,0x10($out) | ||
1267 | movups $in2,0x20($out) | ||
1268 | jmp .Lctr32_done | ||
1269 | |||
1270 | .align 16 | ||
1271 | .Lctr32_four: | ||
1272 | call _aesni_encrypt4 | ||
1273 | xorps $inout0,$in0 | ||
1274 | xorps $inout1,$in1 | ||
1275 | movups $in0,($out) | ||
1276 | xorps $inout2,$in2 | ||
1277 | movups $in1,0x10($out) | ||
1278 | xorps $inout3,$in3 | ||
1279 | movups $in2,0x20($out) | ||
1280 | movups $in3,0x30($out) | ||
1281 | |||
1282 | .Lctr32_done: | ||
1283 | ___ | ||
# Win64 only: reload %xmm6-%xmm15 from the spill area.
1284 | $code.=<<___ if ($win64); | ||
1285 | movaps 0x20(%rsp),%xmm6 | ||
1286 | movaps 0x30(%rsp),%xmm7 | ||
1287 | movaps 0x40(%rsp),%xmm8 | ||
1288 | movaps 0x50(%rsp),%xmm9 | ||
1289 | movaps 0x60(%rsp),%xmm10 | ||
1290 | movaps 0x70(%rsp),%xmm11 | ||
1291 | movaps 0x80(%rsp),%xmm12 | ||
1292 | movaps 0x90(%rsp),%xmm13 | ||
1293 | movaps 0xa0(%rsp),%xmm14 | ||
1294 | movaps 0xb0(%rsp),%xmm15 | ||
1295 | ___ | ||
1296 | $code.=<<___; | ||
# drop the frame through %rbp, which points at the saved %rbp slot
1297 | lea (%rbp),%rsp | ||
1298 | pop %rbp | ||
1299 | .Lctr32_ret: | ||
1300 | ret | ||
1301 | .size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks | ||
1302 | ___ | ||
1303 | } | ||
1304 | |||
1305 | ###################################################################### | ||
1306 | # void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len, | ||
1307 | # const AES_KEY *key1, const AES_KEY *key2, | ||
1308 | # const unsigned char iv[16]); | ||
1309 | # | ||
1310 | { | ||
1311 | my @tweak=map("%xmm$_",(10..15)); | ||
1312 | my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]); | ||
1313 | my ($key2,$ivp,$len_)=("%r8","%r9","%r9"); | ||
1314 | my $frame_size = 0x60 + ($win64?160:0); | ||
1315 | |||
1316 | $code.=<<___; | ||
# entry point of aesni_xts_encrypt: remember the entry %rsp in %rax, save
# %rbp and reserve $frame_size bytes (0x60 of them are later used to park
# six 16-byte tweaks at the bottom of the frame)
1317 | .globl aesni_xts_encrypt | ||
1318 | .type aesni_xts_encrypt,\@function,6 | ||
1319 | .align 16 | ||
1320 | aesni_xts_encrypt: | ||
1321 | _CET_ENDBR | ||
1322 | lea (%rsp),%rax | ||
1323 | push %rbp | ||
1324 | sub \$$frame_size,%rsp | ||
1325 | ___ | ||
# Win64 only: spill %xmm6-%xmm15 above the 0x60-byte tweak area.
1326 | $code.=<<___ if ($win64); | ||
1327 | movaps %xmm6,0x60(%rsp) | ||
1328 | movaps %xmm7,0x70(%rsp) | ||
1329 | movaps %xmm8,0x80(%rsp) | ||
1330 | movaps %xmm9,0x90(%rsp) | ||
1331 | movaps %xmm10,0xa0(%rsp) | ||
1332 | movaps %xmm11,0xb0(%rsp) | ||
1333 | movaps %xmm12,0xc0(%rsp) | ||
1334 | movaps %xmm13,0xd0(%rsp) | ||
1335 | movaps %xmm14,0xe0(%rsp) | ||
1336 | movaps %xmm15,0xf0(%rsp) | ||
1337 | .Lxts_enc_body: | ||
1338 | ___ | ||
1339 | $code.=<<___; | ||
# %rax holds the entry %rsp, so %rbp is anchored at the saved %rbp slot
1340 | lea -8(%rax),%rbp | ||
1341 | movups ($ivp),@tweak[5] # load clear-text tweak | ||
# $key2 is used by name here, matching aesni_xts_decrypt and the helper
# call below, instead of spelling out the register it is bound to
1342 | mov 240($key2),$rounds # key2->rounds | ||
1343 | mov 240($key),$rnds_ # key1->rounds | ||
1344 | ___ | ||
1345 | # generate the tweak | ||
# The clear-text tweak in @tweak[5] is passed through the helper with key2
# and its round count; presumably this encrypts it in place (the helper is
# defined outside this section -- confirm against aesni_generate1).
1346 | &aesni_generate1("enc",$key2,$rounds,@tweak[5]); | ||
1347 | $code.=<<___; | ||
1348 | mov $key,$key_ # backup $key | ||
1349 | mov $rnds_,$rounds # $rounds = key1->rounds, $rnds_ keeps a copy | ||
1350 | mov $len,$len_ # backup $len | ||
# only whole 16-byte blocks are handled by the bulk code; the original
# length stays in $len_ (the register that used to hold the IV pointer)
1351 | and \$-16,$len | ||
1352 | |||
1353 | movdqa .Lxts_magic(%rip),$twmask | ||
1354 | pxor $twtmp,$twtmp | ||
1355 | pcmpgtd @tweak[5],$twtmp # broadcast upper bits | ||
1356 | ___ | ||
# Precompute the first four tweaks: every pass stores the current value of
# @tweak[5] into @tweak[$i] and then doubles @tweak[5], which therefore
# ends up holding the fifth tweak.
#
# Doubling recipe (the same sequence recurs in the main loop): pcmpgtd
# against a zeroed register turns each dword's sign bit into a full-dword
# mask, pshufd 0x13 routes the mask of dword 3 to dword 0 and that of
# dword 1 to dword 2, pand with $twmask keeps only the carry and residue
# bits, paddq shifts both 64-bit halves left by one and pxor folds the
# carry/residue back in.  NOTE(review): this is presumably the XTS
# multiply-by-alpha step; the constant lives in .Lxts_magic, which is not
# visible in this section.
1357 | for ($i=0;$i<4;$i++) { | ||
1358 | $code.=<<___; | ||
1359 | pshufd \$0x13,$twtmp,$twres | ||
1360 | pxor $twtmp,$twtmp | ||
1361 | movdqa @tweak[5],@tweak[$i] | ||
1362 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | ||
1363 | pand $twmask,$twres # isolate carry and residue | ||
1364 | pcmpgtd @tweak[5],$twtmp # broadcast upper bits | ||
1365 | pxor $twres,@tweak[5] | ||
1366 | ___ | ||
1367 | } | ||
1368 | $code.=<<___; | ||
# $len runs 6 blocks ahead from here on; a borrow means that fewer than
# six whole blocks are available
1369 | sub \$16*6,$len | ||
1370 | jc .Lxts_enc_short | ||
1371 | |||
# the inner loop takes round keys in pairs and one more pair is unrolled
# after it (interleaved with tweak updates), hence rounds/2-1 passes
1372 | shr \$1,$rounds | ||
1373 | sub \$1,$rounds | ||
1374 | mov $rounds,$rnds_ | ||
1375 | jmp .Lxts_enc_grandloop | ||
1376 | |||
1377 | .align 16 | ||
1378 | .Lxts_enc_grandloop: | ||
# $twtmp is the same register as @tweak[4], so the pshufd has to consume
# the sign mask before the movdqa overwrites it with the fifth tweak;
# after the doubling @tweak[5] holds the sixth tweak
1379 | pshufd \$0x13,$twtmp,$twres | ||
1380 | movdqa @tweak[5],@tweak[4] | ||
1381 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | ||
1382 | movdqu `16*0`($inp),$inout0 # load input | ||
1383 | pand $twmask,$twres # isolate carry and residue | ||
1384 | movdqu `16*1`($inp),$inout1 | ||
1385 | pxor $twres,@tweak[5] | ||
1386 | |||
1387 | movdqu `16*2`($inp),$inout2 | ||
1388 | pxor @tweak[0],$inout0 # input^=tweak | ||
1389 | movdqu `16*3`($inp),$inout3 | ||
1390 | pxor @tweak[1],$inout1 | ||
1391 | movdqu `16*4`($inp),$inout4 | ||
1392 | pxor @tweak[2],$inout2 | ||
1393 | movdqu `16*5`($inp),$inout5 | ||
1394 | lea `16*6`($inp),$inp | ||
1395 | pxor @tweak[3],$inout3 | ||
1396 | $movkey ($key_),$rndkey0 | ||
1397 | pxor @tweak[4],$inout4 | ||
1398 | pxor @tweak[5],$inout5 | ||
1399 | |||
1400 | # inline _aesni_encrypt6 and interleave first and last rounds | ||
1401 | # with own code... | ||
1402 | $movkey 16($key_),$rndkey1 | ||
1403 | pxor $rndkey0,$inout0 | ||
1404 | pxor $rndkey0,$inout1 | ||
1405 | movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks | ||
1406 | aesenc $rndkey1,$inout0 | ||
1407 | lea 32($key_),$key | ||
1408 | pxor $rndkey0,$inout2 | ||
1409 | movdqa @tweak[1],`16*1`(%rsp) | ||
1410 | aesenc $rndkey1,$inout1 | ||
1411 | pxor $rndkey0,$inout3 | ||
1412 | movdqa @tweak[2],`16*2`(%rsp) | ||
1413 | aesenc $rndkey1,$inout2 | ||
1414 | pxor $rndkey0,$inout4 | ||
1415 | movdqa @tweak[3],`16*3`(%rsp) | ||
1416 | aesenc $rndkey1,$inout3 | ||
1417 | pxor $rndkey0,$inout5 | ||
1418 | $movkey ($key),$rndkey0 | ||
1419 | dec $rounds | ||
1420 | movdqa @tweak[4],`16*4`(%rsp) | ||
1421 | aesenc $rndkey1,$inout4 | ||
1422 | movdqa @tweak[5],`16*5`(%rsp) | ||
1423 | aesenc $rndkey1,$inout5 | ||
# all six tweaks are parked on the stack, so the register shared by $twtmp
# and @tweak[4] can go back to being the sign-mask temporary
1424 | pxor $twtmp,$twtmp | ||
1425 | pcmpgtd @tweak[5],$twtmp | ||
1426 | jmp .Lxts_enc_loop6_enter | ||
1427 | |||
1428 | .align 16 | ||
1429 | .Lxts_enc_loop6: | ||
1430 | aesenc $rndkey1,$inout0 | ||
1431 | aesenc $rndkey1,$inout1 | ||
1432 | dec $rounds | ||
1433 | aesenc $rndkey1,$inout2 | ||
1434 | aesenc $rndkey1,$inout3 | ||
1435 | aesenc $rndkey1,$inout4 | ||
1436 | aesenc $rndkey1,$inout5 | ||
1437 | .Lxts_enc_loop6_enter: | ||
1438 | $movkey 16($key),$rndkey1 | ||
1439 | aesenc $rndkey0,$inout0 | ||
1440 | aesenc $rndkey0,$inout1 | ||
1441 | lea 32($key),$key | ||
1442 | aesenc $rndkey0,$inout2 | ||
1443 | aesenc $rndkey0,$inout3 | ||
1444 | aesenc $rndkey0,$inout4 | ||
1445 | aesenc $rndkey0,$inout5 | ||
1446 | $movkey ($key),$rndkey0 | ||
1447 | jnz .Lxts_enc_loop6 | ||
1448 | |||
# the remaining four rounds are unrolled and interleaved with computing
# the tweaks for the next pass; the first doubling only advances
# @tweak[5] past the sixth tweak, the next four also refill @tweak[0]
# through @tweak[3]
1449 | pshufd \$0x13,$twtmp,$twres | ||
1450 | pxor $twtmp,$twtmp | ||
1451 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | ||
1452 | aesenc $rndkey1,$inout0 | ||
1453 | pand $twmask,$twres # isolate carry and residue | ||
1454 | aesenc $rndkey1,$inout1 | ||
1455 | pcmpgtd @tweak[5],$twtmp # broadcast upper bits | ||
1456 | aesenc $rndkey1,$inout2 | ||
1457 | pxor $twres,@tweak[5] | ||
1458 | aesenc $rndkey1,$inout3 | ||
1459 | aesenc $rndkey1,$inout4 | ||
1460 | aesenc $rndkey1,$inout5 | ||
1461 | $movkey 16($key),$rndkey1 | ||
1462 | |||
1463 | pshufd \$0x13,$twtmp,$twres | ||
1464 | pxor $twtmp,$twtmp | ||
1465 | movdqa @tweak[5],@tweak[0] | ||
1466 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | ||
1467 | aesenc $rndkey0,$inout0 | ||
1468 | pand $twmask,$twres # isolate carry and residue | ||
1469 | aesenc $rndkey0,$inout1 | ||
1470 | pcmpgtd @tweak[5],$twtmp # broadcast upper bits | ||
1471 | aesenc $rndkey0,$inout2 | ||
1472 | pxor $twres,@tweak[5] | ||
1473 | aesenc $rndkey0,$inout3 | ||
1474 | aesenc $rndkey0,$inout4 | ||
1475 | aesenc $rndkey0,$inout5 | ||
1476 | $movkey 32($key),$rndkey0 | ||
1477 | |||
1478 | pshufd \$0x13,$twtmp,$twres | ||
1479 | pxor $twtmp,$twtmp | ||
1480 | movdqa @tweak[5],@tweak[1] | ||
1481 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | ||
1482 | aesenc $rndkey1,$inout0 | ||
1483 | pand $twmask,$twres # isolate carry and residue | ||
1484 | aesenc $rndkey1,$inout1 | ||
1485 | pcmpgtd @tweak[5],$twtmp # broadcast upper bits | ||
1486 | aesenc $rndkey1,$inout2 | ||
1487 | pxor $twres,@tweak[5] | ||
1488 | aesenc $rndkey1,$inout3 | ||
1489 | aesenc $rndkey1,$inout4 | ||
1490 | aesenc $rndkey1,$inout5 | ||
1491 | |||
1492 | pshufd \$0x13,$twtmp,$twres | ||
1493 | pxor $twtmp,$twtmp | ||
1494 | movdqa @tweak[5],@tweak[2] | ||
1495 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | ||
1496 | aesenclast $rndkey0,$inout0 | ||
1497 | pand $twmask,$twres # isolate carry and residue | ||
1498 | aesenclast $rndkey0,$inout1 | ||
1499 | pcmpgtd @tweak[5],$twtmp # broadcast upper bits | ||
1500 | aesenclast $rndkey0,$inout2 | ||
1501 | pxor $twres,@tweak[5] | ||
1502 | aesenclast $rndkey0,$inout3 | ||
1503 | aesenclast $rndkey0,$inout4 | ||
1504 | aesenclast $rndkey0,$inout5 | ||
1505 | |||
1506 | pshufd \$0x13,$twtmp,$twres | ||
1507 | pxor $twtmp,$twtmp | ||
1508 | movdqa @tweak[5],@tweak[3] | ||
1509 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | ||
1510 | xorps `16*0`(%rsp),$inout0 # output^=tweak | ||
1511 | pand $twmask,$twres # isolate carry and residue | ||
1512 | xorps `16*1`(%rsp),$inout1 | ||
1513 | pcmpgtd @tweak[5],$twtmp # broadcast upper bits | ||
1514 | pxor $twres,@tweak[5] | ||
1515 | |||
1516 | xorps `16*2`(%rsp),$inout2 | ||
1517 | movups $inout0,`16*0`($out) # write output | ||
1518 | xorps `16*3`(%rsp),$inout3 | ||
1519 | movups $inout1,`16*1`($out) | ||
1520 | xorps `16*4`(%rsp),$inout4 | ||
1521 | movups $inout2,`16*2`($out) | ||
1522 | xorps `16*5`(%rsp),$inout5 | ||
1523 | movups $inout3,`16*3`($out) | ||
1524 | mov $rnds_,$rounds # restore $rounds | ||
1525 | movups $inout4,`16*4`($out) | ||
1526 | movups $inout5,`16*5`($out) | ||
1527 | lea `16*6`($out),$out | ||
1528 | sub \$16*6,$len | ||
1529 | jnc .Lxts_enc_grandloop | ||
1530 | |||
# NOTE(review): 2*n+3 undoes the earlier shr/sub pair only when the key's
# round count is odd, which is presumably guaranteed by the key setup
# code -- confirm
1531 | lea 3($rounds,$rounds),$rounds # restore original value | ||
1532 | mov $key_,$key # restore $key | ||
1533 | mov $rounds,$rnds_ # backup $rounds | ||
1534 | |||
1535 | .Lxts_enc_short: | ||
# undo the 6-block bias; zero means that no whole block is left
1536 | add \$16*6,$len | ||
1537 | jz .Lxts_enc_done | ||
1538 | |||
1539 | cmp \$0x20,$len | ||
1540 | jb .Lxts_enc_one | ||
1541 | je .Lxts_enc_two | ||
1542 | |||
1543 | cmp \$0x40,$len | ||
1544 | jb .Lxts_enc_three | ||
1545 | je .Lxts_enc_four | ||
1546 | |||
# five blocks: derive the fifth tweak (and advance @tweak[5] once more),
# then use the 6-way routine; the sixth lane is not loaded here and its
# result is not stored
1547 | pshufd \$0x13,$twtmp,$twres | ||
1548 | movdqa @tweak[5],@tweak[4] | ||
1549 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | ||
1550 | movdqu ($inp),$inout0 | ||
1551 | pand $twmask,$twres # isolate carry and residue | ||
1552 | movdqu 16*1($inp),$inout1 | ||
1553 | pxor $twres,@tweak[5] | ||
1554 | |||
1555 | movdqu 16*2($inp),$inout2 | ||
1556 | pxor @tweak[0],$inout0 | ||
1557 | movdqu 16*3($inp),$inout3 | ||
1558 | pxor @tweak[1],$inout1 | ||
1559 | movdqu 16*4($inp),$inout4 | ||
1560 | lea 16*5($inp),$inp | ||
1561 | pxor @tweak[2],$inout2 | ||
1562 | pxor @tweak[3],$inout3 | ||
1563 | pxor @tweak[4],$inout4 | ||
1564 | |||
1565 | call _aesni_encrypt6 | ||
1566 | |||
# in every tail variant the next unused tweak is moved into @tweak[0],
# where the stealing code at .Lxts_enc_done expects it
1567 | xorps @tweak[0],$inout0 | ||
1568 | movdqa @tweak[5],@tweak[0] | ||
1569 | xorps @tweak[1],$inout1 | ||
1570 | xorps @tweak[2],$inout2 | ||
1571 | movdqu $inout0,($out) | ||
1572 | xorps @tweak[3],$inout3 | ||
1573 | movdqu $inout1,16*1($out) | ||
1574 | xorps @tweak[4],$inout4 | ||
1575 | movdqu $inout2,16*2($out) | ||
1576 | movdqu $inout3,16*3($out) | ||
1577 | movdqu $inout4,16*4($out) | ||
1578 | lea 16*5($out),$out | ||
1579 | jmp .Lxts_enc_done | ||
1580 | |||
1581 | .align 16 | ||
1582 | .Lxts_enc_one: | ||
1583 | movups ($inp),$inout0 | ||
1584 | lea 16*1($inp),$inp | ||
1585 | xorps @tweak[0],$inout0 | ||
1586 | ___ | ||
1587 | &aesni_generate1("enc",$key,$rounds); | ||
1588 | $code.=<<___; | ||
1589 | xorps @tweak[0],$inout0 | ||
1590 | movdqa @tweak[1],@tweak[0] | ||
1591 | movups $inout0,($out) | ||
1592 | lea 16*1($out),$out | ||
1593 | jmp .Lxts_enc_done | ||
1594 | |||
1595 | .align 16 | ||
1596 | .Lxts_enc_two: | ||
1597 | movups ($inp),$inout0 | ||
1598 | movups 16($inp),$inout1 | ||
1599 | lea 32($inp),$inp | ||
1600 | xorps @tweak[0],$inout0 | ||
1601 | xorps @tweak[1],$inout1 | ||
1602 | |||
1603 | call _aesni_encrypt3 | ||
1604 | |||
1605 | xorps @tweak[0],$inout0 | ||
1606 | movdqa @tweak[2],@tweak[0] | ||
1607 | xorps @tweak[1],$inout1 | ||
1608 | movups $inout0,($out) | ||
1609 | movups $inout1,16*1($out) | ||
1610 | lea 16*2($out),$out | ||
1611 | jmp .Lxts_enc_done | ||
1612 | |||
1613 | .align 16 | ||
1614 | .Lxts_enc_three: | ||
1615 | movups ($inp),$inout0 | ||
1616 | movups 16*1($inp),$inout1 | ||
1617 | movups 16*2($inp),$inout2 | ||
1618 | lea 16*3($inp),$inp | ||
1619 | xorps @tweak[0],$inout0 | ||
1620 | xorps @tweak[1],$inout1 | ||
1621 | xorps @tweak[2],$inout2 | ||
1622 | |||
1623 | call _aesni_encrypt3 | ||
1624 | |||
1625 | xorps @tweak[0],$inout0 | ||
1626 | movdqa @tweak[3],@tweak[0] | ||
1627 | xorps @tweak[1],$inout1 | ||
1628 | xorps @tweak[2],$inout2 | ||
1629 | movups $inout0,($out) | ||
1630 | movups $inout1,16*1($out) | ||
1631 | movups $inout2,16*2($out) | ||
1632 | lea 16*3($out),$out | ||
1633 | jmp .Lxts_enc_done | ||
1634 | |||
1635 | .align 16 | ||
1636 | .Lxts_enc_four: | ||
1637 | movups ($inp),$inout0 | ||
1638 | movups 16*1($inp),$inout1 | ||
1639 | movups 16*2($inp),$inout2 | ||
1640 | xorps @tweak[0],$inout0 | ||
1641 | movups 16*3($inp),$inout3 | ||
1642 | lea 16*4($inp),$inp | ||
1643 | xorps @tweak[1],$inout1 | ||
1644 | xorps @tweak[2],$inout2 | ||
1645 | xorps @tweak[3],$inout3 | ||
1646 | |||
1647 | call _aesni_encrypt4 | ||
1648 | |||
# the fifth tweak was never copied out on this path, it still sits in
# @tweak[5]
1649 | xorps @tweak[0],$inout0 | ||
1650 | movdqa @tweak[5],@tweak[0] | ||
1651 | xorps @tweak[1],$inout1 | ||
1652 | xorps @tweak[2],$inout2 | ||
1653 | movups $inout0,($out) | ||
1654 | xorps @tweak[3],$inout3 | ||
1655 | movups $inout1,16*1($out) | ||
1656 | movups $inout2,16*2($out) | ||
1657 | movups $inout3,16*3($out) | ||
1658 | lea 16*4($out),$out | ||
1659 | jmp .Lxts_enc_done | ||
1660 | |||
1661 | .align 16 | ||
1662 | .Lxts_enc_done: | ||
# $len_ still holds the original byte length; its low four bits are the
# size of the trailing partial block, zero meaning nothing to steal
1663 | and \$15,$len_ | ||
1664 | jz .Lxts_enc_ret | ||
1665 | mov $len_,$len | ||
1666 | |||
1667 | .Lxts_enc_steal: | ||
# byte-wise swap: each trailing input byte replaces a byte of the last
# full output block, and the byte it displaces becomes part of the short
# final output block
1668 | movzb ($inp),%eax # borrow $rounds ... | ||
1669 | movzb -16($out),%ecx # ... and $key | ||
1670 | lea 1($inp),$inp | ||
1671 | mov %al,-16($out) | ||
1672 | mov %cl,0($out) | ||
1673 | lea 1($out),$out | ||
1674 | sub \$1,$len | ||
1675 | jnz .Lxts_enc_steal | ||
1676 | |||
1677 | sub $len_,$out # rewind $out | ||
1678 | mov $key_,$key # restore $key | ||
1679 | mov $rnds_,$rounds # restore $rounds | ||
1680 | |||
# the patched block is encrypted once more, using the tweak left in
# @tweak[0] by the tail code
1681 | movups -16($out),$inout0 | ||
1682 | xorps @tweak[0],$inout0 | ||
1683 | ___ | ||
1684 | &aesni_generate1("enc",$key,$rounds); | ||
1685 | $code.=<<___; | ||
1686 | xorps @tweak[0],$inout0 | ||
1687 | movups $inout0,-16($out) | ||
1688 | |||
1689 | .Lxts_enc_ret: | ||
1690 | ___ | ||
# Win64 only: reload %xmm6-%xmm15 from the spill area.
1691 | $code.=<<___ if ($win64); | ||
1692 | movaps 0x60(%rsp),%xmm6 | ||
1693 | movaps 0x70(%rsp),%xmm7 | ||
1694 | movaps 0x80(%rsp),%xmm8 | ||
1695 | movaps 0x90(%rsp),%xmm9 | ||
1696 | movaps 0xa0(%rsp),%xmm10 | ||
1697 | movaps 0xb0(%rsp),%xmm11 | ||
1698 | movaps 0xc0(%rsp),%xmm12 | ||
1699 | movaps 0xd0(%rsp),%xmm13 | ||
1700 | movaps 0xe0(%rsp),%xmm14 | ||
1701 | movaps 0xf0(%rsp),%xmm15 | ||
1702 | ___ | ||
1703 | $code.=<<___; | ||
# drop the frame through %rbp, which points at the saved %rbp slot
1704 | lea (%rbp),%rsp | ||
1705 | pop %rbp | ||
1706 | .Lxts_enc_epilogue: | ||
1707 | ret | ||
1708 | .size aesni_xts_encrypt,.-aesni_xts_encrypt | ||
1709 | ___ | ||
1710 | |||
# aesni_xts_decrypt: AES-NI XTS decryption entry point.  Its prototype comment
# is outside this section; the code below consumes $inp, $out, $len, $key
# (data key), $key2 (tweak key) and $ivp (pointer to the clear-text tweak).
# Prologue: %rax keeps the caller's %rsp, %rbp is saved and $frame_size bytes
# of scratch are reserved (the first 6*16 bytes later hold spilled tweaks).
1711 | $code.=<<___; | ||
1712 | .globl aesni_xts_decrypt | ||
1713 | .type aesni_xts_decrypt,\@function,6 | ||
1714 | .align 16 | ||
1715 | aesni_xts_decrypt: | ||
1716 | _CET_ENDBR | ||
1717 | lea (%rsp),%rax | ||
1718 | push %rbp | ||
1719 | sub \$$frame_size,%rsp | ||
1720 | ___ | ||
# Win64 only: spill %xmm6-%xmm15 (non-volatile in that ABI) at 0x60..0xf0(%rsp);
# the matching reloads sit right after .Lxts_dec_ret.
1721 | $code.=<<___ if ($win64); | ||
1722 | movaps %xmm6,0x60(%rsp) | ||
1723 | movaps %xmm7,0x70(%rsp) | ||
1724 | movaps %xmm8,0x80(%rsp) | ||
1725 | movaps %xmm9,0x90(%rsp) | ||
1726 | movaps %xmm10,0xa0(%rsp) | ||
1727 | movaps %xmm11,0xb0(%rsp) | ||
1728 | movaps %xmm12,0xc0(%rsp) | ||
1729 | movaps %xmm13,0xd0(%rsp) | ||
1730 | movaps %xmm14,0xe0(%rsp) | ||
1731 | movaps %xmm15,0xf0(%rsp) | ||
1732 | .Lxts_dec_body: | ||
1733 | ___ | ||
# %rbp = caller's %rsp - 8, i.e. the slot holding the pushed %rbp, so the
# epilogue can unwind with "lea (%rbp),%rsp; pop %rbp".  Then fetch the tweak
# and both round counts (read from byte offset 240 of each key schedule).
1734 | $code.=<<___; | ||
1735 | lea -8(%rax),%rbp | ||
1736 | movups ($ivp),@tweak[5] # load clear-text tweak | ||
1737 | mov 240($key2),$rounds # key2->rounds | ||
1738 | mov 240($key),$rnds_ # key1->rounds | ||
1739 | ___ | ||
1740 | # generate the tweak | ||
# NOTE(review): aesni_generate1 is defined outside this section; judging by its
# arguments it emits a single-block encryption of @tweak[5] under $key2 --
# confirm against its definition.
1741 | &aesni_generate1("enc",$key2,$rounds,@tweak[5]); | ||
# Length bookkeeping for ciphertext stealing: when $len is not a multiple of
# 16, one extra full block is held back (len -= 16).  $len_ keeps that adjusted
# byte count (its low four bits are the tail size), while $len is rounded down
# to whole blocks.  $key_ and $rnds_ are the pristine copies of the data-key
# pointer and its round count.  Finally the tweak-doubling mask is loaded and
# the sign mask of the tweak is precomputed: comparing zero against the tweak
# with pcmpgtd yields all-ones in every dword whose top bit is set.
1742 | $code.=<<___; | ||
1743 | xor %eax,%eax # if ($len%16) len-=16; | ||
1744 | test \$15,$len | ||
1745 | setnz %al | ||
1746 | shl \$4,%rax | ||
1747 | sub %rax,$len | ||
1748 | |||
1749 | mov $key,$key_ # backup $key | ||
1750 | mov $rnds_,$rounds # $rounds = key1->rounds ($rnds_ is the backup) | ||
1751 | mov $len,$len_ # backup $len | ||
1752 | and \$-16,$len | ||
1753 | |||
1754 | movdqa .Lxts_magic(%rip),$twmask | ||
1755 | pxor $twtmp,$twtmp | ||
1756 | pcmpgtd @tweak[5],$twtmp # broadcast upper bits | ||
1757 | ___ | ||
# Emit the tweak-update sequence four times.  Copy $i saves the current tweak
# into @tweak[$i] and then advances @tweak[5] by one doubling step: paddq
# doubles each 64-bit half, while the sign mask computed by the previous step
# (shuffled with pshufd 0x13 and ANDed with $twmask) is XORed back in.  The
# sign mask for the following step is taken from the doubled value before
# that final XOR.
# NOTE(review): .Lxts_magic is defined outside this section; presumably it
# holds the carry bit between the two halves and the GF(2^128) reduction
# constant -- confirm.
1758 | for ($i=0;$i<4;$i++) { | ||
1759 | $code.=<<___; | ||
1760 | pshufd \$0x13,$twtmp,$twres | ||
1761 | pxor $twtmp,$twtmp | ||
1762 | movdqa @tweak[5],@tweak[$i] | ||
1763 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | ||
1764 | pand $twmask,$twres # isolate carry and residue | ||
1765 | pcmpgtd @tweak[5],$twtmp # broadcast upper bits | ||
1766 | pxor $twres,@tweak[5] | ||
1767 | ___ | ||
1768 | } | ||
# Bulk loop plus dispatch for the short tail.  Each .Lxts_dec_grandloop pass
# handles six blocks: the first round and the last four rounds are open-coded
# so they can be interleaved with spilling the tweaks to the stack and with
# the doubling steps that produce the next pass's tweaks.  Fewer than six
# remaining blocks go through .Lxts_dec_short.  On entry @tweak[0] to
# @tweak[3] are the next four tweaks, @tweak[5] is the one after them and
# $twtmp holds its sign mask.
1769 | $code.=<<___; | ||
1770 | sub \$16*6,$len | ||
1771 | jc .Lxts_dec_short | ||
1772 | |||
# loop counter for .Lxts_dec_loop6 is (rounds>>1)-1, remembered in $rnds_
1773 | shr \$1,$rounds | ||
1774 | sub \$1,$rounds | ||
1775 | mov $rounds,$rnds_ | ||
1776 | jmp .Lxts_dec_grandloop | ||
1777 | |||
1778 | .align 16 | ||
1779 | .Lxts_dec_grandloop: | ||
# derive the fifth and sixth tweaks, load six blocks and whiten them
1780 | pshufd \$0x13,$twtmp,$twres | ||
1781 | movdqa @tweak[5],@tweak[4] | ||
1782 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | ||
1783 | movdqu `16*0`($inp),$inout0 # load input | ||
1784 | pand $twmask,$twres # isolate carry and residue | ||
1785 | movdqu `16*1`($inp),$inout1 | ||
1786 | pxor $twres,@tweak[5] | ||
1787 | |||
1788 | movdqu `16*2`($inp),$inout2 | ||
1789 | pxor @tweak[0],$inout0 # input^=tweak | ||
1790 | movdqu `16*3`($inp),$inout3 | ||
1791 | pxor @tweak[1],$inout1 | ||
1792 | movdqu `16*4`($inp),$inout4 | ||
1793 | pxor @tweak[2],$inout2 | ||
1794 | movdqu `16*5`($inp),$inout5 | ||
1795 | lea `16*6`($inp),$inp | ||
1796 | pxor @tweak[3],$inout3 | ||
1797 | $movkey ($key_),$rndkey0 | ||
1798 | pxor @tweak[4],$inout4 | ||
1799 | pxor @tweak[5],$inout5 | ||
1800 | |||
1801 | # inline _aesni_decrypt6 and interleave first and last rounds | ||
1802 | # with own code... | ||
1803 | $movkey 16($key_),$rndkey1 | ||
1804 | pxor $rndkey0,$inout0 | ||
1805 | pxor $rndkey0,$inout1 | ||
1806 | movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks | ||
1807 | aesdec $rndkey1,$inout0 | ||
1808 | lea 32($key_),$key | ||
1809 | pxor $rndkey0,$inout2 | ||
1810 | movdqa @tweak[1],`16*1`(%rsp) | ||
1811 | aesdec $rndkey1,$inout1 | ||
1812 | pxor $rndkey0,$inout3 | ||
1813 | movdqa @tweak[2],`16*2`(%rsp) | ||
1814 | aesdec $rndkey1,$inout2 | ||
1815 | pxor $rndkey0,$inout4 | ||
1816 | movdqa @tweak[3],`16*3`(%rsp) | ||
1817 | aesdec $rndkey1,$inout3 | ||
1818 | pxor $rndkey0,$inout5 | ||
1819 | $movkey ($key),$rndkey0 | ||
# ZF produced by this dec is consumed by the jnz that closes .Lxts_dec_loop6;
# the SSE/AES, lea and mov instructions in between leave EFLAGS untouched
1820 | dec $rounds | ||
1821 | movdqa @tweak[4],`16*4`(%rsp) | ||
1822 | aesdec $rndkey1,$inout4 | ||
1823 | movdqa @tweak[5],`16*5`(%rsp) | ||
1824 | aesdec $rndkey1,$inout5 | ||
1825 | pxor $twtmp,$twtmp | ||
1826 | pcmpgtd @tweak[5],$twtmp | ||
1827 | jmp .Lxts_dec_loop6_enter | ||
1828 | |||
1829 | .align 16 | ||
1830 | .Lxts_dec_loop6: | ||
1831 | aesdec $rndkey1,$inout0 | ||
1832 | aesdec $rndkey1,$inout1 | ||
1833 | dec $rounds | ||
1834 | aesdec $rndkey1,$inout2 | ||
1835 | aesdec $rndkey1,$inout3 | ||
1836 | aesdec $rndkey1,$inout4 | ||
1837 | aesdec $rndkey1,$inout5 | ||
1838 | .Lxts_dec_loop6_enter: | ||
1839 | $movkey 16($key),$rndkey1 | ||
1840 | aesdec $rndkey0,$inout0 | ||
1841 | aesdec $rndkey0,$inout1 | ||
1842 | lea 32($key),$key | ||
1843 | aesdec $rndkey0,$inout2 | ||
1844 | aesdec $rndkey0,$inout3 | ||
1845 | aesdec $rndkey0,$inout4 | ||
1846 | aesdec $rndkey0,$inout5 | ||
1847 | $movkey ($key),$rndkey0 | ||
1848 | jnz .Lxts_dec_loop6 | ||
1849 | |||
# Final four rounds, interleaved with five doubling steps: the first steps
# past the sixth tweak (already spilled to the stack), the next four capture
# the following tweaks into @tweak[0], @tweak[1], @tweak[2] and @tweak[3].
1850 | pshufd \$0x13,$twtmp,$twres | ||
1851 | pxor $twtmp,$twtmp | ||
1852 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | ||
1853 | aesdec $rndkey1,$inout0 | ||
1854 | pand $twmask,$twres # isolate carry and residue | ||
1855 | aesdec $rndkey1,$inout1 | ||
1856 | pcmpgtd @tweak[5],$twtmp # broadcast upper bits | ||
1857 | aesdec $rndkey1,$inout2 | ||
1858 | pxor $twres,@tweak[5] | ||
1859 | aesdec $rndkey1,$inout3 | ||
1860 | aesdec $rndkey1,$inout4 | ||
1861 | aesdec $rndkey1,$inout5 | ||
1862 | $movkey 16($key),$rndkey1 | ||
1863 | |||
1864 | pshufd \$0x13,$twtmp,$twres | ||
1865 | pxor $twtmp,$twtmp | ||
1866 | movdqa @tweak[5],@tweak[0] | ||
1867 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | ||
1868 | aesdec $rndkey0,$inout0 | ||
1869 | pand $twmask,$twres # isolate carry and residue | ||
1870 | aesdec $rndkey0,$inout1 | ||
1871 | pcmpgtd @tweak[5],$twtmp # broadcast upper bits | ||
1872 | aesdec $rndkey0,$inout2 | ||
1873 | pxor $twres,@tweak[5] | ||
1874 | aesdec $rndkey0,$inout3 | ||
1875 | aesdec $rndkey0,$inout4 | ||
1876 | aesdec $rndkey0,$inout5 | ||
1877 | $movkey 32($key),$rndkey0 | ||
1878 | |||
1879 | pshufd \$0x13,$twtmp,$twres | ||
1880 | pxor $twtmp,$twtmp | ||
1881 | movdqa @tweak[5],@tweak[1] | ||
1882 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | ||
1883 | aesdec $rndkey1,$inout0 | ||
1884 | pand $twmask,$twres # isolate carry and residue | ||
1885 | aesdec $rndkey1,$inout1 | ||
1886 | pcmpgtd @tweak[5],$twtmp # broadcast upper bits | ||
1887 | aesdec $rndkey1,$inout2 | ||
1888 | pxor $twres,@tweak[5] | ||
1889 | aesdec $rndkey1,$inout3 | ||
1890 | aesdec $rndkey1,$inout4 | ||
1891 | aesdec $rndkey1,$inout5 | ||
1892 | |||
1893 | pshufd \$0x13,$twtmp,$twres | ||
1894 | pxor $twtmp,$twtmp | ||
1895 | movdqa @tweak[5],@tweak[2] | ||
1896 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | ||
1897 | aesdeclast $rndkey0,$inout0 | ||
1898 | pand $twmask,$twres # isolate carry and residue | ||
1899 | aesdeclast $rndkey0,$inout1 | ||
1900 | pcmpgtd @tweak[5],$twtmp # broadcast upper bits | ||
1901 | aesdeclast $rndkey0,$inout2 | ||
1902 | pxor $twres,@tweak[5] | ||
1903 | aesdeclast $rndkey0,$inout3 | ||
1904 | aesdeclast $rndkey0,$inout4 | ||
1905 | aesdeclast $rndkey0,$inout5 | ||
1906 | |||
1907 | pshufd \$0x13,$twtmp,$twres | ||
1908 | pxor $twtmp,$twtmp | ||
1909 | movdqa @tweak[5],@tweak[3] | ||
1910 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | ||
1911 | xorps `16*0`(%rsp),$inout0 # output^=tweak | ||
1912 | pand $twmask,$twres # isolate carry and residue | ||
1913 | xorps `16*1`(%rsp),$inout1 | ||
1914 | pcmpgtd @tweak[5],$twtmp # broadcast upper bits | ||
1915 | pxor $twres,@tweak[5] | ||
1916 | |||
1917 | xorps `16*2`(%rsp),$inout2 | ||
1918 | movups $inout0,`16*0`($out) # write output | ||
1919 | xorps `16*3`(%rsp),$inout3 | ||
1920 | movups $inout1,`16*1`($out) | ||
1921 | xorps `16*4`(%rsp),$inout4 | ||
1922 | movups $inout2,`16*2`($out) | ||
1923 | xorps `16*5`(%rsp),$inout5 | ||
1924 | movups $inout3,`16*3`($out) | ||
1925 | mov $rnds_,$rounds # restore $rounds | ||
1926 | movups $inout4,`16*4`($out) | ||
1927 | movups $inout5,`16*5`($out) | ||
1928 | lea `16*6`($out),$out | ||
1929 | sub \$16*6,$len | ||
1930 | jnc .Lxts_dec_grandloop | ||
1931 | |||
# 2*n+3 undoes the (r>>1)-1 above when r is odd (the key setup code in this
# file stores 9, 11 or 13)
1932 | lea 3($rounds,$rounds),$rounds # restore original value | ||
1933 | mov $key_,$key # restore $key | ||
1934 | mov $rounds,$rnds_ # backup $rounds | ||
1935 | |||
1936 | .Lxts_dec_short: | ||
# undo the bias: $len becomes the remaining whole-block byte count (at most
# five blocks); dispatch on it
1937 | add \$16*6,$len | ||
1938 | jz .Lxts_dec_done | ||
1939 | |||
1940 | cmp \$0x20,$len | ||
1941 | jb .Lxts_dec_one | ||
1942 | je .Lxts_dec_two | ||
1943 | |||
1944 | cmp \$0x40,$len | ||
1945 | jb .Lxts_dec_three | ||
1946 | je .Lxts_dec_four | ||
1947 | |||
# five blocks: build @tweak[4] and step @tweak[5]; the sixth lane of
# _aesni_decrypt6 ($inout5) is never loaded here and its result is not stored
1948 | pshufd \$0x13,$twtmp,$twres | ||
1949 | movdqa @tweak[5],@tweak[4] | ||
1950 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | ||
1951 | movdqu ($inp),$inout0 | ||
1952 | pand $twmask,$twres # isolate carry and residue | ||
1953 | movdqu 16*1($inp),$inout1 | ||
1954 | pxor $twres,@tweak[5] | ||
1955 | |||
1956 | movdqu 16*2($inp),$inout2 | ||
1957 | pxor @tweak[0],$inout0 | ||
1958 | movdqu 16*3($inp),$inout3 | ||
1959 | pxor @tweak[1],$inout1 | ||
1960 | movdqu 16*4($inp),$inout4 | ||
1961 | lea 16*5($inp),$inp | ||
1962 | pxor @tweak[2],$inout2 | ||
1963 | pxor @tweak[3],$inout3 | ||
1964 | pxor @tweak[4],$inout4 | ||
1965 | |||
1966 | call _aesni_decrypt6 | ||
1967 | |||
1968 | xorps @tweak[0],$inout0 | ||
1969 | xorps @tweak[1],$inout1 | ||
1970 | xorps @tweak[2],$inout2 | ||
1971 | movdqu $inout0,($out) | ||
1972 | xorps @tweak[3],$inout3 | ||
1973 | movdqu $inout1,16*1($out) | ||
1974 | xorps @tweak[4],$inout4 | ||
1975 | movdqu $inout2,16*2($out) | ||
1976 | pxor $twtmp,$twtmp | ||
1977 | movdqu $inout3,16*3($out) | ||
1978 | pcmpgtd @tweak[5],$twtmp | ||
1979 | movdqu $inout4,16*4($out) | ||
1980 | lea 16*5($out),$out | ||
1981 | pshufd \$0x13,$twtmp,@tweak[1] # $twres | ||
1982 | and \$15,$len_ | ||
1983 | jz .Lxts_dec_ret | ||
1984 | |||
# a partial block follows: @tweak[0] = next unused tweak, @tweak[1] = the one
# after it; .Lxts_dec_done2 applies @tweak[1] first and @tweak[0] last
1985 | movdqa @tweak[5],@tweak[0] | ||
1986 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | ||
1987 | pand $twmask,@tweak[1] # isolate carry and residue | ||
1988 | pxor @tweak[5],@tweak[1] | ||
1989 | jmp .Lxts_dec_done2 | ||
1990 | |||
1991 | .align 16 | ||
1992 | .Lxts_dec_one: | ||
1993 | movups ($inp),$inout0 | ||
1994 | lea 16*1($inp),$inp | ||
1995 | xorps @tweak[0],$inout0 | ||
1996 | ___ | ||
# Remainder of .Lxts_dec_one: the helper call emits the AES work for the
# block already loaded and whitened in $inout0 (aesni_generate1 is defined
# outside this section).
1997 | &aesni_generate1("dec",$key,$rounds); | ||
# Short-tail cases for one to four whole blocks.  Each case un-whitens and
# stores its blocks, then shifts the next two unused tweaks into @tweak[0]
# and @tweak[1] so that .Lxts_dec_done finds them in fixed registers.
# .Lxts_dec_done returns unless $len_ has a non-zero byte remainder; otherwise
# .Lxts_dec_done2 decrypts the held-back full block with @tweak[1] (the later
# of the two tweaks) and stores it at ($out), ready for the stealing loop.
1998 | $code.=<<___; | ||
1999 | xorps @tweak[0],$inout0 | ||
2000 | movdqa @tweak[1],@tweak[0] | ||
2001 | movups $inout0,($out) | ||
2002 | movdqa @tweak[2],@tweak[1] | ||
2003 | lea 16*1($out),$out | ||
2004 | jmp .Lxts_dec_done | ||
2005 | |||
2006 | .align 16 | ||
2007 | .Lxts_dec_two: | ||
# only two of the three lanes of _aesni_decrypt3 are loaded and stored here
2008 | movups ($inp),$inout0 | ||
2009 | movups 16($inp),$inout1 | ||
2010 | lea 32($inp),$inp | ||
2011 | xorps @tweak[0],$inout0 | ||
2012 | xorps @tweak[1],$inout1 | ||
2013 | |||
2014 | call _aesni_decrypt3 | ||
2015 | |||
2016 | xorps @tweak[0],$inout0 | ||
2017 | movdqa @tweak[2],@tweak[0] | ||
2018 | xorps @tweak[1],$inout1 | ||
2019 | movdqa @tweak[3],@tweak[1] | ||
2020 | movups $inout0,($out) | ||
2021 | movups $inout1,16*1($out) | ||
2022 | lea 16*2($out),$out | ||
2023 | jmp .Lxts_dec_done | ||
2024 | |||
2025 | .align 16 | ||
2026 | .Lxts_dec_three: | ||
2027 | movups ($inp),$inout0 | ||
2028 | movups 16*1($inp),$inout1 | ||
2029 | movups 16*2($inp),$inout2 | ||
2030 | lea 16*3($inp),$inp | ||
2031 | xorps @tweak[0],$inout0 | ||
2032 | xorps @tweak[1],$inout1 | ||
2033 | xorps @tweak[2],$inout2 | ||
2034 | |||
2035 | call _aesni_decrypt3 | ||
2036 | |||
# @tweak[3] is the fourth tweak and @tweak[5] still holds the fifth
2037 | xorps @tweak[0],$inout0 | ||
2038 | movdqa @tweak[3],@tweak[0] | ||
2039 | xorps @tweak[1],$inout1 | ||
2040 | movdqa @tweak[5],@tweak[1] | ||
2041 | xorps @tweak[2],$inout2 | ||
2042 | movups $inout0,($out) | ||
2043 | movups $inout1,16*1($out) | ||
2044 | movups $inout2,16*2($out) | ||
2045 | lea 16*3($out),$out | ||
2046 | jmp .Lxts_dec_done | ||
2047 | |||
2048 | .align 16 | ||
2049 | .Lxts_dec_four: | ||
# one more doubling step so that two tweaks beyond the four used here exist
2050 | pshufd \$0x13,$twtmp,$twres | ||
2051 | movdqa @tweak[5],@tweak[4] | ||
2052 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | ||
2053 | movups ($inp),$inout0 | ||
2054 | pand $twmask,$twres # isolate carry and residue | ||
2055 | movups 16*1($inp),$inout1 | ||
2056 | pxor $twres,@tweak[5] | ||
2057 | |||
2058 | movups 16*2($inp),$inout2 | ||
2059 | xorps @tweak[0],$inout0 | ||
2060 | movups 16*3($inp),$inout3 | ||
2061 | lea 16*4($inp),$inp | ||
2062 | xorps @tweak[1],$inout1 | ||
2063 | xorps @tweak[2],$inout2 | ||
2064 | xorps @tweak[3],$inout3 | ||
2065 | |||
2066 | call _aesni_decrypt4 | ||
2067 | |||
2068 | xorps @tweak[0],$inout0 | ||
2069 | movdqa @tweak[4],@tweak[0] | ||
2070 | xorps @tweak[1],$inout1 | ||
2071 | movdqa @tweak[5],@tweak[1] | ||
2072 | xorps @tweak[2],$inout2 | ||
2073 | movups $inout0,($out) | ||
2074 | xorps @tweak[3],$inout3 | ||
2075 | movups $inout1,16*1($out) | ||
2076 | movups $inout2,16*2($out) | ||
2077 | movups $inout3,16*3($out) | ||
2078 | lea 16*4($out),$out | ||
2079 | jmp .Lxts_dec_done | ||
2080 | |||
2081 | .align 16 | ||
2082 | .Lxts_dec_done: | ||
# low four bits of $len_ = size of the trailing partial block (0 means none)
2083 | and \$15,$len_ | ||
2084 | jz .Lxts_dec_ret | ||
2085 | .Lxts_dec_done2: | ||
2086 | mov $len_,$len | ||
2087 | mov $key_,$key # restore $key | ||
2088 | mov $rnds_,$rounds # restore $rounds | ||
2089 | |||
2090 | movups ($inp),$inout0 | ||
2091 | xorps @tweak[1],$inout0 | ||
2092 | ___ | ||
# AES work for the held-back full block that was loaded and whitened with
# @tweak[1] just before this call (aesni_generate1 is defined outside this
# section).
2093 | &aesni_generate1("dec",$key,$rounds); | ||
# Ciphertext stealing.  The block decrypted above is stored at ($out).  The
# byte loop then runs $len times (the tail size): byte i of the trailing
# partial input at 16($inp) replaces byte i of that block, and the byte it
# displaces becomes byte i of the final partial output at 16($out).  %eax
# and %ecx serve as byte scratch, which is why $rounds and $key are reloaded
# afterwards.  $out is rewound by the tail size and the patched block is
# loaded and whitened with @tweak[0] for one last decryption.
2094 | $code.=<<___; | ||
2095 | xorps @tweak[1],$inout0 | ||
2096 | movups $inout0,($out) | ||
2097 | |||
2098 | .Lxts_dec_steal: | ||
2099 | movzb 16($inp),%eax # borrow $rounds ... | ||
2100 | movzb ($out),%ecx # ... and $key | ||
2101 | lea 1($inp),$inp | ||
2102 | mov %al,($out) | ||
2103 | mov %cl,16($out) | ||
2104 | lea 1($out),$out | ||
2105 | sub \$1,$len | ||
2106 | jnz .Lxts_dec_steal | ||
2107 | |||
2108 | sub $len_,$out # rewind $out | ||
2109 | mov $key_,$key # restore $key | ||
2110 | mov $rnds_,$rounds # restore $rounds | ||
2111 | |||
2112 | movups ($out),$inout0 | ||
2113 | xorps @tweak[0],$inout0 | ||
2114 | ___ | ||
# AES work for the patched block prepared by the stealing loop (loaded from
# ($out) and whitened with @tweak[0]); aesni_generate1 is defined outside
# this section.
2115 | &aesni_generate1("dec",$key,$rounds); | ||
# Un-whiten and store that block, then fall into the common exit.
2116 | $code.=<<___; | ||
2117 | xorps @tweak[0],$inout0 | ||
2118 | movups $inout0,($out) | ||
2119 | |||
2120 | .Lxts_dec_ret: | ||
2121 | ___ | ||
# Win64 only: reload %xmm6-%xmm15 from the slots filled in the prologue.
2122 | $code.=<<___ if ($win64); | ||
2123 | movaps 0x60(%rsp),%xmm6 | ||
2124 | movaps 0x70(%rsp),%xmm7 | ||
2125 | movaps 0x80(%rsp),%xmm8 | ||
2126 | movaps 0x90(%rsp),%xmm9 | ||
2127 | movaps 0xa0(%rsp),%xmm10 | ||
2128 | movaps 0xb0(%rsp),%xmm11 | ||
2129 | movaps 0xc0(%rsp),%xmm12 | ||
2130 | movaps 0xd0(%rsp),%xmm13 | ||
2131 | movaps 0xe0(%rsp),%xmm14 | ||
2132 | movaps 0xf0(%rsp),%xmm15 | ||
2133 | ___ | ||
# Epilogue: %rbp points at the saved %rbp slot (set up in the prologue as
# entry %rsp - 8), so this drops the whole frame in one step, restores %rbp
# and returns.
2134 | $code.=<<___; | ||
2135 | lea (%rbp),%rsp | ||
2136 | pop %rbp | ||
2137 | .Lxts_dec_epilogue: | ||
2138 | ret | ||
2139 | .size aesni_xts_decrypt,.-aesni_xts_decrypt | ||
2140 | ___ | ||
2141 | } }} | ||
2142 | |||
2143 | ######################################################################## | ||
2144 | # void $PREFIX_cbc_encrypt (const void *inp, void *out, | ||
2145 | # size_t length, const AES_KEY *key, | ||
2146 | # unsigned char *ivp,const int enc); | ||
2147 | { | ||
# Decrypt-side stack frame: 0x10 bytes of scratch at (%rsp) (saved IV or the
# final partial block) plus, on Win64, 0x40 bytes for %xmm6-%xmm9.
2148 | my $frame_size = 0x10 + ($win64?0x40:0); # used in decrypt | ||
# Entry and head of the CBC-encrypt loop.  A zero length returns at once
# through .Lcbc_ret, before any frame exists.  $rnds_ and $key_ hold pristine
# copies of the round count and key pointer.  The sixth argument (%r9d)
# selects encrypt (non-zero) or decrypt (zero).
# Encrypt: $inout0 is the chaining value, seeded from the IV.  Inputs shorter
# than one block go straight to .Lcbc_enc_tail; otherwise $len is biased by
# -16 so the loop's sub/jnc pair exits after the last whole block.  Each pass
# loads the next plaintext block into $inout1.
# NOTE(review): the xorps is commented out and both $inout0 and $inout1 are
# passed to aesni_generate1 (defined outside this section), so the helper
# presumably folds the plaintext XOR into its first round -- confirm.
2149 | $code.=<<___; | ||
2150 | .globl ${PREFIX}_cbc_encrypt | ||
2151 | .type ${PREFIX}_cbc_encrypt,\@function,6 | ||
2152 | .align 16 | ||
2153 | ${PREFIX}_cbc_encrypt: | ||
2154 | _CET_ENDBR | ||
2155 | test $len,$len # check length | ||
2156 | jz .Lcbc_ret | ||
2157 | |||
2158 | mov 240($key),$rnds_ # key->rounds | ||
2159 | mov $key,$key_ # backup $key | ||
2160 | test %r9d,%r9d # 6th argument | ||
2161 | jz .Lcbc_decrypt | ||
2162 | #--------------------------- CBC ENCRYPT ------------------------------# | ||
2163 | movups ($ivp),$inout0 # load iv as initial state | ||
2164 | mov $rnds_,$rounds | ||
2165 | cmp \$16,$len | ||
2166 | jb .Lcbc_enc_tail | ||
2167 | sub \$16,$len | ||
2168 | jmp .Lcbc_enc_loop | ||
2169 | .align 16 | ||
2170 | .Lcbc_enc_loop: | ||
2171 | movups ($inp),$inout1 # load input | ||
2172 | lea 16($inp),$inp | ||
2173 | #xorps $inout1,$inout0 | ||
2174 | ___ | ||
# AES work for one CBC-encrypt block: $inout0 is the chaining value and
# $inout1 the freshly loaded plaintext (aesni_generate1 is defined outside
# this section).
2175 | &aesni_generate1("enc",$key,$rounds,$inout0,$inout1); | ||
# Rest of the encrypt loop, the partial-block tail and the decrypt prologue.
# After each block the clobbered $rounds and $key are restored and the
# ciphertext (which stays in $inout0 as the next chaining value) is stored.
# When the biased $len underflows, "add 16" recovers the leftover byte count:
# zero means done (the last ciphertext block is written back as the new IV),
# anything else goes to .Lcbc_enc_tail.  The tail copies the leftover input
# bytes to the output buffer, zero-fills up to 16 bytes, points both $inp and
# $out at that block and re-enters the loop for one in-place pass.
# Decrypt prologue: same frame scheme as the XTS routines in this file: %rax
# keeps the entry %rsp, %rbp is pushed and $frame_size bytes are reserved.
2176 | $code.=<<___; | ||
2177 | mov $rnds_,$rounds # restore $rounds | ||
2178 | mov $key_,$key # restore $key | ||
2179 | movups $inout0,0($out) # store output | ||
2180 | lea 16($out),$out | ||
2181 | sub \$16,$len | ||
2182 | jnc .Lcbc_enc_loop | ||
2183 | add \$16,$len | ||
2184 | jnz .Lcbc_enc_tail | ||
2185 | movups $inout0,($ivp) | ||
2186 | jmp .Lcbc_ret | ||
2187 | |||
2188 | .Lcbc_enc_tail: | ||
2189 | mov $len,%rcx # zaps $key | ||
2190 | xchg $inp,$out # $inp is %rsi and $out is %rdi now | ||
2191 | .long 0x9066A4F3 # rep movsb | ||
2192 | mov \$16,%ecx # zero tail | ||
2193 | sub $len,%rcx | ||
2194 | xor %eax,%eax | ||
2195 | .long 0x9066AAF3 # rep stosb | ||
2196 | lea -16(%rdi),%rdi # rewind $out by 1 block | ||
2197 | mov $rnds_,$rounds # restore $rounds | ||
2198 | mov %rdi,%rsi # $inp and $out are the same | ||
2199 | mov $key_,$key # restore $key | ||
2200 | xor $len,$len # biased count 0, i.e. exactly one 16-byte block left | ||
2201 | jmp .Lcbc_enc_loop # one more spin | ||
2202 | #--------------------------- CBC DECRYPT ------------------------------# | ||
2203 | .align 16 | ||
2204 | .Lcbc_decrypt: | ||
2205 | lea (%rsp),%rax | ||
2206 | push %rbp | ||
2207 | sub \$$frame_size,%rsp | ||
2208 | ___ | ||
# Win64 only: spill %xmm6-%xmm9 above the 0x10-byte scratch slot.
2209 | $code.=<<___ if ($win64); | ||
2210 | movaps %xmm6,0x10(%rsp) | ||
2211 | movaps %xmm7,0x20(%rsp) | ||
2212 | movaps %xmm8,0x30(%rsp) | ||
2213 | movaps %xmm9,0x40(%rsp) | ||
2214 | .Lcbc_decrypt_body: | ||
2215 | ___ | ||
# CBC-decrypt bulk loop (eight blocks per pass) and the start of the tail.
# Inputs of at most 0x70 bytes skip the loop.  Otherwise $rnds_ is halved to
# serve as the loop count used by the shared round loop, $len is biased by
# -0x70 and the IV is parked at (%rsp).  Each pass open-codes the first round
# for eight blocks and then calls into .Ldec_loop8_enter (a label defined
# outside this section).  The ciphertext is re-read afterwards to supply the
# chaining values; seven plaintext blocks are stored immediately and the
# eighth ($inout7) is stored at the top of the next pass or after the loop.
# The last ciphertext block of the pass stays in $rndkey0 as the next IV.
# .Lcbc_dec_tail loads up to seven blocks (a trailing partial block is still
# read as a full 16 bytes), keeps copies of the first three in $in0, $in1 and
# $in2 for chaining, and dispatches on the remaining byte count; the
# seven-block case is handled inline with _aesni_decrypt8, whose eighth lane
# is neither loaded nor stored here.
2216 | $code.=<<___; | ||
2217 | lea -8(%rax),%rbp | ||
2218 | movups ($ivp),$iv | ||
2219 | mov $rnds_,$rounds | ||
2220 | cmp \$0x70,$len | ||
2221 | jbe .Lcbc_dec_tail | ||
2222 | shr \$1,$rnds_ | ||
2223 | sub \$0x70,$len | ||
2224 | mov $rnds_,$rounds | ||
2225 | movaps $iv,(%rsp) | ||
2226 | jmp .Lcbc_dec_loop8_enter | ||
2227 | .align 16 | ||
2228 | .Lcbc_dec_loop8: | ||
2229 | movaps $rndkey0,(%rsp) # save IV | ||
2230 | movups $inout7,($out) | ||
2231 | lea 0x10($out),$out | ||
2232 | .Lcbc_dec_loop8_enter: | ||
2233 | $movkey ($key),$rndkey0 | ||
2234 | movups ($inp),$inout0 # load input | ||
2235 | movups 0x10($inp),$inout1 | ||
2236 | $movkey 16($key),$rndkey1 | ||
2237 | |||
2238 | lea 32($key),$key | ||
2239 | movdqu 0x20($inp),$inout2 | ||
2240 | xorps $rndkey0,$inout0 | ||
2241 | movdqu 0x30($inp),$inout3 | ||
2242 | xorps $rndkey0,$inout1 | ||
2243 | movdqu 0x40($inp),$inout4 | ||
2244 | aesdec $rndkey1,$inout0 | ||
2245 | pxor $rndkey0,$inout2 | ||
2246 | movdqu 0x50($inp),$inout5 | ||
2247 | aesdec $rndkey1,$inout1 | ||
2248 | pxor $rndkey0,$inout3 | ||
2249 | movdqu 0x60($inp),$inout6 | ||
2250 | aesdec $rndkey1,$inout2 | ||
2251 | pxor $rndkey0,$inout4 | ||
2252 | movdqu 0x70($inp),$inout7 | ||
2253 | aesdec $rndkey1,$inout3 | ||
2254 | pxor $rndkey0,$inout5 | ||
2255 | dec $rounds | ||
2256 | aesdec $rndkey1,$inout4 | ||
2257 | pxor $rndkey0,$inout6 | ||
2258 | aesdec $rndkey1,$inout5 | ||
2259 | pxor $rndkey0,$inout7 | ||
2260 | $movkey ($key),$rndkey0 | ||
2261 | aesdec $rndkey1,$inout6 | ||
2262 | aesdec $rndkey1,$inout7 | ||
2263 | $movkey 16($key),$rndkey1 | ||
2264 | |||
2265 | call .Ldec_loop8_enter | ||
2266 | |||
2267 | movups ($inp),$rndkey1 # re-load input | ||
2268 | movups 0x10($inp),$rndkey0 | ||
2269 | xorps (%rsp),$inout0 # ^= IV | ||
2270 | xorps $rndkey1,$inout1 | ||
2271 | movups 0x20($inp),$rndkey1 | ||
2272 | xorps $rndkey0,$inout2 | ||
2273 | movups 0x30($inp),$rndkey0 | ||
2274 | xorps $rndkey1,$inout3 | ||
2275 | movups 0x40($inp),$rndkey1 | ||
2276 | xorps $rndkey0,$inout4 | ||
2277 | movups 0x50($inp),$rndkey0 | ||
2278 | xorps $rndkey1,$inout5 | ||
2279 | movups 0x60($inp),$rndkey1 | ||
2280 | xorps $rndkey0,$inout6 | ||
2281 | movups 0x70($inp),$rndkey0 # IV | ||
2282 | xorps $rndkey1,$inout7 | ||
2283 | movups $inout0,($out) | ||
2284 | movups $inout1,0x10($out) | ||
2285 | movups $inout2,0x20($out) | ||
2286 | movups $inout3,0x30($out) | ||
2287 | mov $rnds_,$rounds # restore $rounds | ||
2288 | movups $inout4,0x40($out) | ||
2289 | mov $key_,$key # restore $key | ||
2290 | movups $inout5,0x50($out) | ||
2291 | lea 0x80($inp),$inp | ||
2292 | movups $inout6,0x60($out) | ||
2293 | lea 0x70($out),$out | ||
2294 | sub \$0x80,$len | ||
2295 | ja .Lcbc_dec_loop8 | ||
2296 | |||
# loop exit: the pending eighth block moves to $inout0 and the new IV to $iv.
# After removing the bias, a count of zero or below means no whole block is
# left and the pending block is the last one; otherwise it is stored,
# $rounds is rebuilt as 2*n+1 from the halved count, and the tail takes over.
2297 | movaps $inout7,$inout0 | ||
2298 | movaps $rndkey0,$iv | ||
2299 | add \$0x70,$len | ||
2300 | jle .Lcbc_dec_tail_collected | ||
2301 | movups $inout0,($out) | ||
2302 | lea 1($rnds_,$rnds_),$rounds | ||
2303 | lea 0x10($out),$out | ||
2304 | .Lcbc_dec_tail: | ||
2305 | movups ($inp),$inout0 | ||
2306 | movaps $inout0,$in0 | ||
2307 | cmp \$0x10,$len | ||
2308 | jbe .Lcbc_dec_one | ||
2309 | |||
2310 | movups 0x10($inp),$inout1 | ||
2311 | movaps $inout1,$in1 | ||
2312 | cmp \$0x20,$len | ||
2313 | jbe .Lcbc_dec_two | ||
2314 | |||
2315 | movups 0x20($inp),$inout2 | ||
2316 | movaps $inout2,$in2 | ||
2317 | cmp \$0x30,$len | ||
2318 | jbe .Lcbc_dec_three | ||
2319 | |||
2320 | movups 0x30($inp),$inout3 | ||
2321 | cmp \$0x40,$len | ||
2322 | jbe .Lcbc_dec_four | ||
2323 | |||
2324 | movups 0x40($inp),$inout4 | ||
2325 | cmp \$0x50,$len | ||
2326 | jbe .Lcbc_dec_five | ||
2327 | |||
2328 | movups 0x50($inp),$inout5 | ||
2329 | cmp \$0x60,$len | ||
2330 | jbe .Lcbc_dec_six | ||
2331 | |||
2332 | movups 0x60($inp),$inout6 | ||
2333 | movaps $iv,(%rsp) # save IV | ||
2334 | call _aesni_decrypt8 | ||
2335 | movups ($inp),$rndkey1 | ||
2336 | movups 0x10($inp),$rndkey0 | ||
2337 | xorps (%rsp),$inout0 # ^= IV | ||
2338 | xorps $rndkey1,$inout1 | ||
2339 | movups 0x20($inp),$rndkey1 | ||
2340 | xorps $rndkey0,$inout2 | ||
2341 | movups 0x30($inp),$rndkey0 | ||
2342 | xorps $rndkey1,$inout3 | ||
2343 | movups 0x40($inp),$rndkey1 | ||
2344 | xorps $rndkey0,$inout4 | ||
2345 | movups 0x50($inp),$rndkey0 | ||
2346 | xorps $rndkey1,$inout5 | ||
2347 | movups 0x60($inp),$iv # IV | ||
2348 | xorps $rndkey0,$inout6 | ||
2349 | movups $inout0,($out) | ||
2350 | movups $inout1,0x10($out) | ||
2351 | movups $inout2,0x20($out) | ||
2352 | movups $inout3,0x30($out) | ||
2353 | movups $inout4,0x40($out) | ||
2354 | movups $inout5,0x50($out) | ||
2355 | lea 0x60($out),$out | ||
2356 | movaps $inout6,$inout0 | ||
2357 | sub \$0x70,$len | ||
2358 | jmp .Lcbc_dec_tail_collected | ||
2359 | .align 16 | ||
2360 | .Lcbc_dec_one: | ||
2361 | ___ | ||
# Body of .Lcbc_dec_one: AES work for the single block already loaded into
# $inout0 (aesni_generate1 is defined outside this section).
2362 | &aesni_generate1("dec",$key,$rounds); | ||
# Tail cases for one to six blocks, followed by the common exit.  Every case
# XORs block i with the previous ciphertext block (the IV for block 0), stores
# all but the last plaintext block, leaves that one in $inout0 and the last
# ciphertext block in $iv, and subtracts the bytes it covered from $len.
# A trailing partial block therefore leaves $len = p - 16 with
# p = length % 16, and the "and 15" at .Lcbc_dec_tail_collected yields p.
# .Lcbc_dec_tail_collected writes $iv back through $ivp and stores the last
# block whole when p is zero; otherwise .Lcbc_dec_tail_partial bounces it
# through the scratch slot at (%rsp) and copies only part of it.
#
# Fix: .Lcbc_dec_tail_partial used to copy 16 - p bytes (mov 16,%rcx followed
# by sub $len,%rcx).  That expression is the zero-padding count used by
# .Lcbc_enc_tail, not the number of valid plaintext bytes: for p < 8 it wrote
# past the end of a length-sized output buffer and for p > 8 it dropped
# plaintext.  It now copies exactly p bytes.  Lengths that are a multiple of
# 16 never reach this path and are unaffected.
2363 | $code.=<<___; | ||
2364 | xorps $iv,$inout0 | ||
2365 | movaps $in0,$iv | ||
2366 | sub \$0x10,$len | ||
2367 | jmp .Lcbc_dec_tail_collected | ||
2368 | .align 16 | ||
2369 | .Lcbc_dec_two: | ||
# the unused third lane of _aesni_decrypt3 is zeroed rather than left stale
2370 | xorps $inout2,$inout2 | ||
2371 | call _aesni_decrypt3 | ||
2372 | xorps $iv,$inout0 | ||
2373 | xorps $in0,$inout1 | ||
2374 | movups $inout0,($out) | ||
2375 | movaps $in1,$iv | ||
2376 | movaps $inout1,$inout0 | ||
2377 | lea 0x10($out),$out | ||
2378 | sub \$0x20,$len | ||
2379 | jmp .Lcbc_dec_tail_collected | ||
2380 | .align 16 | ||
2381 | .Lcbc_dec_three: | ||
2382 | call _aesni_decrypt3 | ||
2383 | xorps $iv,$inout0 | ||
2384 | xorps $in0,$inout1 | ||
2385 | movups $inout0,($out) | ||
2386 | xorps $in1,$inout2 | ||
2387 | movups $inout1,0x10($out) | ||
2388 | movaps $in2,$iv | ||
2389 | movaps $inout2,$inout0 | ||
2390 | lea 0x20($out),$out | ||
2391 | sub \$0x30,$len | ||
2392 | jmp .Lcbc_dec_tail_collected | ||
2393 | .align 16 | ||
2394 | .Lcbc_dec_four: | ||
# no register copy of the fourth ciphertext block exists, so the new IV is
# re-read from the input once the old one has been consumed
2395 | call _aesni_decrypt4 | ||
2396 | xorps $iv,$inout0 | ||
2397 | movups 0x30($inp),$iv | ||
2398 | xorps $in0,$inout1 | ||
2399 | movups $inout0,($out) | ||
2400 | xorps $in1,$inout2 | ||
2401 | movups $inout1,0x10($out) | ||
2402 | xorps $in2,$inout3 | ||
2403 | movups $inout2,0x20($out) | ||
2404 | movaps $inout3,$inout0 | ||
2405 | lea 0x30($out),$out | ||
2406 | sub \$0x40,$len | ||
2407 | jmp .Lcbc_dec_tail_collected | ||
2408 | .align 16 | ||
2409 | .Lcbc_dec_five: | ||
# the unused sixth lane of _aesni_decrypt6 is zeroed; chaining values beyond
# the first are re-read from the input before any output is stored
2410 | xorps $inout5,$inout5 | ||
2411 | call _aesni_decrypt6 | ||
2412 | movups 0x10($inp),$rndkey1 | ||
2413 | movups 0x20($inp),$rndkey0 | ||
2414 | xorps $iv,$inout0 | ||
2415 | xorps $in0,$inout1 | ||
2416 | xorps $rndkey1,$inout2 | ||
2417 | movups 0x30($inp),$rndkey1 | ||
2418 | xorps $rndkey0,$inout3 | ||
2419 | movups 0x40($inp),$iv | ||
2420 | xorps $rndkey1,$inout4 | ||
2421 | movups $inout0,($out) | ||
2422 | movups $inout1,0x10($out) | ||
2423 | movups $inout2,0x20($out) | ||
2424 | movups $inout3,0x30($out) | ||
2425 | lea 0x40($out),$out | ||
2426 | movaps $inout4,$inout0 | ||
2427 | sub \$0x50,$len | ||
2428 | jmp .Lcbc_dec_tail_collected | ||
2429 | .align 16 | ||
2430 | .Lcbc_dec_six: | ||
2431 | call _aesni_decrypt6 | ||
2432 | movups 0x10($inp),$rndkey1 | ||
2433 | movups 0x20($inp),$rndkey0 | ||
2434 | xorps $iv,$inout0 | ||
2435 | xorps $in0,$inout1 | ||
2436 | xorps $rndkey1,$inout2 | ||
2437 | movups 0x30($inp),$rndkey1 | ||
2438 | xorps $rndkey0,$inout3 | ||
2439 | movups 0x40($inp),$rndkey0 | ||
2440 | xorps $rndkey1,$inout4 | ||
2441 | movups 0x50($inp),$iv | ||
2442 | xorps $rndkey0,$inout5 | ||
2443 | movups $inout0,($out) | ||
2444 | movups $inout1,0x10($out) | ||
2445 | movups $inout2,0x20($out) | ||
2446 | movups $inout3,0x30($out) | ||
2447 | movups $inout4,0x40($out) | ||
2448 | lea 0x50($out),$out | ||
2449 | movaps $inout5,$inout0 | ||
2450 | sub \$0x60,$len | ||
2451 | jmp .Lcbc_dec_tail_collected | ||
2452 | .align 16 | ||
2453 | .Lcbc_dec_tail_collected: | ||
# the movups between the and and the jnz does not modify flags
2454 | and \$15,$len | ||
2455 | movups $iv,($ivp) | ||
2456 | jnz .Lcbc_dec_tail_partial | ||
2457 | movups $inout0,($out) | ||
2458 | jmp .Lcbc_dec_ret | ||
2459 | .align 16 | ||
2460 | .Lcbc_dec_tail_partial: | ||
2461 | movaps $inout0,(%rsp) | ||
2462 | mov $len,%rcx # copy only the valid bytes of the last block | ||
2463 | mov $out,%rdi | ||
2465 | lea (%rsp),%rsi | ||
2466 | .long 0x9066A4F3 # rep movsb | ||
2467 | |||
2468 | .Lcbc_dec_ret: | ||
2469 | ___ | ||
# Win64 only: reload %xmm6-%xmm9 from the slots filled in the decrypt
# prologue.
2470 | $code.=<<___ if ($win64); | ||
2471 | movaps 0x10(%rsp),%xmm6 | ||
2472 | movaps 0x20(%rsp),%xmm7 | ||
2473 | movaps 0x30(%rsp),%xmm8 | ||
2474 | movaps 0x40(%rsp),%xmm9 | ||
2475 | ___ | ||
# Decrypt epilogue: %rbp was set to the saved-%rbp slot, so the frame is
# dropped in one step.  .Lcbc_ret sits after the teardown because the
# zero-length exit and the encrypt path jump to it without ever having built
# a frame.
2476 | $code.=<<___; | ||
2477 | lea (%rbp),%rsp | ||
2478 | pop %rbp | ||
2479 | .Lcbc_ret: | ||
2480 | ret | ||
2481 | .size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt | ||
2482 | ___ | ||
2483 | } | ||
2484 | # int $PREFIX_set_[en|de]crypt_key (const unsigned char *userKey, | ||
2485 | # int bits, AES_KEY *key) | ||
2486 | { my ($inp,$bits,$key) = @_4args; | ||
2487 | $bits =~ s/%r/%e/; | ||
2488 | |||
# ${PREFIX}_set_decrypt_key: build the encryption schedule by calling
# __aesni_set_encrypt_key, then convert it in place into a decryption
# schedule: the round keys are reversed end for end and every key except the
# two outermost ones is passed through aesimc.  %eax carries the status of
# the encrypt-key routine (its success exits clear it; its failure exits load
# -1 or -2); a non-zero status is returned unchanged and the schedule is left
# untouched.  After the call $bits holds the value stored as the round count;
# shifting it left by four turns it into a byte offset, so 16($key,$bits)
# addresses the last round key.
# NOTE(review): the 8-byte stack adjustment around the call presumably keeps
# the stack 16-byte aligned -- confirm.
2489 | $code.=<<___; | ||
2490 | .globl ${PREFIX}_set_decrypt_key | ||
2491 | .type ${PREFIX}_set_decrypt_key,\@abi-omnipotent | ||
2492 | .align 16 | ||
2493 | ${PREFIX}_set_decrypt_key: | ||
2494 | _CET_ENDBR | ||
2495 | sub \$8,%rsp | ||
2496 | call __aesni_set_encrypt_key | ||
2497 | shl \$4,$bits # rounds-1 after _aesni_set_encrypt_key | ||
2498 | test %eax,%eax | ||
2499 | jnz .Ldec_key_ret | ||
2500 | lea 16($key,$bits),$inp # points at the end of key schedule | ||
2501 | |||
2502 | $movkey ($key),%xmm0 # just swap | ||
2503 | $movkey ($inp),%xmm1 | ||
2504 | $movkey %xmm0,($inp) | ||
2505 | $movkey %xmm1,($key) | ||
2506 | lea 16($key),$key | ||
2507 | lea -16($inp),$inp | ||
2508 | |||
2509 | .Ldec_key_inverse: | ||
# $key walks up and $inp walks down; each pass swaps one pair of round keys
# through aesimc until the two pointers meet
2510 | $movkey ($key),%xmm0 # swap and inverse | ||
2511 | $movkey ($inp),%xmm1 | ||
2512 | aesimc %xmm0,%xmm0 | ||
2513 | aesimc %xmm1,%xmm1 | ||
2514 | lea 16($key),$key | ||
2515 | lea -16($inp),$inp | ||
2516 | $movkey %xmm0,16($inp) | ||
2517 | $movkey %xmm1,-16($key) | ||
2518 | cmp $key,$inp | ||
2519 | ja .Ldec_key_inverse | ||
2520 | |||
2521 | $movkey ($key),%xmm0 # inverse middle | ||
2522 | aesimc %xmm0,%xmm0 | ||
2523 | $movkey %xmm0,($inp) | ||
2524 | .Ldec_key_ret: | ||
2525 | add \$8,%rsp | ||
2526 | ret | ||
2527 | .LSEH_end_set_decrypt_key: | ||
2528 | .size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key | ||
2529 | ___ | ||
2530 | |||
2531 | # This is based on submission by | ||
2532 | # | ||
2533 | # Huang Ying <ying.huang@intel.com> | ||
2534 | # Vinodh Gopal <vinodh.gopal@intel.com> | ||
2535 | # Kahraman Akdemir | ||
2536 | # | ||
2537 | # Aggressively optimized with respect to aeskeygenassist's critical path | ||
2538 | # and confined to %xmm0-5 to meet the Win64 ABI requirement. | ||
2539 | # | ||
2540 | $code.=<<___; | ||
2541 | .globl ${PREFIX}_set_encrypt_key | ||
2542 | .type ${PREFIX}_set_encrypt_key,\@abi-omnipotent | ||
2543 | .align 16 | ||
2544 | ${PREFIX}_set_encrypt_key: | ||
2545 | _CET_ENDBR | ||
2546 | __aesni_set_encrypt_key: | ||
2547 | sub \$8,%rsp | ||
2548 | mov \$-1,%rax | ||
2549 | test $inp,$inp | ||
2550 | jz .Lenc_key_ret | ||
2551 | test $key,$key | ||
2552 | jz .Lenc_key_ret | ||
2553 | |||
2554 | movups ($inp),%xmm0 # pull first 128 bits of *userKey | ||
2555 | xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0 | ||
2556 | lea 16($key),%rax | ||
2557 | cmp \$256,$bits | ||
2558 | je .L14rounds | ||
2559 | cmp \$192,$bits | ||
2560 | je .L12rounds | ||
2561 | cmp \$128,$bits | ||
2562 | jne .Lbad_keybits | ||
2563 | |||
2564 | .L10rounds: | ||
2565 | mov \$9,$bits # 10 rounds for 128-bit key | ||
2566 | $movkey %xmm0,($key) # round 0 | ||
2567 | aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1 | ||
2568 | call .Lkey_expansion_128_cold | ||
2569 | aeskeygenassist \$0x2,%xmm0,%xmm1 # round 2 | ||
2570 | call .Lkey_expansion_128 | ||
2571 | aeskeygenassist \$0x4,%xmm0,%xmm1 # round 3 | ||
2572 | call .Lkey_expansion_128 | ||
2573 | aeskeygenassist \$0x8,%xmm0,%xmm1 # round 4 | ||
2574 | call .Lkey_expansion_128 | ||
2575 | aeskeygenassist \$0x10,%xmm0,%xmm1 # round 5 | ||
2576 | call .Lkey_expansion_128 | ||
2577 | aeskeygenassist \$0x20,%xmm0,%xmm1 # round 6 | ||
2578 | call .Lkey_expansion_128 | ||
2579 | aeskeygenassist \$0x40,%xmm0,%xmm1 # round 7 | ||
2580 | call .Lkey_expansion_128 | ||
2581 | aeskeygenassist \$0x80,%xmm0,%xmm1 # round 8 | ||
2582 | call .Lkey_expansion_128 | ||
2583 | aeskeygenassist \$0x1b,%xmm0,%xmm1 # round 9 | ||
2584 | call .Lkey_expansion_128 | ||
2585 | aeskeygenassist \$0x36,%xmm0,%xmm1 # round 10 | ||
2586 | call .Lkey_expansion_128 | ||
2587 | $movkey %xmm0,(%rax) | ||
2588 | mov $bits,80(%rax) # 240(%rdx) | ||
2589 | xor %eax,%eax | ||
2590 | jmp .Lenc_key_ret | ||
2591 | |||
2592 | .align 16 | ||
2593 | .L12rounds: | ||
2594 | movq 16($inp),%xmm2 # remaining 1/3 of *userKey | ||
2595 | mov \$11,$bits # 12 rounds for 192 | ||
2596 | $movkey %xmm0,($key) # round 0 | ||
2597 | aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2 | ||
2598 | call .Lkey_expansion_192a_cold | ||
2599 | aeskeygenassist \$0x2,%xmm2,%xmm1 # round 2,3 | ||
2600 | call .Lkey_expansion_192b | ||
2601 | aeskeygenassist \$0x4,%xmm2,%xmm1 # round 4,5 | ||
2602 | call .Lkey_expansion_192a | ||
2603 | aeskeygenassist \$0x8,%xmm2,%xmm1 # round 5,6 | ||
2604 | call .Lkey_expansion_192b | ||
2605 | aeskeygenassist \$0x10,%xmm2,%xmm1 # round 7,8 | ||
2606 | call .Lkey_expansion_192a | ||
2607 | aeskeygenassist \$0x20,%xmm2,%xmm1 # round 8,9 | ||
2608 | call .Lkey_expansion_192b | ||
2609 | aeskeygenassist \$0x40,%xmm2,%xmm1 # round 10,11 | ||
2610 | call .Lkey_expansion_192a | ||
2611 | aeskeygenassist \$0x80,%xmm2,%xmm1 # round 11,12 | ||
2612 | call .Lkey_expansion_192b | ||
2613 | $movkey %xmm0,(%rax) | ||
2614 | mov $bits,48(%rax) # 240(%rdx) | ||
2615 | xor %rax, %rax | ||
2616 | jmp .Lenc_key_ret | ||
2617 | |||
# .L14rounds: key-schedule path for the 256-bit case (see the "14 rounds
# for 256" note below). The two 16-byte halves of the user key are stored
# as round keys 0 and 1. The remaining keys come from alternating calls to
# .Lkey_expansion_256a (updates %xmm0) and .Lkey_expansion_256b (updates
# %xmm2), each fed by aeskeygenassist with round constants 0x1, 0x2, 0x4,
# 0x8, 0x10, 0x20 and 0x40. Apart from the first call, which uses the
# _cold entry, every helper call first stores the previously computed key
# through %rax.
# On exit the last key and the value of the bits register are stored,
# %rax is cleared and control goes to the shared return .Lenc_key_ret.
2618 | .align 16 | ||
2619 | .L14rounds: | ||
2620 | movups 16($inp),%xmm2 # remaining half of *userKey | ||
2621 | mov \$13,$bits # 14 rounds for 256 | ||
# Advance the running output pointer by one key slot. NOTE(review): the
# initial value of %rax is set before this chunk; presumably this skips
# the round 1 slot that is written directly below - confirm.
2622 | lea 16(%rax),%rax | ||
2623 | $movkey %xmm0,($key) # round 0 | ||
2624 | $movkey %xmm2,16($key) # round 1 | ||
2625 | aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2 | ||
# _cold entry: no store, because round 1 was already written above.
2626 | call .Lkey_expansion_256a_cold | ||
2627 | aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3 | ||
2628 | call .Lkey_expansion_256b | ||
2629 | aeskeygenassist \$0x2,%xmm2,%xmm1 # round 4 | ||
2630 | call .Lkey_expansion_256a | ||
2631 | aeskeygenassist \$0x2,%xmm0,%xmm1 # round 5 | ||
2632 | call .Lkey_expansion_256b | ||
2633 | aeskeygenassist \$0x4,%xmm2,%xmm1 # round 6 | ||
2634 | call .Lkey_expansion_256a | ||
2635 | aeskeygenassist \$0x4,%xmm0,%xmm1 # round 7 | ||
2636 | call .Lkey_expansion_256b | ||
2637 | aeskeygenassist \$0x8,%xmm2,%xmm1 # round 8 | ||
2638 | call .Lkey_expansion_256a | ||
2639 | aeskeygenassist \$0x8,%xmm0,%xmm1 # round 9 | ||
2640 | call .Lkey_expansion_256b | ||
2641 | aeskeygenassist \$0x10,%xmm2,%xmm1 # round 10 | ||
2642 | call .Lkey_expansion_256a | ||
2643 | aeskeygenassist \$0x10,%xmm0,%xmm1 # round 11 | ||
2644 | call .Lkey_expansion_256b | ||
2645 | aeskeygenassist \$0x20,%xmm2,%xmm1 # round 12 | ||
2646 | call .Lkey_expansion_256a | ||
2647 | aeskeygenassist \$0x20,%xmm0,%xmm1 # round 13 | ||
2648 | call .Lkey_expansion_256b | ||
2649 | aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14 | ||
2650 | call .Lkey_expansion_256a | ||
# The last helper call leaves the final round key in %xmm0; store it, then
# store the counter loaded above in the 4 bytes that follow that key.
2651 | $movkey %xmm0,(%rax) | ||
2652 | mov $bits,16(%rax) # 240(%rdx) | ||
# Return value 0.
2653 | xor %rax,%rax | ||
2654 | jmp .Lenc_key_ret | ||
2655 | |||
# Exit paths of the set_encrypt_key code.
# .Lbad_keybits loads -2 into %rax and falls through to the shared return.
# .Lenc_key_ret releases 8 bytes of stack (the same amount the unwind
# record .LSEH_info_key describes as "sub rsp,8") and returns with
# whatever value is in %rax.
# .LSEH_end_set_encrypt_key marks the end address used by the .pdata entry.
2656 | .align 16 | ||
2657 | .Lbad_keybits: | ||
2658 | mov \$-2,%rax | ||
2659 | .Lenc_key_ret: | ||
2660 | add \$8,%rsp | ||
2661 | ret | ||
2662 | .LSEH_end_set_encrypt_key: | ||
2663 | |||
# .Lkey_expansion_128: store the round key held in %xmm0 at (%rax),
# advance %rax by 16, then replace %xmm0 with the next round key.
# .Lkey_expansion_128_cold is a second entry that skips the store.
# In:  %xmm0 = current key; %xmm1 = value whose dword 3 is mixed in last
#      (presumably an aeskeygenassist result - the callers of this helper
#      are outside this chunk).
# Out: %xmm0 = next key; %xmm1 and %xmm4 are overwritten.
# The two shufps/xorps pairs turn %xmm0 into the running xor of its own
# dwords, provided dword 0 of %xmm4 is zero on entry; that dword is left
# unchanged here. NOTE(review): the zeroing of %xmm4 is not visible in
# this chunk - confirm in the setup code earlier in the function.
2664 | .align 16 | ||
2665 | .Lkey_expansion_128: | ||
2666 | $movkey %xmm0,(%rax) | ||
2667 | lea 16(%rax),%rax | ||
2668 | .Lkey_expansion_128_cold: | ||
2669 | shufps \$0b00010000,%xmm0,%xmm4 | ||
2670 | xorps %xmm4, %xmm0 | ||
2671 | shufps \$0b10001100,%xmm0,%xmm4 | ||
2672 | xorps %xmm4, %xmm0 | ||
# Broadcast dword 3 of %xmm1 to all four lanes before the final xor.
2673 | shufps \$0b11111111,%xmm1,%xmm1 # critical path | ||
2674 | xorps %xmm1,%xmm0 | ||
2675 | ret | ||
2676 | |||
# .Lkey_expansion_192a: store %xmm0 at (%rax), advance %rax by 16, then
# run the shared 192-bit update below.
# .Lkey_expansion_192a_cold skips the store. Both of these entries save a
# copy of %xmm2 in %xmm5; .Lkey_expansion_192b reads %xmm5 when it
# assembles its output.
# .Lkey_expansion_192b_warm is the shared update, also reached by a jump
# from .Lkey_expansion_192b. It updates %xmm0 and %xmm2 in place, mixing
# in dword 1 of %xmm1; %xmm1, %xmm3 and %xmm4 are overwritten.
# NOTE(review): as in the 128-bit helper, the shufps/xorps pairs appear
# to rely on dword 0 of %xmm4 being zero on entry - confirm in the setup
# code earlier in the function.
2677 | .align 16 | ||
2678 | .Lkey_expansion_192a: | ||
2679 | $movkey %xmm0,(%rax) | ||
2680 | lea 16(%rax),%rax | ||
2681 | .Lkey_expansion_192a_cold: | ||
2682 | movaps %xmm2, %xmm5 | ||
2683 | .Lkey_expansion_192b_warm: | ||
2684 | shufps \$0b00010000,%xmm0,%xmm4 | ||
2685 | movdqa %xmm2,%xmm3 | ||
2686 | xorps %xmm4,%xmm0 | ||
2687 | shufps \$0b10001100,%xmm0,%xmm4 | ||
# %xmm3 = copy of %xmm2 shifted left by one dword (4 bytes).
2688 | pslldq \$4,%xmm3 | ||
2689 | xorps %xmm4,%xmm0 | ||
# Broadcast dword 1 of %xmm1 to all four lanes.
2690 | pshufd \$0b01010101,%xmm1,%xmm1 # critical path | ||
2691 | pxor %xmm3,%xmm2 | ||
2692 | pxor %xmm1,%xmm0 | ||
# Broadcast dword 3 of the freshly updated %xmm0 and fold it into %xmm2.
2693 | pshufd \$0b11111111,%xmm0,%xmm3 | ||
2694 | pxor %xmm3,%xmm2 | ||
2695 | ret | ||
2696 | |||
# .Lkey_expansion_192b: write 32 bytes of key material at (%rax), advance
# %rax by 32, then continue in the shared update .Lkey_expansion_192b_warm
# (which performs the ret).
# First 16 bytes:  dwords 0-1 of %xmm5 followed by dwords 0-1 of %xmm0.
# Second 16 bytes: dwords 2-3 of %xmm0 followed by dwords 0-1 of %xmm2.
# %xmm5 is expected to hold the copy of %xmm2 saved by a preceding
# .Lkey_expansion_192a / _cold call. %xmm3 and %xmm5 are overwritten.
2697 | .align 16 | ||
2698 | .Lkey_expansion_192b: | ||
2699 | movaps %xmm0,%xmm3 | ||
2700 | shufps \$0b01000100,%xmm0,%xmm5 | ||
2701 | $movkey %xmm5,(%rax) | ||
2702 | shufps \$0b01001110,%xmm2,%xmm3 | ||
2703 | $movkey %xmm3,16(%rax) | ||
2704 | lea 32(%rax),%rax | ||
2705 | jmp .Lkey_expansion_192b_warm | ||
2706 | |||
# .Lkey_expansion_256a: store %xmm2 at (%rax), advance %rax by 16, then
# replace %xmm0 with its next value.
# .Lkey_expansion_256a_cold is a second entry that skips the store.
# In:  %xmm0 = key half to update; %xmm1 = aeskeygenassist result (set by
#      the caller at every visible call site); %xmm2 = key to store.
# Out: %xmm0 updated; %xmm1 and %xmm4 are overwritten; %xmm2 unchanged.
# The update sequence is the same as in .Lkey_expansion_128: running xor
# of the dwords of %xmm0, then xor with dword 3 of %xmm1.
# NOTE(review): relies on dword 0 of %xmm4 being zero on entry - the
# zeroing is not visible in this chunk.
2707 | .align 16 | ||
2708 | .Lkey_expansion_256a: | ||
2709 | $movkey %xmm2,(%rax) | ||
2710 | lea 16(%rax),%rax | ||
2711 | .Lkey_expansion_256a_cold: | ||
2712 | shufps \$0b00010000,%xmm0,%xmm4 | ||
2713 | xorps %xmm4,%xmm0 | ||
2714 | shufps \$0b10001100,%xmm0,%xmm4 | ||
2715 | xorps %xmm4,%xmm0 | ||
# Broadcast dword 3 of %xmm1 to all four lanes before the final xor.
2716 | shufps \$0b11111111,%xmm1,%xmm1 # critical path | ||
2717 | xorps %xmm1,%xmm0 | ||
2718 | ret | ||
2719 | |||
# .Lkey_expansion_256b: counterpart of .Lkey_expansion_256a for the other
# key half. Stores %xmm0 at (%rax), advances %rax by 16, then replaces
# %xmm2 with its next value.
# In:  %xmm2 = key half to update; %xmm1 = aeskeygenassist result (set by
#      the caller at every visible call site); %xmm0 = key to store.
# Out: %xmm2 updated; %xmm1 and %xmm4 are overwritten; %xmm0 unchanged.
# Unlike the 256a helper, dword 2 (not dword 3) of %xmm1 is mixed in.
# NOTE(review): relies on dword 0 of %xmm4 being zero on entry - the
# zeroing is not visible in this chunk.
2720 | .align 16 | ||
2721 | .Lkey_expansion_256b: | ||
2722 | $movkey %xmm0,(%rax) | ||
2723 | lea 16(%rax),%rax | ||
2724 | |||
2725 | shufps \$0b00010000,%xmm2,%xmm4 | ||
2726 | xorps %xmm4,%xmm2 | ||
2727 | shufps \$0b10001100,%xmm2,%xmm4 | ||
2728 | xorps %xmm4,%xmm2 | ||
# Broadcast dword 2 of %xmm1 to all four lanes before the final xor.
2729 | shufps \$0b10101010,%xmm1,%xmm1 # critical path | ||
2730 | xorps %xmm1,%xmm2 | ||
2731 | ret | ||
2732 | .size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key | ||
2733 | .size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key | ||
2734 | ___ | ||
2735 | } | ||
2736 | |||
# Constant table appended to the generated assembly. It is placed in
# .rodata, 64-byte aligned, and the output is switched back to .text
# afterwards. The code that references these labels is outside this chunk.
2737 | $code.=<<___; | ||
2738 | .section .rodata | ||
2739 | .align 64 | ||
# Byte indices 15 down to 0. NOTE(review): presumably a byte-shuffle
# control that reverses a 16-byte vector - confirm at the point of use.
2740 | .Lbswap_mask: | ||
2741 | .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 | ||
# Four 32-bit lanes: 6,6,6,0.
2742 | .Lincrement32: | ||
2743 | .long 6,6,6,0 | ||
# Four 32-bit lanes: 1,0,0,0 (the value 1 as a little-endian integer).
2744 | .Lincrement64: | ||
2745 | .long 1,0,0,0 | ||
# Four 32-bit lanes: 0x87,0,1,0. NOTE(review): presumably the reduction
# constant for the XTS tweak update - confirm at the point of use.
2746 | .Lxts_magic: | ||
2747 | .long 0x87,0,1,0 | ||
2748 | .align 64 | ||
2749 | .text | ||
2750 | ___ | ||
2751 | |||
2752 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, | ||
2753 | # CONTEXT *context,DISPATCHER_CONTEXT *disp) | ||
2754 | if ($win64) { | ||
2755 | $rec="%rcx"; | ||
2756 | $frame="%rdx"; | ||
2757 | $context="%r8"; | ||
2758 | $disp="%r9"; | ||
2759 | |||
2760 | $code.=<<___; | ||
2761 | .extern __imp_RtlVirtualUnwind | ||
2762 | ___ | ||
2763 | $code.=<<___ if ($PREFIX eq "aesni"); | ||
# ecb_se_handler: Win64 exception handler referenced from .LSEH_info_ecb.
# Pushes %rsi, %rdi, %rbx, %rbp, %r12-%r15 and the flags, reserves 64
# bytes of stack (used by .Lcommon_seh_tail for the RtlVirtualUnwind
# call), loads context->Rsp into %rax unchanged and continues in
# .Lcommon_seh_tail, which also undoes the pushes and returns.
2764 | .type ecb_se_handler,\@abi-omnipotent | ||
2765 | .align 16 | ||
2766 | ecb_se_handler: | ||
2767 | _CET_ENDBR | ||
2768 | push %rsi | ||
2769 | push %rdi | ||
2770 | push %rbx | ||
2771 | push %rbp | ||
2772 | push %r12 | ||
2773 | push %r13 | ||
2774 | push %r14 | ||
2775 | push %r15 | ||
2776 | pushfq | ||
2777 | sub \$64,%rsp | ||
2778 | |||
2779 | mov 152($context),%rax # pull context->Rsp | ||
2780 | |||
2781 | jmp .Lcommon_seh_tail | ||
2782 | .size ecb_se_handler,.-ecb_se_handler | ||
2783 | |||
# ccm64_se_handler: Win64 exception handler shared by the two ccm64
# entries (.LSEH_info_ccm64_enc and .LSEH_info_ccm64_dec). HandlerData[]
# holds two image-relative addresses: a body label and a return label.
# The value handed to .Lcommon_seh_tail in %rax depends on where the
# faulting Rip lies:
#   Rip below HandlerData[0]    -> context->Rax
#   Rip at/above HandlerData[1] -> context->Rsp
#   otherwise                   -> context->Rsp + 0x58, after copying 8
#                                  quadwords (four 16-byte registers) from
#                                  the top of the stack into the context
#                                  starting at Xmm6
# NOTE(review): the first case presumably relies on the function having
# copied its entry %rsp to %rax before the body label - the prologue is
# outside this chunk.
2784 | .type ccm64_se_handler,\@abi-omnipotent | ||
2785 | .align 16 | ||
2786 | ccm64_se_handler: | ||
2787 | _CET_ENDBR | ||
2788 | push %rsi | ||
2789 | push %rdi | ||
2790 | push %rbx | ||
2791 | push %rbp | ||
2792 | push %r12 | ||
2793 | push %r13 | ||
2794 | push %r14 | ||
2795 | push %r15 | ||
2796 | pushfq | ||
2797 | sub \$64,%rsp | ||
2798 | |||
2799 | mov 120($context),%rax # pull context->Rax | ||
2800 | mov 248($context),%rbx # pull context->Rip | ||
2801 | |||
2802 | mov 8($disp),%rsi # disp->ImageBase | ||
2803 | mov 56($disp),%r11 # disp->HandlerData | ||
2804 | |||
2805 | mov 0(%r11),%r10d # HandlerData[0] | ||
2806 | lea (%rsi,%r10),%r10 # prologue label | ||
2807 | cmp %r10,%rbx # context->Rip<prologue label | ||
2808 | jb .Lcommon_seh_tail | ||
2809 | |||
2810 | mov 152($context),%rax # pull context->Rsp | ||
2811 | |||
2812 | mov 4(%r11),%r10d # HandlerData[1] | ||
2813 | lea (%rsi,%r10),%r10 # epilogue label | ||
2814 | cmp %r10,%rbx # context->Rip>=epilogue label | ||
2815 | jae .Lcommon_seh_tail | ||
2816 | |||
2817 | lea 0(%rax),%rsi # %xmm save area | ||
2818 | lea 512($context),%rdi # &context.Xmm6 | ||
2819 | mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax) | ||
2820 | .long 0xa548f3fc # cld; rep movsq | ||
2821 | lea 0x58(%rax),%rax # adjust stack pointer | ||
2822 | |||
2823 | jmp .Lcommon_seh_tail | ||
2824 | .size ccm64_se_handler,.-ccm64_se_handler | ||
2825 | |||
# ctr32_se_handler: Win64 exception handler referenced from
# .LSEH_info_ctr32. Unlike the ccm64 and xts handlers it compares Rip
# against the labels .Lctr32_body and .Lctr32_ret directly instead of
# reading them from HandlerData[].
#   Rip below .Lctr32_body   -> .Lcommon_seh_tail with context->Rax
#   Rip at/above .Lctr32_ret -> .Lcommon_seh_tail with context->Rsp
#   otherwise                -> copy 20 quadwords (ten 16-byte registers)
#                               from context->Rsp + 0x20 into the context
#                               starting at Xmm6, then recover the stack
#                               pointer via .Lcommon_rbp_tail
2826 | .type ctr32_se_handler,\@abi-omnipotent | ||
2827 | .align 16 | ||
2828 | ctr32_se_handler: | ||
2829 | _CET_ENDBR | ||
2830 | push %rsi | ||
2831 | push %rdi | ||
2832 | push %rbx | ||
2833 | push %rbp | ||
2834 | push %r12 | ||
2835 | push %r13 | ||
2836 | push %r14 | ||
2837 | push %r15 | ||
2838 | pushfq | ||
2839 | sub \$64,%rsp | ||
2840 | |||
2841 | mov 120($context),%rax # pull context->Rax | ||
2842 | mov 248($context),%rbx # pull context->Rip | ||
2843 | |||
2844 | lea .Lctr32_body(%rip),%r10 | ||
2845 | cmp %r10,%rbx # context->Rip<"prologue" label | ||
2846 | jb .Lcommon_seh_tail | ||
2847 | |||
2848 | mov 152($context),%rax # pull context->Rsp | ||
2849 | |||
2850 | lea .Lctr32_ret(%rip),%r10 | ||
2851 | cmp %r10,%rbx | ||
2852 | jae .Lcommon_seh_tail | ||
2853 | |||
2854 | lea 0x20(%rax),%rsi # %xmm save area | ||
2855 | lea 512($context),%rdi # &context.Xmm6 | ||
2856 | mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) | ||
2857 | .long 0xa548f3fc # cld; rep movsq | ||
2858 | |||
2859 | jmp .Lcommon_rbp_tail | ||
2860 | .size ctr32_se_handler,.-ctr32_se_handler | ||
2861 | |||
# xts_se_handler: Win64 exception handler shared by the two xts entries
# (.LSEH_info_xts_enc and .LSEH_info_xts_dec). HandlerData[] holds two
# image-relative addresses: a body label and an epilogue label.
#   Rip below HandlerData[0]    -> .Lcommon_seh_tail with context->Rax
#   Rip at/above HandlerData[1] -> .Lcommon_seh_tail with context->Rsp
#   otherwise                   -> copy 20 quadwords (ten 16-byte
#                                  registers) from context->Rsp + 0x60
#                                  into the context starting at Xmm6, then
#                                  recover the stack pointer via
#                                  .Lcommon_rbp_tail
2862 | .type xts_se_handler,\@abi-omnipotent | ||
2863 | .align 16 | ||
2864 | xts_se_handler: | ||
2865 | _CET_ENDBR | ||
2866 | push %rsi | ||
2867 | push %rdi | ||
2868 | push %rbx | ||
2869 | push %rbp | ||
2870 | push %r12 | ||
2871 | push %r13 | ||
2872 | push %r14 | ||
2873 | push %r15 | ||
2874 | pushfq | ||
2875 | sub \$64,%rsp | ||
2876 | |||
2877 | mov 120($context),%rax # pull context->Rax | ||
2878 | mov 248($context),%rbx # pull context->Rip | ||
2879 | |||
2880 | mov 8($disp),%rsi # disp->ImageBase | ||
2881 | mov 56($disp),%r11 # disp->HandlerData | ||
2882 | |||
2883 | mov 0(%r11),%r10d # HandlerData[0] | ||
2884 | lea (%rsi,%r10),%r10 # prologue label | ||
2885 | cmp %r10,%rbx # context->Rip<prologue label | ||
2886 | jb .Lcommon_seh_tail | ||
2887 | |||
2888 | mov 152($context),%rax # pull context->Rsp | ||
2889 | |||
2890 | mov 4(%r11),%r10d # HandlerData[1] | ||
2891 | lea (%rsi,%r10),%r10 # epilogue label | ||
2892 | cmp %r10,%rbx # context->Rip>=epilogue label | ||
2893 | jae .Lcommon_seh_tail | ||
2894 | |||
2895 | lea 0x60(%rax),%rsi # %xmm save area | ||
2896 | lea 512($context),%rdi # & context.Xmm6 | ||
2897 | mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) | ||
2898 | .long 0xa548f3fc # cld; rep movsq | ||
2899 | |||
2900 | jmp .Lcommon_rbp_tail | ||
2901 | .size xts_se_handler,.-xts_se_handler | ||
2902 | ___ | ||
2903 | $code.=<<___; | ||
# cbc_se_handler: Win64 exception handler referenced from .LSEH_info_cbc.
# It is emitted unconditionally and also hosts the two tails shared with
# the other handlers (.Lcommon_rbp_tail and .Lcommon_seh_tail).
# The faulting Rip is classified against three labels:
#   Rip below .Lcbc_decrypt      -> .Lcommon_seh_tail with context->Rsp
#   Rip below .Lcbc_decrypt_body -> .Lcommon_seh_tail with context->Rax
#   Rip at/above .Lcbc_ret       -> .Lcommon_seh_tail with context->Rsp
#   otherwise                    -> copy 8 quadwords (four 16-byte
#                                   registers) from context->Rsp + 16 into
#                                   the context starting at Xmm6, then
#                                   fall into .Lcommon_rbp_tail
2904 | .type cbc_se_handler,\@abi-omnipotent | ||
2905 | .align 16 | ||
2906 | cbc_se_handler: | ||
2907 | _CET_ENDBR | ||
2908 | push %rsi | ||
2909 | push %rdi | ||
2910 | push %rbx | ||
2911 | push %rbp | ||
2912 | push %r12 | ||
2913 | push %r13 | ||
2914 | push %r14 | ||
2915 | push %r15 | ||
2916 | pushfq | ||
2917 | sub \$64,%rsp | ||
2918 | |||
2919 | mov 152($context),%rax # pull context->Rsp | ||
2920 | mov 248($context),%rbx # pull context->Rip | ||
2921 | |||
2922 | lea .Lcbc_decrypt(%rip),%r10 | ||
2923 | cmp %r10,%rbx # context->Rip<"prologue" label | ||
2924 | jb .Lcommon_seh_tail | ||
2925 | |||
2926 | lea .Lcbc_decrypt_body(%rip),%r10 | ||
2927 | cmp %r10,%rbx # context->Rip<cbc_decrypt_body | ||
2928 | jb .Lrestore_cbc_rax | ||
2929 | |||
2930 | lea .Lcbc_ret(%rip),%r10 | ||
2931 | cmp %r10,%rbx # context->Rip>="epilogue" label | ||
2932 | jae .Lcommon_seh_tail | ||
2933 | |||
2934 | lea 16(%rax),%rsi # %xmm save area | ||
2935 | lea 512($context),%rdi # &context.Xmm6 | ||
2936 | mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax) | ||
2937 | .long 0xa548f3fc # cld; rep movsq | ||
2938 | |||
# .Lcommon_rbp_tail (entered by fall-through here and by jumps from the
# ctr32 and xts handlers): treats context->Rbp as the address of a saved
# %rbp slot. The saved value is written back to context->Rbp and the
# address 8 bytes above the slot becomes the stack pointer value in %rax.
2939 | .Lcommon_rbp_tail: | ||
2940 | mov 160($context),%rax # pull context->Rbp | ||
2941 | mov (%rax),%rbp # restore saved %rbp | ||
2942 | lea 8(%rax),%rax # adjust stack pointer | ||
2943 | mov %rbp,160($context) # restore context->Rbp | ||
2944 | jmp .Lcommon_seh_tail | ||
2945 | |||
# Between .Lcbc_decrypt and .Lcbc_decrypt_body the value in context->Rax
# is used in place of context->Rsp.
2946 | .Lrestore_cbc_rax: | ||
2947 | mov 120($context),%rax | ||
2948 | |||
# .Lcommon_seh_tail (shared by every handler in this file): on entry %rax
# is the stack pointer value to report. %rdi and %rsi are reloaded from
# 8(%rax) and 16(%rax) and written to the context together with %rax as
# Rsp. The whole context is then copied to disp->ContextRecord,
# RtlVirtualUnwind is called, the registers pushed by the handler are
# popped and 1 (ExceptionContinueSearch) is returned.
2949 | .Lcommon_seh_tail: | ||
2950 | mov 8(%rax),%rdi | ||
2951 | mov 16(%rax),%rsi | ||
2952 | mov %rax,152($context) # restore context->Rsp | ||
2953 | mov %rsi,168($context) # restore context->Rsi | ||
2954 | mov %rdi,176($context) # restore context->Rdi | ||
2955 | |||
# The count below is in quadwords because the copy is done by rep movsq.
2956 | mov 40($disp),%rdi # disp->ContextRecord | ||
2957 | mov $context,%rsi # context | ||
2958 | mov \$154,%ecx # sizeof(CONTEXT) | ||
2959 | .long 0xa548f3fc # cld; rep movsq | ||
2960 | |||
# Arguments 1-4 go in %rcx, %rdx, %r8, %r9; arguments 5-8 go in the 64
# bytes reserved on the stack by each handler prologue, at offsets 32-56.
2961 | mov $disp,%rsi | ||
2962 | xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | ||
2963 | mov 8(%rsi),%rdx # arg2, disp->ImageBase | ||
2964 | mov 0(%rsi),%r8 # arg3, disp->ControlPc | ||
2965 | mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | ||
2966 | mov 40(%rsi),%r10 # disp->ContextRecord | ||
2967 | lea 56(%rsi),%r11 # &disp->HandlerData | ||
2968 | lea 24(%rsi),%r12 # &disp->EstablisherFrame | ||
2969 | mov %r10,32(%rsp) # arg5 | ||
2970 | mov %r11,40(%rsp) # arg6 | ||
2971 | mov %r12,48(%rsp) # arg7 | ||
2972 | mov %rcx,56(%rsp) # arg8, (NULL) | ||
2973 | call *__imp_RtlVirtualUnwind(%rip) | ||
2974 | |||
# Undo the handler prologue in reverse order and return.
2975 | mov \$1,%eax # ExceptionContinueSearch | ||
2976 | add \$64,%rsp | ||
2977 | popfq | ||
2978 | pop %r15 | ||
2979 | pop %r14 | ||
2980 | pop %r13 | ||
2981 | pop %r12 | ||
2982 | pop %rbp | ||
2983 | pop %rbx | ||
2984 | pop %rdi | ||
2985 | pop %rsi | ||
2986 | ret | ||
2987 | .size cbc_se_handler,.-cbc_se_handler | ||
2988 | |||
# .pdata: Win64 function table. Each entry is three image-relative
# addresses (.rva): start of the covered code range, end of the range,
# and the unwind information record for that range in .xdata.
# The entries for ecb, ccm64, ctr32 and xts are emitted only when PREFIX
# is "aesni"; the cbc and set_*_key entries are always emitted. Both key
# setup routines share the single record .LSEH_info_key.
2989 | .section .pdata | ||
2990 | .align 4 | ||
2991 | ___ | ||
2992 | $code.=<<___ if ($PREFIX eq "aesni"); | ||
2993 | .rva .LSEH_begin_aesni_ecb_encrypt | ||
2994 | .rva .LSEH_end_aesni_ecb_encrypt | ||
2995 | .rva .LSEH_info_ecb | ||
2996 | |||
2997 | .rva .LSEH_begin_aesni_ccm64_encrypt_blocks | ||
2998 | .rva .LSEH_end_aesni_ccm64_encrypt_blocks | ||
2999 | .rva .LSEH_info_ccm64_enc | ||
3000 | |||
3001 | .rva .LSEH_begin_aesni_ccm64_decrypt_blocks | ||
3002 | .rva .LSEH_end_aesni_ccm64_decrypt_blocks | ||
3003 | .rva .LSEH_info_ccm64_dec | ||
3004 | |||
3005 | .rva .LSEH_begin_aesni_ctr32_encrypt_blocks | ||
3006 | .rva .LSEH_end_aesni_ctr32_encrypt_blocks | ||
3007 | .rva .LSEH_info_ctr32 | ||
3008 | |||
3009 | .rva .LSEH_begin_aesni_xts_encrypt | ||
3010 | .rva .LSEH_end_aesni_xts_encrypt | ||
3011 | .rva .LSEH_info_xts_enc | ||
3012 | |||
3013 | .rva .LSEH_begin_aesni_xts_decrypt | ||
3014 | .rva .LSEH_end_aesni_xts_decrypt | ||
3015 | .rva .LSEH_info_xts_dec | ||
3016 | ___ | ||
3017 | $code.=<<___; | ||
3018 | .rva .LSEH_begin_${PREFIX}_cbc_encrypt | ||
3019 | .rva .LSEH_end_${PREFIX}_cbc_encrypt | ||
3020 | .rva .LSEH_info_cbc | ||
3021 | |||
3022 | .rva ${PREFIX}_set_decrypt_key | ||
3023 | .rva .LSEH_end_set_decrypt_key | ||
3024 | .rva .LSEH_info_key | ||
3025 | |||
3026 | .rva ${PREFIX}_set_encrypt_key | ||
3027 | .rva .LSEH_end_set_encrypt_key | ||
3028 | .rva .LSEH_info_key | ||
# .xdata: unwind information records referenced from the .pdata entries.
# Records that start with ".byte 9,0,0,0" name a language-specific handler
# in the following .rva (9 = version 1 plus the exception-handler flag in
# the Windows x64 UNWIND_INFO header; no prologue bytes or unwind codes
# are declared). Where two more .rva values follow, they are the
# HandlerData[] pair read by ccm64_se_handler and xts_se_handler.
# .LSEH_info_key has no handler: it declares a 4-byte prologue with a
# single unwind code, annotated below as "sub rsp,8".
3029 | .section .xdata | ||
3030 | .align 8 | ||
3031 | ___ | ||
3032 | $code.=<<___ if ($PREFIX eq "aesni"); | ||
3033 | .LSEH_info_ecb: | ||
3034 | .byte 9,0,0,0 | ||
3035 | .rva ecb_se_handler | ||
3036 | .LSEH_info_ccm64_enc: | ||
3037 | .byte 9,0,0,0 | ||
3038 | .rva ccm64_se_handler | ||
3039 | .rva .Lccm64_enc_body,.Lccm64_enc_ret # HandlerData[] | ||
3040 | .LSEH_info_ccm64_dec: | ||
3041 | .byte 9,0,0,0 | ||
3042 | .rva ccm64_se_handler | ||
3043 | .rva .Lccm64_dec_body,.Lccm64_dec_ret # HandlerData[] | ||
3044 | .LSEH_info_ctr32: | ||
3045 | .byte 9,0,0,0 | ||
3046 | .rva ctr32_se_handler | ||
3047 | .LSEH_info_xts_enc: | ||
3048 | .byte 9,0,0,0 | ||
3049 | .rva xts_se_handler | ||
3050 | .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[] | ||
3051 | .LSEH_info_xts_dec: | ||
3052 | .byte 9,0,0,0 | ||
3053 | .rva xts_se_handler | ||
3054 | .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[] | ||
3055 | ___ | ||
3056 | $code.=<<___; | ||
3057 | .LSEH_info_cbc: | ||
3058 | .byte 9,0,0,0 | ||
3059 | .rva cbc_se_handler | ||
3060 | .LSEH_info_key: | ||
3061 | .byte 0x01,0x04,0x01,0x00 | ||
3062 | .byte 0x04,0x02,0x00,0x00 # sub rsp,8 | ||
3063 | ___ | ||
3064 | } | ||
3065 | |||
# rex(OPCODE, DST, SRC): append an x86-64 REX prefix byte to an opcode
# byte list when one is needed.
#   OPCODE  first argument; it is aliased to @opcode through the typeglob
#           for the duration of the call, so the push below lands in the
#           caller's array (callers are outside this chunk - presumably an
#           array reference is passed).
#   DST     register number; 8 or above sets bit 0x04 (REX.R).
#   SRC     register number; 8 or above sets bit 0x01 (REX.B).
# When either bit is set, 0x40 combined with those bits is pushed and the
# result of push is returned. Otherwise nothing is pushed and 0 is
# returned.
3066 | sub rex { | ||
3067 | local *opcode=shift; | ||
3068 | my ($reg,$rm)=@_; | ||
3069 | my $prefix=($reg>=8)?0x04:0; | ||
3070 | |||
3071 | $prefix|=0x01 if($rm>=8); | ||
3072 | return $prefix unless($prefix); | ||
3073 | push @opcode,0x40|$prefix; | ||
3074 | } | ||
3075 | |||
# Final stage of the generator: expand embedded expressions, emit the
# assembly text, and make sure it was actually written.
#
# Every backtick-delimited span in the accumulated text is evaluated as a
# Perl expression and replaced by its value.
3076 | $code =~ s/\`([^\`]*)\`/eval($1)/gem; | ||
3077 | |||
3078 | print $code; | ||
3079 | |||
# close reports errors that print cannot: a failed flush of buffered
# output and, when STDOUT is a pipe, a failing downstream process
# (NOTE(review): how STDOUT is opened is outside this chunk). The result
# used to be ignored, so a truncated assembly file could be produced
# while the script still exited successfully. Fail loudly instead.
3080 | close STDOUT or die "error closing STDOUT: $!"; | ||