diff options
Diffstat (limited to 'src/lib/libcrypto/aes/asm/aesni-x86_64.pl')
-rw-r--r-- | src/lib/libcrypto/aes/asm/aesni-x86_64.pl | 3041 |
1 files changed, 0 insertions, 3041 deletions
diff --git a/src/lib/libcrypto/aes/asm/aesni-x86_64.pl b/src/lib/libcrypto/aes/asm/aesni-x86_64.pl deleted file mode 100644 index c073667fcb..0000000000 --- a/src/lib/libcrypto/aes/asm/aesni-x86_64.pl +++ /dev/null | |||
@@ -1,3041 +0,0 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | # | ||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | # | ||
10 | # This module implements support for Intel AES-NI extension. In | ||
11 | # OpenSSL context it's used with Intel engine, but can also be used as | ||
12 | # drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for | ||
13 | # details]. | ||
14 | # | ||
15 | # Performance. | ||
16 | # | ||
17 | # Given aes(enc|dec) instructions' latency asymptotic performance for | ||
18 | # non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte | ||
19 | # processed with 128-bit key. And given their throughput asymptotic | ||
20 | # performance for parallelizable modes is 1.25 cycles per byte. Being | ||
21 | # asymptotic limit it's not something you commonly achieve in reality, | ||
22 | # but how close does one get? Below are results collected for | ||
23 | # different modes and block sizes. Pairs of numbers are for en-/ | ||
24 | # decryption. | ||
25 | # | ||
26 | # 16-byte 64-byte 256-byte 1-KB 8-KB | ||
27 | # ECB 4.25/4.25 1.38/1.38 1.28/1.28 1.26/1.26 1.26/1.26 | ||
28 | # CTR 5.42/5.42 1.92/1.92 1.44/1.44 1.28/1.28 1.26/1.26 | ||
29 | # CBC 4.38/4.43 4.15/1.43 4.07/1.32 4.07/1.29 4.06/1.28 | ||
30 | # CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07 | ||
31 | # OFB 5.42/5.42 4.64/4.64 4.44/4.44 4.39/4.39 4.38/4.38 | ||
32 | # CFB 5.73/5.85 5.56/5.62 5.48/5.56 5.47/5.55 5.47/5.55 | ||
33 | # | ||
34 | # ECB, CTR, CBC and CCM results are free from EVP overhead. This means | ||
35 | # that otherwise used 'openssl speed -evp aes-128-??? -engine aesni | ||
36 | # [-decrypt]' will exhibit 10-15% worse results for smaller blocks. | ||
37 | # The results were collected with specially crafted speed.c benchmark | ||
38 | # in order to compare them with results reported in "Intel Advanced | ||
39 | # Encryption Standard (AES) New Instruction Set" White Paper Revision | ||
40 | # 3.0 dated May 2010. All above results are consistently better. This | ||
41 | # module also provides better performance for block sizes smaller than | ||
42 | # 128 bytes in points *not* represented in the above table. | ||
43 | # | ||
44 | # Looking at the results for 8-KB buffer. | ||
45 | # | ||
46 | # CFB and OFB results are far from the limit, because implementation | ||
47 | # uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on | ||
48 | # single-block aesni_encrypt, which is not the most optimal way to go. | ||
49 | # CBC encrypt result is unexpectedly high and there is no documented | ||
50 | # explanation for it. Seemingly there is a small penalty for feeding | ||
51 | # the result back to AES unit the way it's done in CBC mode. There is | ||
52 | # nothing one can do and the result appears optimal. CCM result is | ||
53 | # identical to CBC, because CBC-MAC is essentially CBC encrypt without | ||
54 | # saving output. CCM CTR "stays invisible," because it's neatly | ||
55 | # interleaved with CBC-MAC. This provides ~30% improvement over | ||
56 | # "straightforward" CCM implementation with CTR and CBC-MAC performed | ||
57 | # disjointly. Parallelizable modes practically achieve the theoretical | ||
58 | # limit. | ||
59 | # | ||
60 | # Looking at how results vary with buffer size. | ||
61 | # | ||
62 | # Curves are practically saturated at 1-KB buffer size. In most cases | ||
63 | # "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one. | ||
64 | # CTR curve doesn't follow this pattern and is "slowest" changing one | ||
65 | # with "256-byte" result being 87% of "8-KB." This is because overhead | ||
66 | # in CTR mode is most computationally intensive. Small-block CCM | ||
67 | # decrypt is slower than encrypt, because first CTR and last CBC-MAC | ||
68 | # iterations can't be interleaved. | ||
69 | # | ||
70 | # Results for 192- and 256-bit keys. | ||
71 | # | ||
72 | # EVP-free results were observed to scale perfectly with number of | ||
73 | # rounds for larger block sizes, i.e. 192-bit result being 10/12 times | ||
74 | # lower and 256-bit one - 10/14. Well, in CBC encrypt case differences | ||
75 | # are a tad smaller, because the above mentioned penalty biases all | ||
76 | # results by same constant value. In similar way function call | ||
77 | # overhead affects small-block performance, as well as OFB and CFB | ||
78 | # results. Differences are not large, most common coefficients are | ||
79 | # 10/11.7 and 10/13.4 (as opposed to 10/12.0 and 10/14.0), but one | ||
80 | # can observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)... | ||
81 | |||
82 | # January 2011 | ||
83 | # | ||
84 | # While Westmere processor features 6 cycles latency for aes[enc|dec] | ||
85 | # instructions, which can be scheduled every second cycle, Sandy | ||
86 | # Bridge spends 8 cycles per instruction, but it can schedule them | ||
87 | # every cycle. This means that code targeting Westmere would perform | ||
88 | # suboptimally on Sandy Bridge. Therefore this update. | ||
89 | # | ||
90 | # In addition, non-parallelizable CBC encrypt (as well as CCM) is | ||
91 | # optimized. Relative improvement might appear modest, 8% on Westmere, | ||
92 | # but in absolute terms it's 3.77 cycles per byte encrypted with | ||
93 | # 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers | ||
94 | # should be compared to asymptotic limits of 3.75 for Westmere and | ||
95 | # 5.00 for Sandy Bridge. Actually, the fact that they get this close | ||
96 | # to asymptotic limits is quite amazing. Indeed, the limit is | ||
97 | # calculated as latency times number of rounds, 10 for 128-bit key, | ||
98 | # and divided by 16, the number of bytes in block, or in other words | ||
99 | # it accounts *solely* for aesenc instructions. But there are extra | ||
100 | # instructions, and numbers so close to the asymptotic limits mean | ||
101 | # that it's as if it takes as little as *one* additional cycle to | ||
102 | # execute all of them. How is it possible? It is possible thanks to | ||
103 | # out-of-order execution logic, which manages to overlap post- | ||
104 | # processing of previous block, things like saving the output, with | ||
105 | # actual encryption of current block, as well as pre-processing of | ||
106 | # current block, things like fetching input and xor-ing it with | ||
107 | # 0-round element of the key schedule, with actual encryption of | ||
108 | # previous block. Keep this in mind... | ||
109 | # | ||
110 | # For parallelizable modes, such as ECB, CBC decrypt, CTR, higher | ||
111 | # performance is achieved by interleaving instructions working on | ||
112 | # independent blocks. In which case asymptotic limit for such modes | ||
113 | # can be obtained by dividing above mentioned numbers by AES | ||
114 | # instructions' interleave factor. Westmere can execute at most 3 | ||
115 | # instructions at a time, meaning that optimal interleave factor is 3, | ||
116 | # and that's where the "magic" number of 1.25 comes from. "Optimal | ||
117 | # interleave factor" means that increase of interleave factor does | ||
118 | # not improve performance. The formula has proven to reflect reality | ||
119 | # pretty well on Westmere... Sandy Bridge on the other hand can | ||
120 | # execute up to 8 AES instructions at a time, so how does varying | ||
121 | # interleave factor affect the performance? Here is table for ECB | ||
122 | # (numbers are cycles per byte processed with 128-bit key): | ||
123 | # | ||
124 | # instruction interleave factor 3x 6x 8x | ||
125 | # theoretical asymptotic limit 1.67 0.83 0.625 | ||
126 | # measured performance for 8KB block 1.05 0.86 0.84 | ||
127 | # | ||
128 | # "as if" interleave factor 4.7x 5.8x 6.0x | ||
129 | # | ||
130 | # Further data for other parallelizable modes: | ||
131 | # | ||
132 | # CBC decrypt 1.16 0.93 0.93 | ||
133 | # CTR 1.14 0.91 n/a | ||
134 | # | ||
135 | # Well, given 3x column it's probably inappropriate to call the limit | ||
136 | # asymptotic, if it can be surpassed, isn't it? What happens there? | ||
137 | # Rewind to CBC paragraph for the answer. Yes, out-of-order execution | ||
138 | # magic is responsible for this. Processor overlaps not only the | ||
139 | # additional instructions with AES ones, but even AES instructions | ||
140 | # processing adjacent triplets of independent blocks. In the 6x case | ||
141 | # additional instructions still claim disproportionally small amount | ||
142 | # of additional cycles, but in 8x case number of instructions must be | ||
143 | # a tad too high for out-of-order logic to cope with, and AES unit | ||
144 | # remains underutilized... As you can see 8x interleave is hardly | ||
145 | # justifiable, so there is no need to feel bad that 32-bit aesni-x86.pl | ||
146 | # utilizes 6x interleave because of limited register bank capacity. | ||
147 | # | ||
148 | # Higher interleave factors do have negative impact on Westmere | ||
149 | # performance. While for ECB mode it's negligible ~1.5%, other | ||
150 | # parallelizables perform ~5% worse, which is outweighed by ~25% | ||
151 | # improvement on Sandy Bridge. To balance regression on Westmere | ||
152 | # CTR mode was implemented with 6x aesenc interleave factor. | ||
153 | |||
154 | # April 2011 | ||
155 | # | ||
156 | # Add aesni_xts_[en|de]crypt. Westmere spends 1.33 cycles processing | ||
157 | # one byte out of 8KB with 128-bit key, Sandy Bridge - 0.97. Just like | ||
158 | # in CTR mode AES instruction interleave factor was chosen to be 6x. | ||
159 | |||
160 | $PREFIX="aesni"; # if $PREFIX is set to "AES", the script | ||
161 | # generates drop-in replacement for | ||
162 | # crypto/aes/asm/aes-x86_64.pl:-) | ||
163 | # Arguments: [flavour] output; a lone argument containing '.' is the output. | ||
164 | $flavour = shift; | ||
165 | $output = shift; | ||
166 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } | ||
167 | # Win64 argument order applies to nasm/masm/mingw64 flavours or .asm output. | ||
168 | $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); | ||
169 | # Locate x86_64-xlate.pl next to this script or under ../../perlasm. | ||
170 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
171 | ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | ||
172 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | ||
173 | die "can't locate x86_64-xlate.pl"; | ||
174 | # Pipe everything printed to STDOUT through the translator; fail loudly if it cannot be started. | ||
175 | open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!"; | ||
176 | *STDOUT=*OUT; | ||
177 | |||
178 | $movkey = $PREFIX eq "aesni" ? "movups" : "movups"; | ||
179 | @_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order | ||
180 | ("%rdi","%rsi","%rdx","%rcx"); # Unix order | ||
181 | |||
182 | $code=".text\n"; | ||
183 | |||
184 | $rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!! | ||
185 | # this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ... | ||
186 | $inp="%rdi"; | ||
187 | $out="%rsi"; | ||
188 | $len="%rdx"; | ||
189 | $key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!! | ||
190 | $ivp="%r8"; # cbc, ctr, ... | ||
191 | |||
192 | $rnds_="%r10d"; # backup copy for $rounds | ||
193 | $key_="%r11"; # backup copy for $key | ||
194 | |||
195 | # %xmm register layout | ||
196 | $rndkey0="%xmm0"; $rndkey1="%xmm1"; | ||
197 | $inout0="%xmm2"; $inout1="%xmm3"; | ||
198 | $inout2="%xmm4"; $inout3="%xmm5"; | ||
199 | $inout4="%xmm6"; $inout5="%xmm7"; | ||
200 | $inout6="%xmm8"; $inout7="%xmm9"; | ||
201 | |||
202 | $in2="%xmm6"; $in1="%xmm7"; # used in CBC decrypt, CTR, ... | ||
203 | $in0="%xmm8"; $iv="%xmm9"; | ||
204 | |||
205 | # aesni_generate1($p,$key,$rounds[,$inout[,$ivec]]) appends an inline | ||
206 | # one-block aes$p sequence to $code. $inout defaults to $inout0; when $ivec | ||
207 | # is given it is xor-ed with round key 0 and then into $inout. The loop is | ||
208 | # folded because aes[enc|dec] is slow enough to absorb loop-variable upkeep. | ||
209 | { my $sn; | ||
210 | sub aesni_generate1 { | ||
211 | my ($p,$key,$rounds,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout)); | ||
212 | ++$sn; | ||
213 | $code.=<<___; | ||
214 | $movkey ($key),$rndkey0 | ||
215 | $movkey 16($key),$rndkey1 | ||
216 | ___ | ||
217 | $code.=<<___ if (defined($ivec)); | ||
218 | xorps $rndkey0,$ivec | ||
219 | lea 32($key),$key | ||
220 | xorps $ivec,$inout | ||
221 | ___ | ||
222 | $code.=<<___ if (!defined($ivec)); | ||
223 | lea 32($key),$key | ||
224 | xorps $rndkey0,$inout | ||
225 | ___ | ||
226 | $code.=<<___; | ||
227 | .Loop_${p}1_$sn: | ||
228 | aes${p} $rndkey1,$inout | ||
229 | dec $rounds | ||
230 | $movkey ($key),$rndkey1 | ||
231 | lea 16($key),$key | ||
232 | jnz .Loop_${p}1_$sn # loop body is 16 bytes | ||
233 | aes${p}last $rndkey1,$inout | ||
234 | ___ | ||
235 | }} | ||
236 | # void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key); | ||
237 | # Single-block entry points; argument registers are taken from @_4args. | ||
238 | { my ($inp,$out,$key) = @_4args; | ||
239 | |||
240 | $code.=<<___; | ||
241 | .globl ${PREFIX}_encrypt | ||
242 | .type ${PREFIX}_encrypt,\@abi-omnipotent | ||
243 | .align 16 | ||
244 | ${PREFIX}_encrypt: | ||
245 | movups ($inp),$inout0 # load input | ||
246 | mov 240($key),$rounds # key->rounds | ||
247 | ___ | ||
248 | &aesni_generate1("enc",$key,$rounds); | ||
249 | $code.=<<___; | ||
250 | movups $inout0,($out) # output | ||
251 | ret | ||
252 | .size ${PREFIX}_encrypt,.-${PREFIX}_encrypt | ||
253 | |||
254 | .globl ${PREFIX}_decrypt | ||
255 | .type ${PREFIX}_decrypt,\@abi-omnipotent | ||
256 | .align 16 | ||
257 | ${PREFIX}_decrypt: | ||
258 | movups ($inp),$inout0 # load input | ||
259 | mov 240($key),$rounds # key->rounds | ||
260 | ___ | ||
261 | &aesni_generate1("dec",$key,$rounds); | ||
262 | $code.=<<___; | ||
263 | movups $inout0,($out) # output | ||
264 | ret | ||
265 | .size ${PREFIX}_decrypt, .-${PREFIX}_decrypt | ||
266 | ___ | ||
267 | } | ||
268 | |||
269 | # _aesni_[en|de]cryptN are private interfaces, N denotes interleave | ||
270 | # factor. Why were 3x subroutines originally used in loops? Even though | ||
271 | # aes[enc|dec] latency was originally 6, it could be scheduled only | ||
272 | # every *2nd* cycle. Thus 3x interleave was the one providing optimal | ||
273 | # utilization, i.e. when subroutine's throughput is virtually same as | ||
274 | # of non-interleaved subroutine [for number of input blocks up to 3]. | ||
275 | # This is why it makes no sense to implement 2x subroutine. | ||
276 | # aes[enc|dec] latency in next processor generation is 8, but the | ||
277 | # instructions can be scheduled every cycle. Optimal interleave for | ||
278 | # new processor is therefore 8x... | ||
279 | sub aesni_generate3 { | ||
280 | my $dir=shift; | ||
281 | # Emits _aesni_${dir}rypt3: advances $key and consumes $rounds (neither | ||
282 | # is preserved); $inout0-$inout2 are transformed in place. | ||
283 | $code.=<<___; | ||
284 | .type _aesni_${dir}rypt3,\@abi-omnipotent | ||
285 | .align 16 | ||
286 | _aesni_${dir}rypt3: | ||
287 | $movkey ($key),$rndkey0 | ||
288 | shr \$1,$rounds | ||
289 | $movkey 16($key),$rndkey1 | ||
290 | lea 32($key),$key | ||
291 | xorps $rndkey0,$inout0 | ||
292 | xorps $rndkey0,$inout1 | ||
293 | xorps $rndkey0,$inout2 | ||
294 | $movkey ($key),$rndkey0 | ||
295 | |||
296 | .L${dir}_loop3: | ||
297 | aes${dir} $rndkey1,$inout0 | ||
298 | aes${dir} $rndkey1,$inout1 | ||
299 | dec $rounds | ||
300 | aes${dir} $rndkey1,$inout2 | ||
301 | $movkey 16($key),$rndkey1 | ||
302 | aes${dir} $rndkey0,$inout0 | ||
303 | aes${dir} $rndkey0,$inout1 | ||
304 | lea 32($key),$key | ||
305 | aes${dir} $rndkey0,$inout2 | ||
306 | $movkey ($key),$rndkey0 | ||
307 | jnz .L${dir}_loop3 | ||
308 | |||
309 | aes${dir} $rndkey1,$inout0 | ||
310 | aes${dir} $rndkey1,$inout1 | ||
311 | aes${dir} $rndkey1,$inout2 | ||
312 | aes${dir}last $rndkey0,$inout0 | ||
313 | aes${dir}last $rndkey0,$inout1 | ||
314 | aes${dir}last $rndkey0,$inout2 | ||
315 | ret | ||
316 | .size _aesni_${dir}rypt3,.-_aesni_${dir}rypt3 | ||
317 | ___ | ||
318 | } | ||
319 | # 4x interleave is implemented to improve small block performance, | ||
320 | # most notably [and naturally] 4 block by ~30%. One can argue that one | ||
321 | # should have implemented 5x as well, but improvement would be <20%, | ||
322 | # so it's not worth it... | ||
323 | sub aesni_generate4 { | ||
324 | my $dir=shift; | ||
325 | # Emits _aesni_${dir}rypt4: advances $key and consumes $rounds (neither | ||
326 | # is preserved); $inout0-$inout3 are transformed in place. | ||
327 | $code.=<<___; | ||
328 | .type _aesni_${dir}rypt4,\@abi-omnipotent | ||
329 | .align 16 | ||
330 | _aesni_${dir}rypt4: | ||
331 | $movkey ($key),$rndkey0 | ||
332 | shr \$1,$rounds | ||
333 | $movkey 16($key),$rndkey1 | ||
334 | lea 32($key),$key | ||
335 | xorps $rndkey0,$inout0 | ||
336 | xorps $rndkey0,$inout1 | ||
337 | xorps $rndkey0,$inout2 | ||
338 | xorps $rndkey0,$inout3 | ||
339 | $movkey ($key),$rndkey0 | ||
340 | |||
341 | .L${dir}_loop4: | ||
342 | aes${dir} $rndkey1,$inout0 | ||
343 | aes${dir} $rndkey1,$inout1 | ||
344 | dec $rounds | ||
345 | aes${dir} $rndkey1,$inout2 | ||
346 | aes${dir} $rndkey1,$inout3 | ||
347 | $movkey 16($key),$rndkey1 | ||
348 | aes${dir} $rndkey0,$inout0 | ||
349 | aes${dir} $rndkey0,$inout1 | ||
350 | lea 32($key),$key | ||
351 | aes${dir} $rndkey0,$inout2 | ||
352 | aes${dir} $rndkey0,$inout3 | ||
353 | $movkey ($key),$rndkey0 | ||
354 | jnz .L${dir}_loop4 | ||
355 | |||
356 | aes${dir} $rndkey1,$inout0 | ||
357 | aes${dir} $rndkey1,$inout1 | ||
358 | aes${dir} $rndkey1,$inout2 | ||
359 | aes${dir} $rndkey1,$inout3 | ||
360 | aes${dir}last $rndkey0,$inout0 | ||
361 | aes${dir}last $rndkey0,$inout1 | ||
362 | aes${dir}last $rndkey0,$inout2 | ||
363 | aes${dir}last $rndkey0,$inout3 | ||
364 | ret | ||
365 | .size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4 | ||
366 | ___ | ||
367 | } | ||
368 | sub aesni_generate6 { | ||
369 | my $dir=shift; | ||
370 | # Emits _aesni_${dir}rypt6: advances $key and consumes $rounds (neither | ||
371 | # is preserved); $inout0-$inout5 are transformed in place. | ||
372 | $code.=<<___; | ||
373 | .type _aesni_${dir}rypt6,\@abi-omnipotent | ||
374 | .align 16 | ||
375 | _aesni_${dir}rypt6: | ||
376 | $movkey ($key),$rndkey0 | ||
377 | shr \$1,$rounds | ||
378 | $movkey 16($key),$rndkey1 | ||
379 | lea 32($key),$key | ||
380 | xorps $rndkey0,$inout0 | ||
381 | pxor $rndkey0,$inout1 | ||
382 | aes${dir} $rndkey1,$inout0 | ||
383 | pxor $rndkey0,$inout2 | ||
384 | aes${dir} $rndkey1,$inout1 | ||
385 | pxor $rndkey0,$inout3 | ||
386 | aes${dir} $rndkey1,$inout2 | ||
387 | pxor $rndkey0,$inout4 | ||
388 | aes${dir} $rndkey1,$inout3 | ||
389 | pxor $rndkey0,$inout5 | ||
390 | dec $rounds | ||
391 | aes${dir} $rndkey1,$inout4 | ||
392 | $movkey ($key),$rndkey0 | ||
393 | aes${dir} $rndkey1,$inout5 | ||
394 | jmp .L${dir}_loop6_enter | ||
395 | .align 16 | ||
396 | .L${dir}_loop6: | ||
397 | aes${dir} $rndkey1,$inout0 | ||
398 | aes${dir} $rndkey1,$inout1 | ||
399 | dec $rounds | ||
400 | aes${dir} $rndkey1,$inout2 | ||
401 | aes${dir} $rndkey1,$inout3 | ||
402 | aes${dir} $rndkey1,$inout4 | ||
403 | aes${dir} $rndkey1,$inout5 | ||
404 | .L${dir}_loop6_enter: # happens to be 16-byte aligned | ||
405 | $movkey 16($key),$rndkey1 | ||
406 | aes${dir} $rndkey0,$inout0 | ||
407 | aes${dir} $rndkey0,$inout1 | ||
408 | lea 32($key),$key | ||
409 | aes${dir} $rndkey0,$inout2 | ||
410 | aes${dir} $rndkey0,$inout3 | ||
411 | aes${dir} $rndkey0,$inout4 | ||
412 | aes${dir} $rndkey0,$inout5 | ||
413 | $movkey ($key),$rndkey0 | ||
414 | jnz .L${dir}_loop6 | ||
415 | |||
416 | aes${dir} $rndkey1,$inout0 | ||
417 | aes${dir} $rndkey1,$inout1 | ||
418 | aes${dir} $rndkey1,$inout2 | ||
419 | aes${dir} $rndkey1,$inout3 | ||
420 | aes${dir} $rndkey1,$inout4 | ||
421 | aes${dir} $rndkey1,$inout5 | ||
422 | aes${dir}last $rndkey0,$inout0 | ||
423 | aes${dir}last $rndkey0,$inout1 | ||
424 | aes${dir}last $rndkey0,$inout2 | ||
425 | aes${dir}last $rndkey0,$inout3 | ||
426 | aes${dir}last $rndkey0,$inout4 | ||
427 | aes${dir}last $rndkey0,$inout5 | ||
428 | ret | ||
429 | .size _aesni_${dir}rypt6,.-_aesni_${dir}rypt6 | ||
430 | ___ | ||
431 | } | ||
432 | sub aesni_generate8 { | ||
433 | my $dir=shift; | ||
434 | # Emits _aesni_${dir}rypt8: advances $key and consumes $rounds (neither | ||
435 | # is preserved); $inout0-$inout7 are transformed in place. | ||
436 | $code.=<<___; | ||
437 | .type _aesni_${dir}rypt8,\@abi-omnipotent | ||
438 | .align 16 | ||
439 | _aesni_${dir}rypt8: | ||
440 | $movkey ($key),$rndkey0 | ||
441 | shr \$1,$rounds | ||
442 | $movkey 16($key),$rndkey1 | ||
443 | lea 32($key),$key | ||
444 | xorps $rndkey0,$inout0 | ||
445 | xorps $rndkey0,$inout1 | ||
446 | aes${dir} $rndkey1,$inout0 | ||
447 | pxor $rndkey0,$inout2 | ||
448 | aes${dir} $rndkey1,$inout1 | ||
449 | pxor $rndkey0,$inout3 | ||
450 | aes${dir} $rndkey1,$inout2 | ||
451 | pxor $rndkey0,$inout4 | ||
452 | aes${dir} $rndkey1,$inout3 | ||
453 | pxor $rndkey0,$inout5 | ||
454 | dec $rounds | ||
455 | aes${dir} $rndkey1,$inout4 | ||
456 | pxor $rndkey0,$inout6 | ||
457 | aes${dir} $rndkey1,$inout5 | ||
458 | pxor $rndkey0,$inout7 | ||
459 | $movkey ($key),$rndkey0 | ||
460 | aes${dir} $rndkey1,$inout6 | ||
461 | aes${dir} $rndkey1,$inout7 | ||
462 | $movkey 16($key),$rndkey1 | ||
463 | jmp .L${dir}_loop8_enter | ||
464 | .align 16 | ||
465 | .L${dir}_loop8: | ||
466 | aes${dir} $rndkey1,$inout0 | ||
467 | aes${dir} $rndkey1,$inout1 | ||
468 | dec $rounds | ||
469 | aes${dir} $rndkey1,$inout2 | ||
470 | aes${dir} $rndkey1,$inout3 | ||
471 | aes${dir} $rndkey1,$inout4 | ||
472 | aes${dir} $rndkey1,$inout5 | ||
473 | aes${dir} $rndkey1,$inout6 | ||
474 | aes${dir} $rndkey1,$inout7 | ||
475 | $movkey 16($key),$rndkey1 | ||
476 | .L${dir}_loop8_enter: # happens to be 16-byte aligned | ||
477 | aes${dir} $rndkey0,$inout0 | ||
478 | aes${dir} $rndkey0,$inout1 | ||
479 | lea 32($key),$key | ||
480 | aes${dir} $rndkey0,$inout2 | ||
481 | aes${dir} $rndkey0,$inout3 | ||
482 | aes${dir} $rndkey0,$inout4 | ||
483 | aes${dir} $rndkey0,$inout5 | ||
484 | aes${dir} $rndkey0,$inout6 | ||
485 | aes${dir} $rndkey0,$inout7 | ||
486 | $movkey ($key),$rndkey0 | ||
487 | jnz .L${dir}_loop8 | ||
488 | |||
489 | aes${dir} $rndkey1,$inout0 | ||
490 | aes${dir} $rndkey1,$inout1 | ||
491 | aes${dir} $rndkey1,$inout2 | ||
492 | aes${dir} $rndkey1,$inout3 | ||
493 | aes${dir} $rndkey1,$inout4 | ||
494 | aes${dir} $rndkey1,$inout5 | ||
495 | aes${dir} $rndkey1,$inout6 | ||
496 | aes${dir} $rndkey1,$inout7 | ||
497 | aes${dir}last $rndkey0,$inout0 | ||
498 | aes${dir}last $rndkey0,$inout1 | ||
499 | aes${dir}last $rndkey0,$inout2 | ||
500 | aes${dir}last $rndkey0,$inout3 | ||
501 | aes${dir}last $rndkey0,$inout4 | ||
502 | aes${dir}last $rndkey0,$inout5 | ||
503 | aes${dir}last $rndkey0,$inout6 | ||
504 | aes${dir}last $rndkey0,$inout7 | ||
505 | ret | ||
506 | .size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8 | ||
507 | ___ | ||
508 | } | ||
509 | # Emit the 3x/4x/6x/8x helpers in order; encrypt variants only when $PREFIX is "aesni". | ||
510 | foreach my $emit (\&aesni_generate3, \&aesni_generate4, | ||
511 | \&aesni_generate6, \&aesni_generate8) { | ||
512 | $emit->("enc") if ($PREFIX eq "aesni"); | ||
513 | $emit->("dec"); | ||
514 | } | ||
515 | |||
516 | |||
517 | |||
518 | if ($PREFIX eq "aesni") { | ||
519 | ######################################################################## | ||
520 | # void aesni_ecb_encrypt (const void *in, void *out, | ||
521 | # size_t length, const AES_KEY *key, | ||
522 | # int enc); | ||
523 | $code.=<<___; | ||
524 | .globl aesni_ecb_encrypt | ||
525 | .type aesni_ecb_encrypt,\@function,5 | ||
526 | .align 16 | ||
527 | aesni_ecb_encrypt: | ||
528 | and \$-16,$len | ||
529 | jz .Lecb_ret | ||
530 | |||
531 | mov 240($key),$rounds # key->rounds | ||
532 | $movkey ($key),$rndkey0 | ||
533 | mov $key,$key_ # backup $key | ||
534 | mov $rounds,$rnds_ # backup $rounds | ||
535 | test %r8d,%r8d # 5th argument | ||
536 | jz .Lecb_decrypt | ||
537 | #--------------------------- ECB ENCRYPT ------------------------------# | ||
538 | cmp \$0x80,$len | ||
539 | jb .Lecb_enc_tail | ||
540 | |||
541 | movdqu ($inp),$inout0 | ||
542 | movdqu 0x10($inp),$inout1 | ||
543 | movdqu 0x20($inp),$inout2 | ||
544 | movdqu 0x30($inp),$inout3 | ||
545 | movdqu 0x40($inp),$inout4 | ||
546 | movdqu 0x50($inp),$inout5 | ||
547 | movdqu 0x60($inp),$inout6 | ||
548 | movdqu 0x70($inp),$inout7 | ||
549 | lea 0x80($inp),$inp | ||
550 | sub \$0x80,$len | ||
551 | jmp .Lecb_enc_loop8_enter | ||
552 | .align 16 | ||
553 | .Lecb_enc_loop8: | ||
554 | movups $inout0,($out) | ||
555 | mov $key_,$key # restore $key | ||
556 | movdqu ($inp),$inout0 | ||
557 | mov $rnds_,$rounds # restore $rounds | ||
558 | movups $inout1,0x10($out) | ||
559 | movdqu 0x10($inp),$inout1 | ||
560 | movups $inout2,0x20($out) | ||
561 | movdqu 0x20($inp),$inout2 | ||
562 | movups $inout3,0x30($out) | ||
563 | movdqu 0x30($inp),$inout3 | ||
564 | movups $inout4,0x40($out) | ||
565 | movdqu 0x40($inp),$inout4 | ||
566 | movups $inout5,0x50($out) | ||
567 | movdqu 0x50($inp),$inout5 | ||
568 | movups $inout6,0x60($out) | ||
569 | movdqu 0x60($inp),$inout6 | ||
570 | movups $inout7,0x70($out) | ||
571 | lea 0x80($out),$out | ||
572 | movdqu 0x70($inp),$inout7 | ||
573 | lea 0x80($inp),$inp | ||
574 | .Lecb_enc_loop8_enter: | ||
575 | |||
576 | call _aesni_encrypt8 | ||
577 | |||
578 | sub \$0x80,$len | ||
579 | jnc .Lecb_enc_loop8 | ||
580 | |||
581 | movups $inout0,($out) | ||
582 | mov $key_,$key # restore $key | ||
583 | movups $inout1,0x10($out) | ||
584 | mov $rnds_,$rounds # restore $rounds | ||
585 | movups $inout2,0x20($out) | ||
586 | movups $inout3,0x30($out) | ||
587 | movups $inout4,0x40($out) | ||
588 | movups $inout5,0x50($out) | ||
589 | movups $inout6,0x60($out) | ||
590 | movups $inout7,0x70($out) | ||
591 | lea 0x80($out),$out | ||
592 | add \$0x80,$len | ||
593 | jz .Lecb_ret | ||
594 | |||
595 | .Lecb_enc_tail: | ||
596 | movups ($inp),$inout0 | ||
597 | cmp \$0x20,$len | ||
598 | jb .Lecb_enc_one | ||
599 | movups 0x10($inp),$inout1 | ||
600 | je .Lecb_enc_two | ||
601 | movups 0x20($inp),$inout2 | ||
602 | cmp \$0x40,$len | ||
603 | jb .Lecb_enc_three | ||
604 | movups 0x30($inp),$inout3 | ||
605 | je .Lecb_enc_four | ||
606 | movups 0x40($inp),$inout4 | ||
607 | cmp \$0x60,$len | ||
608 | jb .Lecb_enc_five | ||
609 | movups 0x50($inp),$inout5 | ||
610 | je .Lecb_enc_six | ||
611 | movdqu 0x60($inp),$inout6 | ||
612 | call _aesni_encrypt8 | ||
613 | movups $inout0,($out) | ||
614 | movups $inout1,0x10($out) | ||
615 | movups $inout2,0x20($out) | ||
616 | movups $inout3,0x30($out) | ||
617 | movups $inout4,0x40($out) | ||
618 | movups $inout5,0x50($out) | ||
619 | movups $inout6,0x60($out) | ||
620 | jmp .Lecb_ret | ||
621 | .align 16 | ||
622 | .Lecb_enc_one: | ||
623 | ___ | ||
624 | &aesni_generate1("enc",$key,$rounds); | ||
625 | $code.=<<___; | ||
626 | movups $inout0,($out) | ||
627 | jmp .Lecb_ret | ||
628 | .align 16 | ||
629 | .Lecb_enc_two: | ||
630 | xorps $inout2,$inout2 | ||
631 | call _aesni_encrypt3 | ||
632 | movups $inout0,($out) | ||
633 | movups $inout1,0x10($out) | ||
634 | jmp .Lecb_ret | ||
635 | .align 16 | ||
636 | .Lecb_enc_three: | ||
637 | call _aesni_encrypt3 | ||
638 | movups $inout0,($out) | ||
639 | movups $inout1,0x10($out) | ||
640 | movups $inout2,0x20($out) | ||
641 | jmp .Lecb_ret | ||
642 | .align 16 | ||
643 | .Lecb_enc_four: | ||
644 | call _aesni_encrypt4 | ||
645 | movups $inout0,($out) | ||
646 | movups $inout1,0x10($out) | ||
647 | movups $inout2,0x20($out) | ||
648 | movups $inout3,0x30($out) | ||
649 | jmp .Lecb_ret | ||
650 | .align 16 | ||
651 | .Lecb_enc_five: | ||
652 | xorps $inout5,$inout5 | ||
653 | call _aesni_encrypt6 | ||
654 | movups $inout0,($out) | ||
655 | movups $inout1,0x10($out) | ||
656 | movups $inout2,0x20($out) | ||
657 | movups $inout3,0x30($out) | ||
658 | movups $inout4,0x40($out) | ||
659 | jmp .Lecb_ret | ||
660 | .align 16 | ||
661 | .Lecb_enc_six: | ||
662 | call _aesni_encrypt6 | ||
663 | movups $inout0,($out) | ||
664 | movups $inout1,0x10($out) | ||
665 | movups $inout2,0x20($out) | ||
666 | movups $inout3,0x30($out) | ||
667 | movups $inout4,0x40($out) | ||
668 | movups $inout5,0x50($out) | ||
669 | jmp .Lecb_ret | ||
670 | #--------------------------- ECB DECRYPT ------------------------------# | ||
671 | .align 16 | ||
672 | .Lecb_decrypt: | ||
673 | cmp \$0x80,$len | ||
674 | jb .Lecb_dec_tail | ||
675 | |||
676 | movdqu ($inp),$inout0 | ||
677 | movdqu 0x10($inp),$inout1 | ||
678 | movdqu 0x20($inp),$inout2 | ||
679 | movdqu 0x30($inp),$inout3 | ||
680 | movdqu 0x40($inp),$inout4 | ||
681 | movdqu 0x50($inp),$inout5 | ||
682 | movdqu 0x60($inp),$inout6 | ||
683 | movdqu 0x70($inp),$inout7 | ||
684 | lea 0x80($inp),$inp | ||
685 | sub \$0x80,$len | ||
686 | jmp .Lecb_dec_loop8_enter | ||
687 | .align 16 | ||
688 | .Lecb_dec_loop8: | ||
689 | movups $inout0,($out) | ||
690 | mov $key_,$key # restore $key | ||
691 | movdqu ($inp),$inout0 | ||
692 | mov $rnds_,$rounds # restore $rounds | ||
693 | movups $inout1,0x10($out) | ||
694 | movdqu 0x10($inp),$inout1 | ||
695 | movups $inout2,0x20($out) | ||
696 | movdqu 0x20($inp),$inout2 | ||
697 | movups $inout3,0x30($out) | ||
698 | movdqu 0x30($inp),$inout3 | ||
699 | movups $inout4,0x40($out) | ||
700 | movdqu 0x40($inp),$inout4 | ||
701 | movups $inout5,0x50($out) | ||
702 | movdqu 0x50($inp),$inout5 | ||
703 | movups $inout6,0x60($out) | ||
704 | movdqu 0x60($inp),$inout6 | ||
705 | movups $inout7,0x70($out) | ||
706 | lea 0x80($out),$out | ||
707 | movdqu 0x70($inp),$inout7 | ||
708 | lea 0x80($inp),$inp | ||
709 | .Lecb_dec_loop8_enter: | ||
710 | |||
711 | call _aesni_decrypt8 | ||
712 | |||
713 | $movkey ($key_),$rndkey0 | ||
714 | sub \$0x80,$len | ||
715 | jnc .Lecb_dec_loop8 | ||
716 | |||
717 | movups $inout0,($out) | ||
718 | mov $key_,$key # restore $key | ||
719 | movups $inout1,0x10($out) | ||
720 | mov $rnds_,$rounds # restore $rounds | ||
721 | movups $inout2,0x20($out) | ||
722 | movups $inout3,0x30($out) | ||
723 | movups $inout4,0x40($out) | ||
724 | movups $inout5,0x50($out) | ||
725 | movups $inout6,0x60($out) | ||
726 | movups $inout7,0x70($out) | ||
727 | lea 0x80($out),$out | ||
728 | add \$0x80,$len | ||
729 | jz .Lecb_ret | ||
730 | |||
731 | .Lecb_dec_tail: | ||
732 | movups ($inp),$inout0 | ||
733 | cmp \$0x20,$len | ||
734 | jb .Lecb_dec_one | ||
735 | movups 0x10($inp),$inout1 | ||
736 | je .Lecb_dec_two | ||
737 | movups 0x20($inp),$inout2 | ||
738 | cmp \$0x40,$len | ||
739 | jb .Lecb_dec_three | ||
740 | movups 0x30($inp),$inout3 | ||
741 | je .Lecb_dec_four | ||
742 | movups 0x40($inp),$inout4 | ||
743 | cmp \$0x60,$len | ||
744 | jb .Lecb_dec_five | ||
745 | movups 0x50($inp),$inout5 | ||
746 | je .Lecb_dec_six | ||
747 | movups 0x60($inp),$inout6 | ||
748 | $movkey ($key),$rndkey0 | ||
749 | call _aesni_decrypt8 | ||
750 | movups $inout0,($out) | ||
751 | movups $inout1,0x10($out) | ||
752 | movups $inout2,0x20($out) | ||
753 | movups $inout3,0x30($out) | ||
754 | movups $inout4,0x40($out) | ||
755 | movups $inout5,0x50($out) | ||
756 | movups $inout6,0x60($out) | ||
757 | jmp .Lecb_ret | ||
758 | .align 16 | ||
759 | .Lecb_dec_one: | ||
760 | ___ | ||
761 | &aesni_generate1("dec",$key,$rounds); | ||
762 | $code.=<<___; | ||
763 | movups $inout0,($out) | ||
764 | jmp .Lecb_ret | ||
765 | .align 16 | ||
766 | .Lecb_dec_two: | ||
767 | xorps $inout2,$inout2 | ||
768 | call _aesni_decrypt3 | ||
769 | movups $inout0,($out) | ||
770 | movups $inout1,0x10($out) | ||
771 | jmp .Lecb_ret | ||
772 | .align 16 | ||
773 | .Lecb_dec_three: | ||
774 | call _aesni_decrypt3 | ||
775 | movups $inout0,($out) | ||
776 | movups $inout1,0x10($out) | ||
777 | movups $inout2,0x20($out) | ||
778 | jmp .Lecb_ret | ||
779 | .align 16 | ||
780 | .Lecb_dec_four: | ||
781 | call _aesni_decrypt4 | ||
782 | movups $inout0,($out) | ||
783 | movups $inout1,0x10($out) | ||
784 | movups $inout2,0x20($out) | ||
785 | movups $inout3,0x30($out) | ||
786 | jmp .Lecb_ret | ||
787 | .align 16 | ||
788 | .Lecb_dec_five: | ||
789 | xorps $inout5,$inout5 | ||
790 | call _aesni_decrypt6 | ||
791 | movups $inout0,($out) | ||
792 | movups $inout1,0x10($out) | ||
793 | movups $inout2,0x20($out) | ||
794 | movups $inout3,0x30($out) | ||
795 | movups $inout4,0x40($out) | ||
796 | jmp .Lecb_ret | ||
797 | .align 16 | ||
798 | .Lecb_dec_six: | ||
799 | call _aesni_decrypt6 | ||
800 | movups $inout0,($out) | ||
801 | movups $inout1,0x10($out) | ||
802 | movups $inout2,0x20($out) | ||
803 | movups $inout3,0x30($out) | ||
804 | movups $inout4,0x40($out) | ||
805 | movups $inout5,0x50($out) | ||
806 | |||
807 | .Lecb_ret: | ||
808 | ret | ||
809 | .size aesni_ecb_encrypt,.-aesni_ecb_encrypt | ||
810 | ___ | ||
811 | |||
812 | { | ||
813 | ###################################################################### | ||
814 | # void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out, | ||
815 | # size_t blocks, const AES_KEY *key, | ||
816 | # const char *ivec,char *cmac); | ||
817 | # | ||
818 | # Handles only complete blocks, operates on 64-bit counter and | ||
819 | # does not update *ivec! Nor does it finalize CMAC value | ||
820 | # (see engine/eng_aesni.c for details) | ||
821 | # | ||
822 | { | ||
823 | my $cmac="%r9"; # 6th argument | ||
824 | |||
825 | my $increment="%xmm6"; | ||
826 | my $bswap_mask="%xmm7"; | ||
827 | |||
828 | $code.=<<___; | ||
829 | .globl aesni_ccm64_encrypt_blocks | ||
830 | .type aesni_ccm64_encrypt_blocks,\@function,6 | ||
831 | .align 16 | ||
832 | aesni_ccm64_encrypt_blocks: | ||
833 | ___ | ||
# Win64-only prologue for aesni_ccm64_encrypt_blocks: open a 0x58-byte
# stack frame and spill %xmm6-%xmm9 into its first 0x40 bytes (these
# registers are non-volatile in the Microsoft x64 convention).  The
# .Lccm64_enc_body label marks the end of the prologue.  Nothing in the
# text needs interpolation, hence the single-quoted heredoc.
if ($win64) {
$code.=<<'___';
lea -0x58(%rsp),%rsp
movaps %xmm6,(%rsp)
movaps %xmm7,0x10(%rsp)
movaps %xmm8,0x20(%rsp)
movaps %xmm9,0x30(%rsp)
.Lccm64_enc_body:
___
}
842 | $code.=<<___; | ||
843 | mov 240($key),$rounds # key->rounds | ||
844 | movdqu ($ivp),$iv | ||
845 | movdqa .Lincrement64(%rip),$increment | ||
846 | movdqa .Lbswap_mask(%rip),$bswap_mask | ||
847 | |||
848 | shr \$1,$rounds | ||
849 | lea 0($key),$key_ | ||
850 | movdqu ($cmac),$inout1 | ||
851 | movdqa $iv,$inout0 | ||
852 | mov $rounds,$rnds_ | ||
853 | pshufb $bswap_mask,$iv | ||
854 | jmp .Lccm64_enc_outer | ||
855 | .align 16 | ||
856 | .Lccm64_enc_outer: | ||
857 | $movkey ($key_),$rndkey0 | ||
858 | mov $rnds_,$rounds | ||
859 | movups ($inp),$in0 # load inp | ||
860 | |||
861 | xorps $rndkey0,$inout0 # counter | ||
862 | $movkey 16($key_),$rndkey1 | ||
863 | xorps $in0,$rndkey0 | ||
864 | lea 32($key_),$key | ||
865 | xorps $rndkey0,$inout1 # cmac^=inp | ||
866 | $movkey ($key),$rndkey0 | ||
867 | |||
868 | .Lccm64_enc2_loop: | ||
869 | aesenc $rndkey1,$inout0 | ||
870 | dec $rounds | ||
871 | aesenc $rndkey1,$inout1 | ||
872 | $movkey 16($key),$rndkey1 | ||
873 | aesenc $rndkey0,$inout0 | ||
874 | lea 32($key),$key | ||
875 | aesenc $rndkey0,$inout1 | ||
876 | $movkey 0($key),$rndkey0 | ||
877 | jnz .Lccm64_enc2_loop | ||
878 | aesenc $rndkey1,$inout0 | ||
879 | aesenc $rndkey1,$inout1 | ||
880 | paddq $increment,$iv | ||
881 | aesenclast $rndkey0,$inout0 | ||
882 | aesenclast $rndkey0,$inout1 | ||
883 | |||
884 | dec $len | ||
885 | lea 16($inp),$inp | ||
886 | xorps $inout0,$in0 # inp ^= E(iv) | ||
887 | movdqa $iv,$inout0 | ||
888 | movups $in0,($out) # save output | ||
889 | lea 16($out),$out | ||
890 | pshufb $bswap_mask,$inout0 | ||
891 | jnz .Lccm64_enc_outer | ||
892 | |||
893 | movups $inout1,($cmac) | ||
894 | ___ | ||
# Win64-only epilogue for aesni_ccm64_encrypt_blocks: reload %xmm6-%xmm9
# from the frame opened in the prologue, then release the 0x58-byte
# frame.  .Lccm64_enc_ret labels the `ret` emitted right after this.
if ($win64) {
$code.=<<'___';
movaps (%rsp),%xmm6
movaps 0x10(%rsp),%xmm7
movaps 0x20(%rsp),%xmm8
movaps 0x30(%rsp),%xmm9
lea 0x58(%rsp),%rsp
.Lccm64_enc_ret:
___
}
903 | $code.=<<___; | ||
904 | ret | ||
905 | .size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks | ||
906 | ___ | ||
907 | ###################################################################### | ||
908 | $code.=<<___; | ||
909 | .globl aesni_ccm64_decrypt_blocks | ||
910 | .type aesni_ccm64_decrypt_blocks,\@function,6 | ||
911 | .align 16 | ||
912 | aesni_ccm64_decrypt_blocks: | ||
913 | ___ | ||
# Win64-only prologue for aesni_ccm64_decrypt_blocks: same layout as the
# encrypt variant -- a 0x58-byte frame with %xmm6-%xmm9 stored at
# offsets 0x00..0x30.  .Lccm64_dec_body marks the end of the prologue.
if ($win64) {
$code.=<<'___';
lea -0x58(%rsp),%rsp
movaps %xmm6,(%rsp)
movaps %xmm7,0x10(%rsp)
movaps %xmm8,0x20(%rsp)
movaps %xmm9,0x30(%rsp)
.Lccm64_dec_body:
___
}
922 | $code.=<<___; | ||
923 | mov 240($key),$rounds # key->rounds | ||
924 | movups ($ivp),$iv | ||
925 | movdqu ($cmac),$inout1 | ||
926 | movdqa .Lincrement64(%rip),$increment | ||
927 | movdqa .Lbswap_mask(%rip),$bswap_mask | ||
928 | |||
929 | movaps $iv,$inout0 | ||
930 | mov $rounds,$rnds_ | ||
931 | mov $key,$key_ | ||
932 | pshufb $bswap_mask,$iv | ||
933 | ___ | ||
934 | &aesni_generate1("enc",$key,$rounds); | ||
935 | $code.=<<___; | ||
936 | movups ($inp),$in0 # load inp | ||
937 | paddq $increment,$iv | ||
938 | lea 16($inp),$inp | ||
939 | jmp .Lccm64_dec_outer | ||
940 | .align 16 | ||
941 | .Lccm64_dec_outer: | ||
942 | xorps $inout0,$in0 # inp ^= E(iv) | ||
943 | movdqa $iv,$inout0 | ||
944 | mov $rnds_,$rounds | ||
945 | movups $in0,($out) # save output | ||
946 | lea 16($out),$out | ||
947 | pshufb $bswap_mask,$inout0 | ||
948 | |||
949 | sub \$1,$len | ||
950 | jz .Lccm64_dec_break | ||
951 | |||
952 | $movkey ($key_),$rndkey0 | ||
953 | shr \$1,$rounds | ||
954 | $movkey 16($key_),$rndkey1 | ||
955 | xorps $rndkey0,$in0 | ||
956 | lea 32($key_),$key | ||
957 | xorps $rndkey0,$inout0 | ||
958 | xorps $in0,$inout1 # cmac^=out | ||
959 | $movkey ($key),$rndkey0 | ||
960 | |||
961 | .Lccm64_dec2_loop: | ||
962 | aesenc $rndkey1,$inout0 | ||
963 | dec $rounds | ||
964 | aesenc $rndkey1,$inout1 | ||
965 | $movkey 16($key),$rndkey1 | ||
966 | aesenc $rndkey0,$inout0 | ||
967 | lea 32($key),$key | ||
968 | aesenc $rndkey0,$inout1 | ||
969 | $movkey 0($key),$rndkey0 | ||
970 | jnz .Lccm64_dec2_loop | ||
971 | movups ($inp),$in0 # load inp | ||
972 | paddq $increment,$iv | ||
973 | aesenc $rndkey1,$inout0 | ||
974 | aesenc $rndkey1,$inout1 | ||
975 | lea 16($inp),$inp | ||
976 | aesenclast $rndkey0,$inout0 | ||
977 | aesenclast $rndkey0,$inout1 | ||
978 | jmp .Lccm64_dec_outer | ||
979 | |||
980 | .align 16 | ||
981 | .Lccm64_dec_break: | ||
982 | #xorps $in0,$inout1 # cmac^=out | ||
983 | ___ | ||
984 | &aesni_generate1("enc",$key_,$rounds,$inout1,$in0); | ||
985 | $code.=<<___; | ||
986 | movups $inout1,($cmac) | ||
987 | ___ | ||
# Win64-only epilogue for aesni_ccm64_decrypt_blocks: restore
# %xmm6-%xmm9 and drop the 0x58-byte frame.  .Lccm64_dec_ret labels the
# `ret` emitted right after this.
if ($win64) {
$code.=<<'___';
movaps (%rsp),%xmm6
movaps 0x10(%rsp),%xmm7
movaps 0x20(%rsp),%xmm8
movaps 0x30(%rsp),%xmm9
lea 0x58(%rsp),%rsp
.Lccm64_dec_ret:
___
}
996 | $code.=<<___; | ||
997 | ret | ||
998 | .size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks | ||
999 | ___ | ||
1000 | } | ||
1001 | ###################################################################### | ||
1002 | # void aesni_ctr32_encrypt_blocks (const void *in, void *out, | ||
1003 | # size_t blocks, const AES_KEY *key, | ||
1004 | # const char *ivec); | ||
1005 | # | ||
1006 | # Handles only complete blocks, operates on 32-bit counter and | ||
1007 | # does not update *ivec! (see engine/eng_aesni.c for details) | ||
1008 | # | ||
1009 | { | ||
1010 | my $reserved = $win64?0:-0x28; | ||
1011 | my ($in0,$in1,$in2,$in3)=map("%xmm$_",(8..11)); | ||
1012 | my ($iv0,$iv1,$ivec)=("%xmm12","%xmm13","%xmm14"); | ||
1013 | my $bswap_mask="%xmm15"; | ||
1014 | |||
1015 | $code.=<<___; | ||
1016 | .globl aesni_ctr32_encrypt_blocks | ||
1017 | .type aesni_ctr32_encrypt_blocks,\@function,5 | ||
1018 | .align 16 | ||
1019 | aesni_ctr32_encrypt_blocks: | ||
1020 | ___ | ||
# Win64-only prologue for aesni_ctr32_encrypt_blocks: open a 0xc8-byte
# frame and save %xmm6-%xmm15 at offsets 0x20..0xb0.  The first 0x20
# bytes stay free for the two counter vectors, which the function body
# keeps at $reserved(%rsp) and $reserved+0x10(%rsp) ($reserved is 0 on
# Win64).  .Lctr32_body marks the end of the prologue.
if ($win64) {
$code.=<<'___';
lea -0xc8(%rsp),%rsp
movaps %xmm6,0x20(%rsp)
movaps %xmm7,0x30(%rsp)
movaps %xmm8,0x40(%rsp)
movaps %xmm9,0x50(%rsp)
movaps %xmm10,0x60(%rsp)
movaps %xmm11,0x70(%rsp)
movaps %xmm12,0x80(%rsp)
movaps %xmm13,0x90(%rsp)
movaps %xmm14,0xa0(%rsp)
movaps %xmm15,0xb0(%rsp)
.Lctr32_body:
___
}
1035 | $code.=<<___; | ||
1036 | cmp \$1,$len | ||
1037 | je .Lctr32_one_shortcut | ||
1038 | |||
1039 | movdqu ($ivp),$ivec | ||
1040 | movdqa .Lbswap_mask(%rip),$bswap_mask | ||
1041 | xor $rounds,$rounds | ||
1042 | pextrd \$3,$ivec,$rnds_ # pull 32-bit counter | ||
1043 | pinsrd \$3,$rounds,$ivec # wipe 32-bit counter | ||
1044 | |||
1045 | mov 240($key),$rounds # key->rounds | ||
1046 | bswap $rnds_ | ||
1047 | pxor $iv0,$iv0 # vector of 3 32-bit counters | ||
1048 | pxor $iv1,$iv1 # vector of 3 32-bit counters | ||
1049 | pinsrd \$0,$rnds_,$iv0 | ||
1050 | lea 3($rnds_),$key_ | ||
1051 | pinsrd \$0,$key_,$iv1 | ||
1052 | inc $rnds_ | ||
1053 | pinsrd \$1,$rnds_,$iv0 | ||
1054 | inc $key_ | ||
1055 | pinsrd \$1,$key_,$iv1 | ||
1056 | inc $rnds_ | ||
1057 | pinsrd \$2,$rnds_,$iv0 | ||
1058 | inc $key_ | ||
1059 | pinsrd \$2,$key_,$iv1 | ||
1060 | movdqa $iv0,$reserved(%rsp) | ||
1061 | pshufb $bswap_mask,$iv0 | ||
1062 | movdqa $iv1,`$reserved+0x10`(%rsp) | ||
1063 | pshufb $bswap_mask,$iv1 | ||
1064 | |||
1065 | pshufd \$`3<<6`,$iv0,$inout0 # place counter to upper dword | ||
1066 | pshufd \$`2<<6`,$iv0,$inout1 | ||
1067 | pshufd \$`1<<6`,$iv0,$inout2 | ||
1068 | cmp \$6,$len | ||
1069 | jb .Lctr32_tail | ||
1070 | shr \$1,$rounds | ||
1071 | mov $key,$key_ # backup $key | ||
1072 | mov $rounds,$rnds_ # backup $rounds | ||
1073 | sub \$6,$len | ||
1074 | jmp .Lctr32_loop6 | ||
1075 | |||
1076 | .align 16 | ||
1077 | .Lctr32_loop6: | ||
1078 | pshufd \$`3<<6`,$iv1,$inout3 | ||
1079 | por $ivec,$inout0 # merge counter-less ivec | ||
1080 | $movkey ($key_),$rndkey0 | ||
1081 | pshufd \$`2<<6`,$iv1,$inout4 | ||
1082 | por $ivec,$inout1 | ||
1083 | $movkey 16($key_),$rndkey1 | ||
1084 | pshufd \$`1<<6`,$iv1,$inout5 | ||
1085 | por $ivec,$inout2 | ||
1086 | por $ivec,$inout3 | ||
1087 | xorps $rndkey0,$inout0 | ||
1088 | por $ivec,$inout4 | ||
1089 | por $ivec,$inout5 | ||
1090 | |||
1091 | # inline _aesni_encrypt6 and interleave last rounds | ||
1092 | # with own code... | ||
1093 | |||
1094 | pxor $rndkey0,$inout1 | ||
1095 | aesenc $rndkey1,$inout0 | ||
1096 | lea 32($key_),$key | ||
1097 | pxor $rndkey0,$inout2 | ||
1098 | aesenc $rndkey1,$inout1 | ||
1099 | movdqa .Lincrement32(%rip),$iv1 | ||
1100 | pxor $rndkey0,$inout3 | ||
1101 | aesenc $rndkey1,$inout2 | ||
1102 | movdqa $reserved(%rsp),$iv0 | ||
1103 | pxor $rndkey0,$inout4 | ||
1104 | aesenc $rndkey1,$inout3 | ||
1105 | pxor $rndkey0,$inout5 | ||
1106 | $movkey ($key),$rndkey0 | ||
1107 | dec $rounds | ||
1108 | aesenc $rndkey1,$inout4 | ||
1109 | aesenc $rndkey1,$inout5 | ||
1110 | jmp .Lctr32_enc_loop6_enter | ||
1111 | .align 16 | ||
1112 | .Lctr32_enc_loop6: | ||
1113 | aesenc $rndkey1,$inout0 | ||
1114 | aesenc $rndkey1,$inout1 | ||
1115 | dec $rounds | ||
1116 | aesenc $rndkey1,$inout2 | ||
1117 | aesenc $rndkey1,$inout3 | ||
1118 | aesenc $rndkey1,$inout4 | ||
1119 | aesenc $rndkey1,$inout5 | ||
1120 | .Lctr32_enc_loop6_enter: | ||
1121 | $movkey 16($key),$rndkey1 | ||
1122 | aesenc $rndkey0,$inout0 | ||
1123 | aesenc $rndkey0,$inout1 | ||
1124 | lea 32($key),$key | ||
1125 | aesenc $rndkey0,$inout2 | ||
1126 | aesenc $rndkey0,$inout3 | ||
1127 | aesenc $rndkey0,$inout4 | ||
1128 | aesenc $rndkey0,$inout5 | ||
1129 | $movkey ($key),$rndkey0 | ||
1130 | jnz .Lctr32_enc_loop6 | ||
1131 | |||
1132 | aesenc $rndkey1,$inout0 | ||
1133 | paddd $iv1,$iv0 # increment counter vector | ||
1134 | aesenc $rndkey1,$inout1 | ||
1135 | paddd `$reserved+0x10`(%rsp),$iv1 | ||
1136 | aesenc $rndkey1,$inout2 | ||
1137 | movdqa $iv0,$reserved(%rsp) # save counter vector | ||
1138 | aesenc $rndkey1,$inout3 | ||
1139 | movdqa $iv1,`$reserved+0x10`(%rsp) | ||
1140 | aesenc $rndkey1,$inout4 | ||
1141 | pshufb $bswap_mask,$iv0 # byte swap | ||
1142 | aesenc $rndkey1,$inout5 | ||
1143 | pshufb $bswap_mask,$iv1 | ||
1144 | |||
1145 | aesenclast $rndkey0,$inout0 | ||
1146 | movups ($inp),$in0 # load input | ||
1147 | aesenclast $rndkey0,$inout1 | ||
1148 | movups 0x10($inp),$in1 | ||
1149 | aesenclast $rndkey0,$inout2 | ||
1150 | movups 0x20($inp),$in2 | ||
1151 | aesenclast $rndkey0,$inout3 | ||
1152 | movups 0x30($inp),$in3 | ||
1153 | aesenclast $rndkey0,$inout4 | ||
1154 | movups 0x40($inp),$rndkey1 | ||
1155 | aesenclast $rndkey0,$inout5 | ||
1156 | movups 0x50($inp),$rndkey0 | ||
1157 | lea 0x60($inp),$inp | ||
1158 | |||
1159 | xorps $inout0,$in0 # xor | ||
1160 | pshufd \$`3<<6`,$iv0,$inout0 | ||
1161 | xorps $inout1,$in1 | ||
1162 | pshufd \$`2<<6`,$iv0,$inout1 | ||
1163 | movups $in0,($out) # store output | ||
1164 | xorps $inout2,$in2 | ||
1165 | pshufd \$`1<<6`,$iv0,$inout2 | ||
1166 | movups $in1,0x10($out) | ||
1167 | xorps $inout3,$in3 | ||
1168 | movups $in2,0x20($out) | ||
1169 | xorps $inout4,$rndkey1 | ||
1170 | movups $in3,0x30($out) | ||
1171 | xorps $inout5,$rndkey0 | ||
1172 | movups $rndkey1,0x40($out) | ||
1173 | movups $rndkey0,0x50($out) | ||
1174 | lea 0x60($out),$out | ||
1175 | mov $rnds_,$rounds | ||
1176 | sub \$6,$len | ||
1177 | jnc .Lctr32_loop6 | ||
1178 | |||
1179 | add \$6,$len | ||
1180 | jz .Lctr32_done | ||
1181 | mov $key_,$key # restore $key | ||
1182 | lea 1($rounds,$rounds),$rounds # restore original value | ||
1183 | |||
1184 | .Lctr32_tail: | ||
1185 | por $ivec,$inout0 | ||
1186 | movups ($inp),$in0 | ||
1187 | cmp \$2,$len | ||
1188 | jb .Lctr32_one | ||
1189 | |||
1190 | por $ivec,$inout1 | ||
1191 | movups 0x10($inp),$in1 | ||
1192 | je .Lctr32_two | ||
1193 | |||
1194 | pshufd \$`3<<6`,$iv1,$inout3 | ||
1195 | por $ivec,$inout2 | ||
1196 | movups 0x20($inp),$in2 | ||
1197 | cmp \$4,$len | ||
1198 | jb .Lctr32_three | ||
1199 | |||
1200 | pshufd \$`2<<6`,$iv1,$inout4 | ||
1201 | por $ivec,$inout3 | ||
1202 | movups 0x30($inp),$in3 | ||
1203 | je .Lctr32_four | ||
1204 | |||
1205 | por $ivec,$inout4 | ||
1206 | xorps $inout5,$inout5 | ||
1207 | |||
1208 | call _aesni_encrypt6 | ||
1209 | |||
1210 | movups 0x40($inp),$rndkey1 | ||
1211 | xorps $inout0,$in0 | ||
1212 | xorps $inout1,$in1 | ||
1213 | movups $in0,($out) | ||
1214 | xorps $inout2,$in2 | ||
1215 | movups $in1,0x10($out) | ||
1216 | xorps $inout3,$in3 | ||
1217 | movups $in2,0x20($out) | ||
1218 | xorps $inout4,$rndkey1 | ||
1219 | movups $in3,0x30($out) | ||
1220 | movups $rndkey1,0x40($out) | ||
1221 | jmp .Lctr32_done | ||
1222 | |||
1223 | .align 16 | ||
1224 | .Lctr32_one_shortcut: | ||
1225 | movups ($ivp),$inout0 | ||
1226 | movups ($inp),$in0 | ||
1227 | mov 240($key),$rounds # key->rounds | ||
1228 | .Lctr32_one: | ||
1229 | ___ | ||
1230 | &aesni_generate1("enc",$key,$rounds); | ||
1231 | $code.=<<___; | ||
1232 | xorps $inout0,$in0 | ||
1233 | movups $in0,($out) | ||
1234 | jmp .Lctr32_done | ||
1235 | |||
1236 | .align 16 | ||
1237 | .Lctr32_two: | ||
1238 | xorps $inout2,$inout2 | ||
1239 | call _aesni_encrypt3 | ||
1240 | xorps $inout0,$in0 | ||
1241 | xorps $inout1,$in1 | ||
1242 | movups $in0,($out) | ||
1243 | movups $in1,0x10($out) | ||
1244 | jmp .Lctr32_done | ||
1245 | |||
1246 | .align 16 | ||
1247 | .Lctr32_three: | ||
1248 | call _aesni_encrypt3 | ||
1249 | xorps $inout0,$in0 | ||
1250 | xorps $inout1,$in1 | ||
1251 | movups $in0,($out) | ||
1252 | xorps $inout2,$in2 | ||
1253 | movups $in1,0x10($out) | ||
1254 | movups $in2,0x20($out) | ||
1255 | jmp .Lctr32_done | ||
1256 | |||
1257 | .align 16 | ||
1258 | .Lctr32_four: | ||
1259 | call _aesni_encrypt4 | ||
1260 | xorps $inout0,$in0 | ||
1261 | xorps $inout1,$in1 | ||
1262 | movups $in0,($out) | ||
1263 | xorps $inout2,$in2 | ||
1264 | movups $in1,0x10($out) | ||
1265 | xorps $inout3,$in3 | ||
1266 | movups $in2,0x20($out) | ||
1267 | movups $in3,0x30($out) | ||
1268 | |||
1269 | .Lctr32_done: | ||
1270 | ___ | ||
# Win64-only epilogue for aesni_ctr32_encrypt_blocks: reload
# %xmm6-%xmm15 from offsets 0x20..0xb0 and release the 0xc8-byte frame.
# .Lctr32_ret labels the `ret` emitted right after this.
if ($win64) {
$code.=<<'___';
movaps 0x20(%rsp),%xmm6
movaps 0x30(%rsp),%xmm7
movaps 0x40(%rsp),%xmm8
movaps 0x50(%rsp),%xmm9
movaps 0x60(%rsp),%xmm10
movaps 0x70(%rsp),%xmm11
movaps 0x80(%rsp),%xmm12
movaps 0x90(%rsp),%xmm13
movaps 0xa0(%rsp),%xmm14
movaps 0xb0(%rsp),%xmm15
lea 0xc8(%rsp),%rsp
.Lctr32_ret:
___
}
1285 | $code.=<<___; | ||
1286 | ret | ||
1287 | .size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks | ||
1288 | ___ | ||
1289 | } | ||
1290 | |||
1291 | ###################################################################### | ||
1292 | # void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len, | ||
1293 | # const AES_KEY *key1, const AES_KEY *key2, | ||
1294 | # const unsigned char iv[16]); | ||
1295 | # | ||
1296 | { | ||
# Register and stack-frame assignments used by the XTS code below.
#
# @tweak      six tweak registers, %xmm10..%xmm15; $tweak[5] holds the
#             running tweak that is repeatedly doubled.
# $twmask     %xmm8, loaded from .Lxts_magic.
# $twres      %xmm9, scratch for the carry/residue term.
# $twtmp      shares %xmm14 with $tweak[4]; the generated code consumes
#             $twtmp (pshufd into $twres) before it overwrites $tweak[4].
# $key2       %r8, the second (tweak) key schedule.
# $ivp/$len_  both %r9: the IV pointer is only dereferenced once at
#             function entry, after which the register is reused as the
#             backup copy of the length.
# $frame_size 0x68 bytes, of which the first 0x60 hold the six tweaks put
#             aside at 16*0..16*5(%rsp); Win64 adds 160 bytes for the ten
#             saved %xmm6-%xmm15 registers stored at 0x60..0xf0(%rsp).
my @tweak=map("%xmm$_",(10..15));
my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",$tweak[4]);
my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
my $frame_size = 0x68 + ($win64?160:0);
1301 | |||
1302 | $code.=<<___; | ||
1303 | .globl aesni_xts_encrypt | ||
1304 | .type aesni_xts_encrypt,\@function,6 | ||
1305 | .align 16 | ||
1306 | aesni_xts_encrypt: | ||
1307 | lea -$frame_size(%rsp),%rsp | ||
1308 | ___ | ||
# Win64-only part of the aesni_xts_encrypt prologue.  The frame itself
# has already been allocated by the unconditional `lea` emitted just
# before; here %xmm6-%xmm15 are saved at offsets 0x60..0xf0, above the
# 0x60 bytes the function uses to put tweaks aside.  .Lxts_enc_body
# marks the end of the prologue.
if ($win64) {
$code.=<<'___';
movaps %xmm6,0x60(%rsp)
movaps %xmm7,0x70(%rsp)
movaps %xmm8,0x80(%rsp)
movaps %xmm9,0x90(%rsp)
movaps %xmm10,0xa0(%rsp)
movaps %xmm11,0xb0(%rsp)
movaps %xmm12,0xc0(%rsp)
movaps %xmm13,0xd0(%rsp)
movaps %xmm14,0xe0(%rsp)
movaps %xmm15,0xf0(%rsp)
.Lxts_enc_body:
___
}
# Load the caller-supplied IV into the running-tweak register and fetch
# the round counts of both key schedules (offset 240 of each AES_KEY).
# key2 is addressed through $key2 rather than a literal %r8 so that this
# statement matches its counterpart in aesni_xts_decrypt and follows the
# register assignment made where $key2 is declared.
$code.=<<___;
movups ($ivp),@tweak[5] # load clear-text tweak
mov 240($key2),$rounds # key2->rounds
mov 240($key),$rnds_ # key1->rounds
___
	# generate the tweak: encrypt the IV under key2.
	# NOTE(review): aesni_generate1 is defined outside this chunk; its
	# fourth argument appears to name the register that is encrypted in
	# place -- confirm against the definition.
	&aesni_generate1("enc",$key2,$rounds,$tweak[5]);
1329 | $code.=<<___; | ||
1330 | mov $key,$key_ # backup $key | ||
1331 | mov $rnds_,$rounds # backup $rounds | ||
1332 | mov $len,$len_ # backup $len | ||
1333 | and \$-16,$len | ||
1334 | |||
1335 | movdqa .Lxts_magic(%rip),$twmask | ||
1336 | pxor $twtmp,$twtmp | ||
1337 | pcmpgtd @tweak[5],$twtmp # broadcast upper bits | ||
1338 | ___ | ||
# Emit four unrolled tweak-update steps for aesni_xts_encrypt.  Step $i
# copies the running tweak (@tweak[5]) into @tweak[$i] and then advances
# the running tweak: paddq doubles both 64-bit halves, and $twres --
# built from the sign mask in $twtmp, shuffled and masked with $twmask --
# is XORed in to supply the carry/residue bits.  $twtmp is recomputed at
# the end of each step for the next one.
# The loop index is the package variable $i, as elsewhere in this file.
for ($i=0;$i<4;$i++) {
$code.=<<___;
pshufd \$0x13,$twtmp,$twres
pxor $twtmp,$twtmp
movdqa @tweak[5],@tweak[$i]
paddq @tweak[5],@tweak[5] # psllq 1,$tweak
pand $twmask,$twres # isolate carry and residue
pcmpgtd @tweak[5],$twtmp # broadcast upper bits
pxor $twres,@tweak[5]
___
}
1350 | $code.=<<___; | ||
1351 | sub \$16*6,$len | ||
1352 | jc .Lxts_enc_short | ||
1353 | |||
1354 | shr \$1,$rounds | ||
1355 | sub \$1,$rounds | ||
1356 | mov $rounds,$rnds_ | ||
1357 | jmp .Lxts_enc_grandloop | ||
1358 | |||
1359 | .align 16 | ||
1360 | .Lxts_enc_grandloop: | ||
1361 | pshufd \$0x13,$twtmp,$twres | ||
1362 | movdqa @tweak[5],@tweak[4] | ||
1363 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | ||
1364 | movdqu `16*0`($inp),$inout0 # load input | ||
1365 | pand $twmask,$twres # isolate carry and residue | ||
1366 | movdqu `16*1`($inp),$inout1 | ||
1367 | pxor $twres,@tweak[5] | ||
1368 | |||
1369 | movdqu `16*2`($inp),$inout2 | ||
1370 | pxor @tweak[0],$inout0 # input^=tweak | ||
1371 | movdqu `16*3`($inp),$inout3 | ||
1372 | pxor @tweak[1],$inout1 | ||
1373 | movdqu `16*4`($inp),$inout4 | ||
1374 | pxor @tweak[2],$inout2 | ||
1375 | movdqu `16*5`($inp),$inout5 | ||
1376 | lea `16*6`($inp),$inp | ||
1377 | pxor @tweak[3],$inout3 | ||
1378 | $movkey ($key_),$rndkey0 | ||
1379 | pxor @tweak[4],$inout4 | ||
1380 | pxor @tweak[5],$inout5 | ||
1381 | |||
1382 | # inline _aesni_encrypt6 and interleave first and last rounds | ||
1383 | # with own code... | ||
1384 | $movkey 16($key_),$rndkey1 | ||
1385 | pxor $rndkey0,$inout0 | ||
1386 | pxor $rndkey0,$inout1 | ||
1387 | movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks | ||
1388 | aesenc $rndkey1,$inout0 | ||
1389 | lea 32($key_),$key | ||
1390 | pxor $rndkey0,$inout2 | ||
1391 | movdqa @tweak[1],`16*1`(%rsp) | ||
1392 | aesenc $rndkey1,$inout1 | ||
1393 | pxor $rndkey0,$inout3 | ||
1394 | movdqa @tweak[2],`16*2`(%rsp) | ||
1395 | aesenc $rndkey1,$inout2 | ||
1396 | pxor $rndkey0,$inout4 | ||
1397 | movdqa @tweak[3],`16*3`(%rsp) | ||
1398 | aesenc $rndkey1,$inout3 | ||
1399 | pxor $rndkey0,$inout5 | ||
1400 | $movkey ($key),$rndkey0 | ||
1401 | dec $rounds | ||
1402 | movdqa @tweak[4],`16*4`(%rsp) | ||
1403 | aesenc $rndkey1,$inout4 | ||
1404 | movdqa @tweak[5],`16*5`(%rsp) | ||
1405 | aesenc $rndkey1,$inout5 | ||
1406 | pxor $twtmp,$twtmp | ||
1407 | pcmpgtd @tweak[5],$twtmp | ||
1408 | jmp .Lxts_enc_loop6_enter | ||
1409 | |||
1410 | .align 16 | ||
1411 | .Lxts_enc_loop6: | ||
1412 | aesenc $rndkey1,$inout0 | ||
1413 | aesenc $rndkey1,$inout1 | ||
1414 | dec $rounds | ||
1415 | aesenc $rndkey1,$inout2 | ||
1416 | aesenc $rndkey1,$inout3 | ||
1417 | aesenc $rndkey1,$inout4 | ||
1418 | aesenc $rndkey1,$inout5 | ||
1419 | .Lxts_enc_loop6_enter: | ||
1420 | $movkey 16($key),$rndkey1 | ||
1421 | aesenc $rndkey0,$inout0 | ||
1422 | aesenc $rndkey0,$inout1 | ||
1423 | lea 32($key),$key | ||
1424 | aesenc $rndkey0,$inout2 | ||
1425 | aesenc $rndkey0,$inout3 | ||
1426 | aesenc $rndkey0,$inout4 | ||
1427 | aesenc $rndkey0,$inout5 | ||
1428 | $movkey ($key),$rndkey0 | ||
1429 | jnz .Lxts_enc_loop6 | ||
1430 | |||
1431 | pshufd \$0x13,$twtmp,$twres | ||
1432 | pxor $twtmp,$twtmp | ||
1433 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | ||
1434 | aesenc $rndkey1,$inout0 | ||
1435 | pand $twmask,$twres # isolate carry and residue | ||
1436 | aesenc $rndkey1,$inout1 | ||
1437 | pcmpgtd @tweak[5],$twtmp # broadcast upper bits | ||
1438 | aesenc $rndkey1,$inout2 | ||
1439 | pxor $twres,@tweak[5] | ||
1440 | aesenc $rndkey1,$inout3 | ||
1441 | aesenc $rndkey1,$inout4 | ||
1442 | aesenc $rndkey1,$inout5 | ||
1443 | $movkey 16($key),$rndkey1 | ||
1444 | |||
1445 | pshufd \$0x13,$twtmp,$twres | ||
1446 | pxor $twtmp,$twtmp | ||
1447 | movdqa @tweak[5],@tweak[0] | ||
1448 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | ||
1449 | aesenc $rndkey0,$inout0 | ||
1450 | pand $twmask,$twres # isolate carry and residue | ||
1451 | aesenc $rndkey0,$inout1 | ||
1452 | pcmpgtd @tweak[5],$twtmp # broadcat upper bits | ||
1453 | aesenc $rndkey0,$inout2 | ||
1454 | pxor $twres,@tweak[5] | ||
1455 | aesenc $rndkey0,$inout3 | ||
1456 | aesenc $rndkey0,$inout4 | ||
1457 | aesenc $rndkey0,$inout5 | ||
1458 | $movkey 32($key),$rndkey0 | ||
1459 | |||
1460 | pshufd \$0x13,$twtmp,$twres | ||
1461 | pxor $twtmp,$twtmp | ||
1462 | movdqa @tweak[5],@tweak[1] | ||
1463 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | ||
1464 | aesenc $rndkey1,$inout0 | ||
1465 | pand $twmask,$twres # isolate carry and residue | ||
1466 | aesenc $rndkey1,$inout1 | ||
1467 | pcmpgtd @tweak[5],$twtmp # broadcat upper bits | ||
1468 | aesenc $rndkey1,$inout2 | ||
1469 | pxor $twres,@tweak[5] | ||
1470 | aesenc $rndkey1,$inout3 | ||
1471 | aesenc $rndkey1,$inout4 | ||
1472 | aesenc $rndkey1,$inout5 | ||
1473 | |||
1474 | pshufd \$0x13,$twtmp,$twres | ||
1475 | pxor $twtmp,$twtmp | ||
1476 | movdqa @tweak[5],@tweak[2] | ||
1477 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | ||
1478 | aesenclast $rndkey0,$inout0 | ||
1479 | pand $twmask,$twres # isolate carry and residue | ||
1480 | aesenclast $rndkey0,$inout1 | ||
1481 | pcmpgtd @tweak[5],$twtmp # broadcat upper bits | ||
1482 | aesenclast $rndkey0,$inout2 | ||
1483 | pxor $twres,@tweak[5] | ||
1484 | aesenclast $rndkey0,$inout3 | ||
1485 | aesenclast $rndkey0,$inout4 | ||
1486 | aesenclast $rndkey0,$inout5 | ||
1487 | |||
1488 | pshufd \$0x13,$twtmp,$twres | ||
1489 | pxor $twtmp,$twtmp | ||
1490 | movdqa @tweak[5],@tweak[3] | ||
1491 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | ||
1492 | xorps `16*0`(%rsp),$inout0 # output^=tweak | ||
1493 | pand $twmask,$twres # isolate carry and residue | ||
1494 | xorps `16*1`(%rsp),$inout1 | ||
1495 | pcmpgtd @tweak[5],$twtmp # broadcat upper bits | ||
1496 | pxor $twres,@tweak[5] | ||
1497 | |||
1498 | xorps `16*2`(%rsp),$inout2 | ||
1499 | movups $inout0,`16*0`($out) # write output | ||
1500 | xorps `16*3`(%rsp),$inout3 | ||
1501 | movups $inout1,`16*1`($out) | ||
1502 | xorps `16*4`(%rsp),$inout4 | ||
1503 | movups $inout2,`16*2`($out) | ||
1504 | xorps `16*5`(%rsp),$inout5 | ||
1505 | movups $inout3,`16*3`($out) | ||
1506 | mov $rnds_,$rounds # restore $rounds | ||
1507 | movups $inout4,`16*4`($out) | ||
1508 | movups $inout5,`16*5`($out) | ||
1509 | lea `16*6`($out),$out | ||
1510 | sub \$16*6,$len | ||
1511 | jnc .Lxts_enc_grandloop | ||
1512 | |||
1513 | lea 3($rounds,$rounds),$rounds # restore original value | ||
1514 | mov $key_,$key # restore $key | ||
1515 | mov $rounds,$rnds_ # backup $rounds | ||
1516 | |||
1517 | .Lxts_enc_short: | ||
1518 | add \$16*6,$len | ||
1519 | jz .Lxts_enc_done | ||
1520 | |||
1521 | cmp \$0x20,$len | ||
1522 | jb .Lxts_enc_one | ||
1523 | je .Lxts_enc_two | ||
1524 | |||
1525 | cmp \$0x40,$len | ||
1526 | jb .Lxts_enc_three | ||
1527 | je .Lxts_enc_four | ||
1528 | |||
1529 | pshufd \$0x13,$twtmp,$twres | ||
1530 | movdqa @tweak[5],@tweak[4] | ||
1531 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | ||
1532 | movdqu ($inp),$inout0 | ||
1533 | pand $twmask,$twres # isolate carry and residue | ||
1534 | movdqu 16*1($inp),$inout1 | ||
1535 | pxor $twres,@tweak[5] | ||
1536 | |||
1537 | movdqu 16*2($inp),$inout2 | ||
1538 | pxor @tweak[0],$inout0 | ||
1539 | movdqu 16*3($inp),$inout3 | ||
1540 | pxor @tweak[1],$inout1 | ||
1541 | movdqu 16*4($inp),$inout4 | ||
1542 | lea 16*5($inp),$inp | ||
1543 | pxor @tweak[2],$inout2 | ||
1544 | pxor @tweak[3],$inout3 | ||
1545 | pxor @tweak[4],$inout4 | ||
1546 | |||
1547 | call _aesni_encrypt6 | ||
1548 | |||
1549 | xorps @tweak[0],$inout0 | ||
1550 | movdqa @tweak[5],@tweak[0] | ||
1551 | xorps @tweak[1],$inout1 | ||
1552 | xorps @tweak[2],$inout2 | ||
1553 | movdqu $inout0,($out) | ||
1554 | xorps @tweak[3],$inout3 | ||
1555 | movdqu $inout1,16*1($out) | ||
1556 | xorps @tweak[4],$inout4 | ||
1557 | movdqu $inout2,16*2($out) | ||
1558 | movdqu $inout3,16*3($out) | ||
1559 | movdqu $inout4,16*4($out) | ||
1560 | lea 16*5($out),$out | ||
1561 | jmp .Lxts_enc_done | ||
1562 | |||
1563 | .align 16 | ||
1564 | .Lxts_enc_one: | ||
1565 | movups ($inp),$inout0 | ||
1566 | lea 16*1($inp),$inp | ||
1567 | xorps @tweak[0],$inout0 | ||
1568 | ___ | ||
1569 | &aesni_generate1("enc",$key,$rounds); | ||
1570 | $code.=<<___; | ||
1571 | xorps @tweak[0],$inout0 | ||
1572 | movdqa @tweak[1],@tweak[0] | ||
1573 | movups $inout0,($out) | ||
1574 | lea 16*1($out),$out | ||
1575 | jmp .Lxts_enc_done | ||
1576 | |||
1577 | .align 16 | ||
1578 | .Lxts_enc_two: | ||
1579 | movups ($inp),$inout0 | ||
1580 | movups 16($inp),$inout1 | ||
1581 | lea 32($inp),$inp | ||
1582 | xorps @tweak[0],$inout0 | ||
1583 | xorps @tweak[1],$inout1 | ||
1584 | |||
1585 | call _aesni_encrypt3 | ||
1586 | |||
1587 | xorps @tweak[0],$inout0 | ||
1588 | movdqa @tweak[2],@tweak[0] | ||
1589 | xorps @tweak[1],$inout1 | ||
1590 | movups $inout0,($out) | ||
1591 | movups $inout1,16*1($out) | ||
1592 | lea 16*2($out),$out | ||
1593 | jmp .Lxts_enc_done | ||
1594 | |||
1595 | .align 16 | ||
1596 | .Lxts_enc_three: | ||
1597 | movups ($inp),$inout0 | ||
1598 | movups 16*1($inp),$inout1 | ||
1599 | movups 16*2($inp),$inout2 | ||
1600 | lea 16*3($inp),$inp | ||
1601 | xorps @tweak[0],$inout0 | ||
1602 | xorps @tweak[1],$inout1 | ||
1603 | xorps @tweak[2],$inout2 | ||
1604 | |||
1605 | call _aesni_encrypt3 | ||
1606 | |||
1607 | xorps @tweak[0],$inout0 | ||
1608 | movdqa @tweak[3],@tweak[0] | ||
1609 | xorps @tweak[1],$inout1 | ||
1610 | xorps @tweak[2],$inout2 | ||
1611 | movups $inout0,($out) | ||
1612 | movups $inout1,16*1($out) | ||
1613 | movups $inout2,16*2($out) | ||
1614 | lea 16*3($out),$out | ||
1615 | jmp .Lxts_enc_done | ||
1616 | |||
1617 | .align 16 | ||
1618 | .Lxts_enc_four: | ||
1619 | movups ($inp),$inout0 | ||
1620 | movups 16*1($inp),$inout1 | ||
1621 | movups 16*2($inp),$inout2 | ||
1622 | xorps @tweak[0],$inout0 | ||
1623 | movups 16*3($inp),$inout3 | ||
1624 | lea 16*4($inp),$inp | ||
1625 | xorps @tweak[1],$inout1 | ||
1626 | xorps @tweak[2],$inout2 | ||
1627 | xorps @tweak[3],$inout3 | ||
1628 | |||
1629 | call _aesni_encrypt4 | ||
1630 | |||
1631 | xorps @tweak[0],$inout0 | ||
1632 | movdqa @tweak[5],@tweak[0] | ||
1633 | xorps @tweak[1],$inout1 | ||
1634 | xorps @tweak[2],$inout2 | ||
1635 | movups $inout0,($out) | ||
1636 | xorps @tweak[3],$inout3 | ||
1637 | movups $inout1,16*1($out) | ||
1638 | movups $inout2,16*2($out) | ||
1639 | movups $inout3,16*3($out) | ||
1640 | lea 16*4($out),$out | ||
1641 | jmp .Lxts_enc_done | ||
1642 | |||
1643 | .align 16 | ||
1644 | .Lxts_enc_done: | ||
1645 | and \$15,$len_ | ||
1646 | jz .Lxts_enc_ret | ||
1647 | mov $len_,$len | ||
1648 | |||
1649 | .Lxts_enc_steal: | ||
1650 | movzb ($inp),%eax # borrow $rounds ... | ||
1651 | movzb -16($out),%ecx # ... and $key | ||
1652 | lea 1($inp),$inp | ||
1653 | mov %al,-16($out) | ||
1654 | mov %cl,0($out) | ||
1655 | lea 1($out),$out | ||
1656 | sub \$1,$len | ||
1657 | jnz .Lxts_enc_steal | ||
1658 | |||
1659 | sub $len_,$out # rewind $out | ||
1660 | mov $key_,$key # restore $key | ||
1661 | mov $rnds_,$rounds # restore $rounds | ||
1662 | |||
1663 | movups -16($out),$inout0 | ||
1664 | xorps @tweak[0],$inout0 | ||
1665 | ___ | ||
1666 | &aesni_generate1("enc",$key,$rounds); | ||
1667 | $code.=<<___; | ||
1668 | xorps @tweak[0],$inout0 | ||
1669 | movups $inout0,-16($out) | ||
1670 | |||
1671 | .Lxts_enc_ret: | ||
1672 | ___ | ||
# Win64-only part of the aesni_xts_encrypt epilogue: reload %xmm6-%xmm15
# from offsets 0x60..0xf0 of the frame.  The frame itself is released by
# the unconditional `lea` emitted right after this.
if ($win64) {
$code.=<<'___';
movaps 0x60(%rsp),%xmm6
movaps 0x70(%rsp),%xmm7
movaps 0x80(%rsp),%xmm8
movaps 0x90(%rsp),%xmm9
movaps 0xa0(%rsp),%xmm10
movaps 0xb0(%rsp),%xmm11
movaps 0xc0(%rsp),%xmm12
movaps 0xd0(%rsp),%xmm13
movaps 0xe0(%rsp),%xmm14
movaps 0xf0(%rsp),%xmm15
___
}
1685 | $code.=<<___; | ||
1686 | lea $frame_size(%rsp),%rsp | ||
1687 | .Lxts_enc_epilogue: | ||
1688 | ret | ||
1689 | .size aesni_xts_encrypt,.-aesni_xts_encrypt | ||
1690 | ___ | ||
1691 | |||
1692 | $code.=<<___; | ||
1693 | .globl aesni_xts_decrypt | ||
1694 | .type aesni_xts_decrypt,\@function,6 | ||
1695 | .align 16 | ||
1696 | aesni_xts_decrypt: | ||
1697 | lea -$frame_size(%rsp),%rsp | ||
1698 | ___ | ||
1699 | $code.=<<___ if ($win64); | ||
1700 | movaps %xmm6,0x60(%rsp) | ||
1701 | movaps %xmm7,0x70(%rsp) | ||
1702 | movaps %xmm8,0x80(%rsp) | ||
1703 | movaps %xmm9,0x90(%rsp) | ||
1704 | movaps %xmm10,0xa0(%rsp) | ||
1705 | movaps %xmm11,0xb0(%rsp) | ||
1706 | movaps %xmm12,0xc0(%rsp) | ||
1707 | movaps %xmm13,0xd0(%rsp) | ||
1708 | movaps %xmm14,0xe0(%rsp) | ||
1709 | movaps %xmm15,0xf0(%rsp) | ||
1710 | .Lxts_dec_body: | ||
1711 | ___ | ||
1712 | $code.=<<___; | ||
1713 | movups ($ivp),@tweak[5] # load clear-text tweak | ||
1714 | mov 240($key2),$rounds # key2->rounds | ||
1715 | mov 240($key),$rnds_ # key1->rounds | ||
1716 | ___ | ||
1717 | # generate the tweak | ||
1718 | &aesni_generate1("enc",$key2,$rounds,@tweak[5]); | ||
# aesni_xts_decrypt: length bookkeeping and the first four tweaks.
# If $len is not a multiple of 16, one full block (16 bytes) is withheld from
# the bulk path: %rax becomes 16 when ($len & 15) != 0 and is subtracted from
# $len.  $key is then backed up in $key_, the key1 round count is copied from
# $rnds_ into $rounds, $len is backed up in $len_, and $len is rounded down to
# a multiple of 16.
# $twmask is loaded from .Lxts_magic and $twtmp is primed with pcmpgtd against
# a zeroed register (the in-line note calls this "broadcast upper bits").
# The Perl loop emits four copies of the tweak-update sequence: @tweak[$i]
# takes the current @tweak[5], which is then doubled with paddq and corrected
# by xoring in the masked carry/residue held in $twres.
1719 | $code.=<<___; | ||
1720 | xor %eax,%eax # if ($len%16) len-=16; | ||
1721 | test \$15,$len | ||
1722 | setnz %al | ||
1723 | shl \$4,%rax | ||
1724 | sub %rax,$len | ||
1725 | |||
1726 | mov $key,$key_ # backup $key | ||
1727 | mov $rnds_,$rounds # backup $rounds | ||
1728 | mov $len,$len_ # backup $len | ||
1729 | and \$-16,$len | ||
1730 | |||
1731 | movdqa .Lxts_magic(%rip),$twmask | ||
1732 | pxor $twtmp,$twtmp | ||
1733 | pcmpgtd @tweak[5],$twtmp # broadcast upper bits | ||
1734 | ___ | ||
1735 | for ($i=0;$i<4;$i++) { | ||
1736 | $code.=<<___; | ||
1737 | pshufd \$0x13,$twtmp,$twres | ||
1738 | pxor $twtmp,$twtmp | ||
1739 | movdqa @tweak[5],@tweak[$i] | ||
1740 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | ||
1741 | pand $twmask,$twres # isolate carry and residue | ||
1742 | pcmpgtd @tweak[5],$twtmp # broadcat upper bits | ||
1743 | pxor $twres,@tweak[5] | ||
1744 | ___ | ||
1745 | } | ||
# aesni_xts_decrypt: six-block main loop and dispatch for short inputs.
# - If fewer than 6*16 bytes remain, control goes straight to .Lxts_dec_short.
#   Otherwise $rounds is halved and decremented (the round loop below issues
#   two aesdec rounds per iteration) and cached in $rnds_.
# - .Lxts_dec_grandloop: finishes @tweak[4]/@tweak[5], loads six input blocks,
#   xors each with its tweak, parks the six tweaks at 16*0..16*5(%rsp), and
#   runs the aesdec rounds inline (see the in-line note about
#   _aesni_decrypt6).  The tweaks for the next iteration are computed
#   interleaved with the final rounds; results are xored with the parked
#   tweaks and written to $out.
# - After the loop the original round count and $key are restored.
# - .Lxts_dec_short: re-adds 6*16 to $len and branches to the one/two/three/
#   four-block handlers; the fall-through path handles five blocks via
#   _aesni_decrypt6 and, when ($len_ & 15) is non-zero, prepares
#   @tweak[0]/@tweak[1] and jumps to .Lxts_dec_done2.
# - .Lxts_dec_one begins here: one block is loaded and xored with @tweak[0];
#   the aesni_generate1 "dec" sequence that follows is expected to decrypt
#   $inout0 (helper defined outside this section; confirm there).
1746 | $code.=<<___; | ||
1747 | sub \$16*6,$len | ||
1748 | jc .Lxts_dec_short | ||
1749 | |||
1750 | shr \$1,$rounds | ||
1751 | sub \$1,$rounds | ||
1752 | mov $rounds,$rnds_ | ||
1753 | jmp .Lxts_dec_grandloop | ||
1754 | |||
1755 | .align 16 | ||
1756 | .Lxts_dec_grandloop: | ||
1757 | pshufd \$0x13,$twtmp,$twres | ||
1758 | movdqa @tweak[5],@tweak[4] | ||
1759 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | ||
1760 | movdqu `16*0`($inp),$inout0 # load input | ||
1761 | pand $twmask,$twres # isolate carry and residue | ||
1762 | movdqu `16*1`($inp),$inout1 | ||
1763 | pxor $twres,@tweak[5] | ||
1764 | |||
1765 | movdqu `16*2`($inp),$inout2 | ||
1766 | pxor @tweak[0],$inout0 # input^=tweak | ||
1767 | movdqu `16*3`($inp),$inout3 | ||
1768 | pxor @tweak[1],$inout1 | ||
1769 | movdqu `16*4`($inp),$inout4 | ||
1770 | pxor @tweak[2],$inout2 | ||
1771 | movdqu `16*5`($inp),$inout5 | ||
1772 | lea `16*6`($inp),$inp | ||
1773 | pxor @tweak[3],$inout3 | ||
1774 | $movkey ($key_),$rndkey0 | ||
1775 | pxor @tweak[4],$inout4 | ||
1776 | pxor @tweak[5],$inout5 | ||
1777 | |||
1778 | # inline _aesni_decrypt6 and interleave first and last rounds | ||
1779 | # with own code... | ||
1780 | $movkey 16($key_),$rndkey1 | ||
1781 | pxor $rndkey0,$inout0 | ||
1782 | pxor $rndkey0,$inout1 | ||
1783 | movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks | ||
1784 | aesdec $rndkey1,$inout0 | ||
1785 | lea 32($key_),$key | ||
1786 | pxor $rndkey0,$inout2 | ||
1787 | movdqa @tweak[1],`16*1`(%rsp) | ||
1788 | aesdec $rndkey1,$inout1 | ||
1789 | pxor $rndkey0,$inout3 | ||
1790 | movdqa @tweak[2],`16*2`(%rsp) | ||
1791 | aesdec $rndkey1,$inout2 | ||
1792 | pxor $rndkey0,$inout4 | ||
1793 | movdqa @tweak[3],`16*3`(%rsp) | ||
1794 | aesdec $rndkey1,$inout3 | ||
1795 | pxor $rndkey0,$inout5 | ||
1796 | $movkey ($key),$rndkey0 | ||
1797 | dec $rounds | ||
1798 | movdqa @tweak[4],`16*4`(%rsp) | ||
1799 | aesdec $rndkey1,$inout4 | ||
1800 | movdqa @tweak[5],`16*5`(%rsp) | ||
1801 | aesdec $rndkey1,$inout5 | ||
1802 | pxor $twtmp,$twtmp | ||
1803 | pcmpgtd @tweak[5],$twtmp | ||
1804 | jmp .Lxts_dec_loop6_enter | ||
1805 | |||
1806 | .align 16 | ||
1807 | .Lxts_dec_loop6: | ||
1808 | aesdec $rndkey1,$inout0 | ||
1809 | aesdec $rndkey1,$inout1 | ||
1810 | dec $rounds | ||
1811 | aesdec $rndkey1,$inout2 | ||
1812 | aesdec $rndkey1,$inout3 | ||
1813 | aesdec $rndkey1,$inout4 | ||
1814 | aesdec $rndkey1,$inout5 | ||
1815 | .Lxts_dec_loop6_enter: | ||
1816 | $movkey 16($key),$rndkey1 | ||
1817 | aesdec $rndkey0,$inout0 | ||
1818 | aesdec $rndkey0,$inout1 | ||
1819 | lea 32($key),$key | ||
1820 | aesdec $rndkey0,$inout2 | ||
1821 | aesdec $rndkey0,$inout3 | ||
1822 | aesdec $rndkey0,$inout4 | ||
1823 | aesdec $rndkey0,$inout5 | ||
1824 | $movkey ($key),$rndkey0 | ||
1825 | jnz .Lxts_dec_loop6 | ||
1826 | |||
1827 | pshufd \$0x13,$twtmp,$twres | ||
1828 | pxor $twtmp,$twtmp | ||
1829 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | ||
1830 | aesdec $rndkey1,$inout0 | ||
1831 | pand $twmask,$twres # isolate carry and residue | ||
1832 | aesdec $rndkey1,$inout1 | ||
1833 | pcmpgtd @tweak[5],$twtmp # broadcast upper bits | ||
1834 | aesdec $rndkey1,$inout2 | ||
1835 | pxor $twres,@tweak[5] | ||
1836 | aesdec $rndkey1,$inout3 | ||
1837 | aesdec $rndkey1,$inout4 | ||
1838 | aesdec $rndkey1,$inout5 | ||
1839 | $movkey 16($key),$rndkey1 | ||
1840 | |||
1841 | pshufd \$0x13,$twtmp,$twres | ||
1842 | pxor $twtmp,$twtmp | ||
1843 | movdqa @tweak[5],@tweak[0] | ||
1844 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | ||
1845 | aesdec $rndkey0,$inout0 | ||
1846 | pand $twmask,$twres # isolate carry and residue | ||
1847 | aesdec $rndkey0,$inout1 | ||
1848 | pcmpgtd @tweak[5],$twtmp # broadcat upper bits | ||
1849 | aesdec $rndkey0,$inout2 | ||
1850 | pxor $twres,@tweak[5] | ||
1851 | aesdec $rndkey0,$inout3 | ||
1852 | aesdec $rndkey0,$inout4 | ||
1853 | aesdec $rndkey0,$inout5 | ||
1854 | $movkey 32($key),$rndkey0 | ||
1855 | |||
1856 | pshufd \$0x13,$twtmp,$twres | ||
1857 | pxor $twtmp,$twtmp | ||
1858 | movdqa @tweak[5],@tweak[1] | ||
1859 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | ||
1860 | aesdec $rndkey1,$inout0 | ||
1861 | pand $twmask,$twres # isolate carry and residue | ||
1862 | aesdec $rndkey1,$inout1 | ||
1863 | pcmpgtd @tweak[5],$twtmp # broadcat upper bits | ||
1864 | aesdec $rndkey1,$inout2 | ||
1865 | pxor $twres,@tweak[5] | ||
1866 | aesdec $rndkey1,$inout3 | ||
1867 | aesdec $rndkey1,$inout4 | ||
1868 | aesdec $rndkey1,$inout5 | ||
1869 | |||
1870 | pshufd \$0x13,$twtmp,$twres | ||
1871 | pxor $twtmp,$twtmp | ||
1872 | movdqa @tweak[5],@tweak[2] | ||
1873 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | ||
1874 | aesdeclast $rndkey0,$inout0 | ||
1875 | pand $twmask,$twres # isolate carry and residue | ||
1876 | aesdeclast $rndkey0,$inout1 | ||
1877 | pcmpgtd @tweak[5],$twtmp # broadcat upper bits | ||
1878 | aesdeclast $rndkey0,$inout2 | ||
1879 | pxor $twres,@tweak[5] | ||
1880 | aesdeclast $rndkey0,$inout3 | ||
1881 | aesdeclast $rndkey0,$inout4 | ||
1882 | aesdeclast $rndkey0,$inout5 | ||
1883 | |||
1884 | pshufd \$0x13,$twtmp,$twres | ||
1885 | pxor $twtmp,$twtmp | ||
1886 | movdqa @tweak[5],@tweak[3] | ||
1887 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | ||
1888 | xorps `16*0`(%rsp),$inout0 # output^=tweak | ||
1889 | pand $twmask,$twres # isolate carry and residue | ||
1890 | xorps `16*1`(%rsp),$inout1 | ||
1891 | pcmpgtd @tweak[5],$twtmp # broadcat upper bits | ||
1892 | pxor $twres,@tweak[5] | ||
1893 | |||
1894 | xorps `16*2`(%rsp),$inout2 | ||
1895 | movups $inout0,`16*0`($out) # write output | ||
1896 | xorps `16*3`(%rsp),$inout3 | ||
1897 | movups $inout1,`16*1`($out) | ||
1898 | xorps `16*4`(%rsp),$inout4 | ||
1899 | movups $inout2,`16*2`($out) | ||
1900 | xorps `16*5`(%rsp),$inout5 | ||
1901 | movups $inout3,`16*3`($out) | ||
1902 | mov $rnds_,$rounds # restore $rounds | ||
1903 | movups $inout4,`16*4`($out) | ||
1904 | movups $inout5,`16*5`($out) | ||
1905 | lea `16*6`($out),$out | ||
1906 | sub \$16*6,$len | ||
1907 | jnc .Lxts_dec_grandloop | ||
1908 | |||
1909 | lea 3($rounds,$rounds),$rounds # restore original value | ||
1910 | mov $key_,$key # restore $key | ||
1911 | mov $rounds,$rnds_ # backup $rounds | ||
1912 | |||
1913 | .Lxts_dec_short: | ||
1914 | add \$16*6,$len | ||
1915 | jz .Lxts_dec_done | ||
1916 | |||
1917 | cmp \$0x20,$len | ||
1918 | jb .Lxts_dec_one | ||
1919 | je .Lxts_dec_two | ||
1920 | |||
1921 | cmp \$0x40,$len | ||
1922 | jb .Lxts_dec_three | ||
1923 | je .Lxts_dec_four | ||
1924 | |||
1925 | pshufd \$0x13,$twtmp,$twres | ||
1926 | movdqa @tweak[5],@tweak[4] | ||
1927 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | ||
1928 | movdqu ($inp),$inout0 | ||
1929 | pand $twmask,$twres # isolate carry and residue | ||
1930 | movdqu 16*1($inp),$inout1 | ||
1931 | pxor $twres,@tweak[5] | ||
1932 | |||
1933 | movdqu 16*2($inp),$inout2 | ||
1934 | pxor @tweak[0],$inout0 | ||
1935 | movdqu 16*3($inp),$inout3 | ||
1936 | pxor @tweak[1],$inout1 | ||
1937 | movdqu 16*4($inp),$inout4 | ||
1938 | lea 16*5($inp),$inp | ||
1939 | pxor @tweak[2],$inout2 | ||
1940 | pxor @tweak[3],$inout3 | ||
1941 | pxor @tweak[4],$inout4 | ||
1942 | |||
1943 | call _aesni_decrypt6 | ||
1944 | |||
1945 | xorps @tweak[0],$inout0 | ||
1946 | xorps @tweak[1],$inout1 | ||
1947 | xorps @tweak[2],$inout2 | ||
1948 | movdqu $inout0,($out) | ||
1949 | xorps @tweak[3],$inout3 | ||
1950 | movdqu $inout1,16*1($out) | ||
1951 | xorps @tweak[4],$inout4 | ||
1952 | movdqu $inout2,16*2($out) | ||
1953 | pxor $twtmp,$twtmp | ||
1954 | movdqu $inout3,16*3($out) | ||
1955 | pcmpgtd @tweak[5],$twtmp | ||
1956 | movdqu $inout4,16*4($out) | ||
1957 | lea 16*5($out),$out | ||
1958 | pshufd \$0x13,$twtmp,@tweak[1] # $twres | ||
1959 | and \$15,$len_ | ||
1960 | jz .Lxts_dec_ret | ||
1961 | |||
1962 | movdqa @tweak[5],@tweak[0] | ||
1963 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | ||
1964 | pand $twmask,@tweak[1] # isolate carry and residue | ||
1965 | pxor @tweak[5],@tweak[1] | ||
1966 | jmp .Lxts_dec_done2 | ||
1967 | |||
1968 | .align 16 | ||
1969 | .Lxts_dec_one: | ||
1970 | movups ($inp),$inout0 | ||
1971 | lea 16*1($inp),$inp | ||
1972 | xorps @tweak[0],$inout0 | ||
1973 | ___ | ||
1974 | &aesni_generate1("dec",$key,$rounds); | ||
# aesni_xts_decrypt: completion of the one-block path and the two-, three-
# and four-block handlers.
# Each handler xors its blocks with the matching tweaks, decrypts
# (_aesni_decrypt3 for two and three blocks, _aesni_decrypt4 for four), xors
# with the tweaks again, stores the result and moves two later tweaks into
# @tweak[0] and @tweak[1] before jumping to .Lxts_dec_done.  The four-block
# handler first derives @tweak[4] and advances @tweak[5].
# .Lxts_dec_done returns immediately when ($len_ & 15) == 0.  Otherwise
# .Lxts_dec_done2 copies the masked $len_ into $len, restores $key/$rounds,
# loads the block at ($inp) and xors it with @tweak[1] ahead of the
# aesni_generate1 "dec" sequence (helper defined outside this section).
1975 | $code.=<<___; | ||
1976 | xorps @tweak[0],$inout0 | ||
1977 | movdqa @tweak[1],@tweak[0] | ||
1978 | movups $inout0,($out) | ||
1979 | movdqa @tweak[2],@tweak[1] | ||
1980 | lea 16*1($out),$out | ||
1981 | jmp .Lxts_dec_done | ||
1982 | |||
1983 | .align 16 | ||
1984 | .Lxts_dec_two: | ||
1985 | movups ($inp),$inout0 | ||
1986 | movups 16($inp),$inout1 | ||
1987 | lea 32($inp),$inp | ||
1988 | xorps @tweak[0],$inout0 | ||
1989 | xorps @tweak[1],$inout1 | ||
1990 | |||
1991 | call _aesni_decrypt3 | ||
1992 | |||
1993 | xorps @tweak[0],$inout0 | ||
1994 | movdqa @tweak[2],@tweak[0] | ||
1995 | xorps @tweak[1],$inout1 | ||
1996 | movdqa @tweak[3],@tweak[1] | ||
1997 | movups $inout0,($out) | ||
1998 | movups $inout1,16*1($out) | ||
1999 | lea 16*2($out),$out | ||
2000 | jmp .Lxts_dec_done | ||
2001 | |||
2002 | .align 16 | ||
2003 | .Lxts_dec_three: | ||
2004 | movups ($inp),$inout0 | ||
2005 | movups 16*1($inp),$inout1 | ||
2006 | movups 16*2($inp),$inout2 | ||
2007 | lea 16*3($inp),$inp | ||
2008 | xorps @tweak[0],$inout0 | ||
2009 | xorps @tweak[1],$inout1 | ||
2010 | xorps @tweak[2],$inout2 | ||
2011 | |||
2012 | call _aesni_decrypt3 | ||
2013 | |||
2014 | xorps @tweak[0],$inout0 | ||
2015 | movdqa @tweak[3],@tweak[0] | ||
2016 | xorps @tweak[1],$inout1 | ||
2017 | movdqa @tweak[5],@tweak[1] | ||
2018 | xorps @tweak[2],$inout2 | ||
2019 | movups $inout0,($out) | ||
2020 | movups $inout1,16*1($out) | ||
2021 | movups $inout2,16*2($out) | ||
2022 | lea 16*3($out),$out | ||
2023 | jmp .Lxts_dec_done | ||
2024 | |||
2025 | .align 16 | ||
2026 | .Lxts_dec_four: | ||
2027 | pshufd \$0x13,$twtmp,$twres | ||
2028 | movdqa @tweak[5],@tweak[4] | ||
2029 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | ||
2030 | movups ($inp),$inout0 | ||
2031 | pand $twmask,$twres # isolate carry and residue | ||
2032 | movups 16*1($inp),$inout1 | ||
2033 | pxor $twres,@tweak[5] | ||
2034 | |||
2035 | movups 16*2($inp),$inout2 | ||
2036 | xorps @tweak[0],$inout0 | ||
2037 | movups 16*3($inp),$inout3 | ||
2038 | lea 16*4($inp),$inp | ||
2039 | xorps @tweak[1],$inout1 | ||
2040 | xorps @tweak[2],$inout2 | ||
2041 | xorps @tweak[3],$inout3 | ||
2042 | |||
2043 | call _aesni_decrypt4 | ||
2044 | |||
2045 | xorps @tweak[0],$inout0 | ||
2046 | movdqa @tweak[4],@tweak[0] | ||
2047 | xorps @tweak[1],$inout1 | ||
2048 | movdqa @tweak[5],@tweak[1] | ||
2049 | xorps @tweak[2],$inout2 | ||
2050 | movups $inout0,($out) | ||
2051 | xorps @tweak[3],$inout3 | ||
2052 | movups $inout1,16*1($out) | ||
2053 | movups $inout2,16*2($out) | ||
2054 | movups $inout3,16*3($out) | ||
2055 | lea 16*4($out),$out | ||
2056 | jmp .Lxts_dec_done | ||
2057 | |||
2058 | .align 16 | ||
2059 | .Lxts_dec_done: | ||
2060 | and \$15,$len_ | ||
2061 | jz .Lxts_dec_ret | ||
2062 | .Lxts_dec_done2: | ||
2063 | mov $len_,$len | ||
2064 | mov $key_,$key # restore $key | ||
2065 | mov $rnds_,$rounds # restore $rounds | ||
2066 | |||
2067 | movups ($inp),$inout0 | ||
2068 | xorps @tweak[1],$inout0 | ||
2069 | ___ | ||
2070 | &aesni_generate1("dec",$key,$rounds); | ||
# aesni_xts_decrypt: ciphertext stealing for a trailing partial block.
# The block just processed is xored with @tweak[1] and stored at ($out).
# .Lxts_dec_steal then runs $len times, one byte per iteration: the byte at
# 16($inp) is written to ($out) while the byte previously at ($out) is moved
# to 16($out); %eax and %ecx serve as scratch (see the in-line "borrow"
# notes).  Afterwards $out is rewound by $len_, $key/$rounds are restored, and
# the re-assembled block at ($out) is xored with @tweak[0] ahead of the final
# aesni_generate1 "dec" sequence (helper defined outside this section).
2071 | $code.=<<___; | ||
2072 | xorps @tweak[1],$inout0 | ||
2073 | movups $inout0,($out) | ||
2074 | |||
2075 | .Lxts_dec_steal: | ||
2076 | movzb 16($inp),%eax # borrow $rounds ... | ||
2077 | movzb ($out),%ecx # ... and $key | ||
2078 | lea 1($inp),$inp | ||
2079 | mov %al,($out) | ||
2080 | mov %cl,16($out) | ||
2081 | lea 1($out),$out | ||
2082 | sub \$1,$len | ||
2083 | jnz .Lxts_dec_steal | ||
2084 | |||
2085 | sub $len_,$out # rewind $out | ||
2086 | mov $key_,$key # restore $key | ||
2087 | mov $rnds_,$rounds # restore $rounds | ||
2088 | |||
2089 | movups ($out),$inout0 | ||
2090 | xorps @tweak[0],$inout0 | ||
2091 | ___ | ||
2092 | &aesni_generate1("dec",$key,$rounds); | ||
# aesni_xts_decrypt: final store and epilogue.
# The last block is xored with @tweak[0] and written back to ($out).  At
# .Lxts_dec_ret, Win64 builds reload %xmm6-%xmm15 from frame offsets
# 0x60-0xf0; the $frame_size-byte frame is then released and the function
# returns.  The closing .size directive records the symbol's length.
2093 | $code.=<<___; | ||
2094 | xorps @tweak[0],$inout0 | ||
2095 | movups $inout0,($out) | ||
2096 | |||
2097 | .Lxts_dec_ret: | ||
2098 | ___ | ||
2099 | $code.=<<___ if ($win64); | ||
2100 | movaps 0x60(%rsp),%xmm6 | ||
2101 | movaps 0x70(%rsp),%xmm7 | ||
2102 | movaps 0x80(%rsp),%xmm8 | ||
2103 | movaps 0x90(%rsp),%xmm9 | ||
2104 | movaps 0xa0(%rsp),%xmm10 | ||
2105 | movaps 0xb0(%rsp),%xmm11 | ||
2106 | movaps 0xc0(%rsp),%xmm12 | ||
2107 | movaps 0xd0(%rsp),%xmm13 | ||
2108 | movaps 0xe0(%rsp),%xmm14 | ||
2109 | movaps 0xf0(%rsp),%xmm15 | ||
2110 | ___ | ||
2111 | $code.=<<___; | ||
2112 | lea $frame_size(%rsp),%rsp | ||
2113 | .Lxts_dec_epilogue: | ||
2114 | ret | ||
2115 | .size aesni_xts_decrypt,.-aesni_xts_decrypt | ||
2116 | ___ | ||
2117 | } }} | ||
2118 | |||
2119 | ######################################################################## | ||
2120 | # void $PREFIX_cbc_encrypt (const void *inp, void *out, | ||
2121 | # size_t length, const AES_KEY *key, | ||
2122 | # unsigned char *ivp,const int enc); | ||
2123 | { | ||
# ${PREFIX}_cbc_encrypt: entry and head of the CBC-encrypt loop.
# $reserved is the %rsp-relative offset of a 16-byte scratch slot used by the
# decrypt path (0x40 for Win64, -0x18 otherwise).
# The function returns at once when $len is zero.  It caches key->rounds
# (offset 240) in $rnds_ and $key in $key_, then tests %r9d (sixth argument):
# zero selects .Lcbc_decrypt.  For encryption the IV is loaded into $inout0;
# inputs shorter than 16 bytes go to .Lcbc_enc_tail, otherwise
# .Lcbc_enc_loop loads one block into $inout1 and hands $inout0/$inout1 to
# aesni_generate1 ("enc") -- presumably the helper folds $inout1 into $inout0
# before encrypting, given the commented-out xorps; confirm in the helper,
# which is defined outside this section.
2124 | my $reserved = $win64?0x40:-0x18; # used in decrypt | ||
2125 | $code.=<<___; | ||
2126 | .globl ${PREFIX}_cbc_encrypt | ||
2127 | .type ${PREFIX}_cbc_encrypt,\@function,6 | ||
2128 | .align 16 | ||
2129 | ${PREFIX}_cbc_encrypt: | ||
2130 | test $len,$len # check length | ||
2131 | jz .Lcbc_ret | ||
2132 | |||
2133 | mov 240($key),$rnds_ # key->rounds | ||
2134 | mov $key,$key_ # backup $key | ||
2135 | test %r9d,%r9d # 6th argument | ||
2136 | jz .Lcbc_decrypt | ||
2137 | #--------------------------- CBC ENCRYPT ------------------------------# | ||
2138 | movups ($ivp),$inout0 # load iv as initial state | ||
2139 | mov $rnds_,$rounds | ||
2140 | cmp \$16,$len | ||
2141 | jb .Lcbc_enc_tail | ||
2142 | sub \$16,$len | ||
2143 | jmp .Lcbc_enc_loop | ||
2144 | .align 16 | ||
2145 | .Lcbc_enc_loop: | ||
2146 | movups ($inp),$inout1 # load input | ||
2147 | lea 16($inp),$inp | ||
2148 | #xorps $inout1,$inout0 | ||
2149 | ___ | ||
2150 | &aesni_generate1("enc",$key,$rounds,$inout0,$inout1); | ||
# ${PREFIX}_cbc_encrypt: tail of the encrypt loop, partial-block padding and
# the decrypt prologue.
# After each block $rounds/$key are restored and $inout0 is stored; the loop
# repeats while at least 16 more bytes remain.  With nothing left, $inout0 is
# written back to ($ivp) and the function returns via .Lcbc_ret.
# .Lcbc_enc_tail swaps $inp/$out so they match the string-instruction
# registers, copies the remaining $len bytes with "rep movsb", zero-fills up
# to 16 bytes with "rep stosb", then points both pointers at that block and
# takes one more pass through the loop.  Both string instructions are emitted
# as raw .long opcodes (see the in-line notes).
# .Lcbc_decrypt: Win64 builds reserve 0x58 bytes of stack and save
# %xmm6-%xmm9 before the .Lcbc_decrypt_body label.
2151 | $code.=<<___; | ||
2152 | mov $rnds_,$rounds # restore $rounds | ||
2153 | mov $key_,$key # restore $key | ||
2154 | movups $inout0,0($out) # store output | ||
2155 | lea 16($out),$out | ||
2156 | sub \$16,$len | ||
2157 | jnc .Lcbc_enc_loop | ||
2158 | add \$16,$len | ||
2159 | jnz .Lcbc_enc_tail | ||
2160 | movups $inout0,($ivp) | ||
2161 | jmp .Lcbc_ret | ||
2162 | |||
2163 | .Lcbc_enc_tail: | ||
2164 | mov $len,%rcx # zaps $key | ||
2165 | xchg $inp,$out # $inp is %rsi and $out is %rdi now | ||
2166 | .long 0x9066A4F3 # rep movsb | ||
2167 | mov \$16,%ecx # zero tail | ||
2168 | sub $len,%rcx | ||
2169 | xor %eax,%eax | ||
2170 | .long 0x9066AAF3 # rep stosb | ||
2171 | lea -16(%rdi),%rdi # rewind $out by 1 block | ||
2172 | mov $rnds_,$rounds # restore $rounds | ||
2173 | mov %rdi,%rsi # $inp and $out are the same | ||
2174 | mov $key_,$key # restore $key | ||
2175 | xor $len,$len # len=16 | ||
2176 | jmp .Lcbc_enc_loop # one more spin | ||
2177 | #--------------------------- CBC DECRYPT ------------------------------# | ||
2178 | .align 16 | ||
2179 | .Lcbc_decrypt: | ||
2180 | ___ | ||
2181 | $code.=<<___ if ($win64); | ||
2182 | lea -0x58(%rsp),%rsp | ||
2183 | movaps %xmm6,(%rsp) | ||
2184 | movaps %xmm7,0x10(%rsp) | ||
2185 | movaps %xmm8,0x20(%rsp) | ||
2186 | movaps %xmm9,0x30(%rsp) | ||
2187 | .Lcbc_decrypt_body: | ||
2188 | ___ | ||
# ${PREFIX}_cbc_encrypt: CBC-decrypt bulk loop and tail dispatch.
# The IV is loaded into $iv.  Inputs of at most 0x70 bytes go directly to
# .Lcbc_dec_tail.  Otherwise $rnds_ is halved, 0x70 is subtracted from $len
# and the IV is parked at $reserved(%rsp).
# .Lcbc_dec_loop8 handles eight blocks per iteration: the first round is
# interleaved with the input loads, .Ldec_loop8_enter (defined outside this
# section) is called for the remaining rounds, and the outputs are xored with
# the parked IV and the re-loaded preceding ciphertext blocks.  Seven results
# are stored immediately; the eighth is stored at the top of the next
# iteration, or carried in $inout0 once the loop exits.  The last ciphertext
# block of each batch is kept in $rndkey0 as the next IV.
# .Lcbc_dec_tail loads up to seven blocks, branching on $len to the matching
# handler; the seven-block case is handled in line with _aesni_decrypt8.
# .Lcbc_dec_one starts with the aesni_generate1 "dec" sequence emitted at the
# end of this section (helper defined elsewhere; confirm its contract there).
2189 | $code.=<<___; | ||
2190 | movups ($ivp),$iv | ||
2191 | mov $rnds_,$rounds | ||
2192 | cmp \$0x70,$len | ||
2193 | jbe .Lcbc_dec_tail | ||
2194 | shr \$1,$rnds_ | ||
2195 | sub \$0x70,$len | ||
2196 | mov $rnds_,$rounds | ||
2197 | movaps $iv,$reserved(%rsp) | ||
2198 | jmp .Lcbc_dec_loop8_enter | ||
2199 | .align 16 | ||
2200 | .Lcbc_dec_loop8: | ||
2201 | movaps $rndkey0,$reserved(%rsp) # save IV | ||
2202 | movups $inout7,($out) | ||
2203 | lea 0x10($out),$out | ||
2204 | .Lcbc_dec_loop8_enter: | ||
2205 | $movkey ($key),$rndkey0 | ||
2206 | movups ($inp),$inout0 # load input | ||
2207 | movups 0x10($inp),$inout1 | ||
2208 | $movkey 16($key),$rndkey1 | ||
2209 | |||
2210 | lea 32($key),$key | ||
2211 | movdqu 0x20($inp),$inout2 | ||
2212 | xorps $rndkey0,$inout0 | ||
2213 | movdqu 0x30($inp),$inout3 | ||
2214 | xorps $rndkey0,$inout1 | ||
2215 | movdqu 0x40($inp),$inout4 | ||
2216 | aesdec $rndkey1,$inout0 | ||
2217 | pxor $rndkey0,$inout2 | ||
2218 | movdqu 0x50($inp),$inout5 | ||
2219 | aesdec $rndkey1,$inout1 | ||
2220 | pxor $rndkey0,$inout3 | ||
2221 | movdqu 0x60($inp),$inout6 | ||
2222 | aesdec $rndkey1,$inout2 | ||
2223 | pxor $rndkey0,$inout4 | ||
2224 | movdqu 0x70($inp),$inout7 | ||
2225 | aesdec $rndkey1,$inout3 | ||
2226 | pxor $rndkey0,$inout5 | ||
2227 | dec $rounds | ||
2228 | aesdec $rndkey1,$inout4 | ||
2229 | pxor $rndkey0,$inout6 | ||
2230 | aesdec $rndkey1,$inout5 | ||
2231 | pxor $rndkey0,$inout7 | ||
2232 | $movkey ($key),$rndkey0 | ||
2233 | aesdec $rndkey1,$inout6 | ||
2234 | aesdec $rndkey1,$inout7 | ||
2235 | $movkey 16($key),$rndkey1 | ||
2236 | |||
2237 | call .Ldec_loop8_enter | ||
2238 | |||
2239 | movups ($inp),$rndkey1 # re-load input | ||
2240 | movups 0x10($inp),$rndkey0 | ||
2241 | xorps $reserved(%rsp),$inout0 # ^= IV | ||
2242 | xorps $rndkey1,$inout1 | ||
2243 | movups 0x20($inp),$rndkey1 | ||
2244 | xorps $rndkey0,$inout2 | ||
2245 | movups 0x30($inp),$rndkey0 | ||
2246 | xorps $rndkey1,$inout3 | ||
2247 | movups 0x40($inp),$rndkey1 | ||
2248 | xorps $rndkey0,$inout4 | ||
2249 | movups 0x50($inp),$rndkey0 | ||
2250 | xorps $rndkey1,$inout5 | ||
2251 | movups 0x60($inp),$rndkey1 | ||
2252 | xorps $rndkey0,$inout6 | ||
2253 | movups 0x70($inp),$rndkey0 # IV | ||
2254 | xorps $rndkey1,$inout7 | ||
2255 | movups $inout0,($out) | ||
2256 | movups $inout1,0x10($out) | ||
2257 | movups $inout2,0x20($out) | ||
2258 | movups $inout3,0x30($out) | ||
2259 | mov $rnds_,$rounds # restore $rounds | ||
2260 | movups $inout4,0x40($out) | ||
2261 | mov $key_,$key # restore $key | ||
2262 | movups $inout5,0x50($out) | ||
2263 | lea 0x80($inp),$inp | ||
2264 | movups $inout6,0x60($out) | ||
2265 | lea 0x70($out),$out | ||
2266 | sub \$0x80,$len | ||
2267 | ja .Lcbc_dec_loop8 | ||
2268 | |||
2269 | movaps $inout7,$inout0 | ||
2270 | movaps $rndkey0,$iv | ||
2271 | add \$0x70,$len | ||
2272 | jle .Lcbc_dec_tail_collected | ||
2273 | movups $inout0,($out) | ||
2274 | lea 1($rnds_,$rnds_),$rounds | ||
2275 | lea 0x10($out),$out | ||
2276 | .Lcbc_dec_tail: | ||
2277 | movups ($inp),$inout0 | ||
2278 | movaps $inout0,$in0 | ||
2279 | cmp \$0x10,$len | ||
2280 | jbe .Lcbc_dec_one | ||
2281 | |||
2282 | movups 0x10($inp),$inout1 | ||
2283 | movaps $inout1,$in1 | ||
2284 | cmp \$0x20,$len | ||
2285 | jbe .Lcbc_dec_two | ||
2286 | |||
2287 | movups 0x20($inp),$inout2 | ||
2288 | movaps $inout2,$in2 | ||
2289 | cmp \$0x30,$len | ||
2290 | jbe .Lcbc_dec_three | ||
2291 | |||
2292 | movups 0x30($inp),$inout3 | ||
2293 | cmp \$0x40,$len | ||
2294 | jbe .Lcbc_dec_four | ||
2295 | |||
2296 | movups 0x40($inp),$inout4 | ||
2297 | cmp \$0x50,$len | ||
2298 | jbe .Lcbc_dec_five | ||
2299 | |||
2300 | movups 0x50($inp),$inout5 | ||
2301 | cmp \$0x60,$len | ||
2302 | jbe .Lcbc_dec_six | ||
2303 | |||
2304 | movups 0x60($inp),$inout6 | ||
2305 | movaps $iv,$reserved(%rsp) # save IV | ||
2306 | call _aesni_decrypt8 | ||
2307 | movups ($inp),$rndkey1 | ||
2308 | movups 0x10($inp),$rndkey0 | ||
2309 | xorps $reserved(%rsp),$inout0 # ^= IV | ||
2310 | xorps $rndkey1,$inout1 | ||
2311 | movups 0x20($inp),$rndkey1 | ||
2312 | xorps $rndkey0,$inout2 | ||
2313 | movups 0x30($inp),$rndkey0 | ||
2314 | xorps $rndkey1,$inout3 | ||
2315 | movups 0x40($inp),$rndkey1 | ||
2316 | xorps $rndkey0,$inout4 | ||
2317 | movups 0x50($inp),$rndkey0 | ||
2318 | xorps $rndkey1,$inout5 | ||
2319 | movups 0x60($inp),$iv # IV | ||
2320 | xorps $rndkey0,$inout6 | ||
2321 | movups $inout0,($out) | ||
2322 | movups $inout1,0x10($out) | ||
2323 | movups $inout2,0x20($out) | ||
2324 | movups $inout3,0x30($out) | ||
2325 | movups $inout4,0x40($out) | ||
2326 | movups $inout5,0x50($out) | ||
2327 | lea 0x60($out),$out | ||
2328 | movaps $inout6,$inout0 | ||
2329 | sub \$0x70,$len | ||
2330 | jmp .Lcbc_dec_tail_collected | ||
2331 | .align 16 | ||
2332 | .Lcbc_dec_one: | ||
2333 | ___ | ||
2334 | &aesni_generate1("dec",$key,$rounds); | ||
# ${PREFIX}_cbc_encrypt: per-count decrypt handlers and epilogue.
# .Lcbc_dec_one through .Lcbc_dec_six each xor the decrypted blocks with the
# IV / preceding ciphertext, store all but the last block, leave the last
# block in $inout0 and the new IV in $iv, and jump to
# .Lcbc_dec_tail_collected.  The two- and five-block handlers clear the
# unused extra register ($inout2 / $inout5) before calling the 3- and 6-block
# routines.
# .Lcbc_dec_tail_collected writes $iv back to ($ivp).  If ($len & 15) is
# zero the last block is stored whole; otherwise .Lcbc_dec_tail_partial
# spills it to $reserved(%rsp) and copies 16 - ($len & 15) bytes to $out with
# "rep movsb" (emitted as a raw .long opcode).
# On Win64 the epilogue restores %xmm6-%xmm9 and releases the 0x58-byte
# frame; .Lcbc_ret is the common return shared with the encrypt path.
2335 | $code.=<<___; | ||
2336 | xorps $iv,$inout0 | ||
2337 | movaps $in0,$iv | ||
2338 | sub \$0x10,$len | ||
2339 | jmp .Lcbc_dec_tail_collected | ||
2340 | .align 16 | ||
2341 | .Lcbc_dec_two: | ||
2342 | xorps $inout2,$inout2 | ||
2343 | call _aesni_decrypt3 | ||
2344 | xorps $iv,$inout0 | ||
2345 | xorps $in0,$inout1 | ||
2346 | movups $inout0,($out) | ||
2347 | movaps $in1,$iv | ||
2348 | movaps $inout1,$inout0 | ||
2349 | lea 0x10($out),$out | ||
2350 | sub \$0x20,$len | ||
2351 | jmp .Lcbc_dec_tail_collected | ||
2352 | .align 16 | ||
2353 | .Lcbc_dec_three: | ||
2354 | call _aesni_decrypt3 | ||
2355 | xorps $iv,$inout0 | ||
2356 | xorps $in0,$inout1 | ||
2357 | movups $inout0,($out) | ||
2358 | xorps $in1,$inout2 | ||
2359 | movups $inout1,0x10($out) | ||
2360 | movaps $in2,$iv | ||
2361 | movaps $inout2,$inout0 | ||
2362 | lea 0x20($out),$out | ||
2363 | sub \$0x30,$len | ||
2364 | jmp .Lcbc_dec_tail_collected | ||
2365 | .align 16 | ||
2366 | .Lcbc_dec_four: | ||
2367 | call _aesni_decrypt4 | ||
2368 | xorps $iv,$inout0 | ||
2369 | movups 0x30($inp),$iv | ||
2370 | xorps $in0,$inout1 | ||
2371 | movups $inout0,($out) | ||
2372 | xorps $in1,$inout2 | ||
2373 | movups $inout1,0x10($out) | ||
2374 | xorps $in2,$inout3 | ||
2375 | movups $inout2,0x20($out) | ||
2376 | movaps $inout3,$inout0 | ||
2377 | lea 0x30($out),$out | ||
2378 | sub \$0x40,$len | ||
2379 | jmp .Lcbc_dec_tail_collected | ||
2380 | .align 16 | ||
2381 | .Lcbc_dec_five: | ||
2382 | xorps $inout5,$inout5 | ||
2383 | call _aesni_decrypt6 | ||
2384 | movups 0x10($inp),$rndkey1 | ||
2385 | movups 0x20($inp),$rndkey0 | ||
2386 | xorps $iv,$inout0 | ||
2387 | xorps $in0,$inout1 | ||
2388 | xorps $rndkey1,$inout2 | ||
2389 | movups 0x30($inp),$rndkey1 | ||
2390 | xorps $rndkey0,$inout3 | ||
2391 | movups 0x40($inp),$iv | ||
2392 | xorps $rndkey1,$inout4 | ||
2393 | movups $inout0,($out) | ||
2394 | movups $inout1,0x10($out) | ||
2395 | movups $inout2,0x20($out) | ||
2396 | movups $inout3,0x30($out) | ||
2397 | lea 0x40($out),$out | ||
2398 | movaps $inout4,$inout0 | ||
2399 | sub \$0x50,$len | ||
2400 | jmp .Lcbc_dec_tail_collected | ||
2401 | .align 16 | ||
2402 | .Lcbc_dec_six: | ||
2403 | call _aesni_decrypt6 | ||
2404 | movups 0x10($inp),$rndkey1 | ||
2405 | movups 0x20($inp),$rndkey0 | ||
2406 | xorps $iv,$inout0 | ||
2407 | xorps $in0,$inout1 | ||
2408 | xorps $rndkey1,$inout2 | ||
2409 | movups 0x30($inp),$rndkey1 | ||
2410 | xorps $rndkey0,$inout3 | ||
2411 | movups 0x40($inp),$rndkey0 | ||
2412 | xorps $rndkey1,$inout4 | ||
2413 | movups 0x50($inp),$iv | ||
2414 | xorps $rndkey0,$inout5 | ||
2415 | movups $inout0,($out) | ||
2416 | movups $inout1,0x10($out) | ||
2417 | movups $inout2,0x20($out) | ||
2418 | movups $inout3,0x30($out) | ||
2419 | movups $inout4,0x40($out) | ||
2420 | lea 0x50($out),$out | ||
2421 | movaps $inout5,$inout0 | ||
2422 | sub \$0x60,$len | ||
2423 | jmp .Lcbc_dec_tail_collected | ||
2424 | .align 16 | ||
2425 | .Lcbc_dec_tail_collected: | ||
2426 | and \$15,$len | ||
2427 | movups $iv,($ivp) | ||
2428 | jnz .Lcbc_dec_tail_partial | ||
2429 | movups $inout0,($out) | ||
2430 | jmp .Lcbc_dec_ret | ||
2431 | .align 16 | ||
2432 | .Lcbc_dec_tail_partial: | ||
2433 | movaps $inout0,$reserved(%rsp) | ||
2434 | mov \$16,%rcx | ||
2435 | mov $out,%rdi | ||
2436 | sub $len,%rcx | ||
2437 | lea $reserved(%rsp),%rsi | ||
2438 | .long 0x9066A4F3 # rep movsb | ||
2439 | |||
2440 | .Lcbc_dec_ret: | ||
2441 | ___ | ||
2442 | $code.=<<___ if ($win64); | ||
2443 | movaps (%rsp),%xmm6 | ||
2444 | movaps 0x10(%rsp),%xmm7 | ||
2445 | movaps 0x20(%rsp),%xmm8 | ||
2446 | movaps 0x30(%rsp),%xmm9 | ||
2447 | lea 0x58(%rsp),%rsp | ||
2448 | ___ | ||
2449 | $code.=<<___; | ||
2450 | .Lcbc_ret: | ||
2451 | ret | ||
2452 | .size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt | ||
2453 | ___ | ||
2454 | } | ||
2455 | # int $PREFIX_set_[en|de]crypt_key (const unsigned char *userKey, | ||
2456 | # int bits, AES_KEY *key) | ||
2457 | { my ($inp,$bits,$key) = @_4args; | ||
2458 | $bits =~ s/%r/%e/; | ||
2459 | |||
# ${PREFIX}_set_decrypt_key: derive a decryption key schedule.
# Calls __aesni_set_encrypt_key and, when it leaves a non-zero value in %eax,
# returns with that value untouched.  $bits is scaled by 16 after the call
# (the in-line note says it holds rounds-1 at that point) so that $inp can be
# pointed at the last round key.
# The first and last round keys are swapped as-is; .Ldec_key_inverse then
# walks inwards from both ends, swapping each pair and applying aesimc to
# both, while $inp is still above $key.  Finally the round key at ($key) is
# run through aesimc and stored at ($inp).
# NOTE(review): the 8-byte %rsp adjustment around the body is presumably for
# stack alignment across the call -- confirm.
2460 | $code.=<<___; | ||
2461 | .globl ${PREFIX}_set_decrypt_key | ||
2462 | .type ${PREFIX}_set_decrypt_key,\@abi-omnipotent | ||
2463 | .align 16 | ||
2464 | ${PREFIX}_set_decrypt_key: | ||
2465 | sub \$8,%rsp | ||
2466 | call __aesni_set_encrypt_key | ||
2467 | shl \$4,$bits # rounds-1 after _aesni_set_encrypt_key | ||
2468 | test %eax,%eax | ||
2469 | jnz .Ldec_key_ret | ||
2470 | lea 16($key,$bits),$inp # points at the end of key schedule | ||
2471 | |||
2472 | $movkey ($key),%xmm0 # just swap | ||
2473 | $movkey ($inp),%xmm1 | ||
2474 | $movkey %xmm0,($inp) | ||
2475 | $movkey %xmm1,($key) | ||
2476 | lea 16($key),$key | ||
2477 | lea -16($inp),$inp | ||
2478 | |||
2479 | .Ldec_key_inverse: | ||
2480 | $movkey ($key),%xmm0 # swap and inverse | ||
2481 | $movkey ($inp),%xmm1 | ||
2482 | aesimc %xmm0,%xmm0 | ||
2483 | aesimc %xmm1,%xmm1 | ||
2484 | lea 16($key),$key | ||
2485 | lea -16($inp),$inp | ||
2486 | $movkey %xmm0,16($inp) | ||
2487 | $movkey %xmm1,-16($key) | ||
2488 | cmp $key,$inp | ||
2489 | ja .Ldec_key_inverse | ||
2490 | |||
2491 | $movkey ($key),%xmm0 # inverse middle | ||
2492 | aesimc %xmm0,%xmm0 | ||
2493 | $movkey %xmm0,($inp) | ||
2494 | .Ldec_key_ret: | ||
2495 | add \$8,%rsp | ||
2496 | ret | ||
2497 | .LSEH_end_set_decrypt_key: | ||
2498 | .size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key | ||
2499 | ___ | ||
2500 | |||
2501 | # This is based on submission by | ||
2502 | # | ||
2503 | # Huang Ying <ying.huang@intel.com> | ||
2504 | # Vinodh Gopal <vinodh.gopal@intel.com> | ||
2505 | # Kahraman Akdemir | ||
2506 | # | ||
2507 | # Aggressively optimized with respect to aeskeygenassist's critical path | ||
2508 | # and is contained in %xmm0-5 to meet Win64 ABI requirement. | ||
2509 | # | ||
2510 | $code.=<<___; | ||
2511 | .globl ${PREFIX}_set_encrypt_key | ||
2512 | .type ${PREFIX}_set_encrypt_key,\@abi-omnipotent | ||
2513 | .align 16 | ||
2514 | ${PREFIX}_set_encrypt_key: | ||
2515 | __aesni_set_encrypt_key: | ||
2516 | sub \$8,%rsp | ||
2517 | mov \$-1,%rax | ||
2518 | test $inp,$inp | ||
2519 | jz .Lenc_key_ret | ||
2520 | test $key,$key | ||
2521 | jz .Lenc_key_ret | ||
2522 | |||
2523 | movups ($inp),%xmm0 # pull first 128 bits of *userKey | ||
2524 | xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0 | ||
2525 | lea 16($key),%rax | ||
2526 | cmp \$256,$bits | ||
2527 | je .L14rounds | ||
2528 | cmp \$192,$bits | ||
2529 | je .L12rounds | ||
2530 | cmp \$128,$bits | ||
2531 | jne .Lbad_keybits | ||
2532 | |||
2533 | .L10rounds: | ||
2534 | mov \$9,$bits # 10 rounds for 128-bit key | ||
2535 | $movkey %xmm0,($key) # round 0 | ||
2536 | aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1 | ||
2537 | call .Lkey_expansion_128_cold | ||
2538 | aeskeygenassist \$0x2,%xmm0,%xmm1 # round 2 | ||
2539 | call .Lkey_expansion_128 | ||
2540 | aeskeygenassist \$0x4,%xmm0,%xmm1 # round 3 | ||
2541 | call .Lkey_expansion_128 | ||
2542 | aeskeygenassist \$0x8,%xmm0,%xmm1 # round 4 | ||
2543 | call .Lkey_expansion_128 | ||
2544 | aeskeygenassist \$0x10,%xmm0,%xmm1 # round 5 | ||
2545 | call .Lkey_expansion_128 | ||
2546 | aeskeygenassist \$0x20,%xmm0,%xmm1 # round 6 | ||
2547 | call .Lkey_expansion_128 | ||
2548 | aeskeygenassist \$0x40,%xmm0,%xmm1 # round 7 | ||
2549 | call .Lkey_expansion_128 | ||
2550 | aeskeygenassist \$0x80,%xmm0,%xmm1 # round 8 | ||
2551 | call .Lkey_expansion_128 | ||
2552 | aeskeygenassist \$0x1b,%xmm0,%xmm1 # round 9 | ||
2553 | call .Lkey_expansion_128 | ||
2554 | aeskeygenassist \$0x36,%xmm0,%xmm1 # round 10 | ||
2555 | call .Lkey_expansion_128 | ||
2556 | $movkey %xmm0,(%rax) | ||
2557 | mov $bits,80(%rax) # 240(%rdx) | ||
2558 | xor %eax,%eax | ||
2559 | jmp .Lenc_key_ret | ||
2560 | |||
2561 | .align 16 | ||
2562 | .L12rounds: | ||
2563 | movq 16($inp),%xmm2 # remaining 1/3 of *userKey | ||
2564 | mov \$11,$bits # 12 rounds for 192 | ||
2565 | $movkey %xmm0,($key) # round 0 | ||
2566 | aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2 | ||
2567 | call .Lkey_expansion_192a_cold | ||
2568 | aeskeygenassist \$0x2,%xmm2,%xmm1 # round 2,3 | ||
2569 | call .Lkey_expansion_192b | ||
2570 | aeskeygenassist \$0x4,%xmm2,%xmm1 # round 4,5 | ||
2571 | call .Lkey_expansion_192a | ||
2572 | aeskeygenassist \$0x8,%xmm2,%xmm1 # round 5,6 | ||
2573 | call .Lkey_expansion_192b | ||
2574 | aeskeygenassist \$0x10,%xmm2,%xmm1 # round 7,8 | ||
2575 | call .Lkey_expansion_192a | ||
2576 | aeskeygenassist \$0x20,%xmm2,%xmm1 # round 8,9 | ||
2577 | call .Lkey_expansion_192b | ||
2578 | aeskeygenassist \$0x40,%xmm2,%xmm1 # round 10,11 | ||
2579 | call .Lkey_expansion_192a | ||
2580 | aeskeygenassist \$0x80,%xmm2,%xmm1 # round 11,12 | ||
2581 | call .Lkey_expansion_192b | ||
2582 | $movkey %xmm0,(%rax) | ||
2583 | mov $bits,48(%rax) # 240(%rdx) | ||
2584 | xor %rax, %rax | ||
2585 | jmp .Lenc_key_ret | ||
2586 | |||
2587 | .align 16 | ||
2588 | .L14rounds: | ||
2589 | movups 16($inp),%xmm2 # remaning half of *userKey | ||
2590 | mov \$13,$bits # 14 rounds for 256 | ||
2591 | lea 16(%rax),%rax | ||
2592 | $movkey %xmm0,($key) # round 0 | ||
2593 | $movkey %xmm2,16($key) # round 1 | ||
2594 | aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2 | ||
2595 | call .Lkey_expansion_256a_cold | ||
2596 | aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3 | ||
2597 | call .Lkey_expansion_256b | ||
2598 | aeskeygenassist \$0x2,%xmm2,%xmm1 # round 4 | ||
2599 | call .Lkey_expansion_256a | ||
2600 | aeskeygenassist \$0x2,%xmm0,%xmm1 # round 5 | ||
2601 | call .Lkey_expansion_256b | ||
2602 | aeskeygenassist \$0x4,%xmm2,%xmm1 # round 6 | ||
2603 | call .Lkey_expansion_256a | ||
2604 | aeskeygenassist \$0x4,%xmm0,%xmm1 # round 7 | ||
2605 | call .Lkey_expansion_256b | ||
2606 | aeskeygenassist \$0x8,%xmm2,%xmm1 # round 8 | ||
2607 | call .Lkey_expansion_256a | ||
2608 | aeskeygenassist \$0x8,%xmm0,%xmm1 # round 9 | ||
2609 | call .Lkey_expansion_256b | ||
2610 | aeskeygenassist \$0x10,%xmm2,%xmm1 # round 10 | ||
2611 | call .Lkey_expansion_256a | ||
2612 | aeskeygenassist \$0x10,%xmm0,%xmm1 # round 11 | ||
2613 | call .Lkey_expansion_256b | ||
2614 | aeskeygenassist \$0x20,%xmm2,%xmm1 # round 12 | ||
2615 | call .Lkey_expansion_256a | ||
2616 | aeskeygenassist \$0x20,%xmm0,%xmm1 # round 13 | ||
2617 | call .Lkey_expansion_256b | ||
2618 | aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14 | ||
2619 | call .Lkey_expansion_256a | ||
2620 | $movkey %xmm0,(%rax) | ||
2621 | mov $bits,16(%rax) # 240(%rdx) | ||
2622 | xor %rax,%rax | ||
2623 | jmp .Lenc_key_ret | ||
2624 | |||
2625 | .align 16 | ||
2626 | .Lbad_keybits: | ||
2627 | mov \$-2,%rax | ||
2628 | .Lenc_key_ret: | ||
2629 | add \$8,%rsp | ||
2630 | ret | ||
2631 | .LSEH_end_set_encrypt_key: | ||
2632 | |||
2633 | .align 16 | ||
2634 | .Lkey_expansion_128: | ||
2635 | $movkey %xmm0,(%rax) | ||
2636 | lea 16(%rax),%rax | ||
2637 | .Lkey_expansion_128_cold: | ||
2638 | shufps \$0b00010000,%xmm0,%xmm4 | ||
2639 | xorps %xmm4, %xmm0 | ||
2640 | shufps \$0b10001100,%xmm0,%xmm4 | ||
2641 | xorps %xmm4, %xmm0 | ||
2642 | shufps \$0b11111111,%xmm1,%xmm1 # critical path | ||
2643 | xorps %xmm1,%xmm0 | ||
2644 | ret | ||
2645 | |||
2646 | .align 16 | ||
2647 | .Lkey_expansion_192a: | ||
2648 | $movkey %xmm0,(%rax) | ||
2649 | lea 16(%rax),%rax | ||
2650 | .Lkey_expansion_192a_cold: | ||
2651 | movaps %xmm2, %xmm5 | ||
2652 | .Lkey_expansion_192b_warm: | ||
2653 | shufps \$0b00010000,%xmm0,%xmm4 | ||
2654 | movdqa %xmm2,%xmm3 | ||
2655 | xorps %xmm4,%xmm0 | ||
2656 | shufps \$0b10001100,%xmm0,%xmm4 | ||
2657 | pslldq \$4,%xmm3 | ||
2658 | xorps %xmm4,%xmm0 | ||
2659 | pshufd \$0b01010101,%xmm1,%xmm1 # critical path | ||
2660 | pxor %xmm3,%xmm2 | ||
2661 | pxor %xmm1,%xmm0 | ||
2662 | pshufd \$0b11111111,%xmm0,%xmm3 | ||
2663 | pxor %xmm3,%xmm2 | ||
2664 | ret | ||
2665 | |||
2666 | .align 16 | ||
2667 | .Lkey_expansion_192b: | ||
2668 | movaps %xmm0,%xmm3 | ||
2669 | shufps \$0b01000100,%xmm0,%xmm5 | ||
2670 | $movkey %xmm5,(%rax) | ||
2671 | shufps \$0b01001110,%xmm2,%xmm3 | ||
2672 | $movkey %xmm3,16(%rax) | ||
2673 | lea 32(%rax),%rax | ||
2674 | jmp .Lkey_expansion_192b_warm | ||
2675 | |||
2676 | .align 16 | ||
2677 | .Lkey_expansion_256a: | ||
2678 | $movkey %xmm2,(%rax) | ||
2679 | lea 16(%rax),%rax | ||
2680 | .Lkey_expansion_256a_cold: | ||
2681 | shufps \$0b00010000,%xmm0,%xmm4 | ||
2682 | xorps %xmm4,%xmm0 | ||
2683 | shufps \$0b10001100,%xmm0,%xmm4 | ||
2684 | xorps %xmm4,%xmm0 | ||
2685 | shufps \$0b11111111,%xmm1,%xmm1 # critical path | ||
2686 | xorps %xmm1,%xmm0 | ||
2687 | ret | ||
2688 | |||
2689 | .align 16 | ||
2690 | .Lkey_expansion_256b: | ||
2691 | $movkey %xmm0,(%rax) | ||
2692 | lea 16(%rax),%rax | ||
2693 | |||
2694 | shufps \$0b00010000,%xmm2,%xmm4 | ||
2695 | xorps %xmm4,%xmm2 | ||
2696 | shufps \$0b10001100,%xmm2,%xmm4 | ||
2697 | xorps %xmm4,%xmm2 | ||
2698 | shufps \$0b10101010,%xmm1,%xmm1 # critical path | ||
2699 | xorps %xmm1,%xmm2 | ||
2700 | ret | ||
2701 | .size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key | ||
2702 | .size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key | ||
2703 | ___ | ||
2704 | } | ||
2705 | |||
# Constant data and the module identification string appended to the
# generated assembly.  The code that reads these labels is outside this part
# of the file, so only the data itself is described here:
#   .Lbswap_mask  - the sixteen byte values 15 down to 0 (presumably a
#                   byte-reversal shuffle mask -- confirm against its users)
#   .Lincrement32 - dwords 6,6,6,0
#   .Lincrement64 - dwords 1,0,0,0
#   .Lxts_magic   - dwords 0x87,0,1,0 (0x87 looks like the XTS GF(2^128)
#                   reduction constant -- confirm against the XTS code)
# The table starts on a 64-byte boundary and whatever follows the string is
# realigned to 64 bytes.  "\@" keeps Perl from interpolating an array into
# the heredoc.
2706 | $code.=<<___; | ||
2707 | .align 64 | ||
2708 | .Lbswap_mask: | ||
2709 | .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 | ||
2710 | .Lincrement32: | ||
2711 | .long 6,6,6,0 | ||
2712 | .Lincrement64: | ||
2713 | .long 1,0,0,0 | ||
2714 | .Lxts_magic: | ||
2715 | .long 0x87,0,1,0 | ||
2716 | |||
2717 | .asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>" | ||
2718 | .align 64 | ||
2719 | ___ | ||
2720 | |||
2721 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, | ||
2722 | # CONTEXT *context,DISPATCHER_CONTEXT *disp) | ||
2723 | if ($win64) { | ||
# Registers in which the four handler arguments (rec, frame, context, disp)
# are referenced by the assembly below.
($rec,$frame,$context,$disp)=("%rcx","%rdx","%r8","%r9");

# The handlers call RtlVirtualUnwind through its import-table slot.
$code.=<<___;
.extern __imp_RtlVirtualUnwind
___
2732 | $code.=<<___ if ($PREFIX eq "aesni"); | ||
# ecb_se_handler -- exception handler registered for aesni_ecb_encrypt via
# .LSEH_info_ecb.  It saves the callee-saved registers and the flags,
# reserves 64 bytes of stack (the stack-passed RtlVirtualUnwind arguments are
# written there in .Lcommon_seh_tail), loads the interrupted context's Rsp
# into %rax and joins the shared tail.  No %xmm registers are copied back and
# no frame adjustment is made here.
2733 | .type ecb_se_handler,\@abi-omnipotent | ||
2734 | .align 16 | ||
2735 | ecb_se_handler: | ||
2736 | push %rsi | ||
2737 | push %rdi | ||
2738 | push %rbx | ||
2739 | push %rbp | ||
2740 | push %r12 | ||
2741 | push %r13 | ||
2742 | push %r14 | ||
2743 | push %r15 | ||
2744 | pushfq | ||
2745 | sub \$64,%rsp | ||
2746 | |||
2747 | mov 152($context),%rax # pull context->Rsp, used unmodified by the tail | ||
2748 | |||
2749 | jmp .Lcommon_seh_tail | ||
2750 | .size ecb_se_handler,.-ecb_se_handler | ||
2751 | |||
# ccm64_se_handler -- exception handler shared by the two CCM64 routines (see
# .LSEH_info_ccm64_enc and .LSEH_info_ccm64_dec).  HandlerData holds two
# image-relative offsets, the body label and the return label of the routine.
#   Rip <  body label : join the tail with %rax = context->Rax
#   Rip >= ret label  : join the tail with %rax = context->Rsp
#   otherwise         : copy 8 qwords (four %xmm registers) from the stack at
#                       context->Rsp into the context starting at Xmm6, then
#                       advance %rax by 0x58 before joining the tail
# NOTE(review): the 0x58 frame size and the use of Rax before the body label
# depend on the CCM64 prologue, which is outside this view -- confirm there.
2752 | .type ccm64_se_handler,\@abi-omnipotent | ||
2753 | .align 16 | ||
2754 | ccm64_se_handler: | ||
2755 | push %rsi | ||
2756 | push %rdi | ||
2757 | push %rbx | ||
2758 | push %rbp | ||
2759 | push %r12 | ||
2760 | push %r13 | ||
2761 | push %r14 | ||
2762 | push %r15 | ||
2763 | pushfq | ||
2764 | sub \$64,%rsp | ||
2765 | |||
2766 | mov 120($context),%rax # pull context->Rax | ||
2767 | mov 248($context),%rbx # pull context->Rip | ||
2768 | |||
2769 | mov 8($disp),%rsi # disp->ImageBase | ||
2770 | mov 56($disp),%r11 # disp->HandlerData | ||
2771 | |||
2772 | mov 0(%r11),%r10d # HandlerData[0], image-relative offset | ||
2773 | lea (%rsi,%r10),%r10 # ImageBase+offset = prologue label | ||
2774 | cmp %r10,%rbx # context->Rip<prologue label | ||
2775 | jb .Lcommon_seh_tail | ||
2776 | |||
2777 | mov 152($context),%rax # pull context->Rsp | ||
2778 | |||
2779 | mov 4(%r11),%r10d # HandlerData[1], image-relative offset | ||
2780 | lea (%rsi,%r10),%r10 # ImageBase+offset = epilogue label | ||
2781 | cmp %r10,%rbx # context->Rip>=epilogue label | ||
2782 | jae .Lcommon_seh_tail | ||
2783 | |||
2784 | lea 0(%rax),%rsi # %xmm save area (copy source) | ||
2785 | lea 512($context),%rdi # &context.Xmm6 (copy destination) | ||
2786 | mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax) qwords | ||
2787 | .long 0xa548f3fc # cld; rep movsq, emitted as raw bytes | ||
2788 | lea 0x58(%rax),%rax # adjust stack pointer | ||
2789 | |||
2790 | jmp .Lcommon_seh_tail | ||
2791 | .size ccm64_se_handler,.-ccm64_se_handler | ||
2792 | |||
# ctr32_se_handler -- exception handler for aesni_ctr32_encrypt_blocks (see
# .LSEH_info_ctr32).  Unlike the ccm64 and xts handlers it takes its range
# from the labels .Lctr32_body and .Lctr32_ret directly, not from HandlerData.
#   Rip <  .Lctr32_body : join the tail with %rax = context->Rax
#   Rip >= .Lctr32_ret  : join the tail with %rax = context->Rsp
#   otherwise           : copy 20 qwords (ten %xmm registers) from 0x20 above
#                         context->Rsp into the context starting at Xmm6,
#                         then advance %rax by 0xc8 before joining the tail
# NOTE(review): the 0x20 and 0xc8 offsets mirror the CTR32 stack frame, which
# is outside this view -- confirm against that prologue.
2793 | .type ctr32_se_handler,\@abi-omnipotent | ||
2794 | .align 16 | ||
2795 | ctr32_se_handler: | ||
2796 | push %rsi | ||
2797 | push %rdi | ||
2798 | push %rbx | ||
2799 | push %rbp | ||
2800 | push %r12 | ||
2801 | push %r13 | ||
2802 | push %r14 | ||
2803 | push %r15 | ||
2804 | pushfq | ||
2805 | sub \$64,%rsp | ||
2806 | |||
2807 | mov 120($context),%rax # pull context->Rax | ||
2808 | mov 248($context),%rbx # pull context->Rip | ||
2809 | |||
2810 | lea .Lctr32_body(%rip),%r10 | ||
2811 | cmp %r10,%rbx # context->Rip<"prologue" label | ||
2812 | jb .Lcommon_seh_tail | ||
2813 | |||
2814 | mov 152($context),%rax # pull context->Rsp | ||
2815 | |||
2816 | lea .Lctr32_ret(%rip),%r10 | ||
# context->Rip>="epilogue" label: nothing to copy back
2817 | cmp %r10,%rbx | ||
2818 | jae .Lcommon_seh_tail | ||
2819 | |||
2820 | lea 0x20(%rax),%rsi # %xmm save area (copy source) | ||
2821 | lea 512($context),%rdi # &context.Xmm6 (copy destination) | ||
2822 | mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) qwords | ||
2823 | .long 0xa548f3fc # cld; rep movsq, emitted as raw bytes | ||
2824 | lea 0xc8(%rax),%rax # adjust stack pointer | ||
2825 | |||
2826 | jmp .Lcommon_seh_tail | ||
2827 | .size ctr32_se_handler,.-ctr32_se_handler | ||
2828 | |||
# xts_se_handler -- exception handler shared by aesni_xts_encrypt and
# aesni_xts_decrypt (see .LSEH_info_xts_enc and .LSEH_info_xts_dec).
# HandlerData holds two image-relative offsets, the body label and the
# epilogue label of the routine.
#   Rip <  body label     : join the tail with %rax = context->Rax
#   Rip >= epilogue label : join the tail with %rax = context->Rsp
#   otherwise             : copy 20 qwords (ten %xmm registers) from 0x60
#                           above context->Rsp into the context starting at
#                           Xmm6, then advance %rax by 0x68+160
# NOTE(review): the 0x60 and 0x68+160 offsets mirror the XTS stack frame,
# which is outside this view -- confirm against that prologue.
2829 | .type xts_se_handler,\@abi-omnipotent | ||
2830 | .align 16 | ||
2831 | xts_se_handler: | ||
2832 | push %rsi | ||
2833 | push %rdi | ||
2834 | push %rbx | ||
2835 | push %rbp | ||
2836 | push %r12 | ||
2837 | push %r13 | ||
2838 | push %r14 | ||
2839 | push %r15 | ||
2840 | pushfq | ||
2841 | sub \$64,%rsp | ||
2842 | |||
2843 | mov 120($context),%rax # pull context->Rax | ||
2844 | mov 248($context),%rbx # pull context->Rip | ||
2845 | |||
2846 | mov 8($disp),%rsi # disp->ImageBase | ||
2847 | mov 56($disp),%r11 # disp->HandlerData | ||
2848 | |||
2849 | mov 0(%r11),%r10d # HandlerData[0], image-relative offset | ||
2850 | lea (%rsi,%r10),%r10 # ImageBase+offset = prologue label | ||
2851 | cmp %r10,%rbx # context->Rip<prologue label | ||
2852 | jb .Lcommon_seh_tail | ||
2853 | |||
2854 | mov 152($context),%rax # pull context->Rsp | ||
2855 | |||
2856 | mov 4(%r11),%r10d # HandlerData[1], image-relative offset | ||
2857 | lea (%rsi,%r10),%r10 # ImageBase+offset = epilogue label | ||
2858 | cmp %r10,%rbx # context->Rip>=epilogue label | ||
2859 | jae .Lcommon_seh_tail | ||
2860 | |||
2861 | lea 0x60(%rax),%rsi # %xmm save area (copy source) | ||
2862 | lea 512($context),%rdi # &context.Xmm6 (copy destination) | ||
2863 | mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) qwords | ||
2864 | .long 0xa548f3fc # cld; rep movsq, emitted as raw bytes | ||
2865 | lea 0x68+160(%rax),%rax # adjust stack pointer | ||
2866 | |||
2867 | jmp .Lcommon_seh_tail | ||
2868 | .size xts_se_handler,.-xts_se_handler | ||
2869 | ___ | ||
2870 | $code.=<<___; | ||
# cbc_se_handler -- exception handler for the CBC routine (see
# .LSEH_info_cbc).  It also contains .Lcommon_seh_tail, the shared exit path
# that every other handler in this file jumps to.
#   Rip <  .Lcbc_decrypt      : join the tail with %rax = context->Rsp
#   Rip <  .Lcbc_decrypt_body : join the tail with %rax = context->Rax
#   Rip >= .Lcbc_ret          : join the tail with %rax = context->Rsp
#   otherwise                 : copy 8 qwords (four %xmm registers) from the
#                               stack at context->Rsp into the context
#                               starting at Xmm6, then advance %rax by 0x58
# NOTE(review): the 0x58 frame size and the use of Rax between the two labels
# depend on the CBC decrypt prologue, which is outside this view.
2871 | .type cbc_se_handler,\@abi-omnipotent | ||
2872 | .align 16 | ||
2873 | cbc_se_handler: | ||
2874 | push %rsi | ||
2875 | push %rdi | ||
2876 | push %rbx | ||
2877 | push %rbp | ||
2878 | push %r12 | ||
2879 | push %r13 | ||
2880 | push %r14 | ||
2881 | push %r15 | ||
2882 | pushfq | ||
2883 | sub \$64,%rsp | ||
2884 | |||
2885 | mov 152($context),%rax # pull context->Rsp | ||
2886 | mov 248($context),%rbx # pull context->Rip | ||
2887 | |||
2888 | lea .Lcbc_decrypt(%rip),%r10 | ||
2889 | cmp %r10,%rbx # context->Rip<"prologue" label | ||
2890 | jb .Lcommon_seh_tail | ||
2891 | |||
2892 | lea .Lcbc_decrypt_body(%rip),%r10 | ||
2893 | cmp %r10,%rbx # context->Rip<cbc_decrypt_body | ||
2894 | jb .Lrestore_cbc_rax | ||
2895 | |||
2896 | lea .Lcbc_ret(%rip),%r10 | ||
2897 | cmp %r10,%rbx # context->Rip>="epilogue" label | ||
2898 | jae .Lcommon_seh_tail | ||
2899 | |||
2900 | lea 0(%rax),%rsi # top of stack (copy source) | ||
2901 | lea 512($context),%rdi # &context.Xmm6 (copy destination) | ||
2902 | mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax) qwords | ||
2903 | .long 0xa548f3fc # cld; rep movsq, emitted as raw bytes | ||
2904 | lea 0x58(%rax),%rax # adjust stack pointer | ||
2905 | jmp .Lcommon_seh_tail | ||
2906 | |||
# Reached when Rip lies between .Lcbc_decrypt and .Lcbc_decrypt_body: use the
# context's Rax as the stack pointer value, then fall through into the tail.
2907 | .Lrestore_cbc_rax: | ||
2908 | mov 120($context),%rax | ||
2909 | |||
# Shared tail.  On entry %rax is the stack pointer value chosen by the
# handler.  The qwords at 8(%rax) and 16(%rax) are stored as the context's
# Rdi and Rsi (presumably spilled there by the function prologue, which is
# outside this view), and %rax itself becomes the context's Rsp.
2910 | .Lcommon_seh_tail: | ||
2911 | mov 8(%rax),%rdi | ||
2912 | mov 16(%rax),%rsi | ||
2913 | mov %rax,152($context) # restore context->Rsp | ||
2914 | mov %rsi,168($context) # restore context->Rsi | ||
2915 | mov %rdi,176($context) # restore context->Rdi | ||
2916 | |||
# Copy the patched context over disp->ContextRecord.
2917 | mov 40($disp),%rdi # disp->ContextRecord (copy destination) | ||
2918 | mov $context,%rsi # context (copy source) | ||
2919 | mov \$154,%ecx # sizeof(CONTEXT) counted in qwords | ||
2920 | .long 0xa548f3fc # cld; rep movsq, emitted as raw bytes | ||
2921 | |||
# Call RtlVirtualUnwind: four register arguments, four more in the stack
# slots at 32..56(%rsp) inside the 64 bytes reserved on entry.
2922 | mov $disp,%rsi | ||
2923 | xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | ||
2924 | mov 8(%rsi),%rdx # arg2, disp->ImageBase | ||
2925 | mov 0(%rsi),%r8 # arg3, disp->ControlPc | ||
2926 | mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | ||
2927 | mov 40(%rsi),%r10 # disp->ContextRecord | ||
2928 | lea 56(%rsi),%r11 # &disp->HandlerData | ||
2929 | lea 24(%rsi),%r12 # &disp->EstablisherFrame | ||
2930 | mov %r10,32(%rsp) # arg5 | ||
2931 | mov %r11,40(%rsp) # arg6 | ||
2932 | mov %r12,48(%rsp) # arg7 | ||
2933 | mov %rcx,56(%rsp) # arg8, (NULL) | ||
2934 | call *__imp_RtlVirtualUnwind(%rip) | ||
2935 | |||
# Return ExceptionContinueSearch after undoing the entry sequence in reverse.
2936 | mov \$1,%eax # ExceptionContinueSearch | ||
2937 | add \$64,%rsp | ||
2938 | popfq | ||
2939 | pop %r15 | ||
2940 | pop %r14 | ||
2941 | pop %r13 | ||
2942 | pop %r12 | ||
2943 | pop %rbp | ||
2944 | pop %rbx | ||
2945 | pop %rdi | ||
2946 | pop %rsi | ||
2947 | ret | ||
2948 | .size cbc_se_handler,.-cbc_se_handler | ||
2949 | |||
# .pdata: one record of three image-relative addresses per covered function:
# start of the code range, end of the code range, and the matching
# .LSEH_info record emitted into .xdata further down.
2950 | .section .pdata | ||
2951 | .align 4 | ||
2952 | ___ | ||
# Records for the routines that are only referenced when $PREFIX is "aesni".
2953 | $code.=<<___ if ($PREFIX eq "aesni"); | ||
2954 | .rva .LSEH_begin_aesni_ecb_encrypt | ||
2955 | .rva .LSEH_end_aesni_ecb_encrypt | ||
2956 | .rva .LSEH_info_ecb | ||
2957 | |||
2958 | .rva .LSEH_begin_aesni_ccm64_encrypt_blocks | ||
2959 | .rva .LSEH_end_aesni_ccm64_encrypt_blocks | ||
2960 | .rva .LSEH_info_ccm64_enc | ||
2961 | |||
2962 | .rva .LSEH_begin_aesni_ccm64_decrypt_blocks | ||
2963 | .rva .LSEH_end_aesni_ccm64_decrypt_blocks | ||
2964 | .rva .LSEH_info_ccm64_dec | ||
2965 | |||
2966 | .rva .LSEH_begin_aesni_ctr32_encrypt_blocks | ||
2967 | .rva .LSEH_end_aesni_ctr32_encrypt_blocks | ||
2968 | .rva .LSEH_info_ctr32 | ||
2969 | |||
2970 | .rva .LSEH_begin_aesni_xts_encrypt | ||
2971 | .rva .LSEH_end_aesni_xts_encrypt | ||
2972 | .rva .LSEH_info_xts_enc | ||
2973 | |||
2974 | .rva .LSEH_begin_aesni_xts_decrypt | ||
2975 | .rva .LSEH_end_aesni_xts_decrypt | ||
2976 | .rva .LSEH_info_xts_dec | ||
2977 | ___ | ||
# Records emitted for every $PREFIX: CBC and the two key-setup routines.  The
# key-setup ranges start at the function symbol itself and both share
# .LSEH_info_key.
2978 | $code.=<<___; | ||
2979 | .rva .LSEH_begin_${PREFIX}_cbc_encrypt | ||
2980 | .rva .LSEH_end_${PREFIX}_cbc_encrypt | ||
2981 | .rva .LSEH_info_cbc | ||
2982 | |||
2983 | .rva ${PREFIX}_set_decrypt_key | ||
2984 | .rva .LSEH_end_set_decrypt_key | ||
2985 | .rva .LSEH_info_key | ||
2986 | |||
2987 | .rva ${PREFIX}_set_encrypt_key | ||
2988 | .rva .LSEH_end_set_encrypt_key | ||
2989 | .rva .LSEH_info_key | ||
2990 | .section .xdata | ||
2991 | .align 8 | ||
2992 | ___ | ||
# .xdata records for the "aesni"-only routines.  Each starts with the bytes
# 9,0,0,0 followed by the handler address; by the documented UNWIND_INFO
# layout that first byte reads as version 1 with the exception-handler flag
# (verify against the Microsoft x64 exception handling reference).  The
# ccm64 and xts records append two more addresses, which the handlers read as
# HandlerData[0] and HandlerData[1].
2993 | $code.=<<___ if ($PREFIX eq "aesni"); | ||
2994 | .LSEH_info_ecb: | ||
2995 | .byte 9,0,0,0 | ||
2996 | .rva ecb_se_handler | ||
2997 | .LSEH_info_ccm64_enc: | ||
2998 | .byte 9,0,0,0 | ||
2999 | .rva ccm64_se_handler | ||
3000 | .rva .Lccm64_enc_body,.Lccm64_enc_ret # HandlerData[]: body and return labels | ||
3001 | .LSEH_info_ccm64_dec: | ||
3002 | .byte 9,0,0,0 | ||
3003 | .rva ccm64_se_handler | ||
3004 | .rva .Lccm64_dec_body,.Lccm64_dec_ret # HandlerData[]: body and return labels | ||
3005 | .LSEH_info_ctr32: | ||
3006 | .byte 9,0,0,0 | ||
3007 | .rva ctr32_se_handler | ||
3008 | .LSEH_info_xts_enc: | ||
3009 | .byte 9,0,0,0 | ||
3010 | .rva xts_se_handler | ||
3011 | .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]: body and epilogue labels | ||
3012 | .LSEH_info_xts_dec: | ||
3013 | .byte 9,0,0,0 | ||
3014 | .rva xts_se_handler | ||
3015 | .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]: body and epilogue labels | ||
3016 | ___ | ||
# .xdata records emitted for every $PREFIX.  .LSEH_info_key names no handler:
# it is plain unwind data describing the key-setup prologue (the existing
# note below identifies its single unwind code as "sub rsp,8").
3017 | $code.=<<___; | ||
3018 | .LSEH_info_cbc: | ||
3019 | .byte 9,0,0,0 | ||
3020 | .rva cbc_se_handler | ||
3021 | .LSEH_info_key: | ||
3022 | .byte 0x01,0x04,0x01,0x00 | ||
3023 | .byte 0x04,0x02,0x00,0x00 # unwind code for: sub rsp,8 | ||
3024 | ___ | ||
3025 | } | ||
3026 | |||
# rex(OPCODE, DST, SRC)
#
# Appends an x86-64 REX prefix byte to the caller's opcode byte list when
# either register number needs one.  OPCODE is aliased to @opcode for the
# duration of the call; DST and SRC are register numbers.  Nothing is
# appended when both numbers are below 8.
sub rex {
    local *opcode = shift;
    my ($dst, $src) = @_;

    # 0x04 (REX.R) extends the destination number, 0x01 (REX.B) the source.
    my $rex = ($dst >= 8 ? 0x04 : 0) | ($src >= 8 ? 0x01 : 0);

    # 0x40 is the fixed high nibble shared by every REX prefix.
    push @opcode, 0x40 | $rex if $rex;
}
3036 | |||
# Expand every `...` span embedded in the generated text by evaluating its
# contents as a Perl expression and substituting the result.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

# STDOUT is buffered, so a write failure (full disk, closed pipe to the
# translator) may only surface when the handle is flushed on close.  Die here
# so the build stops instead of silently consuming truncated assembly.
close STDOUT or die "error closing STDOUT: $!";