diff options
Diffstat (limited to 'src/lib/libcrypto/aes/asm/vpaes-x86.pl')
-rw-r--r-- | src/lib/libcrypto/aes/asm/vpaes-x86.pl | 911 |
1 files changed, 0 insertions, 911 deletions
diff --git a/src/lib/libcrypto/aes/asm/vpaes-x86.pl b/src/lib/libcrypto/aes/asm/vpaes-x86.pl deleted file mode 100644 index 6e7bd36d05..0000000000 --- a/src/lib/libcrypto/aes/asm/vpaes-x86.pl +++ /dev/null | |||
@@ -1,911 +0,0 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | |||
3 | ###################################################################### | ||
4 | ## Constant-time SSSE3 AES core implementation. | ||
5 | ## version 0.1 | ||
6 | ## | ||
7 | ## By Mike Hamburg (Stanford University), 2009 | ||
8 | ## Public domain. | ||
9 | ## | ||
10 | ## For details see http://shiftleft.org/papers/vector_aes/ and | ||
11 | ## http://crypto.stanford.edu/vpaes/. | ||
12 | |||
13 | ###################################################################### | ||
14 | # September 2011. | ||
15 | # | ||
16 | # Port vpaes-x86_64.pl as 32-bit "almost" drop-in replacement for | ||
17 | # aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt | ||
18 | # doesn't handle partial vectors (doesn't have to if called from | ||
19 | # EVP only). "Drop-in" implies that this module doesn't share key | ||
20 | # schedule structure with the original nor does it make assumption | ||
21 | # about its alignment... | ||
22 | # | ||
23 | # Performance summary. aes-586.pl column lists large-block CBC | ||
24 | # encrypt/decrypt/with-hyper-threading-off(*) results in cycles per | ||
25 | # byte processed with 128-bit key, and vpaes-x86.pl column - [also | ||
26 | # large-block CBC] encrypt/decrypt. | ||
27 | # | ||
28 | # aes-586.pl vpaes-x86.pl | ||
29 | # | ||
30 | # Core 2(**) 29.1/42.3/18.3 22.0/25.6(***) | ||
31 | # Nehalem 27.9/40.4/18.1 10.3/12.0 | ||
32 | # Atom 102./119./60.1 64.5/85.3(***) | ||
33 | # | ||
34 | # (*) "Hyper-threading" in the context refers rather to cache shared | ||
35 | # among multiple cores, than to specifically Intel HTT. As vast | ||
36 | # majority of contemporary cores share cache, slower code path | ||
37 | # is common place. In other words "with-hyper-threading-off" | ||
38 | # results are presented mostly for reference purposes. | ||
39 | # | ||
40 | # (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe. | ||
41 | # | ||
42 | # (***) Less impressive improvement on Core 2 and Atom is due to slow | ||
43 | # pshufb, yet it's respectable +32%/65% improvement on Core 2 | ||
44 | # and +58%/40% on Atom (as implied, over "hyper-threading-safe" | ||
45 | # code path). | ||
46 | # | ||
47 | # <appro@openssl.org> | ||
48 | |||
# Work out the directory this script was started from, so that x86asm.pl
# is found either beside it or under ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/;
$dir = $1;
push(@INC, "${dir}", "${dir}../../perlasm");
require "x86asm.pl";

# $x86only is true when the last command-line argument is "386"; it is
# handed to asm_init() together with the first argument and this file's name.
&asm_init($ARGV[0], "vpaes-x86.pl", $x86only = $ARGV[$#ARGV] eq "386");

# Prefix shared by every public entry point generated below.
$PREFIX = "vpaes";

# Symbolic names for the general-purpose registers used by all routines.
my ($round, $base, $magic, $key,  $const, $inp,  $out) =
   ("eax",  "ebx", "ecx",  "edx", "ebp",  "esi", "edi");
59 | |||
##
##  Constant tables.
##
##  Everything lives in the read-only data segment under the 64-byte
##  aligned label _vpaes_consts.  The $k_* values are byte offsets that
##  the code adds to $const; the public entry points load $const with
##  _vpaes_consts+0x30, which is why the first two tables have negative
##  offsets and $k_ipt sits at 0.
##
&rodataseg();
&static_label("_vpaes_consts");
&static_label("_vpaes_schedule_low_round");

&set_label("_vpaes_consts", 64);

$k_inv = -0x30;                 # inv, inva
    &data_word(0x0D080180, 0x0E05060F, 0x0A0B0C02, 0x04070309);
    &data_word(0x0F0B0780, 0x01040A06, 0x02050809, 0x030D0E0C);

$k_s0F = -0x10;                 # s0F: low-nibble mask
    &data_word(0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F);

$k_ipt = 0x00;                  # input transform (lo, hi)
    &data_word(0x5A2A7000, 0xC2B2E898, 0x52227808, 0xCABAE090);
    &data_word(0x317C4D00, 0x4C01307D, 0xB0FDCC81, 0xCD80B1FC);

$k_sb1 = 0x20;                  # sb1u, sb1t
    &data_word(0xCB503E00, 0xB19BE18F, 0x142AF544, 0xA5DF7A6E);
    &data_word(0xFAE22300, 0x3618D415, 0x0D2ED9EF, 0x3BF7CCC1);
$k_sb2 = 0x40;                  # sb2u, sb2t
    &data_word(0x0B712400, 0xE27A93C6, 0xBC982FCD, 0x5EB7E955);
    &data_word(0x0AE12900, 0x69EB8840, 0xAB82234A, 0xC2A163C8);
$k_sbo = 0x60;                  # sbou, sbot
    &data_word(0x6FBDC700, 0xD0D26D17, 0xC502A878, 0x15AABF7A);
    &data_word(0x5FBB6A00, 0xCFE474A5, 0x412B35FA, 0x8E1E90D1);

$k_mc_forward = 0x80;           # mc_forward (four 16-byte rows)
    &data_word(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D);
    &data_word(0x04070605, 0x080B0A09, 0x0C0F0E0D, 0x00030201);
    &data_word(0x080B0A09, 0x0C0F0E0D, 0x00030201, 0x04070605);
    &data_word(0x0C0F0E0D, 0x00030201, 0x04070605, 0x080B0A09);

$k_mc_backward = 0xc0;          # mc_backward (four 16-byte rows)
    &data_word(0x02010003, 0x06050407, 0x0A09080B, 0x0E0D0C0F);
    &data_word(0x0E0D0C0F, 0x02010003, 0x06050407, 0x0A09080B);
    &data_word(0x0A09080B, 0x0E0D0C0F, 0x02010003, 0x06050407);
    &data_word(0x06050407, 0x0A09080B, 0x0E0D0C0F, 0x02010003);

$k_sr = 0x100;                  # sr (four 16-byte rows, indexed by $magic)
    &data_word(0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C);
    &data_word(0x0F0A0500, 0x030E0904, 0x07020D08, 0x0B06010C);
    &data_word(0x0B020900, 0x0F060D04, 0x030A0108, 0x070E050C);
    &data_word(0x070A0D00, 0x0B0E0104, 0x0F020508, 0x0306090C);

$k_rcon = 0x140;                # rcon
    &data_word(0xAF9DEEB6, 0x1F8391B9, 0x4D7C7D81, 0x702A9808);

$k_s63 = 0x150;                 # s63: all equal to 0x63 transformed
    &data_word(0x5B5B5B5B, 0x5B5B5B5B, 0x5B5B5B5B, 0x5B5B5B5B);

$k_opt = 0x160;                 # output transform
    &data_word(0xD6B66000, 0xFF9F4929, 0xDEBE6808, 0xF7974121);
    &data_word(0x50BCEC00, 0x01EDBD51, 0xB05C0CE0, 0xE10D5DB1);

$k_deskew = 0x180;              # deskew tables: inverts the sbox's "skew"
    &data_word(0x47A4E300, 0x07E4A340, 0x5DBEF91A, 0x1DFEB95A);
    &data_word(0x83EA6900, 0x5F36B5DC, 0xF49D1E77, 0x2841C2AB);

##
##  Decryption: key schedule constants.
##  The four tables below are contiguous; _vpaes_schedule_mangle walks
##  them with fixed displacements from $k_dksd.
##
$k_dksd = 0x1a0;                # decryption key schedule: invskew x*D
    &data_word(0xA3E44700, 0xFEB91A5D, 0x5A1DBEF9, 0x0740E3A4);
    &data_word(0xB5368300, 0x41C277F4, 0xAB289D1E, 0x5FDC69EA);
$k_dksb = 0x1c0;                # decryption key schedule: invskew x*B
    &data_word(0x8550D500, 0x9A4FCA1F, 0x1CC94C99, 0x03D65386);
    &data_word(0xB6FC4A00, 0x115BEDA7, 0x7E3482C8, 0xD993256F);
$k_dkse = 0x1e0;                # decryption key schedule: invskew x*E + 0x63
    &data_word(0x1FC9D600, 0xD5031CCA, 0x994F5086, 0x53859A4C);
    &data_word(0x4FDC7BE8, 0xA2319605, 0x20B31487, 0xCD5EF96A);
$k_dks9 = 0x200;                # decryption key schedule: invskew x*9
    &data_word(0x7ED9A700, 0xB6116FC8, 0x82255BFC, 0x4AED9334);
    &data_word(0x27143300, 0x45765162, 0xE9DAFDCE, 0x8BB89FAC);

##
##  Decryption: round function constants.
##  _vpaes_decrypt_core addresses these relative to $k_dsbd.
##
$k_dipt = 0x220;                # decryption input transform
    &data_word(0x0B545F00, 0x0F505B04, 0x114E451A, 0x154A411E);
    &data_word(0x60056500, 0x86E383E6, 0xF491F194, 0x12771772);

$k_dsb9 = 0x240;                # decryption sbox output *9*u, *9*t
    &data_word(0x9A86D600, 0x851C0353, 0x4F994CC9, 0xCAD51F50);
    &data_word(0xECD74900, 0xC03B1789, 0xB2FBA565, 0x725E2C9E);
$k_dsbd = 0x260;                # decryption sbox output *D*u, *D*t
    &data_word(0xE6B1A200, 0x7D57CCDF, 0x882A4439, 0xF56E9B13);
    &data_word(0x24C6CB00, 0x3CE2FAF7, 0x15DEEFD3, 0x2931180D);
$k_dsbb = 0x280;                # decryption sbox output *B*u, *B*t
    &data_word(0x96B44200, 0xD0226492, 0xB0F2D404, 0x602646F6);
    &data_word(0xCD596700, 0xC19498A6, 0x3255AA6B, 0xF3FF0C3E);
$k_dsbe = 0x2a0;                # decryption sbox output *E*u, *E*t
    &data_word(0x26D4D000, 0x46F29296, 0x64B4F6B0, 0x22426004);
    &data_word(0xFFAAC100, 0x0C55A6CD, 0x98593E32, 0x9467F36B);
$k_dsbo = 0x2c0;                # decryption sbox final output
    &data_word(0x7EF94000, 0x1387EA53, 0xD4943E2D, 0xC7AA6DB9);
    &data_word(0x93441D00, 0x12D7560F, 0xD8C58E9C, 0xCA4B8159);

&previous();
158 | |||
##
##  _vpaes_preheat
##
##  Loads the two constants the round cores expect to find in registers:
##     %xmm7 = k_inv (first 16 bytes)
##     %xmm6 = k_s0F (low-nibble mask)
##  Requires $const to already point at the constants.
##
&function_begin_B("_vpaes_preheat");
    &movdqa ("xmm7", &QWP($k_inv, $const));
    &movdqa ("xmm6", &QWP($k_s0F, $const));
    &ret    ();
&function_end_B("_vpaes_preheat");
164 | |||
##
##  _vpaes_encrypt_core
##
##  AES-encrypt %xmm0.
##
##  Inputs:
##     %xmm0 = input
##     %xmm6-%xmm7 as in _vpaes_preheat
##     (%edx) = scheduled keys; the round count is read from 240(%edx)
##     %ebp   = constants pointer ($const)
##
##  Output in %xmm0
##  Clobbers  %xmm1-%xmm5, %eax, %ebx, %ecx, %edx
##
##  Note: every statement below must end in ';'.  Without it Perl parses
##  the following "&insn (...)" as a bitwise AND with the previous call,
##  which only happens to emit the right code.
##
&function_begin_B("_vpaes_encrypt_core");
    &mov    ($magic, 16);
    &mov    ($round, &DWP(240, $key));
    &movdqa ("xmm1", "xmm6");
    &movdqa ("xmm2", &QWP($k_ipt, $const));
    &pandn  ("xmm1", "xmm0");
    &movdqu ("xmm5", &QWP(0, $key));            # round-0 key (may be unaligned)
    &psrld  ("xmm1", 4);
    &pand   ("xmm0", "xmm6");
    &pshufb ("xmm2", "xmm0");
    &movdqa ("xmm0", &QWP($k_ipt+16, $const));
    &pshufb ("xmm0", "xmm1");
    &pxor   ("xmm2", "xmm5");
    &pxor   ("xmm0", "xmm2");
    &add    ($key, 16);
    # $base is parked on k_mc_backward so that -0x40/0/+0x40 displacements
    # reach k_mc_forward, k_mc_backward and k_sr respectively.
    &lea    ($base, &DWP($k_mc_backward, $const));
    &jmp    (&label("enc_entry"));


&set_label("enc_loop", 16);
    # middle of middle round
    &movdqa ("xmm4", &QWP($k_sb1, $const));     # 4 : sb1u
    &pshufb ("xmm4", "xmm2");                   # 4 = sb1u
    &pxor   ("xmm4", "xmm5");                   # 4 = sb1u + k
    &movdqa ("xmm0", &QWP($k_sb1+16, $const));  # 0 : sb1t
    &pshufb ("xmm0", "xmm3");                   # 0 = sb1t
    &pxor   ("xmm0", "xmm4");                   # 0 = A
    &movdqa ("xmm5", &QWP($k_sb2, $const));     # 5 : sb2u
    &pshufb ("xmm5", "xmm2");                   # 5 = sb2u
    &movdqa ("xmm1", &QWP(-0x40, $base, $magic));# .Lk_mc_forward[]
    &movdqa ("xmm2", &QWP($k_sb2+16, $const));  # 2 : sb2t
    &pshufb ("xmm2", "xmm3");                   # 2 = sb2t
    &pxor   ("xmm2", "xmm5");                   # 2 = 2A
    &movdqa ("xmm4", &QWP(0, $base, $magic));   # .Lk_mc_backward[]
    &movdqa ("xmm3", "xmm0");                   # 3 = A
    &pshufb ("xmm0", "xmm1");                   # 0 = B
    &add    ($key, 16);                         # next key
    &pxor   ("xmm0", "xmm2");                   # 0 = 2A+B
    &pshufb ("xmm3", "xmm4");                   # 3 = D
    &add    ($magic, 16);                       # next mc
    &pxor   ("xmm3", "xmm0");                   # 3 = 2A+B+D
    &pshufb ("xmm0", "xmm1");                   # 0 = 2B+C
    &and    ($magic, 0x30);                     # ... mod 4
    &pxor   ("xmm0", "xmm3");                   # 0 = 2A+3B+C+D
    &sub    ($round, 1);                        # nr--; sets ZF for the jnz below

&set_label("enc_entry");
    # top of round
    &movdqa ("xmm1", "xmm6");                   # 1 : i
    &pandn  ("xmm1", "xmm0");                   # 1 = i<<4
    &psrld  ("xmm1", 4);                        # 1 = i
    &pand   ("xmm0", "xmm6");                   # 0 = k
    &movdqa ("xmm5", &QWP($k_inv+16, $const));  # 5 : a/k
    &pshufb ("xmm5", "xmm0");                   # 5 = a/k
    &pxor   ("xmm0", "xmm1");                   # 0 = j
    &movdqa ("xmm3", "xmm7");                   # 3 : 1/i
    &pshufb ("xmm3", "xmm1");                   # 3 = 1/i
    &pxor   ("xmm3", "xmm5");                   # 3 = iak = 1/i + a/k
    &movdqa ("xmm4", "xmm7");                   # 4 : 1/j
    &pshufb ("xmm4", "xmm0");                   # 4 = 1/j
    &pxor   ("xmm4", "xmm5");                   # 4 = jak = 1/j + a/k
    &movdqa ("xmm2", "xmm7");                   # 2 : 1/iak
    &pshufb ("xmm2", "xmm3");                   # 2 = 1/iak
    &pxor   ("xmm2", "xmm0");                   # 2 = io
    &movdqa ("xmm3", "xmm7");                   # 3 : 1/jak
    &movdqu ("xmm5", &QWP(0, $key));            # 5 = this round's key
    &pshufb ("xmm3", "xmm4");                   # 3 = 1/jak
    &pxor   ("xmm3", "xmm1");                   # 3 = jo
    # No integer instruction has run since the sub/add that precedes this
    # label, so the branch tests that result.
    &jnz    (&label("enc_loop"));

    # middle of last round
    &movdqa ("xmm4", &QWP($k_sbo, $const));     # 4 : sbou      .Lk_sbo
    &movdqa ("xmm0", &QWP($k_sbo+16, $const));  # 0 : sbot      .Lk_sbo+16
    &pshufb ("xmm4", "xmm2");                   # 4 = sbou
    &pxor   ("xmm4", "xmm5");                   # 4 = sbou + k
    &pshufb ("xmm0", "xmm3");                   # 0 = sbot
    &movdqa ("xmm1", &QWP(0x40, $base, $magic));# .Lk_sr[]
    &pxor   ("xmm0", "xmm4");                   # 0 = A
    &pshufb ("xmm0", "xmm1");
    &ret    ();
&function_end_B("_vpaes_encrypt_core");
260 | |||
##
##  _vpaes_decrypt_core
##
##  AES-decrypt %xmm0.  Same API as _vpaes_encrypt_core:
##     %xmm0 = input, %xmm6-%xmm7 as in _vpaes_preheat,
##     (%edx) = scheduled keys (round count at 240(%edx)),
##     %ebp   = constants pointer ($const).
##
##  Output in %xmm0
##  Clobbers  %xmm1-%xmm5, %eax, %ebx, %ecx, %edx
##
##  $base is parked on k_dsbd, so the fixed displacements used in the
##  loop (-0x20 .. 0x70) select the dsb9/dsbd/dsbb/dsbe/dsbo tables.
##
##  Note: every statement below must end in ';'.  Without it Perl parses
##  the following "&insn (...)" as a bitwise AND with the previous call,
##  which only happens to emit the right code.
##
&function_begin_B("_vpaes_decrypt_core");
    &mov    ($round, &DWP(240, $key));
    &lea    ($base, &DWP($k_dsbd, $const));
    &movdqa ("xmm1", "xmm6");
    &movdqa ("xmm2", &QWP($k_dipt-$k_dsbd, $base));
    &pandn  ("xmm1", "xmm0");
    &mov    ($magic, $round);
    &psrld  ("xmm1", 4);
    &movdqu ("xmm5", &QWP(0, $key));            # round-0 key (may be unaligned)
    &shl    ($magic, 4);
    &pand   ("xmm0", "xmm6");
    &pshufb ("xmm2", "xmm0");
    &movdqa ("xmm0", &QWP($k_dipt-$k_dsbd+16, $base));
    &xor    ($magic, 0x30);
    &pshufb ("xmm0", "xmm1");
    &and    ($magic, 0x30);
    &pxor   ("xmm2", "xmm5");
    &movdqa ("xmm5", &QWP($k_mc_forward+48, $const));
    &pxor   ("xmm0", "xmm2");
    &add    ($key, 16);
    # $magic becomes the address of the k_sr row used by the final permute.
    &lea    ($magic, &DWP($k_sr-$k_dsbd, $base, $magic));
    &jmp    (&label("dec_entry"));

&set_label("dec_loop", 16);
##
##  Inverse mix columns
##
    &movdqa ("xmm4", &QWP(-0x20, $base));       # 4 : sb9u
    &pshufb ("xmm4", "xmm2");                   # 4 = sb9u
    &pxor   ("xmm4", "xmm0");
    &movdqa ("xmm0", &QWP(-0x10, $base));       # 0 : sb9t
    &pshufb ("xmm0", "xmm3");                   # 0 = sb9t
    &pxor   ("xmm0", "xmm4");                   # 0 = ch
    &add    ($key, 16);                         # next round key

    &pshufb ("xmm0", "xmm5");                   # MC ch
    &movdqa ("xmm4", &QWP(0, $base));           # 4 : sbdu
    &pshufb ("xmm4", "xmm2");                   # 4 = sbdu
    &pxor   ("xmm4", "xmm0");                   # 4 = ch
    &movdqa ("xmm0", &QWP(0x10, $base));        # 0 : sbdt
    &pshufb ("xmm0", "xmm3");                   # 0 = sbdt
    &pxor   ("xmm0", "xmm4");                   # 0 = ch
    &sub    ($round, 1);                        # nr--; sets ZF for the jnz below

    &pshufb ("xmm0", "xmm5");                   # MC ch
    &movdqa ("xmm4", &QWP(0x20, $base));        # 4 : sbbu
    &pshufb ("xmm4", "xmm2");                   # 4 = sbbu
    &pxor   ("xmm4", "xmm0");                   # 4 = ch
    &movdqa ("xmm0", &QWP(0x30, $base));        # 0 : sbbt
    &pshufb ("xmm0", "xmm3");                   # 0 = sbbt
    &pxor   ("xmm0", "xmm4");                   # 0 = ch

    &pshufb ("xmm0", "xmm5");                   # MC ch
    &movdqa ("xmm4", &QWP(0x40, $base));        # 4 : sbeu
    &pshufb ("xmm4", "xmm2");                   # 4 = sbeu
    &pxor   ("xmm4", "xmm0");                   # 4 = ch
    &movdqa ("xmm0", &QWP(0x50, $base));        # 0 : sbet
    &pshufb ("xmm0", "xmm3");                   # 0 = sbet
    &pxor   ("xmm0", "xmm4");                   # 0 = ch

    &palignr("xmm5", "xmm5", 12);               # rotate the MC permutation

&set_label("dec_entry");
    # top of round
    &movdqa ("xmm1", "xmm6");                   # 1 : i
    &pandn  ("xmm1", "xmm0");                   # 1 = i<<4
    &psrld  ("xmm1", 4);                        # 1 = i
    &pand   ("xmm0", "xmm6");                   # 0 = k
    &movdqa ("xmm2", &QWP($k_inv+16, $const));  # 2 : a/k
    &pshufb ("xmm2", "xmm0");                   # 2 = a/k
    &pxor   ("xmm0", "xmm1");                   # 0 = j
    &movdqa ("xmm3", "xmm7");                   # 3 : 1/i
    &pshufb ("xmm3", "xmm1");                   # 3 = 1/i
    &pxor   ("xmm3", "xmm2");                   # 3 = iak = 1/i + a/k
    &movdqa ("xmm4", "xmm7");                   # 4 : 1/j
    &pshufb ("xmm4", "xmm0");                   # 4 = 1/j
    &pxor   ("xmm4", "xmm2");                   # 4 = jak = 1/j + a/k
    &movdqa ("xmm2", "xmm7");                   # 2 : 1/iak
    &pshufb ("xmm2", "xmm3");                   # 2 = 1/iak
    &pxor   ("xmm2", "xmm0");                   # 2 = io
    &movdqa ("xmm3", "xmm7");                   # 3 : 1/jak
    &pshufb ("xmm3", "xmm4");                   # 3 = 1/jak
    &pxor   ("xmm3", "xmm1");                   # 3 = jo
    &movdqu ("xmm0", &QWP(0, $key));            # 0 = this round's key
    # No integer instruction has run since the sub/add that precedes this
    # label, so the branch tests that result.
    &jnz    (&label("dec_loop"));

    # middle of last round
    &movdqa ("xmm4", &QWP(0x60, $base));        # 4 : sbou
    &pshufb ("xmm4", "xmm2");                   # 4 = sbou
    &pxor   ("xmm4", "xmm0");                   # 4 = sbou + k
    &movdqa ("xmm0", &QWP(0x70, $base));        # 0 : sbot
    &movdqa ("xmm2", &QWP(0, $magic));          # 2 : k_sr row chosen above
    &pshufb ("xmm0", "xmm3");                   # 0 = sbot
    &pxor   ("xmm0", "xmm4");                   # 0 = A
    &pshufb ("xmm0", "xmm2");
    &ret    ();
&function_end_B("_vpaes_decrypt_core");
363 | |||
########################################################
##                                                    ##
##                  AES key schedule                  ##
##                                                    ##
########################################################
##
##  _vpaes_schedule_core
##
##  Inputs, as prepared by the set_*_key entry points:
##     %esi ($inp)   = user key
##     %eax ($round) = key length in bits (compared against 192 below)
##     %edx ($key)   = where the round keys are written
##     %edi ($out)   = zero for an encryption schedule, non-zero for
##                     a decryption schedule
##     %ecx ($magic) = byte offset of the current row in k_sr
##     %ebp ($const) = constants pointer
##     4(%esp) and 20(%esp) are used as 16-byte scratch slots; the first
##     one stands in for the "%xmm8" mentioned in the comments (rcon).
##
##  All %xmm registers are zeroed before returning.
##
&function_begin_B("_vpaes_schedule_core");
    &movdqu ("xmm0", &QWP(0, $inp));            # load key (unaligned)
    &movdqa ("xmm2", &QWP($k_rcon, $const));    # load rcon

    # input transform
    &movdqa ("xmm3", "xmm0");
    &lea    ($base, &DWP($k_ipt, $const));
    &movdqa (&QWP(4, "esp"), "xmm2");           # xmm8
    &call   ("_vpaes_schedule_transform");
    &movdqa ("xmm7", "xmm0");

    &test   ($out, $out);
    &jnz    (&label("schedule_am_decrypting"));

    # encrypting: the zeroth round key is the transformed key
    &movdqu (&QWP(0, $key), "xmm0");
    &jmp    (&label("schedule_go"));

&set_label("schedule_am_decrypting");
    # decrypting: the zeroth round key is the raw key after shiftrows
    &movdqa ("xmm1", &QWP($k_sr, $const, $magic));
    &pshufb ("xmm3", "xmm1");
    &movdqu (&QWP(0, $key), "xmm3");
    &xor    ($magic, 0x30);

&set_label("schedule_go");
    &cmp    ($round, 192);
    &ja     (&label("schedule_256"));
    &je     (&label("schedule_192"));
    # 128: fall through

##
##  .schedule_128
##
##  128-bit specific part of key schedule.
##
##  Nothing special here: each iteration is one schedule round followed
##  by a mangle, and the tenth round goes to schedule_mangle_last.
##
&set_label("schedule_128");
    &mov    ($round, 10);

&set_label("loop_schedule_128");
    &call   ("_vpaes_schedule_round");
    &dec    ($round);
    &jz     (&label("schedule_mangle_last"));
    &call   ("_vpaes_schedule_mangle");         # write output
    &jmp    (&label("loop_schedule_128"));

##
##  .aes_schedule_192
##
##  192-bit specific part of key schedule.
##
##  Same main body as the 128-bit schedule, with extra smearing.  The
##  long, high side stays in %xmm7; the short, low side is kept in the
##  high bits of %xmm6.
##
##  Each round yields 192 bits of key material, i.e. 1.5 round keys, so
##  every trip through the loop runs 2 rounds and stores 3 round keys.
##
&set_label("schedule_192", 16);
    &movdqu ("xmm0", &QWP(8, $inp));            # load key part 2 (very unaligned)
    &call   ("_vpaes_schedule_transform");      # input transform
    &movdqa ("xmm6", "xmm0");                   # save short part
    &pxor   ("xmm4", "xmm4");                   # clear 4
    &movhlps("xmm6", "xmm4");                   # clobber low side with zeros
    &mov    ($round, 4);

&set_label("loop_schedule_192");
    &call   ("_vpaes_schedule_round");
    &palignr("xmm0", "xmm6", 8);
    &call   ("_vpaes_schedule_mangle");         # save key n
    &call   ("_vpaes_schedule_192_smear");
    &call   ("_vpaes_schedule_mangle");         # save key n+1
    &call   ("_vpaes_schedule_round");
    &dec    ($round);
    &jz     (&label("schedule_mangle_last"));
    &call   ("_vpaes_schedule_mangle");         # save key n+2
    &call   ("_vpaes_schedule_192_smear");
    &jmp    (&label("loop_schedule_192"));

##
##  .aes_schedule_256
##
##  256-bit specific part of key schedule.
##
##  Much like the 128-bit schedule, plus a "low side" held in %xmm6.
##  Low-side rounds match high-side rounds except that they apply
##  neither rcon nor rotation.
##
&set_label("schedule_256", 16);
    &movdqu ("xmm0", &QWP(16, $inp));           # load key part 2 (unaligned)
    &call   ("_vpaes_schedule_transform");      # input transform
    &mov    ($round, 7);

&set_label("loop_schedule_256");
    &call   ("_vpaes_schedule_mangle");         # output low result
    &movdqa ("xmm6", "xmm0");                   # save cur_lo in xmm6

    # high round
    &call   ("_vpaes_schedule_round");
    &dec    ($round);
    &jz     (&label("schedule_mangle_last"));
    &call   ("_vpaes_schedule_mangle");

    # low round: run with xmm6 in place of xmm7, parking xmm7 on the stack
    &pshufd ("xmm0", "xmm0", 0xFF);
    &movdqa (&QWP(20, "esp"), "xmm7");
    &movdqa ("xmm7", "xmm6");
    &call   ("_vpaes_schedule_low_round");
    &movdqa ("xmm7", &QWP(20, "esp"));

    &jmp    (&label("loop_schedule_256"));

##
##  .aes_schedule_mangle_last
##
##  Mangler for last round of key schedule
##  Mangles %xmm0
##      when encrypting, outputs out(%xmm0) ^ 63
##      when decrypting, outputs unskew(%xmm0)
##
##  Always reached right before return; falls into the cleanup and exits.
##
&set_label("schedule_mangle_last", 16);
    # schedule last round key from xmm0
    &lea    ($base, &DWP($k_deskew, $const));
    &test   ($out, $out);
    &jnz    (&label("schedule_mangle_last_dec"));

    # encrypting
    &movdqa ("xmm1", &QWP($k_sr, $const, $magic));
    &pshufb ("xmm0", "xmm1");                   # output permute
    &lea    ($base, &DWP($k_opt, $const));      # prepare to output transform
    &add    ($key, 32);

&set_label("schedule_mangle_last_dec");
    &add    ($key, -16);
    &pxor   ("xmm0", &QWP($k_s63, $const));
    &call   ("_vpaes_schedule_transform");      # output transform
    &movdqu (&QWP(0, $key), "xmm0");            # save last key

    # cleanup: wipe xmm0 .. xmm7, in that order
    &pxor   ("xmm$_", "xmm$_") for (0 .. 7);
    &ret    ();
&function_end_B("_vpaes_schedule_core");
527 | |||
##
##  .aes_schedule_192_smear
##
##  Smear the short, low side in the 192-bit key schedule.
##
##  Inputs:
##    %xmm7: high side, b  a  x  y
##    %xmm6:  low side, d  c  0  0
##
##  Outputs:
##    %xmm6: b+c+d  b+c  0  0
##    %xmm0: b+c+d  b+c  b  a
##
##  Clobbers %xmm1 (it is zeroed here and used to clear the low half
##  of %xmm6).
##
&function_begin_B("_vpaes_schedule_192_smear");
    &pshufd ("xmm0", "xmm6", 0x80);             # d c 0 0 -> c 0 0 0
    &pxor   ("xmm6", "xmm0");                   # -> c+d c 0 0
    &pshufd ("xmm0", "xmm7", 0xFE);             # b a _ _ -> b b b a
    &pxor   ("xmm6", "xmm0");                   # -> b+c+d b+c b a
    &movdqa ("xmm0", "xmm6");
    &pxor   ("xmm1", "xmm1");
    &movhlps("xmm6", "xmm1");                   # clobber low side with zeros
    &ret    ();
&function_end_B("_vpaes_schedule_192_smear");
552 | |||
##
##  .aes_schedule_round
##
##  Runs one main round of the key schedule on %xmm0, %xmm7
##
##  Specifically, runs subbytes on the high dword of %xmm0
##  then rotates it by one byte and xors into the low dword of
##  %xmm7.
##
##  Adds rcon from the low byte of "%xmm8", then rotates "%xmm8" for
##  the next rcon.  In this 32-bit port "%xmm8" is the 16-byte stack
##  slot at 8(%esp) as seen from inside this routine.
##
##  Smears the dwords of %xmm7 by xoring the low into the
##  second low, result into third, result into highest.
##
##  Returns results in %xmm7 = %xmm0.
##  Clobbers %xmm1-%xmm5.
##
##  _vpaes_schedule_low_round is a second entry point further down
##  that skips the rcon and rotation steps.
##
&function_begin_B("_vpaes_schedule_round");
    # extract rcon from xmm8
    &movdqa ("xmm2", &QWP(8, "esp"));           # xmm8
    &pxor   ("xmm1", "xmm1");
    &palignr("xmm1", "xmm2", 15);
    &palignr("xmm2", "xmm2", 15);
    &pxor   ("xmm7", "xmm1");

    # rotate
    &pshufd ("xmm0", "xmm0", 0xFF);
    &palignr("xmm0", "xmm0", 1);

    # store the rotated rcon back, then fall through...
    &movdqa (&QWP(8, "esp"), "xmm2");           # xmm8

    # low round: same as high round, but no rotation and no rcon.
&set_label("_vpaes_schedule_low_round");
    # smear xmm7
    &movdqa ("xmm1", "xmm7");
    &pslldq ("xmm7", 4);
    &pxor   ("xmm7", "xmm1");
    &movdqa ("xmm1", "xmm7");
    &pslldq ("xmm7", 8);
    &pxor   ("xmm7", "xmm1");
    &pxor   ("xmm7", &QWP($k_s63, $const));

    # subbyte (xmm4 holds the nibble mask, xmm5 the k_inv table here)
    &movdqa ("xmm4", &QWP($k_s0F, $const));
    &movdqa ("xmm5", &QWP($k_inv, $const));     # 5 : 1/j
    &movdqa ("xmm1", "xmm4");
    &pandn  ("xmm1", "xmm0");
    &psrld  ("xmm1", 4);                        # 1 = i
    &pand   ("xmm0", "xmm4");                   # 0 = k
    &movdqa ("xmm2", &QWP($k_inv+16, $const));  # 2 : a/k
    &pshufb ("xmm2", "xmm0");                   # 2 = a/k
    &pxor   ("xmm0", "xmm1");                   # 0 = j
    &movdqa ("xmm3", "xmm5");                   # 3 : 1/i
    &pshufb ("xmm3", "xmm1");                   # 3 = 1/i
    &pxor   ("xmm3", "xmm2");                   # 3 = iak = 1/i + a/k
    &movdqa ("xmm4", "xmm5");                   # 4 : 1/j
    &pshufb ("xmm4", "xmm0");                   # 4 = 1/j
    &pxor   ("xmm4", "xmm2");                   # 4 = jak = 1/j + a/k
    &movdqa ("xmm2", "xmm5");                   # 2 : 1/iak
    &pshufb ("xmm2", "xmm3");                   # 2 = 1/iak
    &pxor   ("xmm2", "xmm0");                   # 2 = io
    &movdqa ("xmm3", "xmm5");                   # 3 : 1/jak
    &pshufb ("xmm3", "xmm4");                   # 3 = 1/jak
    &pxor   ("xmm3", "xmm1");                   # 3 = jo
    &movdqa ("xmm4", &QWP($k_sb1, $const));     # 4 : sb1u
    &pshufb ("xmm4", "xmm2");                   # 4 = sb1u
    &movdqa ("xmm0", &QWP($k_sb1+16, $const));  # 0 : sb1t
    &pshufb ("xmm0", "xmm3");                   # 0 = sb1t
    &pxor   ("xmm0", "xmm4");                   # 0 = sbox output

    # add in smeared stuff
    &pxor   ("xmm0", "xmm7");
    &movdqa ("xmm7", "xmm0");
    &ret    ();
&function_end_B("_vpaes_schedule_round");
630 | |||
##
##  .aes_schedule_transform
##
##  Linear-transform %xmm0 according to tables at (%ebx):
##  the low nibbles index the table at 0(%ebx), the high nibbles the
##  table at 16(%ebx), and the two lookups are xored together.
##
##  Output in %xmm0
##  Clobbers %xmm1, %xmm2
##
&function_begin_B("_vpaes_schedule_transform");
    &movdqa ("xmm2", &QWP($k_s0F, $const));     # 2 : nibble mask
    &movdqa ("xmm1", "xmm2");
    &pandn  ("xmm1", "xmm0");
    &psrld  ("xmm1", 4);                        # 1 = high nibbles
    &pand   ("xmm0", "xmm2");                   # 0 = low nibbles
    &movdqa ("xmm2", &QWP(0, $base));           # 2 : lo table
    &pshufb ("xmm2", "xmm0");
    &movdqa ("xmm0", &QWP(16, $base));          # 0 : hi table
    &pshufb ("xmm0", "xmm1");
    &pxor   ("xmm0", "xmm2");
    &ret    ();
&function_end_B("_vpaes_schedule_transform");
652 | |||
##
##  .aes_schedule_mangle
##
##  Mangle xmm0 from (basis-transformed) standard version
##  to our version.
##
##  On encrypt,
##    xor with 0x63
##    multiply by circulant 0,1,1,1
##    apply shiftrows transform
##
##  On decrypt,
##    xor with 0x63
##    multiply by "inverse mixcolumns" circulant E,B,D,9
##    deskew
##    apply shiftrows transform
##
##
##  Writes out to (%edx), and increments or decrements it
##  Keeps track of round number mod 4 in %ecx
##  Preserves xmm0
##  Clobbers xmm1-xmm5; the decrypt path also overwrites %esi ($inp)
##  with the address of the k_dksd tables.
##
&function_begin_B("_vpaes_schedule_mangle");
    &movdqa ("xmm4", "xmm0");                   # work on a copy; xmm0 is preserved
    &movdqa ("xmm5", &QWP($k_mc_forward, $const));
    &test   ($out, $out);
    &jnz    (&label("schedule_mangle_dec"));

    # encrypting
    &add    ($key, 16);
    &pxor   ("xmm4", &QWP($k_s63, $const));
    &pshufb ("xmm4", "xmm5");
    &movdqa ("xmm3", "xmm4");
    &pshufb ("xmm4", "xmm5");
    &pxor   ("xmm3", "xmm4");
    &pshufb ("xmm4", "xmm5");
    &pxor   ("xmm3", "xmm4");

    &jmp    (&label("schedule_mangle_both"));

&set_label("schedule_mangle_dec", 16);
    # inverse mix columns
    &movdqa ("xmm2", &QWP($k_s0F, $const));
    &lea    ($inp, &DWP($k_dksd, $const));
    &movdqa ("xmm1", "xmm2");
    &pandn  ("xmm1", "xmm4");
    &psrld  ("xmm1", 4);                        # 1 = hi
    &pand   ("xmm4", "xmm2");                   # 4 = lo

    # tables at 0x00/0x10 from k_dksd (x*D)
    &movdqa ("xmm2", &QWP(0, $inp));
    &pshufb ("xmm2", "xmm4");
    &movdqa ("xmm3", &QWP(0x10, $inp));
    &pshufb ("xmm3", "xmm1");
    &pxor   ("xmm3", "xmm2");
    &pshufb ("xmm3", "xmm5");

    # tables at 0x20/0x30 (x*B)
    &movdqa ("xmm2", &QWP(0x20, $inp));
    &pshufb ("xmm2", "xmm4");
    &pxor   ("xmm2", "xmm3");
    &movdqa ("xmm3", &QWP(0x30, $inp));
    &pshufb ("xmm3", "xmm1");
    &pxor   ("xmm3", "xmm2");
    &pshufb ("xmm3", "xmm5");

    # tables at 0x40/0x50 (x*E + 0x63)
    &movdqa ("xmm2", &QWP(0x40, $inp));
    &pshufb ("xmm2", "xmm4");
    &pxor   ("xmm2", "xmm3");
    &movdqa ("xmm3", &QWP(0x50, $inp));
    &pshufb ("xmm3", "xmm1");
    &pxor   ("xmm3", "xmm2");
    &pshufb ("xmm3", "xmm5");

    # tables at 0x60/0x70 (x*9)
    &movdqa ("xmm2", &QWP(0x60, $inp));
    &pshufb ("xmm2", "xmm4");
    &pxor   ("xmm2", "xmm3");
    &movdqa ("xmm3", &QWP(0x70, $inp));
    &pshufb ("xmm3", "xmm1");
    &pxor   ("xmm3", "xmm2");

    &add    ($key, -16);

&set_label("schedule_mangle_both");
    &movdqa ("xmm1", &QWP($k_sr, $const, $magic));
    &pshufb ("xmm3", "xmm1");                   # shiftrows for this round
    &add    ($magic, -16);
    &and    ($magic, 0x30);                     # step the k_sr row, mod 4
    &movdqu (&QWP(0, $key), "xmm3");            # store the mangled round key
    &ret    ();
&function_end_B("_vpaes_schedule_mangle");
743 | |||
#
# Interface to OpenSSL
#

##
##  ${PREFIX}_set_encrypt_key(inp, bits, key)
##
##  Builds the encryption key schedule for the user key at `inp` of
##  `bits` bits into `key`, storing bits/32+5 at offset 240 of `key`.
##  Always returns 0 in %eax.
##
&function_begin("${PREFIX}_set_encrypt_key");
    &mov    ($inp, &wparam(0));                 # inp
    &lea    ($base, &DWP(-56, "esp"));
    &mov    ($round, &wparam(1));               # bits
    &and    ($base, -16);                       # 16-byte align the scratch area
    &mov    ($key, &wparam(2));                 # key
    &xchg   ($base, "esp");                     # alloca
    &mov    (&DWP(48, "esp"), $base);           # remember the caller's %esp

    &mov    ($base, $round);
    &shr    ($base, 5);
    &add    ($base, 5);
    &mov    (&DWP(240, $key), $base);           # AES_KEY->rounds = nbits/32+5;
    &mov    ($magic, 0x30);
    &mov    ($out, 0);                          # 0 selects the encrypt schedule

    # Point $const at _vpaes_consts+0x30, the bias the $k_* offsets assume.
    &picsetup($const);
    &picsymbol($const, &label("_vpaes_consts"), $const);
    &lea    ($const, &DWP(0x30, $const));       # ';' was missing here

    &call   ("_vpaes_schedule_core");

    &mov    ("esp", &DWP(48, "esp"));           # restore the caller's %esp
    &xor    ("eax", "eax");
&function_end("${PREFIX}_set_encrypt_key");
772 | |||
##
##  ${PREFIX}_set_decrypt_key(inp, bits, key)
##
##  Builds the decryption key schedule for the user key at `inp` of
##  `bits` bits into `key`, storing bits/32+5 at offset 240 of `key`.
##  The schedule is written from the far end of the buffer backwards
##  ($key starts at key+16+16*rounds and is decremented by the mangler).
##  Always returns 0 in %eax.
##
&function_begin("${PREFIX}_set_decrypt_key");
    &mov    ($inp, &wparam(0));                 # inp
    &lea    ($base, &DWP(-56, "esp"));
    &mov    ($round, &wparam(1));               # bits
    &and    ($base, -16);                       # 16-byte align the scratch area
    &mov    ($key, &wparam(2));                 # key
    &xchg   ($base, "esp");                     # alloca
    &mov    (&DWP(48, "esp"), $base);           # remember the caller's %esp

    &mov    ($base, $round);
    &shr    ($base, 5);
    &add    ($base, 5);
    &mov    (&DWP(240, $key), $base);           # AES_KEY->rounds = nbits/32+5;
    &shl    ($base, 4);
    &lea    ($key, &DWP(16, $key, $base));

    &mov    ($out, 1);                          # non-zero selects the decrypt schedule
    &mov    ($magic, $round);
    &shr    ($magic, 1);
    &and    ($magic, 32);
    &xor    ($magic, 32);                       # nbits==192?0:32;

    # Point $const at _vpaes_consts+0x30, the bias the $k_* offsets assume.
    &picsetup($const);
    &picsymbol($const, &label("_vpaes_consts"), $const);
    &lea    ($const, &DWP(0x30, $const));       # ';' was missing here

    &call   ("_vpaes_schedule_core");

    &mov    ("esp", &DWP(48, "esp"));           # restore the caller's %esp
    &xor    ("eax", "eax");
&function_end("${PREFIX}_set_decrypt_key");
804 | |||
# ${PREFIX}_encrypt(inp, out, key)
#
# Emits the single-block encryption entry point.  It loads 16 bytes from
# inp with an unaligned move, runs _vpaes_encrypt_core on them in xmm0 and
# stores the 16-byte result to out.  Arguments are fetched with wparam():
# 0 = inp, 1 = out, 2 = key schedule.
805 | &function_begin("${PREFIX}_encrypt"); | ||
# Point $const at _vpaes_consts+0x30.  picsetup/picsymbol are defined
# outside this file section (presumably PIC-safe address computation).
806 | &picsetup($const); | ||
807 | &picsymbol($const, &label("_vpaes_consts"), $const); | ||
808 | &lea ($const,&DWP(0x30,$const)); | ||
809 | |||
# _vpaes_preheat is defined earlier in the file; it is called once before
# the core routine (presumably to preload constants -- confirm there).
810 | &call ("_vpaes_preheat"); | ||
811 | &mov ($inp,&wparam(0)); # inp | ||
# Build a scratch frame: $base = (esp-56) rounded down to a multiple of 16.
812 | &lea ($base,&DWP(-56,"esp")); | ||
813 | &mov ($out,&wparam(1)); # out | ||
814 | &and ($base,-16); | ||
815 | &mov ($key,&wparam(2)); # key | ||
# After the exchange esp is the aligned frame and $base holds the caller's
# esp, which is parked at 48(esp) until the epilogue restores it.
816 | &xchg ($base,"esp"); # alloca | ||
817 | &mov (&DWP(48,"esp"),$base); | ||
818 | |||
# movdqu is used for both accesses, so inp and out need not be aligned.
819 | &movdqu ("xmm0",&QWP(0,$inp)); | ||
820 | &call ("_vpaes_encrypt_core"); | ||
821 | &movdqu (&QWP(0,$out),"xmm0"); | ||
822 | |||
# Epilogue: restore the caller's esp.
823 | &mov ("esp",&DWP(48,"esp")); | ||
824 | &function_end("${PREFIX}_encrypt"); | ||
825 | |||
# ${PREFIX}_decrypt(inp, out, key)
#
# Emits the single-block decryption entry point.  It loads 16 bytes from
# inp with an unaligned move, runs _vpaes_decrypt_core on them in xmm0 and
# stores the 16-byte result to out.  Arguments are fetched with wparam():
# 0 = inp, 1 = out, 2 = key schedule.
826 | &function_begin("${PREFIX}_decrypt"); | ||
# Point $const at _vpaes_consts+0x30.  picsetup/picsymbol are defined
# outside this file section (presumably PIC-safe address computation).
827 | &picsetup($const); | ||
828 | &picsymbol($const, &label("_vpaes_consts"), $const); | ||
829 | &lea ($const,&DWP(0x30,$const)); | ||
830 | |||
# _vpaes_preheat is defined earlier in the file; it is called once before
# the core routine (presumably to preload constants -- confirm there).
831 | &call ("_vpaes_preheat"); | ||
832 | &mov ($inp,&wparam(0)); # inp | ||
# Build a scratch frame: $base = (esp-56) rounded down to a multiple of 16.
833 | &lea ($base,&DWP(-56,"esp")); | ||
834 | &mov ($out,&wparam(1)); # out | ||
835 | &and ($base,-16); | ||
836 | &mov ($key,&wparam(2)); # key | ||
# After the exchange esp is the aligned frame and $base holds the caller's
# esp, which is parked at 48(esp) until the epilogue restores it.
837 | &xchg ($base,"esp"); # alloca | ||
838 | &mov (&DWP(48,"esp"),$base); | ||
839 | |||
# movdqu is used for both accesses, so inp and out need not be aligned.
840 | &movdqu ("xmm0",&QWP(0,$inp)); | ||
841 | &call ("_vpaes_decrypt_core"); | ||
842 | &movdqu (&QWP(0,$out),"xmm0"); | ||
843 | |||
# Epilogue: restore the caller's esp.
844 | &mov ("esp",&DWP(48,"esp")); | ||
845 | &function_end("${PREFIX}_decrypt"); | ||
846 | |||
# ${PREFIX}_cbc_encrypt(inp, out, len, key, ivp, enc)
#
# Emits the CBC entry point.  Arguments are fetched with wparam():
# 0 = inp, 1 = out, 2 = len in bytes, 3 = key schedule, 4 = pointer to the
# 16-byte IV, 5 = enc flag (0 selects the decrypt loop, anything else the
# encrypt loop).
#
# Only whole 16-byte blocks are processed: if len < 16 the function jumps
# straight to cbc_abort and touches nothing; otherwise the loops run
# floor(len/16) times and any trailing partial block is ignored.  On the
# normal exit path the chaining value left in xmm1 is written back to ivp.
#
# Scratch frame layout (relative to the aligned esp):
#    0  out-inp delta      4  key      8  ivp
#   16  current IV (decrypt loop)     32  next IV (decrypt loop)
#   48  caller's esp
847 | &function_begin("${PREFIX}_cbc_encrypt"); | ||
848 | &mov ($inp,&wparam(0)); # inp | ||
849 | &mov ($out,&wparam(1)); # out | ||
850 | &mov ($round,&wparam(2)); # len | ||
851 | &mov ($key,&wparam(3)); # key | ||
# $round = len-16; a borrow means fewer than 16 bytes, so bail out before
# the stack frame is set up.
852 | &sub ($round,16); | ||
853 | &jc (&label("cbc_abort")); | ||
# Build a scratch frame: $base = (esp-56) rounded down to a multiple of 16.
854 | &lea ($base,&DWP(-56,"esp")); | ||
855 | &mov ($const,&wparam(4)); # ivp | ||
856 | &and ($base,-16); | ||
857 | &mov ($magic,&wparam(5)); # enc | ||
# After the exchange esp is the aligned frame and $base holds the caller's
# esp, which is parked at 48(esp) below.
858 | &xchg ($base,"esp"); # alloca | ||
859 | &movdqu ("xmm1",&QWP(0,$const)); # load IV | ||
# Turn $out into the distance out-inp so that a single advancing pointer
# ($inp) addresses both buffers: the output address is delta+$inp.
860 | &sub ($out,$inp); | ||
861 | &mov (&DWP(48,"esp"),$base); | ||
862 | |||
863 | &mov (&DWP(0,"esp"),$out); # save out | ||
864 | &mov (&DWP(4,"esp"),$key); # save key | ||
865 | &mov (&DWP(8,"esp"),$const); # save ivp | ||
# $round holds len-16 at this point, so the counter goes negative (borrow)
# exactly when the last whole block has been processed.
866 | &mov ($out,$round); # $out works as $len | ||
867 | |||
# Point $const at _vpaes_consts+0x30.  picsetup/picsymbol are defined
# outside this file section (presumably PIC-safe address computation).
868 | &picsetup($const); | ||
869 | &picsymbol($const, &label("_vpaes_consts"), $const); | ||
870 | &lea ($const,&DWP(0x30,$const)); | ||
871 | |||
872 | &call ("_vpaes_preheat"); | ||
# Dispatch on the enc flag.  The explicit jmp precedes a label declared
# with alignment 16 (presumably to jump over alignment padding).
873 | &cmp ($magic,0); | ||
874 | &je (&label("cbc_dec_loop")); | ||
875 | &jmp (&label("cbc_enc_loop")); | ||
876 | |||
# Encrypt loop: C[i] = E(P[i] ^ C[i-1]); xmm1 carries the chaining value.
877 | &set_label("cbc_enc_loop",16); | ||
878 | &movdqu ("xmm0",&QWP(0,$inp)); # load input | ||
879 | &pxor ("xmm0","xmm1"); # inp^=iv | ||
880 | &call ("_vpaes_encrypt_core"); | ||
# $base and $key are reloaded from the frame after every core call
# (presumably because the core routine clobbers them -- confirm there).
881 | &mov ($base,&DWP(0,"esp")); # restore out | ||
882 | &mov ($key,&DWP(4,"esp")); # restore key | ||
# The ciphertext just produced is the IV for the next block.
883 | &movdqa ("xmm1","xmm0"); | ||
884 | &movdqu (&QWP(0,$base,$inp),"xmm0"); # write output | ||
885 | &lea ($inp,&DWP(16,$inp)); | ||
886 | &sub ($out,16); | ||
887 | &jnc (&label("cbc_enc_loop")); | ||
888 | &jmp (&label("cbc_done")); | ||
889 | |||
# Decrypt loop: P[i] = D(C[i]) ^ C[i-1].  The ciphertext block is stashed
# at 32(esp) before decryption so it can become the next IV; this ordering
# also keeps it available after the output store.
890 | &set_label("cbc_dec_loop",16); | ||
891 | &movdqu ("xmm0",&QWP(0,$inp)); # load input | ||
892 | &movdqa (&QWP(16,"esp"),"xmm1"); # save IV | ||
893 | &movdqa (&QWP(32,"esp"),"xmm0"); # save future IV | ||
894 | &call ("_vpaes_decrypt_core"); | ||
# $base and $key are reloaded from the frame after every core call
# (presumably because the core routine clobbers them -- confirm there).
895 | &mov ($base,&DWP(0,"esp")); # restore out | ||
896 | &mov ($key,&DWP(4,"esp")); # restore key | ||
897 | &pxor ("xmm0",&QWP(16,"esp")); # out^=iv | ||
898 | &movdqa ("xmm1",&QWP(32,"esp")); # load next IV | ||
899 | &movdqu (&QWP(0,$base,$inp),"xmm0"); # write output | ||
900 | &lea ($inp,&DWP(16,$inp)); | ||
901 | &sub ($out,16); | ||
902 | &jnc (&label("cbc_dec_loop")); | ||
903 | |||
# Common exit: ivp must be fetched from the frame before esp is restored.
904 | &set_label("cbc_done"); | ||
905 | &mov ($base,&DWP(8,"esp")); # restore ivp | ||
906 | &mov ("esp",&DWP(48,"esp")); | ||
907 | &movdqu (&QWP(0,$base),"xmm1"); # write IV | ||
# Short-input exit: reached with the caller's esp untouched.
908 | &set_label("cbc_abort"); | ||
909 | &function_end("${PREFIX}_cbc_encrypt"); | ||
910 | |||
# Last statement of the generator script.  asm_finish is defined outside
# this file (presumably it flushes the accumulated assembly output --
# confirm against the perlasm driver).
911 | &asm_finish();