diff options
Diffstat (limited to 'src/lib/libcrypto/aes/asm/vpaes-x86.pl')
-rw-r--r-- | src/lib/libcrypto/aes/asm/vpaes-x86.pl | 903 |
1 files changed, 0 insertions, 903 deletions
diff --git a/src/lib/libcrypto/aes/asm/vpaes-x86.pl b/src/lib/libcrypto/aes/asm/vpaes-x86.pl deleted file mode 100644 index 1533e2c304..0000000000 --- a/src/lib/libcrypto/aes/asm/vpaes-x86.pl +++ /dev/null | |||
@@ -1,903 +0,0 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | |||
3 | ###################################################################### | ||
4 | ## Constant-time SSSE3 AES core implementation. | ||
5 | ## version 0.1 | ||
6 | ## | ||
7 | ## By Mike Hamburg (Stanford University), 2009 | ||
8 | ## Public domain. | ||
9 | ## | ||
10 | ## For details see http://shiftleft.org/papers/vector_aes/ and | ||
11 | ## http://crypto.stanford.edu/vpaes/. | ||
12 | |||
13 | ###################################################################### | ||
14 | # September 2011. | ||
15 | # | ||
16 | # Port vpaes-x86_64.pl as 32-bit "almost" drop-in replacement for | ||
17 | # aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt | ||
18 | # doesn't handle partial vectors (doesn't have to if called from | ||
19 | # EVP only). "Drop-in" implies that this module doesn't share key | ||
20 | # schedule structure with the original nor does it make assumption | ||
21 | # about its alignment... | ||
22 | # | ||
23 | # Performance summary. aes-586.pl column lists large-block CBC | ||
24 | # encrypt/decrypt/with-hyper-threading-off(*) results in cycles per | ||
25 | # byte processed with 128-bit key, and vpaes-x86.pl column - [also | ||
26 | # large-block CBC] encrypt/decrypt. | ||
27 | # | ||
28 | # aes-586.pl vpaes-x86.pl | ||
29 | # | ||
30 | # Core 2(**) 29.1/42.3/18.3 22.0/25.6(***) | ||
31 | # Nehalem 27.9/40.4/18.1 10.3/12.0 | ||
32 | # Atom 102./119./60.1 64.5/85.3(***) | ||
33 | # | ||
34 | # (*) "Hyper-threading" in the context refers rather to cache shared | ||
35 | # among multiple cores, than to specifically Intel HTT. As vast | ||
36 | # majority of contemporary cores share cache, slower code path | ||
37 | # is common place. In other words "with-hyper-threading-off" | ||
38 | # results are presented mostly for reference purposes. | ||
39 | # | ||
40 | # (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe. | ||
41 | # | ||
42 | # (***) Less impressive improvement on Core 2 and Atom is due to slow | ||
43 | # pshufb, yet it's respectable +32%/65% improvement on Core 2 | ||
44 | # and +58%/40% on Atom (as implied, over "hyper-threading-safe" | ||
45 | # code path). | ||
46 | # | ||
47 | # <appro@openssl.org> | ||
48 | |||
# Capture the directory part of this script's own path so that
# x86asm.pl can be loaded from that directory or from ../../perlasm
# relative to it.
49 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
50 | push(@INC,"${dir}","${dir}../../perlasm"); | ||
51 | require "x86asm.pl"; | ||
52 | |||
# The first command-line argument is passed to asm_init unchanged
# (NOTE(review): presumably the output flavour -- confirm in x86asm.pl).
# $x86only is set when the last argument is the literal string "386".
53 | &asm_init($ARGV[0],"vpaes-x86.pl",$x86only = $ARGV[$#ARGV] eq "386"); | ||
54 | |||
# Prefix used to build the names of the public entry points
# (${PREFIX}_set_encrypt_key, ${PREFIX}_encrypt, ...).
55 | $PREFIX="vpaes"; | ||
56 | |||
# Symbolic names for the general-purpose registers used by every
# routine below.
57 | my ($round, $base, $magic, $key, $const, $inp, $out)= | ||
58 | ("eax", "ebx", "ecx", "edx","ebp", "esi","edi"); | ||
59 | |||
# NOTE(review): static_label is provided by x86asm.pl; presumably it
# registers these labels as file-local -- confirm.
60 | &static_label("_vpaes_consts"); | ||
61 | &static_label("_vpaes_schedule_low_round"); | ||
62 | |||
# Constant tables.  Each $k_* value is the displacement of its table
# as used with $const in the routines below.  The displacements are
# biased: the first table (k_inv) sits at -0x30 and k_ipt at 0, and the
# callers load $const with "_vpaes_consts"+0x30, so a displacement of 0
# addresses k_ipt.  Each data_word call below lists four values and the
# displacements advance by 0x10 per call.
63 | &set_label("_vpaes_consts",64); | ||
64 | $k_inv=-0x30; # inv, inva | ||
65 | &data_word(0x0D080180,0x0E05060F,0x0A0B0C02,0x04070309); | ||
66 | &data_word(0x0F0B0780,0x01040A06,0x02050809,0x030D0E0C); | ||
67 | |||
68 | $k_s0F=-0x10; # s0F | ||
69 | &data_word(0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F); | ||
70 | |||
71 | $k_ipt=0x00; # input transform (lo, hi) | ||
72 | &data_word(0x5A2A7000,0xC2B2E898,0x52227808,0xCABAE090); | ||
73 | &data_word(0x317C4D00,0x4C01307D,0xB0FDCC81,0xCD80B1FC); | ||
74 | |||
75 | $k_sb1=0x20; # sb1u, sb1t | ||
76 | &data_word(0xCB503E00,0xB19BE18F,0x142AF544,0xA5DF7A6E); | ||
77 | &data_word(0xFAE22300,0x3618D415,0x0D2ED9EF,0x3BF7CCC1); | ||
78 | $k_sb2=0x40; # sb2u, sb2t | ||
79 | &data_word(0x0B712400,0xE27A93C6,0xBC982FCD,0x5EB7E955); | ||
80 | &data_word(0x0AE12900,0x69EB8840,0xAB82234A,0xC2A163C8); | ||
81 | $k_sbo=0x60; # sbou, sbot | ||
82 | &data_word(0x6FBDC700,0xD0D26D17,0xC502A878,0x15AABF7A); | ||
83 | &data_word(0x5FBB6A00,0xCFE474A5,0x412B35FA,0x8E1E90D1); | ||
84 | |||
85 | $k_mc_forward=0x80; # mc_forward | ||
86 | &data_word(0x00030201,0x04070605,0x080B0A09,0x0C0F0E0D); | ||
87 | &data_word(0x04070605,0x080B0A09,0x0C0F0E0D,0x00030201); | ||
88 | &data_word(0x080B0A09,0x0C0F0E0D,0x00030201,0x04070605); | ||
89 | &data_word(0x0C0F0E0D,0x00030201,0x04070605,0x080B0A09); | ||
90 | |||
91 | $k_mc_backward=0xc0; # mc_backward | ||
92 | &data_word(0x02010003,0x06050407,0x0A09080B,0x0E0D0C0F); | ||
93 | &data_word(0x0E0D0C0F,0x02010003,0x06050407,0x0A09080B); | ||
94 | &data_word(0x0A09080B,0x0E0D0C0F,0x02010003,0x06050407); | ||
95 | &data_word(0x06050407,0x0A09080B,0x0E0D0C0F,0x02010003); | ||
96 | |||
97 | $k_sr=0x100; # sr | ||
98 | &data_word(0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C); | ||
99 | &data_word(0x0F0A0500,0x030E0904,0x07020D08,0x0B06010C); | ||
100 | &data_word(0x0B020900,0x0F060D04,0x030A0108,0x070E050C); | ||
101 | &data_word(0x070A0D00,0x0B0E0104,0x0F020508,0x0306090C); | ||
102 | |||
103 | $k_rcon=0x140; # rcon | ||
104 | &data_word(0xAF9DEEB6,0x1F8391B9,0x4D7C7D81,0x702A9808); | ||
105 | |||
106 | $k_s63=0x150; # s63: all equal to 0x63 transformed | ||
107 | &data_word(0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B); | ||
108 | |||
109 | $k_opt=0x160; # output transform | ||
110 | &data_word(0xD6B66000,0xFF9F4929,0xDEBE6808,0xF7974121); | ||
111 | &data_word(0x50BCEC00,0x01EDBD51,0xB05C0CE0,0xE10D5DB1); | ||
112 | |||
113 | $k_deskew=0x180; # deskew tables: inverts the sbox's "skew" | ||
114 | &data_word(0x47A4E300,0x07E4A340,0x5DBEF91A,0x1DFEB95A); | ||
115 | &data_word(0x83EA6900,0x5F36B5DC,0xF49D1E77,0x2841C2AB); | ||
116 | ## | ||
117 | ## Decryption stuff | ||
118 | ## Key schedule constants | ||
119 | ## | ||
# The four k_dks* tables are laid out contiguously, 0x20 apart;
# _vpaes_schedule_mangle walks them from k_dksd with fixed
# displacements 0x00..0x70, so their order must not change.
120 | $k_dksd=0x1a0; # decryption key schedule: invskew x*D | ||
121 | &data_word(0xA3E44700,0xFEB91A5D,0x5A1DBEF9,0x0740E3A4); | ||
122 | &data_word(0xB5368300,0x41C277F4,0xAB289D1E,0x5FDC69EA); | ||
123 | $k_dksb=0x1c0; # decryption key schedule: invskew x*B | ||
124 | &data_word(0x8550D500,0x9A4FCA1F,0x1CC94C99,0x03D65386); | ||
125 | &data_word(0xB6FC4A00,0x115BEDA7,0x7E3482C8,0xD993256F); | ||
126 | $k_dkse=0x1e0; # decryption key schedule: invskew x*E + 0x63 | ||
127 | &data_word(0x1FC9D600,0xD5031CCA,0x994F5086,0x53859A4C); | ||
128 | &data_word(0x4FDC7BE8,0xA2319605,0x20B31487,0xCD5EF96A); | ||
129 | $k_dks9=0x200; # decryption key schedule: invskew x*9 | ||
130 | &data_word(0x7ED9A700,0xB6116FC8,0x82255BFC,0x4AED9334); | ||
131 | &data_word(0x27143300,0x45765162,0xE9DAFDCE,0x8BB89FAC); | ||
132 | |||
133 | ## | ||
134 | ## Decryption stuff | ||
135 | ## Round function constants | ||
136 | ## | ||
# _vpaes_decrypt_core addresses k_dipt, k_dsb9, k_dsbb, k_dsbe and
# k_dsbo relative to k_dsbd, so these tables must also stay in this
# order and spacing.
137 | $k_dipt=0x220; # decryption input transform | ||
138 | &data_word(0x0B545F00,0x0F505B04,0x114E451A,0x154A411E); | ||
139 | &data_word(0x60056500,0x86E383E6,0xF491F194,0x12771772); | ||
140 | |||
141 | $k_dsb9=0x240; # decryption sbox output *9*u, *9*t | ||
142 | &data_word(0x9A86D600,0x851C0353,0x4F994CC9,0xCAD51F50); | ||
143 | &data_word(0xECD74900,0xC03B1789,0xB2FBA565,0x725E2C9E); | ||
144 | $k_dsbd=0x260; # decryption sbox output *D*u, *D*t | ||
145 | &data_word(0xE6B1A200,0x7D57CCDF,0x882A4439,0xF56E9B13); | ||
146 | &data_word(0x24C6CB00,0x3CE2FAF7,0x15DEEFD3,0x2931180D); | ||
147 | $k_dsbb=0x280; # decryption sbox output *B*u, *B*t | ||
148 | &data_word(0x96B44200,0xD0226492,0xB0F2D404,0x602646F6); | ||
149 | &data_word(0xCD596700,0xC19498A6,0x3255AA6B,0xF3FF0C3E); | ||
150 | $k_dsbe=0x2a0; # decryption sbox output *E*u, *E*t | ||
151 | &data_word(0x26D4D000,0x46F29296,0x64B4F6B0,0x22426004); | ||
152 | &data_word(0xFFAAC100,0x0C55A6CD,0x98593E32,0x9467F36B); | ||
153 | $k_dsbo=0x2c0; # decryption sbox final output | ||
154 | &data_word(0x7EF94000,0x1387EA53,0xD4943E2D,0xC7AA6DB9); | ||
155 | &data_word(0x93441D00,0x12D7560F,0xD8C58E9C,0xCA4B8159); | ||
156 | &asciz ("Vector Permutation AES for x86/SSSE3, Mike Hamburg (Stanford University)"); | ||
157 | &align (64); | ||
158 | |||
##
## _vpaes_preheat
##
## Finishes the position-independent address computation and loads the
## two constants the cipher cores keep resident.
##
## On entry $const holds "_vpaes_consts"+0x30 minus the address of the
## caller's "pic_point" label, which the callers place directly after
## the call to this routine.  Adding the dword at the top of the stack
## (the return address pushed by that call) turns $const into the
## address the $k_* displacements are relative to.
##
## Outputs: %xmm7 = k_inv table, %xmm6 = k_s0F mask, $const adjusted.
##
159 | &function_begin_B("_vpaes_preheat"); | ||
160 | &add ($const,&DWP(0,"esp")); | ||
161 | &movdqa ("xmm7",&QWP($k_inv,$const)); | ||
162 | &movdqa ("xmm6",&QWP($k_s0F,$const)); | ||
163 | &ret (); | ||
164 | &function_end_B("_vpaes_preheat"); | ||
165 | |||
166 | ## | ||
167 | ## _vpaes_encrypt_core | ||
168 | ## | ||
169 | ## AES-encrypt %xmm0. | ||
170 | ## | ||
171 | ## Inputs: | ||
172 | ## %xmm0 = input | ||
173 | ## %xmm6-%xmm7 as in _vpaes_preheat | ||
174 | ## (%edx) = scheduled keys | ||
175 | ## | ||
176 | ## Output in %xmm0 | ||
177 | ## Clobbers %xmm1-%xmm5, %eax, %ebx, %ecx, %edx | ||
178 | ## Reads the constant tables through %ebp, as set up by _vpaes_preheat. | ||
179 | ## The round count is read from offset 240 of the key schedule. | ||
180 | &function_begin_B("_vpaes_encrypt_core"); | ||
181 | &mov ($magic,16); | ||
182 | &mov ($round,&DWP(240,$key)); | ||
# Each instruction call is its own statement.  Without the terminating
# semicolon Perl would parse this call and the next one as a single
# bitwise-AND expression.
183 | &movdqa ("xmm1","xmm6"); | ||
184 | &movdqa ("xmm2",&QWP($k_ipt,$const)); | ||
185 | &pandn ("xmm1","xmm0"); | ||
186 | &movdqu ("xmm5",&QWP(0,$key)); | ||
187 | &psrld ("xmm1",4); | ||
188 | &pand ("xmm0","xmm6"); | ||
189 | &pshufb ("xmm2","xmm0"); | ||
190 | &movdqa ("xmm0",&QWP($k_ipt+16,$const)); | ||
191 | &pshufb ("xmm0","xmm1"); | ||
192 | &pxor ("xmm2","xmm5"); | ||
193 | &pxor ("xmm0","xmm2"); | ||
194 | &add ($key,16); | ||
# $base points at k_mc_backward so that the loop can reach k_mc_forward
# at -0x40 and k_sr at +0x40 from the same register.
195 | &lea ($base,&DWP($k_mc_backward,$const)); | ||
196 | &jmp (&label("enc_entry")); | ||
197 | |||
198 | |||
199 | &set_label("enc_loop",16); | ||
200 | # middle of middle round | ||
201 | &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sb1u | ||
202 | &pshufb ("xmm4","xmm2"); # 4 = sb1u | ||
203 | &pxor ("xmm4","xmm5"); # 4 = sb1u + k | ||
204 | &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t | ||
205 | &pshufb ("xmm0","xmm3"); # 0 = sb1t | ||
206 | &pxor ("xmm0","xmm4"); # 0 = A | ||
207 | &movdqa ("xmm5",&QWP($k_sb2,$const)); # 5 : sb2u | ||
208 | &pshufb ("xmm5","xmm2"); # 5 = sb2u | ||
209 | &movdqa ("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[] | ||
210 | &movdqa ("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t | ||
211 | &pshufb ("xmm2","xmm3"); # 2 = sb2t | ||
212 | &pxor ("xmm2","xmm5"); # 2 = 2A | ||
213 | &movdqa ("xmm4",&QWP(0,$base,$magic)); # .Lk_mc_backward[] | ||
214 | &movdqa ("xmm3","xmm0"); # 3 = A | ||
215 | &pshufb ("xmm0","xmm1"); # 0 = B | ||
216 | &add ($key,16); # next key | ||
217 | &pxor ("xmm0","xmm2"); # 0 = 2A+B | ||
218 | &pshufb ("xmm3","xmm4"); # 3 = D | ||
219 | &add ($magic,16); # next mc | ||
220 | &pxor ("xmm3","xmm0"); # 3 = 2A+B+D | ||
221 | &pshufb ("xmm0","xmm1"); # 0 = 2B+C | ||
222 | &and ($magic,0x30); # ... mod 4 | ||
223 | &pxor ("xmm0","xmm3"); # 0 = 2A+3B+C+D | ||
224 | &sub ($round,1); # nr-- | ||
225 | |||
226 | &set_label("enc_entry"); | ||
227 | # top of round | ||
228 | &movdqa ("xmm1","xmm6"); # 1 : i | ||
229 | &pandn ("xmm1","xmm0"); # 1 = i<<4 | ||
230 | &psrld ("xmm1",4); # 1 = i | ||
231 | &pand ("xmm0","xmm6"); # 0 = k | ||
232 | &movdqa ("xmm5",&QWP($k_inv+16,$const));# 5 : a/k | ||
233 | &pshufb ("xmm5","xmm0"); # 5 = a/k | ||
234 | &pxor ("xmm0","xmm1"); # 0 = j | ||
235 | &movdqa ("xmm3","xmm7"); # 3 : 1/i | ||
236 | &pshufb ("xmm3","xmm1"); # 3 = 1/i | ||
237 | &pxor ("xmm3","xmm5"); # 3 = iak = 1/i + a/k | ||
238 | &movdqa ("xmm4","xmm7"); # 4 : 1/j | ||
239 | &pshufb ("xmm4","xmm0"); # 4 = 1/j | ||
240 | &pxor ("xmm4","xmm5"); # 4 = jak = 1/j + a/k | ||
241 | &movdqa ("xmm2","xmm7"); # 2 : 1/iak | ||
242 | &pshufb ("xmm2","xmm3"); # 2 = 1/iak | ||
243 | &pxor ("xmm2","xmm0"); # 2 = io | ||
244 | &movdqa ("xmm3","xmm7"); # 3 : 1/jak | ||
245 | &movdqu ("xmm5",&QWP(0,$key)); | ||
246 | &pshufb ("xmm3","xmm4"); # 3 = 1/jak | ||
247 | &pxor ("xmm3","xmm1"); # 3 = jo | ||
# The branch consumes the zero flag left by "sub $round,1" above (or by
# "add $key,16" on the first pass); no general-purpose arithmetic
# instruction is issued in between.
248 | &jnz (&label("enc_loop")); | ||
249 | |||
250 | # middle of last round | ||
251 | &movdqa ("xmm4",&QWP($k_sbo,$const)); # 4 : sbou .Lk_sbo | ||
252 | &movdqa ("xmm0",&QWP($k_sbo+16,$const));# 0 : sbot .Lk_sbo+16 | ||
253 | &pshufb ("xmm4","xmm2"); # 4 = sbou | ||
254 | &pxor ("xmm4","xmm5"); # 4 = sbou + k | ||
255 | &pshufb ("xmm0","xmm3"); # 0 = sbot | ||
256 | &movdqa ("xmm1",&QWP(0x40,$base,$magic));# .Lk_sr[] | ||
257 | &pxor ("xmm0","xmm4"); # 0 = A | ||
258 | &pshufb ("xmm0","xmm1"); | ||
259 | &ret (); | ||
260 | &function_end_B("_vpaes_encrypt_core"); | ||
261 | |||
262 | ## | ||
263 | ## Decryption core | ||
264 | ## | ||
265 | ## Same API as encryption core. | ||
266 | ## | ||
# $base is pointed at k_dsbd; the other decryption tables (k_dipt,
# k_dsb9, k_dsbb, k_dsbe, k_dsbo) are reached with displacements
# relative to it.  $magic ends up holding the address of the k_sr entry
# applied after the last round.
267 | &function_begin_B("_vpaes_decrypt_core"); | ||
268 | &mov ($round,&DWP(240,$key)); | ||
269 | &lea ($base,&DWP($k_dsbd,$const)); | ||
270 | &movdqa ("xmm1","xmm6"); | ||
271 | &movdqa ("xmm2",&QWP($k_dipt-$k_dsbd,$base)); | ||
272 | &pandn ("xmm1","xmm0"); | ||
273 | &mov ($magic,$round); | ||
# Each instruction call is its own statement.  Without the terminating
# semicolon Perl would parse this call and the next one as a single
# bitwise-AND expression.
274 | &psrld ("xmm1",4); | ||
275 | &movdqu ("xmm5",&QWP(0,$key)); | ||
276 | &shl ($magic,4); | ||
277 | &pand ("xmm0","xmm6"); | ||
278 | &pshufb ("xmm2","xmm0"); | ||
279 | &movdqa ("xmm0",&QWP($k_dipt-$k_dsbd+16,$base)); | ||
280 | &xor ($magic,0x30); | ||
281 | &pshufb ("xmm0","xmm1"); | ||
282 | &and ($magic,0x30); | ||
283 | &pxor ("xmm2","xmm5"); | ||
284 | &movdqa ("xmm5",&QWP($k_mc_forward+48,$const)); | ||
285 | &pxor ("xmm0","xmm2"); | ||
286 | &add ($key,16); | ||
287 | &lea ($magic,&DWP($k_sr-$k_dsbd,$base,$magic)); | ||
288 | &jmp (&label("dec_entry")); | ||
289 | |||
290 | &set_label("dec_loop",16); | ||
291 | ## | ||
292 | ## Inverse mix columns | ||
293 | ## | ||
294 | &movdqa ("xmm4",&QWP(-0x20,$base)); # 4 : sb9u | ||
295 | &pshufb ("xmm4","xmm2"); # 4 = sb9u | ||
296 | &pxor ("xmm4","xmm0"); | ||
297 | &movdqa ("xmm0",&QWP(-0x10,$base)); # 0 : sb9t | ||
298 | &pshufb ("xmm0","xmm3"); # 0 = sb9t | ||
299 | &pxor ("xmm0","xmm4"); # 0 = ch | ||
300 | &add ($key,16); # next round key | ||
301 | |||
302 | &pshufb ("xmm0","xmm5"); # MC ch | ||
303 | &movdqa ("xmm4",&QWP(0,$base)); # 4 : sbdu | ||
304 | &pshufb ("xmm4","xmm2"); # 4 = sbdu | ||
305 | &pxor ("xmm4","xmm0"); # 4 = ch | ||
306 | &movdqa ("xmm0",&QWP(0x10,$base)); # 0 : sbdt | ||
307 | &pshufb ("xmm0","xmm3"); # 0 = sbdt | ||
308 | &pxor ("xmm0","xmm4"); # 0 = ch | ||
309 | &sub ($round,1); # nr-- | ||
310 | |||
311 | &pshufb ("xmm0","xmm5"); # MC ch | ||
312 | &movdqa ("xmm4",&QWP(0x20,$base)); # 4 : sbbu | ||
313 | &pshufb ("xmm4","xmm2"); # 4 = sbbu | ||
314 | &pxor ("xmm4","xmm0"); # 4 = ch | ||
315 | &movdqa ("xmm0",&QWP(0x30,$base)); # 0 : sbbt | ||
316 | &pshufb ("xmm0","xmm3"); # 0 = sbbt | ||
317 | &pxor ("xmm0","xmm4"); # 0 = ch | ||
318 | |||
319 | &pshufb ("xmm0","xmm5"); # MC ch | ||
320 | &movdqa ("xmm4",&QWP(0x40,$base)); # 4 : sbeu | ||
321 | &pshufb ("xmm4","xmm2"); # 4 = sbeu | ||
322 | &pxor ("xmm4","xmm0"); # 4 = ch | ||
323 | &movdqa ("xmm0",&QWP(0x50,$base)); # 0 : sbet | ||
324 | &pshufb ("xmm0","xmm3"); # 0 = sbet | ||
325 | &pxor ("xmm0","xmm4"); # 0 = ch | ||
326 | |||
327 | &palignr("xmm5","xmm5",12); | ||
328 | |||
329 | &set_label("dec_entry"); | ||
330 | # top of round | ||
331 | &movdqa ("xmm1","xmm6"); # 1 : i | ||
332 | &pandn ("xmm1","xmm0"); # 1 = i<<4 | ||
333 | &psrld ("xmm1",4); # 1 = i | ||
334 | &pand ("xmm0","xmm6"); # 0 = k | ||
335 | &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k | ||
336 | &pshufb ("xmm2","xmm0"); # 2 = a/k | ||
337 | &pxor ("xmm0","xmm1"); # 0 = j | ||
338 | &movdqa ("xmm3","xmm7"); # 3 : 1/i | ||
339 | &pshufb ("xmm3","xmm1"); # 3 = 1/i | ||
340 | &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k | ||
341 | &movdqa ("xmm4","xmm7"); # 4 : 1/j | ||
342 | &pshufb ("xmm4","xmm0"); # 4 = 1/j | ||
343 | &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k | ||
344 | &movdqa ("xmm2","xmm7"); # 2 : 1/iak | ||
345 | &pshufb ("xmm2","xmm3"); # 2 = 1/iak | ||
346 | &pxor ("xmm2","xmm0"); # 2 = io | ||
347 | &movdqa ("xmm3","xmm7"); # 3 : 1/jak | ||
348 | &pshufb ("xmm3","xmm4"); # 3 = 1/jak | ||
349 | &pxor ("xmm3","xmm1"); # 3 = jo | ||
350 | &movdqu ("xmm0",&QWP(0,$key)); | ||
# The branch consumes the zero flag left by "sub $round,1" in the loop
# body (or by "add $key,16" on the first pass); no general-purpose
# arithmetic instruction is issued in between.
351 | &jnz (&label("dec_loop")); | ||
352 | |||
353 | # middle of last round | ||
354 | &movdqa ("xmm4",&QWP(0x60,$base)); # 4 : sbou | ||
355 | &pshufb ("xmm4","xmm2"); # 4 = sbou | ||
356 | &pxor ("xmm4","xmm0"); # 4 = sbou + k | ||
357 | &movdqa ("xmm0",&QWP(0x70,$base)); # 0 : sbot | ||
358 | &movdqa ("xmm2",&QWP(0,$magic)); | ||
359 | &pshufb ("xmm0","xmm3"); # 0 = sbot | ||
360 | &pxor ("xmm0","xmm4"); # 0 = A | ||
361 | &pshufb ("xmm0","xmm2"); | ||
362 | &ret (); | ||
363 | &function_end_B("_vpaes_decrypt_core"); | ||
364 | |||
365 | ######################################################## | ||
366 | ## ## | ||
367 | ## AES key schedule ## | ||
368 | ## ## | ||
369 | ######################################################## | ||
##
## _vpaes_schedule_core
##
## Expands a user key into a round-key schedule.
##
## Register contract, as established by the set_encrypt_key and
## set_decrypt_key wrappers below:
##   $inp   = user key
##   $round = key length in bits (compared against 192 to pick a path)
##   $key   = where the first round key is written
##   $out   = zero when encrypting, non-zero when decrypting
##   $magic = byte offset into the k_sr table
##   $const = "_vpaes_consts"+0x30 minus the caller's "pic_point";
##            completed here by adding the return address
##
## The wrappers reserve a 16-byte-aligned scratch area on the stack
## before calling.  Because the call pushes a 4-byte return address,
## that area starts at 4(%esp) here: the first 16 bytes hold the rcon
## value and the next 16 bytes (20(%esp)) are a spill slot for %xmm7.
##
## All eight xmm registers are zeroed before returning.
##
370 | &function_begin_B("_vpaes_schedule_core"); | ||
371 | &add ($const,&DWP(0,"esp")); | ||
372 | &movdqu ("xmm0",&QWP(0,$inp)); # load key (unaligned) | ||
373 | &movdqa ("xmm2",&QWP($k_rcon,$const)); # load rcon | ||
374 | |||
375 | # input transform | ||
376 | &movdqa ("xmm3","xmm0"); | ||
377 | &lea ($base,&DWP($k_ipt,$const)); | ||
378 | &movdqa (&QWP(4,"esp"),"xmm2"); # rcon kept in the stack slot called "xmm8" | ||
379 | &call ("_vpaes_schedule_transform"); | ||
380 | &movdqa ("xmm7","xmm0"); | ||
381 | |||
382 | &test ($out,$out); | ||
383 | &jnz (&label("schedule_am_decrypting")); | ||
384 | |||
385 | # encrypting, output zeroth round key after transform | ||
386 | &movdqu (&QWP(0,$key),"xmm0"); | ||
387 | &jmp (&label("schedule_go")); | ||
388 | |||
389 | &set_label("schedule_am_decrypting"); | ||
390 | # decrypting, output zeroth round key after shiftrows | ||
391 | &movdqa ("xmm1",&QWP($k_sr,$const,$magic)); | ||
392 | &pshufb ("xmm3","xmm1"); | ||
393 | &movdqu (&QWP(0,$key),"xmm3"); | ||
394 | &xor ($magic,0x30); | ||
395 | |||
396 | &set_label("schedule_go"); | ||
397 | &cmp ($round,192); | ||
398 | &ja (&label("schedule_256")); | ||
399 | &je (&label("schedule_192")); | ||
400 | # 128: fall through | ||
401 | |||
402 | ## | ||
403 | ## .schedule_128 | ||
404 | ## | ||
405 | ## 128-bit specific part of key schedule. | ||
406 | ## | ||
407 | ## This schedule is really simple, because all its parts | ||
408 | ## are accomplished by the subroutines. | ||
409 | ## | ||
410 | &set_label("schedule_128"); | ||
411 | &mov ($round,10); | ||
412 | |||
413 | &set_label("loop_schedule_128"); | ||
414 | &call ("_vpaes_schedule_round"); | ||
415 | &dec ($round); | ||
416 | &jz (&label("schedule_mangle_last")); | ||
417 | &call ("_vpaes_schedule_mangle"); # write output | ||
418 | &jmp (&label("loop_schedule_128")); | ||
419 | |||
420 | ## | ||
421 | ## .aes_schedule_192 | ||
422 | ## | ||
423 | ## 192-bit specific part of key schedule. | ||
424 | ## | ||
425 | ## The main body of this schedule is the same as the 128-bit | ||
426 | ## schedule, but with more smearing. The long, high side is | ||
427 | ## stored in %xmm7 as before, and the short, low side is in | ||
428 | ## the high bits of %xmm6. | ||
429 | ## | ||
430 | ## This schedule is somewhat nastier, however, because each | ||
431 | ## round produces 192 bits of key material, or 1.5 round keys. | ||
432 | ## Therefore, on each cycle we do 2 rounds and produce 3 round | ||
433 | ## keys. | ||
434 | ## | ||
435 | &set_label("schedule_192",16); | ||
436 | &movdqu ("xmm0",&QWP(8,$inp)); # load key part 2 (very unaligned) | ||
437 | &call ("_vpaes_schedule_transform"); # input transform | ||
438 | &movdqa ("xmm6","xmm0"); # save short part | ||
439 | &pxor ("xmm4","xmm4"); # clear 4 | ||
440 | &movhlps("xmm6","xmm4"); # clobber low side with zeros | ||
441 | &mov ($round,4); | ||
442 | |||
443 | &set_label("loop_schedule_192"); | ||
444 | &call ("_vpaes_schedule_round"); | ||
445 | &palignr("xmm0","xmm6",8); | ||
446 | &call ("_vpaes_schedule_mangle"); # save key n | ||
447 | &call ("_vpaes_schedule_192_smear"); | ||
448 | &call ("_vpaes_schedule_mangle"); # save key n+1 | ||
449 | &call ("_vpaes_schedule_round"); | ||
450 | &dec ($round); | ||
451 | &jz (&label("schedule_mangle_last")); | ||
452 | &call ("_vpaes_schedule_mangle"); # save key n+2 | ||
453 | &call ("_vpaes_schedule_192_smear"); | ||
454 | &jmp (&label("loop_schedule_192")); | ||
455 | |||
456 | ## | ||
457 | ## .aes_schedule_256 | ||
458 | ## | ||
459 | ## 256-bit specific part of key schedule. | ||
460 | ## | ||
461 | ## The structure here is very similar to the 128-bit | ||
462 | ## schedule, but with an additional "low side" in | ||
463 | ## %xmm6. The low side's rounds are the same as the | ||
464 | ## high side's, except no rcon and no rotation. | ||
465 | ## | ||
466 | &set_label("schedule_256",16); | ||
467 | &movdqu ("xmm0",&QWP(16,$inp)); # load key part 2 (unaligned) | ||
468 | &call ("_vpaes_schedule_transform"); # input transform | ||
469 | &mov ($round,7); | ||
470 | |||
471 | &set_label("loop_schedule_256"); | ||
472 | &call ("_vpaes_schedule_mangle"); # output low result | ||
473 | &movdqa ("xmm6","xmm0"); # save cur_lo in xmm6 | ||
474 | |||
475 | # high round | ||
476 | &call ("_vpaes_schedule_round"); | ||
477 | &dec ($round); | ||
478 | &jz (&label("schedule_mangle_last")); | ||
479 | &call ("_vpaes_schedule_mangle"); | ||
480 | |||
481 | # low round. swap xmm7 and xmm6 | ||
482 | &pshufd ("xmm0","xmm0",0xFF); | ||
483 | &movdqa (&QWP(20,"esp"),"xmm7"); | ||
484 | &movdqa ("xmm7","xmm6"); | ||
485 | &call ("_vpaes_schedule_low_round"); | ||
486 | &movdqa ("xmm7",&QWP(20,"esp")); | ||
487 | |||
488 | &jmp (&label("loop_schedule_256")); | ||
489 | |||
490 | ## | ||
491 | ## .aes_schedule_mangle_last | ||
492 | ## | ||
493 | ## Mangler for last round of key schedule | ||
494 | ## Mangles %xmm0 | ||
495 | ## when encrypting, outputs out(%xmm0) ^ 63 | ||
496 | ## when decrypting, outputs unskew(%xmm0) | ||
497 | ## | ||
498 | ## Always called right before return... jumps to cleanup and exits | ||
499 | ## | ||
500 | &set_label("schedule_mangle_last",16); | ||
501 | # schedule last round key from xmm0 | ||
502 | &lea ($base,&DWP($k_deskew,$const)); | ||
503 | &test ($out,$out); | ||
504 | &jnz (&label("schedule_mangle_last_dec")); | ||
505 | |||
506 | # encrypting | ||
507 | &movdqa ("xmm1",&QWP($k_sr,$const,$magic)); | ||
508 | &pshufb ("xmm0","xmm1"); # output permute | ||
509 | &lea ($base,&DWP($k_opt,$const)); # prepare to output transform | ||
# +32 here combined with the unconditional -16 below advances $key by
# 16 when encrypting and moves it back by 16 when decrypting.
510 | &add ($key,32); | ||
511 | |||
512 | &set_label("schedule_mangle_last_dec"); | ||
513 | &add ($key,-16); | ||
514 | &pxor ("xmm0",&QWP($k_s63,$const)); | ||
515 | &call ("_vpaes_schedule_transform"); # output transform | ||
516 | &movdqu (&QWP(0,$key),"xmm0"); # save last key | ||
517 | |||
518 | # cleanup | ||
519 | &pxor ("xmm0","xmm0"); | ||
520 | &pxor ("xmm1","xmm1"); | ||
521 | &pxor ("xmm2","xmm2"); | ||
522 | &pxor ("xmm3","xmm3"); | ||
523 | &pxor ("xmm4","xmm4"); | ||
524 | &pxor ("xmm5","xmm5"); | ||
525 | &pxor ("xmm6","xmm6"); | ||
526 | &pxor ("xmm7","xmm7"); | ||
527 | &ret (); | ||
528 | &function_end_B("_vpaes_schedule_core"); | ||
529 | |||
530 | ## | ||
531 | ## .aes_schedule_192_smear | ||
532 | ## | ||
533 | ## Smear the short, low side in the 192-bit key schedule. | ||
534 | ## | ||
535 | ## Inputs: | ||
536 | ## %xmm7: high side, b a x y | ||
537 | ## %xmm6: low side, d c 0 0 | ||
538 | ## (the zero needed below is produced locally in %xmm1) | ||
539 | ## | ||
540 | ## Outputs: | ||
541 | ## %xmm6: b+c+d b+c 0 0 | ||
542 | ## %xmm0: b+c+d b+c b a | ||
543 | ## Clobbers %xmm1. | ||
544 | &function_begin_B("_vpaes_schedule_192_smear"); | ||
545 | &pshufd ("xmm0","xmm6",0x80); # d c 0 0 -> c 0 0 0 | ||
546 | &pxor ("xmm6","xmm0"); # -> c+d c 0 0 | ||
547 | &pshufd ("xmm0","xmm7",0xFE); # b a _ _ -> b b b a | ||
548 | &pxor ("xmm6","xmm0"); # -> b+c+d b+c b a | ||
549 | &movdqa ("xmm0","xmm6"); | ||
550 | &pxor ("xmm1","xmm1"); | ||
551 | &movhlps("xmm6","xmm1"); # clobber low side with zeros | ||
552 | &ret (); | ||
553 | &function_end_B("_vpaes_schedule_192_smear"); | ||
554 | |||
555 | ## | ||
556 | ## .aes_schedule_round | ||
557 | ## | ||
558 | ## Runs one main round of the key schedule on %xmm0, %xmm7 | ||
559 | ## | ||
560 | ## Specifically, runs subbytes on the high dword of %xmm0 | ||
561 | ## then rotates it by one byte and xors into the low dword of | ||
562 | ## %xmm7. | ||
563 | ## | ||
564 | ## Adds rcon from "%xmm8" (here the 16-byte stack slot at 8(%esp)), | ||
565 | ## then rotates that slot for next rcon. | ||
566 | ## | ||
567 | ## Smears the dwords of %xmm7 by xoring the low into the | ||
568 | ## second low, result into third, result into highest. | ||
569 | ## | ||
570 | ## Returns results in %xmm7 = %xmm0. | ||
571 | ## Clobbers %xmm1-%xmm5. | ||
572 | ## | ||
# The rcon slot is the one _vpaes_schedule_core writes at 4(%esp); it
# is 8(%esp) here because the call into this routine pushed one more
# return address.  The code from the _vpaes_schedule_low_round label
# onwards does not reference %esp, so that label can also be called
# directly.
573 | &function_begin_B("_vpaes_schedule_round"); | ||
574 | # extract rcon from the "xmm8" stack slot | ||
575 | &movdqa ("xmm2",&QWP(8,"esp")); # load "xmm8" | ||
576 | &pxor ("xmm1","xmm1"); | ||
577 | &palignr("xmm1","xmm2",15); | ||
578 | &palignr("xmm2","xmm2",15); | ||
579 | &pxor ("xmm7","xmm1"); | ||
580 | |||
581 | # rotate | ||
582 | &pshufd ("xmm0","xmm0",0xFF); | ||
583 | &palignr("xmm0","xmm0",1); | ||
584 | |||
585 | # fall through... | ||
586 | &movdqa (&QWP(8,"esp"),"xmm2"); # store rotated "xmm8" back | ||
587 | |||
588 | # low round: same as high round, but no rotation and no rcon. | ||
589 | &set_label("_vpaes_schedule_low_round"); | ||
590 | # smear xmm7 | ||
591 | &movdqa ("xmm1","xmm7"); | ||
592 | &pslldq ("xmm7",4); | ||
593 | &pxor ("xmm7","xmm1"); | ||
594 | &movdqa ("xmm1","xmm7"); | ||
595 | &pslldq ("xmm7",8); | ||
596 | &pxor ("xmm7","xmm1"); | ||
597 | &pxor ("xmm7",&QWP($k_s63,$const)); | ||
598 | |||
599 | # subbyte | ||
600 | &movdqa ("xmm4",&QWP($k_s0F,$const)); | ||
601 | &movdqa ("xmm5",&QWP($k_inv,$const)); # 5 : k_inv table | ||
602 | &movdqa ("xmm1","xmm4"); | ||
603 | &pandn ("xmm1","xmm0"); | ||
604 | &psrld ("xmm1",4); # 1 = i | ||
605 | &pand ("xmm0","xmm4"); # 0 = k | ||
606 | &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k | ||
607 | &pshufb ("xmm2","xmm0"); # 2 = a/k | ||
608 | &pxor ("xmm0","xmm1"); # 0 = j | ||
609 | &movdqa ("xmm3","xmm5"); # 3 : 1/i | ||
610 | &pshufb ("xmm3","xmm1"); # 3 = 1/i | ||
611 | &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k | ||
612 | &movdqa ("xmm4","xmm5"); # 4 : 1/j | ||
613 | &pshufb ("xmm4","xmm0"); # 4 = 1/j | ||
614 | &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k | ||
615 | &movdqa ("xmm2","xmm5"); # 2 : 1/iak | ||
616 | &pshufb ("xmm2","xmm3"); # 2 = 1/iak | ||
617 | &pxor ("xmm2","xmm0"); # 2 = io | ||
618 | &movdqa ("xmm3","xmm5"); # 3 : 1/jak | ||
619 | &pshufb ("xmm3","xmm4"); # 3 = 1/jak | ||
620 | &pxor ("xmm3","xmm1"); # 3 = jo | ||
621 | &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sb1u | ||
622 | &pshufb ("xmm4","xmm2"); # 4 = sb1u | ||
623 | &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t | ||
624 | &pshufb ("xmm0","xmm3"); # 0 = sb1t | ||
625 | &pxor ("xmm0","xmm4"); # 0 = sbox output | ||
626 | |||
627 | # add in smeared stuff | ||
628 | &pxor ("xmm0","xmm7"); | ||
629 | &movdqa ("xmm7","xmm0"); | ||
630 | &ret (); | ||
631 | &function_end_B("_vpaes_schedule_round"); | ||
632 | |||
633 | ## | ||
634 | ## .aes_schedule_transform | ||
635 | ## | ||
636 | ## Linear-transform %xmm0 according to tables at (%ebx) | ||
637 | ## | ||
638 | ## Output in %xmm0 | ||
639 | ## Clobbers %xmm1, %xmm2 | ||
640 | ## Reads the k_s0F mask through $const (%ebp). | ||
# The input is split with the k_s0F mask into two halves; each half
# indexes one 16-byte table (0(%ebx) and 16(%ebx) respectively) via
# pshufb, and the two lookups are xored together.
641 | &function_begin_B("_vpaes_schedule_transform"); | ||
642 | &movdqa ("xmm2",&QWP($k_s0F,$const)); | ||
643 | &movdqa ("xmm1","xmm2"); | ||
644 | &pandn ("xmm1","xmm0"); | ||
645 | &psrld ("xmm1",4); | ||
646 | &pand ("xmm0","xmm2"); | ||
647 | &movdqa ("xmm2",&QWP(0,$base)); | ||
648 | &pshufb ("xmm2","xmm0"); | ||
649 | &movdqa ("xmm0",&QWP(16,$base)); | ||
650 | &pshufb ("xmm0","xmm1"); | ||
651 | &pxor ("xmm0","xmm2"); | ||
652 | &ret (); | ||
653 | &function_end_B("_vpaes_schedule_transform"); | ||
654 | |||
655 | ## | ||
656 | ## .aes_schedule_mangle | ||
657 | ## | ||
658 | ## Mangle xmm0 from (basis-transformed) standard version | ||
659 | ## to our version. | ||
660 | ## | ||
661 | ## On encrypt, | ||
662 | ## xor with 0x63 | ||
663 | ## multiply by circulant 0,1,1,1 | ||
664 | ## apply shiftrows transform | ||
665 | ## | ||
666 | ## On decrypt, | ||
667 | ## xor with 0x63 | ||
668 | ## multiply by "inverse mixcolumns" circulant E,B,D,9 | ||
669 | ## deskew | ||
670 | ## apply shiftrows transform | ||
671 | ## | ||
672 | ## Encrypt or decrypt is selected by %edi (zero means encrypt). | ||
673 | ## Writes out to (%edx), and increments or decrements it | ||
674 | ## Keeps track of round number mod 4 in %ecx | ||
675 | ## Preserves xmm0 | ||
676 | ## Clobbers xmm1-xmm5; the decrypt path also overwrites %esi | ||
677 | ## | ||
678 | &function_begin_B("_vpaes_schedule_mangle"); | ||
679 | &movdqa ("xmm4","xmm0"); # save xmm0 for later | ||
680 | &movdqa ("xmm5",&QWP($k_mc_forward,$const)); | ||
681 | &test ($out,$out); | ||
682 | &jnz (&label("schedule_mangle_dec")); | ||
683 | |||
684 | # encrypting | ||
685 | &add ($key,16); | ||
686 | &pxor ("xmm4",&QWP($k_s63,$const)); | ||
687 | &pshufb ("xmm4","xmm5"); | ||
688 | &movdqa ("xmm3","xmm4"); | ||
689 | &pshufb ("xmm4","xmm5"); | ||
690 | &pxor ("xmm3","xmm4"); | ||
691 | &pshufb ("xmm4","xmm5"); | ||
692 | &pxor ("xmm3","xmm4"); | ||
693 | |||
694 | &jmp (&label("schedule_mangle_both")); | ||
695 | |||
696 | &set_label("schedule_mangle_dec",16); | ||
697 | # inverse mix columns | ||
698 | &movdqa ("xmm2",&QWP($k_s0F,$const)); | ||
# $inp is reused as a table pointer from here on: the eight lookups
# below use displacements 0x00..0x70 from k_dksd, which covers the
# k_dksd, k_dksb, k_dkse and k_dks9 tables in that order.
699 | &lea ($inp,&DWP($k_dksd,$const)); | ||
700 | &movdqa ("xmm1","xmm2"); | ||
701 | &pandn ("xmm1","xmm4"); | ||
702 | &psrld ("xmm1",4); # 1 = hi | ||
703 | &pand ("xmm4","xmm2"); # 4 = lo | ||
704 | |||
705 | &movdqa ("xmm2",&QWP(0,$inp)); | ||
706 | &pshufb ("xmm2","xmm4"); | ||
707 | &movdqa ("xmm3",&QWP(0x10,$inp)); | ||
708 | &pshufb ("xmm3","xmm1"); | ||
709 | &pxor ("xmm3","xmm2"); | ||
710 | &pshufb ("xmm3","xmm5"); | ||
711 | |||
712 | &movdqa ("xmm2",&QWP(0x20,$inp)); | ||
713 | &pshufb ("xmm2","xmm4"); | ||
714 | &pxor ("xmm2","xmm3"); | ||
715 | &movdqa ("xmm3",&QWP(0x30,$inp)); | ||
716 | &pshufb ("xmm3","xmm1"); | ||
717 | &pxor ("xmm3","xmm2"); | ||
718 | &pshufb ("xmm3","xmm5"); | ||
719 | |||
720 | &movdqa ("xmm2",&QWP(0x40,$inp)); | ||
721 | &pshufb ("xmm2","xmm4"); | ||
722 | &pxor ("xmm2","xmm3"); | ||
723 | &movdqa ("xmm3",&QWP(0x50,$inp)); | ||
724 | &pshufb ("xmm3","xmm1"); | ||
725 | &pxor ("xmm3","xmm2"); | ||
726 | &pshufb ("xmm3","xmm5"); | ||
727 | |||
728 | &movdqa ("xmm2",&QWP(0x60,$inp)); | ||
729 | &pshufb ("xmm2","xmm4"); | ||
730 | &pxor ("xmm2","xmm3"); | ||
731 | &movdqa ("xmm3",&QWP(0x70,$inp)); | ||
732 | &pshufb ("xmm3","xmm1"); | ||
733 | &pxor ("xmm3","xmm2"); | ||
734 | |||
735 | &add ($key,-16); | ||
736 | |||
737 | &set_label("schedule_mangle_both"); | ||
738 | &movdqa ("xmm1",&QWP($k_sr,$const,$magic)); | ||
739 | &pshufb ("xmm3","xmm1"); | ||
# Step $magic back by one 16-byte k_sr entry, wrapping within the four
# entries of that table.
740 | &add ($magic,-16); | ||
741 | &and ($magic,0x30); | ||
742 | &movdqu (&QWP(0,$key),"xmm3"); | ||
743 | &ret (); | ||
744 | &function_end_B("_vpaes_schedule_mangle"); | ||
745 | |||
746 | # | ||
747 | # Interface to OpenSSL | ||
748 | # | ||
#
# ${PREFIX}_set_encrypt_key(inp, bits, key)
#
# Builds the encryption key schedule for the user key at "inp" of
# length "bits" into "key".  Stores bits/32+5 at offset 240 of "key"
# and returns 0 in %eax.
#
# A scratch area is carved out below the current stack pointer and
# aligned down to 16 bytes; the previous %esp is saved at 48(%esp) and
# restored before returning.  _vpaes_schedule_core uses the bottom of
# that area for aligned 16-byte temporaries.
#
749 | &function_begin("${PREFIX}_set_encrypt_key"); | ||
750 | &mov ($inp,&wparam(0)); # inp | ||
751 | &lea ($base,&DWP(-56,"esp")); | ||
752 | &mov ($round,&wparam(1)); # bits | ||
753 | &and ($base,-16); | ||
754 | &mov ($key,&wparam(2)); # key | ||
755 | &xchg ($base,"esp"); # alloca | ||
756 | &mov (&DWP(48,"esp"),$base); | ||
757 | |||
758 | &mov ($base,$round); | ||
759 | &shr ($base,5); | ||
760 | &add ($base,5); | ||
761 | &mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5; | ||
762 | &mov ($magic,0x30); | ||
763 | &mov ($out,0); | ||
764 | |||
# $const is loaded with a link-time difference; the callee adds its
# return address (the address of "pic_point") to obtain the run-time
# address of "_vpaes_consts"+0x30.
765 | &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point"))); | ||
766 | &call ("_vpaes_schedule_core"); | ||
767 | &set_label("pic_point"); | ||
768 | |||
769 | &mov ("esp",&DWP(48,"esp")); | ||
770 | &xor ("eax","eax"); | ||
771 | &function_end("${PREFIX}_set_encrypt_key"); | ||
772 | |||
#
# ${PREFIX}_set_decrypt_key(inp, bits, key)
#
# Builds the decryption key schedule for the user key at "inp" of
# length "bits" into "key".  Stores bits/32+5 at offset 240 of "key"
# and returns 0 in %eax.
#
# Unlike the encrypt variant, $key is advanced to key+16+16*rounds
# before the schedule core is called, and $out is set to 1, which makes
# the schedule routines step $key downwards as they write round keys.
#
# Stack handling is the same as in the encrypt variant: an aligned
# scratch area is reserved and the previous %esp is kept at 48(%esp).
#
773 | &function_begin("${PREFIX}_set_decrypt_key"); | ||
774 | &mov ($inp,&wparam(0)); # inp | ||
775 | &lea ($base,&DWP(-56,"esp")); | ||
776 | &mov ($round,&wparam(1)); # bits | ||
777 | &and ($base,-16); | ||
778 | &mov ($key,&wparam(2)); # key | ||
779 | &xchg ($base,"esp"); # alloca | ||
780 | &mov (&DWP(48,"esp"),$base); | ||
781 | |||
782 | &mov ($base,$round); | ||
783 | &shr ($base,5); | ||
784 | &add ($base,5); | ||
785 | &mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5; | ||
786 | &shl ($base,4); | ||
787 | &lea ($key,&DWP(16,$key,$base)); | ||
788 | |||
789 | &mov ($out,1); | ||
790 | &mov ($magic,$round); | ||
791 | &shr ($magic,1); | ||
792 | &and ($magic,32); | ||
793 | &xor ($magic,32); # nbits==192?0:32; | ||
794 | |||
# $const is loaded with a link-time difference; the callee adds its
# return address (the address of "pic_point") to obtain the run-time
# address of "_vpaes_consts"+0x30.
795 | &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point"))); | ||
796 | &call ("_vpaes_schedule_core"); | ||
797 | &set_label("pic_point"); | ||
798 | |||
799 | &mov ("esp",&DWP(48,"esp")); | ||
800 | &xor ("eax","eax"); | ||
801 | &function_end("${PREFIX}_set_decrypt_key"); | ||
802 | |||
# ${PREFIX}_encrypt(inp, out, key)
#
# Generates the single-block encryption entry point: 16 bytes are loaded
# from inp into xmm0 with an unaligned load, _vpaes_encrypt_core is called,
# and xmm0 is stored to out with an unaligned store.
803 | &function_begin("${PREFIX}_encrypt"); | ||
# Position-independent addressing: $const receives the distance from
# pic_point (the label placed right after the call, i.e. its return
# address) to _vpaes_consts+0x30.  Presumably _vpaes_preheat adds its
# return address to form an absolute pointer and preloads constants --
# confirm in the callee.
804 | &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point"))); | ||
805 | &call ("_vpaes_preheat"); | ||
806 | &set_label("pic_point"); | ||
# Argument loads are interleaved with the "alloca": $base = (esp-56) & -16
# gives a 16-byte-aligned scratch frame of at least 56 bytes, xchg makes it
# the new esp, and the previous esp is parked at offset 48 of that frame.
807 | &mov ($inp,&wparam(0)); # inp | ||
808 | &lea ($base,&DWP(-56,"esp")); | ||
809 | &mov ($out,&wparam(1)); # out | ||
810 | &and ($base,-16); | ||
811 | &mov ($key,&wparam(2)); # key | ||
812 | &xchg ($base,"esp"); # alloca | ||
813 | &mov (&DWP(48,"esp"),$base); | ||
814 | |||
# One block in, one block out; xmm0 carries the data across the call.
815 | &movdqu ("xmm0",&QWP(0,$inp)); | ||
816 | &call ("_vpaes_encrypt_core"); | ||
817 | &movdqu (&QWP(0,$out),"xmm0"); | ||
818 | |||
# Restore the caller's esp saved at offset 48 of the scratch frame.
819 | &mov ("esp",&DWP(48,"esp")); | ||
820 | &function_end("${PREFIX}_encrypt"); | ||
821 | |||
# ${PREFIX}_decrypt(inp, out, key)
#
# Generates the single-block decryption entry point: 16 bytes are loaded
# from inp into xmm0 with an unaligned load, _vpaes_decrypt_core is called,
# and xmm0 is stored to out with an unaligned store.
822 | &function_begin("${PREFIX}_decrypt"); | ||
# Position-independent addressing: $const receives the distance from
# pic_point (the label placed right after the call, i.e. its return
# address) to _vpaes_consts+0x30.  Presumably _vpaes_preheat adds its
# return address to form an absolute pointer and preloads constants --
# confirm in the callee.
823 | &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point"))); | ||
824 | &call ("_vpaes_preheat"); | ||
825 | &set_label("pic_point"); | ||
# Argument loads are interleaved with the "alloca": $base = (esp-56) & -16
# gives a 16-byte-aligned scratch frame of at least 56 bytes, xchg makes it
# the new esp, and the previous esp is parked at offset 48 of that frame.
826 | &mov ($inp,&wparam(0)); # inp | ||
827 | &lea ($base,&DWP(-56,"esp")); | ||
828 | &mov ($out,&wparam(1)); # out | ||
829 | &and ($base,-16); | ||
830 | &mov ($key,&wparam(2)); # key | ||
831 | &xchg ($base,"esp"); # alloca | ||
832 | &mov (&DWP(48,"esp"),$base); | ||
833 | |||
# One block in, one block out; xmm0 carries the data across the call.
834 | &movdqu ("xmm0",&QWP(0,$inp)); | ||
835 | &call ("_vpaes_decrypt_core"); | ||
836 | &movdqu (&QWP(0,$out),"xmm0"); | ||
837 | |||
# Restore the caller's esp saved at offset 48 of the scratch frame.
838 | &mov ("esp",&DWP(48,"esp")); | ||
839 | &function_end("${PREFIX}_decrypt"); | ||
840 | |||
# ${PREFIX}_cbc_encrypt(inp, out, len, key, ivp, enc)
#
# Generates the CBC entry point.  Only whole 16-byte blocks are processed:
# a call with len < 16 returns immediately without touching out or ivp,
# and any trailing len % 16 bytes are ignored.  enc == 0 selects the
# decrypt loop, any other value the encrypt loop.  On completion the last
# chaining value (xmm1) is written back through ivp.
#
# Scratch frame layout (16-byte aligned, relative to the new esp):
#    0  out - inp (so output is addressed as inp + delta)
#    4  key
#    8  ivp
#   16  current IV           (decrypt loop only)
#   32  current ciphertext   (decrypt loop only; becomes the next IV)
#   48  caller's esp
841 | &function_begin("${PREFIX}_cbc_encrypt"); | ||
842 | &mov ($inp,&wparam(0)); # inp | ||
843 | &mov ($out,&wparam(1)); # out | ||
844 | &mov ($round,&wparam(2)); # len | ||
845 | &mov ($key,&wparam(3)); # key | ||
# Pre-decrement len by one block; a borrow means len < 16, so bail out
# before the stack is rearranged.
846 | &sub ($round,16); | ||
847 | &jc (&label("cbc_abort")); | ||
# "alloca": $base = (esp-56) & -16 becomes the new, 16-byte-aligned esp
# (the alignment is what allows the movdqa spills in the decrypt loop).
848 | &lea ($base,&DWP(-56,"esp")); | ||
849 | &mov ($const,&wparam(4)); # ivp | ||
850 | &and ($base,-16); | ||
851 | &mov ($magic,&wparam(5)); # enc | ||
852 | &xchg ($base,"esp"); # alloca | ||
853 | &movdqu ("xmm1",&QWP(0,$const)); # load IV | ||
854 | &sub ($out,$inp); # out becomes the out-inp delta | ||
855 | &mov (&DWP(48,"esp"),$base); # save caller's esp | ||
856 | |||
857 | &mov (&DWP(0,"esp"),$out); # save out | ||
858 | &mov (&DWP(4,"esp"),$key); # save key | ||
859 | &mov (&DWP(8,"esp"),$const); # save ivp | ||
860 | &mov ($out,$round); # $out works as $len | ||
861 | |||
# Position-independent addressing: $const receives the distance from
# pic_point (the label placed right after the call, i.e. its return
# address) to _vpaes_consts+0x30.  Presumably _vpaes_preheat adds its
# return address to form an absolute pointer and preloads constants --
# confirm in the callee.
862 | &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point"))); | ||
863 | &call ("_vpaes_preheat"); | ||
864 | &set_label("pic_point"); | ||
# Dispatch on enc: zero -> decrypt, non-zero -> encrypt.
865 | &cmp ($magic,0); | ||
866 | &je (&label("cbc_dec_loop")); | ||
867 | &jmp (&label("cbc_enc_loop")); | ||
868 | |||
# Encrypt: C[i] = E(P[i] ^ C[i-1]); xmm1 holds the chaining value.
# out delta and key are reloaded from the frame after every core call
# (presumably because the core clobbers those registers -- confirm).
869 | &set_label("cbc_enc_loop",16); | ||
870 | &movdqu ("xmm0",&QWP(0,$inp)); # load input | ||
871 | &pxor ("xmm0","xmm1"); # inp^=iv | ||
872 | &call ("_vpaes_encrypt_core"); | ||
873 | &mov ($base,&DWP(0,"esp")); # restore out | ||
874 | &mov ($key,&DWP(4,"esp")); # restore key | ||
875 | &movdqa ("xmm1","xmm0"); # ciphertext is the next IV | ||
876 | &movdqu (&QWP(0,$base,$inp),"xmm0"); # write output | ||
877 | &lea ($inp,&DWP(16,$inp)); | ||
878 | &sub ($out,16); | ||
879 | &jnc (&label("cbc_enc_loop")); # loop until the counter borrows | ||
880 | &jmp (&label("cbc_done")); | ||
881 | |||
# Decrypt: P[i] = D(C[i]) ^ C[i-1].  The IV and the incoming ciphertext
# are spilled to the frame across the core call so the ciphertext can
# serve as the next IV even when out aliases inp.
882 | &set_label("cbc_dec_loop",16); | ||
883 | &movdqu ("xmm0",&QWP(0,$inp)); # load input | ||
884 | &movdqa (&QWP(16,"esp"),"xmm1"); # save IV | ||
885 | &movdqa (&QWP(32,"esp"),"xmm0"); # save future IV | ||
886 | &call ("_vpaes_decrypt_core"); | ||
887 | &mov ($base,&DWP(0,"esp")); # restore out | ||
888 | &mov ($key,&DWP(4,"esp")); # restore key | ||
889 | &pxor ("xmm0",&QWP(16,"esp")); # out^=iv | ||
890 | &movdqa ("xmm1",&QWP(32,"esp")); # load next IV | ||
891 | &movdqu (&QWP(0,$base,$inp),"xmm0"); # write output | ||
892 | &lea ($inp,&DWP(16,$inp)); | ||
893 | &sub ($out,16); | ||
894 | &jnc (&label("cbc_dec_loop")); # loop until the counter borrows | ||
895 | |||
# Common exit: fetch ivp from the frame before esp is restored, then
# store the final chaining value through it.
896 | &set_label("cbc_done"); | ||
897 | &mov ($base,&DWP(8,"esp")); # restore ivp | ||
898 | &mov ("esp",&DWP(48,"esp")); | ||
899 | &movdqu (&QWP(0,$base),"xmm1"); # write IV | ||
900 | &set_label("cbc_abort"); | ||
901 | &function_end("${PREFIX}_cbc_encrypt"); | ||
902 | |||
# Final statement of the generator script.  asm_finish is a perlasm helper
# defined outside this file; presumably it emits the accumulated assembly
# output -- confirm in the perlasm support module.
903 | &asm_finish(); | ||