summaryrefslogtreecommitdiff
path: root/src/lib/libcrypto/camellia
diff options
context:
space:
mode:
authordjm <>2009-04-06 06:30:10 +0000
committerdjm <>2009-04-06 06:30:10 +0000
commitf929570d17be2469dc7104fcdf26fdaddf3dbb65 (patch)
treed27deb705d08b9515fe0c6a5de67639235c5ad78 /src/lib/libcrypto/camellia
parent8495770bca2f5a7c4d65351d78035a1cf89684f0 (diff)
parent2b6e09b39ef1d803b50ee024a06d1c250fde442d (diff)
downloadopenbsd-f929570d17be2469dc7104fcdf26fdaddf3dbb65.tar.gz
openbsd-f929570d17be2469dc7104fcdf26fdaddf3dbb65.tar.bz2
openbsd-f929570d17be2469dc7104fcdf26fdaddf3dbb65.zip
This commit was generated by cvs2git to track changes on a CVS vendor
branch.
Diffstat (limited to 'src/lib/libcrypto/camellia')
-rw-r--r--src/lib/libcrypto/camellia/asm/cmll-x86.pl1138
-rw-r--r--src/lib/libcrypto/camellia/asm/cmll-x86_64.pl1080
2 files changed, 2218 insertions, 0 deletions
diff --git a/src/lib/libcrypto/camellia/asm/cmll-x86.pl b/src/lib/libcrypto/camellia/asm/cmll-x86.pl
new file mode 100644
index 0000000000..0812815bfb
--- /dev/null
+++ b/src/lib/libcrypto/camellia/asm/cmll-x86.pl
@@ -0,0 +1,1138 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
5#
6# This module may be used under the terms of either the GNU General
7# Public License version 2 or later, the GNU Lesser General Public
8# License version 2.1 or later, the Mozilla Public License version
9# 1.1 or the BSD License. The exact terms of either license are
10# distributed along with this module. For further details see
11# http://www.openssl.org/~appro/camellia/.
12# ====================================================================
13
14# Performance in cycles per processed byte (less is better) in
15# 'openssl speed ...' benchmark:
16#
17# AMD K8 Core2 PIII P4
18# -evp camellia-128-ecb 21.5 22.8 27.0 28.9
19# + over gcc 3.4.6 +90/11% +70/10% +53/4% +160/64%
20# + over icc 8.0 +48/19% +21/15% +21/17% +55/37%
21#
22# camellia-128-cbc 17.3 21.1 23.9 25.9
23#
24# 128-bit key setup 196 280 256 240 cycles/key
25# + over gcc 3.4.6 +30/0% +17/11% +11/0% +63/40%
26# + over icc 8.0 +18/3% +10/0% +10/3% +21/10%
27#
28# Pairs of numbers in "+" rows represent performance improvement over
29# compiler generated position-independent code, PIC, and non-PIC
30# respectively. PIC results are of greater relevance, as this module
31# is position-independent, i.e. suitable for a shared library or PIE.
32# Position independence "costs" one register, which is why compilers
33# are so close with non-PIC results, they have an extra register to
34# spare. CBC results are better than ECB ones thanks to "zero-copy"
35# private _x86_* interface, and are ~30-40% better than with compiler
36# generated cmll_cbc.o, and reach ~80-90% of x86_64 performance on
37# same CPU (where applicable).
38
39$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; # directory part of the script's own path
40push(@INC,"${dir}","${dir}../../perlasm"); # so that x86asm.pl can be found
41require "x86asm.pl";
42
43$OPENSSL=1; # enables the if ($OPENSSL) sections further down
44# NOTE(review): argument meaning is defined by x86asm.pl, not visible here
45&asm_init($ARGV[0],"cmll-586.pl",$ARGV[$#ARGV] eq "386");
46
47@T=("eax","ebx","ecx","edx"); # the four 32-bit words of cipher state
48$idx="esi"; # table index / scratch / prefetched key word
49$key="edi"; # current position in the key schedule
50$Tbl="ebp"; # base address of Camellia_SBOX
51
52# stack frame layout in _x86_Camellia_* routines, frame is allocated
53# by caller
54$__ra=&DWP(0,"esp"); # return address
55$__s0=&DWP(4,"esp"); # s0 backing store
56$__s1=&DWP(8,"esp"); # s1 backing store
57$__s2=&DWP(12,"esp"); # s2 backing store
58$__s3=&DWP(16,"esp"); # s3 backing store
59$__end=&DWP(20,"esp"); # pointer to end/start of key schedule
60
61# stack frame layout in Camellia_[en|de]crypt routines, which differs from
62# above by 4 and overlaps by pointer to end/start of key schedule
63$_end=&DWP(16,"esp"); # same slot as $__end once the call has pushed 4 bytes
64$_esp=&DWP(20,"esp"); # caller's original %esp
65
66# const unsigned int Camellia_SBOX[4][256];
67# Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
68# and [2][] - with [3][]. This is done to optimize code size.
69$SBOX1_1110=0; # Camellia_SBOX[0]
70$SBOX4_4404=4; # Camellia_SBOX[1]
71$SBOX2_0222=2048; # Camellia_SBOX[2], after 256 interleaved 8-byte entries
72$SBOX3_3033=2052; # Camellia_SBOX[3]
73&static_label("Camellia_SIGMA");
74&static_label("Camellia_SBOX");
75
76sub Camellia_Feistel { # emit one Feistel round: Camellia_Feistel($i[,$seed[,$frame]])
77my $i=$_[0]; # round number: selects the sub-key and the register pair
78my $seed=defined($_[1])?$_[1]:0; # byte offset of round 0's sub-key from $key
79my $scale=$seed<0?-8:8; # a negative seed walks the sub-keys backwards
80my $frame=defined($_[2])?$_[2]:0; # offset of the s0..s3 slots from %esp
81my $j=($i&1)*2; # even rounds consume @T[0,1], odd rounds @T[2,3]
82my ($t0,$t1,$t2,$t3)=@T[$j%4,($j+1)%4,($j+2)%4,($j+3)%4]; # all four are lexical
83# on entry $idx holds this round's first sub-key word, on exit the next round's
84 &xor ($t0,$idx); # t0^=key[0]
85 &xor ($t1,&DWP($seed+$i*$scale+4,$key)); # t1^=key[1]
86 &movz ($idx,&HB($t0)); # (t0>>8)&0xff
87 &mov ($t3,&DWP($SBOX3_3033,$Tbl,$idx,8)); # t3=SBOX3_3033[0]
88 &movz ($idx,&LB($t0)); # (t0>>0)&0xff
89 &xor ($t3,&DWP($SBOX4_4404,$Tbl,$idx,8)); # t3^=SBOX4_4404[0]
90 &shr ($t0,16);
91 &movz ($idx,&LB($t1)); # (t1>>0)&0xff
92 &mov ($t2,&DWP($SBOX1_1110,$Tbl,$idx,8)); # t2=SBOX1_1110[1]
93 &movz ($idx,&HB($t0)); # (t0>>24)&0xff
94 &xor ($t3,&DWP($SBOX1_1110,$Tbl,$idx,8)); # t3^=SBOX1_1110[0]
95 &movz ($idx,&HB($t1)); # (t1>>8)&0xff
96 &xor ($t2,&DWP($SBOX4_4404,$Tbl,$idx,8)); # t2^=SBOX4_4404[1]
97 &shr ($t1,16);
98 &movz ($t0,&LB($t0)); # (t0>>16)&0xff
99 &xor ($t3,&DWP($SBOX2_0222,$Tbl,$t0,8)); # t3^=SBOX2_0222[0]
100 &movz ($idx,&HB($t1)); # (t1>>24)&0xff
101 &mov ($t0,&DWP($frame+4*(($j+3)%4),"esp")); # prefetch "s3"
102 &xor ($t2,$t3); # t2^=t3
103 &rotr ($t3,8); # t3=RightRotate(t3,8)
104 &xor ($t2,&DWP($SBOX2_0222,$Tbl,$idx,8)); # t2^=SBOX2_0222[1]
105 &movz ($idx,&LB($t1)); # (t1>>16)&0xff
106 &mov ($t1,&DWP($frame+4*(($j+2)%4),"esp")); # prefetch "s2"
107 &xor ($t3,$t0); # t3^=s3
108 &xor ($t2,&DWP($SBOX3_3033,$Tbl,$idx,8)); # t2^=SBOX3_3033[1]
109 &mov ($idx,&DWP($seed+($i+1)*$scale,$key)); # prefetch key[i+1]
110 &xor ($t3,$t2); # t3^=t2
111 &mov (&DWP($frame+4*(($j+3)%4),"esp"),$t3); # s3=t3
112 &xor ($t2,$t1); # t2^=s2
113 &mov (&DWP($frame+4*(($j+2)%4),"esp"),$t2); # s2=t2
114}
115# Encrypts one 16-byte block; the caller passes the grandRounds count explicitly.
116# void Camellia_EncryptBlock_Rounds(
117# int grandRounds,
118# const Byte plaintext[],
119# const KEY_TABLE_TYPE keyTable,
120# Byte ciphertext[])
121&function_begin("Camellia_EncryptBlock_Rounds");
122 &mov ("eax",&wparam(0)); # load grandRounds
123 &mov ($idx,&wparam(1)); # load plaintext pointer
124 &mov ($key,&wparam(2)); # load key schedule pointer
125# ebx remembers the caller's %esp while a private, aligned frame is set up
126 &mov ("ebx","esp");
127 &sub ("esp",7*4); # place for s[0-3],keyEnd,esp and ra
128 &and ("esp",-64);
129
130 # place stack frame just "above mod 1024" the key schedule
131 # this ensures that cache associativity of 2 suffices
132 &lea ("ecx",&DWP(-64-63,$key));
133 &sub ("ecx","esp");
134 &neg ("ecx");
135 &and ("ecx",0x3C0); # modulo 1024, but aligned to cache-line
136 &sub ("esp","ecx");
137 &add ("esp",4); # 4 is reserved for callee's return address
138# keyEnd=keyTable+grandRounds*64, the address at which the callee's loop stops
139 &shl ("eax",6);
140 &lea ("eax",&DWP(0,$key,"eax"));
141 &mov ($_esp,"ebx"); # save %esp
142 &mov ($_end,"eax"); # save keyEnd
143# call/pop yields the current address, from which &Camellia_SBOX is formed
144 &call (&label("pic_point"));
145 &set_label("pic_point");
146 &blindpop($Tbl);
147 &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
148
149 &mov (@T[0],&DWP(0,$idx)); # load plaintext
150 &mov (@T[1],&DWP(4,$idx));
151 &mov (@T[2],&DWP(8,$idx));
152 &bswap (@T[0]);
153 &mov (@T[3],&DWP(12,$idx));
154 &bswap (@T[1]);
155 &bswap (@T[2]);
156 &bswap (@T[3]);
157
158 &call ("_x86_Camellia_encrypt");
159# drop the private frame first, &wparam() is relative to the original %esp
160 &mov ("esp",$_esp);
161 &bswap (@T[0]);
162 &mov ($idx,&wparam(3)); # load ciphertext pointer
163 &bswap (@T[1]);
164 &bswap (@T[2]);
165 &bswap (@T[3]);
166 &mov (&DWP(0,$idx),@T[0]); # write ciphertext
167 &mov (&DWP(4,$idx),@T[1]);
168 &mov (&DWP(8,$idx),@T[2]);
169 &mov (&DWP(12,$idx),@T[3]);
170&function_end("Camellia_EncryptBlock_Rounds");
171# V1.x API: first argument is keyBitLength rather than grandRounds
172&function_begin_B("Camellia_EncryptBlock");
173 &mov ("eax",128);
174 &sub ("eax",&wparam(0)); # load keyBitLength, carry is set if it exceeds 128
175 &mov ("eax",3); # mov leaves the carry from the sub untouched
176 &adc ("eax",0); # keyBitLength==128?3:4
177 &mov (&wparam(0),"eax"); # overwrite keyBitLength with grandRounds in place
178 &jmp (&label("Camellia_EncryptBlock_Rounds")); # tail-jump, arguments stay where they are
179&function_end_B("Camellia_EncryptBlock");
180# OpenSSL-style entry point: grandRounds is read from the key structure itself.
181if ($OPENSSL) {
182# void Camellia_encrypt(
183# const unsigned char *in,
184# unsigned char *out,
185# const CAMELLIA_KEY *key)
186&function_begin("Camellia_encrypt");
187 &mov ($idx,&wparam(0)); # load plaintext pointer
188 &mov ($key,&wparam(2)); # load key schedule pointer
189# ebx remembers the caller's %esp while a private, aligned frame is set up
190 &mov ("ebx","esp");
191 &sub ("esp",7*4); # place for s[0-3],keyEnd,esp and ra
192 &and ("esp",-64);
193 &mov ("eax",&DWP(272,$key)); # load grandRounds counter
194
195 # place stack frame just "above mod 1024" the key schedule
196 # this ensures that cache associativity of 2 suffices
197 &lea ("ecx",&DWP(-64-63,$key));
198 &sub ("ecx","esp");
199 &neg ("ecx");
200 &and ("ecx",0x3C0); # modulo 1024, but aligned to cache-line
201 &sub ("esp","ecx");
202 &add ("esp",4); # 4 is reserved for callee's return address
203# keyEnd=key+grandRounds*64, the address at which the callee's loop stops
204 &shl ("eax",6);
205 &lea ("eax",&DWP(0,$key,"eax"));
206 &mov ($_esp,"ebx"); # save %esp
207 &mov ($_end,"eax"); # save keyEnd
208# call/pop yields the current address, from which &Camellia_SBOX is formed
209 &call (&label("pic_point"));
210 &set_label("pic_point");
211 &blindpop($Tbl);
212 &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
213
214 &mov (@T[0],&DWP(0,$idx)); # load plaintext
215 &mov (@T[1],&DWP(4,$idx));
216 &mov (@T[2],&DWP(8,$idx));
217 &bswap (@T[0]);
218 &mov (@T[3],&DWP(12,$idx));
219 &bswap (@T[1]);
220 &bswap (@T[2]);
221 &bswap (@T[3]);
222
223 &call ("_x86_Camellia_encrypt");
224# drop the private frame first, &wparam() is relative to the original %esp
225 &mov ("esp",$_esp);
226 &bswap (@T[0]);
227 &mov ($idx,&wparam(1)); # load ciphertext pointer
228 &bswap (@T[1]);
229 &bswap (@T[2]);
230 &bswap (@T[3]);
231 &mov (&DWP(0,$idx),@T[0]); # write ciphertext
232 &mov (&DWP(4,$idx),@T[1]);
233 &mov (&DWP(8,$idx),@T[2]);
234 &mov (&DWP(12,$idx),@T[3]);
235&function_end("Camellia_encrypt");
236}
237# Cipher core: state in @T, key schedule in $key, S-box base in $Tbl, frame supplied by the caller.
238&function_begin_B("_x86_Camellia_encrypt");
239 &xor (@T[0],&DWP(0,$key)); # ^=key[0-3]
240 &xor (@T[1],&DWP(4,$key));
241 &xor (@T[2],&DWP(8,$key));
242 &xor (@T[3],&DWP(12,$key));
243 &mov ($idx,&DWP(16,$key)); # prefetch key[4]
244
245 &mov ($__s0,@T[0]); # save s[0-3]
246 &mov ($__s1,@T[1]);
247 &mov ($__s2,@T[2]);
248 &mov ($__s3,@T[3]);
249
250&set_label("loop",16);
251 for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16,4); } # six rounds, sub-keys from key+16, s0-s3 from 4(%esp)
252
253 &add ($key,16*4); # each pass of the loop consumes 64 bytes of key schedule
254 &cmp ($key,$__end);
255 &je (&label("done"));
256
257 # @T[0-1] are preloaded, $idx is preloaded with key[0]
258 &and ($idx,@T[0]);
259 &mov (@T[3],$__s3);
260 &rotl ($idx,1);
261 &mov (@T[2],@T[3]);
262 &xor (@T[1],$idx);
263 &or (@T[2],&DWP(12,$key));
264 &mov ($__s1,@T[1]); # s1^=LeftRotate(s0&key[0],1);
265 &xor (@T[2],$__s2);
266
267 &mov ($idx,&DWP(4,$key));
268 &mov ($__s2,@T[2]); # s2^=s3|key[3];
269 &or ($idx,@T[1]);
270 &and (@T[2],&DWP(8,$key));
271 &xor (@T[0],$idx);
272 &rotl (@T[2],1);
273 &mov ($__s0,@T[0]); # s0^=s1|key[1];
274 &xor (@T[3],@T[2]);
275 &mov ($idx,&DWP(16,$key)); # prefetch key[4]
276 &mov ($__s3,@T[3]); # s3^=LeftRotate(s2&key[2],1);
277 &jmp (&label("loop"));
278
279&set_label("done",8);
280 &mov (@T[2],@T[0]); # SwapHalf
281 &mov (@T[3],@T[1]);
282 &mov (@T[0],$__s2);
283 &mov (@T[1],$__s3);
284 &xor (@T[0],$idx); # $idx is preloaded with key[0]
285 &xor (@T[1],&DWP(4,$key));
286 &xor (@T[2],&DWP(8,$key));
287 &xor (@T[3],&DWP(12,$key));
288 &ret ();
289&function_end_B("_x86_Camellia_encrypt");
290# Decrypts one 16-byte block; the caller passes the grandRounds count explicitly.
291# void Camellia_DecryptBlock_Rounds(
292# int grandRounds,
293# const Byte ciphertext[],
294# const KEY_TABLE_TYPE keyTable,
295# Byte plaintext[])
296&function_begin("Camellia_DecryptBlock_Rounds");
297 &mov ("eax",&wparam(0)); # load grandRounds
298 &mov ($idx,&wparam(1)); # load ciphertext pointer
299 &mov ($key,&wparam(2)); # load key schedule pointer
300# ebx remembers the caller's %esp while a private, aligned frame is set up
301 &mov ("ebx","esp");
302 &sub ("esp",7*4); # place for s[0-3],keyStart,esp and ra
303 &and ("esp",-64);
304
305 # place stack frame just "above mod 1024" the key schedule
306 # this ensures that cache associativity of 2 suffices
307 &lea ("ecx",&DWP(-64-63,$key));
308 &sub ("ecx","esp");
309 &neg ("ecx");
310 &and ("ecx",0x3C0); # modulo 1024, but aligned to cache-line
311 &sub ("esp","ecx");
312 &add ("esp",4); # 4 is reserved for callee's return address
313# decryption starts at keyTable+grandRounds*64 and walks back to keyStart
314 &shl ("eax",6);
315 &mov (&DWP(4*4,"esp"),$key); # save keyStart
316 &lea ($key,&DWP(0,$key,"eax"));
317 &mov (&DWP(5*4,"esp"),"ebx");# save %esp
318# call/pop yields the current address, from which &Camellia_SBOX is formed
319 &call (&label("pic_point"));
320 &set_label("pic_point");
321 &blindpop($Tbl);
322 &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
323
324 &mov (@T[0],&DWP(0,$idx)); # load ciphertext
325 &mov (@T[1],&DWP(4,$idx));
326 &mov (@T[2],&DWP(8,$idx));
327 &bswap (@T[0]);
328 &mov (@T[3],&DWP(12,$idx));
329 &bswap (@T[1]);
330 &bswap (@T[2]);
331 &bswap (@T[3]);
332
333 &call ("_x86_Camellia_decrypt");
334# drop the private frame first, &wparam() is relative to the original %esp
335 &mov ("esp",&DWP(5*4,"esp"));
336 &bswap (@T[0]);
337 &mov ($idx,&wparam(3)); # load plaintext pointer
338 &bswap (@T[1]);
339 &bswap (@T[2]);
340 &bswap (@T[3]);
341 &mov (&DWP(0,$idx),@T[0]); # write plaintext
342 &mov (&DWP(4,$idx),@T[1]);
343 &mov (&DWP(8,$idx),@T[2]);
344 &mov (&DWP(12,$idx),@T[3]);
345&function_end("Camellia_DecryptBlock_Rounds");
346# V1.x API: first argument is keyBitLength rather than grandRounds
347&function_begin_B("Camellia_DecryptBlock");
348 &mov ("eax",128);
349 &sub ("eax",&wparam(0)); # load keyBitLength, carry is set if it exceeds 128
350 &mov ("eax",3); # mov leaves the carry from the sub untouched
351 &adc ("eax",0); # keyBitLength==128?3:4
352 &mov (&wparam(0),"eax"); # overwrite keyBitLength with grandRounds in place
353 &jmp (&label("Camellia_DecryptBlock_Rounds")); # tail-jump, arguments stay where they are
354&function_end_B("Camellia_DecryptBlock");
355# OpenSSL-style entry point: grandRounds is read from the key structure itself.
356if ($OPENSSL) {
357# void Camellia_decrypt(
358# const unsigned char *in,
359# unsigned char *out,
360# const CAMELLIA_KEY *key)
361&function_begin("Camellia_decrypt");
362 &mov ($idx,&wparam(0)); # load ciphertext pointer
363 &mov ($key,&wparam(2)); # load key schedule pointer
364# ebx remembers the caller's %esp while a private, aligned frame is set up
365 &mov ("ebx","esp");
366 &sub ("esp",7*4); # place for s[0-3],keyStart,esp and ra
367 &and ("esp",-64);
368 &mov ("eax",&DWP(272,$key)); # load grandRounds counter
369
370 # place stack frame just "above mod 1024" the key schedule
371 # this ensures that cache associativity of 2 suffices
372 &lea ("ecx",&DWP(-64-63,$key));
373 &sub ("ecx","esp");
374 &neg ("ecx");
375 &and ("ecx",0x3C0); # modulo 1024, but aligned to cache-line
376 &sub ("esp","ecx");
377 &add ("esp",4); # 4 is reserved for callee's return address
378# decryption starts at key+grandRounds*64 and walks back to keyStart
379 &shl ("eax",6);
380 &mov (&DWP(4*4,"esp"),$key); # save keyStart
381 &lea ($key,&DWP(0,$key,"eax"));
382 &mov (&DWP(5*4,"esp"),"ebx");# save %esp
383# call/pop yields the current address, from which &Camellia_SBOX is formed
384 &call (&label("pic_point"));
385 &set_label("pic_point");
386 &blindpop($Tbl);
387 &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
388
389 &mov (@T[0],&DWP(0,$idx)); # load ciphertext
390 &mov (@T[1],&DWP(4,$idx));
391 &mov (@T[2],&DWP(8,$idx));
392 &bswap (@T[0]);
393 &mov (@T[3],&DWP(12,$idx));
394 &bswap (@T[1]);
395 &bswap (@T[2]);
396 &bswap (@T[3]);
397
398 &call ("_x86_Camellia_decrypt");
399# drop the private frame first, &wparam() is relative to the original %esp
400 &mov ("esp",&DWP(5*4,"esp"));
401 &bswap (@T[0]);
402 &mov ($idx,&wparam(1)); # load plaintext pointer
403 &bswap (@T[1]);
404 &bswap (@T[2]);
405 &bswap (@T[3]);
406 &mov (&DWP(0,$idx),@T[0]); # write plaintext
407 &mov (&DWP(4,$idx),@T[1]);
408 &mov (&DWP(8,$idx),@T[2]);
409 &mov (&DWP(12,$idx),@T[3]);
410&function_end("Camellia_decrypt");
411}
412# Cipher core for decryption: same contract as the encrypt core, but $key starts at the end and moves down.
413&function_begin_B("_x86_Camellia_decrypt");
414 &xor (@T[0],&DWP(0,$key)); # ^=key[0-3]
415 &xor (@T[1],&DWP(4,$key));
416 &xor (@T[2],&DWP(8,$key));
417 &xor (@T[3],&DWP(12,$key));
418 &mov ($idx,&DWP(-8,$key)); # prefetch key[-2]
419
420 &mov ($__s0,@T[0]); # save s[0-3]
421 &mov ($__s1,@T[1]);
422 &mov ($__s2,@T[2]);
423 &mov ($__s3,@T[3]);
424
425&set_label("loop",16);
426 for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8,4); } # six rounds, sub-keys from key-8 downwards, s0-s3 from 4(%esp)
427
428 &sub ($key,16*4); # each pass of the loop steps 64 bytes back in the key schedule
429 &cmp ($key,$__end);
430 &je (&label("done"));
431
432 # @T[0-1] are preloaded, $idx is preloaded with key[2]
433 &and ($idx,@T[0]);
434 &mov (@T[3],$__s3);
435 &rotl ($idx,1);
436 &mov (@T[2],@T[3]);
437 &xor (@T[1],$idx);
438 &or (@T[2],&DWP(4,$key));
439 &mov ($__s1,@T[1]); # s1^=LeftRotate(s0&key[2],1);
440 &xor (@T[2],$__s2);
441
442 &mov ($idx,&DWP(12,$key));
443 &mov ($__s2,@T[2]); # s2^=s3|key[1];
444 &or ($idx,@T[1]);
445 &and (@T[2],&DWP(0,$key));
446 &xor (@T[0],$idx);
447 &rotl (@T[2],1);
448 &mov ($__s0,@T[0]); # s0^=s1|key[3];
449 &xor (@T[3],@T[2]);
450 &mov ($idx,&DWP(-8,$key)); # prefetch key[-2]
451 &mov ($__s3,@T[3]); # s3^=LeftRotate(s2&key[0],1);
452 &jmp (&label("loop"));
453
454&set_label("done",8);
455 &mov (@T[2],@T[0]); # SwapHalf
456 &mov (@T[3],@T[1]);
457 &mov (@T[0],$__s2);
458 &mov (@T[1],$__s3);
459 &xor (@T[2],$idx); # $idx is preloaded with key[2]
460 &xor (@T[3],&DWP(12,$key));
461 &xor (@T[0],&DWP(0,$key));
462 &xor (@T[1],&DWP(4,$key));
463 &ret ();
464&function_end_B("_x86_Camellia_decrypt");
465
466# shld is very slow on Intel P4 family. Even on AMD it limits
467# instruction decode rate [because it's VectorPath] and consequently
468# performance. PIII, PM and Core[2] seem to be the only ones which
469# execute this code ~7% faster...
470sub __rotl128 { # NOTE(review): no call site seen here, _rotl128 is the variant in use
471 my ($i0,$i1,$i2,$i3,$rot,$rnd,@T)=@_; # 4 registers (high word first), bit count, round-key slot, registers to store
472
473 $rnd *= 2; # 8-byte round-key index -> 4-byte word index
474 if ($rot) {
475 &mov ($idx,$i0); # keep the old top word, it wraps around into $i3
476 &shld ($i0,$i1,$rot);
477 &shld ($i1,$i2,$rot);
478 &shld ($i2,$i3,$rot);
479 &shld ($i3,$idx,$rot);
480 }
481 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i0 eq @T[0]); # store only the words the caller listed,
482 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i1 eq @T[0]); # packed into consecutive dwords;
483 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i2 eq @T[0]); # -128 compensates for $key being
484 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i3 eq @T[0]); # biased by +128 in the caller
485}
486
487# ... Implementing 128-bit rotate without shld gives >3x performance
488# improvement on P4, only ~7% degradation on other Intel CPUs and
489# not worse performance on AMD. This is therefore preferred.
490sub _rotl128 { # rotates $i0:$i1:$i2:$i3 left by $rot bits, clobbers $idx and $Tbl
491 my ($i0,$i1,$i2,$i3,$rot,$rnd,@T)=@_; # 4 registers (high word first), bit count, round-key slot, registers to store
492
493 $rnd *= 2; # 8-byte round-key index -> 4-byte word index
494 if ($rot) {
495 &mov ($Tbl,$i0); # old top word, its high bits wrap around into $i3
496 &shl ($i0,$rot);
497 &mov ($idx,$i1);
498 &shr ($idx,32-$rot);
499 &shl ($i1,$rot);
500 &or ($i0,$idx);
501 &mov ($idx,$i2);
502 &shl ($i2,$rot);
503 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i0 eq @T[0]); # stores are interleaved with the shifts
504 &shr ($idx,32-$rot);
505 &or ($i1,$idx);
506 &shr ($Tbl,32-$rot);
507 &mov ($idx,$i3);
508 &shr ($idx,32-$rot);
509 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i1 eq @T[0]);
510 &shl ($i3,$rot);
511 &or ($i2,$idx);
512 &or ($i3,$Tbl);
513 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i2 eq @T[0]);
514 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i3 eq @T[0]);
515 } else { # zero bit count: nothing to shift, only store the requested words
516 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i0 eq @T[0]);
517 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i1 eq @T[0]);
518 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i2 eq @T[0]);
519 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i3 eq @T[0]);
520 }
521}
522# Emit stores of up to four registers into round-key slot $slot (8 bytes per slot).
523sub _saveround {
524my ($slot,$base,@regs)=@_;
525my $bias=int($regs[0])?shift(@regs):0; # optional numeric displacement ahead of the registers
526my $disp=$bias+$slot*8;
527 &mov (&DWP($disp,$base),$regs[0]); # the first word is always written
528 foreach my $n (1..3) { # further words only if a register was supplied
529 &mov (&DWP($disp+4*$n,$base),$regs[$n]) if ($n<=$#regs);
530 }
531}
532# Emit loads of up to four registers from round-key slot $slot (8 bytes per slot).
533sub _loadround {
534my ($slot,$base,@regs)=@_;
535my $bias=int($regs[0])?shift(@regs):0; # optional numeric displacement ahead of the registers
536my $disp=$bias+$slot*8;
537 &mov ($regs[0],&DWP($disp,$base)); # the first word is always read
538 foreach my $n (1..3) { # further words only if a register was supplied
539 &mov ($regs[$n],&DWP($disp+4*$n,$base)) if ($n<=$#regs);
540 }
541}
542# Expands the raw key into keyTable; leaves grandRounds in eax and keyTable+272 in edx.
543# void Camellia_Ekeygen(
544# const int keyBitLength,
545# const Byte *rawKey,
546# KEY_TABLE_TYPE keyTable)
547&function_begin("Camellia_Ekeygen");
548{ my $step=0; # number of the next Feistel round, also selects the SIGMA constant
549
550 &stack_push(4); # place for s[0-3]
551
552 &mov ($Tbl,&wparam(0)); # load arguments
553 &mov ($idx,&wparam(1));
554 &mov ($key,&wparam(2));
555
556 &mov (@T[0],&DWP(0,$idx)); # load 0-127 bits
557 &mov (@T[1],&DWP(4,$idx));
558 &mov (@T[2],&DWP(8,$idx));
559 &mov (@T[3],&DWP(12,$idx));
560
561 &bswap (@T[0]);
562 &bswap (@T[1]);
563 &bswap (@T[2]);
564 &bswap (@T[3]);
565
566 &_saveround (0,$key,@T); # KL<<<0
567# 128-bit keys have no KR part and go straight to the KA derivation
568 &cmp ($Tbl,128);
569 &je (&label("1st128"));
570
571 &mov (@T[0],&DWP(16,$idx)); # load 128-191 bits
572 &mov (@T[1],&DWP(20,$idx));
573 &cmp ($Tbl,192);
574 &je (&label("1st192"));
575 &mov (@T[2],&DWP(24,$idx)); # load 192-255 bits
576 &mov (@T[3],&DWP(28,$idx));
577 &jmp (&label("1st256"));
578&set_label("1st192",4); # 192-bit key: bits 192-255 are the complement of bits 128-191
579 &mov (@T[2],@T[0]);
580 &mov (@T[3],@T[1]);
581 &not (@T[2]);
582 &not (@T[3]);
583&set_label("1st256",4);
584 &bswap (@T[0]);
585 &bswap (@T[1]);
586 &bswap (@T[2]);
587 &bswap (@T[3]);
588
589 &_saveround (4,$key,@T); # temporary storage for KR!
590
591 &xor (@T[0],&DWP(0*8+0,$key)); # KR^KL
592 &xor (@T[1],&DWP(0*8+4,$key));
593 &xor (@T[2],&DWP(1*8+0,$key));
594 &xor (@T[3],&DWP(1*8+4,$key));
595
596&set_label("1st128",4);
597 &call (&label("pic_point"));
598 &set_label("pic_point");
599 &blindpop($Tbl);
600 &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
601 &lea ($key,&DWP(&label("Camellia_SIGMA")."-".&label("Camellia_SBOX"),$Tbl));
602# $key now points at SIGMA, which Camellia_Feistel reads as its sub-keys
603 &mov ($idx,&DWP($step*8,$key)); # prefetch SIGMA[0]
604 &mov (&swtmp(0),@T[0]); # save s[0-3]
605 &mov (&swtmp(1),@T[1]);
606 &mov (&swtmp(2),@T[2]);
607 &mov (&swtmp(3),@T[3]);
608 &Camellia_Feistel($step++);
609 &Camellia_Feistel($step++);
610 &mov (@T[2],&swtmp(2)); # the second round used these two registers as scratch
611 &mov (@T[3],&swtmp(3));
612
613 &mov ($idx,&wparam(2));
614 &xor (@T[0],&DWP(0*8+0,$idx)); # ^KL
615 &xor (@T[1],&DWP(0*8+4,$idx));
616 &xor (@T[2],&DWP(1*8+0,$idx));
617 &xor (@T[3],&DWP(1*8+4,$idx));
618
619 &mov ($idx,&DWP($step*8,$key)); # prefetch SIGMA[4]
620 &mov (&swtmp(0),@T[0]); # save s[0-3]
621 &mov (&swtmp(1),@T[1]);
622 &mov (&swtmp(2),@T[2]);
623 &mov (&swtmp(3),@T[3]);
624 &Camellia_Feistel($step++);
625 &Camellia_Feistel($step++);
626 &mov (@T[2],&swtmp(2)); # the second round used these two registers as scratch
627 &mov (@T[3],&swtmp(3));
628
629 &mov ($idx,&wparam(0));
630 &cmp ($idx,128);
631 &jne (&label("2nd256"));
632
633 &mov ($key,&wparam(2));
634 &lea ($key,&DWP(128,$key)); # size optimization
635
636 ####### process KA
637 &_saveround (2,$key,-128,@T); # KA<<<0
638 &_rotl128 (@T,15,6,@T); # KA<<<15
639 &_rotl128 (@T,15,8,@T); # KA<<<(15+15=30)
640 &_rotl128 (@T,15,12,@T[0],@T[1]); # KA<<<(30+15=45)
641 &_rotl128 (@T,15,14,@T); # KA<<<(45+15=60)
642 push (@T,shift(@T)); # rotl128(@T,32); no code emitted, the registers are just renamed
643 &_rotl128 (@T,2,20,@T); # KA<<<(60+32+2=94)
644 &_rotl128 (@T,17,24,@T); # KA<<<(94+17=111)
645
646 ####### process KL
647 &_loadround (0,$key,-128,@T); # load KL
648 &_rotl128 (@T,15,4,@T); # KL<<<15
649 &_rotl128 (@T,30,10,@T); # KL<<<(15+30=45)
650 &_rotl128 (@T,15,13,@T[2],@T[3]); # KL<<<(45+15=60)
651 &_rotl128 (@T,17,16,@T); # KL<<<(60+17=77)
652 &_rotl128 (@T,17,18,@T); # KL<<<(77+17=94)
653 &_rotl128 (@T,17,22,@T); # KL<<<(94+17=111)
654
655 while (@T[0] ne "eax") # restore order
656 { unshift (@T,pop(@T)); }
657
658 &mov ("eax",3); # 3 grandRounds
659 &jmp (&label("done"));
660
661&set_label("2nd256",16);
662 &mov ($idx,&wparam(2));
663 &_saveround (6,$idx,@T); # temporary storage for KA!
664
665 &xor (@T[0],&DWP(4*8+0,$idx)); # KA^KR
666 &xor (@T[1],&DWP(4*8+4,$idx));
667 &xor (@T[2],&DWP(5*8+0,$idx));
668 &xor (@T[3],&DWP(5*8+4,$idx));
669
670 &mov ($idx,&DWP($step*8,$key)); # prefetch SIGMA[8]
671 &mov (&swtmp(0),@T[0]); # save s[0-3]
672 &mov (&swtmp(1),@T[1]);
673 &mov (&swtmp(2),@T[2]);
674 &mov (&swtmp(3),@T[3]);
675 &Camellia_Feistel($step++);
676 &Camellia_Feistel($step++);
677 &mov (@T[2],&swtmp(2)); # the second round used these two registers as scratch
678 &mov (@T[3],&swtmp(3));
679
680 &mov ($key,&wparam(2));
681 &lea ($key,&DWP(128,$key)); # size optimization
682
683 ####### process KB
684 &_saveround (2,$key,-128,@T); # KB<<<0
685 &_rotl128 (@T,30,10,@T); # KB<<<30
686 &_rotl128 (@T,30,20,@T); # KB<<<(30+30=60)
687 push (@T,shift(@T)); # rotl128(@T,32);
688 &_rotl128 (@T,19,32,@T); # KB<<<(60+32+19=111)
689
690 ####### process KR
691 &_loadround (4,$key,-128,@T); # load KR
692 &_rotl128 (@T,15,4,@T); # KR<<<15
693 &_rotl128 (@T,15,8,@T); # KR<<<(15+15=30)
694 &_rotl128 (@T,30,18,@T); # KR<<<(30+30=60)
695 push (@T,shift(@T)); # rotl128(@T,32);
696 &_rotl128 (@T,2,26,@T); # KR<<<(60+32+2=94)
697
698 ####### process KA
699 &_loadround (6,$key,-128,@T); # load KA
700 &_rotl128 (@T,15,6,@T); # KA<<<15
701 &_rotl128 (@T,30,14,@T); # KA<<<(15+30=45)
702 push (@T,shift(@T)); # rotl128(@T,32);
703 &_rotl128 (@T,0,24,@T); # KA<<<(45+32+0=77)
704 &_rotl128 (@T,17,28,@T); # KA<<<(77+17=94)
705
706 ####### process KL
707 &_loadround (0,$key,-128,@T); # load KL
708 push (@T,shift(@T)); # rotl128(@T,32);
709 &_rotl128 (@T,13,12,@T); # KL<<<(32+13=45)
710 &_rotl128 (@T,15,16,@T); # KL<<<(45+15=60)
711 &_rotl128 (@T,17,22,@T); # KL<<<(60+17=77)
712 push (@T,shift(@T)); # rotl128(@T,32);
713 &_rotl128 (@T,2,30,@T); # KL<<<(77+32+2=111)
714
715 while (@T[0] ne "eax") # restore order
716 { unshift (@T,pop(@T)); }
717
718 &mov ("eax",4); # 4 grandRounds
719&set_label("done");
720 &lea ("edx",&DWP(272-128,$key)); # end of key schedule
721 &stack_pop(4);
722}
723&function_end("Camellia_Ekeygen");
724# Validates the arguments, runs Camellia_Ekeygen and stores grandRounds; returns 0, -1 or -2.
725if ($OPENSSL) {
726# int Camellia_set_key (
727# const unsigned char *userKey,
728# int bits,
729# CAMELLIA_KEY *key)
730&function_begin_B("Camellia_set_key");
731 &push ("ebx");
732 &mov ("ecx",&wparam(0)); # pull arguments
733 &mov ("ebx",&wparam(1));
734 &mov ("edx",&wparam(2));
735# return value -1: userKey or key is NULL
736 &mov ("eax",-1);
737 &test ("ecx","ecx");
738 &jz (&label("done")); # userKey==NULL?
739 &test ("edx","edx");
740 &jz (&label("done")); # key==NULL?
741# return value -2: bits is none of 128, 192 and 256
742 &mov ("eax",-2);
743 &cmp ("ebx",256);
744 &je (&label("arg_ok")); # bits==256?
745 &cmp ("ebx",192);
746 &je (&label("arg_ok")); # bits==192?
747 &cmp ("ebx",128);
748 &jne (&label("done")); # bits!=128?
749&set_label("arg_ok",4);
750# Camellia_Ekeygen takes (keyBitLength,rawKey,keyTable), hence this push order
751 &push ("edx"); # push arguments
752 &push ("ecx");
753 &push ("ebx");
754 &call ("Camellia_Ekeygen");
755 &stack_pop(3);
756
757 # eax holds grandRounds and edx points at where to put it
758 &mov (&DWP(0,"edx"),"eax");
759 &xor ("eax","eax"); # return value 0: success
760&set_label("done",4);
761 &pop ("ebx");
762 &ret ();
763&function_end_B("Camellia_set_key");
764}
765
766@SBOX=( # 256-entry byte table, expanded into 32-bit lookup words by S1110/S4404/S0222/S3033
767112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65,
768 35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189,
769134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26,
770166,225, 57,202,213, 71, 93, 61,217, 1, 90,214, 81, 86,108, 77,
771139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153,
772223, 76,203,194, 52,126,118, 5,109,183,169, 49,209, 23, 4,215,
773 20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34,
774254, 68,207,178,195,181,122,145, 36, 8,232,168, 96,252,105, 80,
775170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210,
776 16,196, 0, 72,163,247,117,219,138, 3,230,218, 9, 63,221,148,
777135, 92,131, 2,205, 74,144, 51,115,103,246,243,157,127,191,226,
778 82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46,
779233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89,
780120,152, 6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250,
781114, 7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164,
782 64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);
783
784sub S1110 { my $s=$SBOX[$_[0]]; return ($s<<24)|($s<<16)|($s<<8); } # byte s in the three upper positions
785sub S4404 { my $n=$_[0]; my $s=$SBOX[(($n<<1)|($n>>7))&0xff]; return ($s<<24)|($s<<16)|$s; } # index rotated left by 1 before lookup
786sub S0222 { my $s=$SBOX[$_[0]]; $s=(($s<<1)|($s>>7))&0xff; return ($s<<16)|($s<<8)|$s; } # looked-up byte rotated left by 1
787sub S3033 { my $s=$SBOX[$_[0]]; $s=(($s>>1)|($s<<7))&0xff; return ($s<<24)|($s<<8)|$s; } # looked-up byte rotated right by 1
788# Constant data: SIGMA words used by Camellia_Ekeygen, then the two interleaved S-box tables.
789&set_label("Camellia_SIGMA",64);
790&data_word(
791 0xa09e667f, 0x3bcc908b, 0xb67ae858, 0x4caa73b2,
792 0xc6ef372f, 0xe94f82be, 0x54ff53a5, 0xf1d36f1c,
793 0x10e527fa, 0xde682d1d, 0xb05688c2, 0xb3e6c1fd,
794 0, 0, 0, 0); # four zero words follow the twelve constants
795&set_label("Camellia_SBOX",64);
796# tables are interleaved, remember?
797for ($i=0;$i<256;$i++) { &data_word(&S1110($i),&S4404($i)); } # 256 8-byte entries, offsets 0 and 4
798for ($i=0;$i<256;$i++) { &data_word(&S0222($i),&S3033($i)); } # next 256 entries, offsets 2048 and 2052
799
800# void Camellia_cbc_encrypt (const unsigned char *inp, unsigned char *out,
801# size_t length, const CAMELLIA_KEY *key,
802# unsigned char *ivp,const int enc);
803{
804# stack frame layout
805# -4(%esp) # return address 0(%esp)
806# 0(%esp) # s0 4(%esp)
807# 4(%esp) # s1 8(%esp)
808# 8(%esp) # s2 12(%esp)
809# 12(%esp) # s3 16(%esp)
810# 16(%esp) # end of key schedule 20(%esp)
811# 20(%esp) # %esp backup
812my $_inp=&DWP(24,"esp"); #copy of wparam(0)
813my $_out=&DWP(28,"esp"); #copy of wparam(1)
814my $_len=&DWP(32,"esp"); #copy of wparam(2)
815my $_key=&DWP(36,"esp"); #copy of wparam(3)
816my $_ivp=&DWP(40,"esp"); #copy of wparam(4)
817my $ivec=&DWP(44,"esp"); #ivec[16]
818my $_tmp=&DWP(44,"esp"); #volatile variable [yes, aliases with ivec]
819my ($s0,$s1,$s2,$s3) = @T;
820
821&function_begin("Camellia_cbc_encrypt");
822 &mov ($s2 eq "ecx"? $s2 : "",&wparam(2)); # load len
823 &cmp ($s2,0);
824 &je (&label("enc_out"));
825
826 &pushf ();
827 &cld ();
828
829 &mov ($s0,&wparam(0)); # load inp
830 &mov ($s1,&wparam(1)); # load out
831 #&mov ($s2,&wparam(2)); # load len
832 &mov ($s3,&wparam(3)); # load key
833 &mov ($Tbl,&wparam(4)); # load ivp
834
835 # allocate aligned stack frame...
836 &lea ($idx,&DWP(-64,"esp"));
837 &and ($idx,-64);
838
839 # place stack frame just "above mod 1024" the key schedule
840 # this ensures that cache associativity of 2 suffices
841 &lea ($key,&DWP(-64-63,$s3));
842 &sub ($key,$idx);
843 &neg ($key);
844 &and ($key,0x3C0); # modulo 1024, but aligned to cache-line
845 &sub ($idx,$key);
846
847 &mov ($key,&wparam(5)); # load enc
848
849 &exch ("esp",$idx);
850 &add ("esp",4); # reserve for return address!
851 &mov ($_esp,$idx); # save %esp
852
853 &mov ($_inp,$s0); # save copy of inp
854 &mov ($_out,$s1); # save copy of out
855 &mov ($_len,$s2); # save copy of len
856 &mov ($_key,$s3); # save copy of key
857 &mov ($_ivp,$Tbl); # save copy of ivp
858
859 &call (&label("pic_point")); # make it PIC!
860 &set_label("pic_point");
861 &blindpop($Tbl);
862 &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
863
864 &mov ($idx,32);
865 &set_label("prefetch_sbox",4);
866 &mov ($s0,&DWP(0,$Tbl));
867 &mov ($s1,&DWP(32,$Tbl));
868 &mov ($s2,&DWP(64,$Tbl));
869 &mov ($s3,&DWP(96,$Tbl));
870 &lea ($Tbl,&DWP(128,$Tbl));
871 &dec ($idx);
872 &jnz (&label("prefetch_sbox"));
873 &mov ($s0,$_key);
874 &sub ($Tbl,4096);
875 &mov ($idx,$_inp);
876 &mov ($s3,&DWP(272,$s0)); # load grandRounds
877
878 &cmp ($key,0);
879 &je (&label("DECRYPT"));
880
881 &mov ($s2,$_len);
882 &mov ($key,$_ivp);
883 &shl ($s3,6);
884 &lea ($s3,&DWP(0,$s0,$s3));
885 &mov ($_end,$s3);
886
887 &test ($s2,0xFFFFFFF0);
888 &jz (&label("enc_tail")); # short input...
889
890 &mov ($s0,&DWP(0,$key)); # load iv
891 &mov ($s1,&DWP(4,$key));
892
893 &set_label("enc_loop",4);
894 &mov ($s2,&DWP(8,$key));
895 &mov ($s3,&DWP(12,$key));
896
897 &xor ($s0,&DWP(0,$idx)); # xor input data
898 &xor ($s1,&DWP(4,$idx));
899 &xor ($s2,&DWP(8,$idx));
900 &bswap ($s0);
901 &xor ($s3,&DWP(12,$idx));
902 &bswap ($s1);
903 &mov ($key,$_key); # load key
904 &bswap ($s2);
905 &bswap ($s3);
906
907 &call ("_x86_Camellia_encrypt");
908
909 &mov ($idx,$_inp); # load inp
910 &mov ($key,$_out); # load out
911
912 &bswap ($s0);
913 &bswap ($s1);
914 &bswap ($s2);
915 &mov (&DWP(0,$key),$s0); # save output data
916 &bswap ($s3);
917 &mov (&DWP(4,$key),$s1);
918 &mov (&DWP(8,$key),$s2);
919 &mov (&DWP(12,$key),$s3);
920
921 &mov ($s2,$_len); # load len
922
923 &lea ($idx,&DWP(16,$idx));
924 &mov ($_inp,$idx); # save inp
925
926 &lea ($s3,&DWP(16,$key));
927 &mov ($_out,$s3); # save out
928
929 &sub ($s2,16);
930 &test ($s2,0xFFFFFFF0);
931 &mov ($_len,$s2); # save len
932 &jnz (&label("enc_loop"));
933 &test ($s2,15);
934 &jnz (&label("enc_tail"));
935 &mov ($idx,$_ivp); # load ivp
936 &mov ($s2,&DWP(8,$key)); # restore last dwords
937 &mov ($s3,&DWP(12,$key));
938 &mov (&DWP(0,$idx),$s0); # save ivec
939 &mov (&DWP(4,$idx),$s1);
940 &mov (&DWP(8,$idx),$s2);
941 &mov (&DWP(12,$idx),$s3);
942
943 &mov ("esp",$_esp);
944 &popf ();
945 &set_label("enc_out");
946 &function_end_A();
947 &pushf (); # kludge, never executed
948
949 &set_label("enc_tail",4);
950 &mov ($s0,$key eq "edi" ? $key : "");
951 &mov ($key,$_out); # load out
952 &push ($s0); # push ivp
953 &mov ($s1,16);
954 &sub ($s1,$s2);
955 &cmp ($key,$idx); # compare with inp
956 &je (&label("enc_in_place"));
957 &align (4);
958 &data_word(0xA4F3F689); # rep movsb # copy input
959 &jmp (&label("enc_skip_in_place"));
960 &set_label("enc_in_place");
961 &lea ($key,&DWP(0,$key,$s2));
962 &set_label("enc_skip_in_place");
963 &mov ($s2,$s1);
964 &xor ($s0,$s0);
965 &align (4);
966 &data_word(0xAAF3F689); # rep stosb # zero tail
967 &pop ($key); # pop ivp
968
969 &mov ($idx,$_out); # output as input
970 &mov ($s0,&DWP(0,$key));
971 &mov ($s1,&DWP(4,$key));
972 &mov ($_len,16); # len=16
973 &jmp (&label("enc_loop")); # one more spin...
974
975#----------------------------- DECRYPT -----------------------------#
976&set_label("DECRYPT",16);
977 &shl ($s3,6);
978 &lea ($s3,&DWP(0,$s0,$s3));
979 &mov ($_end,$s0);
980 &mov ($_key,$s3);
981
982 &cmp ($idx,$_out);
983 &je (&label("dec_in_place")); # in-place processing...
984
985 &mov ($key,$_ivp); # load ivp
986 &mov ($_tmp,$key);
987
988 &set_label("dec_loop",4);
989 &mov ($s0,&DWP(0,$idx)); # read input
990 &mov ($s1,&DWP(4,$idx));
991 &mov ($s2,&DWP(8,$idx));
992 &bswap ($s0);
993 &mov ($s3,&DWP(12,$idx));
994 &bswap ($s1);
995 &mov ($key,$_key); # load key
996 &bswap ($s2);
997 &bswap ($s3);
998
999 &call ("_x86_Camellia_decrypt");
1000
1001 &mov ($key,$_tmp); # load ivp
1002 &mov ($idx,$_len); # load len
1003
1004 &bswap ($s0);
1005 &bswap ($s1);
1006 &bswap ($s2);
1007 &xor ($s0,&DWP(0,$key)); # xor iv
1008 &bswap ($s3);
1009 &xor ($s1,&DWP(4,$key));
1010 &xor ($s2,&DWP(8,$key));
1011 &xor ($s3,&DWP(12,$key));
1012
1013 &sub ($idx,16);
1014 &jc (&label("dec_partial"));
1015 &mov ($_len,$idx); # save len
1016 &mov ($idx,$_inp); # load inp
1017 &mov ($key,$_out); # load out
1018
1019 &mov (&DWP(0,$key),$s0); # write output
1020 &mov (&DWP(4,$key),$s1);
1021 &mov (&DWP(8,$key),$s2);
1022 &mov (&DWP(12,$key),$s3);
1023
1024 &mov ($_tmp,$idx); # save ivp
1025 &lea ($idx,&DWP(16,$idx));
1026 &mov ($_inp,$idx); # save inp
1027
1028 &lea ($key,&DWP(16,$key));
1029 &mov ($_out,$key); # save out
1030
1031 &jnz (&label("dec_loop"));
1032 &mov ($key,$_tmp); # load temp ivp
1033 &set_label("dec_end");
1034 &mov ($idx,$_ivp); # load user ivp
1035 &mov ($s0,&DWP(0,$key)); # load iv
1036 &mov ($s1,&DWP(4,$key));
1037 &mov ($s2,&DWP(8,$key));
1038 &mov ($s3,&DWP(12,$key));
1039 &mov (&DWP(0,$idx),$s0); # copy back to user
1040 &mov (&DWP(4,$idx),$s1);
1041 &mov (&DWP(8,$idx),$s2);
1042 &mov (&DWP(12,$idx),$s3);
1043 &jmp (&label("dec_out"));
1044
1045 &set_label("dec_partial",4);
1046 &lea ($key,$ivec);
1047 &mov (&DWP(0,$key),$s0); # dump output to stack
1048 &mov (&DWP(4,$key),$s1);
1049 &mov (&DWP(8,$key),$s2);
1050 &mov (&DWP(12,$key),$s3);
1051 &lea ($s2 eq "ecx" ? $s2 : "",&DWP(16,$idx));
1052 &mov ($idx eq "esi" ? $idx : "",$key);
1053 &mov ($key eq "edi" ? $key : "",$_out); # load out
1054 &data_word(0xA4F3F689); # rep movsb # copy output
1055 &mov ($key,$_inp); # use inp as temp ivp
1056 &jmp (&label("dec_end"));
1057
1058 &set_label("dec_in_place",4);
1059 &set_label("dec_in_place_loop");
1060 &lea ($key,$ivec);
1061 &mov ($s0,&DWP(0,$idx)); # read input
1062 &mov ($s1,&DWP(4,$idx));
1063 &mov ($s2,&DWP(8,$idx));
1064 &mov ($s3,&DWP(12,$idx));
1065
1066 &mov (&DWP(0,$key),$s0); # copy to temp
1067 &mov (&DWP(4,$key),$s1);
1068 &mov (&DWP(8,$key),$s2);
1069 &bswap ($s0);
1070 &mov (&DWP(12,$key),$s3);
1071 &bswap ($s1);
1072 &mov ($key,$_key); # load key
1073 &bswap ($s2);
1074 &bswap ($s3);
1075
1076 &call ("_x86_Camellia_decrypt");
1077
1078 &mov ($key,$_ivp); # load ivp
1079 &mov ($idx,$_out); # load out
1080
1081 &bswap ($s0);
1082 &bswap ($s1);
1083 &bswap ($s2);
1084 &xor ($s0,&DWP(0,$key)); # xor iv
1085 &bswap ($s3);
1086 &xor ($s1,&DWP(4,$key));
1087 &xor ($s2,&DWP(8,$key));
1088 &xor ($s3,&DWP(12,$key));
1089
1090 &mov (&DWP(0,$idx),$s0); # write output
1091 &mov (&DWP(4,$idx),$s1);
1092 &mov (&DWP(8,$idx),$s2);
1093 &mov (&DWP(12,$idx),$s3);
1094
1095 &lea ($idx,&DWP(16,$idx));
1096 &mov ($_out,$idx); # save out
1097
1098 &lea ($idx,$ivec);
1099 &mov ($s0,&DWP(0,$idx)); # read temp
1100 &mov ($s1,&DWP(4,$idx));
1101 &mov ($s2,&DWP(8,$idx));
1102 &mov ($s3,&DWP(12,$idx));
1103
1104 &mov (&DWP(0,$key),$s0); # copy iv
1105 &mov (&DWP(4,$key),$s1);
1106 &mov (&DWP(8,$key),$s2);
1107 &mov (&DWP(12,$key),$s3);
1108
1109 &mov ($idx,$_inp); # load inp
1110
1111 &lea ($idx,&DWP(16,$idx));
1112 &mov ($_inp,$idx); # save inp
1113
1114 &mov ($s2,$_len); # load len
1115 &sub ($s2,16);
1116 &jc (&label("dec_in_place_partial"));
1117 &mov ($_len,$s2); # save len
1118 &jnz (&label("dec_in_place_loop"));
1119 &jmp (&label("dec_out"));
1120
1121 &set_label("dec_in_place_partial",4);
1122 # one can argue if this is actually required...
1123 &mov ($key eq "edi" ? $key : "",$_out);
1124 &lea ($idx eq "esi" ? $idx : "",$ivec);
1125 &lea ($key,&DWP(0,$key,$s2));
1126 &lea ($idx,&DWP(16,$idx,$s2));
1127 &neg ($s2 eq "ecx" ? $s2 : "");
1128 &data_word(0xA4F3F689); # rep movsb # restore tail
1129
1130 &set_label("dec_out",4);
1131 &mov ("esp",$_esp);
1132 &popf ();
1133&function_end("Camellia_cbc_encrypt");
1134}
1135
1136&asciz("Camellia for x86 by <appro@openssl.org>");
1137
1138&asm_finish();
diff --git a/src/lib/libcrypto/camellia/asm/cmll-x86_64.pl b/src/lib/libcrypto/camellia/asm/cmll-x86_64.pl
new file mode 100644
index 0000000000..c683646ca7
--- /dev/null
+++ b/src/lib/libcrypto/camellia/asm/cmll-x86_64.pl
@@ -0,0 +1,1080 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
5#
6# This module may be used under the terms of either the GNU General
7# Public License version 2 or later, the GNU Lesser General Public
8# License version 2.1 or later, the Mozilla Public License version
9# 1.1 or the BSD License. The exact terms of either license are
10# distributed along with this module. For further details see
11# http://www.openssl.org/~appro/camellia/.
12# ====================================================================
13
14# Performance in cycles per processed byte (less is better) in
15# 'openssl speed ...' benchmark:
16#
17# AMD64 Core2 EM64T
18# -evp camellia-128-ecb 16.7 21.0 22.7
19# + over gcc 3.4.6 +25% +5% 0%
20#
21# camellia-128-cbc 15.7 20.4 21.1
22#
23# 128-bit key setup 128 216 205 cycles/key
24# + over gcc 3.4.6 +54% +39% +15%
25#
26# Numbers in "+" rows represent performance improvement over compiler
27# generated code. Key setup timings are impressive on AMD and Core2
28# thanks to 64-bit operations being covertly deployed. Improvement on
29# EM64T, pre-Core2 Intel x86_64 CPU, is not as impressive, because it
30# apparently emulates some of 64-bit operations in [32-bit] microcode.
31
# Command line: [flavour] [output].  A lone argument containing a dot is
# taken to be the output file name and the flavour is left undefined.
$flavour = shift;
$output = shift;
if (defined($flavour) && $flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Directory part of this script's own path (with its trailing separator),
# or "" when $0 has none.  The capture is only used if the match succeeded.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed from here on is piped through the translator.
# Quote the interpreter, translator and output paths so that directories
# containing spaces survive the shell, and fail loudly if the pipe cannot
# be set up instead of silently generating nothing.
{
    my $flavour_arg = defined($flavour) ? $flavour : "";
    my $output_arg = (defined($output) && $output ne "") ? "\"$output\"" : "";
    open STDOUT,"| \"$^X\" \"$xlate\" $flavour_arg $output_arg"
	or die "can't call $xlate: $!";
}
44
# Map a legacy register name to its high-byte alias, e.g. "%eax" or
# "%rax" -> "%ah".  Only %[er][a-d]x is rewritten; any other operand
# string comes back unchanged.
#
# No "()" prototype: these subs take one argument, and an empty prototype
# only went unnoticed because every caller used the &name(...) form.
sub hi {
    my $r=shift;
    $r =~ s/%[er]([a-d])x/%${1}h/;
    return $r;
}

# Map a register name to its low-byte alias:
#   %eax/%rax -> %al,  %esi/%rsi -> %sil,  %r8/%r8d -> %r8b
sub lo {
    my $r=shift;
    $r =~ s/%[er]([a-d])x/%${1}l/;
    $r =~ s/%[er]([sd]i)/%${1}l/;
    $r =~ s/%(r[0-9]+)[d]?/%${1}b/;
    return $r;
}
49
# Register allocation shared by all code generators below.
($t0,$t1,$t2,$t3)=("%eax","%ebx","%ecx","%edx");	# round temporaries
@S=qw(%r8d %r9d %r10d %r11d);				# four 32-bit state words
($i0,$i1)=("%esi","%edi");				# table index registers
$Tbl="%rbp";						# size optimization
($inp,$out,$key,$keyend)=("%r12","%r13","%r14","%r15");
# 32-bit name of the register carrying the first argument; it differs
# when targeting Win64.
if ($win64)	{ $arg0d="%ecx"; }
else		{ $arg0d="%edi"; }

# const unsigned int Camellia_SBOX[4][256];
# Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
# and [2][] - with [3][]. This is done to minimize code size.
# The values below are byte displacements from $Tbl.
($SBOX1_1110,$SBOX4_4404)=(0,4);			# Camellia_SBOX[0], [1]
($SBOX2_0222,$SBOX3_3033)=(2048,2052);			# Camellia_SBOX[2], [3]
68
# Append the code for one Camellia Feistel round to $code.
#
#   $_[0]  round index; its parity selects which half of @S is the
#          round input (s0,s1) and which half gets updated (s2,s3)
#   $_[1]  optional displacement of the round keys relative to $key
#          (default 0); a negative value also makes the key offsets
#          step backwards, 8 bytes per round
#
# The emitted code expects the current round key to be in $t0/$t1
# already, and ends by loading the key for round i+1 into them.
sub Camellia_Feistel {
my $i=$_[0];
my $seed=defined($_[1])?$_[1]:0;
my $scale=$seed<0?-8:8;
my $j=($i&1)*2;
# Declare all four in one list: "my $s0=...,$s1=...,..." makes only $s0
# lexical and silently assigns the other three as package globals.
my ($s0,$s1,$s2,$s3)=@S[($j)%4,($j+1)%4,($j+2)%4,($j+3)%4];

$code.=<<___;
 xor $s0,$t0 # t0^=key[0]
 xor $s1,$t1 # t1^=key[1]
 movz `&hi("$t0")`,$i0 # (t0>>8)&0xff
 movz `&lo("$t1")`,$i1 # (t1>>0)&0xff
 mov $SBOX3_3033($Tbl,$i0,8),$t3 # t3=SBOX3_3033[0]
 mov $SBOX1_1110($Tbl,$i1,8),$t2 # t2=SBOX1_1110[1]
 movz `&lo("$t0")`,$i0 # (t0>>0)&0xff
 shr \$16,$t0
 movz `&hi("$t1")`,$i1 # (t1>>8)&0xff
 xor $SBOX4_4404($Tbl,$i0,8),$t3 # t3^=SBOX4_4404[0]
 shr \$16,$t1
 xor $SBOX4_4404($Tbl,$i1,8),$t2 # t2^=SBOX4_4404[1]
 movz `&hi("$t0")`,$i0 # (t0>>24)&0xff
 movz `&lo("$t1")`,$i1 # (t1>>16)&0xff
 xor $SBOX1_1110($Tbl,$i0,8),$t3 # t3^=SBOX1_1110[0]
 xor $SBOX3_3033($Tbl,$i1,8),$t2 # t2^=SBOX3_3033[1]
 movz `&lo("$t0")`,$i0 # (t0>>16)&0xff
 movz `&hi("$t1")`,$i1 # (t1>>24)&0xff
 xor $SBOX2_0222($Tbl,$i0,8),$t3 # t3^=SBOX2_0222[0]
 xor $SBOX2_0222($Tbl,$i1,8),$t2 # t2^=SBOX2_0222[1]
 mov `$seed+($i+1)*$scale`($key),$t1 # prefetch key[i+1]
 mov `$seed+($i+1)*$scale+4`($key),$t0
 xor $t3,$t2 # t2^=t3
 ror \$8,$t3 # t3=RightRotate(t3,8)
 xor $t2,$s2
 xor $t2,$s3
 xor $t3,$s3
___
}
106
107# void Camellia_EncryptBlock_Rounds(
108# int grandRounds,
109# const Byte plaintext[],
110# const KEY_TABLE_TYPE keyTable,
111# Byte ciphertext[])
112$code=<<___;
113.text
114
115# V1.x API
116.globl Camellia_EncryptBlock
117.type Camellia_EncryptBlock,\@abi-omnipotent
118.align 16
119Camellia_EncryptBlock:
120 movl \$128,%eax
121 subl $arg0d,%eax
122 movl \$3,$arg0d
123 adcl \$0,$arg0d # keyBitLength==128?3:4
124 jmp .Lenc_rounds
125.size Camellia_EncryptBlock,.-Camellia_EncryptBlock
126# V2
127.globl Camellia_EncryptBlock_Rounds
128.type Camellia_EncryptBlock_Rounds,\@function,4
129.align 16
130.Lenc_rounds:
131Camellia_EncryptBlock_Rounds:
132 push %rbx
133 push %rbp
134 push %r13
135 push %r14
136 push %r15
137.Lenc_prologue:
138
139 #mov %rsi,$inp # put away arguments
140 mov %rcx,$out
141 mov %rdx,$key
142
143 shl \$6,%edi # process grandRounds
144 lea .LCamellia_SBOX(%rip),$Tbl
145 lea ($key,%rdi),$keyend
146
147 mov 0(%rsi),@S[0] # load plaintext
148 mov 4(%rsi),@S[1]
149 mov 8(%rsi),@S[2]
150 bswap @S[0]
151 mov 12(%rsi),@S[3]
152 bswap @S[1]
153 bswap @S[2]
154 bswap @S[3]
155
156 call _x86_64_Camellia_encrypt
157
158 bswap @S[0]
159 bswap @S[1]
160 bswap @S[2]
161 mov @S[0],0($out)
162 bswap @S[3]
163 mov @S[1],4($out)
164 mov @S[2],8($out)
165 mov @S[3],12($out)
166
167 mov 0(%rsp),%r15
168 mov 8(%rsp),%r14
169 mov 16(%rsp),%r13
170 mov 24(%rsp),%rbp
171 mov 32(%rsp),%rbx
172 lea 40(%rsp),%rsp
173.Lenc_epilogue:
174 ret
175.size Camellia_EncryptBlock_Rounds,.-Camellia_EncryptBlock_Rounds
176
177.type _x86_64_Camellia_encrypt,\@abi-omnipotent
178.align 16
179_x86_64_Camellia_encrypt:
180 xor 0($key),@S[1]
181 xor 4($key),@S[0] # ^=key[0-3]
182 xor 8($key),@S[3]
183 xor 12($key),@S[2]
184.align 16
185.Leloop:
186 mov 16($key),$t1 # prefetch key[4-5]
187 mov 20($key),$t0
188
189___
190 for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16); }
191$code.=<<___;
192 lea 16*4($key),$key
193 cmp $keyend,$key
194 mov 8($key),$t3 # prefetch key[2-3]
195 mov 12($key),$t2
196 je .Ledone
197
198 and @S[0],$t0
199 or @S[3],$t3
200 rol \$1,$t0
201 xor $t3,@S[2] # s2^=s3|key[3];
202 xor $t0,@S[1] # s1^=LeftRotate(s0&key[0],1);
203 and @S[2],$t2
204 or @S[1],$t1
205 rol \$1,$t2
206 xor $t1,@S[0] # s0^=s1|key[1];
207 xor $t2,@S[3] # s3^=LeftRotate(s2&key[2],1);
208 jmp .Leloop
209
210.align 16
211.Ledone:
212 xor @S[2],$t0 # SwapHalf
213 xor @S[3],$t1
214 xor @S[0],$t2
215 xor @S[1],$t3
216
217 mov $t0,@S[0]
218 mov $t1,@S[1]
219 mov $t2,@S[2]
220 mov $t3,@S[3]
221
222 .byte 0xf3,0xc3 # rep ret
223.size _x86_64_Camellia_encrypt,.-_x86_64_Camellia_encrypt
224
225# V1.x API
226.globl Camellia_DecryptBlock
227.type Camellia_DecryptBlock,\@abi-omnipotent
228.align 16
229Camellia_DecryptBlock:
230 movl \$128,%eax
231 subl $arg0d,%eax
232 movl \$3,$arg0d
233 adcl \$0,$arg0d # keyBitLength==128?3:4
234 jmp .Ldec_rounds
235.size Camellia_DecryptBlock,.-Camellia_DecryptBlock
236# V2
237.globl Camellia_DecryptBlock_Rounds
238.type Camellia_DecryptBlock_Rounds,\@function,4
239.align 16
240.Ldec_rounds:
241Camellia_DecryptBlock_Rounds:
242 push %rbx
243 push %rbp
244 push %r13
245 push %r14
246 push %r15
247.Ldec_prologue:
248
249 #mov %rsi,$inp # put away arguments
250 mov %rcx,$out
251 mov %rdx,$keyend
252
253 shl \$6,%edi # process grandRounds
254 lea .LCamellia_SBOX(%rip),$Tbl
255 lea ($keyend,%rdi),$key
256
257 mov 0(%rsi),@S[0] # load plaintext
258 mov 4(%rsi),@S[1]
259 mov 8(%rsi),@S[2]
260 bswap @S[0]
261 mov 12(%rsi),@S[3]
262 bswap @S[1]
263 bswap @S[2]
264 bswap @S[3]
265
266 call _x86_64_Camellia_decrypt
267
268 bswap @S[0]
269 bswap @S[1]
270 bswap @S[2]
271 mov @S[0],0($out)
272 bswap @S[3]
273 mov @S[1],4($out)
274 mov @S[2],8($out)
275 mov @S[3],12($out)
276
277 mov 0(%rsp),%r15
278 mov 8(%rsp),%r14
279 mov 16(%rsp),%r13
280 mov 24(%rsp),%rbp
281 mov 32(%rsp),%rbx
282 lea 40(%rsp),%rsp
283.Ldec_epilogue:
284 ret
285.size Camellia_DecryptBlock_Rounds,.-Camellia_DecryptBlock_Rounds
286
287.type _x86_64_Camellia_decrypt,\@abi-omnipotent
288.align 16
289_x86_64_Camellia_decrypt:
290 xor 0($key),@S[1]
291 xor 4($key),@S[0] # ^=key[0-3]
292 xor 8($key),@S[3]
293 xor 12($key),@S[2]
294.align 16
295.Ldloop:
296 mov -8($key),$t1 # prefetch key[4-5]
297 mov -4($key),$t0
298
299___
300 for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8); }
301$code.=<<___;
302 lea -16*4($key),$key
303 cmp $keyend,$key
304 mov 0($key),$t3 # prefetch key[2-3]
305 mov 4($key),$t2
306 je .Lddone
307
308 and @S[0],$t0
309 or @S[3],$t3
310 rol \$1,$t0
311 xor $t3,@S[2] # s2^=s3|key[3];
312 xor $t0,@S[1] # s1^=LeftRotate(s0&key[0],1);
313 and @S[2],$t2
314 or @S[1],$t1
315 rol \$1,$t2
316 xor $t1,@S[0] # s0^=s1|key[1];
317 xor $t2,@S[3] # s3^=LeftRotate(s2&key[2],1);
318
319 jmp .Ldloop
320
321.align 16
322.Lddone:
323 xor @S[2],$t2
324 xor @S[3],$t3
325 xor @S[0],$t0
326 xor @S[1],$t1
327
328 mov $t2,@S[0] # SwapHalf
329 mov $t3,@S[1]
330 mov $t0,@S[2]
331 mov $t1,@S[3]
332
333 .byte 0xf3,0xc3 # rep ret
334.size _x86_64_Camellia_decrypt,.-_x86_64_Camellia_decrypt
335___
336
# Append stores of key material to $code.
#
#   _saveround($rnd, $key, [$bias,] @regs)
#
#   $rnd   index of the first 8-byte key-table slot to write
#   $key   register holding the key-table pointer
#   $bias  optional integer displacement added to every offset
#   @regs  exactly four registers: stored 4 bytes apart with each pair
#          swapped; otherwise the first one or two registers are stored
#          8 bytes apart
sub _saveround {
my ($rnd,$key,@T)=@_;
# A leading integer is the bias.  Look at its spelling rather than using
# int(): int("%rax") only works by numifying a register name to 0, and
# it cannot tell an explicit bias of 0 from a missing one.
my $bias=(@T && $T[0] =~ /^[-+]?\d+$/)?shift(@T):0;

	if ($#T==3) {
	$code.=<<___;
 mov $T[1],`$bias+$rnd*8+0`($key)
 mov $T[0],`$bias+$rnd*8+4`($key)
 mov $T[3],`$bias+$rnd*8+8`($key)
 mov $T[2],`$bias+$rnd*8+12`($key)
___
	} else {
	$code.=" mov $T[0],`$bias+$rnd*8+0`($key)\n";
	$code.=" mov $T[1],`$bias+$rnd*8+8`($key)\n" if ($#T>=1);
	}
}
353
# Append loads of key material to $code.
#
#   _loadround($rnd, $key, [$bias,] @regs)
#
#   $rnd   index of the first 8-byte key-table slot to read
#   $key   register holding the key-table pointer
#   $bias  optional integer displacement added to every offset
#   @regs  one or two destination registers, loaded 8 bytes apart
sub _loadround {
my ($rnd,$key,@T)=@_;
# A leading integer is the bias; recognise it by its spelling so that
# register names are never numified and an explicit 0 is accepted.
my $bias=(@T && $T[0] =~ /^[-+]?\d+$/)?shift(@T):0;

$code.=" mov `$bias+$rnd*8+0`($key),$T[0]\n";
$code.=" mov `$bias+$rnd*8+8`($key),$T[1]\n" if ($#T>=1);
}
361
362# shld is very slow on Intel EM64T family. Even on AMD it limits
363# instruction decode rate [because it's VectorPath] and consequently
364# performance...
# shld-based variant: rotate the 128-bit value held in the register pair
# ($first,$second) left by $count bits, using %r11 as scratch.
# A zero count emits nothing.
sub __rotl128 {
my ($first,$second,$count)=@_;

	return unless ($count);

	$code.=join("",
		" mov $first,%r11\n",
		" shld \$$count,$second,$first\n",
		" shld \$$count,%r11,$second\n");
}
376
# ... Implementing 128-bit rotate without shld gives 80% better
# performance on EM64T, +15% on AMD64 and only ~7% degradation on
# Core2. This is therefore preferred.
# shift/or variant: rotate the 128-bit value held in the register pair
# ($first,$second) left by $count bits.  %r9 and %r11 are used as
# scratch.  A zero count emits nothing.
sub _rotl128 {
my ($first,$second,$count)=@_;

	return unless ($count);

	# The `64-count` expressions are left in backticks on purpose; they
	# are evaluated together with all the others once $code is complete.
	$code.=join("",
		" mov $first,%r11\n",
		" shl \$$count,$first\n",
		" mov $second,%r9\n",
		" shr \$`64-$count`,%r9\n",
		" shr \$`64-$count`,%r11\n",
		" or %r9,$first\n",
		" shl \$$count,$second\n",
		" or %r11,$second\n");
}
396
397{ my $step=0;
398
399$code.=<<___;
400.globl Camellia_Ekeygen
401.type Camellia_Ekeygen,\@function,3
402.align 16
403Camellia_Ekeygen:
404 push %rbx
405 push %rbp
406 push %r13
407 push %r14
408 push %r15
409.Lkey_prologue:
410
411 mov %rdi,$keyend # put away arguments, keyBitLength
412 mov %rdx,$out # keyTable
413
414 mov 0(%rsi),@S[0] # load 0-127 bits
415 mov 4(%rsi),@S[1]
416 mov 8(%rsi),@S[2]
417 mov 12(%rsi),@S[3]
418
419 bswap @S[0]
420 bswap @S[1]
421 bswap @S[2]
422 bswap @S[3]
423___
424 &_saveround (0,$out,@S); # KL<<<0
425$code.=<<___;
426 cmp \$128,$keyend # check keyBitLength
427 je .L1st128
428
429 mov 16(%rsi),@S[0] # load 128-191 bits
430 mov 20(%rsi),@S[1]
431 cmp \$192,$keyend
432 je .L1st192
433 mov 24(%rsi),@S[2] # load 192-255 bits
434 mov 28(%rsi),@S[3]
435 jmp .L1st256
436.L1st192:
437 mov @S[0],@S[2]
438 mov @S[1],@S[3]
439 not @S[2]
440 not @S[3]
441.L1st256:
442 bswap @S[0]
443 bswap @S[1]
444 bswap @S[2]
445 bswap @S[3]
446___
447 &_saveround (4,$out,@S); # temp storage for KR!
448$code.=<<___;
449 xor 0($out),@S[1] # KR^KL
450 xor 4($out),@S[0]
451 xor 8($out),@S[3]
452 xor 12($out),@S[2]
453
454.L1st128:
455 lea .LCamellia_SIGMA(%rip),$key
456 lea .LCamellia_SBOX(%rip),$Tbl
457
458 mov 0($key),$t1
459 mov 4($key),$t0
460___
461 &Camellia_Feistel($step++);
462 &Camellia_Feistel($step++);
463$code.=<<___;
464 xor 0($out),@S[1] # ^KL
465 xor 4($out),@S[0]
466 xor 8($out),@S[3]
467 xor 12($out),@S[2]
468___
469 &Camellia_Feistel($step++);
470 &Camellia_Feistel($step++);
471$code.=<<___;
472 cmp \$128,$keyend
473 jne .L2nd256
474
475 lea 128($out),$out # size optimization
476 shl \$32,%r8 # @S[0]||
477 shl \$32,%r10 # @S[2]||
478 or %r9,%r8 # ||@S[1]
479 or %r11,%r10 # ||@S[3]
480___
# 128-bit key schedule.  KL is reloaded into %rax:%rbx and KA sits in
# %r8:%r10; each table row rotates one of them further left and stores
# the result (or just one half of it) into the given key-table slot.
{
    my %reg=(KL=>["%rax","%rbx"], KA=>["%r8","%r10"]);

    _loadround (0,$out,-128,@{$reg{KL}});	# KL
    _saveround (2,$out,-128,@{$reg{KA}});	# KA<<<0

    foreach my $row (
	# value, rotate by, slot, registers stored (default: both halves)
	[ "KL", 15,  4 ],		# KL<<<15
	[ "KA", 15,  6 ],		# KA<<<15
	[ "KA", 15,  8 ],		# KA<<<30	15+15
	[ "KL", 30, 10 ],		# KL<<<45	15+30
	[ "KA", 15, 12, "%r8" ],	# KA<<<45	30+15
	[ "KL", 15, 13, "%rbx" ],	# KL<<<60	45+15
	[ "KA", 15, 14 ],		# KA<<<60	45+15
	[ "KL", 17, 16 ],		# KL<<<77	60+17
	[ "KL", 17, 18 ],		# KL<<<94	77+17
	[ "KA", 34, 20 ],		# KA<<<94	60+34
	[ "KL", 17, 22 ],		# KL<<<111	94+17
	[ "KA", 17, 24 ],		# KA<<<111	94+17
    ) {
	my ($value,$rot,$slot,@stored)=@$row;
	my @pair=@{$reg{$value}};
	@stored=@pair unless (@stored);
	_rotl128 (@pair,$rot);
	_saveround ($slot,$out,-128,@stored);
    }
}
507$code.=<<___;
508 mov \$3,%eax
509 jmp .Ldone
510.align 16
511.L2nd256:
512___
513 &_saveround (6,$out,@S); # temp storage for KA!
514$code.=<<___;
515 xor `4*8+0`($out),@S[1] # KA^KR
516 xor `4*8+4`($out),@S[0]
517 xor `5*8+0`($out),@S[3]
518 xor `5*8+4`($out),@S[2]
519___
520 &Camellia_Feistel($step++);
521 &Camellia_Feistel($step++);
522
523 &_loadround (0,$out,"%rax","%rbx"); # KL
524 &_loadround (4,$out,"%rcx","%rdx"); # KR
525 &_loadround (6,$out,"%r14","%r15"); # KA
526$code.=<<___;
527 lea 128($out),$out # size optimization
528 shl \$32,%r8 # @S[0]||
529 shl \$32,%r10 # @S[2]||
530 or %r9,%r8 # ||@S[1]
531 or %r11,%r10 # ||@S[3]
532___
# 192/256-bit key schedule.  KL is in %rax:%rbx, KR in %rcx:%rdx, KA in
# %r14:%r15 and KB in %r8:%r10.  Each value is rotated further left in
# place; the right-hand comments give the rotation accumulated so far.
	_saveround (2,$out,-128,"%r8","%r10");		# KB<<<0
	_rotl128 ("%rcx","%rdx",15);
	_saveround (4,$out,-128,"%rcx","%rdx");		# KR<<<15
	_rotl128 ("%r14","%r15",15);
	_saveround (6,$out,-128,"%r14","%r15");		# KA<<<15
	_rotl128 ("%rcx","%rdx",15);			# 15+15=30
	_saveround (8,$out,-128,"%rcx","%rdx");		# KR<<<30
	_rotl128 ("%r8","%r10",30);
	_saveround (10,$out,-128,"%r8","%r10");		# KB<<<30
	_rotl128 ("%rax","%rbx",45);
	_saveround (12,$out,-128,"%rax","%rbx");	# KL<<<45
	_rotl128 ("%r14","%r15",30);			# 15+30=45
	_saveround (14,$out,-128,"%r14","%r15");	# KA<<<45
	_rotl128 ("%rax","%rbx",15);			# 45+15=60
	_saveround (16,$out,-128,"%rax","%rbx");	# KL<<<60
	_rotl128 ("%rcx","%rdx",30);			# 30+30=60
	_saveround (18,$out,-128,"%rcx","%rdx");	# KR<<<60
	_rotl128 ("%r8","%r10",30);			# 30+30=60
	_saveround (20,$out,-128,"%r8","%r10");		# KB<<<60
	_rotl128 ("%rax","%rbx",17);			# 60+17=77
	_saveround (22,$out,-128,"%rax","%rbx");	# KL<<<77
	_rotl128 ("%r14","%r15",32);			# 45+32=77
	_saveround (24,$out,-128,"%r14","%r15");	# KA<<<77
	_rotl128 ("%rcx","%rdx",34);			# 60+34=94
	_saveround (26,$out,-128,"%rcx","%rdx");	# KR<<<94
	_rotl128 ("%r14","%r15",17);			# 77+17=94
	_saveround (28,$out,-128,"%r14","%r15");	# KA<<<94
	_rotl128 ("%rax","%rbx",34);			# 77+34=111
	_saveround (30,$out,-128,"%rax","%rbx");	# KL<<<111
	_rotl128 ("%r8","%r10",51);			# 60+51=111
	_saveround (32,$out,-128,"%r8","%r10");		# KB<<<111
564$code.=<<___;
565 mov \$4,%eax
566.Ldone:
567 mov 0(%rsp),%r15
568 mov 8(%rsp),%r14
569 mov 16(%rsp),%r13
570 mov 24(%rsp),%rbp
571 mov 32(%rsp),%rbx
572 lea 40(%rsp),%rsp
573.Lkey_epilogue:
574 ret
575.size Camellia_Ekeygen,.-Camellia_Ekeygen
576___
577}
578
# Base 8-bit S-box from which the four 32-bit lookup tables are derived.
@SBOX=(
112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65,
 35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189,
134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26,
166,225, 57,202,213, 71, 93, 61,217,  1, 90,214, 81, 86,108, 77,
139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153,
223, 76,203,194, 52,126,118,  5,109,183,169, 49,209, 23,  4,215,
 20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34,
254, 68,207,178,195,181,122,145, 36,  8,232,168, 96,252,105, 80,
170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210,
 16,196,  0, 72,163,247,117,219,138,  3,230,218,  9, 63,221,148,
135, 92,131,  2,205, 74,144, 51,115,103,246,243,157,127,191,226,
 82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46,
233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89,
120,152,  6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250,
114,  7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164,
 64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);

# Each helper returns table entry number $_[0] as a "0x%08x" string.
# The digits in a name show which bytes of the word are populated.

# S-box value replicated into the three upper bytes.
sub S1110 {
    my $v=$SBOX[shift(@_)];
    return sprintf("0x%08x",($v<<24)|($v<<16)|($v<<8));
}

# The index is rotated left by one bit before the lookup; the value goes
# into bytes 3, 2 and 0.
sub S4404 {
    my $idx=shift(@_);
    my $v=$SBOX[(($idx<<1)|($idx>>7))&0xff];
    return sprintf("0x%08x",($v<<24)|($v<<16)|$v);
}

# S-box value rotated left by one bit, placed in the three lower bytes.
sub S0222 {
    my $v=$SBOX[shift(@_)];
    $v=(($v<<1)|($v>>7))&0xff;
    return sprintf("0x%08x",($v<<16)|($v<<8)|$v);
}

# S-box value rotated right by one bit, placed in bytes 3, 1 and 0.
sub S3033 {
    my $v=$SBOX[shift(@_)];
    $v=(($v>>1)|($v<<7))&0xff;
    return sprintf("0x%08x",($v<<24)|($v<<8)|$v);
}
601
602$code.=<<___;
603.align 64
604.LCamellia_SIGMA:
605.long 0x3bcc908b, 0xa09e667f, 0x4caa73b2, 0xb67ae858
606.long 0xe94f82be, 0xc6ef372f, 0xf1d36f1c, 0x54ff53a5
607.long 0xde682d1d, 0x10e527fa, 0xb3e6c1fd, 0xb05688c2
608.long 0, 0, 0, 0
609.LCamellia_SBOX:
610___
611# tables are interleaved, remember?
# Append one ".long" directive listing the given values to $code.
sub data_word {
    my @words=@_;
    $code.=".long\t".join(',',@words)."\n";
}
# Emit the two interleaved tables: 256 (S1110,S4404) pairs followed by
# 256 (S0222,S3033) pairs, one .long line per index.
foreach my $pair ([\&S1110,\&S4404],[\&S0222,\&S3033]) {
    my ($first,$second)=@$pair;
    for ($i=0;$i<256;$i++) { data_word($first->($i),$second->($i)); }
}
615
# void Camellia_cbc_encrypt (const unsigned char *inp, unsigned char *out,
617# size_t length, const CAMELLIA_KEY *key,
618# unsigned char *ivp,const int enc);
619{
620$_key="0(%rsp)";
621$_end="8(%rsp)"; # inp+len&~15
622$_res="16(%rsp)"; # len&15
623$ivec="24(%rsp)";
624$_ivp="40(%rsp)";
625$_rsp="48(%rsp)";
626
627$code.=<<___;
628.globl Camellia_cbc_encrypt
629.type Camellia_cbc_encrypt,\@function,6
630.align 16
631Camellia_cbc_encrypt:
632 cmp \$0,%rdx
633 je .Lcbc_abort
634 push %rbx
635 push %rbp
636 push %r12
637 push %r13
638 push %r14
639 push %r15
640.Lcbc_prologue:
641
642 mov %rsp,%rbp
643 sub \$64,%rsp
644 and \$-64,%rsp
645
646 # place stack frame just "above mod 1024" the key schedule,
647 # this ensures that cache associativity suffices
648 lea -64-63(%rcx),%r10
649 sub %rsp,%r10
650 neg %r10
651 and \$0x3C0,%r10
652 sub %r10,%rsp
653 #add \$8,%rsp # 8 is reserved for callee's ra
654
655 mov %rdi,$inp # inp argument
656 mov %rsi,$out # out argument
657 mov %r8,%rbx # ivp argument
658 mov %rcx,$key # key argument
659 mov 272(%rcx),$keyend # grandRounds
660
661 mov %r8,$_ivp
662 mov %rbp,$_rsp
663
664.Lcbc_body:
665 lea .LCamellia_SBOX(%rip),$Tbl
666
667 mov \$32,%ecx
668.align 4
669.Lcbc_prefetch_sbox:
670 mov 0($Tbl),%rax
671 mov 32($Tbl),%rsi
672 mov 64($Tbl),%rdi
673 mov 96($Tbl),%r11
674 lea 128($Tbl),$Tbl
675 loop .Lcbc_prefetch_sbox
676 sub \$4096,$Tbl
677 shl \$6,$keyend
678 mov %rdx,%rcx # len argument
679 lea ($key,$keyend),$keyend
680
681 cmp \$0,%r9d # enc argument
682 je .LCBC_DECRYPT
683
684 and \$-16,%rdx
685 and \$15,%rcx # length residue
686 lea ($inp,%rdx),%rdx
687 mov $key,$_key
688 mov %rdx,$_end
689 mov %rcx,$_res
690
691 cmp $inp,%rdx
692 mov 0(%rbx),@S[0] # load IV
693 mov 4(%rbx),@S[1]
694 mov 8(%rbx),@S[2]
695 mov 12(%rbx),@S[3]
696 je .Lcbc_enc_tail
697 jmp .Lcbc_eloop
698
699.align 16
700.Lcbc_eloop:
701 xor 0($inp),@S[0]
702 xor 4($inp),@S[1]
703 xor 8($inp),@S[2]
704 bswap @S[0]
705 xor 12($inp),@S[3]
706 bswap @S[1]
707 bswap @S[2]
708 bswap @S[3]
709
710 call _x86_64_Camellia_encrypt
711
712 mov $_key,$key # "rewind" the key
713 bswap @S[0]
714 mov $_end,%rdx
715 bswap @S[1]
716 mov $_res,%rcx
717 bswap @S[2]
718 mov @S[0],0($out)
719 bswap @S[3]
720 mov @S[1],4($out)
721 mov @S[2],8($out)
722 lea 16($inp),$inp
723 mov @S[3],12($out)
724 cmp %rdx,$inp
725 lea 16($out),$out
726 jne .Lcbc_eloop
727
728 cmp \$0,%rcx
729 jne .Lcbc_enc_tail
730
731 mov $_ivp,$out
732 mov @S[0],0($out) # write out IV residue
733 mov @S[1],4($out)
734 mov @S[2],8($out)
735 mov @S[3],12($out)
736 jmp .Lcbc_done
737
738.align 16
739.Lcbc_enc_tail:
740 xor %rax,%rax
741 mov %rax,0+$ivec
742 mov %rax,8+$ivec
743 mov %rax,$_res
744
745.Lcbc_enc_pushf:
746 pushfq
747 cld
748 mov $inp,%rsi
749 lea 8+$ivec,%rdi
750 .long 0x9066A4F3 # rep movsb
751 popfq
752.Lcbc_enc_popf:
753
754 lea $ivec,$inp
755 lea 16+$ivec,%rax
756 mov %rax,$_end
757 jmp .Lcbc_eloop # one more time
758
759.align 16
760.LCBC_DECRYPT:
761 xchg $key,$keyend
762 add \$15,%rdx
763 and \$15,%rcx # length residue
764 and \$-16,%rdx
765 mov $key,$_key
766 lea ($inp,%rdx),%rdx
767 mov %rdx,$_end
768 mov %rcx,$_res
769
770 mov (%rbx),%rax # load IV
771 mov 8(%rbx),%rbx
772 jmp .Lcbc_dloop
773.align 16
774.Lcbc_dloop:
775 mov 0($inp),@S[0]
776 mov 4($inp),@S[1]
777 mov 8($inp),@S[2]
778 bswap @S[0]
779 mov 12($inp),@S[3]
780 bswap @S[1]
781 mov %rax,0+$ivec # save IV to temporary storage
782 bswap @S[2]
783 mov %rbx,8+$ivec
784 bswap @S[3]
785
786 call _x86_64_Camellia_decrypt
787
788 mov $_key,$key # "rewind" the key
789 mov $_end,%rdx
790 mov $_res,%rcx
791
792 bswap @S[0]
793 mov ($inp),%rax # load IV for next iteration
794 bswap @S[1]
795 mov 8($inp),%rbx
796 bswap @S[2]
797 xor 0+$ivec,@S[0]
798 bswap @S[3]
799 xor 4+$ivec,@S[1]
800 xor 8+$ivec,@S[2]
801 lea 16($inp),$inp
802 xor 12+$ivec,@S[3]
803 cmp %rdx,$inp
804 je .Lcbc_ddone
805
806 mov @S[0],0($out)
807 mov @S[1],4($out)
808 mov @S[2],8($out)
809 mov @S[3],12($out)
810
811 lea 16($out),$out
812 jmp .Lcbc_dloop
813
814.align 16
815.Lcbc_ddone:
816 mov $_ivp,%rdx
817 cmp \$0,%rcx
818 jne .Lcbc_dec_tail
819
820 mov @S[0],0($out)
821 mov @S[1],4($out)
822 mov @S[2],8($out)
823 mov @S[3],12($out)
824
825 mov %rax,(%rdx) # write out IV residue
826 mov %rbx,8(%rdx)
827 jmp .Lcbc_done
828.align 16
829.Lcbc_dec_tail:
830 mov @S[0],0+$ivec
831 mov @S[1],4+$ivec
832 mov @S[2],8+$ivec
833 mov @S[3],12+$ivec
834
835.Lcbc_dec_pushf:
836 pushfq
837 cld
838 lea 8+$ivec,%rsi
839 lea ($out),%rdi
840 .long 0x9066A4F3 # rep movsb
841 popfq
842.Lcbc_dec_popf:
843
844 mov %rax,(%rdx) # write out IV residue
845 mov %rbx,8(%rdx)
846 jmp .Lcbc_done
847
848.align 16
849.Lcbc_done:
850 mov $_rsp,%rcx
851 mov 0(%rcx),%r15
852 mov 8(%rcx),%r14
853 mov 16(%rcx),%r13
854 mov 24(%rcx),%r12
855 mov 32(%rcx),%rbp
856 mov 40(%rcx),%rbx
857 lea 48(%rcx),%rsp
858.Lcbc_abort:
859 ret
860.size Camellia_cbc_encrypt,.-Camellia_cbc_encrypt
861
862.asciz "Camellia for x86_64 by <appro@openssl.org>"
863___
864}
865
866# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
867# CONTEXT *context,DISPATCHER_CONTEXT *disp)
868if ($win64) {
869$rec="%rcx";
870$frame="%rdx";
871$context="%r8";
872$disp="%r9";
873
874$code.=<<___;
875.extern __imp_RtlVirtualUnwind
876.type common_se_handler,\@abi-omnipotent
877.align 16
878common_se_handler:
879 push %rsi
880 push %rdi
881 push %rbx
882 push %rbp
883 push %r12
884 push %r13
885 push %r14
886 push %r15
887 pushfq
888 lea -64(%rsp),%rsp
889
890 mov 120($context),%rax # pull context->Rax
891 mov 248($context),%rbx # pull context->Rip
892
893 mov 8($disp),%rsi # disp->ImageBase
894 mov 56($disp),%r11 # disp->HandlerData
895
896 mov 0(%r11),%r10d # HandlerData[0]
897 lea (%rsi,%r10),%r10 # prologue label
898 cmp %r10,%rbx # context->Rip<prologue label
899 jb .Lin_prologue
900
901 mov 152($context),%rax # pull context->Rsp
902
903 mov 4(%r11),%r10d # HandlerData[1]
904 lea (%rsi,%r10),%r10 # epilogue label
905 cmp %r10,%rbx # context->Rip>=epilogue label
906 jae .Lin_prologue
907
908 lea 40(%rax),%rax
909 mov -8(%rax),%rbx
910 mov -16(%rax),%rbp
911 mov -24(%rax),%r13
912 mov -32(%rax),%r14
913 mov -40(%rax),%r15
914 mov %rbx,144($context) # restore context->Rbx
915 mov %rbp,160($context) # restore context->Rbp
916 mov %r13,224($context) # restore context->R13
917 mov %r14,232($context) # restore context->R14
918 mov %r15,240($context) # restore context->R15
919
920.Lin_prologue:
921 mov 8(%rax),%rdi
922 mov 16(%rax),%rsi
923 mov %rax,152($context) # restore context->Rsp
924 mov %rsi,168($context) # restore context->Rsi
925 mov %rdi,176($context) # restore context->Rdi
926
927 jmp .Lcommon_seh_exit
928.size common_se_handler,.-common_se_handler
929
930.type cbc_se_handler,\@abi-omnipotent
931.align 16
932cbc_se_handler:
933 push %rsi
934 push %rdi
935 push %rbx
936 push %rbp
937 push %r12
938 push %r13
939 push %r14
940 push %r15
941 pushfq
942 lea -64(%rsp),%rsp
943
944 mov 120($context),%rax # pull context->Rax
945 mov 248($context),%rbx # pull context->Rip
946
947 lea .Lcbc_prologue(%rip),%r10
948 cmp %r10,%rbx # context->Rip<.Lcbc_prologue
949 jb .Lin_cbc_prologue
950
951 lea .Lcbc_body(%rip),%r10
952 cmp %r10,%rbx # context->Rip<.Lcbc_body
953 jb .Lin_cbc_frame_setup
954
955 mov 152($context),%rax # pull context->Rsp
956
957 lea .Lcbc_abort(%rip),%r10
958 cmp %r10,%rbx # context->Rip>=.Lcbc_abort
959 jae .Lin_cbc_prologue
960
961 # handle pushf/popf in Camellia_cbc_encrypt
962 lea .Lcbc_enc_pushf(%rip),%r10
963 cmp %r10,%rbx # context->Rip<=.Lcbc_enc_pushf
964 jbe .Lin_cbc_no_flag
965 lea 8(%rax),%rax
966 lea .Lcbc_enc_popf(%rip),%r10
967 cmp %r10,%rbx # context->Rip<.Lcbc_enc_popf
968 jb .Lin_cbc_no_flag
969 lea -8(%rax),%rax
970 lea .Lcbc_dec_pushf(%rip),%r10
971 cmp %r10,%rbx # context->Rip<=.Lcbc_dec_pushf
972 jbe .Lin_cbc_no_flag
973 lea 8(%rax),%rax
974 lea .Lcbc_dec_popf(%rip),%r10
975 cmp %r10,%rbx # context->Rip<.Lcbc_dec_popf
976 jb .Lin_cbc_no_flag
977 lea -8(%rax),%rax
978
979.Lin_cbc_no_flag:
980 mov 48(%rax),%rax # $_rsp
981 lea 48(%rax),%rax
982
983.Lin_cbc_frame_setup:
984 mov -8(%rax),%rbx
985 mov -16(%rax),%rbp
986 mov -24(%rax),%r12
987 mov -32(%rax),%r13
988 mov -40(%rax),%r14
989 mov -48(%rax),%r15
990 mov %rbx,144($context) # restore context->Rbx
991 mov %rbp,160($context) # restore context->Rbp
992 mov %r12,216($context) # restore context->R12
993 mov %r13,224($context) # restore context->R13
994 mov %r14,232($context) # restore context->R14
995 mov %r15,240($context) # restore context->R15
996
997.Lin_cbc_prologue:
998 mov 8(%rax),%rdi
999 mov 16(%rax),%rsi
1000 mov %rax,152($context) # restore context->Rsp
1001 mov %rsi,168($context) # restore context->Rsi
1002 mov %rdi,176($context) # restore context->Rdi
1003
1004.align 4
1005.Lcommon_seh_exit:
1006
1007 mov 40($disp),%rdi # disp->ContextRecord
1008 mov $context,%rsi # context
1009 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
1010 .long 0xa548f3fc # cld; rep movsq
1011
1012 mov $disp,%rsi
1013 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1014 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1015 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1016 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1017 mov 40(%rsi),%r10 # disp->ContextRecord
1018 lea 56(%rsi),%r11 # &disp->HandlerData
1019 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1020 mov %r10,32(%rsp) # arg5
1021 mov %r11,40(%rsp) # arg6
1022 mov %r12,48(%rsp) # arg7
1023 mov %rcx,56(%rsp) # arg8, (NULL)
1024 call *__imp_RtlVirtualUnwind(%rip)
1025
1026 mov \$1,%eax # ExceptionContinueSearch
1027 lea 64(%rsp),%rsp
1028 popfq
1029 pop %r15
1030 pop %r14
1031 pop %r13
1032 pop %r12
1033 pop %rbp
1034 pop %rbx
1035 pop %rdi
1036 pop %rsi
1037 ret
1038.size cbc_se_handler,.-cbc_se_handler
1039
1040.section .pdata
1041.align 4
1042 .rva .LSEH_begin_Camellia_EncryptBlock_Rounds
1043 .rva .LSEH_end_Camellia_EncryptBlock_Rounds
1044 .rva .LSEH_info_Camellia_EncryptBlock_Rounds
1045
1046 .rva .LSEH_begin_Camellia_DecryptBlock_Rounds
1047 .rva .LSEH_end_Camellia_DecryptBlock_Rounds
1048 .rva .LSEH_info_Camellia_DecryptBlock_Rounds
1049
1050 .rva .LSEH_begin_Camellia_Ekeygen
1051 .rva .LSEH_end_Camellia_Ekeygen
1052 .rva .LSEH_info_Camellia_Ekeygen
1053
1054 .rva .LSEH_begin_Camellia_cbc_encrypt
1055 .rva .LSEH_end_Camellia_cbc_encrypt
1056 .rva .LSEH_info_Camellia_cbc_encrypt
1057
1058.section .xdata
1059.align 8
1060.LSEH_info_Camellia_EncryptBlock_Rounds:
1061 .byte 9,0,0,0
1062 .rva common_se_handler
1063 .rva .Lenc_prologue,.Lenc_epilogue # HandlerData[]
1064.LSEH_info_Camellia_DecryptBlock_Rounds:
1065 .byte 9,0,0,0
1066 .rva common_se_handler
1067 .rva .Ldec_prologue,.Ldec_epilogue # HandlerData[]
1068.LSEH_info_Camellia_Ekeygen:
1069 .byte 9,0,0,0
1070 .rva common_se_handler
1071 .rva .Lkey_prologue,.Lkey_epilogue # HandlerData[]
1072.LSEH_info_Camellia_cbc_encrypt:
1073 .byte 9,0,0,0
1074 .rva cbc_se_handler
1075___
1076}
1077
# Evaluate every `...` expression embedded in the generated text
# (offset arithmetic and the &hi()/&lo() register helpers), then write
# the result out.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# STDOUT is the pipe to the translator opened at the top of this script.
# close() waits for it and reports a non-zero exit, so the result must
# be checked or a failed translation passes for success.
close STDOUT or die "error closing STDOUT: $! (status $?)";