summaryrefslogtreecommitdiff
path: root/src/lib/libcrypto/camellia
diff options
context:
space:
mode:
authorjsing <>2024-03-29 07:09:37 +0000
committerjsing <>2024-03-29 07:09:37 +0000
commit4ec549c6565a669978a6935c82214593858df970 (patch)
tree27051ea9828720d1f9e7472ca7a98f23b52b2bbb /src/lib/libcrypto/camellia
parent15d5545d9c99970c4f30d3c0c9626aa3e6df02f4 (diff)
downloadopenbsd-4ec549c6565a669978a6935c82214593858df970.tar.gz
openbsd-4ec549c6565a669978a6935c82214593858df970.tar.bz2
openbsd-4ec549c6565a669978a6935c82214593858df970.zip
Remove now unused camellia assembly implementations.
Diffstat (limited to 'src/lib/libcrypto/camellia')
-rw-r--r--src/lib/libcrypto/camellia/asm/cmll-x86.pl1126
-rw-r--r--src/lib/libcrypto/camellia/asm/cmll-x86_64.pl875
2 files changed, 0 insertions, 2001 deletions
diff --git a/src/lib/libcrypto/camellia/asm/cmll-x86.pl b/src/lib/libcrypto/camellia/asm/cmll-x86.pl
deleted file mode 100644
index a4ab11e54d..0000000000
--- a/src/lib/libcrypto/camellia/asm/cmll-x86.pl
+++ /dev/null
@@ -1,1126 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
5#
6# This module may be used under the terms of either the GNU General
7# Public License version 2 or later, the GNU Lesser General Public
8# License version 2.1 or later, the Mozilla Public License version
9# 1.1 or the BSD License. The exact terms of either license are
10# distributed along with this module. For further details see
11# http://www.openssl.org/~appro/camellia/.
12# ====================================================================
13
14# Performance in cycles per processed byte (less is better) in
15# 'openssl speed ...' benchmark:
16#
17# AMD K8 Core2 PIII P4
18# -evp camellia-128-ecb 21.5 22.8 27.0 28.9
19# + over gcc 3.4.6 +90/11% +70/10% +53/4% +160/64%
20# + over icc 8.0 +48/19% +21/15% +21/17% +55/37%
21#
22# camellia-128-cbc 17.3 21.1 23.9 25.9
23#
24# 128-bit key setup 196 280 256 240 cycles/key
25# + over gcc 3.4.6 +30/0% +17/11% +11/0% +63/40%
26# + over icc 8.0 +18/3% +10/0% +10/3% +21/10%
27#
28# Pairs of numbers in "+" rows represent performance improvement over
29# compiler generated position-independent code, PIC, and non-PIC
30# respectively. PIC results are of greater relevance, as this module
31# is position-independent, i.e. suitable for a shared library or PIE.
32# Position independence "costs" one register, which is why compilers
33# are so close with non-PIC results, they have an extra register to
34# spare. CBC results are better than ECB ones thanks to "zero-copy"
35# private _x86_* interface, and are ~30-40% better than with compiler
36# generated cmll_cbc.o, and reach ~80-90% of x86_64 performance on
37# same CPU (where applicable).
38
# Generator setup: derive the directory this script was started from ($0)
# and add it, plus its ../../perlasm sibling, to @INC so that x86asm.pl
# can be loaded.
39$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
40push(@INC,"${dir}","${dir}../../perlasm");
41require "x86asm.pl";
42
# $OPENSSL gates emission of Camellia_encrypt, Camellia_decrypt and
# Camellia_set_key further down.
43$OPENSSL=1;
44
# First command-line argument is passed through to asm_init; the third
# asm_init argument is true when the last command-line argument is "386".
45&asm_init($ARGV[0],"cmll-586.pl",$ARGV[$#ARGV] eq "386");
46
# Register allocation used throughout: @T holds the four 32-bit state
# words, $idx is a scratch/index register, $key addresses the key
# schedule and $Tbl addresses the S-box table.
47@T=("eax","ebx","ecx","edx");
48$idx="esi";
49$key="edi";
50$Tbl="ebp";
51
52# stack frame layout in _x86_Camellia_* routines, frame is allocated
53# by caller
54$__ra=&DWP(0,"esp"); # return address
55$__s0=&DWP(4,"esp"); # s0 backing store
56$__s1=&DWP(8,"esp"); # s1 backing store
57$__s2=&DWP(12,"esp"); # s2 backing store
58$__s3=&DWP(16,"esp"); # s3 backing store
59$__end=&DWP(20,"esp"); # pointer to end/start of key schedule
60
61# stack frame layout in Camellia_[en|de]crypt routines, which differs from
62# above by 4 and overlaps by pointer to end/start of key schedule
63$_end=&DWP(16,"esp");
64$_esp=&DWP(20,"esp");
65
66# const unsigned int Camellia_SBOX[4][256];
67# Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
68# and [2][] - with [3][]. This is done to optimize code size.
# The values below are byte offsets from the Camellia_SBOX label; entries
# are addressed with an index scale of 8 because of the interleaving.
69$SBOX1_1110=0; # Camellia_SBOX[0]
70$SBOX4_4404=4; # Camellia_SBOX[1]
71$SBOX2_0222=2048; # Camellia_SBOX[2]
72$SBOX3_3033=2052; # Camellia_SBOX[3]
73&static_label("Camellia_SIGMA");
74&static_label("Camellia_SBOX");
75
# Emit the instruction sequence for one Feistel round.
#   $i     - round index; its parity selects which pair of @T registers
#            is the round input (see $j below).
#   $seed  - optional byte offset of the round keys relative to $key
#            (default 0); a negative seed makes key offsets step by -8
#            per round instead of +8.
#   $frame - optional byte offset of the s0..s3 backing store relative
#            to %esp (default 0).
# On entry $idx must hold the first key word of this round; on exit it
# holds the prefetched first key word of round $i+1, and the two updated
# state words have been written back to their stack slots.
76sub Camellia_Feistel {
77my $i=@_[0];
78my $seed=defined(@_[1])?@_[1]:0;
79my $scale=$seed<0?-8:8;
80my $frame=defined(@_[2])?@_[2]:0;
# $j is 0 for even rounds and 2 for odd rounds, rotating the roles of
# the four registers in @T between consecutive rounds.
81my $j=($i&1)*2;
# NOTE(review): `my` binds only $t0 in the statement below; $t1..$t3 are
# assigned as package globals.
82my $t0=@T[($j)%4],$t1=@T[($j+1)%4],$t2=@T[($j+2)%4],$t3=@T[($j+3)%4];
83
84 &xor ($t0,$idx); # t0^=key[0]
85 &xor ($t1,&DWP($seed+$i*$scale+4,$key)); # t1^=key[1]
86 &movz ($idx,&HB($t0)); # (t0>>8)&0xff
87 &mov ($t3,&DWP($SBOX3_3033,$Tbl,$idx,8)); # t3=SBOX3_3033[0]
88 &movz ($idx,&LB($t0)); # (t0>>0)&0xff
89 &xor ($t3,&DWP($SBOX4_4404,$Tbl,$idx,8)); # t3^=SBOX4_4404[0]
90 &shr ($t0,16);
91 &movz ($idx,&LB($t1)); # (t1>>0)&0xff
92 &mov ($t2,&DWP($SBOX1_1110,$Tbl,$idx,8)); # t2=SBOX1_1110[1]
93 &movz ($idx,&HB($t0)); # (t0>>24)&0xff
94 &xor ($t3,&DWP($SBOX1_1110,$Tbl,$idx,8)); # t3^=SBOX1_1110[0]
95 &movz ($idx,&HB($t1)); # (t1>>8)&0xff
96 &xor ($t2,&DWP($SBOX4_4404,$Tbl,$idx,8)); # t2^=SBOX4_4404[1]
97 &shr ($t1,16);
98 &movz ($t0,&LB($t0)); # (t0>>16)&0xff
99 &xor ($t3,&DWP($SBOX2_0222,$Tbl,$t0,8)); # t3^=SBOX2_0222[0]
100 &movz ($idx,&HB($t1)); # (t1>>24)&0xff
101 &mov ($t0,&DWP($frame+4*(($j+3)%4),"esp")); # prefetch "s3"
102 &xor ($t2,$t3); # t2^=t3
103 &rotr ($t3,8); # t3=RightRotate(t3,8)
104 &xor ($t2,&DWP($SBOX2_0222,$Tbl,$idx,8)); # t2^=SBOX2_0222[1]
105 &movz ($idx,&LB($t1)); # (t1>>16)&0xff
106 &mov ($t1,&DWP($frame+4*(($j+2)%4),"esp")); # prefetch "s2"
107 &xor ($t3,$t0); # t3^=s3
108 &xor ($t2,&DWP($SBOX3_3033,$Tbl,$idx,8)); # t2^=SBOX3_3033[1]
109 &mov ($idx,&DWP($seed+($i+1)*$scale,$key)); # prefetch key[i+1]
110 &xor ($t3,$t2); # t3^=t2
111 &mov (&DWP($frame+4*(($j+3)%4),"esp"),$t3); # s3=t3
112 &xor ($t2,$t1); # t2^=s2
113 &mov (&DWP($frame+4*(($j+2)%4),"esp"),$t2); # s2=t2
114}
115
116# void Camellia_EncryptBlock_Rounds(
117# int grandRounds,
118# const Byte plaintext[],
119# const KEY_TABLE_TYPE keyTable,
120# Byte ciphertext[])
#
# Public entry point: builds an aligned private stack frame, loads and
# byte-swaps the 16-byte plaintext into @T, calls _x86_Camellia_encrypt
# and stores the byte-swapped result to ciphertext.
121&function_begin("Camellia_EncryptBlock_Rounds");
122 &mov ("eax",&wparam(0)); # load grandRounds
123 &mov ($idx,&wparam(1)); # load plaintext pointer
124 &mov ($key,&wparam(2)); # load key schedule pointer
125
126 &mov ("ebx","esp"); # remember caller's %esp, restored after the call
127 &sub ("esp",7*4); # place for s[0-3],keyEnd,esp and ra
128 &and ("esp",-64);
129
130 # place stack frame just "above mod 1024" the key schedule
131 # this ensures that cache associativity of 2 suffices
132 &lea ("ecx",&DWP(-64-63,$key));
133 &sub ("ecx","esp");
134 &neg ("ecx");
135 &and ("ecx",0x3C0); # modulo 1024, but aligned to cache-line
136 &sub ("esp","ecx");
137 &add ("esp",4); # 4 is reserved for callee's return address
138
139 &shl ("eax",6); # grandRounds*64 bytes of key schedule
140 &lea ("eax",&DWP(0,$key,"eax"));
141 &mov ($_esp,"ebx"); # save %esp
142 &mov ($_end,"eax"); # save keyEnd
143
 # point $Tbl at the Camellia_SBOX table
144 &picsetup($Tbl);
145 &picsymbol($Tbl, &label("Camellia_SBOX"), $Tbl);
146
147 &mov (@T[0],&DWP(0,$idx)); # load plaintext
148 &mov (@T[1],&DWP(4,$idx));
149 &mov (@T[2],&DWP(8,$idx));
150 &bswap (@T[0]);
151 &mov (@T[3],&DWP(12,$idx));
152 &bswap (@T[1]);
153 &bswap (@T[2]);
154 &bswap (@T[3]);
155
156 &call ("_x86_Camellia_encrypt");
157
158 &mov ("esp",$_esp); # back on the caller's stack, wparam() is valid again
159 &bswap (@T[0]);
160 &mov ($idx,&wparam(3)); # load ciphertext pointer
161 &bswap (@T[1]);
162 &bswap (@T[2]);
163 &bswap (@T[3]);
164 &mov (&DWP(0,$idx),@T[0]); # write ciphertext
165 &mov (&DWP(4,$idx),@T[1]);
166 &mov (&DWP(8,$idx),@T[2]);
167 &mov (&DWP(12,$idx),@T[3]);
168&function_end("Camellia_EncryptBlock_Rounds");
169# V1.x API
# Takes keyBitLength as its first argument, rewrites that argument in
# place to the matching grandRounds count and tail-jumps to
# Camellia_EncryptBlock_Rounds; the remaining arguments are untouched.
170&function_begin_B("Camellia_EncryptBlock");
171 &mov ("eax",128);
172 &sub ("eax",&wparam(0)); # 128-keyBitLength, borrow (CF) set if keyBitLength>128
173 &mov ("eax",3); # mov leaves CF intact
174 &adc ("eax",0); # eax=3+CF, i.e. keyBitLength==128?3:4
175 &mov (&wparam(0),"eax"); # overwrite keyBitLength with grandRounds
176 &jmp (&label("Camellia_EncryptBlock_Rounds"));
177&function_end_B("Camellia_EncryptBlock");
178
179if ($OPENSSL) {
180# void Camellia_encrypt(
181# const unsigned char *in,
182# unsigned char *out,
183# const CAMELLIA_KEY *key)
#
# Same sequence as Camellia_EncryptBlock_Rounds, except that the argument
# order differs and grandRounds is read from byte offset 272 of *key
# instead of being passed in.
184&function_begin("Camellia_encrypt");
185 &mov ($idx,&wparam(0)); # load plaintext pointer
186 &mov ($key,&wparam(2)); # load key schedule pointer
187
188 &mov ("ebx","esp"); # remember caller's %esp, restored after the call
189 &sub ("esp",7*4); # place for s[0-3],keyEnd,esp and ra
190 &and ("esp",-64);
191 &mov ("eax",&DWP(272,$key)); # load grandRounds counter
192
193 # place stack frame just "above mod 1024" the key schedule
194 # this ensures that cache associativity of 2 suffices
195 &lea ("ecx",&DWP(-64-63,$key));
196 &sub ("ecx","esp");
197 &neg ("ecx");
198 &and ("ecx",0x3C0); # modulo 1024, but aligned to cache-line
199 &sub ("esp","ecx");
200 &add ("esp",4); # 4 is reserved for callee's return address
201
202 &shl ("eax",6); # grandRounds*64 bytes of key schedule
203 &lea ("eax",&DWP(0,$key,"eax"));
204 &mov ($_esp,"ebx"); # save %esp
205 &mov ($_end,"eax"); # save keyEnd
206
 # point $Tbl at the Camellia_SBOX table
207 &picsetup($Tbl);
208 &picsymbol($Tbl, &label("Camellia_SBOX"), $Tbl);
209
210 &mov (@T[0],&DWP(0,$idx)); # load plaintext
211 &mov (@T[1],&DWP(4,$idx));
212 &mov (@T[2],&DWP(8,$idx));
213 &bswap (@T[0]);
214 &mov (@T[3],&DWP(12,$idx));
215 &bswap (@T[1]);
216 &bswap (@T[2]);
217 &bswap (@T[3]);
218
219 &call ("_x86_Camellia_encrypt");
220
221 &mov ("esp",$_esp); # back on the caller's stack, wparam() is valid again
222 &bswap (@T[0]);
223 &mov ($idx,&wparam(1)); # load ciphertext pointer
224 &bswap (@T[1]);
225 &bswap (@T[2]);
226 &bswap (@T[3]);
227 &mov (&DWP(0,$idx),@T[0]); # write ciphertext
228 &mov (&DWP(4,$idx),@T[1]);
229 &mov (&DWP(8,$idx),@T[2]);
230 &mov (&DWP(12,$idx),@T[3]);
231&function_end("Camellia_encrypt");
232}
233
# Internal encryption core with a private calling convention: the four
# state words arrive and are returned in @T, $key points at the start of
# the key schedule, $Tbl at Camellia_SBOX, and the caller has allocated
# the frame described by $__s0..$__s3/$__end with $__end holding the end
# of the key schedule. $key and $idx are modified.
234&function_begin_B("_x86_Camellia_encrypt");
235 &xor (@T[0],&DWP(0,$key)); # ^=key[0-3]
236 &xor (@T[1],&DWP(4,$key));
237 &xor (@T[2],&DWP(8,$key));
238 &xor (@T[3],&DWP(12,$key));
239 &mov ($idx,&DWP(16,$key)); # prefetch key[4]
240
241 &mov ($__s0,@T[0]); # save s[0-3]
242 &mov ($__s1,@T[1]);
243 &mov ($__s2,@T[2]);
244 &mov ($__s3,@T[3]);
245
246&set_label("loop",16);
 # six Feistel rounds, keys at $key+16.., state backing store at 4(%esp)
247 for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16,4); }
248
249 &add ($key,16*4); # advance to the next 64-byte group of the key schedule
250 &cmp ($key,$__end);
251 &je (&label("done"));
252
 # key-dependent mixing step between groups of six rounds
 # (presumably Camellia's FL/FL^-1 layer - confirm against the spec)
253 # @T[0-1] are preloaded, $idx is preloaded with key[0]
254 &and ($idx,@T[0]);
255 &mov (@T[3],$__s3);
256 &rotl ($idx,1);
257 &mov (@T[2],@T[3]);
258 &xor (@T[1],$idx);
259 &or (@T[2],&DWP(12,$key));
260 &mov ($__s1,@T[1]); # s1^=LeftRotate(s0&key[0],1);
261 &xor (@T[2],$__s2);
262
263 &mov ($idx,&DWP(4,$key));
264 &mov ($__s2,@T[2]); # s2^=s3|key[3];
265 &or ($idx,@T[1]);
266 &and (@T[2],&DWP(8,$key));
267 &xor (@T[0],$idx);
268 &rotl (@T[2],1);
269 &mov ($__s0,@T[0]); # s0^=s1|key[1];
270 &xor (@T[3],@T[2]);
271 &mov ($idx,&DWP(16,$key)); # prefetch key[4]
272 &mov ($__s3,@T[3]); # s3^=LeftRotate(s2&key[2],1);
273 &jmp (&label("loop"));
274
275&set_label("done",8);
276 &mov (@T[2],@T[0]); # SwapHalf
277 &mov (@T[3],@T[1]);
278 &mov (@T[0],$__s2);
279 &mov (@T[1],$__s3);
280 &xor (@T[0],$idx); # $idx is preloaded with key[0]
281 &xor (@T[1],&DWP(4,$key));
282 &xor (@T[2],&DWP(8,$key));
283 &xor (@T[3],&DWP(12,$key));
284 &ret ();
285&function_end_B("_x86_Camellia_encrypt");
286
287# void Camellia_DecryptBlock_Rounds(
288# int grandRounds,
289# const Byte ciphertext[],
290# const KEY_TABLE_TYPE keyTable,
291# Byte plaintext[])
#
# Public entry point: builds an aligned private stack frame, loads and
# byte-swaps the 16-byte ciphertext into @T, calls _x86_Camellia_decrypt
# with $key pointing at the END of the key schedule (the start is saved
# in the frame as the loop terminator) and stores the byte-swapped result.
292&function_begin("Camellia_DecryptBlock_Rounds");
293 &mov ("eax",&wparam(0)); # load grandRounds
294 &mov ($idx,&wparam(1)); # load ciphertext pointer
295 &mov ($key,&wparam(2)); # load key schedule pointer
296
297 &mov ("ebx","esp"); # remember caller's %esp, restored after the call
298 &sub ("esp",7*4); # place for s[0-3],keyStart,esp and ra
299 &and ("esp",-64);
300
301 # place stack frame just "above mod 1024" the key schedule
302 # this ensures that cache associativity of 2 suffices
303 &lea ("ecx",&DWP(-64-63,$key));
304 &sub ("ecx","esp");
305 &neg ("ecx");
306 &and ("ecx",0x3C0); # modulo 1024, but aligned to cache-line
307 &sub ("esp","ecx");
308 &add ("esp",4); # 4 is reserved for callee's return address
309
310 &shl ("eax",6); # grandRounds*64 bytes of key schedule
311 &mov (&DWP(4*4,"esp"),$key); # save keyStart
312 &lea ($key,&DWP(0,$key,"eax")); # $key=keyStart+grandRounds*64
313 &mov (&DWP(5*4,"esp"),"ebx");# save %esp
314
 # point $Tbl at the Camellia_SBOX table
315 &picsetup($Tbl);
316 &picsymbol($Tbl, &label("Camellia_SBOX"), $Tbl);
317
318 &mov (@T[0],&DWP(0,$idx)); # load ciphertext
319 &mov (@T[1],&DWP(4,$idx));
320 &mov (@T[2],&DWP(8,$idx));
321 &bswap (@T[0]);
322 &mov (@T[3],&DWP(12,$idx));
323 &bswap (@T[1]);
324 &bswap (@T[2]);
325 &bswap (@T[3]);
326
327 &call ("_x86_Camellia_decrypt");
328
329 &mov ("esp",&DWP(5*4,"esp")); # back on the caller's stack
330 &bswap (@T[0]);
331 &mov ($idx,&wparam(3)); # load plaintext pointer
332 &bswap (@T[1]);
333 &bswap (@T[2]);
334 &bswap (@T[3]);
335 &mov (&DWP(0,$idx),@T[0]); # write plaintext
336 &mov (&DWP(4,$idx),@T[1]);
337 &mov (&DWP(8,$idx),@T[2]);
338 &mov (&DWP(12,$idx),@T[3]);
339&function_end("Camellia_DecryptBlock_Rounds");
340# V1.x API
# Takes keyBitLength as its first argument, rewrites that argument in
# place to the matching grandRounds count and tail-jumps to
# Camellia_DecryptBlock_Rounds; the remaining arguments are untouched.
341&function_begin_B("Camellia_DecryptBlock");
342 &mov ("eax",128);
343 &sub ("eax",&wparam(0)); # 128-keyBitLength, borrow (CF) set if keyBitLength>128
344 &mov ("eax",3); # mov leaves CF intact
345 &adc ("eax",0); # eax=3+CF, i.e. keyBitLength==128?3:4
346 &mov (&wparam(0),"eax"); # overwrite keyBitLength with grandRounds
347 &jmp (&label("Camellia_DecryptBlock_Rounds"));
348&function_end_B("Camellia_DecryptBlock");
349
350if ($OPENSSL) {
351# void Camellia_decrypt(
352# const unsigned char *in,
353# unsigned char *out,
354# const CAMELLIA_KEY *key)
#
# Same sequence as Camellia_DecryptBlock_Rounds, except that the argument
# order differs and grandRounds is read from byte offset 272 of *key
# instead of being passed in.
355&function_begin("Camellia_decrypt");
356 &mov ($idx,&wparam(0)); # load ciphertext pointer
357 &mov ($key,&wparam(2)); # load key schedule pointer
358
359 &mov ("ebx","esp"); # remember caller's %esp, restored after the call
360 &sub ("esp",7*4); # place for s[0-3],keyStart,esp and ra
361 &and ("esp",-64);
362 &mov ("eax",&DWP(272,$key)); # load grandRounds counter
363
364 # place stack frame just "above mod 1024" the key schedule
365 # this ensures that cache associativity of 2 suffices
366 &lea ("ecx",&DWP(-64-63,$key));
367 &sub ("ecx","esp");
368 &neg ("ecx");
369 &and ("ecx",0x3C0); # modulo 1024, but aligned to cache-line
370 &sub ("esp","ecx");
371 &add ("esp",4); # 4 is reserved for callee's return address
372
373 &shl ("eax",6); # grandRounds*64 bytes of key schedule
374 &mov (&DWP(4*4,"esp"),$key); # save keyStart
375 &lea ($key,&DWP(0,$key,"eax")); # $key=keyStart+grandRounds*64
376 &mov (&DWP(5*4,"esp"),"ebx");# save %esp
377
 # point $Tbl at the Camellia_SBOX table
378 &picsetup($Tbl);
379 &picsymbol($Tbl, &label("Camellia_SBOX"), $Tbl);
380
381 &mov (@T[0],&DWP(0,$idx)); # load ciphertext
382 &mov (@T[1],&DWP(4,$idx));
383 &mov (@T[2],&DWP(8,$idx));
384 &bswap (@T[0]);
385 &mov (@T[3],&DWP(12,$idx));
386 &bswap (@T[1]);
387 &bswap (@T[2]);
388 &bswap (@T[3]);
389
390 &call ("_x86_Camellia_decrypt");
391
392 &mov ("esp",&DWP(5*4,"esp")); # back on the caller's stack
393 &bswap (@T[0]);
394 &mov ($idx,&wparam(1)); # load plaintext pointer
395 &bswap (@T[1]);
396 &bswap (@T[2]);
397 &bswap (@T[3]);
398 &mov (&DWP(0,$idx),@T[0]); # write plaintext
399 &mov (&DWP(4,$idx),@T[1]);
400 &mov (&DWP(8,$idx),@T[2]);
401 &mov (&DWP(12,$idx),@T[3]);
402&function_end("Camellia_decrypt");
403}
404
# Internal decryption core with a private calling convention: the four
# state words arrive and are returned in @T, $key points at the END of
# the key schedule and is walked downwards, $Tbl points at Camellia_SBOX,
# and the caller has allocated the frame described by $__s0..$__s3/$__end
# with $__end holding the start of the key schedule. $key and $idx are
# modified.
405&function_begin_B("_x86_Camellia_decrypt");
406 &xor (@T[0],&DWP(0,$key)); # ^=key[0-3]
407 &xor (@T[1],&DWP(4,$key));
408 &xor (@T[2],&DWP(8,$key));
409 &xor (@T[3],&DWP(12,$key));
410 &mov ($idx,&DWP(-8,$key)); # prefetch key[-2]
411
412 &mov ($__s0,@T[0]); # save s[0-3]
413 &mov ($__s1,@T[1]);
414 &mov ($__s2,@T[2]);
415 &mov ($__s3,@T[3]);
416
417&set_label("loop",16);
 # six Feistel rounds, keys at $key-8 downwards, state backing store at 4(%esp)
418 for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8,4); }
419
420 &sub ($key,16*4); # step back to the previous 64-byte group of the key schedule
421 &cmp ($key,$__end);
422 &je (&label("done"));
423
 # key-dependent mixing step between groups of six rounds; key words are
 # consumed in the opposite pairing to the encrypt path
424 # @T[0-1] are preloaded, $idx is preloaded with key[2]
425 &and ($idx,@T[0]);
426 &mov (@T[3],$__s3);
427 &rotl ($idx,1);
428 &mov (@T[2],@T[3]);
429 &xor (@T[1],$idx);
430 &or (@T[2],&DWP(4,$key));
431 &mov ($__s1,@T[1]); # s1^=LeftRotate(s0&key[2],1);
432 &xor (@T[2],$__s2);
433
434 &mov ($idx,&DWP(12,$key));
435 &mov ($__s2,@T[2]); # s2^=s3|key[1];
436 &or ($idx,@T[1]);
437 &and (@T[2],&DWP(0,$key));
438 &xor (@T[0],$idx);
439 &rotl (@T[2],1);
440 &mov ($__s0,@T[0]); # s0^=s1|key[3];
441 &xor (@T[3],@T[2]);
442 &mov ($idx,&DWP(-8,$key)); # prefetch key[-2]
443 &mov ($__s3,@T[3]); # s3^=LeftRotate(s2&key[0],1);
444 &jmp (&label("loop"));
445
446&set_label("done",8);
447 &mov (@T[2],@T[0]); # SwapHalf
448 &mov (@T[3],@T[1]);
449 &mov (@T[0],$__s2);
450 &mov (@T[1],$__s3);
451 &xor (@T[2],$idx); # $idx is preloaded with key[2]
452 &xor (@T[3],&DWP(12,$key));
453 &xor (@T[0],&DWP(0,$key));
454 &xor (@T[1],&DWP(4,$key));
455 &ret ();
456&function_end_B("_x86_Camellia_decrypt");
457
458# shld is very slow on Intel P4 family. Even on AMD it limits
459# instruction decode rate [because it's VectorPath] and consequently
460# performance. PIII, PM and Core[2] seem to be the only ones which
461# execute this code ~7% faster...
#
# shld-based variant of the 128-bit rotate-and-store helper.
#   $i0..$i3 - registers holding the 128-bit value, most significant first
#   $rot     - left-rotate count; 0 skips the rotate and only stores
#   $rnd     - destination slot in 8-byte units; stores go to
#              -128+8*$rnd($key) and following dwords
#   @T       - (lexical, shadows the global) ordered list of registers to
#              store; each $iN is stored only while it matches the head
#              of this list
# $idx is used as scratch when rotating.
462sub __rotl128 {
463 my ($i0,$i1,$i2,$i3,$rot,$rnd,@T)=@_;
464
465 $rnd *= 2; # convert 8-byte slot number to dword index
466 if ($rot) {
467 &mov ($idx,$i0); # keep original top word to feed into $i3
468 &shld ($i0,$i1,$rot);
469 &shld ($i1,$i2,$rot);
470 &shld ($i2,$i3,$rot);
471 &shld ($i3,$idx,$rot);
472 }
473 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i0 eq @T[0]);
474 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i1 eq @T[0]);
475 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i2 eq @T[0]);
476 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i3 eq @T[0]);
477}
478
479# ... Implementing 128-bit rotate without shld gives >3x performance
480# improvement on P4, only ~7% degradation on other Intel CPUs and
481# not worse performance on AMD. This is therefore preferred.
#
# shl/shr/or-based 128-bit rotate-and-store helper.
#   $i0..$i3 - registers holding the 128-bit value, most significant first
#   $rot     - left-rotate count; 0 skips the rotate and only stores
#              (assumes 0 < $rot < 32 otherwise, since 32-$rot is used as
#              a shift count - TODO confirm)
#   $rnd     - destination slot in 8-byte units; stores go to
#              -128+8*$rnd($key) and following dwords
#   @T       - (lexical, shadows the global) ordered list of registers to
#              store; each $iN is stored only while it matches the head
#              of this list
# Both $idx and $Tbl are clobbered as scratch when rotating.
482sub _rotl128 {
483 my ($i0,$i1,$i2,$i3,$rot,$rnd,@T)=@_;
484
485 $rnd *= 2; # convert 8-byte slot number to dword index
486 if ($rot) {
 # stores are interleaved with the shifts as soon as each word is final
487 &mov ($Tbl,$i0); # keep original top word to feed into $i3
488 &shl ($i0,$rot);
489 &mov ($idx,$i1);
490 &shr ($idx,32-$rot);
491 &shl ($i1,$rot);
492 &or ($i0,$idx);
493 &mov ($idx,$i2);
494 &shl ($i2,$rot);
495 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i0 eq @T[0]);
496 &shr ($idx,32-$rot);
497 &or ($i1,$idx);
498 &shr ($Tbl,32-$rot);
499 &mov ($idx,$i3);
500 &shr ($idx,32-$rot);
501 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i1 eq @T[0]);
502 &shl ($i3,$rot);
503 &or ($i2,$idx);
504 &or ($i3,$Tbl);
505 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i2 eq @T[0]);
506 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i3 eq @T[0]);
507 } else {
508 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i0 eq @T[0]);
509 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i1 eq @T[0]);
510 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i2 eq @T[0]);
511 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i3 eq @T[0]);
512 }
513}
514
# Emit stores of up to four registers to consecutive dwords starting at
# $bias+8*$rnd relative to the register named by $key.
#   $rnd - slot number in 8-byte units
#   $key - base register (lexical, shadows the global $key)
#   @T   - optional leading numeric bias followed by 1..4 register names
515sub _saveround {
516my ($rnd,$key,@T)=@_;
# A non-zero numeric first element is taken as the bias; register names
# evaluate to 0 under int() and are therefore left in the list.
517my $bias=int(@T[0])?shift(@T):0;
518
519 &mov (&DWP($bias+$rnd*8+0,$key),@T[0]);
520 &mov (&DWP($bias+$rnd*8+4,$key),@T[1]) if ($#T>=1);
521 &mov (&DWP($bias+$rnd*8+8,$key),@T[2]) if ($#T>=2);
522 &mov (&DWP($bias+$rnd*8+12,$key),@T[3]) if ($#T>=3);
523}
524
# Emit loads of up to four registers from consecutive dwords starting at
# $bias+8*$rnd relative to the register named by $key.
#   $rnd - slot number in 8-byte units
#   $key - base register (lexical, shadows the global $key)
#   @T   - optional leading numeric bias followed by 1..4 register names
525sub _loadround {
526my ($rnd,$key,@T)=@_;
# A non-zero numeric first element is taken as the bias; register names
# evaluate to 0 under int() and are therefore left in the list.
527my $bias=int(@T[0])?shift(@T):0;
528
529 &mov (@T[0],&DWP($bias+$rnd*8+0,$key));
530 &mov (@T[1],&DWP($bias+$rnd*8+4,$key)) if ($#T>=1);
531 &mov (@T[2],&DWP($bias+$rnd*8+8,$key)) if ($#T>=2);
532 &mov (@T[3],&DWP($bias+$rnd*8+12,$key)) if ($#T>=3);
533}
534
535# void Camellia_Ekeygen(
536# const int keyBitLength,
537# const Byte *rawKey,
538# KEY_TABLE_TYPE keyTable)
#
# Expands rawKey into keyTable. Besides filling the table, the emitted
# routine leaves the grandRounds count in eax (3 when keyBitLength is 128,
# 4 otherwise) and a pointer to keyTable+272 in edx; Camellia_set_key
# relies on both. Four dwords of stack (swtmp(0..3)) back the s0..s3 state
# used by Camellia_Feistel.
539&function_begin("Camellia_Ekeygen");
540{ my $step=0; # generation-time counter of Feistel rounds emitted so far
541
542 &stack_push(4); # place for s[0-3]
543
544 &mov ($Tbl,&wparam(0)); # load arguments
545 &mov ($idx,&wparam(1));
546 &mov ($key,&wparam(2));
547
548 &mov (@T[0],&DWP(0,$idx)); # load 0-127 bits
549 &mov (@T[1],&DWP(4,$idx));
550 &mov (@T[2],&DWP(8,$idx));
551 &mov (@T[3],&DWP(12,$idx));
552
553 &bswap (@T[0]);
554 &bswap (@T[1]);
555 &bswap (@T[2]);
556 &bswap (@T[3]);
557
558 &_saveround (0,$key,@T); # KL<<<0
559
560 &cmp ($Tbl,128); # $Tbl still holds keyBitLength here
561 &je (&label("1st128"));
562
563 &mov (@T[0],&DWP(16,$idx)); # load 128-191 bits
564 &mov (@T[1],&DWP(20,$idx));
565 &cmp ($Tbl,192);
566 &je (&label("1st192"));
567 &mov (@T[2],&DWP(24,$idx)); # load 192-255 bits
568 &mov (@T[3],&DWP(28,$idx));
569 &jmp (&label("1st256"));
570&set_label("1st192",4);
 # 192-bit key: low half of KR is the bitwise complement of the high half
571 &mov (@T[2],@T[0]);
572 &mov (@T[3],@T[1]);
573 &not (@T[2]);
574 &not (@T[3]);
575&set_label("1st256",4);
576 &bswap (@T[0]);
577 &bswap (@T[1]);
578 &bswap (@T[2]);
579 &bswap (@T[3]);
580
581 &_saveround (4,$key,@T); # temporary storage for KR!
582
583 &xor (@T[0],&DWP(0*8+0,$key)); # KR^KL
584 &xor (@T[1],&DWP(0*8+4,$key));
585 &xor (@T[2],&DWP(1*8+0,$key));
586 &xor (@T[3],&DWP(1*8+4,$key));
587
588&set_label("1st128",4);
589 &picsetup($Tbl);
590 &picsymbol($Tbl, &label("Camellia_SBOX"), $Tbl);
 # $key now addresses Camellia_SIGMA, so the Feistel rounds below read
 # the SIGMA constants as their round keys
591 &lea ($key,&DWP(&label("Camellia_SIGMA")."-".&label("Camellia_SBOX"),$Tbl));
592
593 &mov ($idx,&DWP($step*8,$key)); # prefetch SIGMA[0]
594 &mov (&swtmp(0),@T[0]); # save s[0-3]
595 &mov (&swtmp(1),@T[1]);
596 &mov (&swtmp(2),@T[2]);
597 &mov (&swtmp(3),@T[3]);
598 &Camellia_Feistel($step++);
599 &Camellia_Feistel($step++);
600 &mov (@T[2],&swtmp(2)); # reload the two words the rounds left on the stack
601 &mov (@T[3],&swtmp(3));
602
603 &mov ($idx,&wparam(2));
604 &xor (@T[0],&DWP(0*8+0,$idx)); # ^KL
605 &xor (@T[1],&DWP(0*8+4,$idx));
606 &xor (@T[2],&DWP(1*8+0,$idx));
607 &xor (@T[3],&DWP(1*8+4,$idx));
608
609 &mov ($idx,&DWP($step*8,$key)); # prefetch SIGMA[4]
610 &mov (&swtmp(0),@T[0]); # save s[0-3]
611 &mov (&swtmp(1),@T[1]);
612 &mov (&swtmp(2),@T[2]);
613 &mov (&swtmp(3),@T[3]);
614 &Camellia_Feistel($step++);
615 &Camellia_Feistel($step++);
616 &mov (@T[2],&swtmp(2)); # reload the two words the rounds left on the stack
617 &mov (@T[3],&swtmp(3));
618
619 &mov ($idx,&wparam(0));
620 &cmp ($idx,128);
621 &jne (&label("2nd256")); # 192- and 256-bit keys take the longer path
622
 # $key is biased by +128 so that the helpers' -128-based displacements
 # address keyTable itself
623 &mov ($key,&wparam(2));
624 &lea ($key,&DWP(128,$key)); # size optimization
625
626 ####### process KA
627 &_saveround (2,$key,-128,@T); # KA<<<0
628 &_rotl128 (@T,15,6,@T); # KA<<<15
629 &_rotl128 (@T,15,8,@T); # KA<<<(15+15=30)
630 &_rotl128 (@T,15,12,@T[0],@T[1]); # KA<<<(30+15=45)
631 &_rotl128 (@T,15,14,@T); # KA<<<(45+15=60)
632 push (@T,shift(@T)); # rotl128(@T,32);
633 &_rotl128 (@T,2,20,@T); # KA<<<(60+32+2=94)
634 &_rotl128 (@T,17,24,@T); # KA<<<(94+17=111)
635
636 ####### process KL
637 &_loadround (0,$key,-128,@T); # load KL
638 &_rotl128 (@T,15,4,@T); # KL<<<15
639 &_rotl128 (@T,30,10,@T); # KL<<<(15+30=45)
640 &_rotl128 (@T,15,13,@T[2],@T[3]); # KL<<<(45+15=60)
641 &_rotl128 (@T,17,16,@T); # KL<<<(60+17=77)
642 &_rotl128 (@T,17,18,@T); # KL<<<(77+17=94)
643 &_rotl128 (@T,17,22,@T); # KL<<<(94+17=111)
644
 # the 32-bit rotations above only renamed registers at generation time;
 # undo the renaming so later code sees @T in its canonical order
645 while (@T[0] ne "eax") # restore order
646 { unshift (@T,pop(@T)); }
647
648 &mov ("eax",3); # 3 grandRounds
649 &jmp (&label("done"));
650
651&set_label("2nd256",16);
652 &mov ($idx,&wparam(2));
653 &_saveround (6,$idx,@T); # temporary storage for KA!
654
655 &xor (@T[0],&DWP(4*8+0,$idx)); # KA^KR
656 &xor (@T[1],&DWP(4*8+4,$idx));
657 &xor (@T[2],&DWP(5*8+0,$idx));
658 &xor (@T[3],&DWP(5*8+4,$idx));
659
660 &mov ($idx,&DWP($step*8,$key)); # prefetch SIGMA[8]
661 &mov (&swtmp(0),@T[0]); # save s[0-3]
662 &mov (&swtmp(1),@T[1]);
663 &mov (&swtmp(2),@T[2]);
664 &mov (&swtmp(3),@T[3]);
665 &Camellia_Feistel($step++);
666 &Camellia_Feistel($step++);
667 &mov (@T[2],&swtmp(2)); # reload the two words the rounds left on the stack
668 &mov (@T[3],&swtmp(3));
669
 # $key is biased by +128 so that the helpers' -128-based displacements
 # address keyTable itself
670 &mov ($key,&wparam(2));
671 &lea ($key,&DWP(128,$key)); # size optimization
672
673 ####### process KB
674 &_saveround (2,$key,-128,@T); # KB<<<0
675 &_rotl128 (@T,30,10,@T); # KB<<<30
676 &_rotl128 (@T,30,20,@T); # KB<<<(30+30=60)
677 push (@T,shift(@T)); # rotl128(@T,32);
678 &_rotl128 (@T,19,32,@T); # KB<<<(60+32+19=111)
679
680 ####### process KR
681 &_loadround (4,$key,-128,@T); # load KR
682 &_rotl128 (@T,15,4,@T); # KR<<<15
683 &_rotl128 (@T,15,8,@T); # KR<<<(15+15=30)
684 &_rotl128 (@T,30,18,@T); # KR<<<(30+30=60)
685 push (@T,shift(@T)); # rotl128(@T,32);
686 &_rotl128 (@T,2,26,@T); # KR<<<(60+32+2=94)
687
688 ####### process KA
689 &_loadround (6,$key,-128,@T); # load KA
690 &_rotl128 (@T,15,6,@T); # KA<<<15
691 &_rotl128 (@T,30,14,@T); # KA<<<(15+30=45)
692 push (@T,shift(@T)); # rotl128(@T,32);
693 &_rotl128 (@T,0,24,@T); # KA<<<(45+32+0=77)
694 &_rotl128 (@T,17,28,@T); # KA<<<(77+17=94)
695
696 ####### process KL
697 &_loadround (0,$key,-128,@T); # load KL
698 push (@T,shift(@T)); # rotl128(@T,32);
699 &_rotl128 (@T,13,12,@T); # KL<<<(32+13=45)
700 &_rotl128 (@T,15,16,@T); # KL<<<(45+15=60)
701 &_rotl128 (@T,17,22,@T); # KL<<<(60+17=77)
702 push (@T,shift(@T)); # rotl128(@T,32);
703 &_rotl128 (@T,2,30,@T); # KL<<<(77+32+2=111)
704
 # undo the generation-time register renaming, as in the 128-bit path
705 while (@T[0] ne "eax") # restore order
706 { unshift (@T,pop(@T)); }
707
708 &mov ("eax",4); # 4 grandRounds
709&set_label("done");
 # $key is keyTable+128 here, so edx = keyTable+272
710 &lea ("edx",&DWP(272-128,$key)); # end of key schedule
711 &stack_pop(4);
712}
713&function_end("Camellia_Ekeygen");
714
715if ($OPENSSL) {
716# int Camellia_set_key (
717# const unsigned char *userKey,
718# int bits,
719# CAMELLIA_KEY *key)
#
# Validates the arguments, calls Camellia_Ekeygen and stores the
# grandRounds count it returns. Return value in eax:
#   -1  userKey or key is NULL
#   -2  bits is not 128, 192 or 256
#    0  success
720&function_begin_B("Camellia_set_key");
721 &push ("ebx");
722 &mov ("ecx",&wparam(0)); # pull arguments
723 &mov ("ebx",&wparam(1));
724 &mov ("edx",&wparam(2));
725
726 &mov ("eax",-1);
727 &test ("ecx","ecx");
728 &jz (&label("done")); # userKey==NULL?
729 &test ("edx","edx");
730 &jz (&label("done")); # key==NULL?
731
732 &mov ("eax",-2);
733 &cmp ("ebx",256);
734 &je (&label("arg_ok")); # bits==256?
735 &cmp ("ebx",192);
736 &je (&label("arg_ok")); # bits==192?
737 &cmp ("ebx",128);
738 &jne (&label("done")); # bits!=128?
739&set_label("arg_ok",4);
740
 # pushed last-to-first, giving Camellia_Ekeygen(bits,userKey,key)
741 &push ("edx"); # push arguments
742 &push ("ecx");
743 &push ("ebx");
744 &call ("Camellia_Ekeygen");
745 &stack_pop(3);
746
747 # eax holds grandRounds and edx points at where to put it
748 &mov (&DWP(0,"edx"),"eax");
749 &xor ("eax","eax"); # return 0
750&set_label("done",4);
751 &pop ("ebx");
752 &ret ();
753&function_end_B("Camellia_set_key");
754}
755
# Base 256-entry byte substitution table from which the four 32-bit
# lookup tables are derived by the S* helpers below.
756@SBOX=(
757112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65,
758 35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189,
759134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26,
760166,225, 57,202,213, 71, 93, 61,217, 1, 90,214, 81, 86,108, 77,
761139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153,
762223, 76,203,194, 52,126,118, 5,109,183,169, 49,209, 23, 4,215,
763 20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34,
764254, 68,207,178,195,181,122,145, 36, 8,232,168, 96,252,105, 80,
765170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210,
766 16,196, 0, 72,163,247,117,219,138, 3,230,218, 9, 63,221,148,
767135, 92,131, 2,205, 74,144, 51,115,103,246,243,157,127,191,226,
768 82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46,
769233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89,
770120,152, 6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250,
771114, 7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164,
772 64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);
773
# Each helper maps an index 0..255 to one 32-bit table word. The digits in
# the name give, from most to least significant byte, which byte lanes
# carry the substituted value (0 = lane left zero):
#   S1110 - SBOX[i] in the three upper bytes
#   S4404 - SBOX[i rotated left by 1 within 8 bits] in bytes 3, 2 and 0
#   S0222 - SBOX[i] rotated left by 1 within 8 bits, in the three lower bytes
#   S3033 - SBOX[i] rotated right by 1 within 8 bits, in bytes 3, 1 and 0
774sub S1110 { my $i=shift; $i=@SBOX[$i]; return $i<<24|$i<<16|$i<<8; }
775sub S4404 { my $i=shift; $i=($i<<1|$i>>7)&0xff; $i=@SBOX[$i]; return $i<<24|$i<<16|$i; }
776sub S0222 { my $i=shift; $i=@SBOX[$i]; $i=($i<<1|$i>>7)&0xff; return $i<<16|$i<<8|$i; }
777sub S3033 { my $i=shift; $i=@SBOX[$i]; $i=($i>>1|$i<<7)&0xff; return $i<<24|$i<<8|$i; }
778
# Emit the constant data: Camellia_SIGMA (twelve constant words padded
# with four zero words) followed by Camellia_SBOX, both aligned to 64.
779 &rodataseg();
780&set_label("Camellia_SIGMA",64);
781&data_word(
782 0xa09e667f, 0x3bcc908b, 0xb67ae858, 0x4caa73b2,
783 0xc6ef372f, 0xe94f82be, 0x54ff53a5, 0xf1d36f1c,
784 0x10e527fa, 0xde682d1d, 0xb05688c2, 0xb3e6c1fd,
785 0, 0, 0, 0);
786&set_label("Camellia_SBOX",64);
787# tables are interleaved, remember?
# First 2048 bytes: (S1110,S4404) pairs, matching $SBOX1_1110=0 and
# $SBOX4_4404=4; next 2048 bytes: (S0222,S3033) pairs, matching
# $SBOX2_0222=2048 and $SBOX3_3033=2052.
788for ($i=0;$i<256;$i++) { &data_word(&S1110($i),&S4404($i)); }
789for ($i=0;$i<256;$i++) { &data_word(&S0222($i),&S3033($i)); }
790 &previous();
791
792# void Camellia_cbc_encrypt (const unsigned char *inp, unsigned char *out,
793# size_t length, const CAMELLIA_KEY *key,
794# unsigned char *ivp,const int enc);
795{
796# stack frame layout
797# -4(%esp) # return address 0(%esp)
798# 0(%esp) # s0 4(%esp)
799# 4(%esp) # s1 8(%esp)
800# 8(%esp) # s2 12(%esp)
801# 12(%esp) # s3 16(%esp)
802# 16(%esp) # end of key schedule 20(%esp)
803# 20(%esp) # %esp backup
804my $_inp=&DWP(24,"esp"); #copy of wparam(0)
805my $_out=&DWP(28,"esp"); #copy of wparam(1)
806my $_len=&DWP(32,"esp"); #copy of wparam(2)
807my $_key=&DWP(36,"esp"); #copy of wparam(3)
808my $_ivp=&DWP(40,"esp"); #copy of wparam(4)
809my $ivec=&DWP(44,"esp"); #ivec[16]
810my $_tmp=&DWP(44,"esp"); #volatile variable [yes, aliases with ivec]
811my ($s0,$s1,$s2,$s3) = @T;
812
813&function_begin("Camellia_cbc_encrypt");
814 &mov ($s2 eq "ecx"? $s2 : "",&wparam(2)); # load len
815 &cmp ($s2,0);
816 &je (&label("enc_out"));
817
818 &pushf ();
819 &cld ();
820
821 &mov ($s0,&wparam(0)); # load inp
822 &mov ($s1,&wparam(1)); # load out
823 #&mov ($s2,&wparam(2)); # load len
824 &mov ($s3,&wparam(3)); # load key
825 &mov ($Tbl,&wparam(4)); # load ivp
826
827 # allocate aligned stack frame...
828 &lea ($idx,&DWP(-64,"esp"));
829 &and ($idx,-64);
830
831 # place stack frame just "above mod 1024" the key schedule
832 # this ensures that cache associativity of 2 suffices
833 &lea ($key,&DWP(-64-63,$s3));
834 &sub ($key,$idx);
835 &neg ($key);
836 &and ($key,0x3C0); # modulo 1024, but aligned to cache-line
837 &sub ($idx,$key);
838
839 &mov ($key,&wparam(5)); # load enc
840
841 &exch ("esp",$idx);
842 &add ("esp",4); # reserve for return address!
843 &mov ($_esp,$idx); # save %esp
844
845 &mov ($_inp,$s0); # save copy of inp
846 &mov ($_out,$s1); # save copy of out
847 &mov ($_len,$s2); # save copy of len
848 &mov ($_key,$s3); # save copy of key
849 &mov ($_ivp,$Tbl); # save copy of ivp
850
851 &picsetup($Tbl);
852 &picsymbol($Tbl, &label("Camellia_SBOX"), $Tbl);
853
854 &mov ($idx,32);
855 &set_label("prefetch_sbox",4);
856 &mov ($s0,&DWP(0,$Tbl));
857 &mov ($s1,&DWP(32,$Tbl));
858 &mov ($s2,&DWP(64,$Tbl));
859 &mov ($s3,&DWP(96,$Tbl));
860 &lea ($Tbl,&DWP(128,$Tbl));
861 &dec ($idx);
862 &jnz (&label("prefetch_sbox"));
863 &mov ($s0,$_key);
864 &sub ($Tbl,4096);
865 &mov ($idx,$_inp);
866 &mov ($s3,&DWP(272,$s0)); # load grandRounds
867
868 &cmp ($key,0);
869 &je (&label("DECRYPT"));
870
871 &mov ($s2,$_len);
872 &mov ($key,$_ivp);
873 &shl ($s3,6);
874 &lea ($s3,&DWP(0,$s0,$s3));
875 &mov ($_end,$s3);
876
877 &test ($s2,0xFFFFFFF0);
878 &jz (&label("enc_tail")); # short input...
879
880 &mov ($s0,&DWP(0,$key)); # load iv
881 &mov ($s1,&DWP(4,$key));
882
883 &set_label("enc_loop",4);
884 &mov ($s2,&DWP(8,$key));
885 &mov ($s3,&DWP(12,$key));
886
887 &xor ($s0,&DWP(0,$idx)); # xor input data
888 &xor ($s1,&DWP(4,$idx));
889 &xor ($s2,&DWP(8,$idx));
890 &bswap ($s0);
891 &xor ($s3,&DWP(12,$idx));
892 &bswap ($s1);
893 &mov ($key,$_key); # load key
894 &bswap ($s2);
895 &bswap ($s3);
896
897 &call ("_x86_Camellia_encrypt");
898
899 &mov ($idx,$_inp); # load inp
900 &mov ($key,$_out); # load out
901
902 &bswap ($s0);
903 &bswap ($s1);
904 &bswap ($s2);
905 &mov (&DWP(0,$key),$s0); # save output data
906 &bswap ($s3);
907 &mov (&DWP(4,$key),$s1);
908 &mov (&DWP(8,$key),$s2);
909 &mov (&DWP(12,$key),$s3);
910
911 &mov ($s2,$_len); # load len
912
913 &lea ($idx,&DWP(16,$idx));
914 &mov ($_inp,$idx); # save inp
915
916 &lea ($s3,&DWP(16,$key));
917 &mov ($_out,$s3); # save out
918
919 &sub ($s2,16);
920 &test ($s2,0xFFFFFFF0);
921 &mov ($_len,$s2); # save len
922 &jnz (&label("enc_loop"));
923 &test ($s2,15);
924 &jnz (&label("enc_tail"));
925 &mov ($idx,$_ivp); # load ivp
926 &mov ($s2,&DWP(8,$key)); # restore last dwords
927 &mov ($s3,&DWP(12,$key));
928 &mov (&DWP(0,$idx),$s0); # save ivec
929 &mov (&DWP(4,$idx),$s1);
930 &mov (&DWP(8,$idx),$s2);
931 &mov (&DWP(12,$idx),$s3);
932
933 &mov ("esp",$_esp);
934 &popf ();
935 &set_label("enc_out");
936 &function_end_A();
937 &pushf (); # kludge, never executed
938
939 &set_label("enc_tail",4);
940 &mov ($s0,$key eq "edi" ? $key : "");
941 &mov ($key,$_out); # load out
942 &push ($s0); # push ivp
943 &mov ($s1,16);
944 &sub ($s1,$s2);
945 &cmp ($key,$idx); # compare with inp
946 &je (&label("enc_in_place"));
947 &align (4);
948 &data_word(0xA4F3F689); # rep movsb # copy input
949 &jmp (&label("enc_skip_in_place"));
950 &set_label("enc_in_place");
951 &lea ($key,&DWP(0,$key,$s2));
952 &set_label("enc_skip_in_place");
953 &mov ($s2,$s1);
954 &xor ($s0,$s0);
955 &align (4);
956 &data_word(0xAAF3F689); # rep stosb # zero tail
957 &pop ($key); # pop ivp
958
959 &mov ($idx,$_out); # output as input
960 &mov ($s0,&DWP(0,$key));
961 &mov ($s1,&DWP(4,$key));
962 &mov ($_len,16); # len=16
963 &jmp (&label("enc_loop")); # one more spin...
964
965#----------------------------- DECRYPT -----------------------------#
966&set_label("DECRYPT",16);
967 &shl ($s3,6);
968 &lea ($s3,&DWP(0,$s0,$s3));
969 &mov ($_end,$s0);
970 &mov ($_key,$s3);
971
972 &cmp ($idx,$_out);
973 &je (&label("dec_in_place")); # in-place processing...
974
975 &mov ($key,$_ivp); # load ivp
976 &mov ($_tmp,$key);
977
978 &set_label("dec_loop",4);
979 &mov ($s0,&DWP(0,$idx)); # read input
980 &mov ($s1,&DWP(4,$idx));
981 &mov ($s2,&DWP(8,$idx));
982 &bswap ($s0);
983 &mov ($s3,&DWP(12,$idx));
984 &bswap ($s1);
985 &mov ($key,$_key); # load key
986 &bswap ($s2);
987 &bswap ($s3);
988
989 &call ("_x86_Camellia_decrypt");
990
991 &mov ($key,$_tmp); # load ivp
992 &mov ($idx,$_len); # load len
993
994 &bswap ($s0);
995 &bswap ($s1);
996 &bswap ($s2);
997 &xor ($s0,&DWP(0,$key)); # xor iv
998 &bswap ($s3);
999 &xor ($s1,&DWP(4,$key));
1000 &xor ($s2,&DWP(8,$key));
1001 &xor ($s3,&DWP(12,$key));
1002
1003 &sub ($idx,16);
1004 &jc (&label("dec_partial"));
1005 &mov ($_len,$idx); # save len
1006 &mov ($idx,$_inp); # load inp
1007 &mov ($key,$_out); # load out
1008
1009 &mov (&DWP(0,$key),$s0); # write output
1010 &mov (&DWP(4,$key),$s1);
1011 &mov (&DWP(8,$key),$s2);
1012 &mov (&DWP(12,$key),$s3);
1013
1014 &mov ($_tmp,$idx); # save ivp
1015 &lea ($idx,&DWP(16,$idx));
1016 &mov ($_inp,$idx); # save inp
1017
1018 &lea ($key,&DWP(16,$key));
1019 &mov ($_out,$key); # save out
1020
1021 &jnz (&label("dec_loop"));
1022 &mov ($key,$_tmp); # load temp ivp
1023 &set_label("dec_end");
1024 &mov ($idx,$_ivp); # load user ivp
1025 &mov ($s0,&DWP(0,$key)); # load iv
1026 &mov ($s1,&DWP(4,$key));
1027 &mov ($s2,&DWP(8,$key));
1028 &mov ($s3,&DWP(12,$key));
1029 &mov (&DWP(0,$idx),$s0); # copy back to user
1030 &mov (&DWP(4,$idx),$s1);
1031 &mov (&DWP(8,$idx),$s2);
1032 &mov (&DWP(12,$idx),$s3);
1033 &jmp (&label("dec_out"));
1034
1035 &set_label("dec_partial",4);
1036 &lea ($key,$ivec);
1037 &mov (&DWP(0,$key),$s0); # dump output to stack
1038 &mov (&DWP(4,$key),$s1);
1039 &mov (&DWP(8,$key),$s2);
1040 &mov (&DWP(12,$key),$s3);
1041 &lea ($s2 eq "ecx" ? $s2 : "",&DWP(16,$idx));
1042 &mov ($idx eq "esi" ? $idx : "",$key);
1043 &mov ($key eq "edi" ? $key : "",$_out); # load out
1044 &data_word(0xA4F3F689); # rep movsb # copy output
1045 &mov ($key,$_inp); # use inp as temp ivp
1046 &jmp (&label("dec_end"));
1047
1048 &set_label("dec_in_place",4);
1049 &set_label("dec_in_place_loop");
1050 &lea ($key,$ivec);
1051 &mov ($s0,&DWP(0,$idx)); # read input
1052 &mov ($s1,&DWP(4,$idx));
1053 &mov ($s2,&DWP(8,$idx));
1054 &mov ($s3,&DWP(12,$idx));
1055
1056 &mov (&DWP(0,$key),$s0); # copy to temp
1057 &mov (&DWP(4,$key),$s1);
1058 &mov (&DWP(8,$key),$s2);
1059 &bswap ($s0);
1060 &mov (&DWP(12,$key),$s3);
1061 &bswap ($s1);
1062 &mov ($key,$_key); # load key
1063 &bswap ($s2);
1064 &bswap ($s3);
1065
1066 &call ("_x86_Camellia_decrypt");
1067
1068 &mov ($key,$_ivp); # load ivp
1069 &mov ($idx,$_out); # load out
1070
1071 &bswap ($s0);
1072 &bswap ($s1);
1073 &bswap ($s2);
1074 &xor ($s0,&DWP(0,$key)); # xor iv
1075 &bswap ($s3);
1076 &xor ($s1,&DWP(4,$key));
1077 &xor ($s2,&DWP(8,$key));
1078 &xor ($s3,&DWP(12,$key));
1079
1080 &mov (&DWP(0,$idx),$s0); # write output
1081 &mov (&DWP(4,$idx),$s1);
1082 &mov (&DWP(8,$idx),$s2);
1083 &mov (&DWP(12,$idx),$s3);
1084
1085 &lea ($idx,&DWP(16,$idx));
1086 &mov ($_out,$idx); # save out
1087
1088 &lea ($idx,$ivec);
1089 &mov ($s0,&DWP(0,$idx)); # read temp
1090 &mov ($s1,&DWP(4,$idx));
1091 &mov ($s2,&DWP(8,$idx));
1092 &mov ($s3,&DWP(12,$idx));
1093
1094 &mov (&DWP(0,$key),$s0); # copy iv
1095 &mov (&DWP(4,$key),$s1);
1096 &mov (&DWP(8,$key),$s2);
1097 &mov (&DWP(12,$key),$s3);
1098
1099 &mov ($idx,$_inp); # load inp
1100
1101 &lea ($idx,&DWP(16,$idx));
1102 &mov ($_inp,$idx); # save inp
1103
1104 &mov ($s2,$_len); # load len
1105 &sub ($s2,16);
1106 &jc (&label("dec_in_place_partial"));
1107 &mov ($_len,$s2); # save len
1108 &jnz (&label("dec_in_place_loop"));
1109 &jmp (&label("dec_out"));
1110
1111 &set_label("dec_in_place_partial",4);
1112 # one can argue if this is actually required...
1113 &mov ($key eq "edi" ? $key : "",$_out);
1114 &lea ($idx eq "esi" ? $idx : "",$ivec);
1115 &lea ($key,&DWP(0,$key,$s2));
1116 &lea ($idx,&DWP(16,$idx,$s2));
1117 &neg ($s2 eq "ecx" ? $s2 : "");
1118 &data_word(0xA4F3F689); # rep movsb # restore tail
1119
1120 &set_label("dec_out",4);
1121 &mov ("esp",$_esp);
1122 &popf ();
1123&function_end("Camellia_cbc_encrypt");
1124}
1125
1126&asm_finish();
diff --git a/src/lib/libcrypto/camellia/asm/cmll-x86_64.pl b/src/lib/libcrypto/camellia/asm/cmll-x86_64.pl
deleted file mode 100644
index 187f0596d7..0000000000
--- a/src/lib/libcrypto/camellia/asm/cmll-x86_64.pl
+++ /dev/null
@@ -1,875 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
5#
6# This module may be used under the terms of either the GNU General
7# Public License version 2 or later, the GNU Lesser General Public
8# License version 2.1 or later, the Mozilla Public License version
9# 1.1 or the BSD License. The exact terms of either license are
10# distributed along with this module. For further details see
11# http://www.openssl.org/~appro/camellia/.
12# ====================================================================
13
14# Performance in cycles per processed byte (less is better) in
15# 'openssl speed ...' benchmark:
16#
17# AMD64 Core2 EM64T
18# -evp camellia-128-ecb 16.7 21.0 22.7
19# + over gcc 3.4.6 +25% +5% 0%
20#
21# camellia-128-cbc 15.7 20.4 21.1
22#
23# 128-bit key setup 128 216 205 cycles/key
24# + over gcc 3.4.6 +54% +39% +15%
25#
26# Numbers in "+" rows represent performance improvement over compiler
27# generated code. Key setup timings are impressive on AMD and Core2
28# thanks to 64-bit operations being covertly deployed. Improvement on
29# EM64T, pre-Core2 Intel x86_64 CPU, is not as impressive, because it
30# apparently emulates some of 64-bit operations in [32-bit] microcode.
31
# Command line: [flavour] output.  When the first argument contains a
# dot it is taken to be the output file name and no flavour is used.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Directory part of this script's path (including the trailing
# separator).  Use an empty string when the script was invoked without
# any directory component instead of relying on a stale/undefined $1.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the
# translator script.  Fail loudly if the pipe cannot be started rather
# than silently producing no output.
open OUT,"| \"$^X\" \"$xlate\" $flavour $output"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
43
# hi(reg): map a legacy register name of the form %eax/%rax .. %edx/%rdx
# to its high-byte alias (%ah .. %dh).  Any other operand is returned
# unchanged.  The former empty prototype "()" claimed the sub takes no
# arguments even though it is always called with one, so it is dropped.
sub hi { my $r=shift; $r =~ s/%[er]([a-d])x/%${1}h/; $r; }
# lo(reg): map a register name to its low-byte alias:
#   %eax/%rax .. %edx/%rdx -> %al .. %dl
#   %esi/%rsi, %edi/%rdi   -> %sil, %dil
#   %rN / %rNd             -> %rNb
# The former empty prototype "()" contradicted the one-argument usage and
# is dropped; captures are referenced as $1 instead of the sed-style \1.
sub lo {
    my $r=shift;
    $r =~ s/%[er]([a-d])x/%${1}l/;
    $r =~ s/%[er]([sd]i)/%${1}l/;
    $r =~ s/%(r[0-9]+)[d]?/%${1}b/;
    $r;
}
48
# Register allocation shared by the code generators below:
#   $t0..$t3  32-bit temporaries
#   @S        the four 32-bit state words
#   $i0,$i1   table index registers
($t0,$t1,$t2,$t3)=("%eax","%ebx","%ecx","%edx");
@S=map { "%r${_}d" } (8..11);
($i0,$i1)=("%esi","%edi");
$Tbl="%rbp"; # size optimization
($inp,$out,$key,$keyend)=map { "%r$_" } (12..15);
$arg0d="%edi";

# const unsigned int Camellia_SBOX[4][256];
# Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
# and [2][] - with [3][]. This is done to minimize code size.
# The values below are the byte offsets of each table from $Tbl.
($SBOX1_1110,$SBOX4_4404)=(0,4);       # Camellia_SBOX[0], Camellia_SBOX[1]
($SBOX2_0222,$SBOX3_3033)=(2048,2052); # Camellia_SBOX[2], Camellia_SBOX[3]
67
# Camellia_Feistel(i[,seed]): append the assembly for one Feistel round
# to $code.
#   i    - round index; its parity selects which pair of @S is the round
#          input ($s0,$s1) and which pair is updated ($s2,$s3)
#   seed - byte offset added to the subkey prefetch address; a negative
#          seed makes the prefetch walk the key schedule backwards
#          (stride -8 instead of 8).  Defaults to 0.
# On entry the generated code expects the current subkey in $t0/$t1; it
# prefetches the next one into the same registers before finishing.
# The `...` expressions are left in the text and evaluated by the final
# substitution pass over $code.
sub Camellia_Feistel {
my ($i,$seed)=@_;
$seed=0 unless defined($seed);
my $scale=$seed<0?-8:8;
my $j=($i&1)*2;
# Declare all four names in one list: "my $s0=...,$s1=...,..." would make
# only $s0 lexical and silently assign package globals $s1..$s3.
my ($s0,$s1,$s2,$s3)=@S[map { ($j+$_)%4 } (0..3)];

$code.=<<___;
 xor $s0,$t0 # t0^=key[0]
 xor $s1,$t1 # t1^=key[1]
 movz `&hi("$t0")`,$i0 # (t0>>8)&0xff
 movz `&lo("$t1")`,$i1 # (t1>>0)&0xff
 mov $SBOX3_3033($Tbl,$i0,8),$t3 # t3=SBOX3_3033[0]
 mov $SBOX1_1110($Tbl,$i1,8),$t2 # t2=SBOX1_1110[1]
 movz `&lo("$t0")`,$i0 # (t0>>0)&0xff
 shr \$16,$t0
 movz `&hi("$t1")`,$i1 # (t1>>8)&0xff
 xor $SBOX4_4404($Tbl,$i0,8),$t3 # t3^=SBOX4_4404[0]
 shr \$16,$t1
 xor $SBOX4_4404($Tbl,$i1,8),$t2 # t2^=SBOX4_4404[1]
 movz `&hi("$t0")`,$i0 # (t0>>24)&0xff
 movz `&lo("$t1")`,$i1 # (t1>>16)&0xff
 xor $SBOX1_1110($Tbl,$i0,8),$t3 # t3^=SBOX1_1110[0]
 xor $SBOX3_3033($Tbl,$i1,8),$t2 # t2^=SBOX3_3033[1]
 movz `&lo("$t0")`,$i0 # (t0>>16)&0xff
 movz `&hi("$t1")`,$i1 # (t1>>24)&0xff
 xor $SBOX2_0222($Tbl,$i0,8),$t3 # t3^=SBOX2_0222[0]
 xor $SBOX2_0222($Tbl,$i1,8),$t2 # t2^=SBOX2_0222[1]
 mov `$seed+($i+1)*$scale`($key),$t1 # prefetch key[i+1]
 mov `$seed+($i+1)*$scale+4`($key),$t0
 xor $t3,$t2 # t2^=t3
 ror \$8,$t3 # t3=RightRotate(t3,8)
 xor $t2,$s2
 xor $t2,$s3
 xor $t3,$s3
___
}
105
106# void Camellia_EncryptBlock_Rounds(
107# int grandRounds,
108# const Byte plaintext[],
109# const KEY_TABLE_TYPE keyTable,
110# Byte ciphertext[])
111$code=<<___;
112.text
113
114# V1.x API
115.globl Camellia_EncryptBlock
116.type Camellia_EncryptBlock,\@abi-omnipotent
117.align 16
118Camellia_EncryptBlock:
119 _CET_ENDBR
120 movl \$128,%eax
121 subl $arg0d,%eax
122 movl \$3,$arg0d
123 adcl \$0,$arg0d # keyBitLength==128?3:4
124 jmp .Lenc_rounds
125.size Camellia_EncryptBlock,.-Camellia_EncryptBlock
126# V2
127.globl Camellia_EncryptBlock_Rounds
128.type Camellia_EncryptBlock_Rounds,\@function,4
129.align 16
130.Lenc_rounds:
131Camellia_EncryptBlock_Rounds:
132 _CET_ENDBR
133 push %rbx
134 push %rbp
135 push %r13
136 push %r14
137 push %r15
138.Lenc_prologue:
139
140 #mov %rsi,$inp # put away arguments
141 mov %rcx,$out
142 mov %rdx,$key
143
144 shl \$6,%edi # process grandRounds
145 lea .LCamellia_SBOX(%rip),$Tbl
146 lea ($key,%rdi),$keyend
147
148 mov 0(%rsi),@S[0] # load plaintext
149 mov 4(%rsi),@S[1]
150 mov 8(%rsi),@S[2]
151 bswap @S[0]
152 mov 12(%rsi),@S[3]
153 bswap @S[1]
154 bswap @S[2]
155 bswap @S[3]
156
157 call _x86_64_Camellia_encrypt
158
159 bswap @S[0]
160 bswap @S[1]
161 bswap @S[2]
162 mov @S[0],0($out)
163 bswap @S[3]
164 mov @S[1],4($out)
165 mov @S[2],8($out)
166 mov @S[3],12($out)
167
168 mov 0(%rsp),%r15
169 mov 8(%rsp),%r14
170 mov 16(%rsp),%r13
171 mov 24(%rsp),%rbp
172 mov 32(%rsp),%rbx
173 lea 40(%rsp),%rsp
174.Lenc_epilogue:
175 ret
176.size Camellia_EncryptBlock_Rounds,.-Camellia_EncryptBlock_Rounds
177
178.type _x86_64_Camellia_encrypt,\@abi-omnipotent
179.align 16
180_x86_64_Camellia_encrypt:
181 _CET_ENDBR
182 xor 0($key),@S[1]
183 xor 4($key),@S[0] # ^=key[0-3]
184 xor 8($key),@S[3]
185 xor 12($key),@S[2]
186.align 16
187.Leloop:
188 mov 16($key),$t1 # prefetch key[4-5]
189 mov 20($key),$t0
190
191___
192 for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16); }
193$code.=<<___;
194 lea 16*4($key),$key
195 cmp $keyend,$key
196 mov 8($key),$t3 # prefetch key[2-3]
197 mov 12($key),$t2
198 je .Ledone
199
200 and @S[0],$t0
201 or @S[3],$t3
202 rol \$1,$t0
203 xor $t3,@S[2] # s2^=s3|key[3];
204 xor $t0,@S[1] # s1^=LeftRotate(s0&key[0],1);
205 and @S[2],$t2
206 or @S[1],$t1
207 rol \$1,$t2
208 xor $t1,@S[0] # s0^=s1|key[1];
209 xor $t2,@S[3] # s3^=LeftRotate(s2&key[2],1);
210 jmp .Leloop
211
212.align 16
213.Ledone:
214 xor @S[2],$t0 # SwapHalf
215 xor @S[3],$t1
216 xor @S[0],$t2
217 xor @S[1],$t3
218
219 mov $t0,@S[0]
220 mov $t1,@S[1]
221 mov $t2,@S[2]
222 mov $t3,@S[3]
223
224 retq
225.size _x86_64_Camellia_encrypt,.-_x86_64_Camellia_encrypt
226
227# V1.x API
228.globl Camellia_DecryptBlock
229.type Camellia_DecryptBlock,\@abi-omnipotent
230.align 16
231Camellia_DecryptBlock:
232 _CET_ENDBR
233 movl \$128,%eax
234 subl $arg0d,%eax
235 movl \$3,$arg0d
236 adcl \$0,$arg0d # keyBitLength==128?3:4
237 jmp .Ldec_rounds
238.size Camellia_DecryptBlock,.-Camellia_DecryptBlock
239# V2
240.globl Camellia_DecryptBlock_Rounds
241.type Camellia_DecryptBlock_Rounds,\@function,4
242.align 16
243.Ldec_rounds:
244Camellia_DecryptBlock_Rounds:
245 _CET_ENDBR
246 push %rbx
247 push %rbp
248 push %r13
249 push %r14
250 push %r15
251.Ldec_prologue:
252
253 #mov %rsi,$inp # put away arguments
254 mov %rcx,$out
255 mov %rdx,$keyend
256
257 shl \$6,%edi # process grandRounds
258 lea .LCamellia_SBOX(%rip),$Tbl
259 lea ($keyend,%rdi),$key
260
261 mov 0(%rsi),@S[0] # load plaintext
262 mov 4(%rsi),@S[1]
263 mov 8(%rsi),@S[2]
264 bswap @S[0]
265 mov 12(%rsi),@S[3]
266 bswap @S[1]
267 bswap @S[2]
268 bswap @S[3]
269
270 call _x86_64_Camellia_decrypt
271
272 bswap @S[0]
273 bswap @S[1]
274 bswap @S[2]
275 mov @S[0],0($out)
276 bswap @S[3]
277 mov @S[1],4($out)
278 mov @S[2],8($out)
279 mov @S[3],12($out)
280
281 mov 0(%rsp),%r15
282 mov 8(%rsp),%r14
283 mov 16(%rsp),%r13
284 mov 24(%rsp),%rbp
285 mov 32(%rsp),%rbx
286 lea 40(%rsp),%rsp
287.Ldec_epilogue:
288 ret
289.size Camellia_DecryptBlock_Rounds,.-Camellia_DecryptBlock_Rounds
290
291.type _x86_64_Camellia_decrypt,\@abi-omnipotent
292.align 16
293_x86_64_Camellia_decrypt:
294 _CET_ENDBR
295 xor 0($key),@S[1]
296 xor 4($key),@S[0] # ^=key[0-3]
297 xor 8($key),@S[3]
298 xor 12($key),@S[2]
299.align 16
300.Ldloop:
301 mov -8($key),$t1 # prefetch key[4-5]
302 mov -4($key),$t0
303
304___
305 for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8); }
306$code.=<<___;
307 lea -16*4($key),$key
308 cmp $keyend,$key
309 mov 0($key),$t3 # prefetch key[2-3]
310 mov 4($key),$t2
311 je .Lddone
312
313 and @S[0],$t0
314 or @S[3],$t3
315 rol \$1,$t0
316 xor $t3,@S[2] # s2^=s3|key[3];
317 xor $t0,@S[1] # s1^=LeftRotate(s0&key[0],1);
318 and @S[2],$t2
319 or @S[1],$t1
320 rol \$1,$t2
321 xor $t1,@S[0] # s0^=s1|key[1];
322 xor $t2,@S[3] # s3^=LeftRotate(s2&key[2],1);
323
324 jmp .Ldloop
325
326.align 16
327.Lddone:
328 xor @S[2],$t2
329 xor @S[3],$t3
330 xor @S[0],$t0
331 xor @S[1],$t1
332
333 mov $t2,@S[0] # SwapHalf
334 mov $t3,@S[1]
335 mov $t0,@S[2]
336 mov $t1,@S[3]
337
338 retq
339.size _x86_64_Camellia_decrypt,.-_x86_64_Camellia_decrypt
340___
341
# _saveround(rnd,key,[bias,]regs...): append stores of a round key to $code.
#   rnd  - slot number; each slot is 8 bytes wide
#   key  - base register operand of the key table
#   bias - optional integer displacement added to every offset
#   regs - either four 32-bit registers, stored pairwise swapped
#          (regs[1],regs[0],regs[3],regs[2]) at +0,+4,+8,+12, or one or
#          two registers stored at +0 and +8
# The bias is recognised by being an integer literal.  The previous test,
# int($T[0]), numified register names and could not express a bias of 0.
sub _saveround {
my ($rnd,$key,@T)=@_;
my $bias=(@T && $T[0]=~/^-?\d+$/)?shift(@T):0;

    if ($#T==3) {
	$code.=<<___;
 mov $T[1],`$bias+$rnd*8+0`($key)
 mov $T[0],`$bias+$rnd*8+4`($key)
 mov $T[3],`$bias+$rnd*8+8`($key)
 mov $T[2],`$bias+$rnd*8+12`($key)
___
    } else {
	$code.=" mov $T[0],`$bias+$rnd*8+0`($key)\n";
	$code.=" mov $T[1],`$bias+$rnd*8+8`($key)\n" if ($#T>=1);
    }
}
358
# _loadround(rnd,key,[bias,]regs...): append loads of a round key to $code.
#   rnd  - slot number; each slot is 8 bytes wide
#   key  - base register operand of the key table
#   bias - optional integer displacement added to every offset
#   regs - one or two destination registers, loaded from +0 and +8
# The bias is recognised by being an integer literal.  The previous test,
# int($T[0]), numified register names and could not express a bias of 0.
sub _loadround {
my ($rnd,$key,@T)=@_;
my $bias=(@T && $T[0]=~/^-?\d+$/)?shift(@T):0;

$code.=" mov `$bias+$rnd*8+0`($key),$T[0]\n";
$code.=" mov `$bias+$rnd*8+8`($key),$T[1]\n" if ($#T>=1);
}
366
367# shld is very slow on Intel EM64T family. Even on AMD it limits
368# instruction decode rate [because it's VectorPath] and consequently
369# performance...
# __rotl128(upper,lower,n): append code rotating the 128-bit value held
# in the register pair upper:lower left by n bits using shld, with %r11
# as scratch.  Nothing is emitted when n is 0.
sub __rotl128 {
my ($upper,$lower,$n)=@_;

    $code.=" mov $upper,%r11\n".
	   " shld \$$n,$lower,$upper\n".
	   " shld \$$n,%r11,$lower\n" if ($n);
}
381
382# ... Implementing 128-bit rotate without shld gives 80% better
383# performance on EM64T, +15% on AMD64 and only ~7% degradation on
384# Core2. This is therefore preferred.
# _rotl128(upper,lower,n): append code rotating the 128-bit value held in
# the register pair upper:lower left by n bits using plain shifts and
# ORs.  %r9 and %r11 are used as scratch registers.  Nothing is emitted
# when n is 0.  The `64-n` expression is evaluated by the final
# substitution pass over $code.
sub _rotl128 {
my ($upper,$lower,$n)=@_;

    $code.=join("",map { " $_\n" } (
	"mov $upper,%r11",
	"shl \$$n,$upper",
	"mov $lower,%r9",
	"shr \$`64-$n`,%r9",
	"shr \$`64-$n`,%r11",
	"or %r9,$upper",
	"shl \$$n,$lower",
	"or %r11,$lower",
    )) if ($n);
}
401
402{ my $step=0;
403
404$code.=<<___;
405.globl Camellia_Ekeygen
406.type Camellia_Ekeygen,\@function,3
407.align 16
408Camellia_Ekeygen:
409 _CET_ENDBR
410 push %rbx
411 push %rbp
412 push %r13
413 push %r14
414 push %r15
415.Lkey_prologue:
416
417 mov %rdi,$keyend # put away arguments, keyBitLength
418 mov %rdx,$out # keyTable
419
420 mov 0(%rsi),@S[0] # load 0-127 bits
421 mov 4(%rsi),@S[1]
422 mov 8(%rsi),@S[2]
423 mov 12(%rsi),@S[3]
424
425 bswap @S[0]
426 bswap @S[1]
427 bswap @S[2]
428 bswap @S[3]
429___
430 &_saveround (0,$out,@S); # KL<<<0
431$code.=<<___;
432 cmp \$128,$keyend # check keyBitLength
433 je .L1st128
434
435 mov 16(%rsi),@S[0] # load 128-191 bits
436 mov 20(%rsi),@S[1]
437 cmp \$192,$keyend
438 je .L1st192
439 mov 24(%rsi),@S[2] # load 192-255 bits
440 mov 28(%rsi),@S[3]
441 jmp .L1st256
442.L1st192:
443 mov @S[0],@S[2]
444 mov @S[1],@S[3]
445 not @S[2]
446 not @S[3]
447.L1st256:
448 bswap @S[0]
449 bswap @S[1]
450 bswap @S[2]
451 bswap @S[3]
452___
453 &_saveround (4,$out,@S); # temp storage for KR!
454$code.=<<___;
455 xor 0($out),@S[1] # KR^KL
456 xor 4($out),@S[0]
457 xor 8($out),@S[3]
458 xor 12($out),@S[2]
459
460.L1st128:
461 lea .LCamellia_SIGMA(%rip),$key
462 lea .LCamellia_SBOX(%rip),$Tbl
463
464 mov 0($key),$t1
465 mov 4($key),$t0
466___
467 &Camellia_Feistel($step++);
468 &Camellia_Feistel($step++);
469$code.=<<___;
470 xor 0($out),@S[1] # ^KL
471 xor 4($out),@S[0]
472 xor 8($out),@S[3]
473 xor 12($out),@S[2]
474___
475 &Camellia_Feistel($step++);
476 &Camellia_Feistel($step++);
477$code.=<<___;
478 cmp \$128,$keyend
479 jne .L2nd256
480
481 lea 128($out),$out # size optimization
482 shl \$32,%r8 # @S[0]||
483 shl \$32,%r10 # @S[2]||
484 or %r9,%r8 # ||@S[1]
485 or %r11,%r10 # ||@S[3]
486___
# 128-bit key schedule.  KL is reloaded from slot 0 into %rax:%rbx and KA
# is held in %r8:%r10; each pair is rotated left step by step (the
# trailing comments track the accumulated rotation) and stored into the
# key table.  All offsets carry a -128 bias because $out was advanced by
# 128 just before this sequence.
487 &_loadround (0,$out,-128,"%rax","%rbx"); # KL
488 &_saveround (2,$out,-128,"%r8","%r10"); # KA<<<0
489 &_rotl128 ("%rax","%rbx",15);
490 &_saveround (4,$out,-128,"%rax","%rbx"); # KL<<<15
491 &_rotl128 ("%r8","%r10",15);
492 &_saveround (6,$out,-128,"%r8","%r10"); # KA<<<15
493 &_rotl128 ("%r8","%r10",15); # 15+15=30
494 &_saveround (8,$out,-128,"%r8","%r10"); # KA<<<30
495 &_rotl128 ("%rax","%rbx",30); # 15+30=45
496 &_saveround (10,$out,-128,"%rax","%rbx"); # KL<<<45
497 &_rotl128 ("%r8","%r10",15); # 30+15=45
498 &_saveround (12,$out,-128,"%r8"); # KA<<<45 (only %r8 is stored)
499 &_rotl128 ("%rax","%rbx",15); # 45+15=60
500 &_saveround (13,$out,-128,"%rbx"); # KL<<<60 (only %rbx is stored)
501 &_rotl128 ("%r8","%r10",15); # 45+15=60
502 &_saveround (14,$out,-128,"%r8","%r10"); # KA<<<60
503 &_rotl128 ("%rax","%rbx",17); # 60+17=77
504 &_saveround (16,$out,-128,"%rax","%rbx"); # KL<<<77
505 &_rotl128 ("%rax","%rbx",17); # 77+17=94
506 &_saveround (18,$out,-128,"%rax","%rbx"); # KL<<<94
507 &_rotl128 ("%r8","%r10",34); # 60+34=94
508 &_saveround (20,$out,-128,"%r8","%r10"); # KA<<<94
509 &_rotl128 ("%rax","%rbx",17); # 94+17=111
510 &_saveround (22,$out,-128,"%rax","%rbx"); # KL<<<111
511 &_rotl128 ("%r8","%r10",17); # 94+17=111
512 &_saveround (24,$out,-128,"%r8","%r10"); # KA<<<111
513$code.=<<___;
514 mov \$3,%eax
515 jmp .Ldone
516.align 16
517.L2nd256:
518___
519 &_saveround (6,$out,@S); # temp storage for KA!
520$code.=<<___;
521 xor `4*8+0`($out),@S[1] # KA^KR
522 xor `4*8+4`($out),@S[0]
523 xor `5*8+0`($out),@S[3]
524 xor `5*8+4`($out),@S[2]
525___
526 &Camellia_Feistel($step++);
527 &Camellia_Feistel($step++);
528
529 &_loadround (0,$out,"%rax","%rbx"); # KL
530 &_loadround (4,$out,"%rcx","%rdx"); # KR
531 &_loadround (6,$out,"%r14","%r15"); # KA
532$code.=<<___;
533 lea 128($out),$out # size optimization
534 shl \$32,%r8 # @S[0]||
535 shl \$32,%r10 # @S[2]||
536 or %r9,%r8 # ||@S[1]
537 or %r11,%r10 # ||@S[3]
538___
# Key schedule for key lengths other than 128 bits.  KL is in %rax:%rbx,
# KR in %rcx:%rdx and KA in %r14:%r15 (all loaded just above); KB is in
# %r8:%r10.  Each pair is rotated left step by step (the trailing
# comments track the accumulated rotation) and stored into the key
# table.  All offsets carry a -128 bias because $out was advanced by 128
# just before this sequence.
539 &_saveround (2,$out,-128,"%r8","%r10"); # KB<<<0
540 &_rotl128 ("%rcx","%rdx",15);
541 &_saveround (4,$out,-128,"%rcx","%rdx"); # KR<<<15
542 &_rotl128 ("%r14","%r15",15);
543 &_saveround (6,$out,-128,"%r14","%r15"); # KA<<<15
544 &_rotl128 ("%rcx","%rdx",15); # 15+15=30
545 &_saveround (8,$out,-128,"%rcx","%rdx"); # KR<<<30
546 &_rotl128 ("%r8","%r10",30);
547 &_saveround (10,$out,-128,"%r8","%r10"); # KB<<<30
548 &_rotl128 ("%rax","%rbx",45);
549 &_saveround (12,$out,-128,"%rax","%rbx"); # KL<<<45
550 &_rotl128 ("%r14","%r15",30); # 15+30=45
551 &_saveround (14,$out,-128,"%r14","%r15"); # KA<<<45
552 &_rotl128 ("%rax","%rbx",15); # 45+15=60
553 &_saveround (16,$out,-128,"%rax","%rbx"); # KL<<<60
554 &_rotl128 ("%rcx","%rdx",30); # 30+30=60
555 &_saveround (18,$out,-128,"%rcx","%rdx"); # KR<<<60
556 &_rotl128 ("%r8","%r10",30); # 30+30=60
557 &_saveround (20,$out,-128,"%r8","%r10"); # KB<<<60
558 &_rotl128 ("%rax","%rbx",17); # 60+17=77
559 &_saveround (22,$out,-128,"%rax","%rbx"); # KL<<<77
560 &_rotl128 ("%r14","%r15",32); # 45+32=77
561 &_saveround (24,$out,-128,"%r14","%r15"); # KA<<<77
562 &_rotl128 ("%rcx","%rdx",34); # 60+34=94
563 &_saveround (26,$out,-128,"%rcx","%rdx"); # KR<<<94
564 &_rotl128 ("%r14","%r15",17); # 77+17=94
565 &_saveround (28,$out,-128,"%r14","%r15"); # KA<<<94
566 &_rotl128 ("%rax","%rbx",34); # 77+34=111
567 &_saveround (30,$out,-128,"%rax","%rbx"); # KL<<<111
568 &_rotl128 ("%r8","%r10",51); # 60+51=111
569 &_saveround (32,$out,-128,"%r8","%r10"); # KB<<<111
570$code.=<<___;
571 mov \$4,%eax
572.Ldone:
573 mov 0(%rsp),%r15
574 mov 8(%rsp),%r14
575 mov 16(%rsp),%r13
576 mov 24(%rsp),%rbp
577 mov 32(%rsp),%rbx
578 lea 40(%rsp),%rsp
579.Lkey_epilogue:
580 ret
581.size Camellia_Ekeygen,.-Camellia_Ekeygen
582___
583}
584
# 256-entry byte substitution table.  The S1110/S4404/S0222/S3033
# helpers below derive the 32-bit lookup table words from it.
# NOTE(review): presumably Camellia's s-box 1 -- confirm against the
# cipher specification.
585@SBOX=(
586112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65,
587 35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189,
588134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26,
589166,225, 57,202,213, 71, 93, 61,217, 1, 90,214, 81, 86,108, 77,
590139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153,
591223, 76,203,194, 52,126,118, 5,109,183,169, 49,209, 23, 4,215,
592 20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34,
593254, 68,207,178,195,181,122,145, 36, 8,232,168, 96,252,105, 80,
594170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210,
595 16,196, 0, 72,163,247,117,219,138, 3,230,218, 9, 63,221,148,
596135, 92,131, 2,205, 74,144, 51,115,103,246,243,157,127,191,226,
597 82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46,
598233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89,
599120,152, 6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250,
600114, 7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164,
601 64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);
602
# S1110(i): format SBOX[i] replicated into the three upper bytes of a
# 32-bit word (lowest byte zero) as "0x%08x".  Uses a scalar element
# access instead of the one-element slice @SBOX[$i].
sub S1110 { my $v=$SBOX[shift(@_)]; sprintf("0x%08x",$v<<24|$v<<16|$v<<8); }
# S4404(i): look up SBOX at the index i rotated left by one bit (within
# 8 bits) and format the value placed in bytes 3, 2 and 0 of a 32-bit
# word (byte 1 zero) as "0x%08x".  Uses a scalar element access instead
# of the one-element slice @SBOX[$i].
sub S4404 { my $i=shift; my $v=$SBOX[($i<<1|$i>>7)&0xff]; sprintf("0x%08x",$v<<24|$v<<16|$v); }
# S0222(i): rotate SBOX[i] left by one bit (within 8 bits) and format the
# result replicated into the three lower bytes of a 32-bit word (top
# byte zero) as "0x%08x".  Uses a scalar element access instead of the
# one-element slice @SBOX[$i].
sub S0222 { my $v=$SBOX[shift(@_)]; $v=($v<<1|$v>>7)&0xff; sprintf("0x%08x",$v<<16|$v<<8|$v); }
# S3033(i): rotate SBOX[i] right by one bit (within 8 bits) and format
# the result placed in bytes 3, 1 and 0 of a 32-bit word (byte 2 zero)
# as "0x%08x".  Uses a scalar element access instead of the one-element
# slice @SBOX[$i].
sub S3033 { my $v=$SBOX[shift(@_)]; $v=($v>>1|$v<<7)&0xff; sprintf("0x%08x",$v<<24|$v<<8|$v); }
607
608$code.=<<___;
609.section .rodata
610.align 64
611.LCamellia_SIGMA:
612.long 0x3bcc908b, 0xa09e667f, 0x4caa73b2, 0xb67ae858
613.long 0xe94f82be, 0xc6ef372f, 0xf1d36f1c, 0x54ff53a5
614.long 0xde682d1d, 0x10e527fa, 0xb3e6c1fd, 0xb05688c2
615.long 0, 0, 0, 0
616.LCamellia_SBOX:
617___
618# tables are interleaved, remember?
# data_word(values...): append one ".long" directive listing the given
# values, comma separated, to $code.
sub data_word {
    my @words=@_;
    $code.=sprintf(".long\t%s\n",join(',',@words));
}
# Emit the table data that follows the .LCamellia_SBOX label: 256 rows
# pairing S1110 with S4404, then 256 rows pairing S0222 with S3033.
foreach my $pair ([\&S1110,\&S4404],[\&S0222,\&S3033]) {
    my ($even,$odd)=@$pair;
    for ($i=0;$i<256;$i++) { &data_word($even->($i),$odd->($i)); }
}
622
623# void Camellia_cbc_encrypt (const unsigned char *inp, unsigned char *out,
624# size_t length, const CAMELLIA_KEY *key,
625# unsigned char *ivp,const int enc);
626{
627$_key="0(%rsp)";
628$_end="8(%rsp)"; # inp+len&~15
629$_res="16(%rsp)"; # len&15
630$ivec="24(%rsp)";
631$_ivp="40(%rsp)";
632$_rsp="48(%rsp)";
633
634$code.=<<___;
635.text
636.globl Camellia_cbc_encrypt
637.type Camellia_cbc_encrypt,\@function,6
638.align 16
639Camellia_cbc_encrypt:
640 _CET_ENDBR
641 cmp \$0,%rdx
642 je .Lcbc_abort
643 push %rbx
644 push %rbp
645 push %r12
646 push %r13
647 push %r14
648 push %r15
649.Lcbc_prologue:
650
651 mov %rsp,%rbp
652 sub \$64,%rsp
653 and \$-64,%rsp
654
655 # place stack frame just "above mod 1024" the key schedule,
656 # this ensures that cache associativity suffices
657 lea -64-63(%rcx),%r10
658 sub %rsp,%r10
659 neg %r10
660 and \$0x3C0,%r10
661 sub %r10,%rsp
662 #add \$8,%rsp # 8 is reserved for callee's ra
663
664 mov %rdi,$inp # inp argument
665 mov %rsi,$out # out argument
666 mov %r8,%rbx # ivp argument
667 mov %rcx,$key # key argument
668 mov 272(%rcx),${keyend}d # grandRounds
669
670 mov %r8,$_ivp
671 mov %rbp,$_rsp
672
673.Lcbc_body:
674 lea .LCamellia_SBOX(%rip),$Tbl
675
676 mov \$32,%ecx
677.align 4
678.Lcbc_prefetch_sbox:
679 mov 0($Tbl),%rax
680 mov 32($Tbl),%rsi
681 mov 64($Tbl),%rdi
682 mov 96($Tbl),%r11
683 lea 128($Tbl),$Tbl
684 loop .Lcbc_prefetch_sbox
685 sub \$4096,$Tbl
686 shl \$6,$keyend
687 mov %rdx,%rcx # len argument
688 lea ($key,$keyend),$keyend
689
690 cmp \$0,%r9d # enc argument
691 je .LCBC_DECRYPT
692
693 and \$-16,%rdx
694 and \$15,%rcx # length residue
695 lea ($inp,%rdx),%rdx
696 mov $key,$_key
697 mov %rdx,$_end
698 mov %rcx,$_res
699
700 cmp $inp,%rdx
701 mov 0(%rbx),@S[0] # load IV
702 mov 4(%rbx),@S[1]
703 mov 8(%rbx),@S[2]
704 mov 12(%rbx),@S[3]
705 je .Lcbc_enc_tail
706 jmp .Lcbc_eloop
707
708.align 16
709.Lcbc_eloop:
710 xor 0($inp),@S[0]
711 xor 4($inp),@S[1]
712 xor 8($inp),@S[2]
713 bswap @S[0]
714 xor 12($inp),@S[3]
715 bswap @S[1]
716 bswap @S[2]
717 bswap @S[3]
718
719 call _x86_64_Camellia_encrypt
720
721 mov $_key,$key # "rewind" the key
722 bswap @S[0]
723 mov $_end,%rdx
724 bswap @S[1]
725 mov $_res,%rcx
726 bswap @S[2]
727 mov @S[0],0($out)
728 bswap @S[3]
729 mov @S[1],4($out)
730 mov @S[2],8($out)
731 lea 16($inp),$inp
732 mov @S[3],12($out)
733 cmp %rdx,$inp
734 lea 16($out),$out
735 jne .Lcbc_eloop
736
737 cmp \$0,%rcx
738 jne .Lcbc_enc_tail
739
740 mov $_ivp,$out
741 mov @S[0],0($out) # write out IV residue
742 mov @S[1],4($out)
743 mov @S[2],8($out)
744 mov @S[3],12($out)
745 jmp .Lcbc_done
746
747.align 16
748.Lcbc_enc_tail:
749 xor %rax,%rax
750 mov %rax,0+$ivec
751 mov %rax,8+$ivec
752 mov %rax,$_res
753
754.Lcbc_enc_pushf:
755 pushfq
756 cld
757 mov $inp,%rsi
758 lea 8+$ivec,%rdi
759 .long 0x9066A4F3 # rep movsb
760 popfq
761.Lcbc_enc_popf:
762
763 lea $ivec,$inp
764 lea 16+$ivec,%rax
765 mov %rax,$_end
766 jmp .Lcbc_eloop # one more time
767
768.align 16
769.LCBC_DECRYPT:
770 xchg $key,$keyend
771 add \$15,%rdx
772 and \$15,%rcx # length residue
773 and \$-16,%rdx
774 mov $key,$_key
775 lea ($inp,%rdx),%rdx
776 mov %rdx,$_end
777 mov %rcx,$_res
778
779 mov (%rbx),%rax # load IV
780 mov 8(%rbx),%rbx
781 jmp .Lcbc_dloop
782.align 16
783.Lcbc_dloop:
784 mov 0($inp),@S[0]
785 mov 4($inp),@S[1]
786 mov 8($inp),@S[2]
787 bswap @S[0]
788 mov 12($inp),@S[3]
789 bswap @S[1]
790 mov %rax,0+$ivec # save IV to temporary storage
791 bswap @S[2]
792 mov %rbx,8+$ivec
793 bswap @S[3]
794
795 call _x86_64_Camellia_decrypt
796
797 mov $_key,$key # "rewind" the key
798 mov $_end,%rdx
799 mov $_res,%rcx
800
801 bswap @S[0]
802 mov ($inp),%rax # load IV for next iteration
803 bswap @S[1]
804 mov 8($inp),%rbx
805 bswap @S[2]
806 xor 0+$ivec,@S[0]
807 bswap @S[3]
808 xor 4+$ivec,@S[1]
809 xor 8+$ivec,@S[2]
810 lea 16($inp),$inp
811 xor 12+$ivec,@S[3]
812 cmp %rdx,$inp
813 je .Lcbc_ddone
814
815 mov @S[0],0($out)
816 mov @S[1],4($out)
817 mov @S[2],8($out)
818 mov @S[3],12($out)
819
820 lea 16($out),$out
821 jmp .Lcbc_dloop
822
823.align 16
824.Lcbc_ddone:
825 mov $_ivp,%rdx
826 cmp \$0,%rcx
827 jne .Lcbc_dec_tail
828
829 mov @S[0],0($out)
830 mov @S[1],4($out)
831 mov @S[2],8($out)
832 mov @S[3],12($out)
833
834 mov %rax,(%rdx) # write out IV residue
835 mov %rbx,8(%rdx)
836 jmp .Lcbc_done
837.align 16
838.Lcbc_dec_tail:
839 mov @S[0],0+$ivec
840 mov @S[1],4+$ivec
841 mov @S[2],8+$ivec
842 mov @S[3],12+$ivec
843
844.Lcbc_dec_pushf:
845 pushfq
846 cld
847 lea 8+$ivec,%rsi
848 lea ($out),%rdi
849 .long 0x9066A4F3 # rep movsb
850 popfq
851.Lcbc_dec_popf:
852
853 mov %rax,(%rdx) # write out IV residue
854 mov %rbx,8(%rdx)
855 jmp .Lcbc_done
856
857.align 16
858.Lcbc_done:
859 mov $_rsp,%rcx
860 mov 0(%rcx),%r15
861 mov 8(%rcx),%r14
862 mov 16(%rcx),%r13
863 mov 24(%rcx),%r12
864 mov 32(%rcx),%rbp
865 mov 40(%rcx),%rbx
866 lea 48(%rcx),%rsp
867.Lcbc_abort:
868 ret
869.size Camellia_cbc_encrypt,.-Camellia_cbc_encrypt
870___
871}
872
# Replace every `expr` left in the generated text with the value of the
# Perl expression, then send the result through the translator pipe.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# STDOUT is the pipe to the translator script; closing it is where a
# failed write or a failing translator becomes visible, so check it.
close STDOUT or die "error closing STDOUT: $!";