summaryrefslogtreecommitdiff
path: root/src/lib/libcrypto/camellia/asm/cmll-x86.pl
diff options
context:
space:
mode:
Diffstat (limited to 'src/lib/libcrypto/camellia/asm/cmll-x86.pl')
-rw-r--r--src/lib/libcrypto/camellia/asm/cmll-x86.pl1126
1 files changed, 0 insertions, 1126 deletions
diff --git a/src/lib/libcrypto/camellia/asm/cmll-x86.pl b/src/lib/libcrypto/camellia/asm/cmll-x86.pl
deleted file mode 100644
index a4ab11e54d..0000000000
--- a/src/lib/libcrypto/camellia/asm/cmll-x86.pl
+++ /dev/null
@@ -1,1126 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
5#
6# This module may be used under the terms of either the GNU General
7# Public License version 2 or later, the GNU Lesser General Public
8# License version 2.1 or later, the Mozilla Public License version
9# 1.1 or the BSD License. The exact terms of either license are
10# distributed along with this module. For further details see
11# http://www.openssl.org/~appro/camellia/.
12# ====================================================================
13
14# Performance in cycles per processed byte (less is better) in
15# 'openssl speed ...' benchmark:
16#
17# AMD K8 Core2 PIII P4
18# -evp camellia-128-ecb 21.5 22.8 27.0 28.9
19# + over gcc 3.4.6 +90/11% +70/10% +53/4% +160/64%
20# + over icc 8.0 +48/19% +21/15% +21/17% +55/37%
21#
22# camellia-128-cbc 17.3 21.1 23.9 25.9
23#
24# 128-bit key setup 196 280 256 240 cycles/key
25# + over gcc 3.4.6 +30/0% +17/11% +11/0% +63/40%
26# + over icc 8.0 +18/3% +10/0% +10/3% +21/10%
27#
28# Pairs of numbers in "+" rows represent performance improvement over
29# compiler generated position-independent code, PIC, and non-PIC
30# respectively. PIC results are of greater relevance, as this module
31# is position-independent, i.e. suitable for a shared library or PIE.
32# Position independence "costs" one register, which is why compilers
33# are so close with non-PIC results, they have an extra register to
34# spare. CBC results are better than ECB ones thanks to "zero-copy"
35# private _x86_* interface, and are ~30-40% better than with compiler
36# generated cmll_cbc.o, and reach ~80-90% of x86_64 performance on
37# same CPU (where applicable).
38
39$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
40push(@INC,"${dir}","${dir}../../perlasm");
41require "x86asm.pl";
42
43$OPENSSL=1;
44
45&asm_init($ARGV[0],"cmll-586.pl",$ARGV[$#ARGV] eq "386");
46
47@T=("eax","ebx","ecx","edx");
48$idx="esi";
49$key="edi";
50$Tbl="ebp";
51
52# stack frame layout in _x86_Camellia_* routines, frame is allocated
53# by caller
54$__ra=&DWP(0,"esp"); # return address
55$__s0=&DWP(4,"esp"); # s0 backing store
56$__s1=&DWP(8,"esp"); # s1 backing store
57$__s2=&DWP(12,"esp"); # s2 backing store
58$__s3=&DWP(16,"esp"); # s3 backing store
59$__end=&DWP(20,"esp"); # pointer to end/start of key schedule
60
61# stack frame layout in Camellia_[en|de]crypt routines, which differs from
62# above by 4 and overlaps by pointer to end/start of key schedule
63$_end=&DWP(16,"esp");
64$_esp=&DWP(20,"esp");
65
66# const unsigned int Camellia_SBOX[4][256];
67# Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
68# and [2][] - with [3][]. This is done to optimize code size.
69$SBOX1_1110=0; # Camellia_SBOX[0]
70$SBOX4_4404=4; # Camellia_SBOX[1]
71$SBOX2_0222=2048; # Camellia_SBOX[2]
72$SBOX3_3033=2052; # Camellia_SBOX[3]
73&static_label("Camellia_SIGMA");
74&static_label("Camellia_SBOX");
75
# Emit the instruction sequence for one Camellia Feistel round.
#
# Arguments:
#   $i     - round index; picks the subkey pair and, through its parity,
#            which two of the @T registers are the round input (t0,t1)
#            and which two are scratch/output (t2,t3)
#   $seed  - optional byte offset of the subkeys relative to $key
#            (default 0); a negative seed makes the subkeys be walked
#            downwards in 8-byte steps instead of upwards
#   $frame - optional byte offset of the s[0-3] backing store from %esp
#            (default 0)
#
# On entry $idx must hold the first subkey word of this round. On exit it
# holds the first subkey word of round $i+1, the two updated state words
# have been written to the backing store, and t0/t1 have been clobbered.
sub Camellia_Feistel {
    my ($i,$seed,$frame)=@_;
    $seed=0  unless defined($seed);
    $frame=0 unless defined($frame);
    my $scale=$seed<0?-8:8;
    my $j=($i&1)*2;
    # All four names must be lexical: "my $t0=...,$t1=..." would declare
    # only $t0 and silently assign the rest to package globals.
    my ($t0,$t1,$t2,$t3)=map { $T[($j+$_)%4] } (0..3);

    &xor    ($t0,$idx);                             # t0^=key[0]
    &xor    ($t1,&DWP($seed+$i*$scale+4,$key));     # t1^=key[1]
    &movz   ($idx,&HB($t0));                        # (t0>>8)&0xff
    &mov    ($t3,&DWP($SBOX3_3033,$Tbl,$idx,8));    # t3=SBOX3_3033[0]
    &movz   ($idx,&LB($t0));                        # (t0>>0)&0xff
    &xor    ($t3,&DWP($SBOX4_4404,$Tbl,$idx,8));    # t3^=SBOX4_4404[0]
    &shr    ($t0,16);
    &movz   ($idx,&LB($t1));                        # (t1>>0)&0xff
    &mov    ($t2,&DWP($SBOX1_1110,$Tbl,$idx,8));    # t2=SBOX1_1110[1]
    &movz   ($idx,&HB($t0));                        # (t0>>24)&0xff
    &xor    ($t3,&DWP($SBOX1_1110,$Tbl,$idx,8));    # t3^=SBOX1_1110[0]
    &movz   ($idx,&HB($t1));                        # (t1>>8)&0xff
    &xor    ($t2,&DWP($SBOX4_4404,$Tbl,$idx,8));    # t2^=SBOX4_4404[1]
    &shr    ($t1,16);
    &movz   ($t0,&LB($t0));                         # (t0>>16)&0xff
    &xor    ($t3,&DWP($SBOX2_0222,$Tbl,$t0,8));     # t3^=SBOX2_0222[0]
    &movz   ($idx,&HB($t1));                        # (t1>>24)&0xff
    &mov    ($t0,&DWP($frame+4*(($j+3)%4),"esp"));  # prefetch "s3"
    &xor    ($t2,$t3);                              # t2^=t3
    &rotr   ($t3,8);                                # t3=RightRotate(t3,8)
    &xor    ($t2,&DWP($SBOX2_0222,$Tbl,$idx,8));    # t2^=SBOX2_0222[1]
    &movz   ($idx,&LB($t1));                        # (t1>>16)&0xff
    &mov    ($t1,&DWP($frame+4*(($j+2)%4),"esp"));  # prefetch "s2"
    &xor    ($t3,$t0);                              # t3^=s3
    &xor    ($t2,&DWP($SBOX3_3033,$Tbl,$idx,8));    # t2^=SBOX3_3033[1]
    &mov    ($idx,&DWP($seed+($i+1)*$scale,$key));  # prefetch key[i+1]
    &xor    ($t3,$t2);                              # t3^=t2
    &mov    (&DWP($frame+4*(($j+3)%4),"esp"),$t3);  # s3=t3
    &xor    ($t2,$t1);                              # t2^=s2
    &mov    (&DWP($frame+4*(($j+2)%4),"esp"),$t2);  # s2=t2
}
115
116# void Camellia_EncryptBlock_Rounds(
117# int grandRounds,
118# const Byte plaintext[],
119# const KEY_TABLE_TYPE keyTable,
120# Byte ciphertext[])
121&function_begin("Camellia_EncryptBlock_Rounds");
122 &mov ("eax",&wparam(0)); # load grandRounds
123 &mov ($idx,&wparam(1)); # load plaintext pointer
124 &mov ($key,&wparam(2)); # load key schedule pointer
125
126 &mov ("ebx","esp");
127 &sub ("esp",7*4); # place for s[0-3],keyEnd,esp and ra
128 &and ("esp",-64);
129
130 # place stack frame just "above mod 1024" the key schedule
131 # this ensures that cache associativity of 2 suffices
132 &lea ("ecx",&DWP(-64-63,$key));
133 &sub ("ecx","esp");
134 &neg ("ecx");
135 &and ("ecx",0x3C0); # modulo 1024, but aligned to cache-line
136 &sub ("esp","ecx");
137 &add ("esp",4); # 4 is reserved for callee's return address
138
139 &shl ("eax",6);
140 &lea ("eax",&DWP(0,$key,"eax"));
141 &mov ($_esp,"ebx"); # save %esp
142 &mov ($_end,"eax"); # save keyEnd
143
144 &picsetup($Tbl);
145 &picsymbol($Tbl, &label("Camellia_SBOX"), $Tbl);
146
147 &mov (@T[0],&DWP(0,$idx)); # load plaintext
148 &mov (@T[1],&DWP(4,$idx));
149 &mov (@T[2],&DWP(8,$idx));
150 &bswap (@T[0]);
151 &mov (@T[3],&DWP(12,$idx));
152 &bswap (@T[1]);
153 &bswap (@T[2]);
154 &bswap (@T[3]);
155
156 &call ("_x86_Camellia_encrypt");
157
158 &mov ("esp",$_esp);
159 &bswap (@T[0]);
160 &mov ($idx,&wparam(3)); # load ciphertext pointer
161 &bswap (@T[1]);
162 &bswap (@T[2]);
163 &bswap (@T[3]);
164 &mov (&DWP(0,$idx),@T[0]); # write ciphertext
165 &mov (&DWP(4,$idx),@T[1]);
166 &mov (&DWP(8,$idx),@T[2]);
167 &mov (&DWP(12,$idx),@T[3]);
168&function_end("Camellia_EncryptBlock_Rounds");
169# V1.x API
# Camellia_EncryptBlock is a thin front end to Camellia_EncryptBlock_Rounds:
# its first argument is keyBitLength rather than grandRounds, so it rewrites
# that argument slot in place and jumps (not calls) to the _Rounds routine.
170&function_begin_B("Camellia_EncryptBlock");
171 &mov ("eax",128);
172 &sub ("eax",&wparam(0)); # 128-keyBitLength, only the resulting carry is used
173 &mov ("eax",3);
174 &adc ("eax",0); # keyBitLength==128?3:4
175 &mov (&wparam(0),"eax"); # replace keyBitLength with grandRounds
176 &jmp (&label("Camellia_EncryptBlock_Rounds"));
177&function_end_B("Camellia_EncryptBlock");
178
179if ($OPENSSL) {
180# void Camellia_encrypt(
181# const unsigned char *in,
182# unsigned char *out,
183# const CAMELLIA_KEY *key)
184&function_begin("Camellia_encrypt");
185 &mov ($idx,&wparam(0)); # load plaintext pointer
186 &mov ($key,&wparam(2)); # load key schedule pointer
187
188 &mov ("ebx","esp");
189 &sub ("esp",7*4); # place for s[0-3],keyEnd,esp and ra
190 &and ("esp",-64);
191 &mov ("eax",&DWP(272,$key)); # load grandRounds counter
192
193 # place stack frame just "above mod 1024" the key schedule
194 # this ensures that cache associativity of 2 suffices
195 &lea ("ecx",&DWP(-64-63,$key));
196 &sub ("ecx","esp");
197 &neg ("ecx");
198 &and ("ecx",0x3C0); # modulo 1024, but aligned to cache-line
199 &sub ("esp","ecx");
200 &add ("esp",4); # 4 is reserved for callee's return address
201
202 &shl ("eax",6);
203 &lea ("eax",&DWP(0,$key,"eax"));
204 &mov ($_esp,"ebx"); # save %esp
205 &mov ($_end,"eax"); # save keyEnd
206
207 &picsetup($Tbl);
208 &picsymbol($Tbl, &label("Camellia_SBOX"), $Tbl);
209
210 &mov (@T[0],&DWP(0,$idx)); # load plaintext
211 &mov (@T[1],&DWP(4,$idx));
212 &mov (@T[2],&DWP(8,$idx));
213 &bswap (@T[0]);
214 &mov (@T[3],&DWP(12,$idx));
215 &bswap (@T[1]);
216 &bswap (@T[2]);
217 &bswap (@T[3]);
218
219 &call ("_x86_Camellia_encrypt");
220
221 &mov ("esp",$_esp);
222 &bswap (@T[0]);
223 &mov ($idx,&wparam(1)); # load ciphertext pointer
224 &bswap (@T[1]);
225 &bswap (@T[2]);
226 &bswap (@T[3]);
227 &mov (&DWP(0,$idx),@T[0]); # write ciphertext
228 &mov (&DWP(4,$idx),@T[1]);
229 &mov (&DWP(8,$idx),@T[2]);
230 &mov (&DWP(12,$idx),@T[3]);
231&function_end("Camellia_encrypt");
232}
233
# Private encryption core with a register-based calling convention.
# In:  @T[0-3] hold the byte-swapped input block, $key points at the start
#      of the key schedule, $Tbl at Camellia_SBOX, and the caller has set up
#      the frame described by $__s0..$__s3 and $__end (pointer to the end of
#      the key schedule).
# Out: @T[0-3] hold the output block (still to be byte-swapped by the
#      caller); $idx and $key are clobbered.
234&function_begin_B("_x86_Camellia_encrypt");
235 &xor (@T[0],&DWP(0,$key)); # ^=key[0-3]
236 &xor (@T[1],&DWP(4,$key));
237 &xor (@T[2],&DWP(8,$key));
238 &xor (@T[3],&DWP(12,$key));
239 &mov ($idx,&DWP(16,$key)); # prefetch key[4]
240
241 &mov ($__s0,@T[0]); # save s[0-3]
242 &mov ($__s1,@T[1]);
243 &mov ($__s2,@T[2]);
244 &mov ($__s3,@T[3]);
245
246&set_label("loop",16);
# six Feistel rounds per pass; subkeys start 16 bytes into the current
# 64-byte slice of the schedule, state lives at 4(%esp) and up
247 for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16,4); }
248
249 &add ($key,16*4); # advance to the next 64-byte slice
250 &cmp ($key,$__end);
251 &je (&label("done")); # end of schedule reached, skip the FL layer
252
# FL/FL^-1 layer between groups of six rounds
253 # @T[0-1] are preloaded, $idx is preloaded with key[0]
254 &and ($idx,@T[0]);
255 &mov (@T[3],$__s3);
256 &rotl ($idx,1);
257 &mov (@T[2],@T[3]);
258 &xor (@T[1],$idx);
259 &or (@T[2],&DWP(12,$key));
260 &mov ($__s1,@T[1]); # s1^=LeftRotate(s0&key[0],1);
261 &xor (@T[2],$__s2);
262
263 &mov ($idx,&DWP(4,$key));
264 &mov ($__s2,@T[2]); # s2^=s3|key[3];
265 &or ($idx,@T[1]);
266 &and (@T[2],&DWP(8,$key));
267 &xor (@T[0],$idx);
268 &rotl (@T[2],1);
269 &mov ($__s0,@T[0]); # s0^=s1|key[1];
270 &xor (@T[3],@T[2]);
271 &mov ($idx,&DWP(16,$key)); # prefetch key[4]
272 &mov ($__s3,@T[3]); # s3^=LeftRotate(s2&key[2],1);
273 &jmp (&label("loop"));
274
275&set_label("done",8);
# final key whitening combined with the swap of the two 64-bit halves
276 &mov (@T[2],@T[0]); # SwapHalf
277 &mov (@T[3],@T[1]);
278 &mov (@T[0],$__s2);
279 &mov (@T[1],$__s3);
280 &xor (@T[0],$idx); # $idx is preloaded with key[0]
281 &xor (@T[1],&DWP(4,$key));
282 &xor (@T[2],&DWP(8,$key));
283 &xor (@T[3],&DWP(12,$key));
284 &ret ();
285&function_end_B("_x86_Camellia_encrypt");
286
287# void Camellia_DecryptBlock_Rounds(
288# int grandRounds,
289# const Byte ciphertext[],
290# const KEY_TABLE_TYPE keyTable,
291# Byte plaintext[])
292&function_begin("Camellia_DecryptBlock_Rounds");
293 &mov ("eax",&wparam(0)); # load grandRounds
294 &mov ($idx,&wparam(1)); # load ciphertext pointer
295 &mov ($key,&wparam(2)); # load key schedule pointer
296
297 &mov ("ebx","esp");
298 &sub ("esp",7*4); # place for s[0-3],keyEnd,esp and ra
299 &and ("esp",-64);
300
301 # place stack frame just "above mod 1024" the key schedule
302 # this ensures that cache associativity of 2 suffices
303 &lea ("ecx",&DWP(-64-63,$key));
304 &sub ("ecx","esp");
305 &neg ("ecx");
306 &and ("ecx",0x3C0); # modulo 1024, but aligned to cache-line
307 &sub ("esp","ecx");
308 &add ("esp",4); # 4 is reserved for callee's return address
309
310 &shl ("eax",6);
311 &mov (&DWP(4*4,"esp"),$key); # save keyStart
312 &lea ($key,&DWP(0,$key,"eax"));
313 &mov (&DWP(5*4,"esp"),"ebx");# save %esp
314
315 &picsetup($Tbl);
316 &picsymbol($Tbl, &label("Camellia_SBOX"), $Tbl);
317
318 &mov (@T[0],&DWP(0,$idx)); # load ciphertext
319 &mov (@T[1],&DWP(4,$idx));
320 &mov (@T[2],&DWP(8,$idx));
321 &bswap (@T[0]);
322 &mov (@T[3],&DWP(12,$idx));
323 &bswap (@T[1]);
324 &bswap (@T[2]);
325 &bswap (@T[3]);
326
327 &call ("_x86_Camellia_decrypt");
328
329 &mov ("esp",&DWP(5*4,"esp"));
330 &bswap (@T[0]);
331 &mov ($idx,&wparam(3)); # load plaintext pointer
332 &bswap (@T[1]);
333 &bswap (@T[2]);
334 &bswap (@T[3]);
335 &mov (&DWP(0,$idx),@T[0]); # write plaintext
336 &mov (&DWP(4,$idx),@T[1]);
337 &mov (&DWP(8,$idx),@T[2]);
338 &mov (&DWP(12,$idx),@T[3]);
339&function_end("Camellia_DecryptBlock_Rounds");
340# V1.x API
# Camellia_DecryptBlock is a thin front end to Camellia_DecryptBlock_Rounds:
# its first argument is keyBitLength rather than grandRounds, so it rewrites
# that argument slot in place and jumps (not calls) to the _Rounds routine.
341&function_begin_B("Camellia_DecryptBlock");
342 &mov ("eax",128);
343 &sub ("eax",&wparam(0)); # 128-keyBitLength, only the resulting carry is used
344 &mov ("eax",3);
345 &adc ("eax",0); # keyBitLength==128?3:4
346 &mov (&wparam(0),"eax"); # replace keyBitLength with grandRounds
347 &jmp (&label("Camellia_DecryptBlock_Rounds"));
348&function_end_B("Camellia_DecryptBlock");
349
350if ($OPENSSL) {
351# void Camellia_decrypt(
352# const unsigned char *in,
353# unsigned char *out,
354# const CAMELLIA_KEY *key)
355&function_begin("Camellia_decrypt");
356 &mov ($idx,&wparam(0)); # load ciphertext pointer
357 &mov ($key,&wparam(2)); # load key schedule pointer
358
359 &mov ("ebx","esp");
360 &sub ("esp",7*4); # place for s[0-3],keyEnd,esp and ra
361 &and ("esp",-64);
362 &mov ("eax",&DWP(272,$key)); # load grandRounds counter
363
364 # place stack frame just "above mod 1024" the key schedule
365 # this ensures that cache associativity of 2 suffices
366 &lea ("ecx",&DWP(-64-63,$key));
367 &sub ("ecx","esp");
368 &neg ("ecx");
369 &and ("ecx",0x3C0); # modulo 1024, but aligned to cache-line
370 &sub ("esp","ecx");
371 &add ("esp",4); # 4 is reserved for callee's return address
372
373 &shl ("eax",6);
374 &mov (&DWP(4*4,"esp"),$key); # save keyStart
375 &lea ($key,&DWP(0,$key,"eax"));
376 &mov (&DWP(5*4,"esp"),"ebx");# save %esp
377
378 &picsetup($Tbl);
379 &picsymbol($Tbl, &label("Camellia_SBOX"), $Tbl);
380
381 &mov (@T[0],&DWP(0,$idx)); # load ciphertext
382 &mov (@T[1],&DWP(4,$idx));
383 &mov (@T[2],&DWP(8,$idx));
384 &bswap (@T[0]);
385 &mov (@T[3],&DWP(12,$idx));
386 &bswap (@T[1]);
387 &bswap (@T[2]);
388 &bswap (@T[3]);
389
390 &call ("_x86_Camellia_decrypt");
391
392 &mov ("esp",&DWP(5*4,"esp"));
393 &bswap (@T[0]);
394 &mov ($idx,&wparam(1)); # load plaintext pointer
395 &bswap (@T[1]);
396 &bswap (@T[2]);
397 &bswap (@T[3]);
398 &mov (&DWP(0,$idx),@T[0]); # write plaintext
399 &mov (&DWP(4,$idx),@T[1]);
400 &mov (&DWP(8,$idx),@T[2]);
401 &mov (&DWP(12,$idx),@T[3]);
402&function_end("Camellia_decrypt");
403}
404
# Private decryption core with a register-based calling convention.
# In:  @T[0-3] hold the byte-swapped input block, $key points at the END of
#      the key schedule (it is walked downwards), $Tbl at Camellia_SBOX, and
#      the caller has set up the frame described by $__s0..$__s3 and $__end
#      (here: pointer to the start of the key schedule).
# Out: @T[0-3] hold the output block (still to be byte-swapped by the
#      caller); $idx and $key are clobbered.
405&function_begin_B("_x86_Camellia_decrypt");
406 &xor (@T[0],&DWP(0,$key)); # ^=key[0-3]
407 &xor (@T[1],&DWP(4,$key));
408 &xor (@T[2],&DWP(8,$key));
409 &xor (@T[3],&DWP(12,$key));
410 &mov ($idx,&DWP(-8,$key)); # prefetch key[-2]
411
412 &mov ($__s0,@T[0]); # save s[0-3]
413 &mov ($__s1,@T[1]);
414 &mov ($__s2,@T[2]);
415 &mov ($__s3,@T[3]);
416
417&set_label("loop",16);
# six Feistel rounds per pass; a negative seed makes Camellia_Feistel step
# through the subkeys downwards, state lives at 4(%esp) and up
418 for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8,4); }
419
420 &sub ($key,16*4); # step back to the previous 64-byte slice
421 &cmp ($key,$__end);
422 &je (&label("done")); # start of schedule reached, skip the FL layer
423
# FL/FL^-1 layer; compared with the encrypt path the subkey words are used
# in swapped pairs (key[2],key[3] on s0/s1 and key[0],key[1] on s2/s3)
424 # @T[0-1] are preloaded, $idx is preloaded with key[2]
425 &and ($idx,@T[0]);
426 &mov (@T[3],$__s3);
427 &rotl ($idx,1);
428 &mov (@T[2],@T[3]);
429 &xor (@T[1],$idx);
430 &or (@T[2],&DWP(4,$key));
431 &mov ($__s1,@T[1]); # s1^=LeftRotate(s0&key[2],1);
432 &xor (@T[2],$__s2);
433
434 &mov ($idx,&DWP(12,$key));
435 &mov ($__s2,@T[2]); # s2^=s3|key[1];
436 &or ($idx,@T[1]);
437 &and (@T[2],&DWP(0,$key));
438 &xor (@T[0],$idx);
439 &rotl (@T[2],1);
440 &mov ($__s0,@T[0]); # s0^=s1|key[3];
441 &xor (@T[3],@T[2]);
442 &mov ($idx,&DWP(-8,$key)); # prefetch key[-2]
443 &mov ($__s3,@T[3]); # s3^=LeftRotate(s2&key[0],1);
444 &jmp (&label("loop"));
445
446&set_label("done",8);
# final key whitening combined with the swap of the two 64-bit halves
447 &mov (@T[2],@T[0]); # SwapHalf
448 &mov (@T[3],@T[1]);
449 &mov (@T[0],$__s2);
450 &mov (@T[1],$__s3);
451 &xor (@T[2],$idx); # $idx is preloaded with key[2]
452 &xor (@T[3],&DWP(12,$key));
453 &xor (@T[0],&DWP(0,$key));
454 &xor (@T[1],&DWP(4,$key));
455 &ret ();
456&function_end_B("_x86_Camellia_decrypt");
457
458# shld is very slow on Intel P4 family. Even on AMD it limits
459# instruction decode rate [because it's VectorPath] and consequently
460# performance. PIII, PM and Core[2] seem to be the only ones which
461# execute this code ~7% faster...
# shld-based variant of the 128-bit left rotate (see the note above on why
# it is not the preferred one).
#
# Arguments: the four registers holding the 128-bit value, most significant
# word first; the rotate count $rot (0 means "store only"); the round number
# $rnd, which selects the destination at -128+8*$rnd relative to $key; and
# the list of registers to store afterwards.  A register is stored only
# while it is the next one in that list, so passing a sub-list stores just
# part of the rotated value.  $idx is used as scratch.
sub __rotl128 {
    my ($i0,$i1,$i2,$i3,$rot,$rnd,@pending)=@_;
    my @regs=($i0,$i1,$i2,$i3);
    my $slot=2*$rnd;            # destination index in 32-bit words

    if ($rot) {
        &mov    ($idx,$i0);     # keep the old top word for the wrap-around
        my @feed=($i1,$i2,$i3,$idx);
        for my $n (0..3) {
            &shld   ($regs[$n],$feed[$n],$rot);
        }
    }
    for my $reg (@regs) {
        next unless (@pending && $reg eq $pending[0]);
        &mov    (&DWP(-128+4*$slot++,$key),shift(@pending));
    }
}
478
479# ... Implementing 128-bit rotate without shld gives >3x performance
480# improvement on P4, only ~7% degradation on other Intel CPUs and
481# not worse performance on AMD. This is therefore preferred.
# 128-bit left rotate built from shl/shr/or, interleaved with the stores of
# the result into the key schedule.
#
# Arguments: the four registers holding the 128-bit value, most significant
# word first; the rotate count $rot (0 means "store only"); the round number
# $rnd, which selects the destination at -128+8*$rnd relative to $key; and
# the list of registers to store afterwards.  A register is stored only
# while it is the next one in that list, so passing a sub-list stores just
# part of the rotated value.  Both $idx and $Tbl are used as scratch.
sub _rotl128 {
    my ($i0,$i1,$i2,$i3,$rot,$rnd,@pending)=@_;
    my $slot=2*$rnd;            # destination index in 32-bit words

    # store $reg if it is the next register the caller asked for
    my $flush=sub {
        my ($reg)=@_;
        return unless (@pending && $reg eq $pending[0]);
        &mov    (&DWP(-128+4*$slot++,$key),shift(@pending));
    };

    unless ($rot) {
        $flush->($_) for ($i0,$i1,$i2,$i3);
        return;
    }

    my $carry=32-$rot;          # bits shifted in from the lower neighbour

    &mov    ($Tbl,$i0);         # old top word, wraps around into $i3
    &shl    ($i0,$rot);
    &mov    ($idx,$i1);
    &shr    ($idx,$carry);
    &shl    ($i1,$rot);
    &or     ($i0,$idx);
    &mov    ($idx,$i2);
    &shl    ($i2,$rot);
    $flush->($i0);
    &shr    ($idx,$carry);
    &or     ($i1,$idx);
    &shr    ($Tbl,$carry);
    &mov    ($idx,$i3);
    &shr    ($idx,$carry);
    $flush->($i1);
    &shl    ($i3,$rot);
    &or     ($i2,$idx);
    &or     ($i3,$Tbl);
    $flush->($i2);
    $flush->($i3);
}
514
# Store up to four registers into consecutive 32-bit slots of the key
# schedule, starting at byte offset $bias+8*$rnd from the base register.
#
# Arguments:
#   $rnd  - round number (each round is 8 bytes)
#   $key  - base register
#   bias  - optional integer byte offset; recognised by being a decimal
#           integer rather than a register name, so an explicit 0 works too
#   regs  - one to four source registers
sub _saveround {
    my ($rnd,$key,@T)=@_;
    # Test the textual form instead of int(): int() cannot tell a bias of 0
    # from a register name and numifies non-numeric strings on the way.
    my $bias=(@T && $T[0]=~/^[-+]?\d+$/)?shift(@T):0;
    my $base=$bias+$rnd*8;
    my $last=$#T<3?$#T:3;

    for my $n (0..$last) {
        &mov    (&DWP($base+4*$n,$key),$T[$n]);
    }
}
524
# Load up to four registers from consecutive 32-bit slots of the key
# schedule, starting at byte offset $bias+8*$rnd from the base register.
#
# Arguments:
#   $rnd  - round number (each round is 8 bytes)
#   $key  - base register
#   bias  - optional integer byte offset; recognised by being a decimal
#           integer rather than a register name, so an explicit 0 works too
#   regs  - one to four destination registers
sub _loadround {
    my ($rnd,$key,@T)=@_;
    # Test the textual form instead of int(): int() cannot tell a bias of 0
    # from a register name and numifies non-numeric strings on the way.
    my $bias=(@T && $T[0]=~/^[-+]?\d+$/)?shift(@T):0;
    my $base=$bias+$rnd*8;
    my $last=$#T<3?$#T:3;

    for my $n (0..$last) {
        &mov    ($T[$n],&DWP($base+4*$n,$key));
    }
}
534
535# void Camellia_Ekeygen(
536# const int keyBitLength,
537# const Byte *rawKey,
538# KEY_TABLE_TYPE keyTable)
539&function_begin("Camellia_Ekeygen");
540{ my $step=0;
541
542 &stack_push(4); # place for s[0-3]
543
544 &mov ($Tbl,&wparam(0)); # load arguments
545 &mov ($idx,&wparam(1));
546 &mov ($key,&wparam(2));
547
548 &mov (@T[0],&DWP(0,$idx)); # load 0-127 bits
549 &mov (@T[1],&DWP(4,$idx));
550 &mov (@T[2],&DWP(8,$idx));
551 &mov (@T[3],&DWP(12,$idx));
552
553 &bswap (@T[0]);
554 &bswap (@T[1]);
555 &bswap (@T[2]);
556 &bswap (@T[3]);
557
558 &_saveround (0,$key,@T); # KL<<<0
559
560 &cmp ($Tbl,128);
561 &je (&label("1st128"));
562
563 &mov (@T[0],&DWP(16,$idx)); # load 128-191 bits
564 &mov (@T[1],&DWP(20,$idx));
565 &cmp ($Tbl,192);
566 &je (&label("1st192"));
567 &mov (@T[2],&DWP(24,$idx)); # load 192-255 bits
568 &mov (@T[3],&DWP(28,$idx));
569 &jmp (&label("1st256"));
570&set_label("1st192",4);
571 &mov (@T[2],@T[0]);
572 &mov (@T[3],@T[1]);
573 &not (@T[2]);
574 &not (@T[3]);
575&set_label("1st256",4);
576 &bswap (@T[0]);
577 &bswap (@T[1]);
578 &bswap (@T[2]);
579 &bswap (@T[3]);
580
581 &_saveround (4,$key,@T); # temporary storage for KR!
582
583 &xor (@T[0],&DWP(0*8+0,$key)); # KR^KL
584 &xor (@T[1],&DWP(0*8+4,$key));
585 &xor (@T[2],&DWP(1*8+0,$key));
586 &xor (@T[3],&DWP(1*8+4,$key));
587
588&set_label("1st128",4);
589 &picsetup($Tbl);
590 &picsymbol($Tbl, &label("Camellia_SBOX"), $Tbl);
591 &lea ($key,&DWP(&label("Camellia_SIGMA")."-".&label("Camellia_SBOX"),$Tbl));
592
593 &mov ($idx,&DWP($step*8,$key)); # prefetch SIGMA[0]
594 &mov (&swtmp(0),@T[0]); # save s[0-3]
595 &mov (&swtmp(1),@T[1]);
596 &mov (&swtmp(2),@T[2]);
597 &mov (&swtmp(3),@T[3]);
598 &Camellia_Feistel($step++);
599 &Camellia_Feistel($step++);
600 &mov (@T[2],&swtmp(2));
601 &mov (@T[3],&swtmp(3));
602
603 &mov ($idx,&wparam(2));
604 &xor (@T[0],&DWP(0*8+0,$idx)); # ^KL
605 &xor (@T[1],&DWP(0*8+4,$idx));
606 &xor (@T[2],&DWP(1*8+0,$idx));
607 &xor (@T[3],&DWP(1*8+4,$idx));
608
609 &mov ($idx,&DWP($step*8,$key)); # prefetch SIGMA[4]
610 &mov (&swtmp(0),@T[0]); # save s[0-3]
611 &mov (&swtmp(1),@T[1]);
612 &mov (&swtmp(2),@T[2]);
613 &mov (&swtmp(3),@T[3]);
614 &Camellia_Feistel($step++);
615 &Camellia_Feistel($step++);
616 &mov (@T[2],&swtmp(2));
617 &mov (@T[3],&swtmp(3));
618
619 &mov ($idx,&wparam(0));
620 &cmp ($idx,128);
621 &jne (&label("2nd256"));
622
623 &mov ($key,&wparam(2));
624 &lea ($key,&DWP(128,$key)); # size optimization
625
626 ####### process KA
627 &_saveround (2,$key,-128,@T); # KA<<<0
628 &_rotl128 (@T,15,6,@T); # KA<<<15
629 &_rotl128 (@T,15,8,@T); # KA<<<(15+15=30)
630 &_rotl128 (@T,15,12,@T[0],@T[1]); # KA<<<(30+15=45)
631 &_rotl128 (@T,15,14,@T); # KA<<<(45+15=60)
632 push (@T,shift(@T)); # rotl128(@T,32);
633 &_rotl128 (@T,2,20,@T); # KA<<<(60+32+2=94)
634 &_rotl128 (@T,17,24,@T); # KA<<<(94+17=111)
635
636 ####### process KL
637 &_loadround (0,$key,-128,@T); # load KL
638 &_rotl128 (@T,15,4,@T); # KL<<<15
639 &_rotl128 (@T,30,10,@T); # KL<<<(15+30=45)
640 &_rotl128 (@T,15,13,@T[2],@T[3]); # KL<<<(45+15=60)
641 &_rotl128 (@T,17,16,@T); # KL<<<(60+17=77)
642 &_rotl128 (@T,17,18,@T); # KL<<<(77+17=94)
643 &_rotl128 (@T,17,22,@T); # KL<<<(94+17=111)
644
645 while (@T[0] ne "eax") # restore order
646 { unshift (@T,pop(@T)); }
647
648 &mov ("eax",3); # 3 grandRounds
649 &jmp (&label("done"));
650
651&set_label("2nd256",16);
652 &mov ($idx,&wparam(2));
653 &_saveround (6,$idx,@T); # temporary storage for KA!
654
655 &xor (@T[0],&DWP(4*8+0,$idx)); # KA^KR
656 &xor (@T[1],&DWP(4*8+4,$idx));
657 &xor (@T[2],&DWP(5*8+0,$idx));
658 &xor (@T[3],&DWP(5*8+4,$idx));
659
660 &mov ($idx,&DWP($step*8,$key)); # prefetch SIGMA[8]
661 &mov (&swtmp(0),@T[0]); # save s[0-3]
662 &mov (&swtmp(1),@T[1]);
663 &mov (&swtmp(2),@T[2]);
664 &mov (&swtmp(3),@T[3]);
665 &Camellia_Feistel($step++);
666 &Camellia_Feistel($step++);
667 &mov (@T[2],&swtmp(2));
668 &mov (@T[3],&swtmp(3));
669
670 &mov ($key,&wparam(2));
671 &lea ($key,&DWP(128,$key)); # size optimization
672
673 ####### process KB
674 &_saveround (2,$key,-128,@T); # KB<<<0
675 &_rotl128 (@T,30,10,@T); # KB<<<30
676 &_rotl128 (@T,30,20,@T); # KB<<<(30+30=60)
677 push (@T,shift(@T)); # rotl128(@T,32);
678 &_rotl128 (@T,19,32,@T); # KB<<<(60+32+19=111)
679
680 ####### process KR
681 &_loadround (4,$key,-128,@T); # load KR
682 &_rotl128 (@T,15,4,@T); # KR<<<15
683 &_rotl128 (@T,15,8,@T); # KR<<<(15+15=30)
684 &_rotl128 (@T,30,18,@T); # KR<<<(30+30=60)
685 push (@T,shift(@T)); # rotl128(@T,32);
686 &_rotl128 (@T,2,26,@T); # KR<<<(60+32+2=94)
687
688 ####### process KA
689 &_loadround (6,$key,-128,@T); # load KA
690 &_rotl128 (@T,15,6,@T); # KA<<<15
691 &_rotl128 (@T,30,14,@T); # KA<<<(15+30=45)
692 push (@T,shift(@T)); # rotl128(@T,32);
693 &_rotl128 (@T,0,24,@T); # KA<<<(45+32+0=77)
694 &_rotl128 (@T,17,28,@T); # KA<<<(77+17=94)
695
696 ####### process KL
697 &_loadround (0,$key,-128,@T); # load KL
698 push (@T,shift(@T)); # rotl128(@T,32);
699 &_rotl128 (@T,13,12,@T); # KL<<<(32+13=45)
700 &_rotl128 (@T,15,16,@T); # KL<<<(45+15=60)
701 &_rotl128 (@T,17,22,@T); # KL<<<(60+17=77)
702 push (@T,shift(@T)); # rotl128(@T,32);
703 &_rotl128 (@T,2,30,@T); # KL<<<(77+32+2=111)
704
705 while (@T[0] ne "eax") # restore order
706 { unshift (@T,pop(@T)); }
707
708 &mov ("eax",4); # 4 grandRounds
709&set_label("done");
710 &lea ("edx",&DWP(272-128,$key)); # end of key schedule
711 &stack_pop(4);
712}
713&function_end("Camellia_Ekeygen");
714
715if ($OPENSSL) {
716# int Camellia_set_key (
717# const unsigned char *userKey,
718# int bits,
719# CAMELLIA_KEY *key)
# Argument-checking wrapper around Camellia_Ekeygen.  Result in eax:
#   -1  userKey or key is NULL
#   -2  bits is not one of 128, 192, 256
#    0  success; the grandRounds count returned by Camellia_Ekeygen has
#       been stored at the address Camellia_Ekeygen left in edx
720&function_begin_B("Camellia_set_key");
721 &push ("ebx");
722 &mov ("ecx",&wparam(0)); # pull arguments
723 &mov ("ebx",&wparam(1));
724 &mov ("edx",&wparam(2));
725
726 &mov ("eax",-1); # result if either pointer is NULL
727 &test ("ecx","ecx");
728 &jz (&label("done")); # userKey==NULL?
729 &test ("edx","edx");
730 &jz (&label("done")); # key==NULL?
731
732 &mov ("eax",-2); # result if the key size is unsupported
733 &cmp ("ebx",256);
734 &je (&label("arg_ok")); # bits==256?
735 &cmp ("ebx",192);
736 &je (&label("arg_ok")); # bits==192?
737 &cmp ("ebx",128);
738 &jne (&label("done")); # bits!=128?
739&set_label("arg_ok",4);
740
# Camellia_Ekeygen takes (keyBitLength, rawKey, keyTable), i.e. the same
# values in a different order, hence the push sequence below
741 &push ("edx"); # push arguments
742 &push ("ecx");
743 &push ("ebx");
744 &call ("Camellia_Ekeygen");
745 &stack_pop(3);
746
747 # eax holds grandRounds and edx points at where to put it
748 &mov (&DWP(0,"edx"),"eax");
749 &xor ("eax","eax"); # return 0
750&set_label("done",4);
751 &pop ("ebx");
752 &ret ();
753&function_end_B("Camellia_set_key");
754}
755
# Base 8-bit substitution table; the four 32-bit lookup tables emitted into
# Camellia_SBOX are all derived from it by the S* generators below.
@SBOX=(
112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65,
 35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189,
134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26,
166,225, 57,202,213, 71, 93, 61,217,  1, 90,214, 81, 86,108, 77,
139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153,
223, 76,203,194, 52,126,118,  5,109,183,169, 49,209, 23,  4,215,
 20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34,
254, 68,207,178,195,181,122,145, 36,  8,232,168, 96,252,105, 80,
170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210,
 16,196,  0, 72,163,247,117,219,138,  3,230,218,  9, 63,221,148,
135, 92,131,  2,205, 74,144, 51,115,103,246,243,157,127,191,226,
 82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46,
233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89,
120,152,  6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250,
114,  7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164,
 64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);

# Each generator maps an index 0..255 to one 32-bit table word.  The digits
# in the name describe the four bytes of that word from most to least
# significant: a non-zero digit means "substituted byte goes here", 0 means
# the byte is zero.

# Plain table value in the upper three bytes.
sub S1110 {
    my ($i)=@_;
    my $s=$SBOX[$i];
    return ($s<<24)|($s<<16)|($s<<8);
}

# Table looked up at the index rotated left by one bit; bytes 3, 2 and 0.
sub S4404 {
    my ($i)=@_;
    my $s=$SBOX[(($i<<1)|($i>>7))&0xff];
    return ($s<<24)|($s<<16)|$s;
}

# Table value rotated left by one bit; lower three bytes.
sub S0222 {
    my ($i)=@_;
    my $s=$SBOX[$i];
    $s=(($s<<1)|($s>>7))&0xff;
    return ($s<<16)|($s<<8)|$s;
}

# Table value rotated right by one bit; bytes 3, 1 and 0.
sub S3033 {
    my ($i)=@_;
    my $s=$SBOX[$i];
    $s=(($s>>1)|($s<<7))&0xff;
    return ($s<<24)|($s<<8)|$s;
}
778
779 &rodataseg();
780&set_label("Camellia_SIGMA",64);
781&data_word(
782 0xa09e667f, 0x3bcc908b, 0xb67ae858, 0x4caa73b2,
783 0xc6ef372f, 0xe94f82be, 0x54ff53a5, 0xf1d36f1c,
784 0x10e527fa, 0xde682d1d, 0xb05688c2, 0xb3e6c1fd,
785 0, 0, 0, 0);
786&set_label("Camellia_SBOX",64);
787# tables are interleaved, remember?
788for ($i=0;$i<256;$i++) { &data_word(&S1110($i),&S4404($i)); }
789for ($i=0;$i<256;$i++) { &data_word(&S0222($i),&S3033($i)); }
790 &previous();
791
792# void Camellia_cbc_encrypt (const unsigned char *inp, unsigned char *out,
793# size_t length, const CAMELLIA_KEY *key,
794# unsigned char *ivp,const int enc);
795{
796# stack frame layout
797# -4(%esp) # return address 0(%esp)
798# 0(%esp) # s0 4(%esp)
799# 4(%esp) # s1 8(%esp)
800# 8(%esp) # s2 12(%esp)
801# 12(%esp) # s3 16(%esp)
802# 16(%esp) # end of key schedule 20(%esp)
803# 20(%esp) # %esp backup
804my $_inp=&DWP(24,"esp"); #copy of wparam(0)
805my $_out=&DWP(28,"esp"); #copy of wparam(1)
806my $_len=&DWP(32,"esp"); #copy of wparam(2)
807my $_key=&DWP(36,"esp"); #copy of wparam(3)
808my $_ivp=&DWP(40,"esp"); #copy of wparam(4)
809my $ivec=&DWP(44,"esp"); #ivec[16]
810my $_tmp=&DWP(44,"esp"); #volatile variable [yes, aliases with ivec]
811my ($s0,$s1,$s2,$s3) = @T;
812
813&function_begin("Camellia_cbc_encrypt");
814 &mov ($s2 eq "ecx"? $s2 : "",&wparam(2)); # load len
815 &cmp ($s2,0);
816 &je (&label("enc_out"));
817
818 &pushf ();
819 &cld ();
820
821 &mov ($s0,&wparam(0)); # load inp
822 &mov ($s1,&wparam(1)); # load out
823 #&mov ($s2,&wparam(2)); # load len
824 &mov ($s3,&wparam(3)); # load key
825 &mov ($Tbl,&wparam(4)); # load ivp
826
827 # allocate aligned stack frame...
828 &lea ($idx,&DWP(-64,"esp"));
829 &and ($idx,-64);
830
831 # place stack frame just "above mod 1024" the key schedule
832 # this ensures that cache associativity of 2 suffices
833 &lea ($key,&DWP(-64-63,$s3));
834 &sub ($key,$idx);
835 &neg ($key);
836 &and ($key,0x3C0); # modulo 1024, but aligned to cache-line
837 &sub ($idx,$key);
838
839 &mov ($key,&wparam(5)); # load enc
840
841 &exch ("esp",$idx);
842 &add ("esp",4); # reserve for return address!
843 &mov ($_esp,$idx); # save %esp
844
845 &mov ($_inp,$s0); # save copy of inp
846 &mov ($_out,$s1); # save copy of out
847 &mov ($_len,$s2); # save copy of len
848 &mov ($_key,$s3); # save copy of key
849 &mov ($_ivp,$Tbl); # save copy of ivp
850
851 &picsetup($Tbl);
852 &picsymbol($Tbl, &label("Camellia_SBOX"), $Tbl);
853
854 &mov ($idx,32);
855 &set_label("prefetch_sbox",4);
856 &mov ($s0,&DWP(0,$Tbl));
857 &mov ($s1,&DWP(32,$Tbl));
858 &mov ($s2,&DWP(64,$Tbl));
859 &mov ($s3,&DWP(96,$Tbl));
860 &lea ($Tbl,&DWP(128,$Tbl));
861 &dec ($idx);
862 &jnz (&label("prefetch_sbox"));
863 &mov ($s0,$_key);
864 &sub ($Tbl,4096);
865 &mov ($idx,$_inp);
866 &mov ($s3,&DWP(272,$s0)); # load grandRounds
867
868 &cmp ($key,0);
869 &je (&label("DECRYPT"));
870
871 &mov ($s2,$_len);
872 &mov ($key,$_ivp);
873 &shl ($s3,6);
874 &lea ($s3,&DWP(0,$s0,$s3));
875 &mov ($_end,$s3);
876
877 &test ($s2,0xFFFFFFF0);
878 &jz (&label("enc_tail")); # short input...
879
880 &mov ($s0,&DWP(0,$key)); # load iv
881 &mov ($s1,&DWP(4,$key));
882
883 &set_label("enc_loop",4);
884 &mov ($s2,&DWP(8,$key));
885 &mov ($s3,&DWP(12,$key));
886
887 &xor ($s0,&DWP(0,$idx)); # xor input data
888 &xor ($s1,&DWP(4,$idx));
889 &xor ($s2,&DWP(8,$idx));
890 &bswap ($s0);
891 &xor ($s3,&DWP(12,$idx));
892 &bswap ($s1);
893 &mov ($key,$_key); # load key
894 &bswap ($s2);
895 &bswap ($s3);
896
897 &call ("_x86_Camellia_encrypt");
898
899 &mov ($idx,$_inp); # load inp
900 &mov ($key,$_out); # load out
901
902 &bswap ($s0);
903 &bswap ($s1);
904 &bswap ($s2);
905 &mov (&DWP(0,$key),$s0); # save output data
906 &bswap ($s3);
907 &mov (&DWP(4,$key),$s1);
908 &mov (&DWP(8,$key),$s2);
909 &mov (&DWP(12,$key),$s3);
910
911 &mov ($s2,$_len); # load len
912
913 &lea ($idx,&DWP(16,$idx));
914 &mov ($_inp,$idx); # save inp
915
916 &lea ($s3,&DWP(16,$key));
917 &mov ($_out,$s3); # save out
918
919 &sub ($s2,16);
920 &test ($s2,0xFFFFFFF0);
921 &mov ($_len,$s2); # save len
922 &jnz (&label("enc_loop"));
923 &test ($s2,15);
924 &jnz (&label("enc_tail"));
925 &mov ($idx,$_ivp); # load ivp
926 &mov ($s2,&DWP(8,$key)); # restore last dwords
927 &mov ($s3,&DWP(12,$key));
928 &mov (&DWP(0,$idx),$s0); # save ivec
929 &mov (&DWP(4,$idx),$s1);
930 &mov (&DWP(8,$idx),$s2);
931 &mov (&DWP(12,$idx),$s3);
932
933 &mov ("esp",$_esp);
934 &popf ();
935 &set_label("enc_out");
936 &function_end_A();
937 &pushf (); # kludge, never executed
938
939 &set_label("enc_tail",4);
940 &mov ($s0,$key eq "edi" ? $key : "");
941 &mov ($key,$_out); # load out
942 &push ($s0); # push ivp
943 &mov ($s1,16);
944 &sub ($s1,$s2);
945 &cmp ($key,$idx); # compare with inp
946 &je (&label("enc_in_place"));
947 &align (4);
948 &data_word(0xA4F3F689); # rep movsb # copy input
949 &jmp (&label("enc_skip_in_place"));
950 &set_label("enc_in_place");
951 &lea ($key,&DWP(0,$key,$s2));
952 &set_label("enc_skip_in_place");
953 &mov ($s2,$s1);
954 &xor ($s0,$s0);
955 &align (4);
956 &data_word(0xAAF3F689); # rep stosb # zero tail
957 &pop ($key); # pop ivp
958
959 &mov ($idx,$_out); # output as input
960 &mov ($s0,&DWP(0,$key));
961 &mov ($s1,&DWP(4,$key));
962 &mov ($_len,16); # len=16
963 &jmp (&label("enc_loop")); # one more spin...
964
965#----------------------------- DECRYPT -----------------------------#
966&set_label("DECRYPT",16);
967 &shl ($s3,6);
968 &lea ($s3,&DWP(0,$s0,$s3));
969 &mov ($_end,$s0);
970 &mov ($_key,$s3);
971
972 &cmp ($idx,$_out);
973 &je (&label("dec_in_place")); # in-place processing...
974
975 &mov ($key,$_ivp); # load ivp
976 &mov ($_tmp,$key);
977
978 &set_label("dec_loop",4);
979 &mov ($s0,&DWP(0,$idx)); # read input
980 &mov ($s1,&DWP(4,$idx));
981 &mov ($s2,&DWP(8,$idx));
982 &bswap ($s0);
983 &mov ($s3,&DWP(12,$idx));
984 &bswap ($s1);
985 &mov ($key,$_key); # load key
986 &bswap ($s2);
987 &bswap ($s3);
988
989 &call ("_x86_Camellia_decrypt");
990
991 &mov ($key,$_tmp); # load ivp
992 &mov ($idx,$_len); # load len
993
994 &bswap ($s0);
995 &bswap ($s1);
996 &bswap ($s2);
997 &xor ($s0,&DWP(0,$key)); # xor iv
998 &bswap ($s3);
999 &xor ($s1,&DWP(4,$key));
1000 &xor ($s2,&DWP(8,$key));
1001 &xor ($s3,&DWP(12,$key));
1002
1003 &sub ($idx,16);
1004 &jc (&label("dec_partial"));
1005 &mov ($_len,$idx); # save len
1006 &mov ($idx,$_inp); # load inp
1007 &mov ($key,$_out); # load out
1008
1009 &mov (&DWP(0,$key),$s0); # write output
1010 &mov (&DWP(4,$key),$s1);
1011 &mov (&DWP(8,$key),$s2);
1012 &mov (&DWP(12,$key),$s3);
1013
1014 &mov ($_tmp,$idx); # save ivp
1015 &lea ($idx,&DWP(16,$idx));
1016 &mov ($_inp,$idx); # save inp
1017
1018 &lea ($key,&DWP(16,$key));
1019 &mov ($_out,$key); # save out
1020
1021 &jnz (&label("dec_loop"));
1022 &mov ($key,$_tmp); # load temp ivp
1023 &set_label("dec_end");
1024 &mov ($idx,$_ivp); # load user ivp
1025 &mov ($s0,&DWP(0,$key)); # load iv
1026 &mov ($s1,&DWP(4,$key));
1027 &mov ($s2,&DWP(8,$key));
1028 &mov ($s3,&DWP(12,$key));
1029 &mov (&DWP(0,$idx),$s0); # copy back to user
1030 &mov (&DWP(4,$idx),$s1);
1031 &mov (&DWP(8,$idx),$s2);
1032 &mov (&DWP(12,$idx),$s3);
1033 &jmp (&label("dec_out"));
1034
1035 &set_label("dec_partial",4);
1036 &lea ($key,$ivec);
1037 &mov (&DWP(0,$key),$s0); # dump output to stack
1038 &mov (&DWP(4,$key),$s1);
1039 &mov (&DWP(8,$key),$s2);
1040 &mov (&DWP(12,$key),$s3);
1041 &lea ($s2 eq "ecx" ? $s2 : "",&DWP(16,$idx));
1042 &mov ($idx eq "esi" ? $idx : "",$key);
1043 &mov ($key eq "edi" ? $key : "",$_out); # load out
1044 &data_word(0xA4F3F689); # rep movsb # copy output
1045 &mov ($key,$_inp); # use inp as temp ivp
1046 &jmp (&label("dec_end"));
1047
1048 &set_label("dec_in_place",4);
1049 &set_label("dec_in_place_loop");
1050 &lea ($key,$ivec);
1051 &mov ($s0,&DWP(0,$idx)); # read input
1052 &mov ($s1,&DWP(4,$idx));
1053 &mov ($s2,&DWP(8,$idx));
1054 &mov ($s3,&DWP(12,$idx));
1055
1056 &mov (&DWP(0,$key),$s0); # copy to temp
1057 &mov (&DWP(4,$key),$s1);
1058 &mov (&DWP(8,$key),$s2);
1059 &bswap ($s0);
1060 &mov (&DWP(12,$key),$s3);
1061 &bswap ($s1);
1062 &mov ($key,$_key); # load key
1063 &bswap ($s2);
1064 &bswap ($s3);
1065
1066 &call ("_x86_Camellia_decrypt");
1067
1068 &mov ($key,$_ivp); # load ivp
1069 &mov ($idx,$_out); # load out
1070
1071 &bswap ($s0);
1072 &bswap ($s1);
1073 &bswap ($s2);
1074 &xor ($s0,&DWP(0,$key)); # xor iv
1075 &bswap ($s3);
1076 &xor ($s1,&DWP(4,$key));
1077 &xor ($s2,&DWP(8,$key));
1078 &xor ($s3,&DWP(12,$key));
1079
1080 &mov (&DWP(0,$idx),$s0); # write output
1081 &mov (&DWP(4,$idx),$s1);
1082 &mov (&DWP(8,$idx),$s2);
1083 &mov (&DWP(12,$idx),$s3);
1084
1085 &lea ($idx,&DWP(16,$idx));
1086 &mov ($_out,$idx); # save out
1087
1088 &lea ($idx,$ivec);
1089 &mov ($s0,&DWP(0,$idx)); # read temp
1090 &mov ($s1,&DWP(4,$idx));
1091 &mov ($s2,&DWP(8,$idx));
1092 &mov ($s3,&DWP(12,$idx));
1093
1094 &mov (&DWP(0,$key),$s0); # copy iv
1095 &mov (&DWP(4,$key),$s1);
1096 &mov (&DWP(8,$key),$s2);
1097 &mov (&DWP(12,$key),$s3);
1098
1099 &mov ($idx,$_inp); # load inp
1100
1101 &lea ($idx,&DWP(16,$idx));
1102 &mov ($_inp,$idx); # save inp
1103
1104 &mov ($s2,$_len); # load len
1105 &sub ($s2,16);
1106 &jc (&label("dec_in_place_partial"));
1107 &mov ($_len,$s2); # save len
1108 &jnz (&label("dec_in_place_loop"));
1109 &jmp (&label("dec_out"));
1110
1111 &set_label("dec_in_place_partial",4);
1112 # one can argue if this is actually required...
1113 &mov ($key eq "edi" ? $key : "",$_out);
1114 &lea ($idx eq "esi" ? $idx : "",$ivec);
1115 &lea ($key,&DWP(0,$key,$s2));
1116 &lea ($idx,&DWP(16,$idx,$s2));
1117 &neg ($s2 eq "ecx" ? $s2 : "");
1118 &data_word(0xA4F3F689); # rep movsb # restore tail
1119
1120 &set_label("dec_out",4);
1121 &mov ("esp",$_esp);
1122 &popf ();
1123&function_end("Camellia_cbc_encrypt");
1124}
1125
1126&asm_finish();