summaryrefslogtreecommitdiff
path: root/src/lib/libcrypto/camellia/asm/cmll-x86_64.pl
diff options
context:
space:
mode:
Diffstat (limited to 'src/lib/libcrypto/camellia/asm/cmll-x86_64.pl')
-rw-r--r--src/lib/libcrypto/camellia/asm/cmll-x86_64.pl867
1 files changed, 0 insertions, 867 deletions
diff --git a/src/lib/libcrypto/camellia/asm/cmll-x86_64.pl b/src/lib/libcrypto/camellia/asm/cmll-x86_64.pl
deleted file mode 100644
index a171c654b2..0000000000
--- a/src/lib/libcrypto/camellia/asm/cmll-x86_64.pl
+++ /dev/null
@@ -1,867 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
5#
6# This module may be used under the terms of either the GNU General
7# Public License version 2 or later, the GNU Lesser General Public
8# License version 2.1 or later, the Mozilla Public License version
9# 1.1 or the BSD License. The exact terms of either license are
10# distributed along with this module. For further details see
11# http://www.openssl.org/~appro/camellia/.
12# ====================================================================
13
14# Performance in cycles per processed byte (less is better) in
15# 'openssl speed ...' benchmark:
16#
17# AMD64 Core2 EM64T
18# -evp camellia-128-ecb 16.7 21.0 22.7
19# + over gcc 3.4.6 +25% +5% 0%
20#
21# camellia-128-cbc 15.7 20.4 21.1
22#
23# 128-bit key setup 128 216 205 cycles/key
24# + over gcc 3.4.6 +54% +39% +15%
25#
26# Numbers in "+" rows represent performance improvement over compiler
27# generated code. Key setup timings are impressive on AMD and Core2
28# thanks to 64-bit operations being covertly deployed. Improvement on
29# EM64T, pre-Core2 Intel x86_64 CPU, is not as impressive, because it
30# apparently emulates some of 64-bit operations in [32-bit] microcode.
31
# Command line: [flavour] output.  A first argument containing a dot is
# taken to be the output file name, in which case no flavour was given.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Look for the perlasm translator next to this script first, then in
# ../../perlasm relative to it.  When $0 carries no directory component
# the search is relative to the current directory.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the
# translator, so a failure to start it must not go unnoticed.
open OUT,"| \"$^X\" $xlate $flavour $output"
	or die "can't call $xlate: $!";
*STDOUT=*OUT;
43
# hi(REG): rewrite %eax/%rax .. %edx/%rdx to the matching high-byte
# register %ah .. %dh.  Any other operand is returned unchanged.
# (No prototype: the sub takes one argument.)
sub hi	{ my $r=shift; $r =~ s/%[er]([a-d])x/%$1h/;	$r; }
# lo(REG): rewrite a register name to its low-byte alias:
#   %eax/%rax .. %edx/%rdx -> %al .. %dl
#   %esi/%rsi, %edi/%rdi   -> %sil, %dil
#   %rN / %rNd             -> %rNb
# Any other operand is returned unchanged.
# (No prototype: the sub takes one argument.)
sub lo	{ my $r=shift;
	$r =~ s/%[er]([a-d])x/%$1l/;
	$r =~ s/%[er]([sd]i)/%$1l/;
	$r =~ s/%(r[0-9]+)[d]?/%$1b/;
	$r; }
48
# Scratch registers used inside a Feistel round.
($t0,$t1,$t2,$t3)=("%eax","%ebx","%ecx","%edx");
# The four 32-bit words of the cipher state.
@S=map { "%r${_}d" } 8..11;
# Table index registers.
($i0,$i1)=("%esi","%edi");
$Tbl="%rbp";	# size optimization
($inp,$out,$key,$keyend)=map { "%r$_" } 12..15;
$arg0d="%edi";

# const unsigned int Camellia_SBOX[4][256];
# Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
# and [2][] - with [3][]. This is done to minimize code size.
# The values below are byte offsets from the table base; entries are
# addressed with an index scale of 8, i.e. one interleaved pair each.
($SBOX1_1110,$SBOX4_4404)=(0,4);	# Camellia_SBOX[0], Camellia_SBOX[1]
($SBOX2_0222,$SBOX3_3033)=(2048,2052);	# Camellia_SBOX[2], Camellia_SBOX[3]
67
# Camellia_Feistel(ROUND [, SEED])
#
# Appends the code for one Feistel round to $code.
#
# ROUND	round index.  Its parity picks the state words: even rounds
#	read @S[0],@S[1] and update @S[2],@S[3]; odd rounds read
#	@S[2],@S[3] and update @S[0],@S[1].
# SEED	byte displacement from $key used when prefetching the next
#	round's key words (default 0).  A negative SEED makes successive
#	rounds step through the key schedule downwards (-8 per round)
#	instead of upwards (+8 per round).
#
# On entry $t0/$t1 must already hold the round key; on exit they hold
# the key for round ROUND+1.  $t2, $t3, $i0 and $i1 are clobbered.
sub Camellia_Feistel {
my ($i,$seed)=@_;
$seed=0 unless defined($seed);
my $scale=$seed<0?-8:8;
my $j=($i&1)*2;
# all four names are lexical; nothing leaks into package scope
my ($s0,$s1,$s2,$s3)=@S[map { ($j+$_)%4 } 0..3];

$code.=<<___;
	xor	$s0,$t0				# t0^=key[0]
	xor	$s1,$t1				# t1^=key[1]
	movz	`&hi("$t0")`,$i0		# (t0>>8)&0xff
	movz	`&lo("$t1")`,$i1		# (t1>>0)&0xff
	mov	$SBOX3_3033($Tbl,$i0,8),$t3	# t3=SBOX3_3033[0]
	mov	$SBOX1_1110($Tbl,$i1,8),$t2	# t2=SBOX1_1110[1]
	movz	`&lo("$t0")`,$i0		# (t0>>0)&0xff
	shr	\$16,$t0
	movz	`&hi("$t1")`,$i1		# (t1>>8)&0xff
	xor	$SBOX4_4404($Tbl,$i0,8),$t3	# t3^=SBOX4_4404[0]
	shr	\$16,$t1
	xor	$SBOX4_4404($Tbl,$i1,8),$t2	# t2^=SBOX4_4404[1]
	movz	`&hi("$t0")`,$i0		# (t0>>24)&0xff
	movz	`&lo("$t1")`,$i1		# (t1>>16)&0xff
	xor	$SBOX1_1110($Tbl,$i0,8),$t3	# t3^=SBOX1_1110[0]
	xor	$SBOX3_3033($Tbl,$i1,8),$t2	# t2^=SBOX3_3033[1]
	movz	`&lo("$t0")`,$i0		# (t0>>16)&0xff
	movz	`&hi("$t1")`,$i1		# (t1>>24)&0xff
	xor	$SBOX2_0222($Tbl,$i0,8),$t3	# t3^=SBOX2_0222[0]
	xor	$SBOX2_0222($Tbl,$i1,8),$t2	# t2^=SBOX2_0222[1]
	mov	`$seed+($i+1)*$scale`($key),$t1	# prefetch key[i+1]
	mov	`$seed+($i+1)*$scale+4`($key),$t0
	xor	$t3,$t2				# t2^=t3
	ror	\$8,$t3				# t3=RightRotate(t3,8)
	xor	$t2,$s2
	xor	$t2,$s3
	xor	$t3,$s3
___
}
105
# void Camellia_EncryptBlock_Rounds(
#		int grandRounds,
#		const Byte plaintext[],
#		const KEY_TABLE_TYPE keyTable,
#		Byte ciphertext[])
#
# Emits the single-block entry points and the two internal workers:
#   Camellia_{En,De}cryptBlock		V1.x API: turns keyBitLength in the
#					first argument into grandRounds (3 or 4)
#					and jumps to the _Rounds entry
#   Camellia_{En,De}cryptBlock_Rounds	V2 API
#   _x86_64_Camellia_{en,de}crypt	state in @S, schedule walked from
#					$key until it reaches $keyend
# Each pass of the worker loop is six Feistel rounds; between passes the
# and/or/rol sequence mixes in four more key words.
$code=<<___;
.text

# V1.x API
.globl	Camellia_EncryptBlock
.type	Camellia_EncryptBlock,\@abi-omnipotent
.align	16
Camellia_EncryptBlock:
	movl	\$128,%eax
	subl	$arg0d,%eax
	movl	\$3,$arg0d
	adcl	\$0,$arg0d	# keyBitLength==128?3:4
	jmp	.Lenc_rounds
.size	Camellia_EncryptBlock,.-Camellia_EncryptBlock
# V2
.globl	Camellia_EncryptBlock_Rounds
.type	Camellia_EncryptBlock_Rounds,\@function,4
.align	16
.Lenc_rounds:
Camellia_EncryptBlock_Rounds:
	push	%rbx
	push	%rbp
	push	%r13
	push	%r14
	push	%r15
.Lenc_prologue:

	#mov	%rsi,$inp		# put away arguments
	mov	%rcx,$out
	mov	%rdx,$key

	shl	\$6,%edi		# process grandRounds
	lea	.LCamellia_SBOX(%rip),$Tbl
	lea	($key,%rdi),$keyend

	mov	0(%rsi),@S[0]		# load plaintext
	mov	4(%rsi),@S[1]
	mov	8(%rsi),@S[2]
	bswap	@S[0]
	mov	12(%rsi),@S[3]
	bswap	@S[1]
	bswap	@S[2]
	bswap	@S[3]

	call	_x86_64_Camellia_encrypt

	bswap	@S[0]
	bswap	@S[1]
	bswap	@S[2]
	mov	@S[0],0($out)
	bswap	@S[3]
	mov	@S[1],4($out)
	mov	@S[2],8($out)
	mov	@S[3],12($out)

	mov	0(%rsp),%r15
	mov	8(%rsp),%r14
	mov	16(%rsp),%r13
	mov	24(%rsp),%rbp
	mov	32(%rsp),%rbx
	lea	40(%rsp),%rsp
.Lenc_epilogue:
	ret
.size	Camellia_EncryptBlock_Rounds,.-Camellia_EncryptBlock_Rounds

.type	_x86_64_Camellia_encrypt,\@abi-omnipotent
.align	16
_x86_64_Camellia_encrypt:
	xor	0($key),@S[1]
	xor	4($key),@S[0]		# ^=key[0-3]
	xor	8($key),@S[3]
	xor	12($key),@S[2]
.align	16
.Leloop:
	mov	16($key),$t1		# prefetch key[4-5]
	mov	20($key),$t0

___
	# six rounds per pass, key words read upwards from 16($key)
	for my $i (0..5) { Camellia_Feistel($i,16); }
$code.=<<___;
	lea	16*4($key),$key
	cmp	$keyend,$key
	mov	8($key),$t3		# prefetch key[2-3]
	mov	12($key),$t2
	je	.Ledone

	and	@S[0],$t0
	or	@S[3],$t3
	rol	\$1,$t0
	xor	$t3,@S[2]		# s2^=s3|key[3];
	xor	$t0,@S[1]		# s1^=LeftRotate(s0&key[0],1);
	and	@S[2],$t2
	or	@S[1],$t1
	rol	\$1,$t2
	xor	$t1,@S[0]		# s0^=s1|key[1];
	xor	$t2,@S[3]		# s3^=LeftRotate(s2&key[2],1);
	jmp	.Leloop

.align	16
.Ledone:
	xor	@S[2],$t0		# SwapHalf
	xor	@S[3],$t1
	xor	@S[0],$t2
	xor	@S[1],$t3

	mov	$t0,@S[0]
	mov	$t1,@S[1]
	mov	$t2,@S[2]
	mov	$t3,@S[3]

	.byte	0xf3,0xc3		# rep ret
.size	_x86_64_Camellia_encrypt,.-_x86_64_Camellia_encrypt

# V1.x API
.globl	Camellia_DecryptBlock
.type	Camellia_DecryptBlock,\@abi-omnipotent
.align	16
Camellia_DecryptBlock:
	movl	\$128,%eax
	subl	$arg0d,%eax
	movl	\$3,$arg0d
	adcl	\$0,$arg0d	# keyBitLength==128?3:4
	jmp	.Ldec_rounds
.size	Camellia_DecryptBlock,.-Camellia_DecryptBlock
# V2
.globl	Camellia_DecryptBlock_Rounds
.type	Camellia_DecryptBlock_Rounds,\@function,4
.align	16
.Ldec_rounds:
Camellia_DecryptBlock_Rounds:
	push	%rbx
	push	%rbp
	push	%r13
	push	%r14
	push	%r15
.Ldec_prologue:

	#mov	%rsi,$inp		# put away arguments
	mov	%rcx,$out
	mov	%rdx,$keyend

	shl	\$6,%edi		# process grandRounds
	lea	.LCamellia_SBOX(%rip),$Tbl
	lea	($keyend,%rdi),$key

	mov	0(%rsi),@S[0]		# load plaintext
	mov	4(%rsi),@S[1]
	mov	8(%rsi),@S[2]
	bswap	@S[0]
	mov	12(%rsi),@S[3]
	bswap	@S[1]
	bswap	@S[2]
	bswap	@S[3]

	call	_x86_64_Camellia_decrypt

	bswap	@S[0]
	bswap	@S[1]
	bswap	@S[2]
	mov	@S[0],0($out)
	bswap	@S[3]
	mov	@S[1],4($out)
	mov	@S[2],8($out)
	mov	@S[3],12($out)

	mov	0(%rsp),%r15
	mov	8(%rsp),%r14
	mov	16(%rsp),%r13
	mov	24(%rsp),%rbp
	mov	32(%rsp),%rbx
	lea	40(%rsp),%rsp
.Ldec_epilogue:
	ret
.size	Camellia_DecryptBlock_Rounds,.-Camellia_DecryptBlock_Rounds

.type	_x86_64_Camellia_decrypt,\@abi-omnipotent
.align	16
_x86_64_Camellia_decrypt:
	xor	0($key),@S[1]
	xor	4($key),@S[0]		# ^=key[0-3]
	xor	8($key),@S[3]
	xor	12($key),@S[2]
.align	16
.Ldloop:
	mov	-8($key),$t1		# prefetch key[4-5]
	mov	-4($key),$t0

___
	# six rounds per pass, key words read downwards from -8($key)
	for my $i (0..5) { Camellia_Feistel($i,-8); }
$code.=<<___;
	lea	-16*4($key),$key
	cmp	$keyend,$key
	mov	0($key),$t3		# prefetch key[2-3]
	mov	4($key),$t2
	je	.Lddone

	and	@S[0],$t0
	or	@S[3],$t3
	rol	\$1,$t0
	xor	$t3,@S[2]		# s2^=s3|key[3];
	xor	$t0,@S[1]		# s1^=LeftRotate(s0&key[0],1);
	and	@S[2],$t2
	or	@S[1],$t1
	rol	\$1,$t2
	xor	$t1,@S[0]		# s0^=s1|key[1];
	xor	$t2,@S[3]		# s3^=LeftRotate(s2&key[2],1);

	jmp	.Ldloop

.align	16
.Lddone:
	xor	@S[2],$t2
	xor	@S[3],$t3
	xor	@S[0],$t0
	xor	@S[1],$t1

	mov	$t2,@S[0]		# SwapHalf
	mov	$t3,@S[1]
	mov	$t0,@S[2]
	mov	$t1,@S[3]

	.byte	0xf3,0xc3		# rep ret
.size	_x86_64_Camellia_decrypt,.-_x86_64_Camellia_decrypt
___
335
# _saveround(RND, KEY, [BIAS,] REG...)
#
# Appends stores of key material to $code, at byte offset BIAS+RND*8
# from the KEY base register.
#
# BIAS	optional integer displacement (default 0).  It is recognised by
#	being a plain integer, so an explicit 0 works too.
# REG	either four 32-bit registers, stored as two pairs with the words
#	of each pair swapped (regs 1,0,3,2 at +0,+4,+8,+12), or one or
#	two 64-bit registers stored at +0 and +8.
sub _saveround {
my ($rnd,$key,@T)=@_;
my $bias=(@T && $T[0] =~ /^[-+]?\d+$/)?shift(@T):0;

    if (@T==4) {
	$code.=<<___;
	mov	$T[1],`$bias+$rnd*8+0`($key)
	mov	$T[0],`$bias+$rnd*8+4`($key)
	mov	$T[3],`$bias+$rnd*8+8`($key)
	mov	$T[2],`$bias+$rnd*8+12`($key)
___
    } else {
	$code.="	mov	$T[0],`$bias+$rnd*8+0`($key)\n";
	$code.="	mov	$T[1],`$bias+$rnd*8+8`($key)\n"	if (@T>=2);
    }
}
352
# _loadround(RND, KEY, [BIAS,] REG [, REG])
#
# Appends loads of one or two 64-bit registers to $code, from byte
# offsets BIAS+RND*8 and BIAS+RND*8+8 relative to the KEY base register.
# BIAS is optional (default 0) and is recognised by being a plain
# integer, so an explicit 0 works too.
sub _loadround {
my ($rnd,$key,@T)=@_;
my $bias=(@T && $T[0] =~ /^[-+]?\d+$/)?shift(@T):0;

$code.="	mov	`$bias+$rnd*8+0`($key),$T[0]\n";
$code.="	mov	`$bias+$rnd*8+8`($key),$T[1]\n"	if (@T>=2);
}
360
# shld is very slow on Intel EM64T family. Even on AMD it limits
# instruction decode rate [because it's VectorPath] and consequently
# performance...
#
# __rotl128(HI, LO, ROT): append code that rotates the 128-bit value
# held in the 64-bit register pair HI:LO left by ROT bits, using shld.
# A ROT of 0 emits nothing.  ROT must be below 64, since each shld only
# moves bits between two adjacent 64-bit halves.  Clobbers %r11.
sub __rotl128 {
my ($i0,$i1,$rot)=@_;

    return unless ($rot);
    die "__rotl128: rotate count $rot is outside 1..63"
	if ($rot<0 || $rot>63);

    $code.=<<___;
	mov	$i0,%r11
	shld	\$$rot,$i1,$i0
	shld	\$$rot,%r11,$i1
___
}
375
# ... Implementing 128-bit rotate without shld gives 80% better
# performance on EM64T, +15% on AMD64 and only ~7% degradation on
# Core2. This is therefore preferred.
#
# _rotl128(HI, LO, ROT): append code that rotates the 128-bit value
# held in the 64-bit register pair HI:LO left by ROT bits, using plain
# shifts.  A ROT of 0 emits nothing.  ROT must be below 64 because the
# complementary shift count is 64-ROT.  Clobbers %r9 and %r11, so
# neither may be passed as HI or LO.
sub _rotl128 {
my ($i0,$i1,$rot)=@_;

    return unless ($rot);
    die "_rotl128: rotate count $rot is outside 1..63"
	if ($rot<0 || $rot>63);

    $code.=<<___;
	mov	$i0,%r11
	shl	\$$rot,$i0
	mov	$i1,%r9
	shr	\$`64-$rot`,%r9
	shr	\$`64-$rot`,%r11
	or	%r9,$i0
	shl	\$$rot,$i1
	or	%r11,$i1
___
}
395
# Emits Camellia_Ekeygen (keyBitLength, rawKey, keyTable): expands a
# 128-, 192- or 256-bit raw key into the key table and returns the
# grandRounds count in %eax (3 for 128-bit keys, otherwise 4).
#
# The intermediate key values are produced by running Feistel rounds
# keyed from .LCamellia_SIGMA; $step numbers those rounds consecutively.
# The sub-keys are then written out as successive 128-bit rotations of
# those values (see the per-line comments for the running rotate count).
# $out is advanced by 128 before the stores and every store is biased by
# -128 ("size optimization").
{ my $step=0;

$code.=<<___;
.globl	Camellia_Ekeygen
.type	Camellia_Ekeygen,\@function,3
.align	16
Camellia_Ekeygen:
	push	%rbx
	push	%rbp
	push	%r13
	push	%r14
	push	%r15
.Lkey_prologue:

	mov	%edi,${keyend}d		# put away arguments, keyBitLength
					# (32-bit move: zero-extends, so the
					# 64-bit compares below do not depend
					# on the upper half of %rdi)
	mov	%rdx,$out		# keyTable

	mov	0(%rsi),@S[0]		# load 0-127 bits
	mov	4(%rsi),@S[1]
	mov	8(%rsi),@S[2]
	mov	12(%rsi),@S[3]

	bswap	@S[0]
	bswap	@S[1]
	bswap	@S[2]
	bswap	@S[3]
___
	_saveround	(0,$out,@S);	# KL<<<0
$code.=<<___;
	cmp	\$128,$keyend		# check keyBitLength
	je	.L1st128

	mov	16(%rsi),@S[0]		# load 128-191 bits
	mov	20(%rsi),@S[1]
	cmp	\$192,$keyend
	je	.L1st192
	mov	24(%rsi),@S[2]		# load 192-255 bits
	mov	28(%rsi),@S[3]
	jmp	.L1st256
.L1st192:
	mov	@S[0],@S[2]
	mov	@S[1],@S[3]
	not	@S[2]
	not	@S[3]
.L1st256:
	bswap	@S[0]
	bswap	@S[1]
	bswap	@S[2]
	bswap	@S[3]
___
	_saveround	(4,$out,@S);	# temp storage for KR!
$code.=<<___;
	xor	0($out),@S[1]		# KR^KL
	xor	4($out),@S[0]
	xor	8($out),@S[3]
	xor	12($out),@S[2]

.L1st128:
	lea	.LCamellia_SIGMA(%rip),$key
	lea	.LCamellia_SBOX(%rip),$Tbl

	mov	0($key),$t1
	mov	4($key),$t0
___
	Camellia_Feistel($step++);
	Camellia_Feistel($step++);
$code.=<<___;
	xor	0($out),@S[1]		# ^KL
	xor	4($out),@S[0]
	xor	8($out),@S[3]
	xor	12($out),@S[2]
___
	Camellia_Feistel($step++);
	Camellia_Feistel($step++);
$code.=<<___;
	cmp	\$128,$keyend
	jne	.L2nd256

	lea	128($out),$out		# size optimization
	shl	\$32,%r8		# @S[0]||
	shl	\$32,%r10		# @S[2]||
	or	%r9,%r8			# ||@S[1]
	or	%r11,%r10		# ||@S[3]
___
	_loadround	(0,$out,-128,"%rax","%rbx");	# KL
	_saveround	(2,$out,-128,"%r8","%r10");	# KA<<<0
	_rotl128	("%rax","%rbx",15);
	_saveround	(4,$out,-128,"%rax","%rbx");	# KL<<<15
	_rotl128	("%r8","%r10",15);
	_saveround	(6,$out,-128,"%r8","%r10");	# KA<<<15
	_rotl128	("%r8","%r10",15);		# 15+15=30
	_saveround	(8,$out,-128,"%r8","%r10");	# KA<<<30
	_rotl128	("%rax","%rbx",30);		# 15+30=45
	_saveround	(10,$out,-128,"%rax","%rbx");	# KL<<<45
	_rotl128	("%r8","%r10",15);		# 30+15=45
	_saveround	(12,$out,-128,"%r8");		# KA<<<45
	_rotl128	("%rax","%rbx",15);		# 45+15=60
	_saveround	(13,$out,-128,"%rbx");		# KL<<<60
	_rotl128	("%r8","%r10",15);		# 45+15=60
	_saveround	(14,$out,-128,"%r8","%r10");	# KA<<<60
	_rotl128	("%rax","%rbx",17);		# 60+17=77
	_saveround	(16,$out,-128,"%rax","%rbx");	# KL<<<77
	_rotl128	("%rax","%rbx",17);		# 77+17=94
	_saveround	(18,$out,-128,"%rax","%rbx");	# KL<<<94
	_rotl128	("%r8","%r10",34);		# 60+34=94
	_saveround	(20,$out,-128,"%r8","%r10");	# KA<<<94
	_rotl128	("%rax","%rbx",17);		# 94+17=111
	_saveround	(22,$out,-128,"%rax","%rbx");	# KL<<<111
	_rotl128	("%r8","%r10",17);		# 94+17=111
	_saveround	(24,$out,-128,"%r8","%r10");	# KA<<<111
$code.=<<___;
	mov	\$3,%eax
	jmp	.Ldone
.align	16
.L2nd256:
___
	_saveround	(6,$out,@S);	# temp storage for KA!
$code.=<<___;
	xor	`4*8+0`($out),@S[1]	# KA^KR
	xor	`4*8+4`($out),@S[0]
	xor	`5*8+0`($out),@S[3]
	xor	`5*8+4`($out),@S[2]
___
	Camellia_Feistel($step++);
	Camellia_Feistel($step++);

	_loadround	(0,$out,"%rax","%rbx");	# KL
	_loadround	(4,$out,"%rcx","%rdx");	# KR
	_loadround	(6,$out,"%r14","%r15");	# KA
$code.=<<___;
	lea	128($out),$out		# size optimization
	shl	\$32,%r8		# @S[0]||
	shl	\$32,%r10		# @S[2]||
	or	%r9,%r8			# ||@S[1]
	or	%r11,%r10		# ||@S[3]
___
	_saveround	(2,$out,-128,"%r8","%r10");	# KB<<<0
	_rotl128	("%rcx","%rdx",15);
	_saveround	(4,$out,-128,"%rcx","%rdx");	# KR<<<15
	_rotl128	("%r14","%r15",15);
	_saveround	(6,$out,-128,"%r14","%r15");	# KA<<<15
	_rotl128	("%rcx","%rdx",15);		# 15+15=30
	_saveround	(8,$out,-128,"%rcx","%rdx");	# KR<<<30
	_rotl128	("%r8","%r10",30);
	_saveround	(10,$out,-128,"%r8","%r10");	# KB<<<30
	_rotl128	("%rax","%rbx",45);
	_saveround	(12,$out,-128,"%rax","%rbx");	# KL<<<45
	_rotl128	("%r14","%r15",30);		# 15+30=45
	_saveround	(14,$out,-128,"%r14","%r15");	# KA<<<45
	_rotl128	("%rax","%rbx",15);		# 45+15=60
	_saveround	(16,$out,-128,"%rax","%rbx");	# KL<<<60
	_rotl128	("%rcx","%rdx",30);		# 30+30=60
	_saveround	(18,$out,-128,"%rcx","%rdx");	# KR<<<60
	_rotl128	("%r8","%r10",30);		# 30+30=60
	_saveround	(20,$out,-128,"%r8","%r10");	# KB<<<60
	_rotl128	("%rax","%rbx",17);		# 60+17=77
	_saveround	(22,$out,-128,"%rax","%rbx");	# KL<<<77
	_rotl128	("%r14","%r15",32);		# 45+32=77
	_saveround	(24,$out,-128,"%r14","%r15");	# KA<<<77
	_rotl128	("%rcx","%rdx",34);		# 60+34=94
	_saveround	(26,$out,-128,"%rcx","%rdx");	# KR<<<94
	_rotl128	("%r14","%r15",17);		# 77+17=94
	_saveround	(28,$out,-128,"%r14","%r15");	# KA<<<94
	_rotl128	("%rax","%rbx",34);		# 77+34=111
	_saveround	(30,$out,-128,"%rax","%rbx");	# KL<<<111
	_rotl128	("%r8","%r10",51);		# 60+51=111
	_saveround	(32,$out,-128,"%r8","%r10");	# KB<<<111
$code.=<<___;
	mov	\$4,%eax
.Ldone:
	mov	0(%rsp),%r15
	mov	8(%rsp),%r14
	mov	16(%rsp),%r13
	mov	24(%rsp),%rbp
	mov	32(%rsp),%rbx
	lea	40(%rsp),%rsp
.Lkey_epilogue:
	ret
.size	Camellia_Ekeygen,.-Camellia_Ekeygen
___
}
577
# Base 8-bit substitution table.  The four 32-bit lookup tables used by
# the generated code are derived from it by the S* helpers below.
@SBOX=(
112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65,
 35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189,
134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26,
166,225, 57,202,213, 71, 93, 61,217,  1, 90,214, 81, 86,108, 77,
139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153,
223, 76,203,194, 52,126,118,  5,109,183,169, 49,209, 23,  4,215,
 20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34,
254, 68,207,178,195,181,122,145, 36,  8,232,168, 96,252,105, 80,
170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210,
 16,196,  0, 72,163,247,117,219,138,  3,230,218,  9, 63,221,148,
135, 92,131,  2,205, 74,144, 51,115,103,246,243,157,127,191,226,
 82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46,
233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89,
120,152,  6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250,
114,  7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164,
 64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);

# Each helper takes an index 0..255 and returns one table entry as a
# "0x%08x" string.  The digits in the name show, from most to least
# significant byte, which byte lanes carry the substituted value
# (0 means the lane is zero):
#   S1110  v=SBOX[i]
#   S4404  v=SBOX[i rotated left by 1 within 8 bits]
#   S0222  v=SBOX[i] rotated left by 1 within 8 bits
#   S3033  v=SBOX[i] rotated right by 1 within 8 bits
sub S1110 { my $i=shift; $i=$SBOX[$i]; $i=$i<<24|$i<<16|$i<<8; sprintf("0x%08x",$i); }
sub S4404 { my $i=shift; $i=($i<<1|$i>>7)&0xff; $i=$SBOX[$i]; $i=$i<<24|$i<<16|$i; sprintf("0x%08x",$i); }
sub S0222 { my $i=shift; $i=$SBOX[$i]; $i=($i<<1|$i>>7)&0xff; $i=$i<<16|$i<<8|$i; sprintf("0x%08x",$i); }
sub S3033 { my $i=shift; $i=$SBOX[$i]; $i=($i>>1|$i<<7)&0xff; $i=$i<<24|$i<<8|$i; sprintf("0x%08x",$i); }
600
# Constant data: the .LCamellia_SIGMA words read by the key schedule,
# one row of zero padding, then the lookup tables at .LCamellia_SBOX.
$code.=<<___;
.align	64
.LCamellia_SIGMA:
.long	0x3bcc908b, 0xa09e667f, 0x4caa73b2, 0xb67ae858
.long	0xe94f82be, 0xc6ef372f, 0xf1d36f1c, 0x54ff53a5
.long	0xde682d1d, 0x10e527fa, 0xb3e6c1fd, 0xb05688c2
.long	0, 0, 0, 0
.LCamellia_SBOX:
___
# tables are interleaved, remember?
# data_word(LIST): append one ".long" directive holding the given words.
sub data_word { $code.=".long\t".join(',',@_)."\n"; }
# 256 pairs [SBOX1_1110,SBOX4_4404], then 256 pairs [SBOX2_0222,SBOX3_3033]:
# 8 bytes per pair, so the second group starts 2048 bytes in.
for my $i (0..255) { data_word(S1110($i),S4404($i)); }
for my $i (0..255) { data_word(S0222($i),S3033($i)); }
614
# void Camellia_cbc_encrypt (const unsigned char *inp, unsigned char *out,
#			     size_t length, const CAMELLIA_KEY *key,
#			     unsigned char *ivp,const int enc);
#
# Emits the CBC driver.  It returns at once for length==0, otherwise
# builds a 64-byte-aligned stack frame (slots below), touches the whole
# lookup table once, and then loops over 16-byte blocks calling the
# single-block workers.  A trailing partial block is staged through the
# $ivec slot with "rep movsb".  The final chaining value is written back
# through ivp.
{
# stack frame slots, private to this block
my $_key="0(%rsp)";
my $_end="8(%rsp)";	# inp+len&~15
my $_res="16(%rsp)";	# len&15
my $ivec="24(%rsp)";
my $_ivp="40(%rsp)";
my $_rsp="48(%rsp)";

$code.=<<___;
.globl	Camellia_cbc_encrypt
.type	Camellia_cbc_encrypt,\@function,6
.align	16
Camellia_cbc_encrypt:
	cmp	\$0,%rdx
	je	.Lcbc_abort
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
.Lcbc_prologue:

	mov	%rsp,%rbp
	sub	\$64,%rsp
	and	\$-64,%rsp

	# place stack frame just "above mod 1024" the key schedule,
	# this ensures that cache associativity suffices
	lea	-64-63(%rcx),%r10
	sub	%rsp,%r10
	neg	%r10
	and	\$0x3C0,%r10
	sub	%r10,%rsp
	#add	\$8,%rsp		# 8 is reserved for callee's ra

	mov	%rdi,$inp		# inp argument
	mov	%rsi,$out		# out argument
	mov	%r8,%rbx		# ivp argument
	mov	%rcx,$key		# key argument
	mov	272(%rcx),${keyend}d	# grandRounds

	mov	%r8,$_ivp
	mov	%rbp,$_rsp

.Lcbc_body:
	lea	.LCamellia_SBOX(%rip),$Tbl

	mov	\$32,%ecx
.align	4
.Lcbc_prefetch_sbox:
	mov	0($Tbl),%rax
	mov	32($Tbl),%rsi
	mov	64($Tbl),%rdi
	mov	96($Tbl),%r11
	lea	128($Tbl),$Tbl
	loop	.Lcbc_prefetch_sbox
	sub	\$4096,$Tbl
	shl	\$6,$keyend
	mov	%rdx,%rcx		# len argument
	lea	($key,$keyend),$keyend

	cmp	\$0,%r9d		# enc argument
	je	.LCBC_DECRYPT

	and	\$-16,%rdx
	and	\$15,%rcx		# length residue
	lea	($inp,%rdx),%rdx
	mov	$key,$_key
	mov	%rdx,$_end
	mov	%rcx,$_res

	cmp	$inp,%rdx
	mov	0(%rbx),@S[0]		# load IV
	mov	4(%rbx),@S[1]
	mov	8(%rbx),@S[2]
	mov	12(%rbx),@S[3]
	je	.Lcbc_enc_tail
	jmp	.Lcbc_eloop

.align	16
.Lcbc_eloop:
	xor	0($inp),@S[0]
	xor	4($inp),@S[1]
	xor	8($inp),@S[2]
	bswap	@S[0]
	xor	12($inp),@S[3]
	bswap	@S[1]
	bswap	@S[2]
	bswap	@S[3]

	call	_x86_64_Camellia_encrypt

	mov	$_key,$key		# "rewind" the key
	bswap	@S[0]
	mov	$_end,%rdx
	bswap	@S[1]
	mov	$_res,%rcx
	bswap	@S[2]
	mov	@S[0],0($out)
	bswap	@S[3]
	mov	@S[1],4($out)
	mov	@S[2],8($out)
	lea	16($inp),$inp
	mov	@S[3],12($out)
	cmp	%rdx,$inp
	lea	16($out),$out
	jne	.Lcbc_eloop

	cmp	\$0,%rcx
	jne	.Lcbc_enc_tail

	mov	$_ivp,$out
	mov	@S[0],0($out)		# write out IV residue
	mov	@S[1],4($out)
	mov	@S[2],8($out)
	mov	@S[3],12($out)
	jmp	.Lcbc_done

.align	16
.Lcbc_enc_tail:
	xor	%rax,%rax
	mov	%rax,0+$ivec
	mov	%rax,8+$ivec
	mov	%rax,$_res

.Lcbc_enc_pushf:
	pushfq
	cld
	mov	$inp,%rsi
	lea	8+$ivec,%rdi
	.long	0x9066A4F3		# rep movsb
	popfq
.Lcbc_enc_popf:

	lea	$ivec,$inp
	lea	16+$ivec,%rax
	mov	%rax,$_end
	jmp	.Lcbc_eloop		# one more time

.align	16
.LCBC_DECRYPT:
	xchg	$key,$keyend
	add	\$15,%rdx
	and	\$15,%rcx		# length residue
	and	\$-16,%rdx
	mov	$key,$_key
	lea	($inp,%rdx),%rdx
	mov	%rdx,$_end
	mov	%rcx,$_res

	mov	(%rbx),%rax		# load IV
	mov	8(%rbx),%rbx
	jmp	.Lcbc_dloop
.align	16
.Lcbc_dloop:
	mov	0($inp),@S[0]
	mov	4($inp),@S[1]
	mov	8($inp),@S[2]
	bswap	@S[0]
	mov	12($inp),@S[3]
	bswap	@S[1]
	mov	%rax,0+$ivec		# save IV to temporary storage
	bswap	@S[2]
	mov	%rbx,8+$ivec
	bswap	@S[3]

	call	_x86_64_Camellia_decrypt

	mov	$_key,$key		# "rewind" the key
	mov	$_end,%rdx
	mov	$_res,%rcx

	bswap	@S[0]
	mov	($inp),%rax		# load IV for next iteration
	bswap	@S[1]
	mov	8($inp),%rbx
	bswap	@S[2]
	xor	0+$ivec,@S[0]
	bswap	@S[3]
	xor	4+$ivec,@S[1]
	xor	8+$ivec,@S[2]
	lea	16($inp),$inp
	xor	12+$ivec,@S[3]
	cmp	%rdx,$inp
	je	.Lcbc_ddone

	mov	@S[0],0($out)
	mov	@S[1],4($out)
	mov	@S[2],8($out)
	mov	@S[3],12($out)

	lea	16($out),$out
	jmp	.Lcbc_dloop

.align	16
.Lcbc_ddone:
	mov	$_ivp,%rdx
	cmp	\$0,%rcx
	jne	.Lcbc_dec_tail

	mov	@S[0],0($out)
	mov	@S[1],4($out)
	mov	@S[2],8($out)
	mov	@S[3],12($out)

	mov	%rax,(%rdx)		# write out IV residue
	mov	%rbx,8(%rdx)
	jmp	.Lcbc_done
.align	16
.Lcbc_dec_tail:
	mov	@S[0],0+$ivec
	mov	@S[1],4+$ivec
	mov	@S[2],8+$ivec
	mov	@S[3],12+$ivec

.Lcbc_dec_pushf:
	pushfq
	cld
	lea	8+$ivec,%rsi
	lea	($out),%rdi
	.long	0x9066A4F3		# rep movsb
	popfq
.Lcbc_dec_popf:

	mov	%rax,(%rdx)		# write out IV residue
	mov	%rbx,8(%rdx)
	jmp	.Lcbc_done

.align	16
.Lcbc_done:
	mov	$_rsp,%rcx
	mov	0(%rcx),%r15
	mov	8(%rcx),%r14
	mov	16(%rcx),%r13
	mov	24(%rcx),%r12
	mov	32(%rcx),%rbp
	mov	40(%rcx),%rbx
	lea	48(%rcx),%rsp
.Lcbc_abort:
	ret
.size	Camellia_cbc_encrypt,.-Camellia_cbc_encrypt

.asciz	"Camellia for x86_64 by <appro\@openssl.org>"
___
}
864
# Replace every `...` span in the generated text with the result of
# evaluating it as Perl (offset arithmetic, &hi()/&lo() register names).
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# Buffered write errors, and a failing process on the other end of a
# piped STDOUT, only surface at close time.
close STDOUT or die "error closing STDOUT: $!";