path: root/src/lib/libcrypto/modes
author     cvs2svn <admin@example.com>  2014-04-13 15:49:51 +0000
committer  cvs2svn <admin@example.com>  2014-04-13 15:49:51 +0000
commit     9ef0d5fb5b0acfd35d73a5557198f46525ab1667 (patch)
tree       61e7e25839f716a30db270f15cddf0be6903781f /src/lib/libcrypto/modes
parent     ff237038a541d51619efa5b36fb251c8dc1e9637 (diff)
This commit was manufactured by cvs2git to create tag 'butholakala'.
Diffstat (limited to 'src/lib/libcrypto/modes')
-rw-r--r--  src/lib/libcrypto/modes/asm/ghash-alpha.pl | 460
-rw-r--r--  src/lib/libcrypto/modes/asm/ghash-armv4.pl | 429
-rwxr-xr-x  src/lib/libcrypto/modes/asm/ghash-ia64.pl | 463
-rw-r--r--  src/lib/libcrypto/modes/asm/ghash-parisc.pl | 731
-rw-r--r--  src/lib/libcrypto/modes/asm/ghash-s390x.pl | 262
-rw-r--r--  src/lib/libcrypto/modes/asm/ghash-sparcv9.pl | 330
-rw-r--r--  src/lib/libcrypto/modes/asm/ghash-x86.pl | 1342
-rw-r--r--  src/lib/libcrypto/modes/asm/ghash-x86_64.pl | 806
-rw-r--r--  src/lib/libcrypto/modes/cbc128.c | 205
-rw-r--r--  src/lib/libcrypto/modes/ccm128.c | 441
-rw-r--r--  src/lib/libcrypto/modes/cfb128.c | 242
-rw-r--r--  src/lib/libcrypto/modes/ctr128.c | 252
-rw-r--r--  src/lib/libcrypto/modes/cts128.c | 453
-rw-r--r--  src/lib/libcrypto/modes/gcm128.c | 1905
-rw-r--r--  src/lib/libcrypto/modes/modes.h | 135
-rw-r--r--  src/lib/libcrypto/modes/modes_lcl.h | 128
-rw-r--r--  src/lib/libcrypto/modes/ofb128.c | 121
-rw-r--r--  src/lib/libcrypto/modes/xts128.c | 187
18 files changed, 0 insertions, 8892 deletions
diff --git a/src/lib/libcrypto/modes/asm/ghash-alpha.pl b/src/lib/libcrypto/modes/asm/ghash-alpha.pl
deleted file mode 100644
index aa36029386..0000000000
--- a/src/lib/libcrypto/modes/asm/ghash-alpha.pl
+++ /dev/null
@@ -1,460 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# March 2010
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that it
14# uses 256 bytes per-key table [+128 bytes shared table]. Even though
15# loops are aggressively modulo-scheduled in respect to references to
16# Htbl and Z.hi updates for 8 cycles per byte, measured performance is
17# ~12 cycles per processed byte on 21264 CPU. It seems to be a dynamic
18# scheduling "glitch," because uprofile(1) indicates uniform sample
19# distribution, as if all instruction bundles execute in 1.5 cycles.
20# Meaning that it could have been even faster, yet 12 cycles is ~60%
21# better than gcc-generated code and ~80% better than code generated by vendor
22# compiler.
23
24$cnt="v0"; # $0
25$t0="t0";
26$t1="t1";
27$t2="t2";
28$Thi0="t3"; # $4
29$Tlo0="t4";
30$Thi1="t5";
31$Tlo1="t6";
32$rem="t7"; # $8
33#################
34$Xi="a0"; # $16, input argument block
35$Htbl="a1";
36$inp="a2";
37$len="a3";
38$nlo="a4"; # $20
39$nhi="a5";
40$Zhi="t8";
41$Zlo="t9";
42$Xhi="t10"; # $24
43$Xlo="t11";
44$remp="t12";
45$rem_4bit="AT"; # $28
46
47{ my $N;
48 sub loop() {
49
50 $N++;
51$code.=<<___;
52.align 4
53 extbl $Xlo,7,$nlo
54 and $nlo,0xf0,$nhi
55 sll $nlo,4,$nlo
56 and $nlo,0xf0,$nlo
57
58 addq $nlo,$Htbl,$nlo
59 ldq $Zlo,8($nlo)
60 addq $nhi,$Htbl,$nhi
61 ldq $Zhi,0($nlo)
62
63 and $Zlo,0x0f,$remp
64 sll $Zhi,60,$t0
65 lda $cnt,6(zero)
66 extbl $Xlo,6,$nlo
67
68 ldq $Tlo1,8($nhi)
69 s8addq $remp,$rem_4bit,$remp
70 ldq $Thi1,0($nhi)
71 srl $Zlo,4,$Zlo
72
73 ldq $rem,0($remp)
74 srl $Zhi,4,$Zhi
75 xor $t0,$Zlo,$Zlo
76 and $nlo,0xf0,$nhi
77
78 xor $Tlo1,$Zlo,$Zlo
79 sll $nlo,4,$nlo
80 xor $Thi1,$Zhi,$Zhi
81 and $nlo,0xf0,$nlo
82
83 addq $nlo,$Htbl,$nlo
84 ldq $Tlo0,8($nlo)
85 addq $nhi,$Htbl,$nhi
86 ldq $Thi0,0($nlo)
87
88.Looplo$N:
89 and $Zlo,0x0f,$remp
90 sll $Zhi,60,$t0
91 subq $cnt,1,$cnt
92 srl $Zlo,4,$Zlo
93
94 ldq $Tlo1,8($nhi)
95 xor $rem,$Zhi,$Zhi
96 ldq $Thi1,0($nhi)
97 s8addq $remp,$rem_4bit,$remp
98
99 ldq $rem,0($remp)
100 srl $Zhi,4,$Zhi
101 xor $t0,$Zlo,$Zlo
102 extbl $Xlo,$cnt,$nlo
103
104 and $nlo,0xf0,$nhi
105 xor $Thi0,$Zhi,$Zhi
106 xor $Tlo0,$Zlo,$Zlo
107 sll $nlo,4,$nlo
108
109
110 and $Zlo,0x0f,$remp
111 sll $Zhi,60,$t0
112 and $nlo,0xf0,$nlo
113 srl $Zlo,4,$Zlo
114
115 s8addq $remp,$rem_4bit,$remp
116 xor $rem,$Zhi,$Zhi
117 addq $nlo,$Htbl,$nlo
118 addq $nhi,$Htbl,$nhi
119
120 ldq $rem,0($remp)
121 srl $Zhi,4,$Zhi
122 ldq $Tlo0,8($nlo)
123 xor $t0,$Zlo,$Zlo
124
125 xor $Tlo1,$Zlo,$Zlo
126 xor $Thi1,$Zhi,$Zhi
127 ldq $Thi0,0($nlo)
128 bne $cnt,.Looplo$N
129
130
131 and $Zlo,0x0f,$remp
132 sll $Zhi,60,$t0
133 lda $cnt,7(zero)
134 srl $Zlo,4,$Zlo
135
136 ldq $Tlo1,8($nhi)
137 xor $rem,$Zhi,$Zhi
138 ldq $Thi1,0($nhi)
139 s8addq $remp,$rem_4bit,$remp
140
141 ldq $rem,0($remp)
142 srl $Zhi,4,$Zhi
143 xor $t0,$Zlo,$Zlo
144 extbl $Xhi,$cnt,$nlo
145
146 and $nlo,0xf0,$nhi
147 xor $Thi0,$Zhi,$Zhi
148 xor $Tlo0,$Zlo,$Zlo
149 sll $nlo,4,$nlo
150
151 and $Zlo,0x0f,$remp
152 sll $Zhi,60,$t0
153 and $nlo,0xf0,$nlo
154 srl $Zlo,4,$Zlo
155
156 s8addq $remp,$rem_4bit,$remp
157 xor $rem,$Zhi,$Zhi
158 addq $nlo,$Htbl,$nlo
159 addq $nhi,$Htbl,$nhi
160
161 ldq $rem,0($remp)
162 srl $Zhi,4,$Zhi
163 ldq $Tlo0,8($nlo)
164 xor $t0,$Zlo,$Zlo
165
166 xor $Tlo1,$Zlo,$Zlo
167 xor $Thi1,$Zhi,$Zhi
168 ldq $Thi0,0($nlo)
169 unop
170
171
172.Loophi$N:
173 and $Zlo,0x0f,$remp
174 sll $Zhi,60,$t0
175 subq $cnt,1,$cnt
176 srl $Zlo,4,$Zlo
177
178 ldq $Tlo1,8($nhi)
179 xor $rem,$Zhi,$Zhi
180 ldq $Thi1,0($nhi)
181 s8addq $remp,$rem_4bit,$remp
182
183 ldq $rem,0($remp)
184 srl $Zhi,4,$Zhi
185 xor $t0,$Zlo,$Zlo
186 extbl $Xhi,$cnt,$nlo
187
188 and $nlo,0xf0,$nhi
189 xor $Thi0,$Zhi,$Zhi
190 xor $Tlo0,$Zlo,$Zlo
191 sll $nlo,4,$nlo
192
193
194 and $Zlo,0x0f,$remp
195 sll $Zhi,60,$t0
196 and $nlo,0xf0,$nlo
197 srl $Zlo,4,$Zlo
198
199 s8addq $remp,$rem_4bit,$remp
200 xor $rem,$Zhi,$Zhi
201 addq $nlo,$Htbl,$nlo
202 addq $nhi,$Htbl,$nhi
203
204 ldq $rem,0($remp)
205 srl $Zhi,4,$Zhi
206 ldq $Tlo0,8($nlo)
207 xor $t0,$Zlo,$Zlo
208
209 xor $Tlo1,$Zlo,$Zlo
210 xor $Thi1,$Zhi,$Zhi
211 ldq $Thi0,0($nlo)
212 bne $cnt,.Loophi$N
213
214
215 and $Zlo,0x0f,$remp
216 sll $Zhi,60,$t0
217 srl $Zlo,4,$Zlo
218
219 ldq $Tlo1,8($nhi)
220 xor $rem,$Zhi,$Zhi
221 ldq $Thi1,0($nhi)
222 s8addq $remp,$rem_4bit,$remp
223
224 ldq $rem,0($remp)
225 srl $Zhi,4,$Zhi
226 xor $t0,$Zlo,$Zlo
227
228 xor $Tlo0,$Zlo,$Zlo
229 xor $Thi0,$Zhi,$Zhi
230
231 and $Zlo,0x0f,$remp
232 sll $Zhi,60,$t0
233 srl $Zlo,4,$Zlo
234
235 s8addq $remp,$rem_4bit,$remp
236 xor $rem,$Zhi,$Zhi
237
238 ldq $rem,0($remp)
239 srl $Zhi,4,$Zhi
240 xor $Tlo1,$Zlo,$Zlo
241 xor $Thi1,$Zhi,$Zhi
242 xor $t0,$Zlo,$Zlo
243 xor $rem,$Zhi,$Zhi
244___
245}}
246
247$code=<<___;
248#ifdef __linux__
249#include <asm/regdef.h>
250#else
251#include <asm.h>
252#include <regdef.h>
253#endif
254
255.text
256
257.set noat
258.set noreorder
259.globl gcm_gmult_4bit
260.align 4
261.ent gcm_gmult_4bit
262gcm_gmult_4bit:
263 .frame sp,0,ra
264 .prologue 0
265
266 ldq $Xlo,8($Xi)
267 ldq $Xhi,0($Xi)
268
269 bsr $t0,picmeup
270 nop
271___
272
273 &loop();
274
275$code.=<<___;
276 srl $Zlo,24,$t0 # byte swap
277 srl $Zlo,8,$t1
278
279 sll $Zlo,8,$t2
280 sll $Zlo,24,$Zlo
281 zapnot $t0,0x11,$t0
282 zapnot $t1,0x22,$t1
283
284 zapnot $Zlo,0x88,$Zlo
285 or $t0,$t1,$t0
286 zapnot $t2,0x44,$t2
287
288 or $Zlo,$t0,$Zlo
289 srl $Zhi,24,$t0
290 srl $Zhi,8,$t1
291
292 or $Zlo,$t2,$Zlo
293 sll $Zhi,8,$t2
294 sll $Zhi,24,$Zhi
295
296 srl $Zlo,32,$Xlo
297 sll $Zlo,32,$Zlo
298
299 zapnot $t0,0x11,$t0
300 zapnot $t1,0x22,$t1
301 or $Zlo,$Xlo,$Xlo
302
303 zapnot $Zhi,0x88,$Zhi
304 or $t0,$t1,$t0
305 zapnot $t2,0x44,$t2
306
307 or $Zhi,$t0,$Zhi
308 or $Zhi,$t2,$Zhi
309
310 srl $Zhi,32,$Xhi
311 sll $Zhi,32,$Zhi
312
313 or $Zhi,$Xhi,$Xhi
314 stq $Xlo,8($Xi)
315 stq $Xhi,0($Xi)
316
317 ret (ra)
318.end gcm_gmult_4bit
319___
320
321$inhi="s0";
322$inlo="s1";
323
324$code.=<<___;
325.globl gcm_ghash_4bit
326.align 4
327.ent gcm_ghash_4bit
328gcm_ghash_4bit:
329 lda sp,-32(sp)
330 stq ra,0(sp)
331 stq s0,8(sp)
332 stq s1,16(sp)
333 .mask 0x04000600,-32
334 .frame sp,32,ra
335 .prologue 0
336
337 ldq_u $inhi,0($inp)
338 ldq_u $Thi0,7($inp)
339 ldq_u $inlo,8($inp)
340 ldq_u $Tlo0,15($inp)
341 ldq $Xhi,0($Xi)
342 ldq $Xlo,8($Xi)
343
344 bsr $t0,picmeup
345 nop
346
347.Louter:
348 extql $inhi,$inp,$inhi
349 extqh $Thi0,$inp,$Thi0
350 or $inhi,$Thi0,$inhi
351 lda $inp,16($inp)
352
353 extql $inlo,$inp,$inlo
354 extqh $Tlo0,$inp,$Tlo0
355 or $inlo,$Tlo0,$inlo
356 subq $len,16,$len
357
358 xor $Xlo,$inlo,$Xlo
359 xor $Xhi,$inhi,$Xhi
360___
361
362 &loop();
363
364$code.=<<___;
365 srl $Zlo,24,$t0 # byte swap
366 srl $Zlo,8,$t1
367
368 sll $Zlo,8,$t2
369 sll $Zlo,24,$Zlo
370 zapnot $t0,0x11,$t0
371 zapnot $t1,0x22,$t1
372
373 zapnot $Zlo,0x88,$Zlo
374 or $t0,$t1,$t0
375 zapnot $t2,0x44,$t2
376
377 or $Zlo,$t0,$Zlo
378 srl $Zhi,24,$t0
379 srl $Zhi,8,$t1
380
381 or $Zlo,$t2,$Zlo
382 sll $Zhi,8,$t2
383 sll $Zhi,24,$Zhi
384
385 srl $Zlo,32,$Xlo
386 sll $Zlo,32,$Zlo
387 beq $len,.Ldone
388
389 zapnot $t0,0x11,$t0
390 zapnot $t1,0x22,$t1
391 or $Zlo,$Xlo,$Xlo
392 ldq_u $inhi,0($inp)
393
394 zapnot $Zhi,0x88,$Zhi
395 or $t0,$t1,$t0
396 zapnot $t2,0x44,$t2
397 ldq_u $Thi0,7($inp)
398
399 or $Zhi,$t0,$Zhi
400 or $Zhi,$t2,$Zhi
401 ldq_u $inlo,8($inp)
402 ldq_u $Tlo0,15($inp)
403
404 srl $Zhi,32,$Xhi
405 sll $Zhi,32,$Zhi
406
407 or $Zhi,$Xhi,$Xhi
408 br zero,.Louter
409
410.Ldone:
411 zapnot $t0,0x11,$t0
412 zapnot $t1,0x22,$t1
413 or $Zlo,$Xlo,$Xlo
414
415 zapnot $Zhi,0x88,$Zhi
416 or $t0,$t1,$t0
417 zapnot $t2,0x44,$t2
418
419 or $Zhi,$t0,$Zhi
420 or $Zhi,$t2,$Zhi
421
422 srl $Zhi,32,$Xhi
423 sll $Zhi,32,$Zhi
424
425 or $Zhi,$Xhi,$Xhi
426
427 stq $Xlo,8($Xi)
428 stq $Xhi,0($Xi)
429
430 .set noreorder
431 /*ldq ra,0(sp)*/
432 ldq s0,8(sp)
433 ldq s1,16(sp)
434 lda sp,32(sp)
435 ret (ra)
436.end gcm_ghash_4bit
437
438.align 4
439.ent picmeup
440picmeup:
441 .frame sp,0,$t0
442 .prologue 0
443 br $rem_4bit,.Lpic
444.Lpic: lda $rem_4bit,12($rem_4bit)
445 ret ($t0)
446.end picmeup
447 nop
448rem_4bit:
449 .long 0,0x0000<<16, 0,0x1C20<<16, 0,0x3840<<16, 0,0x2460<<16
450 .long 0,0x7080<<16, 0,0x6CA0<<16, 0,0x48C0<<16, 0,0x54E0<<16
451 .long 0,0xE100<<16, 0,0xFD20<<16, 0,0xD940<<16, 0,0xC560<<16
452 .long 0,0x9180<<16, 0,0x8DA0<<16, 0,0xA9C0<<16, 0,0xB5E0<<16
453.ascii "GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
454.align 4
455
456___
457$output=shift and open STDOUT,">$output";
458print $code;
459close STDOUT;
460
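The "4-bit" method that this module (and the other ghash-*.pl ports below) hand-schedules is easiest to follow in scalar form: Htable holds the sixteen nibble multiples of H (16 entries of 16 bytes, the 256-byte per-key table), Z is shifted right four bits per step, and the nibble that falls out is folded back in through the rem_4bit table that appears near the end of the listing above (16 eight-byte entries, the 128-byte shared table). The Perl sketch below is a minimal scalar model of that multiplication, not a transcription of any one assembly port: gmult_4bit and shift4 are illustrative names, $Xi is assumed to be a reference to 16 big-endian bytes, and $Htable a reference to 16 sixteen-byte entries built the way gcm128.c's gcm_init_4bit builds them (table construction omitted here).

my @rem_4bit = (0x0000,0x1C20,0x3840,0x2460, 0x7080,0x6CA0,0x48C0,0x54E0,
                0xE100,0xFD20,0xD940,0xC560, 0x9180,0x8DA0,0xA9C0,0xB5E0);

sub shift4 {                            # Z >>= 4 across 16 bytes, return the
    my ($Z) = @_;                       # nibble that fell out the low end
    my $rem = $Z->[15] & 0xf;
    for (my $k = 15; $k > 0; $k--) {
        $Z->[$k] = (($Z->[$k] >> 4) | (($Z->[$k-1] & 0xf) << 4)) & 0xff;
    }
    $Z->[0] >>= 4;
    return $rem;
}

sub gmult_4bit {                        # Xi <- Xi * H in GF(2^128)
    my ($Xi, $Htable) = @_;
    my @Z = (0) x 16;
    my $first = 1;
    for (my $i = 15; $i >= 0; $i--) {                 # low byte of Xi first
        for my $n ($Xi->[$i] & 0xf, $Xi->[$i] >> 4) { # low nibble, then high
            unless ($first) {
                my $rem = shift4(\@Z);                # Z *= x^4 ...
                $Z[0] ^= $rem_4bit[$rem] >> 8;        # ... folding the 16-bit
                $Z[1] ^= $rem_4bit[$rem] & 0xff;      # reduction into Z's top
            }
            $first = 0;
            $Z[$_] ^= $Htable->[$n][$_] for 0 .. 15;  # Z ^= Htable[nibble]
        }
    }
    @$Xi = @Z;
}

The assembly above follows the same structure, but modulo-schedules the Htbl loads, the shift and the rem_4bit lookup so that they overlap across iterations.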
diff --git a/src/lib/libcrypto/modes/asm/ghash-armv4.pl b/src/lib/libcrypto/modes/asm/ghash-armv4.pl
deleted file mode 100644
index d91586ee29..0000000000
--- a/src/lib/libcrypto/modes/asm/ghash-armv4.pl
+++ /dev/null
@@ -1,429 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# April 2010
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that it
14# uses 256 bytes per-key table [+32 bytes shared table]. There is no
15# experimental performance data available yet. The only approximation
16# that can be made at this point is based on code size. Inner loop is
17# 32 instructions long and on single-issue core should execute in <40
18# cycles. Having verified that gcc 3.4 didn't unroll corresponding
19# loop, this assembler loop body was found to be ~3x smaller than
20# compiler-generated one...
21#
22# July 2010
23#
24# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on
25# Cortex A8 core and ~25 cycles per processed byte (which was observed
26# to be ~3 times faster than gcc-generated code:-)
27#
28# February 2011
29#
30# Profiler-assisted and platform-specific optimization resulted in 7%
31# improvement on Cortex A8 core and ~23.5 cycles per byte.
32#
33# March 2011
34#
35# Add NEON implementation featuring polynomial multiplication, i.e. no
36# lookup tables involved. On Cortex A8 it was measured to process one
37# byte in 15 cycles or 55% faster than integer-only code.
38
39# ====================================================================
40# Note about "528B" variant. In the ARM case it makes less sense to
41# implement it for the following reasons:
42#
43# - performance improvement won't be anywhere near 50%, because 128-
44# bit shift operation is neatly fused with 128-bit xor here, and
45# "528B" variant would eliminate only 4-5 instructions out of 32
46# in the inner loop (meaning that estimated improvement is ~15%);
47# - ARM-based systems are often embedded ones and extra memory
48# consumption might be unappreciated (for so little improvement);
49#
50# Byte order [in]dependence. =========================================
51#
52# Caller is expected to maintain specific *dword* order in Htable,
53# namely with *least* significant dword of 128-bit value at *lower*
54# address. This differs completely from C code and has everything to
55# do with ldm instruction and order in which dwords are "consumed" by
56# algorithm. *Byte* order within these dwords in turn is whatever
57# *native* byte order on current platform. See gcm128.c for working
58# example...
59
60while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
61open STDOUT,">$output";
62
63$Xi="r0"; # argument block
64$Htbl="r1";
65$inp="r2";
66$len="r3";
67
68$Zll="r4"; # variables
69$Zlh="r5";
70$Zhl="r6";
71$Zhh="r7";
72$Tll="r8";
73$Tlh="r9";
74$Thl="r10";
75$Thh="r11";
76$nlo="r12";
77################# r13 is stack pointer
78$nhi="r14";
79################# r15 is program counter
80
81$rem_4bit=$inp; # used in gcm_gmult_4bit
82$cnt=$len;
83
84sub Zsmash() {
85 my $i=12;
86 my @args=@_;
87 for ($Zll,$Zlh,$Zhl,$Zhh) {
88 $code.=<<___;
89#if __ARM_ARCH__>=7 && defined(__ARMEL__)
90 rev $_,$_
91 str $_,[$Xi,#$i]
92#elif defined(__ARMEB__)
93 str $_,[$Xi,#$i]
94#else
95 mov $Tlh,$_,lsr#8
96 strb $_,[$Xi,#$i+3]
97 mov $Thl,$_,lsr#16
98 strb $Tlh,[$Xi,#$i+2]
99 mov $Thh,$_,lsr#24
100 strb $Thl,[$Xi,#$i+1]
101 strb $Thh,[$Xi,#$i]
102#endif
103___
104 $code.="\t".shift(@args)."\n";
105 $i-=4;
106 }
107}
108
109$code=<<___;
110#include "arm_arch.h"
111
112.text
113.code 32
114
115.type rem_4bit,%object
116.align 5
117rem_4bit:
118.short 0x0000,0x1C20,0x3840,0x2460
119.short 0x7080,0x6CA0,0x48C0,0x54E0
120.short 0xE100,0xFD20,0xD940,0xC560
121.short 0x9180,0x8DA0,0xA9C0,0xB5E0
122.size rem_4bit,.-rem_4bit
123
124.type rem_4bit_get,%function
125rem_4bit_get:
126 sub $rem_4bit,pc,#8
127 sub $rem_4bit,$rem_4bit,#32 @ &rem_4bit
128 b .Lrem_4bit_got
129 nop
130.size rem_4bit_get,.-rem_4bit_get
131
132.global gcm_ghash_4bit
133.type gcm_ghash_4bit,%function
134gcm_ghash_4bit:
135 sub r12,pc,#8
136 add $len,$inp,$len @ $len to point at the end
137 stmdb sp!,{r3-r11,lr} @ save $len/end too
138 sub r12,r12,#48 @ &rem_4bit
139
140 ldmia r12,{r4-r11} @ copy rem_4bit ...
141 stmdb sp!,{r4-r11} @ ... to stack
142
143 ldrb $nlo,[$inp,#15]
144 ldrb $nhi,[$Xi,#15]
145.Louter:
146 eor $nlo,$nlo,$nhi
147 and $nhi,$nlo,#0xf0
148 and $nlo,$nlo,#0x0f
149 mov $cnt,#14
150
151 add $Zhh,$Htbl,$nlo,lsl#4
152 ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo]
153 add $Thh,$Htbl,$nhi
154 ldrb $nlo,[$inp,#14]
155
156 and $nhi,$Zll,#0xf @ rem
157 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
158 add $nhi,$nhi,$nhi
159 eor $Zll,$Tll,$Zll,lsr#4
160 ldrh $Tll,[sp,$nhi] @ rem_4bit[rem]
161 eor $Zll,$Zll,$Zlh,lsl#28
162 ldrb $nhi,[$Xi,#14]
163 eor $Zlh,$Tlh,$Zlh,lsr#4
164 eor $Zlh,$Zlh,$Zhl,lsl#28
165 eor $Zhl,$Thl,$Zhl,lsr#4
166 eor $Zhl,$Zhl,$Zhh,lsl#28
167 eor $Zhh,$Thh,$Zhh,lsr#4
168 eor $nlo,$nlo,$nhi
169 and $nhi,$nlo,#0xf0
170 and $nlo,$nlo,#0x0f
171 eor $Zhh,$Zhh,$Tll,lsl#16
172
173.Linner:
174 add $Thh,$Htbl,$nlo,lsl#4
175 and $nlo,$Zll,#0xf @ rem
176 subs $cnt,$cnt,#1
177 add $nlo,$nlo,$nlo
178 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
179 eor $Zll,$Tll,$Zll,lsr#4
180 eor $Zll,$Zll,$Zlh,lsl#28
181 eor $Zlh,$Tlh,$Zlh,lsr#4
182 eor $Zlh,$Zlh,$Zhl,lsl#28
183 ldrh $Tll,[sp,$nlo] @ rem_4bit[rem]
184 eor $Zhl,$Thl,$Zhl,lsr#4
185 ldrplb $nlo,[$inp,$cnt]
186 eor $Zhl,$Zhl,$Zhh,lsl#28
187 eor $Zhh,$Thh,$Zhh,lsr#4
188
189 add $Thh,$Htbl,$nhi
190 and $nhi,$Zll,#0xf @ rem
191 eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
192 add $nhi,$nhi,$nhi
193 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
194 eor $Zll,$Tll,$Zll,lsr#4
195 ldrplb $Tll,[$Xi,$cnt]
196 eor $Zll,$Zll,$Zlh,lsl#28
197 eor $Zlh,$Tlh,$Zlh,lsr#4
198 ldrh $Tlh,[sp,$nhi]
199 eor $Zlh,$Zlh,$Zhl,lsl#28
200 eor $Zhl,$Thl,$Zhl,lsr#4
201 eor $Zhl,$Zhl,$Zhh,lsl#28
202 eorpl $nlo,$nlo,$Tll
203 eor $Zhh,$Thh,$Zhh,lsr#4
204 andpl $nhi,$nlo,#0xf0
205 andpl $nlo,$nlo,#0x0f
206 eor $Zhh,$Zhh,$Tlh,lsl#16 @ ^= rem_4bit[rem]
207 bpl .Linner
208
209 ldr $len,[sp,#32] @ re-load $len/end
210 add $inp,$inp,#16
211 mov $nhi,$Zll
212___
213 &Zsmash("cmp\t$inp,$len","ldrneb\t$nlo,[$inp,#15]");
214$code.=<<___;
215 bne .Louter
216
217 add sp,sp,#36
218#if __ARM_ARCH__>=5
219 ldmia sp!,{r4-r11,pc}
220#else
221 ldmia sp!,{r4-r11,lr}
222 tst lr,#1
223 moveq pc,lr @ be binary compatible with V4, yet
224 bx lr @ interoperable with Thumb ISA:-)
225#endif
226.size gcm_ghash_4bit,.-gcm_ghash_4bit
227
228.global gcm_gmult_4bit
229.type gcm_gmult_4bit,%function
230gcm_gmult_4bit:
231 stmdb sp!,{r4-r11,lr}
232 ldrb $nlo,[$Xi,#15]
233 b rem_4bit_get
234.Lrem_4bit_got:
235 and $nhi,$nlo,#0xf0
236 and $nlo,$nlo,#0x0f
237 mov $cnt,#14
238
239 add $Zhh,$Htbl,$nlo,lsl#4
240 ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo]
241 ldrb $nlo,[$Xi,#14]
242
243 add $Thh,$Htbl,$nhi
244 and $nhi,$Zll,#0xf @ rem
245 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
246 add $nhi,$nhi,$nhi
247 eor $Zll,$Tll,$Zll,lsr#4
248 ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
249 eor $Zll,$Zll,$Zlh,lsl#28
250 eor $Zlh,$Tlh,$Zlh,lsr#4
251 eor $Zlh,$Zlh,$Zhl,lsl#28
252 eor $Zhl,$Thl,$Zhl,lsr#4
253 eor $Zhl,$Zhl,$Zhh,lsl#28
254 eor $Zhh,$Thh,$Zhh,lsr#4
255 and $nhi,$nlo,#0xf0
256 eor $Zhh,$Zhh,$Tll,lsl#16
257 and $nlo,$nlo,#0x0f
258
259.Loop:
260 add $Thh,$Htbl,$nlo,lsl#4
261 and $nlo,$Zll,#0xf @ rem
262 subs $cnt,$cnt,#1
263 add $nlo,$nlo,$nlo
264 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
265 eor $Zll,$Tll,$Zll,lsr#4
266 eor $Zll,$Zll,$Zlh,lsl#28
267 eor $Zlh,$Tlh,$Zlh,lsr#4
268 eor $Zlh,$Zlh,$Zhl,lsl#28
269 ldrh $Tll,[$rem_4bit,$nlo] @ rem_4bit[rem]
270 eor $Zhl,$Thl,$Zhl,lsr#4
271 ldrplb $nlo,[$Xi,$cnt]
272 eor $Zhl,$Zhl,$Zhh,lsl#28
273 eor $Zhh,$Thh,$Zhh,lsr#4
274
275 add $Thh,$Htbl,$nhi
276 and $nhi,$Zll,#0xf @ rem
277 eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
278 add $nhi,$nhi,$nhi
279 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
280 eor $Zll,$Tll,$Zll,lsr#4
281 eor $Zll,$Zll,$Zlh,lsl#28
282 eor $Zlh,$Tlh,$Zlh,lsr#4
283 ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
284 eor $Zlh,$Zlh,$Zhl,lsl#28
285 eor $Zhl,$Thl,$Zhl,lsr#4
286 eor $Zhl,$Zhl,$Zhh,lsl#28
287 eor $Zhh,$Thh,$Zhh,lsr#4
288 andpl $nhi,$nlo,#0xf0
289 andpl $nlo,$nlo,#0x0f
290 eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
291 bpl .Loop
292___
293 &Zsmash();
294$code.=<<___;
295#if __ARM_ARCH__>=5
296 ldmia sp!,{r4-r11,pc}
297#else
298 ldmia sp!,{r4-r11,lr}
299 tst lr,#1
300 moveq pc,lr @ be binary compatible with V4, yet
301 bx lr @ interoperable with Thumb ISA:-)
302#endif
303.size gcm_gmult_4bit,.-gcm_gmult_4bit
304___
305{
306my $cnt=$Htbl; # $Htbl is used once in the very beginning
307
308my ($Hhi, $Hlo, $Zo, $T, $xi, $mod) = map("d$_",(0..7));
309my ($Qhi, $Qlo, $Z, $R, $zero, $Qpost, $IN) = map("q$_",(8..15));
310
311# Z:Zo keeps 128-bit result shifted by 1 to the right, with bottom bit
312# in Zo. Or should I say "top bit", because GHASH is specified in
313# reverse bit order? Otherwise straightforward 128-bit H by one input
314# byte multiplication and modulo-reduction, times 16.
315
316sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
317sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
318sub Q() { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; }
319
320$code.=<<___;
321#if __ARM_ARCH__>=7
322.fpu neon
323
324.global gcm_gmult_neon
325.type gcm_gmult_neon,%function
326.align 4
327gcm_gmult_neon:
328 sub $Htbl,#16 @ point at H in GCM128_CTX
329 vld1.64 `&Dhi("$IN")`,[$Xi,:64]!@ load Xi
330 vmov.i32 $mod,#0xe1 @ our irreducible polynomial
331 vld1.64 `&Dlo("$IN")`,[$Xi,:64]!
332 vshr.u64 $mod,#32
333 vldmia $Htbl,{$Hhi-$Hlo} @ load H
334 veor $zero,$zero
335#ifdef __ARMEL__
336 vrev64.8 $IN,$IN
337#endif
338 veor $Qpost,$Qpost
339 veor $R,$R
340 mov $cnt,#16
341 veor $Z,$Z
342 mov $len,#16
343 veor $Zo,$Zo
344 vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
345 b .Linner_neon
346.size gcm_gmult_neon,.-gcm_gmult_neon
347
348.global gcm_ghash_neon
349.type gcm_ghash_neon,%function
350.align 4
351gcm_ghash_neon:
352 vld1.64 `&Dhi("$Z")`,[$Xi,:64]! @ load Xi
353 vmov.i32 $mod,#0xe1 @ our irreducible polynomial
354 vld1.64 `&Dlo("$Z")`,[$Xi,:64]!
355 vshr.u64 $mod,#32
356 vldmia $Xi,{$Hhi-$Hlo} @ load H
357 veor $zero,$zero
358 nop
359#ifdef __ARMEL__
360 vrev64.8 $Z,$Z
361#endif
362.Louter_neon:
363 vld1.64 `&Dhi($IN)`,[$inp]! @ load inp
364 veor $Qpost,$Qpost
365 vld1.64 `&Dlo($IN)`,[$inp]!
366 veor $R,$R
367 mov $cnt,#16
368#ifdef __ARMEL__
369 vrev64.8 $IN,$IN
370#endif
371 veor $Zo,$Zo
372 veor $IN,$Z @ inp^=Xi
373 veor $Z,$Z
374 vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
375.Linner_neon:
376 subs $cnt,$cnt,#1
377 vmull.p8 $Qlo,$Hlo,$xi @ H.lo·Xi[i]
378 vmull.p8 $Qhi,$Hhi,$xi @ H.hi·Xi[i]
379 vext.8 $IN,$zero,#1 @ IN>>=8
380
381 veor $Z,$Qpost @ modulo-scheduled part
382 vshl.i64 `&Dlo("$R")`,#48
383 vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
384 veor $T,`&Dlo("$Qlo")`,`&Dlo("$Z")`
385
386 veor `&Dhi("$Z")`,`&Dlo("$R")`
387 vuzp.8 $Qlo,$Qhi
388 vsli.8 $Zo,$T,#1 @ compose the "carry" byte
389 vext.8 $Z,$zero,#1 @ Z>>=8
390
391 vmull.p8 $R,$Zo,$mod @ "carry"·0xe1
392 vshr.u8 $Zo,$T,#7 @ save Z's bottom bit
393 vext.8 $Qpost,$Qlo,$zero,#1 @ Qlo>>=8
394 veor $Z,$Qhi
395 bne .Linner_neon
396
397 veor $Z,$Qpost @ modulo-scheduled artefact
398 vshl.i64 `&Dlo("$R")`,#48
399 veor `&Dhi("$Z")`,`&Dlo("$R")`
400
401 @ finalization, normalize Z:Zo
402 vand $Zo,$mod @ suffices to mask the bit
403 vshr.u64 `&Dhi(&Q("$Zo"))`,`&Dlo("$Z")`,#63
404 vshl.i64 $Z,#1
405 subs $len,#16
406 vorr $Z,`&Q("$Zo")` @ Z=Z:Zo<<1
407 bne .Louter_neon
408
409#ifdef __ARMEL__
410 vrev64.8 $Z,$Z
411#endif
412 sub $Xi,#16
413 vst1.64 `&Dhi("$Z")`,[$Xi,:64]! @ write out Xi
414 vst1.64 `&Dlo("$Z")`,[$Xi,:64]
415
416 bx lr
417.size gcm_ghash_neon,.-gcm_ghash_neon
418#endif
419___
420}
421$code.=<<___;
422.asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
423.align 2
424___
425
426$code =~ s/\`([^\`]*)\`/eval $1/gem;
427$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
428print $code;
429close STDOUT; # enforce flush
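The NEON path above avoids lookup tables entirely and works from the defining GF(2^128) arithmetic: the constant it loads with "vmov.i32 $mod,#0xe1" is the reduction pattern for x^128 = x^7 + x^2 + x + 1 in GHASH's reflected bit order. As a reference point for both the table-driven and the NEON code, here is a bit-at-a-time Perl sketch of the multiplication every port in this directory accelerates; gmult_bitwise is an illustrative name, and both arguments are assumed to be references to 16-byte arrays in GHASH's big-endian layout.

sub gmult_bitwise {                     # Xi <- Xi * H, one bit at a time
    my ($Xi, $H) = @_;
    my @Z = (0) x 16;
    my @V = @$H;

    for my $i (0 .. 15) {
        for my $j (reverse 0 .. 7) {                   # MSB of each byte first
            if (($Xi->[$i] >> $j) & 1) {
                $Z[$_] ^= $V[$_] for 0 .. 15;          # Z ^= V
            }
            my $carry = $V[15] & 1;                    # bit about to fall out
            for (my $k = 15; $k > 0; $k--) {           # V >>= 1 across bytes
                $V[$k] = (($V[$k] >> 1) | (($V[$k-1] & 1) << 7)) & 0xff;
            }
            $V[0] >>= 1;
            $V[0] ^= 0xe1 if $carry;                   # fold x^128 back in
        }
    }
    @$Xi = @Z;
}

The NEON code reaches the same result eight bits at a time with vmull.p8 and the composed "carry" byte, which is why it needs no per-key table at all.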
diff --git a/src/lib/libcrypto/modes/asm/ghash-ia64.pl b/src/lib/libcrypto/modes/asm/ghash-ia64.pl
deleted file mode 100755
index 0354c95444..0000000000
--- a/src/lib/libcrypto/modes/asm/ghash-ia64.pl
+++ /dev/null
@@ -1,463 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# March 2010
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that it
14# uses 256 bytes per-key table [+128 bytes shared table]. Streamed
15# GHASH performance was measured to be 6.67 cycles per processed byte
16# on Itanium 2, which is >90% better than Microsoft compiler-generated
17# code. For comparison, the sha1-ia64.pl module processes one
18# byte in 5.7 cycles. On Itanium GHASH should run at ~8.5 cycles per
19# byte.
20
21# September 2010
22#
23# It was originally thought that it makes lesser sense to implement
24# "528B" variant on Itanium 2 for the following reason. Because the number of
25# functional units is naturally limited, it appeared impossible to
26# implement "528B" loop in 4 cycles, only in 5. This would mean that
27# theoretically performance improvement couldn't be more than 20%.
28# But occasionally you prove yourself wrong:-) I figured out a way to
29# fold a couple of instructions and free yet another instruction
30# slot by unrolling the loop... Resulting performance is 4.45 cycles
31# per processed byte and 50% better than "256B" version. On original
32# Itanium performance should remain the same as the "256B" version,
33# i.e. ~8.5 cycles.
34
35$output=shift and (open STDOUT,">$output" or die "can't open $output: $!");
36
37if ($^O eq "hpux") {
38 $ADDP="addp4";
39 for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
40} else { $ADDP="add"; }
41for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/);
42 $big_endian=0 if (/\-DL_ENDIAN/); }
43if (!defined($big_endian))
44 { $big_endian=(unpack('L',pack('N',1))==1); }
45
46sub loop() {
47my $label=shift;
48my ($p16,$p17)=(shift)?("p63","p63"):("p16","p17"); # mask references to inp
49
50# Loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium, i.e.
51# in scalable manner;-) Naturally assuming data in L1 cache...
52# Special note about 'dep' instruction, which is used to construct
53# &rem_4bit[Zlo&0xf]. It works, because rem_4bit is aligned at 128
54# bytes boundary and lower 7 bits of its address are guaranteed to
55# be zero.
56$code.=<<___;
57$label:
58{ .mfi; (p18) ld8 Hlo=[Hi[1]],-8
59 (p19) dep rem=Zlo,rem_4bitp,3,4 }
60{ .mfi; (p19) xor Zhi=Zhi,Hhi
61 ($p17) xor xi[1]=xi[1],in[1] };;
62{ .mfi; (p18) ld8 Hhi=[Hi[1]]
63 (p19) shrp Zlo=Zhi,Zlo,4 }
64{ .mfi; (p19) ld8 rem=[rem]
65 (p18) and Hi[1]=mask0xf0,xi[2] };;
66{ .mmi; ($p16) ld1 in[0]=[inp],-1
67 (p18) xor Zlo=Zlo,Hlo
68 (p19) shr.u Zhi=Zhi,4 }
69{ .mib; (p19) xor Hhi=Hhi,rem
70 (p18) add Hi[1]=Htbl,Hi[1] };;
71
72{ .mfi; (p18) ld8 Hlo=[Hi[1]],-8
73 (p18) dep rem=Zlo,rem_4bitp,3,4 }
74{ .mfi; (p17) shladd Hi[0]=xi[1],4,r0
75 (p18) xor Zhi=Zhi,Hhi };;
76{ .mfi; (p18) ld8 Hhi=[Hi[1]]
77 (p18) shrp Zlo=Zhi,Zlo,4 }
78{ .mfi; (p18) ld8 rem=[rem]
79 (p17) and Hi[0]=mask0xf0,Hi[0] };;
80{ .mmi; (p16) ld1 xi[0]=[Xi],-1
81 (p18) xor Zlo=Zlo,Hlo
82 (p18) shr.u Zhi=Zhi,4 }
83{ .mib; (p18) xor Hhi=Hhi,rem
84 (p17) add Hi[0]=Htbl,Hi[0]
85 br.ctop.sptk $label };;
86___
87}
88
89$code=<<___;
90.explicit
91.text
92
93prevfs=r2; prevlc=r3; prevpr=r8;
94mask0xf0=r21;
95rem=r22; rem_4bitp=r23;
96Xi=r24; Htbl=r25;
97inp=r26; end=r27;
98Hhi=r28; Hlo=r29;
99Zhi=r30; Zlo=r31;
100
101.align 128
102.skip 16 // aligns loop body
103.global gcm_gmult_4bit#
104.proc gcm_gmult_4bit#
105gcm_gmult_4bit:
106 .prologue
107{ .mmi; .save ar.pfs,prevfs
108 alloc prevfs=ar.pfs,2,6,0,8
109 $ADDP Xi=15,in0 // &Xi[15]
110 mov rem_4bitp=ip }
111{ .mii; $ADDP Htbl=8,in1 // &Htbl[0].lo
112 .save ar.lc,prevlc
113 mov prevlc=ar.lc
114 .save pr,prevpr
115 mov prevpr=pr };;
116
117 .body
118 .rotr in[3],xi[3],Hi[2]
119
120{ .mib; ld1 xi[2]=[Xi],-1 // Xi[15]
121 mov mask0xf0=0xf0
122 brp.loop.imp .Loop1,.Lend1-16};;
123{ .mmi; ld1 xi[1]=[Xi],-1 // Xi[14]
124 };;
125{ .mii; shladd Hi[1]=xi[2],4,r0
126 mov pr.rot=0x7<<16
127 mov ar.lc=13 };;
128{ .mii; and Hi[1]=mask0xf0,Hi[1]
129 mov ar.ec=3
130 xor Zlo=Zlo,Zlo };;
131{ .mii; add Hi[1]=Htbl,Hi[1] // &Htbl[nlo].lo
132 add rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp
133 xor Zhi=Zhi,Zhi };;
134___
135 &loop (".Loop1",1);
136$code.=<<___;
137.Lend1:
138{ .mib; xor Zhi=Zhi,Hhi };; // modulo-scheduling artefact
139{ .mib; mux1 Zlo=Zlo,\@rev };;
140{ .mib; mux1 Zhi=Zhi,\@rev };;
141{ .mmi; add Hlo=9,Xi;; // ;; is here to prevent
142 add Hhi=1,Xi };; // pipeline flush on Itanium
143{ .mib; st8 [Hlo]=Zlo
144 mov pr=prevpr,0x1ffff };;
145{ .mib; st8 [Hhi]=Zhi
146 mov ar.lc=prevlc
147 br.ret.sptk.many b0 };;
148.endp gcm_gmult_4bit#
149___
150
151######################################################################
152# "528B" (well, "512B" actually) streamed GHASH
153#
154$Xip="in0";
155$Htbl="in1";
156$inp="in2";
157$len="in3";
158$rem_8bit="loc0";
159$mask0xff="loc1";
160($sum,$rum) = $big_endian ? ("nop.m","nop.m") : ("sum","rum");
161
162sub load_htable() {
163 for (my $i=0;$i<8;$i++) {
164 $code.=<<___;
165{ .mmi; ld8 r`16+2*$i+1`=[r8],16 // Htable[$i].hi
166 ld8 r`16+2*$i`=[r9],16 } // Htable[$i].lo
167{ .mmi; ldf8 f`32+2*$i+1`=[r10],16 // Htable[`8+$i`].hi
168 ldf8 f`32+2*$i`=[r11],16 // Htable[`8+$i`].lo
169___
170 $code.=shift if (($i+$#_)==7);
171 $code.="\t};;\n"
172 }
173}
174
175$code.=<<___;
176prevsp=r3;
177
178.align 32
179.skip 16 // aligns loop body
180.global gcm_ghash_4bit#
181.proc gcm_ghash_4bit#
182gcm_ghash_4bit:
183 .prologue
184{ .mmi; .save ar.pfs,prevfs
185 alloc prevfs=ar.pfs,4,2,0,0
186 .vframe prevsp
187 mov prevsp=sp
188 mov $rem_8bit=ip };;
189 .body
190{ .mfi; $ADDP r8=0+0,$Htbl
191 $ADDP r9=0+8,$Htbl }
192{ .mfi; $ADDP r10=128+0,$Htbl
193 $ADDP r11=128+8,$Htbl };;
194___
195 &load_htable(
196 " $ADDP $Xip=15,$Xip", # &Xi[15]
197 " $ADDP $len=$len,$inp", # &inp[len]
198 " $ADDP $inp=15,$inp", # &inp[15]
199 " mov $mask0xff=0xff",
200 " add sp=-512,sp",
201 " andcm sp=sp,$mask0xff", # align stack frame
202 " add r14=0,sp",
203 " add r15=8,sp");
204$code.=<<___;
205{ .mmi; $sum 1<<1 // go big-endian
206 add r8=256+0,sp
207 add r9=256+8,sp }
208{ .mmi; add r10=256+128+0,sp
209 add r11=256+128+8,sp
210 add $len=-17,$len };;
211___
212for($i=0;$i<8;$i++) { # generate first half of Hshr4[]
213my ($rlo,$rhi)=("r".eval(16+2*$i),"r".eval(16+2*$i+1));
214$code.=<<___;
215{ .mmi; st8 [r8]=$rlo,16 // Htable[$i].lo
216 st8 [r9]=$rhi,16 // Htable[$i].hi
217 shrp $rlo=$rhi,$rlo,4 }//;;
218{ .mmi; stf8 [r10]=f`32+2*$i`,16 // Htable[`8+$i`].lo
219 stf8 [r11]=f`32+2*$i+1`,16 // Htable[`8+$i`].hi
220 shr.u $rhi=$rhi,4 };;
221{ .mmi; st8 [r14]=$rlo,16 // Htable[$i].lo>>4
222 st8 [r15]=$rhi,16 }//;; // Htable[$i].hi>>4
223___
224}
225$code.=<<___;
226{ .mmi; ld8 r16=[r8],16 // Htable[8].lo
227 ld8 r17=[r9],16 };; // Htable[8].hi
228{ .mmi; ld8 r18=[r8],16 // Htable[9].lo
229 ld8 r19=[r9],16 } // Htable[9].hi
230{ .mmi; rum 1<<5 // clear um.mfh
231 shrp r16=r17,r16,4 };;
232___
233for($i=0;$i<6;$i++) { # generate second half of Hshr4[]
234$code.=<<___;
235{ .mmi; ld8 r`20+2*$i`=[r8],16 // Htable[`10+$i`].lo
236 ld8 r`20+2*$i+1`=[r9],16 // Htable[`10+$i`].hi
237 shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };;
238{ .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4
239 st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4
240 shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 }
241___
242}
243$code.=<<___;
244{ .mmi; shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };;
245{ .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4
246 st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4
247 shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 }
248{ .mmi; add $Htbl=256,sp // &Htable[0]
249 add $rem_8bit=rem_8bit#-gcm_ghash_4bit#,$rem_8bit
250 shr.u r`18+2*$i+1`=r`18+2*$i+1`,4 };;
251{ .mmi; st8 [r14]=r`18+2*$i` // Htable[`8+$i`].lo>>4
252 st8 [r15]=r`18+2*$i+1` } // Htable[`8+$i`].hi>>4
253___
254
255$in="r15";
256@xi=("r16","r17");
257@rem=("r18","r19");
258($Alo,$Ahi,$Blo,$Bhi,$Zlo,$Zhi)=("r20","r21","r22","r23","r24","r25");
259($Atbl,$Btbl)=("r26","r27");
260
261$code.=<<___; # (p16)
262{ .mmi; ld1 $in=[$inp],-1 //(p16) *inp--
263 ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
264 cmp.eq p0,p6=r0,r0 };; // clear p6
265___
266push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
267
268$code.=<<___; # (p16),(p17)
269{ .mmi; ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
270 xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
271{ .mii; ld1 $in=[$inp],-1 //(p16) *inp--
272 dep $Atbl=$xi[1],$Htbl,4,4 //(p17) &Htable[nlo].lo
273 and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
274.align 32
275.LOOP:
276{ .mmi;
277(p6) st8 [$Xip]=$Zhi,13
278 xor $Zlo=$Zlo,$Zlo
279 add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi].lo
280___
281push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
282
283$code.=<<___; # (p16),(p17),(p18)
284{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
285 ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
286 xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
287{ .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
288 dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo
289{ .mfi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
290 xor $Zlo=$Zlo,$Alo };; //(p18) Z.lo^=Htable[nlo].lo
291{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
292 ld1 $in=[$inp],-1 } //(p16) *inp--
293{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
294 mov $Zhi=$Ahi //(p18) Z.hi^=Htable[nlo].hi
295 and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
296{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
297 ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
298 shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
299{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
300 add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
301___
302push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
303
304for ($i=1;$i<14;$i++) {
305# Above and below fragments are derived from this one by removing
306# unsuitable (p??) instructions.
307$code.=<<___; # (p16),(p17),(p18),(p19)
308{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
309 ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
310 shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
311{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
312 xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
313 xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
314{ .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
315 ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
316 dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo
317{ .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
318 xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo
319 xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
320{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
321 ld1 $in=[$inp],-1 //(p16) *inp--
322 shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
323{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
324 xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi
325 and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
326{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
327 ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
328 shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
329{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
330 xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
331 add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
332___
333push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
334}
335
336$code.=<<___; # (p17),(p18),(p19)
337{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
338 ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
339 shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
340{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
341 xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
342 xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
343{ .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
344 ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
345 dep $Atbl=$xi[1],$Htbl,4,4 };; //(p17) &Htable[nlo].lo
346{ .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
347 xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo
348 xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
349{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
350 shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
351{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
352 xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi
353 and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
354{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
355 shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
356{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
357 xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
358 add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
359___
360push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
361
362$code.=<<___; # (p18),(p19)
363{ .mfi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
364 shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
365{ .mfi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
366 xor $Zlo=$Zlo,$Blo };; //(p19) Z.lo^=Hshr4[nhi].lo
367{ .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
368 xor $Zlo=$Zlo,$Alo } //(p18) Z.lo^=Htable[nlo].lo
369{ .mfi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
370 xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
371{ .mfi; ld8 $Blo=[$Btbl],8 //(p18) Htable[nhi].lo,&Htable[nhi].hi
372 shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
373{ .mfi; shladd $rem[0]=$Zlo,4,r0 //(p18) Z.lo<<4
374 xor $Zhi=$Zhi,$Ahi };; //(p18) Z.hi^=Htable[nlo].hi
375{ .mfi; ld8 $Bhi=[$Btbl] //(p18) Htable[nhi].hi
376 shrp $Zlo=$Zhi,$Zlo,4 } //(p18) Z.lo=(Z.hi<<60)|(Z.lo>>4)
377{ .mfi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
378 xor $Zhi=$Zhi,$rem[1] };; //(p19) Z.hi^=rem_8bit[rem]<<48
379___
380push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
381
382$code.=<<___; # (p19)
383{ .mmi; cmp.ltu p6,p0=$inp,$len
384 add $inp=32,$inp
385 shr.u $Zhi=$Zhi,4 } //(p19) Z.hi>>=4
386{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
387 xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
388 add $Xip=9,$Xip };; // &Xi.lo
389{ .mmi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
390(p6) ld1 $in=[$inp],-1 //[p16] *inp--
391(p6) extr.u $xi[1]=$Zlo,8,8 } //[p17] Xi[14]
392{ .mmi; xor $Zhi=$Zhi,$Bhi //(p19) Z.hi^=Hshr4[nhi].hi
393(p6) and $xi[0]=$Zlo,$mask0xff };; //[p16] Xi[15]
394{ .mmi; st8 [$Xip]=$Zlo,-8
395(p6) xor $xi[0]=$xi[0],$in //[p17] xi=$xi[i]^inp[i]
396 shl $rem[1]=$rem[1],48 };; //(p19) rem_8bit[rem]<<48
397{ .mmi;
398(p6) ld1 $in=[$inp],-1 //[p16] *inp--
399 xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
400(p6) dep $Atbl=$xi[0],$Htbl,4,4 } //[p17] &Htable[nlo].lo
401{ .mib;
402(p6) and $xi[0]=-16,$xi[0] //[p17] nhi=xi&0xf0
403(p6) br.cond.dptk.many .LOOP };;
404
405{ .mib; st8 [$Xip]=$Zhi };;
406{ .mib; $rum 1<<1 // return to little-endian
407 .restore sp
408 mov sp=prevsp
409 br.ret.sptk.many b0 };;
410.endp gcm_ghash_4bit#
411___
412$code.=<<___;
413.align 128
414.type rem_4bit#,\@object
415rem_4bit:
416 data8 0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
417 data8 0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
418 data8 0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
419 data8 0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
420.size rem_4bit#,128
421.type rem_8bit#,\@object
422rem_8bit:
423 data1 0x00,0x00, 0x01,0xC2, 0x03,0x84, 0x02,0x46, 0x07,0x08, 0x06,0xCA, 0x04,0x8C, 0x05,0x4E
424 data1 0x0E,0x10, 0x0F,0xD2, 0x0D,0x94, 0x0C,0x56, 0x09,0x18, 0x08,0xDA, 0x0A,0x9C, 0x0B,0x5E
425 data1 0x1C,0x20, 0x1D,0xE2, 0x1F,0xA4, 0x1E,0x66, 0x1B,0x28, 0x1A,0xEA, 0x18,0xAC, 0x19,0x6E
426 data1 0x12,0x30, 0x13,0xF2, 0x11,0xB4, 0x10,0x76, 0x15,0x38, 0x14,0xFA, 0x16,0xBC, 0x17,0x7E
427 data1 0x38,0x40, 0x39,0x82, 0x3B,0xC4, 0x3A,0x06, 0x3F,0x48, 0x3E,0x8A, 0x3C,0xCC, 0x3D,0x0E
428 data1 0x36,0x50, 0x37,0x92, 0x35,0xD4, 0x34,0x16, 0x31,0x58, 0x30,0x9A, 0x32,0xDC, 0x33,0x1E
429 data1 0x24,0x60, 0x25,0xA2, 0x27,0xE4, 0x26,0x26, 0x23,0x68, 0x22,0xAA, 0x20,0xEC, 0x21,0x2E
430 data1 0x2A,0x70, 0x2B,0xB2, 0x29,0xF4, 0x28,0x36, 0x2D,0x78, 0x2C,0xBA, 0x2E,0xFC, 0x2F,0x3E
431 data1 0x70,0x80, 0x71,0x42, 0x73,0x04, 0x72,0xC6, 0x77,0x88, 0x76,0x4A, 0x74,0x0C, 0x75,0xCE
432 data1 0x7E,0x90, 0x7F,0x52, 0x7D,0x14, 0x7C,0xD6, 0x79,0x98, 0x78,0x5A, 0x7A,0x1C, 0x7B,0xDE
433 data1 0x6C,0xA0, 0x6D,0x62, 0x6F,0x24, 0x6E,0xE6, 0x6B,0xA8, 0x6A,0x6A, 0x68,0x2C, 0x69,0xEE
434 data1 0x62,0xB0, 0x63,0x72, 0x61,0x34, 0x60,0xF6, 0x65,0xB8, 0x64,0x7A, 0x66,0x3C, 0x67,0xFE
435 data1 0x48,0xC0, 0x49,0x02, 0x4B,0x44, 0x4A,0x86, 0x4F,0xC8, 0x4E,0x0A, 0x4C,0x4C, 0x4D,0x8E
436 data1 0x46,0xD0, 0x47,0x12, 0x45,0x54, 0x44,0x96, 0x41,0xD8, 0x40,0x1A, 0x42,0x5C, 0x43,0x9E
437 data1 0x54,0xE0, 0x55,0x22, 0x57,0x64, 0x56,0xA6, 0x53,0xE8, 0x52,0x2A, 0x50,0x6C, 0x51,0xAE
438 data1 0x5A,0xF0, 0x5B,0x32, 0x59,0x74, 0x58,0xB6, 0x5D,0xF8, 0x5C,0x3A, 0x5E,0x7C, 0x5F,0xBE
439 data1 0xE1,0x00, 0xE0,0xC2, 0xE2,0x84, 0xE3,0x46, 0xE6,0x08, 0xE7,0xCA, 0xE5,0x8C, 0xE4,0x4E
440 data1 0xEF,0x10, 0xEE,0xD2, 0xEC,0x94, 0xED,0x56, 0xE8,0x18, 0xE9,0xDA, 0xEB,0x9C, 0xEA,0x5E
441 data1 0xFD,0x20, 0xFC,0xE2, 0xFE,0xA4, 0xFF,0x66, 0xFA,0x28, 0xFB,0xEA, 0xF9,0xAC, 0xF8,0x6E
442 data1 0xF3,0x30, 0xF2,0xF2, 0xF0,0xB4, 0xF1,0x76, 0xF4,0x38, 0xF5,0xFA, 0xF7,0xBC, 0xF6,0x7E
443 data1 0xD9,0x40, 0xD8,0x82, 0xDA,0xC4, 0xDB,0x06, 0xDE,0x48, 0xDF,0x8A, 0xDD,0xCC, 0xDC,0x0E
444 data1 0xD7,0x50, 0xD6,0x92, 0xD4,0xD4, 0xD5,0x16, 0xD0,0x58, 0xD1,0x9A, 0xD3,0xDC, 0xD2,0x1E
445 data1 0xC5,0x60, 0xC4,0xA2, 0xC6,0xE4, 0xC7,0x26, 0xC2,0x68, 0xC3,0xAA, 0xC1,0xEC, 0xC0,0x2E
446 data1 0xCB,0x70, 0xCA,0xB2, 0xC8,0xF4, 0xC9,0x36, 0xCC,0x78, 0xCD,0xBA, 0xCF,0xFC, 0xCE,0x3E
447 data1 0x91,0x80, 0x90,0x42, 0x92,0x04, 0x93,0xC6, 0x96,0x88, 0x97,0x4A, 0x95,0x0C, 0x94,0xCE
448 data1 0x9F,0x90, 0x9E,0x52, 0x9C,0x14, 0x9D,0xD6, 0x98,0x98, 0x99,0x5A, 0x9B,0x1C, 0x9A,0xDE
449 data1 0x8D,0xA0, 0x8C,0x62, 0x8E,0x24, 0x8F,0xE6, 0x8A,0xA8, 0x8B,0x6A, 0x89,0x2C, 0x88,0xEE
450 data1 0x83,0xB0, 0x82,0x72, 0x80,0x34, 0x81,0xF6, 0x84,0xB8, 0x85,0x7A, 0x87,0x3C, 0x86,0xFE
451 data1 0xA9,0xC0, 0xA8,0x02, 0xAA,0x44, 0xAB,0x86, 0xAE,0xC8, 0xAF,0x0A, 0xAD,0x4C, 0xAC,0x8E
452 data1 0xA7,0xD0, 0xA6,0x12, 0xA4,0x54, 0xA5,0x96, 0xA0,0xD8, 0xA1,0x1A, 0xA3,0x5C, 0xA2,0x9E
453 data1 0xB5,0xE0, 0xB4,0x22, 0xB6,0x64, 0xB7,0xA6, 0xB2,0xE8, 0xB3,0x2A, 0xB1,0x6C, 0xB0,0xAE
454 data1 0xBB,0xF0, 0xBA,0x32, 0xB8,0x74, 0xB9,0xB6, 0xBC,0xF8, 0xBD,0x3A, 0xBF,0x7C, 0xBE,0xBE
455.size rem_8bit#,512
456stringz "GHASH for IA64, CRYPTOGAMS by <appro\@openssl.org>"
457___
458
459$code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm if ($big_endian);
460$code =~ s/\`([^\`]*)\`/eval $1/gem;
461
462print $code;
463close STDOUT;
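Both reduction tables embedded in this file (rem_4bit, used by the 4-bit paths here and in the other ports, and rem_8bit, used by the streamed "528B" path) follow from the same 0xE1 pattern: each set bit of the nibble or byte shifted out of Z contributes a shifted copy of that pattern to Z's top 16 bits. The short Perl sketch below regenerates both tables; rem_table is an illustrative helper name, not something from the deleted sources.

sub rem_table {                         # corrections for shifting Z right by
    my ($bits) = @_;                    # $bits positions (4 or 8)
    my @t;
    for my $r (0 .. (1 << $bits) - 1) {
        my $v = 0;
        for my $p (0 .. $bits - 1) {    # each set bit of the dropped value
            $v ^= 0xE100 >> ($bits - 1 - $p) if ($r >> $p) & 1;
        }
        push @t, $v;                    # 16-bit value for Z's top bits
    }
    return @t;
}

my @rem_4bit = rem_table(4);    # 0x0000, 0x1C20, 0x3840, 0x2460, ...
my @rem_8bit = rem_table(8);    # 0x0000, 0x01C2, 0x0384, 0x0246, ...
printf "%s\n", join(", ", map { sprintf "0x%04X", $_ } @rem_4bit);

The data8/data1 blocks above hold exactly these values, pre-positioned for the shift amounts the ia64 code uses: rem_4bit entries already sit in the top 16 bits of a 64-bit word, while rem_8bit entries are shifted left by 48 at run time.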
diff --git a/src/lib/libcrypto/modes/asm/ghash-parisc.pl b/src/lib/libcrypto/modes/asm/ghash-parisc.pl
deleted file mode 100644
index d5ad96b403..0000000000
--- a/src/lib/libcrypto/modes/asm/ghash-parisc.pl
+++ /dev/null
@@ -1,731 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# April 2010
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that it
14# uses 256 bytes per-key table [+128 bytes shared table]. On PA-7100LC
15# it processes one byte in 19.6 cycles, which is more than twice as
16# fast as code generated by gcc 3.2. PA-RISC 2.0 loop is scheduled for
17# 8 cycles, but measured performance on PA-8600 system is ~9 cycles per
18# processed byte. This is ~2.2x faster than 64-bit code generated by
19# vendor compiler (which used to be very hard to beat:-).
20#
21# Special thanks to polarhome.com for providing HP-UX account.
22
23$flavour = shift;
24$output = shift;
25open STDOUT,">$output";
26
27if ($flavour =~ /64/) {
28 $LEVEL ="2.0W";
29 $SIZE_T =8;
30 $FRAME_MARKER =80;
31 $SAVED_RP =16;
32 $PUSH ="std";
33 $PUSHMA ="std,ma";
34 $POP ="ldd";
35 $POPMB ="ldd,mb";
36 $NREGS =6;
37} else {
38 $LEVEL ="1.0"; #"\n\t.ALLOW\t2.0";
39 $SIZE_T =4;
40 $FRAME_MARKER =48;
41 $SAVED_RP =20;
42 $PUSH ="stw";
43 $PUSHMA ="stwm";
44 $POP ="ldw";
45 $POPMB ="ldwm";
46 $NREGS =11;
47}
48
49$FRAME=10*$SIZE_T+$FRAME_MARKER;# NREGS saved regs + frame marker
50 # [+ argument transfer]
51
52################# volatile registers
53$Xi="%r26"; # argument block
54$Htbl="%r25";
55$inp="%r24";
56$len="%r23";
57$Hhh=$Htbl; # variables
58$Hll="%r22";
59$Zhh="%r21";
60$Zll="%r20";
61$cnt="%r19";
62$rem_4bit="%r28";
63$rem="%r29";
64$mask0xf0="%r31";
65
66################# preserved registers
67$Thh="%r1";
68$Tll="%r2";
69$nlo="%r3";
70$nhi="%r4";
71$byte="%r5";
72if ($SIZE_T==4) {
73 $Zhl="%r6";
74 $Zlh="%r7";
75 $Hhl="%r8";
76 $Hlh="%r9";
77 $Thl="%r10";
78 $Tlh="%r11";
79}
80$rem2="%r6"; # used in PA-RISC 2.0 code
81
82$code.=<<___;
83 .LEVEL $LEVEL
84 .SPACE \$TEXT\$
85 .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
86
87 .EXPORT gcm_gmult_4bit,ENTRY,ARGW0=GR,ARGW1=GR
88 .ALIGN 64
89gcm_gmult_4bit
90 .PROC
91 .CALLINFO FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS
92 .ENTRY
93 $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
94 $PUSHMA %r3,$FRAME(%sp)
95 $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
96 $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
97 $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
98___
99$code.=<<___ if ($SIZE_T==4);
100 $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
101 $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
102 $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
103 $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
104 $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
105___
106$code.=<<___;
107 blr %r0,$rem_4bit
108 ldi 3,$rem
109L\$pic_gmult
110 andcm $rem_4bit,$rem,$rem_4bit
111 addl $inp,$len,$len
112 ldo L\$rem_4bit-L\$pic_gmult($rem_4bit),$rem_4bit
113 ldi 0xf0,$mask0xf0
114___
115$code.=<<___ if ($SIZE_T==4);
116 ldi 31,$rem
117 mtctl $rem,%cr11
118 extrd,u,*= $rem,%sar,1,$rem ; executes on PA-RISC 1.0
119 b L\$parisc1_gmult
120 nop
121___
122
123$code.=<<___;
124 ldb 15($Xi),$nlo
125 ldo 8($Htbl),$Hll
126
127 and $mask0xf0,$nlo,$nhi
128 depd,z $nlo,59,4,$nlo
129
130 ldd $nlo($Hll),$Zll
131 ldd $nlo($Hhh),$Zhh
132
133 depd,z $Zll,60,4,$rem
134 shrpd $Zhh,$Zll,4,$Zll
135 extrd,u $Zhh,59,60,$Zhh
136 ldb 14($Xi),$nlo
137
138 ldd $nhi($Hll),$Tll
139 ldd $nhi($Hhh),$Thh
140 and $mask0xf0,$nlo,$nhi
141 depd,z $nlo,59,4,$nlo
142
143 xor $Tll,$Zll,$Zll
144 xor $Thh,$Zhh,$Zhh
145 ldd $rem($rem_4bit),$rem
146 b L\$oop_gmult_pa2
147 ldi 13,$cnt
148
149 .ALIGN 8
150L\$oop_gmult_pa2
151 xor $rem,$Zhh,$Zhh ; moved here to work around gas bug
152 depd,z $Zll,60,4,$rem
153
154 shrpd $Zhh,$Zll,4,$Zll
155 extrd,u $Zhh,59,60,$Zhh
156 ldd $nlo($Hll),$Tll
157 ldd $nlo($Hhh),$Thh
158
159 xor $Tll,$Zll,$Zll
160 xor $Thh,$Zhh,$Zhh
161 ldd $rem($rem_4bit),$rem
162
163 xor $rem,$Zhh,$Zhh
164 depd,z $Zll,60,4,$rem
165 ldbx $cnt($Xi),$nlo
166
167 shrpd $Zhh,$Zll,4,$Zll
168 extrd,u $Zhh,59,60,$Zhh
169 ldd $nhi($Hll),$Tll
170 ldd $nhi($Hhh),$Thh
171
172 and $mask0xf0,$nlo,$nhi
173 depd,z $nlo,59,4,$nlo
174 ldd $rem($rem_4bit),$rem
175
176 xor $Tll,$Zll,$Zll
177 addib,uv -1,$cnt,L\$oop_gmult_pa2
178 xor $Thh,$Zhh,$Zhh
179
180 xor $rem,$Zhh,$Zhh
181 depd,z $Zll,60,4,$rem
182
183 shrpd $Zhh,$Zll,4,$Zll
184 extrd,u $Zhh,59,60,$Zhh
185 ldd $nlo($Hll),$Tll
186 ldd $nlo($Hhh),$Thh
187
188 xor $Tll,$Zll,$Zll
189 xor $Thh,$Zhh,$Zhh
190 ldd $rem($rem_4bit),$rem
191
192 xor $rem,$Zhh,$Zhh
193 depd,z $Zll,60,4,$rem
194
195 shrpd $Zhh,$Zll,4,$Zll
196 extrd,u $Zhh,59,60,$Zhh
197 ldd $nhi($Hll),$Tll
198 ldd $nhi($Hhh),$Thh
199
200 xor $Tll,$Zll,$Zll
201 xor $Thh,$Zhh,$Zhh
202 ldd $rem($rem_4bit),$rem
203
204 xor $rem,$Zhh,$Zhh
205 std $Zll,8($Xi)
206 std $Zhh,0($Xi)
207___
208
209$code.=<<___ if ($SIZE_T==4);
210 b L\$done_gmult
211 nop
212
213L\$parisc1_gmult
214 ldb 15($Xi),$nlo
215 ldo 12($Htbl),$Hll
216 ldo 8($Htbl),$Hlh
217 ldo 4($Htbl),$Hhl
218
219 and $mask0xf0,$nlo,$nhi
220 zdep $nlo,27,4,$nlo
221
222 ldwx $nlo($Hll),$Zll
223 ldwx $nlo($Hlh),$Zlh
224 ldwx $nlo($Hhl),$Zhl
225 ldwx $nlo($Hhh),$Zhh
226 zdep $Zll,28,4,$rem
227 ldb 14($Xi),$nlo
228 ldwx $rem($rem_4bit),$rem
229 shrpw $Zlh,$Zll,4,$Zll
230 ldwx $nhi($Hll),$Tll
231 shrpw $Zhl,$Zlh,4,$Zlh
232 ldwx $nhi($Hlh),$Tlh
233 shrpw $Zhh,$Zhl,4,$Zhl
234 ldwx $nhi($Hhl),$Thl
235 extru $Zhh,27,28,$Zhh
236 ldwx $nhi($Hhh),$Thh
237 xor $rem,$Zhh,$Zhh
238 and $mask0xf0,$nlo,$nhi
239 zdep $nlo,27,4,$nlo
240
241 xor $Tll,$Zll,$Zll
242 ldwx $nlo($Hll),$Tll
243 xor $Tlh,$Zlh,$Zlh
244 ldwx $nlo($Hlh),$Tlh
245 xor $Thl,$Zhl,$Zhl
246 b L\$oop_gmult_pa1
247 ldi 13,$cnt
248
249 .ALIGN 8
250L\$oop_gmult_pa1
251 zdep $Zll,28,4,$rem
252 ldwx $nlo($Hhl),$Thl
253 xor $Thh,$Zhh,$Zhh
254 ldwx $rem($rem_4bit),$rem
255 shrpw $Zlh,$Zll,4,$Zll
256 ldwx $nlo($Hhh),$Thh
257 shrpw $Zhl,$Zlh,4,$Zlh
258 ldbx $cnt($Xi),$nlo
259 xor $Tll,$Zll,$Zll
260 ldwx $nhi($Hll),$Tll
261 shrpw $Zhh,$Zhl,4,$Zhl
262 xor $Tlh,$Zlh,$Zlh
263 ldwx $nhi($Hlh),$Tlh
264 extru $Zhh,27,28,$Zhh
265 xor $Thl,$Zhl,$Zhl
266 ldwx $nhi($Hhl),$Thl
267 xor $rem,$Zhh,$Zhh
268 zdep $Zll,28,4,$rem
269 xor $Thh,$Zhh,$Zhh
270 ldwx $nhi($Hhh),$Thh
271 shrpw $Zlh,$Zll,4,$Zll
272 ldwx $rem($rem_4bit),$rem
273 shrpw $Zhl,$Zlh,4,$Zlh
274 shrpw $Zhh,$Zhl,4,$Zhl
275 and $mask0xf0,$nlo,$nhi
276 extru $Zhh,27,28,$Zhh
277 zdep $nlo,27,4,$nlo
278 xor $Tll,$Zll,$Zll
279 ldwx $nlo($Hll),$Tll
280 xor $Tlh,$Zlh,$Zlh
281 ldwx $nlo($Hlh),$Tlh
282 xor $rem,$Zhh,$Zhh
283 addib,uv -1,$cnt,L\$oop_gmult_pa1
284 xor $Thl,$Zhl,$Zhl
285
286 zdep $Zll,28,4,$rem
287 ldwx $nlo($Hhl),$Thl
288 xor $Thh,$Zhh,$Zhh
289 ldwx $rem($rem_4bit),$rem
290 shrpw $Zlh,$Zll,4,$Zll
291 ldwx $nlo($Hhh),$Thh
292 shrpw $Zhl,$Zlh,4,$Zlh
293 xor $Tll,$Zll,$Zll
294 ldwx $nhi($Hll),$Tll
295 shrpw $Zhh,$Zhl,4,$Zhl
296 xor $Tlh,$Zlh,$Zlh
297 ldwx $nhi($Hlh),$Tlh
298 extru $Zhh,27,28,$Zhh
299 xor $rem,$Zhh,$Zhh
300 xor $Thl,$Zhl,$Zhl
301 ldwx $nhi($Hhl),$Thl
302 xor $Thh,$Zhh,$Zhh
303 ldwx $nhi($Hhh),$Thh
304 zdep $Zll,28,4,$rem
305 ldwx $rem($rem_4bit),$rem
306 shrpw $Zlh,$Zll,4,$Zll
307 shrpw $Zhl,$Zlh,4,$Zlh
308 shrpw $Zhh,$Zhl,4,$Zhl
309 extru $Zhh,27,28,$Zhh
310 xor $Tll,$Zll,$Zll
311 xor $Tlh,$Zlh,$Zlh
312 xor $rem,$Zhh,$Zhh
313 stw $Zll,12($Xi)
314 xor $Thl,$Zhl,$Zhl
315 stw $Zlh,8($Xi)
316 xor $Thh,$Zhh,$Zhh
317 stw $Zhl,4($Xi)
318 stw $Zhh,0($Xi)
319___
320$code.=<<___;
321L\$done_gmult
322 $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
323 $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
324 $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
325 $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
326___
327$code.=<<___ if ($SIZE_T==4);
328 $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
329 $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
330 $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
331 $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
332 $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
333___
334$code.=<<___;
335 bv (%r2)
336 .EXIT
337 $POPMB -$FRAME(%sp),%r3
338 .PROCEND
339
340 .EXPORT gcm_ghash_4bit,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
341 .ALIGN 64
342gcm_ghash_4bit
343 .PROC
344 .CALLINFO FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=11
345 .ENTRY
346 $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
347 $PUSHMA %r3,$FRAME(%sp)
348 $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
349 $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
350 $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
351___
352$code.=<<___ if ($SIZE_T==4);
353 $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
354 $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
355 $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
356 $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
357 $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
358___
359$code.=<<___;
360 blr %r0,$rem_4bit
361 ldi 3,$rem
362L\$pic_ghash
363 andcm $rem_4bit,$rem,$rem_4bit
364 addl $inp,$len,$len
365 ldo L\$rem_4bit-L\$pic_ghash($rem_4bit),$rem_4bit
366 ldi 0xf0,$mask0xf0
367___
368$code.=<<___ if ($SIZE_T==4);
369 ldi 31,$rem
370 mtctl $rem,%cr11
371 extrd,u,*= $rem,%sar,1,$rem ; executes on PA-RISC 1.0
372 b L\$parisc1_ghash
373 nop
374___
375
376$code.=<<___;
377 ldb 15($Xi),$nlo
378 ldo 8($Htbl),$Hll
379
380L\$outer_ghash_pa2
381 ldb 15($inp),$nhi
382 xor $nhi,$nlo,$nlo
383 and $mask0xf0,$nlo,$nhi
384 depd,z $nlo,59,4,$nlo
385
386 ldd $nlo($Hll),$Zll
387 ldd $nlo($Hhh),$Zhh
388
389 depd,z $Zll,60,4,$rem
390 shrpd $Zhh,$Zll,4,$Zll
391 extrd,u $Zhh,59,60,$Zhh
392 ldb 14($Xi),$nlo
393 ldb 14($inp),$byte
394
395 ldd $nhi($Hll),$Tll
396 ldd $nhi($Hhh),$Thh
397 xor $byte,$nlo,$nlo
398 and $mask0xf0,$nlo,$nhi
399 depd,z $nlo,59,4,$nlo
400
401 xor $Tll,$Zll,$Zll
402 xor $Thh,$Zhh,$Zhh
403 ldd $rem($rem_4bit),$rem
404 b L\$oop_ghash_pa2
405 ldi 13,$cnt
406
407 .ALIGN 8
408L\$oop_ghash_pa2
409 xor $rem,$Zhh,$Zhh ; moved here to work around gas bug
410 depd,z $Zll,60,4,$rem2
411
412 shrpd $Zhh,$Zll,4,$Zll
413 extrd,u $Zhh,59,60,$Zhh
414 ldd $nlo($Hll),$Tll
415 ldd $nlo($Hhh),$Thh
416
417 xor $Tll,$Zll,$Zll
418 xor $Thh,$Zhh,$Zhh
419 ldbx $cnt($Xi),$nlo
420 ldbx $cnt($inp),$byte
421
422 depd,z $Zll,60,4,$rem
423 shrpd $Zhh,$Zll,4,$Zll
424 ldd $rem2($rem_4bit),$rem2
425
426 xor $rem2,$Zhh,$Zhh
427 xor $byte,$nlo,$nlo
428 ldd $nhi($Hll),$Tll
429 ldd $nhi($Hhh),$Thh
430
431 and $mask0xf0,$nlo,$nhi
432 depd,z $nlo,59,4,$nlo
433
434 extrd,u $Zhh,59,60,$Zhh
435 xor $Tll,$Zll,$Zll
436
437 ldd $rem($rem_4bit),$rem
438 addib,uv -1,$cnt,L\$oop_ghash_pa2
439 xor $Thh,$Zhh,$Zhh
440
441 xor $rem,$Zhh,$Zhh
442 depd,z $Zll,60,4,$rem2
443
444 shrpd $Zhh,$Zll,4,$Zll
445 extrd,u $Zhh,59,60,$Zhh
446 ldd $nlo($Hll),$Tll
447 ldd $nlo($Hhh),$Thh
448
449 xor $Tll,$Zll,$Zll
450 xor $Thh,$Zhh,$Zhh
451
452 depd,z $Zll,60,4,$rem
453 shrpd $Zhh,$Zll,4,$Zll
454 ldd $rem2($rem_4bit),$rem2
455
456 xor $rem2,$Zhh,$Zhh
457 ldd $nhi($Hll),$Tll
458 ldd $nhi($Hhh),$Thh
459
460 extrd,u $Zhh,59,60,$Zhh
461 xor $Tll,$Zll,$Zll
462 xor $Thh,$Zhh,$Zhh
463 ldd $rem($rem_4bit),$rem
464
465 xor $rem,$Zhh,$Zhh
466 std $Zll,8($Xi)
467 ldo 16($inp),$inp
468 std $Zhh,0($Xi)
469 cmpb,*<> $inp,$len,L\$outer_ghash_pa2
470 copy $Zll,$nlo
471___
472
473$code.=<<___ if ($SIZE_T==4);
474 b L\$done_ghash
475 nop
476
477L\$parisc1_ghash
478 ldb 15($Xi),$nlo
479 ldo 12($Htbl),$Hll
480 ldo 8($Htbl),$Hlh
481 ldo 4($Htbl),$Hhl
482
483L\$outer_ghash_pa1
484 ldb 15($inp),$byte
485 xor $byte,$nlo,$nlo
486 and $mask0xf0,$nlo,$nhi
487 zdep $nlo,27,4,$nlo
488
489 ldwx $nlo($Hll),$Zll
490 ldwx $nlo($Hlh),$Zlh
491 ldwx $nlo($Hhl),$Zhl
492 ldwx $nlo($Hhh),$Zhh
493 zdep $Zll,28,4,$rem
494 ldb 14($Xi),$nlo
495 ldb 14($inp),$byte
496 ldwx $rem($rem_4bit),$rem
497 shrpw $Zlh,$Zll,4,$Zll
498 ldwx $nhi($Hll),$Tll
499 shrpw $Zhl,$Zlh,4,$Zlh
500 ldwx $nhi($Hlh),$Tlh
501 shrpw $Zhh,$Zhl,4,$Zhl
502 ldwx $nhi($Hhl),$Thl
503 extru $Zhh,27,28,$Zhh
504 ldwx $nhi($Hhh),$Thh
505 xor $byte,$nlo,$nlo
506 xor $rem,$Zhh,$Zhh
507 and $mask0xf0,$nlo,$nhi
508 zdep $nlo,27,4,$nlo
509
510 xor $Tll,$Zll,$Zll
511 ldwx $nlo($Hll),$Tll
512 xor $Tlh,$Zlh,$Zlh
513 ldwx $nlo($Hlh),$Tlh
514 xor $Thl,$Zhl,$Zhl
515 b L\$oop_ghash_pa1
516 ldi 13,$cnt
517
518 .ALIGN 8
519L\$oop_ghash_pa1
520 zdep $Zll,28,4,$rem
521 ldwx $nlo($Hhl),$Thl
522 xor $Thh,$Zhh,$Zhh
523 ldwx $rem($rem_4bit),$rem
524 shrpw $Zlh,$Zll,4,$Zll
525 ldwx $nlo($Hhh),$Thh
526 shrpw $Zhl,$Zlh,4,$Zlh
527 ldbx $cnt($Xi),$nlo
528 xor $Tll,$Zll,$Zll
529 ldwx $nhi($Hll),$Tll
530 shrpw $Zhh,$Zhl,4,$Zhl
531 ldbx $cnt($inp),$byte
532 xor $Tlh,$Zlh,$Zlh
533 ldwx $nhi($Hlh),$Tlh
534 extru $Zhh,27,28,$Zhh
535 xor $Thl,$Zhl,$Zhl
536 ldwx $nhi($Hhl),$Thl
537 xor $rem,$Zhh,$Zhh
538 zdep $Zll,28,4,$rem
539 xor $Thh,$Zhh,$Zhh
540 ldwx $nhi($Hhh),$Thh
541 shrpw $Zlh,$Zll,4,$Zll
542 ldwx $rem($rem_4bit),$rem
543 shrpw $Zhl,$Zlh,4,$Zlh
544 xor $byte,$nlo,$nlo
545 shrpw $Zhh,$Zhl,4,$Zhl
546 and $mask0xf0,$nlo,$nhi
547 extru $Zhh,27,28,$Zhh
548 zdep $nlo,27,4,$nlo
549 xor $Tll,$Zll,$Zll
550 ldwx $nlo($Hll),$Tll
551 xor $Tlh,$Zlh,$Zlh
552 ldwx $nlo($Hlh),$Tlh
553 xor $rem,$Zhh,$Zhh
554 addib,uv -1,$cnt,L\$oop_ghash_pa1
555 xor $Thl,$Zhl,$Zhl
556
557 zdep $Zll,28,4,$rem
558 ldwx $nlo($Hhl),$Thl
559 xor $Thh,$Zhh,$Zhh
560 ldwx $rem($rem_4bit),$rem
561 shrpw $Zlh,$Zll,4,$Zll
562 ldwx $nlo($Hhh),$Thh
563 shrpw $Zhl,$Zlh,4,$Zlh
564 xor $Tll,$Zll,$Zll
565 ldwx $nhi($Hll),$Tll
566 shrpw $Zhh,$Zhl,4,$Zhl
567 xor $Tlh,$Zlh,$Zlh
568 ldwx $nhi($Hlh),$Tlh
569 extru $Zhh,27,28,$Zhh
570 xor $rem,$Zhh,$Zhh
571 xor $Thl,$Zhl,$Zhl
572 ldwx $nhi($Hhl),$Thl
573 xor $Thh,$Zhh,$Zhh
574 ldwx $nhi($Hhh),$Thh
575 zdep $Zll,28,4,$rem
576 ldwx $rem($rem_4bit),$rem
577 shrpw $Zlh,$Zll,4,$Zll
578 shrpw $Zhl,$Zlh,4,$Zlh
579 shrpw $Zhh,$Zhl,4,$Zhl
580 extru $Zhh,27,28,$Zhh
581 xor $Tll,$Zll,$Zll
582 xor $Tlh,$Zlh,$Zlh
583 xor $rem,$Zhh,$Zhh
584 stw $Zll,12($Xi)
585 xor $Thl,$Zhl,$Zhl
586 stw $Zlh,8($Xi)
587 xor $Thh,$Zhh,$Zhh
588 stw $Zhl,4($Xi)
589 ldo 16($inp),$inp
590 stw $Zhh,0($Xi)
591 comb,<> $inp,$len,L\$outer_ghash_pa1
592 copy $Zll,$nlo
593___
594$code.=<<___;
595L\$done_ghash
596 $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
597 $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
598 $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
599 $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
600___
601$code.=<<___ if ($SIZE_T==4);
602 $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
603 $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
604 $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
605 $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
606 $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
607___
608$code.=<<___;
609 bv (%r2)
610 .EXIT
611 $POPMB -$FRAME(%sp),%r3
612 .PROCEND
613
614 .ALIGN 64
615L\$rem_4bit
616 .WORD `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
617 .WORD `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
618 .WORD `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
619 .WORD `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
620	.STRINGZ "GHASH for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
621 .ALIGN 64
622___
623
624# Explicitly encode PA-RISC 2.0 instructions used in this module, so
625# that it can be compiled with .LEVEL 1.0. It should be noted that I
626# wouldn't have to do this if the GNU assembler understood the .ALLOW 2.0
627# directive...
628
629my $ldd = sub {
630 my ($mod,$args) = @_;
631 my $orig = "ldd$mod\t$args";
632
633 if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 4
634 { my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
635 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
636 }
637 elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 5
638 { my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3;
639 $opcode|=(($1&0xF)<<17)|(($1&0x10)<<12); # encode offset
640 $opcode|=(1<<5) if ($mod =~ /^,m/);
641 $opcode|=(1<<13) if ($mod =~ /^,mb/);
642 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
643 }
644 else { "\t".$orig; }
645};
646
647my $std = sub {
648 my ($mod,$args) = @_;
649 my $orig = "std$mod\t$args";
650
651 if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices
652 { my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1);
653 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
654 }
655 else { "\t".$orig; }
656};
657
658my $extrd = sub {
659 my ($mod,$args) = @_;
660 my $orig = "extrd$mod\t$args";
661
662 # I only have ",u" completer, it's implicitly encoded...
663 if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15
664 { my $opcode=(0x36<<26)|($1<<21)|($4<<16);
665 my $len=32-$3;
666 $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos
667 $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
668 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
669 }
670 elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12
671 { my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
672 my $len=32-$2;
673 $opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len
674 $opcode |= (1<<13) if ($mod =~ /,\**=/);
675 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
676 }
677 else { "\t".$orig; }
678};
679
680my $shrpd = sub {
681 my ($mod,$args) = @_;
682 my $orig = "shrpd$mod\t$args";
683
684 if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14
685 { my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
686 my $cpos=63-$3;
687 $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa
688 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
689 }
690 elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) # format 11
691 { sprintf "\t.WORD\t0x%08x\t; %s",
692 (0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig;
693 }
694 else { "\t".$orig; }
695};
696
697my $depd = sub {
698 my ($mod,$args) = @_;
699 my $orig = "depd$mod\t$args";
700
701	# I only have ",z" completer, it's implicitly encoded...
702 if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 16
703 { my $opcode=(0x3c<<26)|($4<<21)|($1<<16);
704 my $cpos=63-$2;
705 my $len=32-$3;
706 $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode pos
707 $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
708 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
709 }
710 else { "\t".$orig; }
711};
712
713sub assemble {
714 my ($mnemonic,$mod,$args)=@_;
715 my $opcode = eval("\$$mnemonic");
716
717 ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
718}
719
720foreach (split("\n",$code)) {
721 s/\`([^\`]*)\`/eval $1/ge;
722 if ($SIZE_T==4) {
723 s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e;
724 s/cmpb,\*/comb,/;
725 s/,\*/,/;
726 }
727 s/\bbv\b/bve/ if ($SIZE_T==8);
728 print $_,"\n";
729}
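# Editorial sketch (not part of the original module): the SIZE_T==4 branch
# in the loop above downgrades PA-RISC 2.0 mnemonics for 32-bit builds,
# e.g. rewriting "cmpb,*" into "comb," and dropping remaining ",*"
# completers. A minimal standalone illustration, using a hypothetical
# instruction line:
if (0) {	# never executed here; demonstration only
	my $line = "\tcmpb,*<> %r1,%r2,L\$done";
	$line =~ s/cmpb,\*/comb,/;	# 2.0 compare-and-branch -> 1.1 comb
	$line =~ s/,\*/,/;		# strip any other 2.0 ",*" completer
	print STDERR $line,"\n";	# prints "\tcomb,<> %r1,%r2,L$done"
}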
730
731close STDOUT;
diff --git a/src/lib/libcrypto/modes/asm/ghash-s390x.pl b/src/lib/libcrypto/modes/asm/ghash-s390x.pl
deleted file mode 100644
index 6a40d5d89c..0000000000
--- a/src/lib/libcrypto/modes/asm/ghash-s390x.pl
+++ /dev/null
@@ -1,262 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# September 2010.
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that it
14# uses 256 bytes per-key table [+128 bytes shared table]. Performance
15# was measured to be ~18 cycles per processed byte on z10, which is
16# almost 40% better than gcc-generated code. It should be noted that
17# 18 cycles is a worse result than expected: the loop is scheduled for 12
18# and the result should be close to 12. In the absence of instruction-
19# level profiling data it's impossible to tell why...
20
21# November 2010.
22#
23# Adapt for -m31 build. If kernel supports what's called "highgprs"
24# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
25# instructions and achieve "64-bit" performance even in 31-bit legacy
26# application context. The feature is not specific to any particular
27# processor, as long as it's "z-CPU". Latter implies that the code
28# remains z/Architecture specific. On z990 it was measured to perform
29# 2.8x better than 32-bit code generated by gcc 4.3.
30
31# March 2011.
32#
33# Support for hardware KIMD-GHASH is verified to produce correct
34# result and therefore is engaged. On z196 it was measured to process
35# 8KB buffer ~7x faster than software implementation. It's not as
36# impressive for smaller buffer sizes, and for the smallest 16-byte buffer
37# it's actually almost 2 times slower, which is the reason why
38# KIMD-GHASH is not used in gcm_gmult_4bit.
39
40$flavour = shift;
41
42if ($flavour =~ /3[12]/) {
43 $SIZE_T=4;
44 $g="";
45} else {
46 $SIZE_T=8;
47 $g="g";
48}
49
50while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
51open STDOUT,">$output";
52
53$softonly=0;
54
55$Zhi="%r0";
56$Zlo="%r1";
57
58$Xi="%r2"; # argument block
59$Htbl="%r3";
60$inp="%r4";
61$len="%r5";
62
63$rem0="%r6"; # variables
64$rem1="%r7";
65$nlo="%r8";
66$nhi="%r9";
67$xi="%r10";
68$cnt="%r11";
69$tmp="%r12";
70$x78="%r13";
71$rem_4bit="%r14";
72
73$sp="%r15";
74
75$code.=<<___;
76.text
77
78.globl gcm_gmult_4bit
79.align 32
80gcm_gmult_4bit:
81___
82$code.=<<___ if(!$softonly && 0); # hardware is slow for single block...
83 larl %r1,OPENSSL_s390xcap_P
84 lg %r0,0(%r1)
85 tmhl %r0,0x4000 # check for message-security-assist
86 jz .Lsoft_gmult
87 lghi %r0,0
88 la %r1,16($sp)
89 .long 0xb93e0004 # kimd %r0,%r4
90 lg %r1,24($sp)
91 tmhh %r1,0x4000 # check for function 65
92 jz .Lsoft_gmult
93 stg %r0,16($sp) # arrange 16 bytes of zero input
94 stg %r0,24($sp)
95 lghi %r0,65 # function 65
96 la %r1,0($Xi) # H lies right after Xi in gcm128_context
97 la $inp,16($sp)
98 lghi $len,16
99 .long 0xb93e0004 # kimd %r0,$inp
100 brc 1,.-4 # pay attention to "partial completion"
101 br %r14
102.align 32
103.Lsoft_gmult:
104___
105$code.=<<___;
106 stm${g} %r6,%r14,6*$SIZE_T($sp)
107
108 aghi $Xi,-1
109 lghi $len,1
110 lghi $x78,`0xf<<3`
111 larl $rem_4bit,rem_4bit
112
113 lg $Zlo,8+1($Xi) # Xi
114 j .Lgmult_shortcut
115.type gcm_gmult_4bit,\@function
116.size gcm_gmult_4bit,(.-gcm_gmult_4bit)
117
118.globl gcm_ghash_4bit
119.align 32
120gcm_ghash_4bit:
121___
122$code.=<<___ if(!$softonly);
123 larl %r1,OPENSSL_s390xcap_P
124 lg %r0,0(%r1)
125 tmhl %r0,0x4000 # check for message-security-assist
126 jz .Lsoft_ghash
127 lghi %r0,0
128 la %r1,16($sp)
129 .long 0xb93e0004 # kimd %r0,%r4
130 lg %r1,24($sp)
131 tmhh %r1,0x4000 # check for function 65
132 jz .Lsoft_ghash
133 lghi %r0,65 # function 65
134 la %r1,0($Xi) # H lies right after Xi in gcm128_context
135 .long 0xb93e0004 # kimd %r0,$inp
136 brc 1,.-4 # pay attention to "partial completion"
137 br %r14
138.align 32
139.Lsoft_ghash:
140___
141$code.=<<___ if ($flavour =~ /3[12]/);
142 llgfr $len,$len
143___
144$code.=<<___;
145 stm${g} %r6,%r14,6*$SIZE_T($sp)
146
147 aghi $Xi,-1
148 srlg $len,$len,4
149 lghi $x78,`0xf<<3`
150 larl $rem_4bit,rem_4bit
151
152 lg $Zlo,8+1($Xi) # Xi
153 lg $Zhi,0+1($Xi)
154 lghi $tmp,0
155.Louter:
156 xg $Zhi,0($inp) # Xi ^= inp
157 xg $Zlo,8($inp)
158 xgr $Zhi,$tmp
159 stg $Zlo,8+1($Xi)
160 stg $Zhi,0+1($Xi)
161
162.Lgmult_shortcut:
163 lghi $tmp,0xf0
164 sllg $nlo,$Zlo,4
165 srlg $xi,$Zlo,8 # extract second byte
166 ngr $nlo,$tmp
167 lgr $nhi,$Zlo
168 lghi $cnt,14
169 ngr $nhi,$tmp
170
171 lg $Zlo,8($nlo,$Htbl)
172 lg $Zhi,0($nlo,$Htbl)
173
174 sllg $nlo,$xi,4
175 sllg $rem0,$Zlo,3
176 ngr $nlo,$tmp
177 ngr $rem0,$x78
178 ngr $xi,$tmp
179
180 sllg $tmp,$Zhi,60
181 srlg $Zlo,$Zlo,4
182 srlg $Zhi,$Zhi,4
183 xg $Zlo,8($nhi,$Htbl)
184 xg $Zhi,0($nhi,$Htbl)
185 lgr $nhi,$xi
186 sllg $rem1,$Zlo,3
187 xgr $Zlo,$tmp
188 ngr $rem1,$x78
189 j .Lghash_inner
190.align 16
191.Lghash_inner:
192 srlg $Zlo,$Zlo,4
193 sllg $tmp,$Zhi,60
194 xg $Zlo,8($nlo,$Htbl)
195 srlg $Zhi,$Zhi,4
196 llgc $xi,0($cnt,$Xi)
197 xg $Zhi,0($nlo,$Htbl)
198 sllg $nlo,$xi,4
199 xg $Zhi,0($rem0,$rem_4bit)
200 nill $nlo,0xf0
201 sllg $rem0,$Zlo,3
202 xgr $Zlo,$tmp
203 ngr $rem0,$x78
204 nill $xi,0xf0
205
206 sllg $tmp,$Zhi,60
207 srlg $Zlo,$Zlo,4
208 srlg $Zhi,$Zhi,4
209 xg $Zlo,8($nhi,$Htbl)
210 xg $Zhi,0($nhi,$Htbl)
211 lgr $nhi,$xi
212 xg $Zhi,0($rem1,$rem_4bit)
213 sllg $rem1,$Zlo,3
214 xgr $Zlo,$tmp
215 ngr $rem1,$x78
216 brct $cnt,.Lghash_inner
217
218 sllg $tmp,$Zhi,60
219 srlg $Zlo,$Zlo,4
220 srlg $Zhi,$Zhi,4
221 xg $Zlo,8($nlo,$Htbl)
222 xg $Zhi,0($nlo,$Htbl)
223 sllg $xi,$Zlo,3
224 xg $Zhi,0($rem0,$rem_4bit)
225 xgr $Zlo,$tmp
226 ngr $xi,$x78
227
228 sllg $tmp,$Zhi,60
229 srlg $Zlo,$Zlo,4
230 srlg $Zhi,$Zhi,4
231 xg $Zlo,8($nhi,$Htbl)
232 xg $Zhi,0($nhi,$Htbl)
233 xgr $Zlo,$tmp
234 xg $Zhi,0($rem1,$rem_4bit)
235
236 lg $tmp,0($xi,$rem_4bit)
237 la $inp,16($inp)
238 sllg $tmp,$tmp,4 # correct last rem_4bit[rem]
239 brctg $len,.Louter
240
241 xgr $Zhi,$tmp
242 stg $Zlo,8+1($Xi)
243 stg $Zhi,0+1($Xi)
244 lm${g} %r6,%r14,6*$SIZE_T($sp)
245 br %r14
246.type gcm_ghash_4bit,\@function
247.size gcm_ghash_4bit,(.-gcm_ghash_4bit)
248
249.align 64
250rem_4bit:
251 .long `0x0000<<12`,0,`0x1C20<<12`,0,`0x3840<<12`,0,`0x2460<<12`,0
252 .long `0x7080<<12`,0,`0x6CA0<<12`,0,`0x48C0<<12`,0,`0x54E0<<12`,0
253 .long `0xE100<<12`,0,`0xFD20<<12`,0,`0xD940<<12`,0,`0xC560<<12`,0
254 .long `0x9180<<12`,0,`0x8DA0<<12`,0,`0xA9C0<<12`,0,`0xB5E0<<12`,0
255.type rem_4bit,\@object
256.size rem_4bit,(.-rem_4bit)
257.string "GHASH for s390x, CRYPTOGAMS by <appro\@openssl.org>"
258___
259
260$code =~ s/\`([^\`]*)\`/eval $1/gem;
261print $code;
262close STDOUT;
diff --git a/src/lib/libcrypto/modes/asm/ghash-sparcv9.pl b/src/lib/libcrypto/modes/asm/ghash-sparcv9.pl
deleted file mode 100644
index 70e7b044a3..0000000000
--- a/src/lib/libcrypto/modes/asm/ghash-sparcv9.pl
+++ /dev/null
@@ -1,330 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# March 2010
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that it
14# uses 256 bytes per-key table [+128 bytes shared table]. Performance
15# results are for streamed GHASH subroutine on UltraSPARC pre-Tx CPU
16# and are expressed in cycles per processed byte, less is better:
17#
18# gcc 3.3.x cc 5.2 this assembler
19#
20# 32-bit build 81.4 43.3 12.6 (+546%/+244%)
21# 64-bit build 20.2 21.2 12.6 (+60%/+68%)
22#
23# Here is data collected on UltraSPARC T1 system running Linux:
24#
25# gcc 4.4.1 this assembler
26#
27# 32-bit build 566 50 (+1000%)
28# 64-bit build 56 50 (+12%)
29#
30# I don't quite understand why the difference between 32-bit and 64-bit
31# compiler-generated code is so big. Compilers *were* instructed to
32# generate code for UltraSPARC and should have used 64-bit registers
33# for Z vector (see C code) even in 32-bit build... Oh well, it only
34# means more impressive improvement coefficients for this assembler
35# module;-) Loops are aggressively modulo-scheduled in respect to
36# references to input data and Z.hi updates to achieve 12 cycles
37# timing. To anchor to something else, sha1-sparcv9.pl spends 11.6
38# cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1.
39
40$bits=32;
41for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
42if ($bits==64) { $bias=2047; $frame=192; }
43else { $bias=0; $frame=112; }
44
45$output=shift;
46open STDOUT,">$output";
47
48$Zhi="%o0"; # 64-bit values
49$Zlo="%o1";
50$Thi="%o2";
51$Tlo="%o3";
52$rem="%o4";
53$tmp="%o5";
54
55$nhi="%l0"; # small values and pointers
56$nlo="%l1";
57$xi0="%l2";
58$xi1="%l3";
59$rem_4bit="%l4";
60$remi="%l5";
61$Htblo="%l6";
62$cnt="%l7";
63
64$Xi="%i0"; # input argument block
65$Htbl="%i1";
66$inp="%i2";
67$len="%i3";
68
69$code.=<<___;
70.section ".text",#alloc,#execinstr
71
72.align 64
73rem_4bit:
74 .long `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
75 .long `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
76 .long `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
77 .long `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
78.type rem_4bit,#object
79.size rem_4bit,(.-rem_4bit)
80
81.globl gcm_ghash_4bit
82.align 32
83gcm_ghash_4bit:
84 save %sp,-$frame,%sp
85 ldub [$inp+15],$nlo
86 ldub [$Xi+15],$xi0
87 ldub [$Xi+14],$xi1
88 add $len,$inp,$len
89 add $Htbl,8,$Htblo
90
911: call .+8
92 add %o7,rem_4bit-1b,$rem_4bit
93
94.Louter:
95 xor $xi0,$nlo,$nlo
96 and $nlo,0xf0,$nhi
97 and $nlo,0x0f,$nlo
98 sll $nlo,4,$nlo
99 ldx [$Htblo+$nlo],$Zlo
100 ldx [$Htbl+$nlo],$Zhi
101
102 ldub [$inp+14],$nlo
103
104 ldx [$Htblo+$nhi],$Tlo
105 and $Zlo,0xf,$remi
106 ldx [$Htbl+$nhi],$Thi
107 sll $remi,3,$remi
108 ldx [$rem_4bit+$remi],$rem
109 srlx $Zlo,4,$Zlo
110 mov 13,$cnt
111 sllx $Zhi,60,$tmp
112 xor $Tlo,$Zlo,$Zlo
113 srlx $Zhi,4,$Zhi
114 xor $Zlo,$tmp,$Zlo
115
116 xor $xi1,$nlo,$nlo
117 and $Zlo,0xf,$remi
118 and $nlo,0xf0,$nhi
119 and $nlo,0x0f,$nlo
120 ba .Lghash_inner
121 sll $nlo,4,$nlo
122.align 32
123.Lghash_inner:
124 ldx [$Htblo+$nlo],$Tlo
125 sll $remi,3,$remi
126 xor $Thi,$Zhi,$Zhi
127 ldx [$Htbl+$nlo],$Thi
128 srlx $Zlo,4,$Zlo
129 xor $rem,$Zhi,$Zhi
130 ldx [$rem_4bit+$remi],$rem
131 sllx $Zhi,60,$tmp
132 xor $Tlo,$Zlo,$Zlo
133 ldub [$inp+$cnt],$nlo
134 srlx $Zhi,4,$Zhi
135 xor $Zlo,$tmp,$Zlo
136 ldub [$Xi+$cnt],$xi1
137 xor $Thi,$Zhi,$Zhi
138 and $Zlo,0xf,$remi
139
140 ldx [$Htblo+$nhi],$Tlo
141 sll $remi,3,$remi
142 xor $rem,$Zhi,$Zhi
143 ldx [$Htbl+$nhi],$Thi
144 srlx $Zlo,4,$Zlo
145 ldx [$rem_4bit+$remi],$rem
146 sllx $Zhi,60,$tmp
147 xor $xi1,$nlo,$nlo
148 srlx $Zhi,4,$Zhi
149 and $nlo,0xf0,$nhi
150 addcc $cnt,-1,$cnt
151 xor $Zlo,$tmp,$Zlo
152 and $nlo,0x0f,$nlo
153 xor $Tlo,$Zlo,$Zlo
154 sll $nlo,4,$nlo
155 blu .Lghash_inner
156 and $Zlo,0xf,$remi
157
158 ldx [$Htblo+$nlo],$Tlo
159 sll $remi,3,$remi
160 xor $Thi,$Zhi,$Zhi
161 ldx [$Htbl+$nlo],$Thi
162 srlx $Zlo,4,$Zlo
163 xor $rem,$Zhi,$Zhi
164 ldx [$rem_4bit+$remi],$rem
165 sllx $Zhi,60,$tmp
166 xor $Tlo,$Zlo,$Zlo
167 srlx $Zhi,4,$Zhi
168 xor $Zlo,$tmp,$Zlo
169 xor $Thi,$Zhi,$Zhi
170
171 add $inp,16,$inp
172 cmp $inp,$len
173 be,pn `$bits==64?"%xcc":"%icc"`,.Ldone
174 and $Zlo,0xf,$remi
175
176 ldx [$Htblo+$nhi],$Tlo
177 sll $remi,3,$remi
178 xor $rem,$Zhi,$Zhi
179 ldx [$Htbl+$nhi],$Thi
180 srlx $Zlo,4,$Zlo
181 ldx [$rem_4bit+$remi],$rem
182 sllx $Zhi,60,$tmp
183 xor $Tlo,$Zlo,$Zlo
184 ldub [$inp+15],$nlo
185 srlx $Zhi,4,$Zhi
186 xor $Zlo,$tmp,$Zlo
187 xor $Thi,$Zhi,$Zhi
188 stx $Zlo,[$Xi+8]
189 xor $rem,$Zhi,$Zhi
190 stx $Zhi,[$Xi]
191 srl $Zlo,8,$xi1
192 and $Zlo,0xff,$xi0
193 ba .Louter
194 and $xi1,0xff,$xi1
195.align 32
196.Ldone:
197 ldx [$Htblo+$nhi],$Tlo
198 sll $remi,3,$remi
199 xor $rem,$Zhi,$Zhi
200 ldx [$Htbl+$nhi],$Thi
201 srlx $Zlo,4,$Zlo
202 ldx [$rem_4bit+$remi],$rem
203 sllx $Zhi,60,$tmp
204 xor $Tlo,$Zlo,$Zlo
205 srlx $Zhi,4,$Zhi
206 xor $Zlo,$tmp,$Zlo
207 xor $Thi,$Zhi,$Zhi
208 stx $Zlo,[$Xi+8]
209 xor $rem,$Zhi,$Zhi
210 stx $Zhi,[$Xi]
211
212 ret
213 restore
214.type gcm_ghash_4bit,#function
215.size gcm_ghash_4bit,(.-gcm_ghash_4bit)
216___
217
218undef $inp;
219undef $len;
220
221$code.=<<___;
222.globl gcm_gmult_4bit
223.align 32
224gcm_gmult_4bit:
225 save %sp,-$frame,%sp
226 ldub [$Xi+15],$nlo
227 add $Htbl,8,$Htblo
228
2291: call .+8
230 add %o7,rem_4bit-1b,$rem_4bit
231
232 and $nlo,0xf0,$nhi
233 and $nlo,0x0f,$nlo
234 sll $nlo,4,$nlo
235 ldx [$Htblo+$nlo],$Zlo
236 ldx [$Htbl+$nlo],$Zhi
237
238 ldub [$Xi+14],$nlo
239
240 ldx [$Htblo+$nhi],$Tlo
241 and $Zlo,0xf,$remi
242 ldx [$Htbl+$nhi],$Thi
243 sll $remi,3,$remi
244 ldx [$rem_4bit+$remi],$rem
245 srlx $Zlo,4,$Zlo
246 mov 13,$cnt
247 sllx $Zhi,60,$tmp
248 xor $Tlo,$Zlo,$Zlo
249 srlx $Zhi,4,$Zhi
250 xor $Zlo,$tmp,$Zlo
251
252 and $Zlo,0xf,$remi
253 and $nlo,0xf0,$nhi
254 and $nlo,0x0f,$nlo
255 ba .Lgmult_inner
256 sll $nlo,4,$nlo
257.align 32
258.Lgmult_inner:
259 ldx [$Htblo+$nlo],$Tlo
260 sll $remi,3,$remi
261 xor $Thi,$Zhi,$Zhi
262 ldx [$Htbl+$nlo],$Thi
263 srlx $Zlo,4,$Zlo
264 xor $rem,$Zhi,$Zhi
265 ldx [$rem_4bit+$remi],$rem
266 sllx $Zhi,60,$tmp
267 xor $Tlo,$Zlo,$Zlo
268 ldub [$Xi+$cnt],$nlo
269 srlx $Zhi,4,$Zhi
270 xor $Zlo,$tmp,$Zlo
271 xor $Thi,$Zhi,$Zhi
272 and $Zlo,0xf,$remi
273
274 ldx [$Htblo+$nhi],$Tlo
275 sll $remi,3,$remi
276 xor $rem,$Zhi,$Zhi
277 ldx [$Htbl+$nhi],$Thi
278 srlx $Zlo,4,$Zlo
279 ldx [$rem_4bit+$remi],$rem
280 sllx $Zhi,60,$tmp
281 srlx $Zhi,4,$Zhi
282 and $nlo,0xf0,$nhi
283 addcc $cnt,-1,$cnt
284 xor $Zlo,$tmp,$Zlo
285 and $nlo,0x0f,$nlo
286 xor $Tlo,$Zlo,$Zlo
287 sll $nlo,4,$nlo
288 blu .Lgmult_inner
289 and $Zlo,0xf,$remi
290
291 ldx [$Htblo+$nlo],$Tlo
292 sll $remi,3,$remi
293 xor $Thi,$Zhi,$Zhi
294 ldx [$Htbl+$nlo],$Thi
295 srlx $Zlo,4,$Zlo
296 xor $rem,$Zhi,$Zhi
297 ldx [$rem_4bit+$remi],$rem
298 sllx $Zhi,60,$tmp
299 xor $Tlo,$Zlo,$Zlo
300 srlx $Zhi,4,$Zhi
301 xor $Zlo,$tmp,$Zlo
302 xor $Thi,$Zhi,$Zhi
303 and $Zlo,0xf,$remi
304
305 ldx [$Htblo+$nhi],$Tlo
306 sll $remi,3,$remi
307 xor $rem,$Zhi,$Zhi
308 ldx [$Htbl+$nhi],$Thi
309 srlx $Zlo,4,$Zlo
310 ldx [$rem_4bit+$remi],$rem
311 sllx $Zhi,60,$tmp
312 xor $Tlo,$Zlo,$Zlo
313 srlx $Zhi,4,$Zhi
314 xor $Zlo,$tmp,$Zlo
315 xor $Thi,$Zhi,$Zhi
316 stx $Zlo,[$Xi+8]
317 xor $rem,$Zhi,$Zhi
318 stx $Zhi,[$Xi]
319
320 ret
321 restore
322.type gcm_gmult_4bit,#function
323.size gcm_gmult_4bit,(.-gcm_gmult_4bit)
324.asciz "GHASH for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
325.align 4
326___
327
328$code =~ s/\`([^\`]*)\`/eval $1/gem;
329print $code;
330close STDOUT;
diff --git a/src/lib/libcrypto/modes/asm/ghash-x86.pl b/src/lib/libcrypto/modes/asm/ghash-x86.pl
deleted file mode 100644
index 83c727e07f..0000000000
--- a/src/lib/libcrypto/modes/asm/ghash-x86.pl
+++ /dev/null
@@ -1,1342 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# March, May, June 2010
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that it
14# uses 256 bytes per-key table [+64/128 bytes fixed table]. It has two
15# code paths: vanilla x86 and vanilla MMX. Former will be executed on
16# 486 and Pentium, latter on all others. MMX GHASH features so called
17# "528B" variant of "4-bit" method utilizing additional 256+16 bytes
18# of per-key storage [+512 bytes shared table]. Performance results
19# are for streamed GHASH subroutine and are expressed in cycles per
20# processed byte, less is better:
21#
22# gcc 2.95.3(*) MMX assembler x86 assembler
23#
24# Pentium 105/111(**) - 50
25# PIII 68 /75 12.2 24
26# P4 125/125 17.8 84(***)
27# Opteron 66 /70 10.1 30
28# Core2 54 /67 8.4 18
29#
30# (*)	gcc 3.4.x was observed to generate a few percent slower code,
31# which is one of reasons why 2.95.3 results were chosen,
32# another reason is lack of 3.4.x results for older CPUs;
33# comparison with MMX results is not completely fair, because C
34# results are for vanilla "256B" implementation, while
35# assembler results are for "528B";-)
36# (**)	second number is the result for code compiled with the -fPIC flag,
37# which is actually more relevant, because assembler code is
38# position-independent;
39# (***) see comment in non-MMX routine for further details;
40#
41# To summarize, it's >2-5 times faster than gcc-generated code. To
42# anchor it to something else SHA1 assembler processes one byte in
43# 11-13 cycles on contemporary x86 cores. As for choice of MMX in
44# particular, see comment at the end of the file...
45
46# May 2010
47#
48# Add PCLMULQDQ version performing at 2.10 cycles per processed byte.
49# The question is how close is it to theoretical limit? The pclmulqdq
50# instruction latency appears to be 14 cycles and there can't be more
51# than 2 of them executing at any given time. This means that single
52# Karatsuba multiplication would take 28 cycles *plus* few cycles for
53# pre- and post-processing. Then multiplication has to be followed by
54# modulo-reduction. Given that aggregated reduction method [see
55# "Carry-less Multiplication and Its Usage for Computing the GCM Mode"
56# white paper by Intel] allows you to perform reduction only once in
57# a while we can assume that asymptotic performance can be estimated
58# as (28+Tmod/Naggr)/16, where Tmod is time to perform reduction
59# and Naggr is the aggregation factor.
60#
61# Before we proceed to this implementation let's have closer look at
62# the best-performing code suggested by Intel in their white paper.
63# By tracing inter-register dependencies Tmod is estimated as ~19
64# cycles and Naggr chosen by Intel is 4, resulting in 2.05 cycles per
65# processed byte. As implied, this is quite optimistic estimate,
66# because it does not account for Karatsuba pre- and post-processing,
67# which for a single multiplication is ~5 cycles. Unfortunately Intel
68# does not provide performance data for GHASH alone. But benchmarking
69# AES_GCM_encrypt ripped out of Fig. 15 of the white paper with aadt
70# alone resulted in 2.46 cycles per byte out of a 16KB buffer. Note that
71# the result accounts even for pre-computing of degrees of the hash
72# key H, but its portion is negligible at 16KB buffer size.
73#
74# Moving on to the implementation in question. Tmod is estimated as
75# ~13 cycles and Naggr is 2, giving asymptotic performance of ...
76# 2.16. How is it possible that measured performance is better than
77# optimistic theoretical estimate? There is one thing Intel failed
78# to recognize. By serializing GHASH with CTR in the same subroutine,
79# the former's performance is really limited by the above (Tmul + Tmod/Naggr)
80# equation. But if GHASH procedure is detached, the modulo-reduction
81# can be interleaved with Naggr-1 multiplications at instruction level
82# and under ideal conditions even disappear from the equation. So that
83# optimistic theoretical estimate for this implementation is ...
84# 28/16=1.75, and not 2.16. Well, it's probably way too optimistic,
85# at least for such small Naggr. I'd argue that (28+Tproc/Naggr),
86# where Tproc is time required for Karatsuba pre- and post-processing,
87# is more realistic estimate. In this case it gives ... 1.91 cycles.
88# Or in other words, depending on how well we can interleave reduction
89# and one of the two multiplications, the performance should be between
90# 1.91 and 2.16. As already mentioned, this implementation processes
91# one byte out of 8KB buffer in 2.10 cycles, while x86_64 counterpart
92# - in 2.02. x86_64 performance is better, because the larger register
93# bank allows reduction and multiplication to be interleaved better.
94#
95# Does it make sense to increase Naggr? To start with it's virtually
96# impossible in 32-bit mode, because of limited register bank
97# capacity. Otherwise improvement has to be weighed against slower
98# setup, as well as code size and complexity increase. As even
99# optimistic estimate doesn't promise 30% performance improvement,
100# there are currently no plans to increase Naggr.
101#
102# Special thanks to David Woodhouse <dwmw2@infradead.org> for
103# providing access to a Westmere-based system on behalf of Intel
104# Open Source Technology Centre.
105
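# Editorial sketch (not part of the original module): the estimates quoted
# above all follow from the (Tmul+Tmod/Naggr)/16 model with Tmul=28 and the
# Tmod/Tproc and Naggr values named in the comments. A tiny standalone
# check of the quoted numbers:
if (0) {	# never executed here; demonstration only
	for my $case ([19,4,"Intel, Naggr=4"],
		      [13,2,"this code, Naggr=2"],
		      [ 5,2,"Tproc instead of Tmod"]) {
		my ($t,$naggr,$what)=@$case;
		printf STDERR "%-22s (28+%g/%d)/16 = %.2f cycles/byte\n",
			$what,$t,$naggr,(28+$t/$naggr)/16;
	}	# prints 2.05, 2.16 and 1.91, matching the comments above
}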
106# January 2010
107#
108# Tweaked to optimize transitions between integer and FP operations
109# on same XMM register, PCLMULQDQ subroutine was measured to process
110# one byte in 2.07 cycles on Sandy Bridge, and in 2.12 - on Westmere.
111# The minor regression on Westmere is outweighed by ~15% improvement
112# on Sandy Bridge. Strangely enough attempt to modify 64-bit code in
113# similar manner resulted in almost 20% degradation on Sandy Bridge,
114# where original 64-bit code processes one byte in 1.95 cycles.
115
116$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
117push(@INC,"${dir}","${dir}../../perlasm");
118require "x86asm.pl";
119
120&asm_init($ARGV[0],"ghash-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");
121
122$sse2=0;
123for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
124
125($Zhh,$Zhl,$Zlh,$Zll) = ("ebp","edx","ecx","ebx");
126$inp = "edi";
127$Htbl = "esi";
128
129$unroll = 0; # Affects x86 loop. Folded loop performs ~7% worse
130			# than unrolled, which has to be weighed against
131 # 2.5x x86-specific code size reduction.
132
133sub x86_loop {
134 my $off = shift;
135 my $rem = "eax";
136
137 &mov ($Zhh,&DWP(4,$Htbl,$Zll));
138 &mov ($Zhl,&DWP(0,$Htbl,$Zll));
139 &mov ($Zlh,&DWP(12,$Htbl,$Zll));
140 &mov ($Zll,&DWP(8,$Htbl,$Zll));
141 &xor ($rem,$rem); # avoid partial register stalls on PIII
142
143 # shrd practically kills P4, 2.5x deterioration, but P4 has
144# MMX code-path to execute. shrd runs a tad faster [than twice
145# the shifts, moves and ors] on pre-MMX Pentium (as well as
146# PIII and Core2), *but* minimizes code size, spares a register
147# and thus allows the loop to be folded...
148 if (!$unroll) {
149 my $cnt = $inp;
150 &mov ($cnt,15);
151 &jmp (&label("x86_loop"));
152 &set_label("x86_loop",16);
153 for($i=1;$i<=2;$i++) {
154 &mov (&LB($rem),&LB($Zll));
155 &shrd ($Zll,$Zlh,4);
156 &and (&LB($rem),0xf);
157 &shrd ($Zlh,$Zhl,4);
158 &shrd ($Zhl,$Zhh,4);
159 &shr ($Zhh,4);
160 &xor ($Zhh,&DWP($off+16,"esp",$rem,4));
161
162 &mov (&LB($rem),&BP($off,"esp",$cnt));
163 if ($i&1) {
164 &and (&LB($rem),0xf0);
165 } else {
166 &shl (&LB($rem),4);
167 }
168
169 &xor ($Zll,&DWP(8,$Htbl,$rem));
170 &xor ($Zlh,&DWP(12,$Htbl,$rem));
171 &xor ($Zhl,&DWP(0,$Htbl,$rem));
172 &xor ($Zhh,&DWP(4,$Htbl,$rem));
173
174 if ($i&1) {
175 &dec ($cnt);
176 &js (&label("x86_break"));
177 } else {
178 &jmp (&label("x86_loop"));
179 }
180 }
181 &set_label("x86_break",16);
182 } else {
183 for($i=1;$i<32;$i++) {
184 &comment($i);
185 &mov (&LB($rem),&LB($Zll));
186 &shrd ($Zll,$Zlh,4);
187 &and (&LB($rem),0xf);
188 &shrd ($Zlh,$Zhl,4);
189 &shrd ($Zhl,$Zhh,4);
190 &shr ($Zhh,4);
191 &xor ($Zhh,&DWP($off+16,"esp",$rem,4));
192
193 if ($i&1) {
194 &mov (&LB($rem),&BP($off+15-($i>>1),"esp"));
195 &and (&LB($rem),0xf0);
196 } else {
197 &mov (&LB($rem),&BP($off+15-($i>>1),"esp"));
198 &shl (&LB($rem),4);
199 }
200
201 &xor ($Zll,&DWP(8,$Htbl,$rem));
202 &xor ($Zlh,&DWP(12,$Htbl,$rem));
203 &xor ($Zhl,&DWP(0,$Htbl,$rem));
204 &xor ($Zhh,&DWP(4,$Htbl,$rem));
205 }
206 }
207 &bswap ($Zll);
208 &bswap ($Zlh);
209 &bswap ($Zhl);
210 if (!$x86only) {
211 &bswap ($Zhh);
212 } else {
213 &mov ("eax",$Zhh);
214 &bswap ("eax");
215 &mov ($Zhh,"eax");
216 }
217}
218
219if ($unroll) {
220 &function_begin_B("_x86_gmult_4bit_inner");
221 &x86_loop(4);
222 &ret ();
223 &function_end_B("_x86_gmult_4bit_inner");
224}
225
226sub deposit_rem_4bit {
227 my $bias = shift;
228
229 &mov (&DWP($bias+0, "esp"),0x0000<<16);
230 &mov (&DWP($bias+4, "esp"),0x1C20<<16);
231 &mov (&DWP($bias+8, "esp"),0x3840<<16);
232 &mov (&DWP($bias+12,"esp"),0x2460<<16);
233 &mov (&DWP($bias+16,"esp"),0x7080<<16);
234 &mov (&DWP($bias+20,"esp"),0x6CA0<<16);
235 &mov (&DWP($bias+24,"esp"),0x48C0<<16);
236 &mov (&DWP($bias+28,"esp"),0x54E0<<16);
237 &mov (&DWP($bias+32,"esp"),0xE100<<16);
238 &mov (&DWP($bias+36,"esp"),0xFD20<<16);
239 &mov (&DWP($bias+40,"esp"),0xD940<<16);
240 &mov (&DWP($bias+44,"esp"),0xC560<<16);
241 &mov (&DWP($bias+48,"esp"),0x9180<<16);
242 &mov (&DWP($bias+52,"esp"),0x8DA0<<16);
243 &mov (&DWP($bias+56,"esp"),0xA9C0<<16);
244 &mov (&DWP($bias+60,"esp"),0xB5E0<<16);
245}
246
247$suffix = $x86only ? "" : "_x86";
248
249&function_begin("gcm_gmult_4bit".$suffix);
250 &stack_push(16+4+1); # +1 for stack alignment
251 &mov ($inp,&wparam(0)); # load Xi
252 &mov ($Htbl,&wparam(1)); # load Htable
253
254 &mov ($Zhh,&DWP(0,$inp)); # load Xi[16]
255 &mov ($Zhl,&DWP(4,$inp));
256 &mov ($Zlh,&DWP(8,$inp));
257 &mov ($Zll,&DWP(12,$inp));
258
259 &deposit_rem_4bit(16);
260
261 &mov (&DWP(0,"esp"),$Zhh); # copy Xi[16] on stack
262 &mov (&DWP(4,"esp"),$Zhl);
263 &mov (&DWP(8,"esp"),$Zlh);
264 &mov (&DWP(12,"esp"),$Zll);
265 &shr ($Zll,20);
266 &and ($Zll,0xf0);
267
268 if ($unroll) {
269 &call ("_x86_gmult_4bit_inner");
270 } else {
271 &x86_loop(0);
272 &mov ($inp,&wparam(0));
273 }
274
275 &mov (&DWP(12,$inp),$Zll);
276 &mov (&DWP(8,$inp),$Zlh);
277 &mov (&DWP(4,$inp),$Zhl);
278 &mov (&DWP(0,$inp),$Zhh);
279 &stack_pop(16+4+1);
280&function_end("gcm_gmult_4bit".$suffix);
281
282&function_begin("gcm_ghash_4bit".$suffix);
283 &stack_push(16+4+1); # +1 for 64-bit alignment
284 &mov ($Zll,&wparam(0)); # load Xi
285 &mov ($Htbl,&wparam(1)); # load Htable
286 &mov ($inp,&wparam(2)); # load in
287 &mov ("ecx",&wparam(3)); # load len
288 &add ("ecx",$inp);
289 &mov (&wparam(3),"ecx");
290
291 &mov ($Zhh,&DWP(0,$Zll)); # load Xi[16]
292 &mov ($Zhl,&DWP(4,$Zll));
293 &mov ($Zlh,&DWP(8,$Zll));
294 &mov ($Zll,&DWP(12,$Zll));
295
296 &deposit_rem_4bit(16);
297
298 &set_label("x86_outer_loop",16);
299 &xor ($Zll,&DWP(12,$inp)); # xor with input
300 &xor ($Zlh,&DWP(8,$inp));
301 &xor ($Zhl,&DWP(4,$inp));
302 &xor ($Zhh,&DWP(0,$inp));
303 &mov (&DWP(12,"esp"),$Zll); # dump it on stack
304 &mov (&DWP(8,"esp"),$Zlh);
305 &mov (&DWP(4,"esp"),$Zhl);
306 &mov (&DWP(0,"esp"),$Zhh);
307
308 &shr ($Zll,20);
309 &and ($Zll,0xf0);
310
311 if ($unroll) {
312 &call ("_x86_gmult_4bit_inner");
313 } else {
314 &x86_loop(0);
315 &mov ($inp,&wparam(2));
316 }
317 &lea ($inp,&DWP(16,$inp));
318 &cmp ($inp,&wparam(3));
319 &mov (&wparam(2),$inp) if (!$unroll);
320 &jb (&label("x86_outer_loop"));
321
322 &mov ($inp,&wparam(0)); # load Xi
323 &mov (&DWP(12,$inp),$Zll);
324 &mov (&DWP(8,$inp),$Zlh);
325 &mov (&DWP(4,$inp),$Zhl);
326 &mov (&DWP(0,$inp),$Zhh);
327 &stack_pop(16+4+1);
328&function_end("gcm_ghash_4bit".$suffix);
329
330if (!$x86only) {{{
331
332&static_label("rem_4bit");
333
334if (!$sse2) {{ # pure-MMX "May" version...
335
336$S=12; # shift factor for rem_4bit
337
338&function_begin_B("_mmx_gmult_4bit_inner");
339# MMX version performs 3.5 times better on P4 (see comment in non-MMX
340# routine for further details), 100% better on Opteron, ~70% better
341# on Core2 and PIII... In other words effort is considered to be well
342# spent... Since initial release the loop was unrolled in order to
343# "liberate" register previously used as loop counter. Instead it's
344# used to optimize critical path in 'Z.hi ^= rem_4bit[Z.lo&0xf]'.
345# The path involves move of Z.lo from MMX to integer register,
346# effective address calculation and finally merge of value to Z.hi.
347# Reference to rem_4bit is scheduled so late that I had to >>4
348# rem_4bit elements. This resulted in a 20-45% improvement
349# on contemporary µ-archs.
350{
351 my $cnt;
352 my $rem_4bit = "eax";
353 my @rem = ($Zhh,$Zll);
354 my $nhi = $Zhl;
355 my $nlo = $Zlh;
356
357 my ($Zlo,$Zhi) = ("mm0","mm1");
358 my $tmp = "mm2";
359
360 &xor ($nlo,$nlo); # avoid partial register stalls on PIII
361 &mov ($nhi,$Zll);
362 &mov (&LB($nlo),&LB($nhi));
363 &shl (&LB($nlo),4);
364 &and ($nhi,0xf0);
365 &movq ($Zlo,&QWP(8,$Htbl,$nlo));
366 &movq ($Zhi,&QWP(0,$Htbl,$nlo));
367 &movd ($rem[0],$Zlo);
368
369 for ($cnt=28;$cnt>=-2;$cnt--) {
370 my $odd = $cnt&1;
371 my $nix = $odd ? $nlo : $nhi;
372
373 &shl (&LB($nlo),4) if ($odd);
374 &psrlq ($Zlo,4);
375 &movq ($tmp,$Zhi);
376 &psrlq ($Zhi,4);
377 &pxor ($Zlo,&QWP(8,$Htbl,$nix));
378 &mov (&LB($nlo),&BP($cnt/2,$inp)) if (!$odd && $cnt>=0);
379 &psllq ($tmp,60);
380 &and ($nhi,0xf0) if ($odd);
381 &pxor ($Zhi,&QWP(0,$rem_4bit,$rem[1],8)) if ($cnt<28);
382 &and ($rem[0],0xf);
383 &pxor ($Zhi,&QWP(0,$Htbl,$nix));
384 &mov ($nhi,$nlo) if (!$odd && $cnt>=0);
385 &movd ($rem[1],$Zlo);
386 &pxor ($Zlo,$tmp);
387
388 push (@rem,shift(@rem)); # "rotate" registers
389 }
390
391 &mov ($inp,&DWP(4,$rem_4bit,$rem[1],8)); # last rem_4bit[rem]
392
393 &psrlq ($Zlo,32); # lower part of Zlo is already there
394 &movd ($Zhl,$Zhi);
395 &psrlq ($Zhi,32);
396 &movd ($Zlh,$Zlo);
397 &movd ($Zhh,$Zhi);
398 &shl ($inp,4); # compensate for rem_4bit[i] being >>4
399
400 &bswap ($Zll);
401 &bswap ($Zhl);
402 &bswap ($Zlh);
403 &xor ($Zhh,$inp);
404 &bswap ($Zhh);
405
406 &ret ();
407}
408&function_end_B("_mmx_gmult_4bit_inner");
409
410&function_begin("gcm_gmult_4bit_mmx");
411 &mov ($inp,&wparam(0)); # load Xi
412 &mov ($Htbl,&wparam(1)); # load Htable
413
414 &call (&label("pic_point"));
415 &set_label("pic_point");
416 &blindpop("eax");
417 &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
418
419 &movz ($Zll,&BP(15,$inp));
420
421 &call ("_mmx_gmult_4bit_inner");
422
423 &mov ($inp,&wparam(0)); # load Xi
424 &emms ();
425 &mov (&DWP(12,$inp),$Zll);
426 &mov (&DWP(4,$inp),$Zhl);
427 &mov (&DWP(8,$inp),$Zlh);
428 &mov (&DWP(0,$inp),$Zhh);
429&function_end("gcm_gmult_4bit_mmx");
430
431# Streamed version performs 20% better on P4, 7% on Opteron,
432# 10% on Core2 and PIII...
433&function_begin("gcm_ghash_4bit_mmx");
434 &mov ($Zhh,&wparam(0)); # load Xi
435 &mov ($Htbl,&wparam(1)); # load Htable
436 &mov ($inp,&wparam(2)); # load in
437 &mov ($Zlh,&wparam(3)); # load len
438
439 &call (&label("pic_point"));
440 &set_label("pic_point");
441 &blindpop("eax");
442 &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
443
444 &add ($Zlh,$inp);
445 &mov (&wparam(3),$Zlh); # len to point at the end of input
446 &stack_push(4+1); # +1 for stack alignment
447
448 &mov ($Zll,&DWP(12,$Zhh)); # load Xi[16]
449 &mov ($Zhl,&DWP(4,$Zhh));
450 &mov ($Zlh,&DWP(8,$Zhh));
451 &mov ($Zhh,&DWP(0,$Zhh));
452 &jmp (&label("mmx_outer_loop"));
453
454 &set_label("mmx_outer_loop",16);
455 &xor ($Zll,&DWP(12,$inp));
456 &xor ($Zhl,&DWP(4,$inp));
457 &xor ($Zlh,&DWP(8,$inp));
458 &xor ($Zhh,&DWP(0,$inp));
459 &mov (&wparam(2),$inp);
460 &mov (&DWP(12,"esp"),$Zll);
461 &mov (&DWP(4,"esp"),$Zhl);
462 &mov (&DWP(8,"esp"),$Zlh);
463 &mov (&DWP(0,"esp"),$Zhh);
464
465 &mov ($inp,"esp");
466 &shr ($Zll,24);
467
468 &call ("_mmx_gmult_4bit_inner");
469
470 &mov ($inp,&wparam(2));
471 &lea ($inp,&DWP(16,$inp));
472 &cmp ($inp,&wparam(3));
473 &jb (&label("mmx_outer_loop"));
474
475 &mov ($inp,&wparam(0)); # load Xi
476 &emms ();
477 &mov (&DWP(12,$inp),$Zll);
478 &mov (&DWP(4,$inp),$Zhl);
479 &mov (&DWP(8,$inp),$Zlh);
480 &mov (&DWP(0,$inp),$Zhh);
481
482 &stack_pop(4+1);
483&function_end("gcm_ghash_4bit_mmx");
484
485}} else {{ # "June" MMX version...
486 # ... has slower "April" gcm_gmult_4bit_mmx with folded
487 # loop. This is done to conserve code size...
488$S=16; # shift factor for rem_4bit
489
490sub mmx_loop() {
491# MMX version performs 2.8 times better on P4 (see comment in non-MMX
492# routine for further details), 40% better on Opteron and Core2, 50%
493# better on PIII... In other words effort is considered to be well
494# spent...
495 my $inp = shift;
496 my $rem_4bit = shift;
497 my $cnt = $Zhh;
498 my $nhi = $Zhl;
499 my $nlo = $Zlh;
500 my $rem = $Zll;
501
502 my ($Zlo,$Zhi) = ("mm0","mm1");
503 my $tmp = "mm2";
504
505 &xor ($nlo,$nlo); # avoid partial register stalls on PIII
506 &mov ($nhi,$Zll);
507 &mov (&LB($nlo),&LB($nhi));
508 &mov ($cnt,14);
509 &shl (&LB($nlo),4);
510 &and ($nhi,0xf0);
511 &movq ($Zlo,&QWP(8,$Htbl,$nlo));
512 &movq ($Zhi,&QWP(0,$Htbl,$nlo));
513 &movd ($rem,$Zlo);
514 &jmp (&label("mmx_loop"));
515
516 &set_label("mmx_loop",16);
517 &psrlq ($Zlo,4);
518 &and ($rem,0xf);
519 &movq ($tmp,$Zhi);
520 &psrlq ($Zhi,4);
521 &pxor ($Zlo,&QWP(8,$Htbl,$nhi));
522 &mov (&LB($nlo),&BP(0,$inp,$cnt));
523 &psllq ($tmp,60);
524 &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
525 &dec ($cnt);
526 &movd ($rem,$Zlo);
527 &pxor ($Zhi,&QWP(0,$Htbl,$nhi));
528 &mov ($nhi,$nlo);
529 &pxor ($Zlo,$tmp);
530 &js (&label("mmx_break"));
531
532 &shl (&LB($nlo),4);
533 &and ($rem,0xf);
534 &psrlq ($Zlo,4);
535 &and ($nhi,0xf0);
536 &movq ($tmp,$Zhi);
537 &psrlq ($Zhi,4);
538 &pxor ($Zlo,&QWP(8,$Htbl,$nlo));
539 &psllq ($tmp,60);
540 &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
541 &movd ($rem,$Zlo);
542 &pxor ($Zhi,&QWP(0,$Htbl,$nlo));
543 &pxor ($Zlo,$tmp);
544 &jmp (&label("mmx_loop"));
545
546 &set_label("mmx_break",16);
547 &shl (&LB($nlo),4);
548 &and ($rem,0xf);
549 &psrlq ($Zlo,4);
550 &and ($nhi,0xf0);
551 &movq ($tmp,$Zhi);
552 &psrlq ($Zhi,4);
553 &pxor ($Zlo,&QWP(8,$Htbl,$nlo));
554 &psllq ($tmp,60);
555 &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
556 &movd ($rem,$Zlo);
557 &pxor ($Zhi,&QWP(0,$Htbl,$nlo));
558 &pxor ($Zlo,$tmp);
559
560 &psrlq ($Zlo,4);
561 &and ($rem,0xf);
562 &movq ($tmp,$Zhi);
563 &psrlq ($Zhi,4);
564 &pxor ($Zlo,&QWP(8,$Htbl,$nhi));
565 &psllq ($tmp,60);
566 &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
567 &movd ($rem,$Zlo);
568 &pxor ($Zhi,&QWP(0,$Htbl,$nhi));
569 &pxor ($Zlo,$tmp);
570
571 &psrlq ($Zlo,32); # lower part of Zlo is already there
572 &movd ($Zhl,$Zhi);
573 &psrlq ($Zhi,32);
574 &movd ($Zlh,$Zlo);
575 &movd ($Zhh,$Zhi);
576
577 &bswap ($Zll);
578 &bswap ($Zhl);
579 &bswap ($Zlh);
580 &bswap ($Zhh);
581}
582
583&function_begin("gcm_gmult_4bit_mmx");
584 &mov ($inp,&wparam(0)); # load Xi
585 &mov ($Htbl,&wparam(1)); # load Htable
586
587 &call (&label("pic_point"));
588 &set_label("pic_point");
589 &blindpop("eax");
590 &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
591
592 &movz ($Zll,&BP(15,$inp));
593
594 &mmx_loop($inp,"eax");
595
596 &emms ();
597 &mov (&DWP(12,$inp),$Zll);
598 &mov (&DWP(4,$inp),$Zhl);
599 &mov (&DWP(8,$inp),$Zlh);
600 &mov (&DWP(0,$inp),$Zhh);
601&function_end("gcm_gmult_4bit_mmx");
602
603######################################################################
604# Below subroutine is "528B" variant of "4-bit" GCM GHASH function
605# (see gcm128.c for details). It provides further 20-40% performance
606# improvement over above mentioned "May" version.
607
608&static_label("rem_8bit");
609
610&function_begin("gcm_ghash_4bit_mmx");
611{ my ($Zlo,$Zhi) = ("mm7","mm6");
612 my $rem_8bit = "esi";
613 my $Htbl = "ebx";
614
615 # parameter block
616 &mov ("eax",&wparam(0)); # Xi
617 &mov ("ebx",&wparam(1)); # Htable
618 &mov ("ecx",&wparam(2)); # inp
619 &mov ("edx",&wparam(3)); # len
620 &mov ("ebp","esp"); # original %esp
621 &call (&label("pic_point"));
622 &set_label ("pic_point");
623 &blindpop ($rem_8bit);
624 &lea ($rem_8bit,&DWP(&label("rem_8bit")."-".&label("pic_point"),$rem_8bit));
625
626 &sub ("esp",512+16+16); # allocate stack frame...
627 &and ("esp",-64); # ...and align it
628 &sub ("esp",16); # place for (u8)(H[]<<4)
629
630 &add ("edx","ecx"); # pointer to the end of input
631 &mov (&DWP(528+16+0,"esp"),"eax"); # save Xi
632 &mov (&DWP(528+16+8,"esp"),"edx"); # save inp+len
633 &mov (&DWP(528+16+12,"esp"),"ebp"); # save original %esp
634
635 { my @lo = ("mm0","mm1","mm2");
636 my @hi = ("mm3","mm4","mm5");
637 my @tmp = ("mm6","mm7");
638 my ($off1,$off2,$i) = (0,0,);
639
640 &add ($Htbl,128); # optimize for size
641 &lea ("edi",&DWP(16+128,"esp"));
642 &lea ("ebp",&DWP(16+256+128,"esp"));
643
644 # decompose Htable (low and high parts are kept separately),
645 # generate Htable[]>>4, (u8)(Htable[]<<4), save to stack...
646 for ($i=0;$i<18;$i++) {
647
648 &mov ("edx",&DWP(16*$i+8-128,$Htbl)) if ($i<16);
649 &movq ($lo[0],&QWP(16*$i+8-128,$Htbl)) if ($i<16);
650 &psllq ($tmp[1],60) if ($i>1);
651 &movq ($hi[0],&QWP(16*$i+0-128,$Htbl)) if ($i<16);
652 &por ($lo[2],$tmp[1]) if ($i>1);
653 &movq (&QWP($off1-128,"edi"),$lo[1]) if ($i>0 && $i<17);
654 &psrlq ($lo[1],4) if ($i>0 && $i<17);
655 &movq (&QWP($off1,"edi"),$hi[1]) if ($i>0 && $i<17);
656 &movq ($tmp[0],$hi[1]) if ($i>0 && $i<17);
657 &movq (&QWP($off2-128,"ebp"),$lo[2]) if ($i>1);
658 &psrlq ($hi[1],4) if ($i>0 && $i<17);
659 &movq (&QWP($off2,"ebp"),$hi[2]) if ($i>1);
660 &shl ("edx",4) if ($i<16);
661 &mov (&BP($i,"esp"),&LB("edx")) if ($i<16);
662
663 unshift (@lo,pop(@lo)); # "rotate" registers
664 unshift (@hi,pop(@hi));
665 unshift (@tmp,pop(@tmp));
666 $off1 += 8 if ($i>0);
667 $off2 += 8 if ($i>1);
668 }
669 }
670
671 &movq ($Zhi,&QWP(0,"eax"));
672 &mov ("ebx",&DWP(8,"eax"));
673 &mov ("edx",&DWP(12,"eax")); # load Xi
674
675&set_label("outer",16);
676 { my $nlo = "eax";
677 my $dat = "edx";
678 my @nhi = ("edi","ebp");
679 my @rem = ("ebx","ecx");
680 my @red = ("mm0","mm1","mm2");
681 my $tmp = "mm3";
682
683 &xor ($dat,&DWP(12,"ecx")); # merge input data
684 &xor ("ebx",&DWP(8,"ecx"));
685 &pxor ($Zhi,&QWP(0,"ecx"));
686 &lea ("ecx",&DWP(16,"ecx")); # inp+=16
687 #&mov (&DWP(528+12,"esp"),$dat); # save inp^Xi
688 &mov (&DWP(528+8,"esp"),"ebx");
689 &movq (&QWP(528+0,"esp"),$Zhi);
690 &mov (&DWP(528+16+4,"esp"),"ecx"); # save inp
691
692 &xor ($nlo,$nlo);
693 &rol ($dat,8);
694 &mov (&LB($nlo),&LB($dat));
695 &mov ($nhi[1],$nlo);
696 &and (&LB($nlo),0x0f);
697 &shr ($nhi[1],4);
698 &pxor ($red[0],$red[0]);
699 &rol ($dat,8); # next byte
700 &pxor ($red[1],$red[1]);
701 &pxor ($red[2],$red[2]);
702
703	# Just like in "May" version modulo-schedule for critical path in
704 # 'Z.hi ^= rem_8bit[Z.lo&0xff^((u8)H[nhi]<<4)]<<48'. Final 'pxor'
705 # is scheduled so late that rem_8bit[] has to be shifted *right*
706 # by 16, which is why last argument to pinsrw is 2, which
707 # corresponds to <<32=<<48>>16...
708 for ($j=11,$i=0;$i<15;$i++) {
709
710 if ($i>0) {
711 &pxor ($Zlo,&QWP(16,"esp",$nlo,8)); # Z^=H[nlo]
712 &rol ($dat,8); # next byte
713 &pxor ($Zhi,&QWP(16+128,"esp",$nlo,8));
714
715 &pxor ($Zlo,$tmp);
716 &pxor ($Zhi,&QWP(16+256+128,"esp",$nhi[0],8));
717 &xor (&LB($rem[1]),&BP(0,"esp",$nhi[0])); # rem^(H[nhi]<<4)
718 } else {
719 &movq ($Zlo,&QWP(16,"esp",$nlo,8));
720 &movq ($Zhi,&QWP(16+128,"esp",$nlo,8));
721 }
722
723 &mov (&LB($nlo),&LB($dat));
724 &mov ($dat,&DWP(528+$j,"esp")) if (--$j%4==0);
725
726 &movd ($rem[0],$Zlo);
727 &movz ($rem[1],&LB($rem[1])) if ($i>0);
728 &psrlq ($Zlo,8); # Z>>=8
729
730 &movq ($tmp,$Zhi);
731 &mov ($nhi[0],$nlo);
732 &psrlq ($Zhi,8);
733
734 &pxor ($Zlo,&QWP(16+256+0,"esp",$nhi[1],8)); # Z^=H[nhi]>>4
735 &and (&LB($nlo),0x0f);
736 &psllq ($tmp,56);
737
738 &pxor ($Zhi,$red[1]) if ($i>1);
739 &shr ($nhi[0],4);
740 &pinsrw ($red[0],&WP(0,$rem_8bit,$rem[1],2),2) if ($i>0);
741
742 unshift (@red,pop(@red)); # "rotate" registers
743 unshift (@rem,pop(@rem));
744 unshift (@nhi,pop(@nhi));
745 }
746
747 &pxor ($Zlo,&QWP(16,"esp",$nlo,8)); # Z^=H[nlo]
748 &pxor ($Zhi,&QWP(16+128,"esp",$nlo,8));
749 &xor (&LB($rem[1]),&BP(0,"esp",$nhi[0])); # rem^(H[nhi]<<4)
750
751 &pxor ($Zlo,$tmp);
752 &pxor ($Zhi,&QWP(16+256+128,"esp",$nhi[0],8));
753 &movz ($rem[1],&LB($rem[1]));
754
755 &pxor ($red[2],$red[2]); # clear 2nd word
756 &psllq ($red[1],4);
757
758 &movd ($rem[0],$Zlo);
759 &psrlq ($Zlo,4); # Z>>=4
760
761 &movq ($tmp,$Zhi);
762 &psrlq ($Zhi,4);
763 &shl ($rem[0],4); # rem<<4
764
765 &pxor ($Zlo,&QWP(16,"esp",$nhi[1],8)); # Z^=H[nhi]
766 &psllq ($tmp,60);
767 &movz ($rem[0],&LB($rem[0]));
768
769 &pxor ($Zlo,$tmp);
770 &pxor ($Zhi,&QWP(16+128,"esp",$nhi[1],8));
771
772 &pinsrw ($red[0],&WP(0,$rem_8bit,$rem[1],2),2);
773 &pxor ($Zhi,$red[1]);
774
775 &movd ($dat,$Zlo);
776 &pinsrw ($red[2],&WP(0,$rem_8bit,$rem[0],2),3); # last is <<48
777
778 &psllq ($red[0],12); # correct by <<16>>4
779 &pxor ($Zhi,$red[0]);
780 &psrlq ($Zlo,32);
781 &pxor ($Zhi,$red[2]);
782
783 &mov ("ecx",&DWP(528+16+4,"esp")); # restore inp
784 &movd ("ebx",$Zlo);
785 &movq ($tmp,$Zhi); # 01234567
786 &psllw ($Zhi,8); # 1.3.5.7.
787 &psrlw ($tmp,8); # .0.2.4.6
788 &por ($Zhi,$tmp); # 10325476
789 &bswap ($dat);
790 &pshufw ($Zhi,$Zhi,0b00011011); # 76543210
791 &bswap ("ebx");
792
793 &cmp ("ecx",&DWP(528+16+8,"esp")); # are we done?
794 &jne (&label("outer"));
795 }
796
797 &mov ("eax",&DWP(528+16+0,"esp")); # restore Xi
798 &mov (&DWP(12,"eax"),"edx");
799 &mov (&DWP(8,"eax"),"ebx");
800 &movq (&QWP(0,"eax"),$Zhi);
801
802 &mov ("esp",&DWP(528+16+12,"esp")); # restore original %esp
803 &emms ();
804}
805&function_end("gcm_ghash_4bit_mmx");
806}}
807
808if ($sse2) {{
809######################################################################
810# PCLMULQDQ version.
811
812$Xip="eax";
813$Htbl="edx";
814$const="ecx";
815$inp="esi";
816$len="ebx";
817
818($Xi,$Xhi)=("xmm0","xmm1"); $Hkey="xmm2";
819($T1,$T2,$T3)=("xmm3","xmm4","xmm5");
820($Xn,$Xhn)=("xmm6","xmm7");
821
822&static_label("bswap");
823
824sub clmul64x64_T2 { # minimal "register" pressure
825my ($Xhi,$Xi,$Hkey)=@_;
826
827 &movdqa ($Xhi,$Xi); #
828 &pshufd ($T1,$Xi,0b01001110);
829 &pshufd ($T2,$Hkey,0b01001110);
830 &pxor ($T1,$Xi); #
831 &pxor ($T2,$Hkey);
832
833 &pclmulqdq ($Xi,$Hkey,0x00); #######
834 &pclmulqdq ($Xhi,$Hkey,0x11); #######
835 &pclmulqdq ($T1,$T2,0x00); #######
836 &xorps ($T1,$Xi); #
837 &xorps ($T1,$Xhi); #
838
839 &movdqa ($T2,$T1); #
840 &psrldq ($T1,8);
841 &pslldq ($T2,8); #
842 &pxor ($Xhi,$T1);
843 &pxor ($Xi,$T2); #
844}
845
846sub clmul64x64_T3 {
847# Even though this subroutine offers visually better ILP, it
848# was empirically found to be a tad slower than the above version,
849# at least in gcm_ghash_clmul context. But it's just as well,
850# because loop modulo-scheduling is possible only thanks to
851# minimized "register" pressure...
852my ($Xhi,$Xi,$Hkey)=@_;
853
854 &movdqa ($T1,$Xi); #
855 &movdqa ($Xhi,$Xi);
856 &pclmulqdq ($Xi,$Hkey,0x00); #######
857 &pclmulqdq ($Xhi,$Hkey,0x11); #######
858 &pshufd ($T2,$T1,0b01001110); #
859 &pshufd ($T3,$Hkey,0b01001110);
860 &pxor ($T2,$T1); #
861 &pxor ($T3,$Hkey);
862 &pclmulqdq ($T2,$T3,0x00); #######
863 &pxor ($T2,$Xi); #
864 &pxor ($T2,$Xhi); #
865
866 &movdqa ($T3,$T2); #
867 &psrldq ($T2,8);
868 &pslldq ($T3,8); #
869 &pxor ($Xhi,$T2);
870 &pxor ($Xi,$T3); #
871}
872
873if (1) { # Algorithm 9 with <<1 twist.
874 # Reduction is shorter and uses only two
875 # temporary registers, which makes it better
876 # candidate for interleaving with 64x64
877 # multiplication. Pre-modulo-scheduled loop
878 # was found to be ~20% faster than Algorithm 5
879 # below. Algorithm 9 was therefore chosen for
880 # further optimization...
881
882sub reduction_alg9 { # 17/13 times faster than Intel version
883my ($Xhi,$Xi) = @_;
884
885 # 1st phase
886 &movdqa ($T1,$Xi); #
887 &psllq ($Xi,1);
888 &pxor ($Xi,$T1); #
889 &psllq ($Xi,5); #
890 &pxor ($Xi,$T1); #
891 &psllq ($Xi,57); #
892 &movdqa ($T2,$Xi); #
893 &pslldq ($Xi,8);
894 &psrldq ($T2,8); #
895 &pxor ($Xi,$T1);
896 &pxor ($Xhi,$T2); #
897
898 # 2nd phase
899 &movdqa ($T2,$Xi);
900 &psrlq ($Xi,5);
901 &pxor ($Xi,$T2); #
902 &psrlq ($Xi,1); #
903 &pxor ($Xi,$T2); #
904 &pxor ($T2,$Xhi);
905 &psrlq ($Xi,1); #
906 &pxor ($Xi,$T2); #
907}
908
909&function_begin_B("gcm_init_clmul");
910 &mov ($Htbl,&wparam(0));
911 &mov ($Xip,&wparam(1));
912
913 &call (&label("pic"));
914&set_label("pic");
915 &blindpop ($const);
916 &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
917
918 &movdqu ($Hkey,&QWP(0,$Xip));
919 &pshufd ($Hkey,$Hkey,0b01001110);# dword swap
920
921 # <<1 twist
922 &pshufd ($T2,$Hkey,0b11111111); # broadcast uppermost dword
923 &movdqa ($T1,$Hkey);
924 &psllq ($Hkey,1);
925 &pxor ($T3,$T3); #
926 &psrlq ($T1,63);
927 &pcmpgtd ($T3,$T2); # broadcast carry bit
928 &pslldq ($T1,8);
929 &por ($Hkey,$T1); # H<<=1
930
931 # magic reduction
932 &pand ($T3,&QWP(16,$const)); # 0x1c2_polynomial
933 &pxor ($Hkey,$T3); # if(carry) H^=0x1c2_polynomial
934
935 # calculate H^2
936 &movdqa ($Xi,$Hkey);
937 &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
938 &reduction_alg9 ($Xhi,$Xi);
939
940 &movdqu (&QWP(0,$Htbl),$Hkey); # save H
941 &movdqu (&QWP(16,$Htbl),$Xi); # save H^2
942
943 &ret ();
944&function_end_B("gcm_init_clmul");
945
946&function_begin_B("gcm_gmult_clmul");
947 &mov ($Xip,&wparam(0));
948 &mov ($Htbl,&wparam(1));
949
950 &call (&label("pic"));
951&set_label("pic");
952 &blindpop ($const);
953 &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
954
955 &movdqu ($Xi,&QWP(0,$Xip));
956 &movdqa ($T3,&QWP(0,$const));
957 &movups ($Hkey,&QWP(0,$Htbl));
958 &pshufb ($Xi,$T3);
959
960 &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
961 &reduction_alg9 ($Xhi,$Xi);
962
963 &pshufb ($Xi,$T3);
964 &movdqu (&QWP(0,$Xip),$Xi);
965
966 &ret ();
967&function_end_B("gcm_gmult_clmul");
968
969&function_begin("gcm_ghash_clmul");
970 &mov ($Xip,&wparam(0));
971 &mov ($Htbl,&wparam(1));
972 &mov ($inp,&wparam(2));
973 &mov ($len,&wparam(3));
974
975 &call (&label("pic"));
976&set_label("pic");
977 &blindpop ($const);
978 &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
979
980 &movdqu ($Xi,&QWP(0,$Xip));
981 &movdqa ($T3,&QWP(0,$const));
982 &movdqu ($Hkey,&QWP(0,$Htbl));
983 &pshufb ($Xi,$T3);
984
985 &sub ($len,0x10);
986 &jz (&label("odd_tail"));
987
988 #######
989 # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
990 # [(H*Ii+1) + (H*Xi+1)] mod P =
991 # [(H*Ii+1) + H^2*(Ii+Xi)] mod P
992 #
993 &movdqu ($T1,&QWP(0,$inp)); # Ii
994 &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
995 &pshufb ($T1,$T3);
996 &pshufb ($Xn,$T3);
997 &pxor ($Xi,$T1); # Ii+Xi
998
999 &clmul64x64_T2 ($Xhn,$Xn,$Hkey); # H*Ii+1
1000 &movups ($Hkey,&QWP(16,$Htbl)); # load H^2
1001
1002 &lea ($inp,&DWP(32,$inp)); # i+=2
1003 &sub ($len,0x20);
1004 &jbe (&label("even_tail"));
1005
1006&set_label("mod_loop");
1007 &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
1008 &movdqu ($T1,&QWP(0,$inp)); # Ii
1009 &movups ($Hkey,&QWP(0,$Htbl)); # load H
1010
1011 &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
1012 &pxor ($Xhi,$Xhn);
1013
1014 &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
1015 &pshufb ($T1,$T3);
1016 &pshufb ($Xn,$T3);
1017
1018 &movdqa ($T3,$Xn); #&clmul64x64_TX ($Xhn,$Xn,$Hkey); H*Ii+1
1019 &movdqa ($Xhn,$Xn);
1020 &pxor ($Xhi,$T1); # "Ii+Xi", consume early
1021
1022 &movdqa ($T1,$Xi); #&reduction_alg9($Xhi,$Xi); 1st phase
1023 &psllq ($Xi,1);
1024 &pxor ($Xi,$T1); #
1025 &psllq ($Xi,5); #
1026 &pxor ($Xi,$T1); #
1027 &pclmulqdq ($Xn,$Hkey,0x00); #######
1028 &psllq ($Xi,57); #
1029 &movdqa ($T2,$Xi); #
1030 &pslldq ($Xi,8);
1031 &psrldq ($T2,8); #
1032 &pxor ($Xi,$T1);
1033 &pshufd ($T1,$T3,0b01001110);
1034 &pxor ($Xhi,$T2); #
1035 &pxor ($T1,$T3);
1036 &pshufd ($T3,$Hkey,0b01001110);
1037 &pxor ($T3,$Hkey); #
1038
1039 &pclmulqdq ($Xhn,$Hkey,0x11); #######
1040 &movdqa ($T2,$Xi); # 2nd phase
1041 &psrlq ($Xi,5);
1042 &pxor ($Xi,$T2); #
1043 &psrlq ($Xi,1); #
1044 &pxor ($Xi,$T2); #
1045 &pxor ($T2,$Xhi);
1046 &psrlq ($Xi,1); #
1047 &pxor ($Xi,$T2); #
1048
1049 &pclmulqdq ($T1,$T3,0x00); #######
1050 &movups ($Hkey,&QWP(16,$Htbl)); # load H^2
1051 &xorps ($T1,$Xn); #
1052 &xorps ($T1,$Xhn); #
1053
1054 &movdqa ($T3,$T1); #
1055 &psrldq ($T1,8);
1056 &pslldq ($T3,8); #
1057 &pxor ($Xhn,$T1);
1058 &pxor ($Xn,$T3); #
1059 &movdqa ($T3,&QWP(0,$const));
1060
1061 &lea ($inp,&DWP(32,$inp));
1062 &sub ($len,0x20);
1063 &ja (&label("mod_loop"));
1064
1065&set_label("even_tail");
1066 &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
1067
1068 &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
1069 &pxor ($Xhi,$Xhn);
1070
1071 &reduction_alg9 ($Xhi,$Xi);
1072
1073 &test ($len,$len);
1074 &jnz (&label("done"));
1075
1076 &movups ($Hkey,&QWP(0,$Htbl)); # load H
1077&set_label("odd_tail");
1078 &movdqu ($T1,&QWP(0,$inp)); # Ii
1079 &pshufb ($T1,$T3);
1080 &pxor ($Xi,$T1); # Ii+Xi
1081
1082 &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
1083 &reduction_alg9 ($Xhi,$Xi);
1084
1085&set_label("done");
1086 &pshufb ($Xi,$T3);
1087 &movdqu (&QWP(0,$Xip),$Xi);
1088&function_end("gcm_ghash_clmul");
1089
1090} else { # Algorithm 5. Kept for reference purposes.
1091
1092sub reduction_alg5 { # 19/16 times faster than Intel version
1093my ($Xhi,$Xi)=@_;
1094
1095 # <<1
1096 &movdqa ($T1,$Xi); #
1097 &movdqa ($T2,$Xhi);
1098 &pslld ($Xi,1);
1099 &pslld ($Xhi,1); #
1100 &psrld ($T1,31);
1101 &psrld ($T2,31); #
1102 &movdqa ($T3,$T1);
1103 &pslldq ($T1,4);
1104 &psrldq ($T3,12); #
1105 &pslldq ($T2,4);
1106 &por ($Xhi,$T3); #
1107 &por ($Xi,$T1);
1108 &por ($Xhi,$T2); #
1109
1110 # 1st phase
1111 &movdqa ($T1,$Xi);
1112 &movdqa ($T2,$Xi);
1113 &movdqa ($T3,$Xi); #
1114 &pslld ($T1,31);
1115 &pslld ($T2,30);
1116 &pslld ($Xi,25); #
1117 &pxor ($T1,$T2);
1118 &pxor ($T1,$Xi); #
1119 &movdqa ($T2,$T1); #
1120 &pslldq ($T1,12);
1121 &psrldq ($T2,4); #
1122 &pxor ($T3,$T1);
1123
1124 # 2nd phase
1125 &pxor ($Xhi,$T3); #
1126 &movdqa ($Xi,$T3);
1127 &movdqa ($T1,$T3);
1128 &psrld ($Xi,1); #
1129 &psrld ($T1,2);
1130 &psrld ($T3,7); #
1131 &pxor ($Xi,$T1);
1132 &pxor ($Xhi,$T2);
1133 &pxor ($Xi,$T3); #
1134 &pxor ($Xi,$Xhi); #
1135}
1136
1137&function_begin_B("gcm_init_clmul");
1138 &mov ($Htbl,&wparam(0));
1139 &mov ($Xip,&wparam(1));
1140
1141 &call (&label("pic"));
1142&set_label("pic");
1143 &blindpop ($const);
1144 &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
1145
1146 &movdqu ($Hkey,&QWP(0,$Xip));
1147 &pshufd ($Hkey,$Hkey,0b01001110);# dword swap
1148
1149 # calculate H^2
1150 &movdqa ($Xi,$Hkey);
1151 &clmul64x64_T3 ($Xhi,$Xi,$Hkey);
1152 &reduction_alg5 ($Xhi,$Xi);
1153
1154 &movdqu (&QWP(0,$Htbl),$Hkey); # save H
1155 &movdqu (&QWP(16,$Htbl),$Xi); # save H^2
1156
1157 &ret ();
1158&function_end_B("gcm_init_clmul");
1159
1160&function_begin_B("gcm_gmult_clmul");
1161 &mov ($Xip,&wparam(0));
1162 &mov ($Htbl,&wparam(1));
1163
1164 &call (&label("pic"));
1165&set_label("pic");
1166 &blindpop ($const);
1167 &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
1168
1169 &movdqu ($Xi,&QWP(0,$Xip));
1170 &movdqa ($Xn,&QWP(0,$const));
1171 &movdqu ($Hkey,&QWP(0,$Htbl));
1172 &pshufb ($Xi,$Xn);
1173
1174 &clmul64x64_T3 ($Xhi,$Xi,$Hkey);
1175 &reduction_alg5 ($Xhi,$Xi);
1176
1177 &pshufb ($Xi,$Xn);
1178 &movdqu (&QWP(0,$Xip),$Xi);
1179
1180 &ret ();
1181&function_end_B("gcm_gmult_clmul");
1182
1183&function_begin("gcm_ghash_clmul");
1184 &mov ($Xip,&wparam(0));
1185 &mov ($Htbl,&wparam(1));
1186 &mov ($inp,&wparam(2));
1187 &mov ($len,&wparam(3));
1188
1189 &call (&label("pic"));
1190&set_label("pic");
1191 &blindpop ($const);
1192 &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
1193
1194 &movdqu ($Xi,&QWP(0,$Xip));
1195 &movdqa ($T3,&QWP(0,$const));
1196 &movdqu ($Hkey,&QWP(0,$Htbl));
1197 &pshufb ($Xi,$T3);
1198
1199 &sub ($len,0x10);
1200 &jz (&label("odd_tail"));
1201
1202 #######
1203 # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
1204 # [(H*Ii+1) + (H*Xi+1)] mod P =
1205 # [(H*Ii+1) + H^2*(Ii+Xi)] mod P
1206 #
1207 &movdqu ($T1,&QWP(0,$inp)); # Ii
1208 &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
1209 &pshufb ($T1,$T3);
1210 &pshufb ($Xn,$T3);
1211 &pxor ($Xi,$T1); # Ii+Xi
1212
1213 &clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1
1214 &movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2
1215
1216 &sub ($len,0x20);
1217 &lea ($inp,&DWP(32,$inp)); # i+=2
1218 &jbe (&label("even_tail"));
1219
1220&set_label("mod_loop");
1221 &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
1222 &movdqu ($Hkey,&QWP(0,$Htbl)); # load H
1223
1224 &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
1225 &pxor ($Xhi,$Xhn);
1226
1227 &reduction_alg5 ($Xhi,$Xi);
1228
1229 #######
1230 &movdqa ($T3,&QWP(0,$const));
1231 &movdqu ($T1,&QWP(0,$inp)); # Ii
1232 &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
1233 &pshufb ($T1,$T3);
1234 &pshufb ($Xn,$T3);
1235 &pxor ($Xi,$T1); # Ii+Xi
1236
1237 &clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1
1238 &movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2
1239
1240 &sub ($len,0x20);
1241 &lea ($inp,&DWP(32,$inp));
1242 &ja (&label("mod_loop"));
1243
1244&set_label("even_tail");
1245 &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
1246
1247 &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
1248 &pxor ($Xhi,$Xhn);
1249
1250 &reduction_alg5 ($Xhi,$Xi);
1251
1252 &movdqa ($T3,&QWP(0,$const));
1253 &test ($len,$len);
1254 &jnz (&label("done"));
1255
1256 &movdqu ($Hkey,&QWP(0,$Htbl)); # load H
1257&set_label("odd_tail");
1258 &movdqu ($T1,&QWP(0,$inp)); # Ii
1259 &pshufb ($T1,$T3);
1260 &pxor ($Xi,$T1); # Ii+Xi
1261
1262 &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
1263 &reduction_alg5 ($Xhi,$Xi);
1264
1265 &movdqa ($T3,&QWP(0,$const));
1266&set_label("done");
1267 &pshufb ($Xi,$T3);
1268 &movdqu (&QWP(0,$Xip),$Xi);
1269&function_end("gcm_ghash_clmul");
1270
1271}
1272
1273&set_label("bswap",64);
1274 &data_byte(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
1275 &data_byte(1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2); # 0x1c2_polynomial
1276}} # $sse2
1277
1278&set_label("rem_4bit",64);
1279 &data_word(0,0x0000<<$S,0,0x1C20<<$S,0,0x3840<<$S,0,0x2460<<$S);
1280 &data_word(0,0x7080<<$S,0,0x6CA0<<$S,0,0x48C0<<$S,0,0x54E0<<$S);
1281 &data_word(0,0xE100<<$S,0,0xFD20<<$S,0,0xD940<<$S,0,0xC560<<$S);
1282 &data_word(0,0x9180<<$S,0,0x8DA0<<$S,0,0xA9C0<<$S,0,0xB5E0<<$S);
1283&set_label("rem_8bit",64);
1284 &data_short(0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E);
1285 &data_short(0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E);
1286 &data_short(0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E);
1287 &data_short(0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E);
1288 &data_short(0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E);
1289 &data_short(0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E);
1290 &data_short(0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E);
1291 &data_short(0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E);
1292 &data_short(0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE);
1293 &data_short(0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE);
1294 &data_short(0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE);
1295 &data_short(0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE);
1296 &data_short(0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E);
1297 &data_short(0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E);
1298 &data_short(0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE);
1299 &data_short(0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE);
1300 &data_short(0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E);
1301 &data_short(0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E);
1302 &data_short(0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E);
1303 &data_short(0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E);
1304 &data_short(0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E);
1305 &data_short(0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E);
1306 &data_short(0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E);
1307 &data_short(0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E);
1308 &data_short(0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE);
1309 &data_short(0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE);
1310 &data_short(0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE);
1311 &data_short(0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE);
1312 &data_short(0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E);
1313 &data_short(0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E);
1314 &data_short(0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE);
1315 &data_short(0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE);
1316}}} # !$x86only
1317
1318&asciz("GHASH for x86, CRYPTOGAMS by <appro\@openssl.org>");
1319&asm_finish();
1320
# A question was raised about the choice of vanilla MMX. Or rather, why wasn't
1322# SSE2 chosen instead? In addition to the fact that MMX runs on legacy
1323# CPUs such as PIII, "4-bit" MMX version was observed to provide better
1324# performance than *corresponding* SSE2 one even on contemporary CPUs.
1325# SSE2 results were provided by Peter-Michael Hager. He maintains SSE2
1326# implementation featuring full range of lookup-table sizes, but with
1327# per-invocation lookup table setup. Latter means that table size is
1328# chosen depending on how much data is to be hashed in every given call,
1329# more data - larger table. Best reported result for Core2 is ~4 cycles
1330# per processed byte out of 64KB block. This number accounts even for
1331# 64KB table setup overhead. As discussed in gcm128.c we choose to be
1332# more conservative in respect to lookup table sizes, but how do the
1333# results compare? Minimalistic "256B" MMX version delivers ~11 cycles
# on the same platform. As also discussed in gcm128.c, the next-in-line "8-bit
1335# Shoup's" or "4KB" method should deliver twice the performance of
1336# "256B" one, in other words not worse than ~6 cycles per byte. It
# should also be noted that in the SSE2 case the improvement can be "super-
1338# linear," i.e. more than twice, mostly because >>8 maps to single
1339# instruction on SSE2 register. This is unlike "4-bit" case when >>4
1340# maps to same amount of instructions in both MMX and SSE2 cases.
1341# Bottom line is that switch to SSE2 is considered to be justifiable
1342# only in case we choose to implement "8-bit" method...
diff --git a/src/lib/libcrypto/modes/asm/ghash-x86_64.pl b/src/lib/libcrypto/modes/asm/ghash-x86_64.pl
deleted file mode 100644
index 38d779edbc..0000000000
--- a/src/lib/libcrypto/modes/asm/ghash-x86_64.pl
+++ /dev/null
@@ -1,806 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# March, June 2010
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that
14# it uses 256 bytes per-key table [+128 bytes shared table]. GHASH
15# function features so called "528B" variant utilizing additional
16# 256+16 bytes of per-key storage [+512 bytes shared table].
17# Performance results are for this streamed GHASH subroutine and are
18# expressed in cycles per processed byte, less is better:
19#
20# gcc 3.4.x(*) assembler
21#
22# P4 28.6 14.0 +100%
23# Opteron 19.3 7.7 +150%
24# Core2 17.8 8.1(**) +120%
25#
26# (*) comparison is not completely fair, because C results are
27# for vanilla "256B" implementation, while assembler results
28# are for "528B";-)
29# (**) it's a mystery [to me] why the Core2 result is not the same as for
30# Opteron;
31
32# May 2010
33#
34# Add PCLMULQDQ version performing at 2.02 cycles per processed byte.
35# See ghash-x86.pl for background information and details about coding
36# techniques.
37#
38# Special thanks to David Woodhouse <dwmw2@infradead.org> for
39# providing access to a Westmere-based system on behalf of Intel
40# Open Source Technology Centre.
41
42$flavour = shift;
43$output = shift;
44if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
45
46$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
47
48$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
49( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
50( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
51die "can't locate x86_64-xlate.pl";
52
53open OUT,"| \"$^X\" $xlate $flavour $output";
54*STDOUT=*OUT;
55
56# common register layout
57$nlo="%rax";
58$nhi="%rbx";
59$Zlo="%r8";
60$Zhi="%r9";
61$tmp="%r10";
62$rem_4bit = "%r11";
63
64$Xi="%rdi";
65$Htbl="%rsi";
66
67# per-function register layout
68$cnt="%rcx";
69$rem="%rdx";
70
71sub LB() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/ or
72 $r =~ s/%[er]([sd]i)/%\1l/ or
73 $r =~ s/%[er](bp)/%\1l/ or
74 $r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; }
75
76sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
77{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
78 my $arg = pop;
79 $arg = "\$$arg" if ($arg*1 eq $arg);
80 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
81}
82
83{ my $N;
84 sub loop() {
85 my $inp = shift;
86
87 $N++;
88$code.=<<___;
89 xor $nlo,$nlo
90 xor $nhi,$nhi
91 mov `&LB("$Zlo")`,`&LB("$nlo")`
92 mov `&LB("$Zlo")`,`&LB("$nhi")`
93 shl \$4,`&LB("$nlo")`
94 mov \$14,$cnt
95 mov 8($Htbl,$nlo),$Zlo
96 mov ($Htbl,$nlo),$Zhi
97 and \$0xf0,`&LB("$nhi")`
98 mov $Zlo,$rem
99 jmp .Loop$N
100
101.align 16
102.Loop$N:
103 shr \$4,$Zlo
104 and \$0xf,$rem
105 mov $Zhi,$tmp
106 mov ($inp,$cnt),`&LB("$nlo")`
107 shr \$4,$Zhi
108 xor 8($Htbl,$nhi),$Zlo
109 shl \$60,$tmp
110 xor ($Htbl,$nhi),$Zhi
111 mov `&LB("$nlo")`,`&LB("$nhi")`
112 xor ($rem_4bit,$rem,8),$Zhi
113 mov $Zlo,$rem
114 shl \$4,`&LB("$nlo")`
115 xor $tmp,$Zlo
116 dec $cnt
117 js .Lbreak$N
118
119 shr \$4,$Zlo
120 and \$0xf,$rem
121 mov $Zhi,$tmp
122 shr \$4,$Zhi
123 xor 8($Htbl,$nlo),$Zlo
124 shl \$60,$tmp
125 xor ($Htbl,$nlo),$Zhi
126 and \$0xf0,`&LB("$nhi")`
127 xor ($rem_4bit,$rem,8),$Zhi
128 mov $Zlo,$rem
129 xor $tmp,$Zlo
130 jmp .Loop$N
131
132.align 16
133.Lbreak$N:
134 shr \$4,$Zlo
135 and \$0xf,$rem
136 mov $Zhi,$tmp
137 shr \$4,$Zhi
138 xor 8($Htbl,$nlo),$Zlo
139 shl \$60,$tmp
140 xor ($Htbl,$nlo),$Zhi
141 and \$0xf0,`&LB("$nhi")`
142 xor ($rem_4bit,$rem,8),$Zhi
143 mov $Zlo,$rem
144 xor $tmp,$Zlo
145
146 shr \$4,$Zlo
147 and \$0xf,$rem
148 mov $Zhi,$tmp
149 shr \$4,$Zhi
150 xor 8($Htbl,$nhi),$Zlo
151 shl \$60,$tmp
152 xor ($Htbl,$nhi),$Zhi
153 xor $tmp,$Zlo
154 xor ($rem_4bit,$rem,8),$Zhi
155
156 bswap $Zlo
157 bswap $Zhi
158___
159}}
160
161$code=<<___;
162.text
163
164.globl gcm_gmult_4bit
165.type gcm_gmult_4bit,\@function,2
166.align 16
167gcm_gmult_4bit:
168 push %rbx
169 push %rbp # %rbp and %r12 are pushed exclusively in
170 push %r12 # order to reuse Win64 exception handler...
171.Lgmult_prologue:
172
173 movzb 15($Xi),$Zlo
174 lea .Lrem_4bit(%rip),$rem_4bit
175___
176 &loop ($Xi);
177$code.=<<___;
178 mov $Zlo,8($Xi)
179 mov $Zhi,($Xi)
180
181 mov 16(%rsp),%rbx
182 lea 24(%rsp),%rsp
183.Lgmult_epilogue:
184 ret
185.size gcm_gmult_4bit,.-gcm_gmult_4bit
186___
187
188# per-function register layout
189$inp="%rdx";
190$len="%rcx";
191$rem_8bit=$rem_4bit;
192
193$code.=<<___;
194.globl gcm_ghash_4bit
195.type gcm_ghash_4bit,\@function,4
196.align 16
197gcm_ghash_4bit:
198 push %rbx
199 push %rbp
200 push %r12
201 push %r13
202 push %r14
203 push %r15
204 sub \$280,%rsp
205.Lghash_prologue:
206 mov $inp,%r14 # reassign couple of args
207 mov $len,%r15
208___
209{ my $inp="%r14";
210 my $dat="%edx";
211 my $len="%r15";
212 my @nhi=("%ebx","%ecx");
213 my @rem=("%r12","%r13");
214 my $Hshr4="%rbp";
215
216 &sub ($Htbl,-128); # size optimization
217 &lea ($Hshr4,"16+128(%rsp)");
218 { my @lo =($nlo,$nhi);
219 my @hi =($Zlo,$Zhi);
220
221 &xor ($dat,$dat);
222 for ($i=0,$j=-2;$i<18;$i++,$j++) {
223 &mov ("$j(%rsp)",&LB($dat)) if ($i>1);
224 &or ($lo[0],$tmp) if ($i>1);
225 &mov (&LB($dat),&LB($lo[1])) if ($i>0 && $i<17);
226 &shr ($lo[1],4) if ($i>0 && $i<17);
227 &mov ($tmp,$hi[1]) if ($i>0 && $i<17);
228 &shr ($hi[1],4) if ($i>0 && $i<17);
229 &mov ("8*$j($Hshr4)",$hi[0]) if ($i>1);
230 &mov ($hi[0],"16*$i+0-128($Htbl)") if ($i<16);
231 &shl (&LB($dat),4) if ($i>0 && $i<17);
232 &mov ("8*$j-128($Hshr4)",$lo[0]) if ($i>1);
233 &mov ($lo[0],"16*$i+8-128($Htbl)") if ($i<16);
234 &shl ($tmp,60) if ($i>0 && $i<17);
235
236 push (@lo,shift(@lo));
237 push (@hi,shift(@hi));
238 }
239 }
240 &add ($Htbl,-128);
241 &mov ($Zlo,"8($Xi)");
242 &mov ($Zhi,"0($Xi)");
243 &add ($len,$inp); # pointer to the end of data
244 &lea ($rem_8bit,".Lrem_8bit(%rip)");
245 &jmp (".Louter_loop");
246
247$code.=".align 16\n.Louter_loop:\n";
248 &xor ($Zhi,"($inp)");
249 &mov ("%rdx","8($inp)");
250 &lea ($inp,"16($inp)");
251 &xor ("%rdx",$Zlo);
252 &mov ("($Xi)",$Zhi);
253 &mov ("8($Xi)","%rdx");
254 &shr ("%rdx",32);
255
256 &xor ($nlo,$nlo);
257 &rol ($dat,8);
258 &mov (&LB($nlo),&LB($dat));
259 &movz ($nhi[0],&LB($dat));
260 &shl (&LB($nlo),4);
261 &shr ($nhi[0],4);
262
263 for ($j=11,$i=0;$i<15;$i++) {
264 &rol ($dat,8);
265 &xor ($Zlo,"8($Htbl,$nlo)") if ($i>0);
266 &xor ($Zhi,"($Htbl,$nlo)") if ($i>0);
267 &mov ($Zlo,"8($Htbl,$nlo)") if ($i==0);
268 &mov ($Zhi,"($Htbl,$nlo)") if ($i==0);
269
270 &mov (&LB($nlo),&LB($dat));
271 &xor ($Zlo,$tmp) if ($i>0);
272 &movzw ($rem[1],"($rem_8bit,$rem[1],2)") if ($i>0);
273
274 &movz ($nhi[1],&LB($dat));
275 &shl (&LB($nlo),4);
276 &movzb ($rem[0],"(%rsp,$nhi[0])");
277
278 &shr ($nhi[1],4) if ($i<14);
279 &and ($nhi[1],0xf0) if ($i==14);
280 &shl ($rem[1],48) if ($i>0);
281 &xor ($rem[0],$Zlo);
282
283 &mov ($tmp,$Zhi);
284 &xor ($Zhi,$rem[1]) if ($i>0);
285 &shr ($Zlo,8);
286
287 &movz ($rem[0],&LB($rem[0]));
288 &mov ($dat,"$j($Xi)") if (--$j%4==0);
289 &shr ($Zhi,8);
290
291 &xor ($Zlo,"-128($Hshr4,$nhi[0],8)");
292 &shl ($tmp,56);
293 &xor ($Zhi,"($Hshr4,$nhi[0],8)");
294
295 unshift (@nhi,pop(@nhi)); # "rotate" registers
296 unshift (@rem,pop(@rem));
297 }
298 &movzw ($rem[1],"($rem_8bit,$rem[1],2)");
299 &xor ($Zlo,"8($Htbl,$nlo)");
300 &xor ($Zhi,"($Htbl,$nlo)");
301
302 &shl ($rem[1],48);
303 &xor ($Zlo,$tmp);
304
305 &xor ($Zhi,$rem[1]);
306 &movz ($rem[0],&LB($Zlo));
307 &shr ($Zlo,4);
308
309 &mov ($tmp,$Zhi);
310 &shl (&LB($rem[0]),4);
311 &shr ($Zhi,4);
312
313 &xor ($Zlo,"8($Htbl,$nhi[0])");
314 &movzw ($rem[0],"($rem_8bit,$rem[0],2)");
315 &shl ($tmp,60);
316
317 &xor ($Zhi,"($Htbl,$nhi[0])");
318 &xor ($Zlo,$tmp);
319 &shl ($rem[0],48);
320
321 &bswap ($Zlo);
322 &xor ($Zhi,$rem[0]);
323
324 &bswap ($Zhi);
325 &cmp ($inp,$len);
326 &jb (".Louter_loop");
327}
328$code.=<<___;
329 mov $Zlo,8($Xi)
330 mov $Zhi,($Xi)
331
332 lea 280(%rsp),%rsi
333 mov 0(%rsi),%r15
334 mov 8(%rsi),%r14
335 mov 16(%rsi),%r13
336 mov 24(%rsi),%r12
337 mov 32(%rsi),%rbp
338 mov 40(%rsi),%rbx
339 lea 48(%rsi),%rsp
340.Lghash_epilogue:
341 ret
342.size gcm_ghash_4bit,.-gcm_ghash_4bit
343___
344
345######################################################################
346# PCLMULQDQ version.
347
348@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
349 ("%rdi","%rsi","%rdx","%rcx"); # Unix order
350
351($Xi,$Xhi)=("%xmm0","%xmm1"); $Hkey="%xmm2";
352($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");
353
354sub clmul64x64_T2 { # minimal register pressure
355my ($Xhi,$Xi,$Hkey,$modulo)=@_;
356
357$code.=<<___ if (!defined($modulo));
358 movdqa $Xi,$Xhi #
359 pshufd \$0b01001110,$Xi,$T1
360 pshufd \$0b01001110,$Hkey,$T2
361 pxor $Xi,$T1 #
362 pxor $Hkey,$T2
363___
364$code.=<<___;
365 pclmulqdq \$0x00,$Hkey,$Xi #######
366 pclmulqdq \$0x11,$Hkey,$Xhi #######
367 pclmulqdq \$0x00,$T2,$T1 #######
368 pxor $Xi,$T1 #
369 pxor $Xhi,$T1 #
370
371 movdqa $T1,$T2 #
372 psrldq \$8,$T1
373 pslldq \$8,$T2 #
374 pxor $T1,$Xhi
375 pxor $T2,$Xi #
376___
377}
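The three pclmulqdq instructions above (0x00 on the low halves, 0x11 on the high halves, and one more on the xor of the halves) form a Karatsuba-style carry-less multiplication: the middle 64x64 product is recovered from the cross term instead of being computed twice. A rough C equivalent, with a slow software carry-less multiply standing in for pclmulqdq (helper names here are illustrative, not part of the module):

#include <stdint.h>

/* Software carry-less 64x64 -> 128-bit multiply (illustrative only). */
static void
clmul64_soft(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
{
	uint64_t h = 0, l = 0;
	int i;

	for (i = 0; i < 64; i++)
		if ((b >> i) & 1) {
			l ^= a << i;
			if (i)
				h ^= a >> (64 - i);
		}
	*hi = h;
	*lo = l;
}

/* 128x128 carry-less multiply using three 64x64 products (Karatsuba),
 * mirroring the pclmulqdq 0x00/0x11/cross-term sequence above.
 * Z[0] is the least significant 64-bit word. */
static void
clmul128_soft(uint64_t Xhi, uint64_t Xlo, uint64_t Hhi, uint64_t Hlo,
    uint64_t Z[4])
{
	uint64_t lo_h, lo_l, hi_h, hi_l, mid_h, mid_l;

	clmul64_soft(Xlo, Hlo, &lo_h, &lo_l);			/* Xlo * Hlo */
	clmul64_soft(Xhi, Hhi, &hi_h, &hi_l);			/* Xhi * Hhi */
	clmul64_soft(Xlo ^ Xhi, Hlo ^ Hhi, &mid_h, &mid_l);	/* cross term */
	mid_l ^= lo_l ^ hi_l;
	mid_h ^= lo_h ^ hi_h;

	Z[0] = lo_l;
	Z[1] = lo_h ^ mid_l;
	Z[2] = hi_l ^ mid_h;
	Z[3] = hi_h;
}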
378
379sub reduction_alg9 { # 17/13 times faster than Intel version
380my ($Xhi,$Xi) = @_;
381
382$code.=<<___;
383 # 1st phase
384 movdqa $Xi,$T1 #
385 psllq \$1,$Xi
386 pxor $T1,$Xi #
387 psllq \$5,$Xi #
388 pxor $T1,$Xi #
389 psllq \$57,$Xi #
390 movdqa $Xi,$T2 #
391 pslldq \$8,$Xi
392 psrldq \$8,$T2 #
393 pxor $T1,$Xi
394 pxor $T2,$Xhi #
395
396 # 2nd phase
397 movdqa $Xi,$T2
398 psrlq \$5,$Xi
399 pxor $T2,$Xi #
400 psrlq \$1,$Xi #
401 pxor $T2,$Xi #
402 pxor $Xhi,$T2
403 psrlq \$1,$Xi #
404 pxor $T2,$Xi #
405___
406}
407
408{ my ($Htbl,$Xip)=@_4args;
409
410$code.=<<___;
411.globl gcm_init_clmul
412.type gcm_init_clmul,\@abi-omnipotent
413.align 16
414gcm_init_clmul:
415 movdqu ($Xip),$Hkey
416 pshufd \$0b01001110,$Hkey,$Hkey # dword swap
417
418 # <<1 twist
419 pshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword
420 movdqa $Hkey,$T1
421 psllq \$1,$Hkey
422 pxor $T3,$T3 #
423 psrlq \$63,$T1
424 pcmpgtd $T2,$T3 # broadcast carry bit
425 pslldq \$8,$T1
426 por $T1,$Hkey # H<<=1
427
428 # magic reduction
429 pand .L0x1c2_polynomial(%rip),$T3
430 pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial
431
432 # calculate H^2
433 movdqa $Hkey,$Xi
434___
435 &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
436 &reduction_alg9 ($Xhi,$Xi);
437$code.=<<___;
438 movdqu $Hkey,($Htbl) # save H
439 movdqu $Xi,16($Htbl) # save H^2
440 ret
441.size gcm_init_clmul,.-gcm_init_clmul
442___
443}
444
445{ my ($Xip,$Htbl)=@_4args;
446
447$code.=<<___;
448.globl gcm_gmult_clmul
449.type gcm_gmult_clmul,\@abi-omnipotent
450.align 16
451gcm_gmult_clmul:
452 movdqu ($Xip),$Xi
453 movdqa .Lbswap_mask(%rip),$T3
454 movdqu ($Htbl),$Hkey
455 pshufb $T3,$Xi
456___
457 &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
458 &reduction_alg9 ($Xhi,$Xi);
459$code.=<<___;
460 pshufb $T3,$Xi
461 movdqu $Xi,($Xip)
462 ret
463.size gcm_gmult_clmul,.-gcm_gmult_clmul
464___
465}
466
467{ my ($Xip,$Htbl,$inp,$len)=@_4args;
468 my $Xn="%xmm6";
469 my $Xhn="%xmm7";
470 my $Hkey2="%xmm8";
471 my $T1n="%xmm9";
472 my $T2n="%xmm10";
473
474$code.=<<___;
475.globl gcm_ghash_clmul
476.type gcm_ghash_clmul,\@abi-omnipotent
477.align 16
478gcm_ghash_clmul:
479___
480$code.=<<___ if ($win64);
481.LSEH_begin_gcm_ghash_clmul:
482 # I can't trust assembler to use specific encoding:-(
483 .byte 0x48,0x83,0xec,0x58 #sub \$0x58,%rsp
484 .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
485 .byte 0x0f,0x29,0x7c,0x24,0x10 #movdqa %xmm7,0x10(%rsp)
486 .byte 0x44,0x0f,0x29,0x44,0x24,0x20 #movaps %xmm8,0x20(%rsp)
487 .byte 0x44,0x0f,0x29,0x4c,0x24,0x30 #movaps %xmm9,0x30(%rsp)
488 .byte 0x44,0x0f,0x29,0x54,0x24,0x40 #movaps %xmm10,0x40(%rsp)
489___
490$code.=<<___;
491 movdqa .Lbswap_mask(%rip),$T3
492
493 movdqu ($Xip),$Xi
494 movdqu ($Htbl),$Hkey
495 pshufb $T3,$Xi
496
497 sub \$0x10,$len
498 jz .Lodd_tail
499
500 movdqu 16($Htbl),$Hkey2
501 #######
502 # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
503 # [(H*Ii+1) + (H*Xi+1)] mod P =
504 # [(H*Ii+1) + H^2*(Ii+Xi)] mod P
505 #
506 movdqu ($inp),$T1 # Ii
507 movdqu 16($inp),$Xn # Ii+1
508 pshufb $T3,$T1
509 pshufb $T3,$Xn
510 pxor $T1,$Xi # Ii+Xi
511___
512 &clmul64x64_T2 ($Xhn,$Xn,$Hkey); # H*Ii+1
513$code.=<<___;
514 movdqa $Xi,$Xhi #
515 pshufd \$0b01001110,$Xi,$T1
516 pshufd \$0b01001110,$Hkey2,$T2
517 pxor $Xi,$T1 #
518 pxor $Hkey2,$T2
519
520 lea 32($inp),$inp # i+=2
521 sub \$0x20,$len
522 jbe .Leven_tail
523
524.Lmod_loop:
525___
526 &clmul64x64_T2 ($Xhi,$Xi,$Hkey2,1); # H^2*(Ii+Xi)
527$code.=<<___;
528 movdqu ($inp),$T1 # Ii
529 pxor $Xn,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
530 pxor $Xhn,$Xhi
531
532 movdqu 16($inp),$Xn # Ii+1
533 pshufb $T3,$T1
534 pshufb $T3,$Xn
535
536 movdqa $Xn,$Xhn #
537 pshufd \$0b01001110,$Xn,$T1n
538 pshufd \$0b01001110,$Hkey,$T2n
539 pxor $Xn,$T1n #
540 pxor $Hkey,$T2n
541 pxor $T1,$Xhi # "Ii+Xi", consume early
542
543 movdqa $Xi,$T1 # 1st phase
544 psllq \$1,$Xi
545 pxor $T1,$Xi #
546 psllq \$5,$Xi #
547 pxor $T1,$Xi #
548 pclmulqdq \$0x00,$Hkey,$Xn #######
549 psllq \$57,$Xi #
550 movdqa $Xi,$T2 #
551 pslldq \$8,$Xi
552 psrldq \$8,$T2 #
553 pxor $T1,$Xi
554 pxor $T2,$Xhi #
555
556 pclmulqdq \$0x11,$Hkey,$Xhn #######
557 movdqa $Xi,$T2 # 2nd phase
558 psrlq \$5,$Xi
559 pxor $T2,$Xi #
560 psrlq \$1,$Xi #
561 pxor $T2,$Xi #
562 pxor $Xhi,$T2
563 psrlq \$1,$Xi #
564 pxor $T2,$Xi #
565
566 pclmulqdq \$0x00,$T2n,$T1n #######
567 movdqa $Xi,$Xhi #
568 pshufd \$0b01001110,$Xi,$T1
569 pshufd \$0b01001110,$Hkey2,$T2
570 pxor $Xi,$T1 #
571 pxor $Hkey2,$T2
572
573 pxor $Xn,$T1n #
574 pxor $Xhn,$T1n #
575 movdqa $T1n,$T2n #
576 psrldq \$8,$T1n
577 pslldq \$8,$T2n #
578 pxor $T1n,$Xhn
579 pxor $T2n,$Xn #
580
581 lea 32($inp),$inp
582 sub \$0x20,$len
583 ja .Lmod_loop
584
585.Leven_tail:
586___
587 &clmul64x64_T2 ($Xhi,$Xi,$Hkey2,1); # H^2*(Ii+Xi)
588$code.=<<___;
589 pxor $Xn,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
590 pxor $Xhn,$Xhi
591___
592 &reduction_alg9 ($Xhi,$Xi);
593$code.=<<___;
594 test $len,$len
595 jnz .Ldone
596
597.Lodd_tail:
598 movdqu ($inp),$T1 # Ii
599 pshufb $T3,$T1
600 pxor $T1,$Xi # Ii+Xi
601___
602 &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
603 &reduction_alg9 ($Xhi,$Xi);
604$code.=<<___;
605.Ldone:
606 pshufb $T3,$Xi
607 movdqu $Xi,($Xip)
608___
609$code.=<<___ if ($win64);
610 movaps (%rsp),%xmm6
611 movaps 0x10(%rsp),%xmm7
612 movaps 0x20(%rsp),%xmm8
613 movaps 0x30(%rsp),%xmm9
614 movaps 0x40(%rsp),%xmm10
615 add \$0x58,%rsp
616___
617$code.=<<___;
618 ret
619.LSEH_end_gcm_ghash_clmul:
620.size gcm_ghash_clmul,.-gcm_ghash_clmul
621___
622}
623
624$code.=<<___;
625.align 64
626.Lbswap_mask:
627 .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
628.L0x1c2_polynomial:
629 .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
630.align 64
631.type .Lrem_4bit,\@object
632.Lrem_4bit:
633 .long 0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`
634 .long 0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`
635 .long 0,`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`
636 .long 0,`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`
637.type .Lrem_8bit,\@object
638.Lrem_8bit:
639 .value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
640 .value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
641 .value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
642 .value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
643 .value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
644 .value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
645 .value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
646 .value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
647 .value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
648 .value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
649 .value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
650 .value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
651 .value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
652 .value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
653 .value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
654 .value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
655 .value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
656 .value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
657 .value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
658 .value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
659 .value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
660 .value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
661 .value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
662 .value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
663 .value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
664 .value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
665 .value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
666 .value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
667 .value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
668 .value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
669 .value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
670 .value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE
671
672.asciz "GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
673.align 64
674___
675
676# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
677# CONTEXT *context,DISPATCHER_CONTEXT *disp)
678if ($win64) {
679$rec="%rcx";
680$frame="%rdx";
681$context="%r8";
682$disp="%r9";
683
684$code.=<<___;
685.extern __imp_RtlVirtualUnwind
686.type se_handler,\@abi-omnipotent
687.align 16
688se_handler:
689 push %rsi
690 push %rdi
691 push %rbx
692 push %rbp
693 push %r12
694 push %r13
695 push %r14
696 push %r15
697 pushfq
698 sub \$64,%rsp
699
700 mov 120($context),%rax # pull context->Rax
701 mov 248($context),%rbx # pull context->Rip
702
703 mov 8($disp),%rsi # disp->ImageBase
704 mov 56($disp),%r11 # disp->HandlerData
705
706 mov 0(%r11),%r10d # HandlerData[0]
707 lea (%rsi,%r10),%r10 # prologue label
708 cmp %r10,%rbx # context->Rip<prologue label
709 jb .Lin_prologue
710
711 mov 152($context),%rax # pull context->Rsp
712
713 mov 4(%r11),%r10d # HandlerData[1]
714 lea (%rsi,%r10),%r10 # epilogue label
715 cmp %r10,%rbx # context->Rip>=epilogue label
716 jae .Lin_prologue
717
718 lea 24(%rax),%rax # adjust "rsp"
719
720 mov -8(%rax),%rbx
721 mov -16(%rax),%rbp
722 mov -24(%rax),%r12
723 mov %rbx,144($context) # restore context->Rbx
724 mov %rbp,160($context) # restore context->Rbp
725 mov %r12,216($context) # restore context->R12
726
727.Lin_prologue:
728 mov 8(%rax),%rdi
729 mov 16(%rax),%rsi
730 mov %rax,152($context) # restore context->Rsp
731 mov %rsi,168($context) # restore context->Rsi
732 mov %rdi,176($context) # restore context->Rdi
733
734 mov 40($disp),%rdi # disp->ContextRecord
735 mov $context,%rsi # context
736 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
737 .long 0xa548f3fc # cld; rep movsq
738
739 mov $disp,%rsi
740 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
741 mov 8(%rsi),%rdx # arg2, disp->ImageBase
742 mov 0(%rsi),%r8 # arg3, disp->ControlPc
743 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
744 mov 40(%rsi),%r10 # disp->ContextRecord
745 lea 56(%rsi),%r11 # &disp->HandlerData
746 lea 24(%rsi),%r12 # &disp->EstablisherFrame
747 mov %r10,32(%rsp) # arg5
748 mov %r11,40(%rsp) # arg6
749 mov %r12,48(%rsp) # arg7
750 mov %rcx,56(%rsp) # arg8, (NULL)
751 call *__imp_RtlVirtualUnwind(%rip)
752
753 mov \$1,%eax # ExceptionContinueSearch
754 add \$64,%rsp
755 popfq
756 pop %r15
757 pop %r14
758 pop %r13
759 pop %r12
760 pop %rbp
761 pop %rbx
762 pop %rdi
763 pop %rsi
764 ret
765.size se_handler,.-se_handler
766
767.section .pdata
768.align 4
769 .rva .LSEH_begin_gcm_gmult_4bit
770 .rva .LSEH_end_gcm_gmult_4bit
771 .rva .LSEH_info_gcm_gmult_4bit
772
773 .rva .LSEH_begin_gcm_ghash_4bit
774 .rva .LSEH_end_gcm_ghash_4bit
775 .rva .LSEH_info_gcm_ghash_4bit
776
777 .rva .LSEH_begin_gcm_ghash_clmul
778 .rva .LSEH_end_gcm_ghash_clmul
779 .rva .LSEH_info_gcm_ghash_clmul
780
781.section .xdata
782.align 8
783.LSEH_info_gcm_gmult_4bit:
784 .byte 9,0,0,0
785 .rva se_handler
786 .rva .Lgmult_prologue,.Lgmult_epilogue # HandlerData
787.LSEH_info_gcm_ghash_4bit:
788 .byte 9,0,0,0
789 .rva se_handler
790 .rva .Lghash_prologue,.Lghash_epilogue # HandlerData
791.LSEH_info_gcm_ghash_clmul:
792 .byte 0x01,0x1f,0x0b,0x00
793 .byte 0x1f,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10
794 .byte 0x19,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9
795 .byte 0x13,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8
796 .byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
797 .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6
798 .byte 0x04,0xa2,0x00,0x00 #sub rsp,0x58
799___
800}
801
802$code =~ s/\`([^\`]*)\`/eval($1)/gem;
803
804print $code;
805
806close STDOUT;
diff --git a/src/lib/libcrypto/modes/cbc128.c b/src/lib/libcrypto/modes/cbc128.c
deleted file mode 100644
index 0e54f75470..0000000000
--- a/src/lib/libcrypto/modes/cbc128.c
+++ /dev/null
@@ -1,205 +0,0 @@
1/* ====================================================================
2 * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in
13 * the documentation and/or other materials provided with the
14 * distribution.
15 *
16 * 3. All advertising materials mentioning features or use of this
17 * software must display the following acknowledgment:
18 * "This product includes software developed by the OpenSSL Project
19 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
20 *
21 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
22 * endorse or promote products derived from this software without
23 * prior written permission. For written permission, please contact
24 * openssl-core@openssl.org.
25 *
26 * 5. Products derived from this software may not be called "OpenSSL"
27 * nor may "OpenSSL" appear in their names without prior written
28 * permission of the OpenSSL Project.
29 *
30 * 6. Redistributions of any form whatsoever must retain the following
31 * acknowledgment:
32 * "This product includes software developed by the OpenSSL Project
33 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
34 *
35 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
36 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
39 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
41 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
42 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
43 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
44 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
45 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
46 * OF THE POSSIBILITY OF SUCH DAMAGE.
47 * ====================================================================
48 *
49 */
50
51#include <openssl/crypto.h>
52#include "modes_lcl.h"
53#include <string.h>
54
55#ifndef MODES_DEBUG
56# ifndef NDEBUG
57# define NDEBUG
58# endif
59#endif
60#include <assert.h>
61
62#ifndef STRICT_ALIGNMENT
63# define STRICT_ALIGNMENT 0
64#endif
65
66void CRYPTO_cbc128_encrypt(const unsigned char *in, unsigned char *out,
67 size_t len, const void *key,
68 unsigned char ivec[16], block128_f block)
69{
70 size_t n;
71 const unsigned char *iv = ivec;
72
73 assert(in && out && key && ivec);
74
75#if !defined(OPENSSL_SMALL_FOOTPRINT)
76 if (STRICT_ALIGNMENT &&
77 ((size_t)in|(size_t)out|(size_t)ivec)%sizeof(size_t) != 0) {
78 while (len>=16) {
79 for(n=0; n<16; ++n)
80 out[n] = in[n] ^ iv[n];
81 (*block)(out, out, key);
82 iv = out;
83 len -= 16;
84 in += 16;
85 out += 16;
86 }
87 } else {
88 while (len>=16) {
89 for(n=0; n<16; n+=sizeof(size_t))
90 *(size_t*)(out+n) =
91 *(size_t*)(in+n) ^ *(size_t*)(iv+n);
92 (*block)(out, out, key);
93 iv = out;
94 len -= 16;
95 in += 16;
96 out += 16;
97 }
98 }
99#endif
100 while (len) {
101 for(n=0; n<16 && n<len; ++n)
102 out[n] = in[n] ^ iv[n];
103 for(; n<16; ++n)
104 out[n] = iv[n];
105 (*block)(out, out, key);
106 iv = out;
107 if (len<=16) break;
108 len -= 16;
109 in += 16;
110 out += 16;
111 }
112 memcpy(ivec,iv,16);
113}
114
115void CRYPTO_cbc128_decrypt(const unsigned char *in, unsigned char *out,
116 size_t len, const void *key,
117 unsigned char ivec[16], block128_f block)
118{
119 size_t n;
120 union { size_t t[16/sizeof(size_t)]; unsigned char c[16]; } tmp;
121
122 assert(in && out && key && ivec);
123
124#if !defined(OPENSSL_SMALL_FOOTPRINT)
125 if (in != out) {
126 const unsigned char *iv = ivec;
127
128 if (STRICT_ALIGNMENT &&
129 ((size_t)in|(size_t)out|(size_t)ivec)%sizeof(size_t) != 0) {
130 while (len>=16) {
131 (*block)(in, out, key);
132 for(n=0; n<16; ++n)
133 out[n] ^= iv[n];
134 iv = in;
135 len -= 16;
136 in += 16;
137 out += 16;
138 }
139 }
140 else if (16%sizeof(size_t) == 0) { /* always true */
141 while (len>=16) {
142 size_t *out_t=(size_t *)out, *iv_t=(size_t *)iv;
143
144 (*block)(in, out, key);
145 for(n=0; n<16/sizeof(size_t); n++)
146 out_t[n] ^= iv_t[n];
147 iv = in;
148 len -= 16;
149 in += 16;
150 out += 16;
151 }
152 }
153 memcpy(ivec,iv,16);
154 } else {
155 if (STRICT_ALIGNMENT &&
156 ((size_t)in|(size_t)out|(size_t)ivec)%sizeof(size_t) != 0) {
157 unsigned char c;
158 while (len>=16) {
159 (*block)(in, tmp.c, key);
160 for(n=0; n<16; ++n) {
161 c = in[n];
162 out[n] = tmp.c[n] ^ ivec[n];
163 ivec[n] = c;
164 }
165 len -= 16;
166 in += 16;
167 out += 16;
168 }
169 }
170 else if (16%sizeof(size_t) == 0) { /* always true */
171 while (len>=16) {
172 size_t c, *out_t=(size_t *)out, *ivec_t=(size_t *)ivec;
173 const size_t *in_t=(const size_t *)in;
174
175 (*block)(in, tmp.c, key);
176 for(n=0; n<16/sizeof(size_t); n++) {
177 c = in_t[n];
178 out_t[n] = tmp.t[n] ^ ivec_t[n];
179 ivec_t[n] = c;
180 }
181 len -= 16;
182 in += 16;
183 out += 16;
184 }
185 }
186 }
187#endif
188 while (len) {
189 unsigned char c;
190 (*block)(in, tmp.c, key);
191 for(n=0; n<16 && n<len; ++n) {
192 c = in[n];
193 out[n] = tmp.c[n] ^ ivec[n];
194 ivec[n] = c;
195 }
196 if (len<=16) {
197 for (; n<16; ++n)
198 ivec[n] = in[n];
199 break;
200 }
201 len -= 16;
202 in += 16;
203 out += 16;
204 }
205}
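As a usage sketch (not part of this file): the key argument is opaque and is simply handed to the block128_f callback, so the usual pattern elsewhere in libcrypto is to pass an AES_KEY together with AES_encrypt. Something along these lines, with placeholder names and the length assumed to be a multiple of 16:

#include <stddef.h>
#include <openssl/aes.h>
#include <openssl/modes.h>

/* Illustrative only: CBC-encrypt len bytes (len assumed to be a multiple
 * of 16) with AES-128 via CRYPTO_cbc128_encrypt. iv is updated in place,
 * so consecutive calls chain correctly. */
static void
cbc_example(const unsigned char *in, unsigned char *out, size_t len,
    const unsigned char key16[16], unsigned char iv[16])
{
	AES_KEY aes;

	AES_set_encrypt_key(key16, 128, &aes);
	CRYPTO_cbc128_encrypt(in, out, len, &aes, iv, (block128_f)AES_encrypt);
}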
diff --git a/src/lib/libcrypto/modes/ccm128.c b/src/lib/libcrypto/modes/ccm128.c
deleted file mode 100644
index 3ce11d0d98..0000000000
--- a/src/lib/libcrypto/modes/ccm128.c
+++ /dev/null
@@ -1,441 +0,0 @@
1/* ====================================================================
2 * Copyright (c) 2011 The OpenSSL Project. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in
13 * the documentation and/or other materials provided with the
14 * distribution.
15 *
16 * 3. All advertising materials mentioning features or use of this
17 * software must display the following acknowledgment:
18 * "This product includes software developed by the OpenSSL Project
19 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
20 *
21 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
22 * endorse or promote products derived from this software without
23 * prior written permission. For written permission, please contact
24 * openssl-core@openssl.org.
25 *
26 * 5. Products derived from this software may not be called "OpenSSL"
27 * nor may "OpenSSL" appear in their names without prior written
28 * permission of the OpenSSL Project.
29 *
30 * 6. Redistributions of any form whatsoever must retain the following
31 * acknowledgment:
32 * "This product includes software developed by the OpenSSL Project
33 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
34 *
35 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
36 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
39 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
41 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
42 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
43 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
44 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
45 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
46 * OF THE POSSIBILITY OF SUCH DAMAGE.
47 * ====================================================================
48 */
49
50#include <openssl/crypto.h>
51#include "modes_lcl.h"
52#include <string.h>
53
54#ifndef MODES_DEBUG
55# ifndef NDEBUG
56# define NDEBUG
57# endif
58#endif
59#include <assert.h>
60
61/* First you set up the M and L parameters and pass the key schedule.
62 * This is called once per session setup... */
63void CRYPTO_ccm128_init(CCM128_CONTEXT *ctx,
64 unsigned int M,unsigned int L,void *key,block128_f block)
65{
66 memset(ctx->nonce.c,0,sizeof(ctx->nonce.c));
67 ctx->nonce.c[0] = ((u8)(L-1)&7) | (u8)(((M-2)/2)&7)<<3;
68 ctx->blocks = 0;
69 ctx->block = block;
70 ctx->key = key;
71}
72
73/* !!! Following interfaces are to be called *once* per packet !!! */
74
75/* Then you set up the per-message nonce and pass the length of the message */
76int CRYPTO_ccm128_setiv(CCM128_CONTEXT *ctx,
77 const unsigned char *nonce,size_t nlen,size_t mlen)
78{
79 unsigned int L = ctx->nonce.c[0]&7; /* the L parameter */
80
81 if (nlen<(14-L)) return -1; /* nonce is too short */
82
83 if (sizeof(mlen)==8 && L>=3) {
84 ctx->nonce.c[8] = (u8)(mlen>>(56%(sizeof(mlen)*8)));
85 ctx->nonce.c[9] = (u8)(mlen>>(48%(sizeof(mlen)*8)));
86 ctx->nonce.c[10] = (u8)(mlen>>(40%(sizeof(mlen)*8)));
87 ctx->nonce.c[11] = (u8)(mlen>>(32%(sizeof(mlen)*8)));
88 }
89 else
90 ctx->nonce.u[1] = 0;
91
92 ctx->nonce.c[12] = (u8)(mlen>>24);
93 ctx->nonce.c[13] = (u8)(mlen>>16);
94 ctx->nonce.c[14] = (u8)(mlen>>8);
95 ctx->nonce.c[15] = (u8)mlen;
96
97 ctx->nonce.c[0] &= ~0x40; /* clear Adata flag */
98 memcpy(&ctx->nonce.c[1],nonce,14-L);
99
100 return 0;
101}
102
103/* Then you pass the additional authentication data; this is optional */
104void CRYPTO_ccm128_aad(CCM128_CONTEXT *ctx,
105 const unsigned char *aad,size_t alen)
106{ unsigned int i;
107 block128_f block = ctx->block;
108
109 if (alen==0) return;
110
111 ctx->nonce.c[0] |= 0x40; /* set Adata flag */
112 (*block)(ctx->nonce.c,ctx->cmac.c,ctx->key),
113 ctx->blocks++;
114
115 if (alen<(0x10000-0x100)) {
116 ctx->cmac.c[0] ^= (u8)(alen>>8);
117 ctx->cmac.c[1] ^= (u8)alen;
118 i=2;
119 }
120 else if (sizeof(alen)==8 && alen>=(size_t)1<<(32%(sizeof(alen)*8))) {
121 ctx->cmac.c[0] ^= 0xFF;
122 ctx->cmac.c[1] ^= 0xFF;
123 ctx->cmac.c[2] ^= (u8)(alen>>(56%(sizeof(alen)*8)));
124 ctx->cmac.c[3] ^= (u8)(alen>>(48%(sizeof(alen)*8)));
125 ctx->cmac.c[4] ^= (u8)(alen>>(40%(sizeof(alen)*8)));
126 ctx->cmac.c[5] ^= (u8)(alen>>(32%(sizeof(alen)*8)));
127 ctx->cmac.c[6] ^= (u8)(alen>>24);
128 ctx->cmac.c[7] ^= (u8)(alen>>16);
129 ctx->cmac.c[8] ^= (u8)(alen>>8);
130 ctx->cmac.c[9] ^= (u8)alen;
131 i=10;
132 }
133 else {
134 ctx->cmac.c[0] ^= 0xFF;
135 ctx->cmac.c[1] ^= 0xFE;
136 ctx->cmac.c[2] ^= (u8)(alen>>24);
137 ctx->cmac.c[3] ^= (u8)(alen>>16);
138 ctx->cmac.c[4] ^= (u8)(alen>>8);
139 ctx->cmac.c[5] ^= (u8)alen;
140 i=6;
141 }
142
143 do {
144 for(;i<16 && alen;++i,++aad,--alen)
145 ctx->cmac.c[i] ^= *aad;
146 (*block)(ctx->cmac.c,ctx->cmac.c,ctx->key),
147 ctx->blocks++;
148 i=0;
149 } while (alen);
150}
151
152/* Finally you encrypt or decrypt the message */
153
154/* The counter part of the nonce may not be larger than L*8 bits;
155 * L is not larger than 8, hence a 64-bit counter... */
156static void ctr64_inc(unsigned char *counter) {
157 unsigned int n=8;
158 u8 c;
159
160 counter += 8;
161 do {
162 --n;
163 c = counter[n];
164 ++c;
165 counter[n] = c;
166 if (c) return;
167 } while (n);
168}
169
170int CRYPTO_ccm128_encrypt(CCM128_CONTEXT *ctx,
171 const unsigned char *inp, unsigned char *out,
172 size_t len)
173{
174 size_t n;
175 unsigned int i,L;
176 unsigned char flags0 = ctx->nonce.c[0];
177 block128_f block = ctx->block;
178 void * key = ctx->key;
179 union { u64 u[2]; u8 c[16]; } scratch;
180
181 if (!(flags0&0x40))
182 (*block)(ctx->nonce.c,ctx->cmac.c,key),
183 ctx->blocks++;
184
185 ctx->nonce.c[0] = L = flags0&7;
186 for (n=0,i=15-L;i<15;++i) {
187 n |= ctx->nonce.c[i];
188 ctx->nonce.c[i]=0;
189 n <<= 8;
190 }
191 n |= ctx->nonce.c[15]; /* reconstructed length */
192 ctx->nonce.c[15]=1;
193
194 if (n!=len) return -1; /* length mismatch */
195
196 ctx->blocks += ((len+15)>>3)|1;
197 if (ctx->blocks > (U64(1)<<61)) return -2; /* too much data */
198
199 while (len>=16) {
200#if defined(STRICT_ALIGNMENT)
201 union { u64 u[2]; u8 c[16]; } temp;
202
203 memcpy (temp.c,inp,16);
204 ctx->cmac.u[0] ^= temp.u[0];
205 ctx->cmac.u[1] ^= temp.u[1];
206#else
207 ctx->cmac.u[0] ^= ((u64*)inp)[0];
208 ctx->cmac.u[1] ^= ((u64*)inp)[1];
209#endif
210 (*block)(ctx->cmac.c,ctx->cmac.c,key);
211 (*block)(ctx->nonce.c,scratch.c,key);
212 ctr64_inc(ctx->nonce.c);
213#if defined(STRICT_ALIGNMENT)
214 temp.u[0] ^= scratch.u[0];
215 temp.u[1] ^= scratch.u[1];
216 memcpy(out,temp.c,16);
217#else
218 ((u64*)out)[0] = scratch.u[0]^((u64*)inp)[0];
219 ((u64*)out)[1] = scratch.u[1]^((u64*)inp)[1];
220#endif
221 inp += 16;
222 out += 16;
223 len -= 16;
224 }
225
226 if (len) {
227 for (i=0; i<len; ++i) ctx->cmac.c[i] ^= inp[i];
228 (*block)(ctx->cmac.c,ctx->cmac.c,key);
229 (*block)(ctx->nonce.c,scratch.c,key);
230 for (i=0; i<len; ++i) out[i] = scratch.c[i]^inp[i];
231 }
232
233 for (i=15-L;i<16;++i)
234 ctx->nonce.c[i]=0;
235
236 (*block)(ctx->nonce.c,scratch.c,key);
237 ctx->cmac.u[0] ^= scratch.u[0];
238 ctx->cmac.u[1] ^= scratch.u[1];
239
240 ctx->nonce.c[0] = flags0;
241
242 return 0;
243}
244
245int CRYPTO_ccm128_decrypt(CCM128_CONTEXT *ctx,
246 const unsigned char *inp, unsigned char *out,
247 size_t len)
248{
249 size_t n;
250 unsigned int i,L;
251 unsigned char flags0 = ctx->nonce.c[0];
252 block128_f block = ctx->block;
253 void * key = ctx->key;
254 union { u64 u[2]; u8 c[16]; } scratch;
255
256 if (!(flags0&0x40))
257 (*block)(ctx->nonce.c,ctx->cmac.c,key);
258
259 ctx->nonce.c[0] = L = flags0&7;
260 for (n=0,i=15-L;i<15;++i) {
261 n |= ctx->nonce.c[i];
262 ctx->nonce.c[i]=0;
263 n <<= 8;
264 }
265 n |= ctx->nonce.c[15]; /* reconstructed length */
266 ctx->nonce.c[15]=1;
267
268 if (n!=len) return -1;
269
270 while (len>=16) {
271#if defined(STRICT_ALIGNMENT)
272 union { u64 u[2]; u8 c[16]; } temp;
273#endif
274 (*block)(ctx->nonce.c,scratch.c,key);
275 ctr64_inc(ctx->nonce.c);
276#if defined(STRICT_ALIGNMENT)
277 memcpy (temp.c,inp,16);
278 ctx->cmac.u[0] ^= (scratch.u[0] ^= temp.u[0]);
279 ctx->cmac.u[1] ^= (scratch.u[1] ^= temp.u[1]);
280 memcpy (out,scratch.c,16);
281#else
282 ctx->cmac.u[0] ^= (((u64*)out)[0] = scratch.u[0]^((u64*)inp)[0]);
283 ctx->cmac.u[1] ^= (((u64*)out)[1] = scratch.u[1]^((u64*)inp)[1]);
284#endif
285 (*block)(ctx->cmac.c,ctx->cmac.c,key);
286
287 inp += 16;
288 out += 16;
289 len -= 16;
290 }
291
292 if (len) {
293 (*block)(ctx->nonce.c,scratch.c,key);
294 for (i=0; i<len; ++i)
295 ctx->cmac.c[i] ^= (out[i] = scratch.c[i]^inp[i]);
296 (*block)(ctx->cmac.c,ctx->cmac.c,key);
297 }
298
299 for (i=15-L;i<16;++i)
300 ctx->nonce.c[i]=0;
301
302 (*block)(ctx->nonce.c,scratch.c,key);
303 ctx->cmac.u[0] ^= scratch.u[0];
304 ctx->cmac.u[1] ^= scratch.u[1];
305
306 ctx->nonce.c[0] = flags0;
307
308 return 0;
309}
310
311static void ctr64_add (unsigned char *counter,size_t inc)
312{ size_t n=8, val=0;
313
314 counter += 8;
315 do {
316 --n;
317 val += counter[n] + (inc&0xff);
318 counter[n] = (unsigned char)val;
319 val >>= 8; /* carry bit */
320 inc >>= 8;
321 } while(n && (inc || val));
322}
323
324int CRYPTO_ccm128_encrypt_ccm64(CCM128_CONTEXT *ctx,
325 const unsigned char *inp, unsigned char *out,
326 size_t len,ccm128_f stream)
327{
328 size_t n;
329 unsigned int i,L;
330 unsigned char flags0 = ctx->nonce.c[0];
331 block128_f block = ctx->block;
332 void * key = ctx->key;
333 union { u64 u[2]; u8 c[16]; } scratch;
334
335 if (!(flags0&0x40))
336 (*block)(ctx->nonce.c,ctx->cmac.c,key),
337 ctx->blocks++;
338
339 ctx->nonce.c[0] = L = flags0&7;
340 for (n=0,i=15-L;i<15;++i) {
341 n |= ctx->nonce.c[i];
342 ctx->nonce.c[i]=0;
343 n <<= 8;
344 }
345 n |= ctx->nonce.c[15]; /* reconstructed length */
346 ctx->nonce.c[15]=1;
347
348 if (n!=len) return -1; /* length mismatch */
349
350 ctx->blocks += ((len+15)>>3)|1;
351 if (ctx->blocks > (U64(1)<<61)) return -2; /* too much data */
352
353 if ((n=len/16)) {
354 (*stream)(inp,out,n,key,ctx->nonce.c,ctx->cmac.c);
355 n *= 16;
356 inp += n;
357 out += n;
358 len -= n;
359 if (len) ctr64_add(ctx->nonce.c,n/16);
360 }
361
362 if (len) {
363 for (i=0; i<len; ++i) ctx->cmac.c[i] ^= inp[i];
364 (*block)(ctx->cmac.c,ctx->cmac.c,key);
365 (*block)(ctx->nonce.c,scratch.c,key);
366 for (i=0; i<len; ++i) out[i] = scratch.c[i]^inp[i];
367 }
368
369 for (i=15-L;i<16;++i)
370 ctx->nonce.c[i]=0;
371
372 (*block)(ctx->nonce.c,scratch.c,key);
373 ctx->cmac.u[0] ^= scratch.u[0];
374 ctx->cmac.u[1] ^= scratch.u[1];
375
376 ctx->nonce.c[0] = flags0;
377
378 return 0;
379}
380
381int CRYPTO_ccm128_decrypt_ccm64(CCM128_CONTEXT *ctx,
382 const unsigned char *inp, unsigned char *out,
383 size_t len,ccm128_f stream)
384{
385 size_t n;
386 unsigned int i,L;
387 unsigned char flags0 = ctx->nonce.c[0];
388 block128_f block = ctx->block;
389 void * key = ctx->key;
390 union { u64 u[2]; u8 c[16]; } scratch;
391
392 if (!(flags0&0x40))
393 (*block)(ctx->nonce.c,ctx->cmac.c,key);
394
395 ctx->nonce.c[0] = L = flags0&7;
396 for (n=0,i=15-L;i<15;++i) {
397 n |= ctx->nonce.c[i];
398 ctx->nonce.c[i]=0;
399 n <<= 8;
400 }
401 n |= ctx->nonce.c[15]; /* reconstructed length */
402 ctx->nonce.c[15]=1;
403
404 if (n!=len) return -1;
405
406 if ((n=len/16)) {
407 (*stream)(inp,out,n,key,ctx->nonce.c,ctx->cmac.c);
408 n *= 16;
409 inp += n;
410 out += n;
411 len -= n;
412 if (len) ctr64_add(ctx->nonce.c,n/16);
413 }
414
415 if (len) {
416 (*block)(ctx->nonce.c,scratch.c,key);
417 for (i=0; i<len; ++i)
418 ctx->cmac.c[i] ^= (out[i] = scratch.c[i]^inp[i]);
419 (*block)(ctx->cmac.c,ctx->cmac.c,key);
420 }
421
422 for (i=15-L;i<16;++i)
423 ctx->nonce.c[i]=0;
424
425 (*block)(ctx->nonce.c,scratch.c,key);
426 ctx->cmac.u[0] ^= scratch.u[0];
427 ctx->cmac.u[1] ^= scratch.u[1];
428
429 ctx->nonce.c[0] = flags0;
430
431 return 0;
432}
433
434size_t CRYPTO_ccm128_tag(CCM128_CONTEXT *ctx,unsigned char *tag,size_t len)
435{ unsigned int M = (ctx->nonce.c[0]>>3)&7; /* the M parameter */
436
437 M *= 2; M += 2;
438 if (len<M) return 0;
439 memcpy(tag,ctx->cmac.c,M);
440 return M;
441}
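The comments above describe the intended call sequence: init once per key, then setiv, optional aad, encrypt or decrypt, and tag once per packet. A sketch of that flow with AES as the block cipher (illustrative only; it includes the internal modes_lcl.h because that is where the CCM128_CONTEXT layout lives, and the parameter choices M=16, L=2 are arbitrary examples):

#include <stddef.h>
#include <openssl/aes.h>
#include <openssl/modes.h>
#include "modes_lcl.h"		/* CCM128_CONTEXT definition */

/* Illustrative only: one-shot CCM "seal" following the call sequence
 * described above. M=16 (tag bytes) and L=2 (length-field bytes) are
 * example parameters; nlen must be long enough for this choice of L. */
static int
ccm_seal_example(unsigned char *out, unsigned char tag[16],
    const unsigned char *msg, size_t mlen,
    const unsigned char *aad, size_t alen,
    const unsigned char *nonce, size_t nlen,
    const unsigned char key16[16])
{
	AES_KEY aes;
	CCM128_CONTEXT ccm;

	AES_set_encrypt_key(key16, 128, &aes);
	CRYPTO_ccm128_init(&ccm, 16, 2, &aes, (block128_f)AES_encrypt);

	if (CRYPTO_ccm128_setiv(&ccm, nonce, nlen, mlen) != 0)
		return -1;		/* nonce too short */
	if (alen)
		CRYPTO_ccm128_aad(&ccm, aad, alen);
	if (CRYPTO_ccm128_encrypt(&ccm, msg, out, mlen) != 0)
		return -1;		/* length mismatch or too much data */
	return CRYPTO_ccm128_tag(&ccm, tag, 16) == 16 ? 0 : -1;
}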
diff --git a/src/lib/libcrypto/modes/cfb128.c b/src/lib/libcrypto/modes/cfb128.c
deleted file mode 100644
index 4e6f5d35e1..0000000000
--- a/src/lib/libcrypto/modes/cfb128.c
+++ /dev/null
@@ -1,242 +0,0 @@
1/* ====================================================================
2 * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in
13 * the documentation and/or other materials provided with the
14 * distribution.
15 *
16 * 3. All advertising materials mentioning features or use of this
17 * software must display the following acknowledgment:
18 * "This product includes software developed by the OpenSSL Project
19 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
20 *
21 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
22 * endorse or promote products derived from this software without
23 * prior written permission. For written permission, please contact
24 * openssl-core@openssl.org.
25 *
26 * 5. Products derived from this software may not be called "OpenSSL"
27 * nor may "OpenSSL" appear in their names without prior written
28 * permission of the OpenSSL Project.
29 *
30 * 6. Redistributions of any form whatsoever must retain the following
31 * acknowledgment:
32 * "This product includes software developed by the OpenSSL Project
33 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
34 *
35 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
36 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
39 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
41 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
42 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
43 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
44 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
45 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
46 * OF THE POSSIBILITY OF SUCH DAMAGE.
47 * ====================================================================
48 *
49 */
50
51#include <openssl/crypto.h>
52#include "modes_lcl.h"
53#include <string.h>
54
55#ifndef MODES_DEBUG
56# ifndef NDEBUG
57# define NDEBUG
58# endif
59#endif
60#include <assert.h>
61
62/* The input and output are encrypted as though 128-bit CFB mode is being
63 * used. The extra state information recording how much of the
64 * 128-bit block we have used is contained in *num.
65 */
66void CRYPTO_cfb128_encrypt(const unsigned char *in, unsigned char *out,
67 size_t len, const void *key,
68 unsigned char ivec[16], int *num,
69 int enc, block128_f block)
70{
71 unsigned int n;
72 size_t l = 0;
73
74 assert(in && out && key && ivec && num);
75
76 n = *num;
77
78 if (enc) {
79#if !defined(OPENSSL_SMALL_FOOTPRINT)
80 if (16%sizeof(size_t) == 0) do { /* always true actually */
81 while (n && len) {
82 *(out++) = ivec[n] ^= *(in++);
83 --len;
84 n = (n+1) % 16;
85 }
86#if defined(STRICT_ALIGNMENT)
87 if (((size_t)in|(size_t)out|(size_t)ivec)%sizeof(size_t) != 0)
88 break;
89#endif
90 while (len>=16) {
91 (*block)(ivec, ivec, key);
92 for (; n<16; n+=sizeof(size_t)) {
93 *(size_t*)(out+n) =
94 *(size_t*)(ivec+n) ^= *(size_t*)(in+n);
95 }
96 len -= 16;
97 out += 16;
98 in += 16;
99 n = 0;
100 }
101 if (len) {
102 (*block)(ivec, ivec, key);
103 while (len--) {
104 out[n] = ivec[n] ^= in[n];
105 ++n;
106 }
107 }
108 *num = n;
109 return;
110 } while (0);
111 /* the rest would be commonly eliminated by x86* compiler */
112#endif
113 while (l<len) {
114 if (n == 0) {
115 (*block)(ivec, ivec, key);
116 }
117 out[l] = ivec[n] ^= in[l];
118 ++l;
119 n = (n+1) % 16;
120 }
121 *num = n;
122 } else {
123#if !defined(OPENSSL_SMALL_FOOTPRINT)
124 if (16%sizeof(size_t) == 0) do { /* always true actually */
125 while (n && len) {
126 unsigned char c;
127 *(out++) = ivec[n] ^ (c = *(in++)); ivec[n] = c;
128 --len;
129 n = (n+1) % 16;
130 }
131#if defined(STRICT_ALIGNMENT)
132 if (((size_t)in|(size_t)out|(size_t)ivec)%sizeof(size_t) != 0)
133 break;
134#endif
135 while (len>=16) {
136 (*block)(ivec, ivec, key);
137 for (; n<16; n+=sizeof(size_t)) {
138 size_t t = *(size_t*)(in+n);
139 *(size_t*)(out+n) = *(size_t*)(ivec+n) ^ t;
140 *(size_t*)(ivec+n) = t;
141 }
142 len -= 16;
143 out += 16;
144 in += 16;
145 n = 0;
146 }
147 if (len) {
148 (*block)(ivec, ivec, key);
149 while (len--) {
150 unsigned char c;
151 out[n] = ivec[n] ^ (c = in[n]); ivec[n] = c;
152 ++n;
153 }
154 }
155 *num = n;
156 return;
157 } while (0);
158 /* the rest would be commonly eliminated by x86* compiler */
159#endif
160 while (l<len) {
161 unsigned char c;
162 if (n == 0) {
163 (*block)(ivec, ivec, key);
164 }
165 out[l] = ivec[n] ^ (c = in[l]); ivec[n] = c;
166 ++l;
167 n = (n+1) % 16;
168 }
169 *num=n;
170 }
171}
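A usage sketch for the full-block interface above (illustrative only, with placeholder names): the underlying cipher is always invoked in its encrypt direction, for both enc=1 and enc=0, and *num carries the offset into the current keystream block between calls.

#include <stddef.h>
#include <openssl/aes.h>
#include <openssl/modes.h>

/* Illustrative only: CFB128 with AES-128. num must start at 0 for a new
 * stream and, together with iv, be preserved between calls on the same
 * stream. The same routine encrypts (enc=1) and decrypts (enc=0). */
static void
cfb_example(const unsigned char *in, unsigned char *out, size_t len,
    const unsigned char key16[16], unsigned char iv[16], int enc)
{
	AES_KEY aes;
	int num = 0;

	AES_set_encrypt_key(key16, 128, &aes);
	CRYPTO_cfb128_encrypt(in, out, len, &aes, iv, &num, enc,
	    (block128_f)AES_encrypt);
}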
172
173/* This expects a single block of size nbits for both in and out. Note that
174 it corrupts any extra bits in the last byte of out */
175static void cfbr_encrypt_block(const unsigned char *in,unsigned char *out,
176 int nbits,const void *key,
177 unsigned char ivec[16],int enc,
178 block128_f block)
179{
180 int n,rem,num;
181	unsigned char ovec[16*2 + 1]; /* +1 because we dereference (but don't use) one byte off the end */
182
183 if (nbits<=0 || nbits>128) return;
184
185 /* fill in the first half of the new IV with the current IV */
186 memcpy(ovec,ivec,16);
187 /* construct the new IV */
188 (*block)(ivec,ivec,key);
189 num = (nbits+7)/8;
190 if (enc) /* encrypt the input */
191 for(n=0 ; n < num ; ++n)
192 out[n] = (ovec[16+n] = in[n] ^ ivec[n]);
193 else /* decrypt the input */
194 for(n=0 ; n < num ; ++n)
195 out[n] = (ovec[16+n] = in[n]) ^ ivec[n];
196 /* shift ovec left... */
197 rem = nbits%8;
198 num = nbits/8;
199 if(rem==0)
200 memcpy(ivec,ovec+num,16);
201 else
202 for(n=0 ; n < 16 ; ++n)
203 ivec[n] = ovec[n+num]<<rem | ovec[n+num+1]>>(8-rem);
204
205 /* it is not necessary to cleanse ovec, since the IV is not secret */
206}
207
208/* N.B. This expects the input to be packed, MS bit first */
209void CRYPTO_cfb128_1_encrypt(const unsigned char *in, unsigned char *out,
210 size_t bits, const void *key,
211 unsigned char ivec[16], int *num,
212 int enc, block128_f block)
213{
214 size_t n;
215 unsigned char c[1],d[1];
216
217 assert(in && out && key && ivec && num);
218 assert(*num == 0);
219
220 for(n=0 ; n<bits ; ++n)
221 {
222 c[0]=(in[n/8]&(1 << (7-n%8))) ? 0x80 : 0;
223 cfbr_encrypt_block(c,d,1,key,ivec,enc,block);
224 out[n/8]=(out[n/8]&~(1 << (unsigned int)(7-n%8))) |
225 ((d[0]&0x80) >> (unsigned int)(n%8));
226 }
227}
228
229void CRYPTO_cfb128_8_encrypt(const unsigned char *in, unsigned char *out,
230 size_t length, const void *key,
231 unsigned char ivec[16], int *num,
232 int enc, block128_f block)
233{
234 size_t n;
235
236 assert(in && out && key && ivec && num);
237 assert(*num == 0);
238
239 for(n=0 ; n<length ; ++n)
240 cfbr_encrypt_block(&in[n],&out[n],8,key,ivec,enc,block);
241}
242
diff --git a/src/lib/libcrypto/modes/ctr128.c b/src/lib/libcrypto/modes/ctr128.c
deleted file mode 100644
index ee642c5863..0000000000
--- a/src/lib/libcrypto/modes/ctr128.c
+++ /dev/null
@@ -1,252 +0,0 @@
1/* ====================================================================
2 * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in
13 * the documentation and/or other materials provided with the
14 * distribution.
15 *
16 * 3. All advertising materials mentioning features or use of this
17 * software must display the following acknowledgment:
18 * "This product includes software developed by the OpenSSL Project
19 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
20 *
21 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
22 * endorse or promote products derived from this software without
23 * prior written permission. For written permission, please contact
24 * openssl-core@openssl.org.
25 *
26 * 5. Products derived from this software may not be called "OpenSSL"
27 * nor may "OpenSSL" appear in their names without prior written
28 * permission of the OpenSSL Project.
29 *
30 * 6. Redistributions of any form whatsoever must retain the following
31 * acknowledgment:
32 * "This product includes software developed by the OpenSSL Project
33 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
34 *
35 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
36 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
39 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
41 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
42 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
43 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
44 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
45 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
46 * OF THE POSSIBILITY OF SUCH DAMAGE.
47 * ====================================================================
48 *
49 */
50
51#include <openssl/crypto.h>
52#include "modes_lcl.h"
53#include <string.h>
54
55#ifndef MODES_DEBUG
56# ifndef NDEBUG
57# define NDEBUG
58# endif
59#endif
60#include <assert.h>
61
62/* NOTE: the IV/counter in CTR mode is big-endian. The code itself
63 * is endian-neutral. */
64
65/* increment counter (128-bit int) by 1 */
66static void ctr128_inc(unsigned char *counter) {
67 u32 n=16;
68 u8 c;
69
70 do {
71 --n;
72 c = counter[n];
73 ++c;
74 counter[n] = c;
75 if (c) return;
76 } while (n);
77}
78
79#if !defined(OPENSSL_SMALL_FOOTPRINT)
80static void ctr128_inc_aligned(unsigned char *counter) {
81 size_t *data,c,n;
82 const union { long one; char little; } is_endian = {1};
83
84 if (is_endian.little) {
85 ctr128_inc(counter);
86 return;
87 }
88
89 data = (size_t *)counter;
90 n = 16/sizeof(size_t);
91 do {
92 --n;
93 c = data[n];
94 ++c;
95 data[n] = c;
96 if (c) return;
97 } while (n);
98}
99#endif
100
101/* The input is encrypted as though 128-bit counter mode is being
102 * used. The extra state information to record how much of the
103 * 128bit block we have used is contained in *num, and the
104 * encrypted counter is kept in ecount_buf. Both *num and
105 * ecount_buf must be initialised with zeros before the first
106 * call to CRYPTO_ctr128_encrypt().
107 *
108 * This algorithm assumes that the counter is in the x lower bits
109 * of the IV (ivec), and that the application has full control over
110 * overflow and the rest of the IV. This implementation takes NO
111 * responsibility for checking that the counter doesn't overflow
112 * into the rest of the IV when incremented.
113 */
114void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out,
115 size_t len, const void *key,
116 unsigned char ivec[16], unsigned char ecount_buf[16],
117 unsigned int *num, block128_f block)
118{
119 unsigned int n;
120 size_t l=0;
121
122 assert(in && out && key && ecount_buf && num);
123 assert(*num < 16);
124
125 n = *num;
126
127#if !defined(OPENSSL_SMALL_FOOTPRINT)
128 if (16%sizeof(size_t) == 0) do { /* always true actually */
129 while (n && len) {
130 *(out++) = *(in++) ^ ecount_buf[n];
131 --len;
132 n = (n+1) % 16;
133 }
134
135#if defined(STRICT_ALIGNMENT)
136 if (((size_t)in|(size_t)out|(size_t)ivec)%sizeof(size_t) != 0)
137 break;
138#endif
139 while (len>=16) {
140 (*block)(ivec, ecount_buf, key);
141 ctr128_inc_aligned(ivec);
142 for (; n<16; n+=sizeof(size_t))
143 *(size_t *)(out+n) =
144 *(size_t *)(in+n) ^ *(size_t *)(ecount_buf+n);
145 len -= 16;
146 out += 16;
147 in += 16;
148 n = 0;
149 }
150 if (len) {
151 (*block)(ivec, ecount_buf, key);
152 ctr128_inc_aligned(ivec);
153 while (len--) {
154 out[n] = in[n] ^ ecount_buf[n];
155 ++n;
156 }
157 }
158 *num = n;
159 return;
160 } while(0);
161	/* the rest would commonly be eliminated by an x86* compiler */
162#endif
163 while (l<len) {
164 if (n==0) {
165 (*block)(ivec, ecount_buf, key);
166 ctr128_inc(ivec);
167 }
168 out[l] = in[l] ^ ecount_buf[n];
169 ++l;
170 n = (n+1) % 16;
171 }
172
173 *num=n;
174}
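/*
 * A minimal usage sketch of CRYPTO_ctr128_encrypt() (a sketch only, assuming
 * AES via <openssl/aes.h> supplies the block128_f, cast the same way the CTS
 * self-test in cts128.c does). Note that num and ecount_buf start out zeroed,
 * as required above, and persist across calls so a partially used keystream
 * block can be continued; the same call decrypts, since CTR is symmetric.
 */
static void ctr_demo(const unsigned char *in, unsigned char *out, size_t len,
    const AES_KEY *key, const unsigned char iv[16])
{
	unsigned char ivec[16], ecount_buf[16] = {0};
	unsigned int num = 0;

	memcpy(ivec, iv, 16);		/* CTR consumes and updates the IV */
	CRYPTO_ctr128_encrypt(in, out, len, key, ivec, ecount_buf, &num,
	    (block128_f)AES_encrypt);
}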
175
176/* increment upper 96 bits of 128-bit counter by 1 */
177static void ctr96_inc(unsigned char *counter) {
178 u32 n=12;
179 u8 c;
180
181 do {
182 --n;
183 c = counter[n];
184 ++c;
185 counter[n] = c;
186 if (c) return;
187 } while (n);
188}
189
190void CRYPTO_ctr128_encrypt_ctr32(const unsigned char *in, unsigned char *out,
191 size_t len, const void *key,
192 unsigned char ivec[16], unsigned char ecount_buf[16],
193 unsigned int *num, ctr128_f func)
194{
195 unsigned int n,ctr32;
196
197 assert(in && out && key && ecount_buf && num);
198 assert(*num < 16);
199
200 n = *num;
201
202 while (n && len) {
203 *(out++) = *(in++) ^ ecount_buf[n];
204 --len;
205 n = (n+1) % 16;
206 }
207
208 ctr32 = GETU32(ivec+12);
209 while (len>=16) {
210 size_t blocks = len/16;
211 /*
212 * 1<<28 is just a not-so-small yet not-so-large number...
213		 * The condition below is practically never met, but it has to
214		 * be checked for code correctness.
215 */
216 if (sizeof(size_t)>sizeof(unsigned int) && blocks>(1U<<28))
217 blocks = (1U<<28);
218 /*
219		 * As (*func) operates on a 32-bit counter, the caller
220		 * has to handle the overflow. The 'if' below detects the
221		 * overflow, which is then handled by limiting the
222		 * number of blocks to the exact overflow point...
223 */
224 ctr32 += (u32)blocks;
225 if (ctr32 < blocks) {
226 blocks -= ctr32;
227 ctr32 = 0;
228 }
229 (*func)(in,out,blocks,key,ivec);
230 /* (*ctr) does not update ivec, caller does: */
231 PUTU32(ivec+12,ctr32);
232		/* ... overflow was detected, propagate carry. */
233 if (ctr32 == 0) ctr96_inc(ivec);
234 blocks *= 16;
235 len -= blocks;
236 out += blocks;
237 in += blocks;
238 }
239 if (len) {
240 memset(ecount_buf,0,16);
241 (*func)(ecount_buf,ecount_buf,1,key,ivec);
242 ++ctr32;
243 PUTU32(ivec+12,ctr32);
244 if (ctr32 == 0) ctr96_inc(ivec);
245 while (len--) {
246 out[n] = in[n] ^ ecount_buf[n];
247 ++n;
248 }
249 }
250
251 *num=n;
252}
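/*
 * For illustration, a portable ctr128_f could be sketched on top of a plain
 * block cipher as below (a sketch only, assuming AES_encrypt/AES_KEY from
 * <openssl/aes.h>; real callers would plug in an optimized counter-mode
 * kernel instead). It honours the contract relied on above: it processes
 * `blocks` full 16-byte blocks, only bumps the low 32 bits of the counter,
 * and never writes to ivec itself, since CRYPTO_ctr128_encrypt_ctr32()
 * updates ivec afterwards.
 */
static void ctr32_demo(const unsigned char *in, unsigned char *out,
    size_t blocks, const void *key, const unsigned char ivec[16])
{
	unsigned char ctrblk[16], keystream[16];
	u32 ctr = GETU32(ivec + 12);
	size_t i;

	memcpy(ctrblk, ivec, 16);
	while (blocks--) {
		AES_encrypt(ctrblk, keystream, key);
		for (i = 0; i < 16; ++i)
			*(out++) = *(in++) ^ keystream[i];
		PUTU32(ctrblk + 12, ++ctr);	/* 32-bit wrap handled by the caller */
	}
}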
diff --git a/src/lib/libcrypto/modes/cts128.c b/src/lib/libcrypto/modes/cts128.c
deleted file mode 100644
index 2d583de6f6..0000000000
--- a/src/lib/libcrypto/modes/cts128.c
+++ /dev/null
@@ -1,453 +0,0 @@
1/* ====================================================================
2 * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
3 *
4 * Rights for redistribution and usage in source and binary
5 * forms are granted according to the OpenSSL license.
6 */
7
8#include <openssl/crypto.h>
9#include "modes_lcl.h"
10#include <string.h>
11
12#ifndef MODES_DEBUG
13# ifndef NDEBUG
14# define NDEBUG
15# endif
16#endif
17#include <assert.h>
18
19/*
20 * The trouble with Ciphertext Stealing (CTS) mode is that there is no
21 * common official specification, only a couple of cipher/application-
22 * specific ones: RFC 2040 and RFC 3962. Then there is the 'Proposal to
23 * Extend CBC Mode By "Ciphertext Stealing"' on the NIST site, which
24 * deviates from the RFCs just mentioned. Most notably it allows the input
25 * to be exactly one block long and it doesn't flip the order of the last
26 * two blocks. CTS has even been discussed in an ECB context, but it has
27 * not been adopted for any known application. This implementation
28 * provides two interfaces: one compliant with the above-mentioned RFCs
29 * and one compliant with the NIST proposal, both extending CBC mode.
30 */
31
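/*
 * A minimal usage sketch of the two encrypt interfaces below (a sketch only,
 * assuming an AES_KEY prepared with AES_set_encrypt_key(), as in the
 * self-test at the end of this file). The RFC-style functions require
 * len > 16 and emit the last two ciphertext blocks in swapped order; the
 * NIST-style functions accept any len >= 16 and keep the natural block order.
 */
static size_t cts_demo(const unsigned char *pt, unsigned char *ct, size_t len,
    const AES_KEY *key, int nist)
{
	unsigned char iv[16] = {0};	/* zero IV, as in the RFC 3962 vectors */

	if (nist)
		return CRYPTO_nistcts128_encrypt_block(pt, ct, len, key, iv,
		    (block128_f)AES_encrypt);
	return CRYPTO_cts128_encrypt_block(pt, ct, len, key, iv,
	    (block128_f)AES_encrypt);
}
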
32size_t CRYPTO_cts128_encrypt_block(const unsigned char *in, unsigned char *out,
33 size_t len, const void *key,
34 unsigned char ivec[16], block128_f block)
35{ size_t residue, n;
36
37 assert (in && out && key && ivec);
38
39 if (len <= 16) return 0;
40
41 if ((residue=len%16) == 0) residue = 16;
42
43 len -= residue;
44
45 CRYPTO_cbc128_encrypt(in,out,len,key,ivec,block);
46
47 in += len;
48 out += len;
49
50 for (n=0; n<residue; ++n)
51 ivec[n] ^= in[n];
52 (*block)(ivec,ivec,key);
53 memcpy(out,out-16,residue);
54 memcpy(out-16,ivec,16);
55
56 return len+residue;
57}
58
59size_t CRYPTO_nistcts128_encrypt_block(const unsigned char *in, unsigned char *out,
60 size_t len, const void *key,
61 unsigned char ivec[16], block128_f block)
62{ size_t residue, n;
63
64 assert (in && out && key && ivec);
65
66 if (len < 16) return 0;
67
68 residue=len%16;
69
70 len -= residue;
71
72 CRYPTO_cbc128_encrypt(in,out,len,key,ivec,block);
73
74 if (residue==0) return len;
75
76 in += len;
77 out += len;
78
79 for (n=0; n<residue; ++n)
80 ivec[n] ^= in[n];
81 (*block)(ivec,ivec,key);
82 memcpy(out-16+residue,ivec,16);
83
84 return len+residue;
85}
86
87size_t CRYPTO_cts128_encrypt(const unsigned char *in, unsigned char *out,
88 size_t len, const void *key,
89 unsigned char ivec[16], cbc128_f cbc)
90{ size_t residue;
91 union { size_t align; unsigned char c[16]; } tmp;
92
93 assert (in && out && key && ivec);
94
95 if (len <= 16) return 0;
96
97 if ((residue=len%16) == 0) residue = 16;
98
99 len -= residue;
100
101 (*cbc)(in,out,len,key,ivec,1);
102
103 in += len;
104 out += len;
105
106#if defined(CBC_HANDLES_TRUNCATED_IO)
107 memcpy(tmp.c,out-16,16);
108 (*cbc)(in,out-16,residue,key,ivec,1);
109 memcpy(out,tmp.c,residue);
110#else
111 memset(tmp.c,0,sizeof(tmp));
112 memcpy(tmp.c,in,residue);
113 memcpy(out,out-16,residue);
114 (*cbc)(tmp.c,out-16,16,key,ivec,1);
115#endif
116 return len+residue;
117}
118
119size_t CRYPTO_nistcts128_encrypt(const unsigned char *in, unsigned char *out,
120 size_t len, const void *key,
121 unsigned char ivec[16], cbc128_f cbc)
122{ size_t residue;
123 union { size_t align; unsigned char c[16]; } tmp;
124
125 assert (in && out && key && ivec);
126
127 if (len < 16) return 0;
128
129 residue=len%16;
130
131 len -= residue;
132
133 (*cbc)(in,out,len,key,ivec,1);
134
135 if (residue==0) return len;
136
137 in += len;
138 out += len;
139
140#if defined(CBC_HANDLES_TRUNCATED_IO)
141 (*cbc)(in,out-16+residue,residue,key,ivec,1);
142#else
143 memset(tmp.c,0,sizeof(tmp));
144 memcpy(tmp.c,in,residue);
145 (*cbc)(tmp.c,out-16+residue,16,key,ivec,1);
146#endif
147 return len+residue;
148}
149
150size_t CRYPTO_cts128_decrypt_block(const unsigned char *in, unsigned char *out,
151 size_t len, const void *key,
152 unsigned char ivec[16], block128_f block)
153{ size_t residue, n;
154 union { size_t align; unsigned char c[32]; } tmp;
155
156 assert (in && out && key && ivec);
157
158 if (len<=16) return 0;
159
160 if ((residue=len%16) == 0) residue = 16;
161
162 len -= 16+residue;
163
164 if (len) {
165 CRYPTO_cbc128_decrypt(in,out,len,key,ivec,block);
166 in += len;
167 out += len;
168 }
169
170 (*block)(in,tmp.c+16,key);
171
172 memcpy(tmp.c,tmp.c+16,16);
173 memcpy(tmp.c,in+16,residue);
174 (*block)(tmp.c,tmp.c,key);
175
176 for(n=0; n<16; ++n) {
177 unsigned char c = in[n];
178 out[n] = tmp.c[n] ^ ivec[n];
179 ivec[n] = c;
180 }
181 for(residue+=16; n<residue; ++n)
182 out[n] = tmp.c[n] ^ in[n];
183
184 return 16+len+residue;
185}
186
187size_t CRYPTO_nistcts128_decrypt_block(const unsigned char *in, unsigned char *out,
188 size_t len, const void *key,
189 unsigned char ivec[16], block128_f block)
190{ size_t residue, n;
191 union { size_t align; unsigned char c[32]; } tmp;
192
193 assert (in && out && key && ivec);
194
195 if (len<16) return 0;
196
197 residue=len%16;
198
199 if (residue==0) {
200 CRYPTO_cbc128_decrypt(in,out,len,key,ivec,block);
201 return len;
202 }
203
204 len -= 16+residue;
205
206 if (len) {
207 CRYPTO_cbc128_decrypt(in,out,len,key,ivec,block);
208 in += len;
209 out += len;
210 }
211
212 (*block)(in+residue,tmp.c+16,key);
213
214 memcpy(tmp.c,tmp.c+16,16);
215 memcpy(tmp.c,in,residue);
216 (*block)(tmp.c,tmp.c,key);
217
218 for(n=0; n<16; ++n) {
219 unsigned char c = in[n];
220 out[n] = tmp.c[n] ^ ivec[n];
221 ivec[n] = in[n+residue];
222 tmp.c[n] = c;
223 }
224 for(residue+=16; n<residue; ++n)
225 out[n] = tmp.c[n] ^ tmp.c[n-16];
226
227 return 16+len+residue;
228}
229
230size_t CRYPTO_cts128_decrypt(const unsigned char *in, unsigned char *out,
231 size_t len, const void *key,
232 unsigned char ivec[16], cbc128_f cbc)
233{ size_t residue;
234 union { size_t align; unsigned char c[32]; } tmp;
235
236 assert (in && out && key && ivec);
237
238 if (len<=16) return 0;
239
240 if ((residue=len%16) == 0) residue = 16;
241
242 len -= 16+residue;
243
244 if (len) {
245 (*cbc)(in,out,len,key,ivec,0);
246 in += len;
247 out += len;
248 }
249
250 memset(tmp.c,0,sizeof(tmp));
251 /* this places in[16] at &tmp.c[16] and decrypted block at &tmp.c[0] */
252 (*cbc)(in,tmp.c,16,key,tmp.c+16,0);
253
254 memcpy(tmp.c,in+16,residue);
255#if defined(CBC_HANDLES_TRUNCATED_IO)
256 (*cbc)(tmp.c,out,16+residue,key,ivec,0);
257#else
258 (*cbc)(tmp.c,tmp.c,32,key,ivec,0);
259 memcpy(out,tmp.c,16+residue);
260#endif
261 return 16+len+residue;
262}
263
264size_t CRYPTO_nistcts128_decrypt(const unsigned char *in, unsigned char *out,
265 size_t len, const void *key,
266 unsigned char ivec[16], cbc128_f cbc)
267{ size_t residue;
268 union { size_t align; unsigned char c[32]; } tmp;
269
270 assert (in && out && key && ivec);
271
272 if (len<16) return 0;
273
274 residue=len%16;
275
276 if (residue==0) {
277 (*cbc)(in,out,len,key,ivec,0);
278 return len;
279 }
280
281 len -= 16+residue;
282
283 if (len) {
284 (*cbc)(in,out,len,key,ivec,0);
285 in += len;
286 out += len;
287 }
288
289 memset(tmp.c,0,sizeof(tmp));
290 /* this places in[16] at &tmp.c[16] and decrypted block at &tmp.c[0] */
291 (*cbc)(in+residue,tmp.c,16,key,tmp.c+16,0);
292
293 memcpy(tmp.c,in,residue);
294#if defined(CBC_HANDLES_TRUNCATED_IO)
295 (*cbc)(tmp.c,out,16+residue,key,ivec,0);
296#else
297 (*cbc)(tmp.c,tmp.c,32,key,ivec,0);
298 memcpy(out,tmp.c,16+residue);
299#endif
300 return 16+len+residue;
301}
302
303#if defined(SELFTEST)
304#include <stdio.h>
305#include <openssl/aes.h>
306
307/* test vectors from RFC 3962 */
308static const unsigned char test_key[16] = "chicken teriyaki";
309static const unsigned char test_input[64] =
310 "I would like the" " General Gau's C"
311 "hicken, please, " "and wonton soup.";
312static const unsigned char test_iv[16] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
313
314static const unsigned char vector_17[17] =
315{0xc6,0x35,0x35,0x68,0xf2,0xbf,0x8c,0xb4, 0xd8,0xa5,0x80,0x36,0x2d,0xa7,0xff,0x7f,
316 0x97};
317static const unsigned char vector_31[31] =
318{0xfc,0x00,0x78,0x3e,0x0e,0xfd,0xb2,0xc1, 0xd4,0x45,0xd4,0xc8,0xef,0xf7,0xed,0x22,
319 0x97,0x68,0x72,0x68,0xd6,0xec,0xcc,0xc0, 0xc0,0x7b,0x25,0xe2,0x5e,0xcf,0xe5};
320static const unsigned char vector_32[32] =
321{0x39,0x31,0x25,0x23,0xa7,0x86,0x62,0xd5, 0xbe,0x7f,0xcb,0xcc,0x98,0xeb,0xf5,0xa8,
322 0x97,0x68,0x72,0x68,0xd6,0xec,0xcc,0xc0, 0xc0,0x7b,0x25,0xe2,0x5e,0xcf,0xe5,0x84};
323static const unsigned char vector_47[47] =
324{0x97,0x68,0x72,0x68,0xd6,0xec,0xcc,0xc0, 0xc0,0x7b,0x25,0xe2,0x5e,0xcf,0xe5,0x84,
325 0xb3,0xff,0xfd,0x94,0x0c,0x16,0xa1,0x8c, 0x1b,0x55,0x49,0xd2,0xf8,0x38,0x02,0x9e,
326 0x39,0x31,0x25,0x23,0xa7,0x86,0x62,0xd5, 0xbe,0x7f,0xcb,0xcc,0x98,0xeb,0xf5};
327static const unsigned char vector_48[48] =
328{0x97,0x68,0x72,0x68,0xd6,0xec,0xcc,0xc0, 0xc0,0x7b,0x25,0xe2,0x5e,0xcf,0xe5,0x84,
329 0x9d,0xad,0x8b,0xbb,0x96,0xc4,0xcd,0xc0, 0x3b,0xc1,0x03,0xe1,0xa1,0x94,0xbb,0xd8,
330 0x39,0x31,0x25,0x23,0xa7,0x86,0x62,0xd5, 0xbe,0x7f,0xcb,0xcc,0x98,0xeb,0xf5,0xa8};
331static const unsigned char vector_64[64] =
332{0x97,0x68,0x72,0x68,0xd6,0xec,0xcc,0xc0, 0xc0,0x7b,0x25,0xe2,0x5e,0xcf,0xe5,0x84,
333 0x39,0x31,0x25,0x23,0xa7,0x86,0x62,0xd5, 0xbe,0x7f,0xcb,0xcc,0x98,0xeb,0xf5,0xa8,
334 0x48,0x07,0xef,0xe8,0x36,0xee,0x89,0xa5, 0x26,0x73,0x0d,0xbc,0x2f,0x7b,0xc8,0x40,
335 0x9d,0xad,0x8b,0xbb,0x96,0xc4,0xcd,0xc0, 0x3b,0xc1,0x03,0xe1,0xa1,0x94,0xbb,0xd8};
336
337static AES_KEY encks, decks;
338
339void test_vector(const unsigned char *vector,size_t len)
340{ unsigned char iv[sizeof(test_iv)];
341 unsigned char cleartext[64],ciphertext[64];
342 size_t tail;
343
344 printf("vector_%d\n",len); fflush(stdout);
345
346 if ((tail=len%16) == 0) tail = 16;
347 tail += 16;
348
349 /* test block-based encryption */
350 memcpy(iv,test_iv,sizeof(test_iv));
351 CRYPTO_cts128_encrypt_block(test_input,ciphertext,len,&encks,iv,(block128_f)AES_encrypt);
352 if (memcmp(ciphertext,vector,len))
353 fprintf(stderr,"output_%d mismatch\n",len), exit(1);
354 if (memcmp(iv,vector+len-tail,sizeof(iv)))
355 fprintf(stderr,"iv_%d mismatch\n",len), exit(1);
356
357 /* test block-based decryption */
358 memcpy(iv,test_iv,sizeof(test_iv));
359 CRYPTO_cts128_decrypt_block(ciphertext,cleartext,len,&decks,iv,(block128_f)AES_decrypt);
360 if (memcmp(cleartext,test_input,len))
361 fprintf(stderr,"input_%d mismatch\n",len), exit(2);
362 if (memcmp(iv,vector+len-tail,sizeof(iv)))
363 fprintf(stderr,"iv_%d mismatch\n",len), exit(2);
364
365 /* test streamed encryption */
366 memcpy(iv,test_iv,sizeof(test_iv));
367 CRYPTO_cts128_encrypt(test_input,ciphertext,len,&encks,iv,(cbc128_f)AES_cbc_encrypt);
368 if (memcmp(ciphertext,vector,len))
369 fprintf(stderr,"output_%d mismatch\n",len), exit(3);
370 if (memcmp(iv,vector+len-tail,sizeof(iv)))
371 fprintf(stderr,"iv_%d mismatch\n",len), exit(3);
372
373 /* test streamed decryption */
374 memcpy(iv,test_iv,sizeof(test_iv));
375 CRYPTO_cts128_decrypt(ciphertext,cleartext,len,&decks,iv,(cbc128_f)AES_cbc_encrypt);
376 if (memcmp(cleartext,test_input,len))
377 fprintf(stderr,"input_%d mismatch\n",len), exit(4);
378 if (memcmp(iv,vector+len-tail,sizeof(iv)))
379 fprintf(stderr,"iv_%d mismatch\n",len), exit(4);
380}
381
382void test_nistvector(const unsigned char *vector,size_t len)
383{ unsigned char iv[sizeof(test_iv)];
384 unsigned char cleartext[64],ciphertext[64],nistvector[64];
385 size_t tail;
386
387 printf("nistvector_%d\n",len); fflush(stdout);
388
389 if ((tail=len%16) == 0) tail = 16;
390
391 len -= 16 + tail;
392 memcpy(nistvector,vector,len);
393 /* flip two last blocks */
394 memcpy(nistvector+len,vector+len+16,tail);
395 memcpy(nistvector+len+tail,vector+len,16);
396 len += 16 + tail;
397 tail = 16;
398
399 /* test block-based encryption */
400 memcpy(iv,test_iv,sizeof(test_iv));
401 CRYPTO_nistcts128_encrypt_block(test_input,ciphertext,len,&encks,iv,(block128_f)AES_encrypt);
402 if (memcmp(ciphertext,nistvector,len))
403 fprintf(stderr,"output_%d mismatch\n",len), exit(1);
404 if (memcmp(iv,nistvector+len-tail,sizeof(iv)))
405 fprintf(stderr,"iv_%d mismatch\n",len), exit(1);
406
407 /* test block-based decryption */
408 memcpy(iv,test_iv,sizeof(test_iv));
409 CRYPTO_nistcts128_decrypt_block(ciphertext,cleartext,len,&decks,iv,(block128_f)AES_decrypt);
410 if (memcmp(cleartext,test_input,len))
411 fprintf(stderr,"input_%d mismatch\n",len), exit(2);
412 if (memcmp(iv,nistvector+len-tail,sizeof(iv)))
413 fprintf(stderr,"iv_%d mismatch\n",len), exit(2);
414
415 /* test streamed encryption */
416 memcpy(iv,test_iv,sizeof(test_iv));
417 CRYPTO_nistcts128_encrypt(test_input,ciphertext,len,&encks,iv,(cbc128_f)AES_cbc_encrypt);
418 if (memcmp(ciphertext,nistvector,len))
419 fprintf(stderr,"output_%d mismatch\n",len), exit(3);
420 if (memcmp(iv,nistvector+len-tail,sizeof(iv)))
421 fprintf(stderr,"iv_%d mismatch\n",len), exit(3);
422
423 /* test streamed decryption */
424 memcpy(iv,test_iv,sizeof(test_iv));
425 CRYPTO_nistcts128_decrypt(ciphertext,cleartext,len,&decks,iv,(cbc128_f)AES_cbc_encrypt);
426 if (memcmp(cleartext,test_input,len))
427 fprintf(stderr,"input_%d mismatch\n",len), exit(4);
428 if (memcmp(iv,nistvector+len-tail,sizeof(iv)))
429 fprintf(stderr,"iv_%d mismatch\n",len), exit(4);
430}
431
432int main()
433{
434 AES_set_encrypt_key(test_key,128,&encks);
435 AES_set_decrypt_key(test_key,128,&decks);
436
437 test_vector(vector_17,sizeof(vector_17));
438 test_vector(vector_31,sizeof(vector_31));
439 test_vector(vector_32,sizeof(vector_32));
440 test_vector(vector_47,sizeof(vector_47));
441 test_vector(vector_48,sizeof(vector_48));
442 test_vector(vector_64,sizeof(vector_64));
443
444 test_nistvector(vector_17,sizeof(vector_17));
445 test_nistvector(vector_31,sizeof(vector_31));
446 test_nistvector(vector_32,sizeof(vector_32));
447 test_nistvector(vector_47,sizeof(vector_47));
448 test_nistvector(vector_48,sizeof(vector_48));
449 test_nistvector(vector_64,sizeof(vector_64));
450
451 return 0;
452}
453#endif
diff --git a/src/lib/libcrypto/modes/gcm128.c b/src/lib/libcrypto/modes/gcm128.c
deleted file mode 100644
index e1dc2b0f47..0000000000
--- a/src/lib/libcrypto/modes/gcm128.c
+++ /dev/null
@@ -1,1905 +0,0 @@
1/* ====================================================================
2 * Copyright (c) 2010 The OpenSSL Project. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in
13 * the documentation and/or other materials provided with the
14 * distribution.
15 *
16 * 3. All advertising materials mentioning features or use of this
17 * software must display the following acknowledgment:
18 * "This product includes software developed by the OpenSSL Project
19 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
20 *
21 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
22 * endorse or promote products derived from this software without
23 * prior written permission. For written permission, please contact
24 * openssl-core@openssl.org.
25 *
26 * 5. Products derived from this software may not be called "OpenSSL"
27 * nor may "OpenSSL" appear in their names without prior written
28 * permission of the OpenSSL Project.
29 *
30 * 6. Redistributions of any form whatsoever must retain the following
31 * acknowledgment:
32 * "This product includes software developed by the OpenSSL Project
33 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
34 *
35 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
36 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
39 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
41 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
42 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
43 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
44 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
45 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
46 * OF THE POSSIBILITY OF SUCH DAMAGE.
47 * ====================================================================
48 */
49
50#define OPENSSL_FIPSAPI
51
52#include <openssl/crypto.h>
53#include "modes_lcl.h"
54#include <string.h>
55
56#ifndef MODES_DEBUG
57# ifndef NDEBUG
58# define NDEBUG
59# endif
60#endif
61#include <assert.h>
62
63#if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
64/* redefine, because alignment is ensured */
65#undef GETU32
66#define GETU32(p) BSWAP4(*(const u32 *)(p))
67#undef PUTU32
68#define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v)
69#endif
70
71#define PACK(s) ((size_t)(s)<<(sizeof(size_t)*8-16))
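/*
 * PACK() positions a 16-bit constant in the top 16 bits of a size_t so that,
 * after the shifts applied in the multiplication routines below, it lands in
 * the top bits of Z.hi on both 32- and 64-bit builds. REDUCE1BIT() shifts the
 * 128-bit value V right by one bit and, when the bit shifted out was set,
 * folds the GCM reduction constant 0xe1 back into the top byte.
 */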
72#define REDUCE1BIT(V) do { \
73 if (sizeof(size_t)==8) { \
74 u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
75 V.lo = (V.hi<<63)|(V.lo>>1); \
76 V.hi = (V.hi>>1 )^T; \
77 } \
78 else { \
79 u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
80 V.lo = (V.hi<<63)|(V.lo>>1); \
81 V.hi = (V.hi>>1 )^((u64)T<<32); \
82 } \
83} while(0)
84
85/*
86 * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
87 * never be set to 8. 8 is effectively reserved for testing purposes.
88 * TABLE_BITS>1 are lookup-table-driven implementations referred to as
89 * "Shoup's" in GCM specification. In other words OpenSSL does not cover
90 * the whole spectrum of possible table-driven implementations. Why?
91 * In the non-"Shoup's" case the memory access pattern is segmented in
92 * such a manner that cache-timing information can reveal a fair
93 * portion of the intermediate hash value. Given that the ciphertext
94 * is always available to an attacker, it is possible to attempt to
95 * deduce the secret parameter H and, if successful, to tamper with
96 * messages [which is trivial in CTR mode]. In "Shoup's" case it is
97 * not as trivial, but there is no reason to believe that it is
98 * resistant to a cache-timing attack. The catch with the "8-bit"
99 * implementation is that it consumes 16 (sixteen) times more memory,
100 * 4KB per individual key + 1KB shared. On the pro side it should be
101 * about twice as fast as the "4-bit" version, and for gcc-generated
102 * x86[_64] code the "8-bit" version was observed to run ~75% faster,
103 * closer to 100% with commercial compilers... Yet the "4-bit" procedure
104 * is preferred, because it is believed to provide a better security-
105 * performance balance and adequate all-round performance. "All-round" refers to things like:
106 *
107 * - shorter setup time effectively improves overall timing for
108 * handling short messages;
109 * - larger table allocation can become unbearable because of VM
110 *   subsystem penalties (for example on Windows a large enough free()
111 *   results in VM working-set trimming, meaning that a subsequent
112 *   malloc() would immediately incur working-set expansion);
113 * - larger table has larger cache footprint, which can affect
114 * performance of other code paths (not necessarily even from same
115 * thread in Hyper-Threading world);
116 *
117 * A value of 1 is not appropriate for performance reasons.
118 */
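/*
 * For a rough sense of the sizes quoted above (u128 being a 16-byte hi/lo
 * pair): the "4-bit" Htable holds 2^4 = 16 u128 entries, i.e. 256 bytes per
 * key, while the "8-bit" Htable holds 2^8 = 256 entries, i.e. 4KB per key,
 * in addition to the shared rem_4bit/rem_8bit reduction tables.
 */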
119#if TABLE_BITS==8
120
121static void gcm_init_8bit(u128 Htable[256], u64 H[2])
122{
123 int i, j;
124 u128 V;
125
126 Htable[0].hi = 0;
127 Htable[0].lo = 0;
128 V.hi = H[0];
129 V.lo = H[1];
130
131 for (Htable[128]=V, i=64; i>0; i>>=1) {
132 REDUCE1BIT(V);
133 Htable[i] = V;
134 }
135
136 for (i=2; i<256; i<<=1) {
137 u128 *Hi = Htable+i, H0 = *Hi;
138 for (j=1; j<i; ++j) {
139 Hi[j].hi = H0.hi^Htable[j].hi;
140 Hi[j].lo = H0.lo^Htable[j].lo;
141 }
142 }
143}
144
145static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
146{
147 u128 Z = { 0, 0};
148 const u8 *xi = (const u8 *)Xi+15;
149 size_t rem, n = *xi;
150 const union { long one; char little; } is_endian = {1};
151 static const size_t rem_8bit[256] = {
152 PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
153 PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
154 PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
155 PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
156 PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
157 PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
158 PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
159 PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
160 PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
161 PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
162 PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
163 PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
164 PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
165 PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
166 PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
167 PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
168 PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
169 PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
170 PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
171 PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
172 PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
173 PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
174 PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
175 PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
176 PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
177 PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
178 PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
179 PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
180 PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
181 PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
182 PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
183 PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
184 PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
185 PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
186 PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
187 PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
188 PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
189 PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
190 PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
191 PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
192 PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
193 PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
194 PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
195 PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
196 PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
197 PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
198 PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
199 PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
200 PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
201 PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
202 PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
203 PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
204 PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
205 PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
206 PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
207 PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
208 PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
209 PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
210 PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
211 PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
212 PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
213 PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
214 PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
215 PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
216
217 while (1) {
218 Z.hi ^= Htable[n].hi;
219 Z.lo ^= Htable[n].lo;
220
221 if ((u8 *)Xi==xi) break;
222
223 n = *(--xi);
224
225 rem = (size_t)Z.lo&0xff;
226 Z.lo = (Z.hi<<56)|(Z.lo>>8);
227 Z.hi = (Z.hi>>8);
228 if (sizeof(size_t)==8)
229 Z.hi ^= rem_8bit[rem];
230 else
231 Z.hi ^= (u64)rem_8bit[rem]<<32;
232 }
233
234 if (is_endian.little) {
235#ifdef BSWAP8
236 Xi[0] = BSWAP8(Z.hi);
237 Xi[1] = BSWAP8(Z.lo);
238#else
239 u8 *p = (u8 *)Xi;
240 u32 v;
241 v = (u32)(Z.hi>>32); PUTU32(p,v);
242 v = (u32)(Z.hi); PUTU32(p+4,v);
243 v = (u32)(Z.lo>>32); PUTU32(p+8,v);
244 v = (u32)(Z.lo); PUTU32(p+12,v);
245#endif
246 }
247 else {
248 Xi[0] = Z.hi;
249 Xi[1] = Z.lo;
250 }
251}
252#define GCM_MUL(ctx,Xi) gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
253
254#elif TABLE_BITS==4
255
256static void gcm_init_4bit(u128 Htable[16], u64 H[2])
257{
258 u128 V;
259#if defined(OPENSSL_SMALL_FOOTPRINT)
260 int i;
261#endif
262
263 Htable[0].hi = 0;
264 Htable[0].lo = 0;
265 V.hi = H[0];
266 V.lo = H[1];
267
268#if defined(OPENSSL_SMALL_FOOTPRINT)
269 for (Htable[8]=V, i=4; i>0; i>>=1) {
270 REDUCE1BIT(V);
271 Htable[i] = V;
272 }
273
274 for (i=2; i<16; i<<=1) {
275 u128 *Hi = Htable+i;
276 int j;
277 for (V=*Hi, j=1; j<i; ++j) {
278 Hi[j].hi = V.hi^Htable[j].hi;
279 Hi[j].lo = V.lo^Htable[j].lo;
280 }
281 }
282#else
283 Htable[8] = V;
284 REDUCE1BIT(V);
285 Htable[4] = V;
286 REDUCE1BIT(V);
287 Htable[2] = V;
288 REDUCE1BIT(V);
289 Htable[1] = V;
290 Htable[3].hi = V.hi^Htable[2].hi, Htable[3].lo = V.lo^Htable[2].lo;
291 V=Htable[4];
292 Htable[5].hi = V.hi^Htable[1].hi, Htable[5].lo = V.lo^Htable[1].lo;
293 Htable[6].hi = V.hi^Htable[2].hi, Htable[6].lo = V.lo^Htable[2].lo;
294 Htable[7].hi = V.hi^Htable[3].hi, Htable[7].lo = V.lo^Htable[3].lo;
295 V=Htable[8];
296 Htable[9].hi = V.hi^Htable[1].hi, Htable[9].lo = V.lo^Htable[1].lo;
297 Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
298 Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
299 Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
300 Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
301 Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
302 Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
303#endif
304#if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
305 /*
306 * ARM assembler expects specific dword order in Htable.
307 */
308 {
309 int j;
310 const union { long one; char little; } is_endian = {1};
311
312 if (is_endian.little)
313 for (j=0;j<16;++j) {
314 V = Htable[j];
315 Htable[j].hi = V.lo;
316 Htable[j].lo = V.hi;
317 }
318 else
319 for (j=0;j<16;++j) {
320 V = Htable[j];
321 Htable[j].hi = V.lo<<32|V.lo>>32;
322 Htable[j].lo = V.hi<<32|V.hi>>32;
323 }
324 }
325#endif
326}
327
328#ifndef GHASH_ASM
329static const size_t rem_4bit[16] = {
330 PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
331 PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
332 PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
333 PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
334
335static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
336{
337 u128 Z;
338 int cnt = 15;
339 size_t rem, nlo, nhi;
340 const union { long one; char little; } is_endian = {1};
341
342 nlo = ((const u8 *)Xi)[15];
343 nhi = nlo>>4;
344 nlo &= 0xf;
345
346 Z.hi = Htable[nlo].hi;
347 Z.lo = Htable[nlo].lo;
348
349 while (1) {
350 rem = (size_t)Z.lo&0xf;
351 Z.lo = (Z.hi<<60)|(Z.lo>>4);
352 Z.hi = (Z.hi>>4);
353 if (sizeof(size_t)==8)
354 Z.hi ^= rem_4bit[rem];
355 else
356 Z.hi ^= (u64)rem_4bit[rem]<<32;
357
358 Z.hi ^= Htable[nhi].hi;
359 Z.lo ^= Htable[nhi].lo;
360
361 if (--cnt<0) break;
362
363 nlo = ((const u8 *)Xi)[cnt];
364 nhi = nlo>>4;
365 nlo &= 0xf;
366
367 rem = (size_t)Z.lo&0xf;
368 Z.lo = (Z.hi<<60)|(Z.lo>>4);
369 Z.hi = (Z.hi>>4);
370 if (sizeof(size_t)==8)
371 Z.hi ^= rem_4bit[rem];
372 else
373 Z.hi ^= (u64)rem_4bit[rem]<<32;
374
375 Z.hi ^= Htable[nlo].hi;
376 Z.lo ^= Htable[nlo].lo;
377 }
378
379 if (is_endian.little) {
380#ifdef BSWAP8
381 Xi[0] = BSWAP8(Z.hi);
382 Xi[1] = BSWAP8(Z.lo);
383#else
384 u8 *p = (u8 *)Xi;
385 u32 v;
386 v = (u32)(Z.hi>>32); PUTU32(p,v);
387 v = (u32)(Z.hi); PUTU32(p+4,v);
388 v = (u32)(Z.lo>>32); PUTU32(p+8,v);
389 v = (u32)(Z.lo); PUTU32(p+12,v);
390#endif
391 }
392 else {
393 Xi[0] = Z.hi;
394 Xi[1] = Z.lo;
395 }
396}
397
398#if !defined(OPENSSL_SMALL_FOOTPRINT)
399/*
400 * Streamed variant of gcm_gmult_4bit; see CRYPTO_gcm128_[en|de]crypt for
401 * details... Compiler-generated code doesn't seem to give any
402 * performance improvement, at least not on x86[_64]. It's here
403 * mostly as reference and a placeholder for possible future
404 * non-trivial optimization[s]...
405 */
406static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
407 const u8 *inp,size_t len)
408{
409 u128 Z;
410 int cnt;
411 size_t rem, nlo, nhi;
412 const union { long one; char little; } is_endian = {1};
413
414#if 1
415 do {
416 cnt = 15;
417 nlo = ((const u8 *)Xi)[15];
418 nlo ^= inp[15];
419 nhi = nlo>>4;
420 nlo &= 0xf;
421
422 Z.hi = Htable[nlo].hi;
423 Z.lo = Htable[nlo].lo;
424
425 while (1) {
426 rem = (size_t)Z.lo&0xf;
427 Z.lo = (Z.hi<<60)|(Z.lo>>4);
428 Z.hi = (Z.hi>>4);
429 if (sizeof(size_t)==8)
430 Z.hi ^= rem_4bit[rem];
431 else
432 Z.hi ^= (u64)rem_4bit[rem]<<32;
433
434 Z.hi ^= Htable[nhi].hi;
435 Z.lo ^= Htable[nhi].lo;
436
437 if (--cnt<0) break;
438
439 nlo = ((const u8 *)Xi)[cnt];
440 nlo ^= inp[cnt];
441 nhi = nlo>>4;
442 nlo &= 0xf;
443
444 rem = (size_t)Z.lo&0xf;
445 Z.lo = (Z.hi<<60)|(Z.lo>>4);
446 Z.hi = (Z.hi>>4);
447 if (sizeof(size_t)==8)
448 Z.hi ^= rem_4bit[rem];
449 else
450 Z.hi ^= (u64)rem_4bit[rem]<<32;
451
452 Z.hi ^= Htable[nlo].hi;
453 Z.lo ^= Htable[nlo].lo;
454 }
455#else
456 /*
457 * Extra 256+16 bytes per-key plus 512 bytes shared tables
458 * [should] give ~50% improvement... One could have PACK()-ed
459 * the rem_8bit even here, but the priority is to minimize
460 * cache footprint...
461 */
462 u128 Hshr4[16]; /* Htable shifted right by 4 bits */
463 u8 Hshl4[16]; /* Htable shifted left by 4 bits */
464 static const unsigned short rem_8bit[256] = {
465 0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
466 0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
467 0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
468 0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
469 0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
470 0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
471 0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
472 0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
473 0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
474 0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
475 0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
476 0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
477 0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
478 0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
479 0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
480 0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
481 0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
482 0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
483 0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
484 0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
485 0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
486 0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
487 0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
488 0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
489 0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
490 0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
491 0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
492 0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
493 0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
494 0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
495 0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
496 0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
497 /*
498	 * This pre-processing phase slows the procedure down by approximately
499	 * the same amount of time as it saves in each loop iteration. In other
500	 * words, single-block performance is approximately the same as with the
501	 * straightforward "4-bit" implementation, and from there it only gets faster...
502 */
503 for (cnt=0; cnt<16; ++cnt) {
504 Z.hi = Htable[cnt].hi;
505 Z.lo = Htable[cnt].lo;
506 Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
507 Hshr4[cnt].hi = (Z.hi>>4);
508 Hshl4[cnt] = (u8)(Z.lo<<4);
509 }
510
511 do {
512 for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
513 nlo = ((const u8 *)Xi)[cnt];
514 nlo ^= inp[cnt];
515 nhi = nlo>>4;
516 nlo &= 0xf;
517
518 Z.hi ^= Htable[nlo].hi;
519 Z.lo ^= Htable[nlo].lo;
520
521 rem = (size_t)Z.lo&0xff;
522
523 Z.lo = (Z.hi<<56)|(Z.lo>>8);
524 Z.hi = (Z.hi>>8);
525
526 Z.hi ^= Hshr4[nhi].hi;
527 Z.lo ^= Hshr4[nhi].lo;
528 Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
529 }
530
531 nlo = ((const u8 *)Xi)[0];
532 nlo ^= inp[0];
533 nhi = nlo>>4;
534 nlo &= 0xf;
535
536 Z.hi ^= Htable[nlo].hi;
537 Z.lo ^= Htable[nlo].lo;
538
539 rem = (size_t)Z.lo&0xf;
540
541 Z.lo = (Z.hi<<60)|(Z.lo>>4);
542 Z.hi = (Z.hi>>4);
543
544 Z.hi ^= Htable[nhi].hi;
545 Z.lo ^= Htable[nhi].lo;
546 Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
547#endif
548
549 if (is_endian.little) {
550#ifdef BSWAP8
551 Xi[0] = BSWAP8(Z.hi);
552 Xi[1] = BSWAP8(Z.lo);
553#else
554 u8 *p = (u8 *)Xi;
555 u32 v;
556 v = (u32)(Z.hi>>32); PUTU32(p,v);
557 v = (u32)(Z.hi); PUTU32(p+4,v);
558 v = (u32)(Z.lo>>32); PUTU32(p+8,v);
559 v = (u32)(Z.lo); PUTU32(p+12,v);
560#endif
561 }
562 else {
563 Xi[0] = Z.hi;
564 Xi[1] = Z.lo;
565 }
566 } while (inp+=16, len-=16);
567}
568#endif
569#else
570void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
571void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
572#endif
573
574#define GCM_MUL(ctx,Xi) gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
575#if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
576#define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
577/* GHASH_CHUNK is a "stride parameter" whose purpose is to mitigate the
578 * cache-thrashing effect. In other words, the idea is to hash the data
579 * while it is still in the L1 cache after the encryption pass... */
580#define GHASH_CHUNK (3*1024)
581#endif
582
583#else /* TABLE_BITS */
584
585static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
586{
587 u128 V,Z = { 0,0 };
588 long X;
589 int i,j;
590 const long *xi = (const long *)Xi;
591 const union { long one; char little; } is_endian = {1};
592
593 V.hi = H[0]; /* H is in host byte order, no byte swapping */
594 V.lo = H[1];
595
596 for (j=0; j<16/sizeof(long); ++j) {
597 if (is_endian.little) {
598 if (sizeof(long)==8) {
599#ifdef BSWAP8
600 X = (long)(BSWAP8(xi[j]));
601#else
602 const u8 *p = (const u8 *)(xi+j);
603 X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
604#endif
605 }
606 else {
607 const u8 *p = (const u8 *)(xi+j);
608 X = (long)GETU32(p);
609 }
610 }
611 else
612 X = xi[j];
613
614 for (i=0; i<8*sizeof(long); ++i, X<<=1) {
615 u64 M = (u64)(X>>(8*sizeof(long)-1));
616 Z.hi ^= V.hi&M;
617 Z.lo ^= V.lo&M;
618
619 REDUCE1BIT(V);
620 }
621 }
622
623 if (is_endian.little) {
624#ifdef BSWAP8
625 Xi[0] = BSWAP8(Z.hi);
626 Xi[1] = BSWAP8(Z.lo);
627#else
628 u8 *p = (u8 *)Xi;
629 u32 v;
630 v = (u32)(Z.hi>>32); PUTU32(p,v);
631 v = (u32)(Z.hi); PUTU32(p+4,v);
632 v = (u32)(Z.lo>>32); PUTU32(p+8,v);
633 v = (u32)(Z.lo); PUTU32(p+12,v);
634#endif
635 }
636 else {
637 Xi[0] = Z.hi;
638 Xi[1] = Z.lo;
639 }
640}
641#define GCM_MUL(ctx,Xi) gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
642
643#endif
644
645#if TABLE_BITS==4 && defined(GHASH_ASM)
646# if !defined(I386_ONLY) && \
647 (defined(__i386) || defined(__i386__) || \
648 defined(__x86_64) || defined(__x86_64__) || \
649 defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
650# define GHASH_ASM_X86_OR_64
651# define GCM_FUNCREF_4BIT
652extern unsigned int OPENSSL_ia32cap_P[2];
653
654void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
655void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
656void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
657
658# if defined(__i386) || defined(__i386__) || defined(_M_IX86)
659# define GHASH_ASM_X86
660void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
661void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
662
663void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
664void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
665# endif
666# elif defined(__arm__) || defined(__arm)
667# include "arm_arch.h"
668# if __ARM_ARCH__>=7
669# define GHASH_ASM_ARM
670# define GCM_FUNCREF_4BIT
671void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
672void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
673# endif
674# endif
675#endif
676
677#ifdef GCM_FUNCREF_4BIT
678# undef GCM_MUL
679# define GCM_MUL(ctx,Xi) (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
680# ifdef GHASH
681# undef GHASH
682# define GHASH(ctx,in,len) (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
683# endif
684#endif
685
686void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
687{
688 const union { long one; char little; } is_endian = {1};
689
690 memset(ctx,0,sizeof(*ctx));
691 ctx->block = block;
692 ctx->key = key;
693
694 (*block)(ctx->H.c,ctx->H.c,key);
695
696 if (is_endian.little) {
697 /* H is stored in host byte order */
698#ifdef BSWAP8
699 ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
700 ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
701#else
702 u8 *p = ctx->H.c;
703 u64 hi,lo;
704 hi = (u64)GETU32(p) <<32|GETU32(p+4);
705 lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
706 ctx->H.u[0] = hi;
707 ctx->H.u[1] = lo;
708#endif
709 }
710
711#if TABLE_BITS==8
712 gcm_init_8bit(ctx->Htable,ctx->H.u);
713#elif TABLE_BITS==4
714# if defined(GHASH_ASM_X86_OR_64)
715# if !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
716 if (OPENSSL_ia32cap_P[0]&(1<<24) && /* check FXSR bit */
717 OPENSSL_ia32cap_P[1]&(1<<1) ) { /* check PCLMULQDQ bit */
718 gcm_init_clmul(ctx->Htable,ctx->H.u);
719 ctx->gmult = gcm_gmult_clmul;
720 ctx->ghash = gcm_ghash_clmul;
721 return;
722 }
723# endif
724 gcm_init_4bit(ctx->Htable,ctx->H.u);
725# if defined(GHASH_ASM_X86) /* x86 only */
726# if defined(OPENSSL_IA32_SSE2)
727 if (OPENSSL_ia32cap_P[0]&(1<<25)) { /* check SSE bit */
728# else
729 if (OPENSSL_ia32cap_P[0]&(1<<23)) { /* check MMX bit */
730# endif
731 ctx->gmult = gcm_gmult_4bit_mmx;
732 ctx->ghash = gcm_ghash_4bit_mmx;
733 } else {
734 ctx->gmult = gcm_gmult_4bit_x86;
735 ctx->ghash = gcm_ghash_4bit_x86;
736 }
737# else
738 ctx->gmult = gcm_gmult_4bit;
739 ctx->ghash = gcm_ghash_4bit;
740# endif
741# elif defined(GHASH_ASM_ARM)
742 if (OPENSSL_armcap_P & ARMV7_NEON) {
743 ctx->gmult = gcm_gmult_neon;
744 ctx->ghash = gcm_ghash_neon;
745 } else {
746 gcm_init_4bit(ctx->Htable,ctx->H.u);
747 ctx->gmult = gcm_gmult_4bit;
748 ctx->ghash = gcm_ghash_4bit;
749 }
750# else
751 gcm_init_4bit(ctx->Htable,ctx->H.u);
752# endif
753#endif
754}
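/*
 * A minimal end-to-end sketch of the interface initialised above (a sketch
 * only, assuming AES from <openssl/aes.h> as the block128_f and the usual
 * convention that CRYPTO_gcm128_finish() returns 0 when the supplied tag
 * matches the computed one).
 */
static int gcm_decrypt_demo(const unsigned char key[16],
    const unsigned char iv[12],
    const unsigned char *aad, size_t aadlen,
    const unsigned char *ct, unsigned char *pt, size_t len,
    const unsigned char tag[16])
{
	GCM128_CONTEXT ctx;
	AES_KEY aes;

	AES_set_encrypt_key(key, 128, &aes);	/* GCM only uses the forward cipher */
	CRYPTO_gcm128_init(&ctx, &aes, (block128_f)AES_encrypt);
	CRYPTO_gcm128_setiv(&ctx, iv, 12);	/* 12-byte IV takes the simple path */
	if (CRYPTO_gcm128_aad(&ctx, aad, aadlen))	/* AAD must precede the payload */
		return -1;
	if (CRYPTO_gcm128_decrypt(&ctx, ct, pt, len))
		return -1;
	return CRYPTO_gcm128_finish(&ctx, tag, 16);	/* 0 means the tag verified */
}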
755
756void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
757{
758 const union { long one; char little; } is_endian = {1};
759 unsigned int ctr;
760#ifdef GCM_FUNCREF_4BIT
761 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
762#endif
763
764 ctx->Yi.u[0] = 0;
765 ctx->Yi.u[1] = 0;
766 ctx->Xi.u[0] = 0;
767 ctx->Xi.u[1] = 0;
768 ctx->len.u[0] = 0; /* AAD length */
769 ctx->len.u[1] = 0; /* message length */
770 ctx->ares = 0;
771 ctx->mres = 0;
772
773 if (len==12) {
774 memcpy(ctx->Yi.c,iv,12);
775 ctx->Yi.c[15]=1;
776 ctr=1;
777 }
778 else {
779 size_t i;
780 u64 len0 = len;
781
782 while (len>=16) {
783 for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
784 GCM_MUL(ctx,Yi);
785 iv += 16;
786 len -= 16;
787 }
788 if (len) {
789 for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
790 GCM_MUL(ctx,Yi);
791 }
792 len0 <<= 3;
793 if (is_endian.little) {
794#ifdef BSWAP8
795 ctx->Yi.u[1] ^= BSWAP8(len0);
796#else
797 ctx->Yi.c[8] ^= (u8)(len0>>56);
798 ctx->Yi.c[9] ^= (u8)(len0>>48);
799 ctx->Yi.c[10] ^= (u8)(len0>>40);
800 ctx->Yi.c[11] ^= (u8)(len0>>32);
801 ctx->Yi.c[12] ^= (u8)(len0>>24);
802 ctx->Yi.c[13] ^= (u8)(len0>>16);
803 ctx->Yi.c[14] ^= (u8)(len0>>8);
804 ctx->Yi.c[15] ^= (u8)(len0);
805#endif
806 }
807 else
808 ctx->Yi.u[1] ^= len0;
809
810 GCM_MUL(ctx,Yi);
811
812 if (is_endian.little)
813#ifdef BSWAP4
814 ctr = BSWAP4(ctx->Yi.d[3]);
815#else
816 ctr = GETU32(ctx->Yi.c+12);
817#endif
818 else
819 ctr = ctx->Yi.d[3];
820 }
821
822 (*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
823 ++ctr;
824 if (is_endian.little)
825#ifdef BSWAP4
826 ctx->Yi.d[3] = BSWAP4(ctr);
827#else
828 PUTU32(ctx->Yi.c+12,ctr);
829#endif
830 else
831 ctx->Yi.d[3] = ctr;
832}
833
834int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
835{
836 size_t i;
837 unsigned int n;
838 u64 alen = ctx->len.u[0];
839#ifdef GCM_FUNCREF_4BIT
840 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
841# ifdef GHASH
842 void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
843 const u8 *inp,size_t len) = ctx->ghash;
844# endif
845#endif
846
847 if (ctx->len.u[1]) return -2;
848
849 alen += len;
850 if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
851 return -1;
852 ctx->len.u[0] = alen;
853
854 n = ctx->ares;
855 if (n) {
856 while (n && len) {
857 ctx->Xi.c[n] ^= *(aad++);
858 --len;
859 n = (n+1)%16;
860 }
861 if (n==0) GCM_MUL(ctx,Xi);
862 else {
863 ctx->ares = n;
864 return 0;
865 }
866 }
867
868#ifdef GHASH
869 if ((i = (len&(size_t)-16))) {
870 GHASH(ctx,aad,i);
871 aad += i;
872 len -= i;
873 }
874#else
875 while (len>=16) {
876 for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
877 GCM_MUL(ctx,Xi);
878 aad += 16;
879 len -= 16;
880 }
881#endif
882 if (len) {
883 n = (unsigned int)len;
884 for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
885 }
886
887 ctx->ares = n;
888 return 0;
889}
890
891int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
892 const unsigned char *in, unsigned char *out,
893 size_t len)
894{
895 const union { long one; char little; } is_endian = {1};
896 unsigned int n, ctr;
897 size_t i;
898 u64 mlen = ctx->len.u[1];
899 block128_f block = ctx->block;
900 void *key = ctx->key;
901#ifdef GCM_FUNCREF_4BIT
902 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
903# ifdef GHASH
904 void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
905 const u8 *inp,size_t len) = ctx->ghash;
906# endif
907#endif
908
909#if 0
910 n = (unsigned int)mlen%16; /* alternative to ctx->mres */
911#endif
912 mlen += len;
913 if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
914 return -1;
915 ctx->len.u[1] = mlen;
916
917 if (ctx->ares) {
918 /* First call to encrypt finalizes GHASH(AAD) */
919 GCM_MUL(ctx,Xi);
920 ctx->ares = 0;
921 }
922
923 if (is_endian.little)
924#ifdef BSWAP4
925 ctr = BSWAP4(ctx->Yi.d[3]);
926#else
927 ctr = GETU32(ctx->Yi.c+12);
928#endif
929 else
930 ctr = ctx->Yi.d[3];
931
932 n = ctx->mres;
933#if !defined(OPENSSL_SMALL_FOOTPRINT)
934 if (16%sizeof(size_t) == 0) do { /* always true actually */
935 if (n) {
936 while (n && len) {
937 ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
938 --len;
939 n = (n+1)%16;
940 }
941 if (n==0) GCM_MUL(ctx,Xi);
942 else {
943 ctx->mres = n;
944 return 0;
945 }
946 }
947#if defined(STRICT_ALIGNMENT)
948 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
949 break;
950#endif
951#if defined(GHASH) && defined(GHASH_CHUNK)
952 while (len>=GHASH_CHUNK) {
953 size_t j=GHASH_CHUNK;
954
955 while (j) {
956 size_t *out_t=(size_t *)out;
957 const size_t *in_t=(const size_t *)in;
958
959 (*block)(ctx->Yi.c,ctx->EKi.c,key);
960 ++ctr;
961 if (is_endian.little)
962#ifdef BSWAP4
963 ctx->Yi.d[3] = BSWAP4(ctr);
964#else
965 PUTU32(ctx->Yi.c+12,ctr);
966#endif
967 else
968 ctx->Yi.d[3] = ctr;
969 for (i=0; i<16/sizeof(size_t); ++i)
970 out_t[i] = in_t[i] ^ ctx->EKi.t[i];
971 out += 16;
972 in += 16;
973 j -= 16;
974 }
975 GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
976 len -= GHASH_CHUNK;
977 }
978 if ((i = (len&(size_t)-16))) {
979 size_t j=i;
980
981 while (len>=16) {
982 size_t *out_t=(size_t *)out;
983 const size_t *in_t=(const size_t *)in;
984
985 (*block)(ctx->Yi.c,ctx->EKi.c,key);
986 ++ctr;
987 if (is_endian.little)
988#ifdef BSWAP4
989 ctx->Yi.d[3] = BSWAP4(ctr);
990#else
991 PUTU32(ctx->Yi.c+12,ctr);
992#endif
993 else
994 ctx->Yi.d[3] = ctr;
995 for (i=0; i<16/sizeof(size_t); ++i)
996 out_t[i] = in_t[i] ^ ctx->EKi.t[i];
997 out += 16;
998 in += 16;
999 len -= 16;
1000 }
1001 GHASH(ctx,out-j,j);
1002 }
1003#else
1004 while (len>=16) {
1005 size_t *out_t=(size_t *)out;
1006 const size_t *in_t=(const size_t *)in;
1007
1008 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1009 ++ctr;
1010 if (is_endian.little)
1011#ifdef BSWAP4
1012 ctx->Yi.d[3] = BSWAP4(ctr);
1013#else
1014 PUTU32(ctx->Yi.c+12,ctr);
1015#endif
1016 else
1017 ctx->Yi.d[3] = ctr;
1018 for (i=0; i<16/sizeof(size_t); ++i)
1019 ctx->Xi.t[i] ^=
1020 out_t[i] = in_t[i]^ctx->EKi.t[i];
1021 GCM_MUL(ctx,Xi);
1022 out += 16;
1023 in += 16;
1024 len -= 16;
1025 }
1026#endif
1027 if (len) {
1028 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1029 ++ctr;
1030 if (is_endian.little)
1031#ifdef BSWAP4
1032 ctx->Yi.d[3] = BSWAP4(ctr);
1033#else
1034 PUTU32(ctx->Yi.c+12,ctr);
1035#endif
1036 else
1037 ctx->Yi.d[3] = ctr;
1038 while (len--) {
1039 ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
1040 ++n;
1041 }
1042 }
1043
1044 ctx->mres = n;
1045 return 0;
1046 } while(0);
1047#endif
1048 for (i=0;i<len;++i) {
1049 if (n==0) {
1050 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1051 ++ctr;
1052 if (is_endian.little)
1053#ifdef BSWAP4
1054 ctx->Yi.d[3] = BSWAP4(ctr);
1055#else
1056 PUTU32(ctx->Yi.c+12,ctr);
1057#endif
1058 else
1059 ctx->Yi.d[3] = ctr;
1060 }
1061 ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
1062 n = (n+1)%16;
1063 if (n==0)
1064 GCM_MUL(ctx,Xi);
1065 }
1066
1067 ctx->mres = n;
1068 return 0;
1069}
1070
1071int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
1072 const unsigned char *in, unsigned char *out,
1073 size_t len)
1074{
1075 const union { long one; char little; } is_endian = {1};
1076 unsigned int n, ctr;
1077 size_t i;
1078 u64 mlen = ctx->len.u[1];
1079 block128_f block = ctx->block;
1080 void *key = ctx->key;
1081#ifdef GCM_FUNCREF_4BIT
1082 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
1083# ifdef GHASH
1084 void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1085 const u8 *inp,size_t len) = ctx->ghash;
1086# endif
1087#endif
1088
1089 mlen += len;
1090 if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1091 return -1;
1092 ctx->len.u[1] = mlen;
1093
1094 if (ctx->ares) {
1095 /* First call to decrypt finalizes GHASH(AAD) */
1096 GCM_MUL(ctx,Xi);
1097 ctx->ares = 0;
1098 }
1099
1100 if (is_endian.little)
1101#ifdef BSWAP4
1102 ctr = BSWAP4(ctx->Yi.d[3]);
1103#else
1104 ctr = GETU32(ctx->Yi.c+12);
1105#endif
1106 else
1107 ctr = ctx->Yi.d[3];
1108
1109 n = ctx->mres;
1110#if !defined(OPENSSL_SMALL_FOOTPRINT)
1111 if (16%sizeof(size_t) == 0) do { /* always true actually */
1112 if (n) {
1113 while (n && len) {
1114 u8 c = *(in++);
1115 *(out++) = c^ctx->EKi.c[n];
1116 ctx->Xi.c[n] ^= c;
1117 --len;
1118 n = (n+1)%16;
1119 }
1120 if (n==0) GCM_MUL (ctx,Xi);
1121 else {
1122 ctx->mres = n;
1123 return 0;
1124 }
1125 }
1126#if defined(STRICT_ALIGNMENT)
1127 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
1128 break;
1129#endif
1130#if defined(GHASH) && defined(GHASH_CHUNK)
1131 while (len>=GHASH_CHUNK) {
1132 size_t j=GHASH_CHUNK;
1133
1134 GHASH(ctx,in,GHASH_CHUNK);
1135 while (j) {
1136 size_t *out_t=(size_t *)out;
1137 const size_t *in_t=(const size_t *)in;
1138
1139 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1140 ++ctr;
1141 if (is_endian.little)
1142#ifdef BSWAP4
1143 ctx->Yi.d[3] = BSWAP4(ctr);
1144#else
1145 PUTU32(ctx->Yi.c+12,ctr);
1146#endif
1147 else
1148 ctx->Yi.d[3] = ctr;
1149 for (i=0; i<16/sizeof(size_t); ++i)
1150 out_t[i] = in_t[i]^ctx->EKi.t[i];
1151 out += 16;
1152 in += 16;
1153 j -= 16;
1154 }
1155 len -= GHASH_CHUNK;
1156 }
1157 if ((i = (len&(size_t)-16))) {
1158 GHASH(ctx,in,i);
1159 while (len>=16) {
1160 size_t *out_t=(size_t *)out;
1161 const size_t *in_t=(const size_t *)in;
1162
1163 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1164 ++ctr;
1165 if (is_endian.little)
1166#ifdef BSWAP4
1167 ctx->Yi.d[3] = BSWAP4(ctr);
1168#else
1169 PUTU32(ctx->Yi.c+12,ctr);
1170#endif
1171 else
1172 ctx->Yi.d[3] = ctr;
1173 for (i=0; i<16/sizeof(size_t); ++i)
1174 out_t[i] = in_t[i]^ctx->EKi.t[i];
1175 out += 16;
1176 in += 16;
1177 len -= 16;
1178 }
1179 }
1180#else
1181 while (len>=16) {
1182 size_t *out_t=(size_t *)out;
1183 const size_t *in_t=(const size_t *)in;
1184
1185 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1186 ++ctr;
1187 if (is_endian.little)
1188#ifdef BSWAP4
1189 ctx->Yi.d[3] = BSWAP4(ctr);
1190#else
1191 PUTU32(ctx->Yi.c+12,ctr);
1192#endif
1193 else
1194 ctx->Yi.d[3] = ctr;
1195 for (i=0; i<16/sizeof(size_t); ++i) {
1196 size_t c = in[i];
1197 out[i] = c^ctx->EKi.t[i];
1198 ctx->Xi.t[i] ^= c;
1199 }
1200 GCM_MUL(ctx,Xi);
1201 out += 16;
1202 in += 16;
1203 len -= 16;
1204 }
1205#endif
1206 if (len) {
1207 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1208 ++ctr;
1209 if (is_endian.little)
1210#ifdef BSWAP4
1211 ctx->Yi.d[3] = BSWAP4(ctr);
1212#else
1213 PUTU32(ctx->Yi.c+12,ctr);
1214#endif
1215 else
1216 ctx->Yi.d[3] = ctr;
1217 while (len--) {
1218 u8 c = in[n];
1219 ctx->Xi.c[n] ^= c;
1220 out[n] = c^ctx->EKi.c[n];
1221 ++n;
1222 }
1223 }
1224
1225 ctx->mres = n;
1226 return 0;
1227 } while(0);
1228#endif
1229 for (i=0;i<len;++i) {
1230 u8 c;
1231 if (n==0) {
1232 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1233 ++ctr;
1234 if (is_endian.little)
1235#ifdef BSWAP4
1236 ctx->Yi.d[3] = BSWAP4(ctr);
1237#else
1238 PUTU32(ctx->Yi.c+12,ctr);
1239#endif
1240 else
1241 ctx->Yi.d[3] = ctr;
1242 }
1243 c = in[i];
1244 out[i] = c^ctx->EKi.c[n];
1245 ctx->Xi.c[n] ^= c;
1246 n = (n+1)%16;
1247 if (n==0)
1248 GCM_MUL(ctx,Xi);
1249 }
1250
1251 ctx->mres = n;
1252 return 0;
1253}
1254
1255int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
1256 const unsigned char *in, unsigned char *out,
1257 size_t len, ctr128_f stream)
1258{
1259 const union { long one; char little; } is_endian = {1};
1260 unsigned int n, ctr;
1261 size_t i;
1262 u64 mlen = ctx->len.u[1];
1263 void *key = ctx->key;
1264#ifdef GCM_FUNCREF_4BIT
1265 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
1266# ifdef GHASH
1267 void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1268 const u8 *inp,size_t len) = ctx->ghash;
1269# endif
1270#endif
1271
1272 mlen += len;
1273 if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1274 return -1;
1275 ctx->len.u[1] = mlen;
1276
1277 if (ctx->ares) {
1278 /* First call to encrypt finalizes GHASH(AAD) */
1279 GCM_MUL(ctx,Xi);
1280 ctx->ares = 0;
1281 }
1282
1283 if (is_endian.little)
1284#ifdef BSWAP4
1285 ctr = BSWAP4(ctx->Yi.d[3]);
1286#else
1287 ctr = GETU32(ctx->Yi.c+12);
1288#endif
1289 else
1290 ctr = ctx->Yi.d[3];
1291
1292 n = ctx->mres;
1293 if (n) {
1294 while (n && len) {
1295 ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
1296 --len;
1297 n = (n+1)%16;
1298 }
1299 if (n==0) GCM_MUL(ctx,Xi);
1300 else {
1301 ctx->mres = n;
1302 return 0;
1303 }
1304 }
1305#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1306 while (len>=GHASH_CHUNK) {
1307 (*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
1308 ctr += GHASH_CHUNK/16;
1309 if (is_endian.little)
1310#ifdef BSWAP4
1311 ctx->Yi.d[3] = BSWAP4(ctr);
1312#else
1313 PUTU32(ctx->Yi.c+12,ctr);
1314#endif
1315 else
1316 ctx->Yi.d[3] = ctr;
1317 GHASH(ctx,out,GHASH_CHUNK);
1318 out += GHASH_CHUNK;
1319 in += GHASH_CHUNK;
1320 len -= GHASH_CHUNK;
1321 }
1322#endif
1323 if ((i = (len&(size_t)-16))) {
1324 size_t j=i/16;
1325
1326 (*stream)(in,out,j,key,ctx->Yi.c);
1327 ctr += (unsigned int)j;
1328 if (is_endian.little)
1329#ifdef BSWAP4
1330 ctx->Yi.d[3] = BSWAP4(ctr);
1331#else
1332 PUTU32(ctx->Yi.c+12,ctr);
1333#endif
1334 else
1335 ctx->Yi.d[3] = ctr;
1336 in += i;
1337 len -= i;
1338#if defined(GHASH)
1339 GHASH(ctx,out,i);
1340 out += i;
1341#else
1342 while (j--) {
1343 for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
1344 GCM_MUL(ctx,Xi);
1345 out += 16;
1346 }
1347#endif
1348 }
1349 if (len) {
1350 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
1351 ++ctr;
1352 if (is_endian.little)
1353#ifdef BSWAP4
1354 ctx->Yi.d[3] = BSWAP4(ctr);
1355#else
1356 PUTU32(ctx->Yi.c+12,ctr);
1357#endif
1358 else
1359 ctx->Yi.d[3] = ctr;
1360 while (len--) {
1361 ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
1362 ++n;
1363 }
1364 }
1365
1366 ctx->mres = n;
1367 return 0;
1368}
1369
1370int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
1371 const unsigned char *in, unsigned char *out,
1372 size_t len,ctr128_f stream)
1373{
1374 const union { long one; char little; } is_endian = {1};
1375 unsigned int n, ctr;
1376 size_t i;
1377 u64 mlen = ctx->len.u[1];
1378 void *key = ctx->key;
1379#ifdef GCM_FUNCREF_4BIT
1380 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
1381# ifdef GHASH
1382 void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1383 const u8 *inp,size_t len) = ctx->ghash;
1384# endif
1385#endif
1386
1387 mlen += len;
1388 if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1389 return -1;
1390 ctx->len.u[1] = mlen;
1391
1392 if (ctx->ares) {
1393 /* First call to decrypt finalizes GHASH(AAD) */
1394 GCM_MUL(ctx,Xi);
1395 ctx->ares = 0;
1396 }
1397
1398 if (is_endian.little)
1399#ifdef BSWAP4
1400 ctr = BSWAP4(ctx->Yi.d[3]);
1401#else
1402 ctr = GETU32(ctx->Yi.c+12);
1403#endif
1404 else
1405 ctr = ctx->Yi.d[3];
1406
1407 n = ctx->mres;
1408 if (n) {
1409 while (n && len) {
1410 u8 c = *(in++);
1411 *(out++) = c^ctx->EKi.c[n];
1412 ctx->Xi.c[n] ^= c;
1413 --len;
1414 n = (n+1)%16;
1415 }
1416 if (n==0) GCM_MUL (ctx,Xi);
1417 else {
1418 ctx->mres = n;
1419 return 0;
1420 }
1421 }
1422#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1423 while (len>=GHASH_CHUNK) {
1424 GHASH(ctx,in,GHASH_CHUNK);
1425 (*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
1426 ctr += GHASH_CHUNK/16;
1427 if (is_endian.little)
1428#ifdef BSWAP4
1429 ctx->Yi.d[3] = BSWAP4(ctr);
1430#else
1431 PUTU32(ctx->Yi.c+12,ctr);
1432#endif
1433 else
1434 ctx->Yi.d[3] = ctr;
1435 out += GHASH_CHUNK;
1436 in += GHASH_CHUNK;
1437 len -= GHASH_CHUNK;
1438 }
1439#endif
1440 if ((i = (len&(size_t)-16))) {
1441 size_t j=i/16;
1442
1443#if defined(GHASH)
1444 GHASH(ctx,in,i);
1445#else
1446 while (j--) {
1447 size_t k;
1448 for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
1449 GCM_MUL(ctx,Xi);
1450 in += 16;
1451 }
1452 j = i/16;
1453 in -= i;
1454#endif
1455 (*stream)(in,out,j,key,ctx->Yi.c);
1456 ctr += (unsigned int)j;
1457 if (is_endian.little)
1458#ifdef BSWAP4
1459 ctx->Yi.d[3] = BSWAP4(ctr);
1460#else
1461 PUTU32(ctx->Yi.c+12,ctr);
1462#endif
1463 else
1464 ctx->Yi.d[3] = ctr;
1465 out += i;
1466 in += i;
1467 len -= i;
1468 }
1469 if (len) {
1470 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
1471 ++ctr;
1472 if (is_endian.little)
1473#ifdef BSWAP4
1474 ctx->Yi.d[3] = BSWAP4(ctr);
1475#else
1476 PUTU32(ctx->Yi.c+12,ctr);
1477#endif
1478 else
1479 ctx->Yi.d[3] = ctr;
1480 while (len--) {
1481 u8 c = in[n];
1482 ctx->Xi.c[n] ^= c;
1483 out[n] = c^ctx->EKi.c[n];
1484 ++n;
1485 }
1486 }
1487
1488 ctx->mres = n;
1489 return 0;
1490}
1491
1492int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
1493 size_t len)
1494{
1495 const union { long one; char little; } is_endian = {1};
1496 u64 alen = ctx->len.u[0]<<3;
1497 u64 clen = ctx->len.u[1]<<3;
1498#ifdef GCM_FUNCREF_4BIT
1499 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
1500#endif
1501
1502 if (ctx->mres || ctx->ares)
1503 GCM_MUL(ctx,Xi);
1504
1505 if (is_endian.little) {
1506#ifdef BSWAP8
1507 alen = BSWAP8(alen);
1508 clen = BSWAP8(clen);
1509#else
1510 u8 *p = ctx->len.c;
1511
1512 ctx->len.u[0] = alen;
1513 ctx->len.u[1] = clen;
1514
1515 alen = (u64)GETU32(p) <<32|GETU32(p+4);
1516 clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
1517#endif
1518 }
1519
1520 ctx->Xi.u[0] ^= alen;
1521 ctx->Xi.u[1] ^= clen;
1522 GCM_MUL(ctx,Xi);
1523
1524 ctx->Xi.u[0] ^= ctx->EK0.u[0];
1525 ctx->Xi.u[1] ^= ctx->EK0.u[1];
1526
1527 if (tag && len<=sizeof(ctx->Xi))
1528 return memcmp(ctx->Xi.c,tag,len);
1529 else
1530 return -1;
1531}
1532
1533void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1534{
1535 CRYPTO_gcm128_finish(ctx, NULL, 0);
1536 memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
1537}
1538
1539GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
1540{
1541 GCM128_CONTEXT *ret;
1542
1543 if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT))))
1544 CRYPTO_gcm128_init(ret,key,block);
1545
1546 return ret;
1547}
1548
1549void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1550{
1551 if (ctx) {
1552 OPENSSL_cleanse(ctx,sizeof(*ctx));
1553 OPENSSL_free(ctx);
1554 }
1555}
1556
1557#if defined(SELFTEST)
1558#include <stdio.h>
1559#include <openssl/aes.h>
1560
1561/* Test Case 1 */
1562static const u8 K1[16],
1563 *P1=NULL,
1564 *A1=NULL,
1565 IV1[12],
1566 *C1=NULL,
1567 T1[]= {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};
1568
1569/* Test Case 2 */
1570#define K2 K1
1571#define A2 A1
1572#define IV2 IV1
1573static const u8 P2[16],
1574 C2[]= {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
1575 T2[]= {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};
1576
1577/* Test Case 3 */
1578#define A3 A2
1579static const u8 K3[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1580 P3[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1581 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1582 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1583 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1584 IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1585 C3[]= {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1586 0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1587 0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1588 0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
1589 T3[]= {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4};
1590
1591/* Test Case 4 */
1592#define K4 K3
1593#define IV4 IV3
1594static const u8 P4[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1595 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1596 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1597 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1598 A4[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1599 0xab,0xad,0xda,0xd2},
1600 C4[]= {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1601 0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1602 0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1603 0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
1604 T4[]= {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};
1605
1606/* Test Case 5 */
1607#define K5 K4
1608#define P5 P4
1609#define A5 A4
1610static const u8 IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1611 C5[]= {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
1612 0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
1613 0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
1614 0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
1615 T5[]= {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};
1616
1617/* Test Case 6 */
1618#define K6 K5
1619#define P6 P5
1620#define A6 A5
1621static const u8 IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1622 0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1623 0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1624 0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1625 C6[]= {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
1626 0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
1627 0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
1628 0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
1629 T6[]= {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};
1630
1631/* Test Case 7 */
1632static const u8 K7[24],
1633 *P7=NULL,
1634 *A7=NULL,
1635 IV7[12],
1636 *C7=NULL,
1637 T7[]= {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};
1638
1639/* Test Case 8 */
1640#define K8 K7
1641#define IV8 IV7
1642#define A8 A7
1643static const u8 P8[16],
1644 C8[]= {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
1645 T8[]= {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};
1646
1647/* Test Case 9 */
1648#define A9 A8
1649static const u8 K9[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1650 0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
1651 P9[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1652 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1653 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1654 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1655 IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1656 C9[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1657 0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1658 0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1659 0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
1660 T9[]= {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};
1661
1662/* Test Case 10 */
1663#define K10 K9
1664#define IV10 IV9
1665static const u8 P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1666 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1667 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1668 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1669 A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1670 0xab,0xad,0xda,0xd2},
1671 C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1672 0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1673 0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1674 0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
1675 T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};
1676
1677/* Test Case 11 */
1678#define K11 K10
1679#define P11 P10
1680#define A11 A10
1681static const u8 IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1682 C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
1683 0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
1684 0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
1685 0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
1686 T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};
1687
1688/* Test Case 12 */
1689#define K12 K11
1690#define P12 P11
1691#define A12 A11
1692static const u8 IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1693 0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1694 0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1695 0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1696 C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
1697 0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
1698 0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
1699 0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
1700 T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};
1701
1702/* Test Case 13 */
1703static const u8 K13[32],
1704 *P13=NULL,
1705 *A13=NULL,
1706 IV13[12],
1707 *C13=NULL,
1708 T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};
1709
1710/* Test Case 14 */
1711#define K14 K13
1712#define A14 A13
1713static const u8 P14[16],
1714 IV14[12],
1715 C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
1716 T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};
1717
1718/* Test Case 15 */
1719#define A15 A14
1720static const u8 K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1721 0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1722 P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1723 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1724 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1725 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1726 IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1727 C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1728 0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1729 0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1730 0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
1731 T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};
1732
1733/* Test Case 16 */
1734#define K16 K15
1735#define IV16 IV15
1736static const u8 P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1737 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1738 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1739 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1740 A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1741 0xab,0xad,0xda,0xd2},
1742 C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1743 0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1744 0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1745 0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
1746 T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};
1747
1748/* Test Case 17 */
1749#define K17 K16
1750#define P17 P16
1751#define A17 A16
1752static const u8 IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1753 C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
1754 0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
1755 0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
1756 0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
1757 T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};
1758
1759/* Test Case 18 */
1760#define K18 K17
1761#define P18 P17
1762#define A18 A17
1763static const u8 IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1764 0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1765 0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1766 0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1767 C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
1768 0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
1769 0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
1770 0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
1771 T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};
1772
1773/* Test Case 19 */
1774#define K19 K1
1775#define P19 P1
1776#define IV19 IV1
1777#define C19 C1
1778static const u8 A19[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1779 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1780 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1781 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55,
1782 0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1783 0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1784 0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1785 0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
1786 T19[]= {0x5f,0xea,0x79,0x3a,0x2d,0x6f,0x97,0x4d,0x37,0xe6,0x8e,0x0c,0xb8,0xff,0x94,0x92};
1787
1788/* Test Case 20 */
1789#define K20 K1
1790#define A20 A1
1791static const u8 IV20[64]={0xff,0xff,0xff,0xff}, /* this results in 0xff in counter LSB */
1792 P20[288],
1793 C20[]= {0x56,0xb3,0x37,0x3c,0xa9,0xef,0x6e,0x4a,0x2b,0x64,0xfe,0x1e,0x9a,0x17,0xb6,0x14,
1794 0x25,0xf1,0x0d,0x47,0xa7,0x5a,0x5f,0xce,0x13,0xef,0xc6,0xbc,0x78,0x4a,0xf2,0x4f,
1795 0x41,0x41,0xbd,0xd4,0x8c,0xf7,0xc7,0x70,0x88,0x7a,0xfd,0x57,0x3c,0xca,0x54,0x18,
1796 0xa9,0xae,0xff,0xcd,0x7c,0x5c,0xed,0xdf,0xc6,0xa7,0x83,0x97,0xb9,0xa8,0x5b,0x49,
1797 0x9d,0xa5,0x58,0x25,0x72,0x67,0xca,0xab,0x2a,0xd0,0xb2,0x3c,0xa4,0x76,0xa5,0x3c,
1798 0xb1,0x7f,0xb4,0x1c,0x4b,0x8b,0x47,0x5c,0xb4,0xf3,0xf7,0x16,0x50,0x94,0xc2,0x29,
1799 0xc9,0xe8,0xc4,0xdc,0x0a,0x2a,0x5f,0xf1,0x90,0x3e,0x50,0x15,0x11,0x22,0x13,0x76,
1800 0xa1,0xcd,0xb8,0x36,0x4c,0x50,0x61,0xa2,0x0c,0xae,0x74,0xbc,0x4a,0xcd,0x76,0xce,
1801 0xb0,0xab,0xc9,0xfd,0x32,0x17,0xef,0x9f,0x8c,0x90,0xbe,0x40,0x2d,0xdf,0x6d,0x86,
1802 0x97,0xf4,0xf8,0x80,0xdf,0xf1,0x5b,0xfb,0x7a,0x6b,0x28,0x24,0x1e,0xc8,0xfe,0x18,
1803 0x3c,0x2d,0x59,0xe3,0xf9,0xdf,0xff,0x65,0x3c,0x71,0x26,0xf0,0xac,0xb9,0xe6,0x42,
1804 0x11,0xf4,0x2b,0xae,0x12,0xaf,0x46,0x2b,0x10,0x70,0xbe,0xf1,0xab,0x5e,0x36,0x06,
1805 0x87,0x2c,0xa1,0x0d,0xee,0x15,0xb3,0x24,0x9b,0x1a,0x1b,0x95,0x8f,0x23,0x13,0x4c,
1806 0x4b,0xcc,0xb7,0xd0,0x32,0x00,0xbc,0xe4,0x20,0xa2,0xf8,0xeb,0x66,0xdc,0xf3,0x64,
1807 0x4d,0x14,0x23,0xc1,0xb5,0x69,0x90,0x03,0xc1,0x3e,0xce,0xf4,0xbf,0x38,0xa3,0xb6,
1808 0x0e,0xed,0xc3,0x40,0x33,0xba,0xc1,0x90,0x27,0x83,0xdc,0x6d,0x89,0xe2,0xe7,0x74,
1809 0x18,0x8a,0x43,0x9c,0x7e,0xbc,0xc0,0x67,0x2d,0xbd,0xa4,0xdd,0xcf,0xb2,0x79,0x46,
1810 0x13,0xb0,0xbe,0x41,0x31,0x5e,0xf7,0x78,0x70,0x8a,0x70,0xee,0x7d,0x75,0x16,0x5c},
1811 T20[]= {0x8b,0x30,0x7f,0x6b,0x33,0x28,0x6d,0x0a,0xb0,0x26,0xa9,0xed,0x3f,0xe1,0xe8,0x5f};
1812
1813#define TEST_CASE(n) do { \
1814 u8 out[sizeof(P##n)]; \
1815 AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key); \
1816 CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt); \
1817 CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n)); \
1818 memset(out,0,sizeof(out)); \
1819 if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n)); \
1820 if (P##n) CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out)); \
1821 if (CRYPTO_gcm128_finish(&ctx,T##n,16) || \
1822 (C##n && memcmp(out,C##n,sizeof(out)))) \
1823 ret++, printf ("encrypt test#%d failed.\n",n); \
1824 CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n)); \
1825 memset(out,0,sizeof(out)); \
1826 if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n)); \
1827 if (C##n) CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out)); \
1828 if (CRYPTO_gcm128_finish(&ctx,T##n,16) || \
1829 (P##n && memcmp(out,P##n,sizeof(out)))) \
1830 ret++, printf ("decrypt test#%d failed.\n",n); \
1831 } while(0)
1832
1833int main()
1834{
1835 GCM128_CONTEXT ctx;
1836 AES_KEY key;
1837 int ret=0;
1838
1839 TEST_CASE(1);
1840 TEST_CASE(2);
1841 TEST_CASE(3);
1842 TEST_CASE(4);
1843 TEST_CASE(5);
1844 TEST_CASE(6);
1845 TEST_CASE(7);
1846 TEST_CASE(8);
1847 TEST_CASE(9);
1848 TEST_CASE(10);
1849 TEST_CASE(11);
1850 TEST_CASE(12);
1851 TEST_CASE(13);
1852 TEST_CASE(14);
1853 TEST_CASE(15);
1854 TEST_CASE(16);
1855 TEST_CASE(17);
1856 TEST_CASE(18);
1857 TEST_CASE(19);
1858 TEST_CASE(20);
1859
1860#ifdef OPENSSL_CPUID_OBJ
1861 {
1862 size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc();
1863 union { u64 u; u8 c[1024]; } buf;
1864 int i;
1865
1866 AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
1867 CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
1868 CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1));
1869
1870 CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1871 start = OPENSSL_rdtsc();
1872 CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1873 gcm_t = OPENSSL_rdtsc() - start;
1874
1875 CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1876 &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1877 (block128_f)AES_encrypt);
1878 start = OPENSSL_rdtsc();
1879 CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1880 &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1881 (block128_f)AES_encrypt);
1882 ctr_t = OPENSSL_rdtsc() - start;
1883
1884 printf("%.2f-%.2f=%.2f\n",
1885 gcm_t/(double)sizeof(buf),
1886 ctr_t/(double)sizeof(buf),
1887 (gcm_t-ctr_t)/(double)sizeof(buf));
1888#ifdef GHASH
1889 {
1890 void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1891 const u8 *inp,size_t len) = ctx.ghash;
1892
1893 GHASH((&ctx),buf.c,sizeof(buf));
1894 start = OPENSSL_rdtsc();
1895 for (i=0;i<100;++i) GHASH((&ctx),buf.c,sizeof(buf));
1896 gcm_t = OPENSSL_rdtsc() - start;
1897 printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i);
1898 }
1899#endif
1900 }
1901#endif
1902
1903 return ret;
1904}
1905#endif
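
The SELFTEST code above already exercises the full one-shot GCM call sequence through the TEST_CASE macro. For readers following the API rather than the macro, here is a minimal hedged sketch of the same sequence using the heap-allocating entry points; the wrapper name, buffer names and the assumption that the public header installs as <openssl/modes.h> are illustrative, not part of the original sources.

	#include <openssl/aes.h>
	#include <openssl/modes.h>

	/* Hedged sketch, not from the diff: one-shot AES-128-GCM encryption
	 * using the call order exercised by TEST_CASE above. */
	static int gcm_encrypt_oneshot(const unsigned char key16[16],
	    const unsigned char *iv, size_t ivlen,
	    const unsigned char *aad, size_t aadlen,
	    const unsigned char *pt, unsigned char *ct, size_t len,
	    unsigned char tag[16])
	{
		AES_KEY aes;
		GCM128_CONTEXT *gcm;

		AES_set_encrypt_key(key16, 128, &aes);
		if ((gcm = CRYPTO_gcm128_new(&aes, (block128_f)AES_encrypt)) == NULL)
			return -1;
		CRYPTO_gcm128_setiv(gcm, iv, ivlen);
		if (aadlen)
			CRYPTO_gcm128_aad(gcm, aad, aadlen);
		if (len)
			CRYPTO_gcm128_encrypt(gcm, pt, ct, len);
		/* a verifying peer would call CRYPTO_gcm128_finish() instead */
		CRYPTO_gcm128_tag(gcm, tag, 16);
		CRYPTO_gcm128_release(gcm);
		return 0;
	}
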
diff --git a/src/lib/libcrypto/modes/modes.h b/src/lib/libcrypto/modes/modes.h
deleted file mode 100644
index f18215bb2b..0000000000
--- a/src/lib/libcrypto/modes/modes.h
+++ /dev/null
@@ -1,135 +0,0 @@
1/* ====================================================================
2 * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
3 *
4 * Rights for redistribution and usage in source and binary
5 * forms are granted according to the OpenSSL license.
6 */
7
8#include <stddef.h>
9
10typedef void (*block128_f)(const unsigned char in[16],
11 unsigned char out[16],
12 const void *key);
13
14typedef void (*cbc128_f)(const unsigned char *in, unsigned char *out,
15 size_t len, const void *key,
16 unsigned char ivec[16], int enc);
17
18typedef void (*ctr128_f)(const unsigned char *in, unsigned char *out,
19 size_t blocks, const void *key,
20 const unsigned char ivec[16]);
21
22typedef void (*ccm128_f)(const unsigned char *in, unsigned char *out,
23 size_t blocks, const void *key,
24 const unsigned char ivec[16],unsigned char cmac[16]);
25
26void CRYPTO_cbc128_encrypt(const unsigned char *in, unsigned char *out,
27 size_t len, const void *key,
28 unsigned char ivec[16], block128_f block);
29void CRYPTO_cbc128_decrypt(const unsigned char *in, unsigned char *out,
30 size_t len, const void *key,
31 unsigned char ivec[16], block128_f block);
32
33void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out,
34 size_t len, const void *key,
35 unsigned char ivec[16], unsigned char ecount_buf[16],
36 unsigned int *num, block128_f block);
37
38void CRYPTO_ctr128_encrypt_ctr32(const unsigned char *in, unsigned char *out,
39 size_t len, const void *key,
40 unsigned char ivec[16], unsigned char ecount_buf[16],
41 unsigned int *num, ctr128_f ctr);
42
43void CRYPTO_ofb128_encrypt(const unsigned char *in, unsigned char *out,
44 size_t len, const void *key,
45 unsigned char ivec[16], int *num,
46 block128_f block);
47
48void CRYPTO_cfb128_encrypt(const unsigned char *in, unsigned char *out,
49 size_t len, const void *key,
50 unsigned char ivec[16], int *num,
51 int enc, block128_f block);
52void CRYPTO_cfb128_8_encrypt(const unsigned char *in, unsigned char *out,
53 size_t length, const void *key,
54 unsigned char ivec[16], int *num,
55 int enc, block128_f block);
56void CRYPTO_cfb128_1_encrypt(const unsigned char *in, unsigned char *out,
57 size_t bits, const void *key,
58 unsigned char ivec[16], int *num,
59 int enc, block128_f block);
60
61size_t CRYPTO_cts128_encrypt_block(const unsigned char *in, unsigned char *out,
62 size_t len, const void *key,
63 unsigned char ivec[16], block128_f block);
64size_t CRYPTO_cts128_encrypt(const unsigned char *in, unsigned char *out,
65 size_t len, const void *key,
66 unsigned char ivec[16], cbc128_f cbc);
67size_t CRYPTO_cts128_decrypt_block(const unsigned char *in, unsigned char *out,
68 size_t len, const void *key,
69 unsigned char ivec[16], block128_f block);
70size_t CRYPTO_cts128_decrypt(const unsigned char *in, unsigned char *out,
71 size_t len, const void *key,
72 unsigned char ivec[16], cbc128_f cbc);
73
74size_t CRYPTO_nistcts128_encrypt_block(const unsigned char *in, unsigned char *out,
75 size_t len, const void *key,
76 unsigned char ivec[16], block128_f block);
77size_t CRYPTO_nistcts128_encrypt(const unsigned char *in, unsigned char *out,
78 size_t len, const void *key,
79 unsigned char ivec[16], cbc128_f cbc);
80size_t CRYPTO_nistcts128_decrypt_block(const unsigned char *in, unsigned char *out,
81 size_t len, const void *key,
82 unsigned char ivec[16], block128_f block);
83size_t CRYPTO_nistcts128_decrypt(const unsigned char *in, unsigned char *out,
84 size_t len, const void *key,
85 unsigned char ivec[16], cbc128_f cbc);
86
87typedef struct gcm128_context GCM128_CONTEXT;
88
89GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block);
90void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block);
91void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const unsigned char *iv,
92 size_t len);
93int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const unsigned char *aad,
94 size_t len);
95int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
96 const unsigned char *in, unsigned char *out,
97 size_t len);
98int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
99 const unsigned char *in, unsigned char *out,
100 size_t len);
101int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
102 const unsigned char *in, unsigned char *out,
103 size_t len, ctr128_f stream);
104int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
105 const unsigned char *in, unsigned char *out,
106 size_t len, ctr128_f stream);
107int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
108 size_t len);
109void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len);
110void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx);
111
112typedef struct ccm128_context CCM128_CONTEXT;
113
114void CRYPTO_ccm128_init(CCM128_CONTEXT *ctx,
115 unsigned int M, unsigned int L, void *key,block128_f block);
116int CRYPTO_ccm128_setiv(CCM128_CONTEXT *ctx,
117 const unsigned char *nonce, size_t nlen, size_t mlen);
118void CRYPTO_ccm128_aad(CCM128_CONTEXT *ctx,
119 const unsigned char *aad, size_t alen);
120int CRYPTO_ccm128_encrypt(CCM128_CONTEXT *ctx,
121 const unsigned char *inp, unsigned char *out, size_t len);
122int CRYPTO_ccm128_decrypt(CCM128_CONTEXT *ctx,
123 const unsigned char *inp, unsigned char *out, size_t len);
124int CRYPTO_ccm128_encrypt_ccm64(CCM128_CONTEXT *ctx,
125 const unsigned char *inp, unsigned char *out, size_t len,
126 ccm128_f stream);
127int CRYPTO_ccm128_decrypt_ccm64(CCM128_CONTEXT *ctx,
128 const unsigned char *inp, unsigned char *out, size_t len,
129 ccm128_f stream);
130size_t CRYPTO_ccm128_tag(CCM128_CONTEXT *ctx, unsigned char *tag, size_t len);
131
132typedef struct xts128_context XTS128_CONTEXT;
133
134int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx, const unsigned char iv[16],
135 const unsigned char *inp, unsigned char *out, size_t len, int enc);
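
The CTR interface declared above is the one the SELFTEST benchmark in gcm128.c calls directly. A hedged single-call sketch follows; the wrapper and variable names are illustrative and the header path is assumed to be <openssl/modes.h>.

	#include <string.h>
	#include <openssl/aes.h>
	#include <openssl/modes.h>

	/* Hedged sketch, not from the header: encrypt (or decrypt) len bytes
	 * in CTR mode. ivec holds the counter block; ecount buffers the
	 * current keystream block and num tracks how much of it is used. */
	static void ctr_encrypt_example(const AES_KEY *aes, unsigned char ivec[16],
	    const unsigned char *in, unsigned char *out, size_t len)
	{
		unsigned char ecount[16];
		unsigned int num = 0;

		memset(ecount, 0, sizeof(ecount));
		CRYPTO_ctr128_encrypt(in, out, len, aes, ivec, ecount, &num,
		    (block128_f)AES_encrypt);
	}
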
diff --git a/src/lib/libcrypto/modes/modes_lcl.h b/src/lib/libcrypto/modes/modes_lcl.h
deleted file mode 100644
index 9d83e12844..0000000000
--- a/src/lib/libcrypto/modes/modes_lcl.h
+++ /dev/null
@@ -1,128 +0,0 @@
1/* ====================================================================
2 * Copyright (c) 2010 The OpenSSL Project. All rights reserved.
3 *
4 * Redistribution and use is governed by OpenSSL license.
5 * ====================================================================
6 */
7
8#include <openssl/modes.h>
9
10
11#if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__)
12typedef __int64 i64;
13typedef unsigned __int64 u64;
14#define U64(C) C##UI64
15#elif defined(__arch64__)
16typedef long i64;
17typedef unsigned long u64;
18#define U64(C) C##UL
19#else
20typedef long long i64;
21typedef unsigned long long u64;
22#define U64(C) C##ULL
23#endif
24
25typedef unsigned int u32;
26typedef unsigned char u8;
27
28#define STRICT_ALIGNMENT 1
29#if defined(__i386) || defined(__i386__) || \
30 defined(__x86_64) || defined(__x86_64__) || \
31 defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \
32 defined(__s390__) || defined(__s390x__)
33# undef STRICT_ALIGNMENT
34#endif
35
36#if !defined(PEDANTIC) && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM)
37#if defined(__GNUC__) && __GNUC__>=2
38# if defined(__x86_64) || defined(__x86_64__)
39# define BSWAP8(x) ({ u64 ret=(x); \
40 asm ("bswapq %0" \
41 : "+r"(ret)); ret; })
42# define BSWAP4(x) ({ u32 ret=(x); \
43 asm ("bswapl %0" \
44 : "+r"(ret)); ret; })
45# elif (defined(__i386) || defined(__i386__)) && !defined(I386_ONLY)
46# define BSWAP8(x) ({ u32 lo=(u64)(x)>>32,hi=(x); \
47 asm ("bswapl %0; bswapl %1" \
48 : "+r"(hi),"+r"(lo)); \
49 (u64)hi<<32|lo; })
50# define BSWAP4(x) ({ u32 ret=(x); \
51 asm ("bswapl %0" \
52 : "+r"(ret)); ret; })
53# elif (defined(__arm__) || defined(__arm)) && !defined(STRICT_ALIGNMENT)
54# define BSWAP8(x) ({ u32 lo=(u64)(x)>>32,hi=(x); \
55 asm ("rev %0,%0; rev %1,%1" \
56 : "+r"(hi),"+r"(lo)); \
57 (u64)hi<<32|lo; })
58# define BSWAP4(x) ({ u32 ret; \
59 asm ("rev %0,%1" \
60 : "=r"(ret) : "r"((u32)(x))); \
61 ret; })
62# endif
63#elif defined(_MSC_VER)
64# if _MSC_VER>=1300
65# pragma intrinsic(_byteswap_uint64,_byteswap_ulong)
66# define BSWAP8(x) _byteswap_uint64((u64)(x))
67# define BSWAP4(x) _byteswap_ulong((u32)(x))
68# elif defined(_M_IX86)
69 __inline u32 _bswap4(u32 val) {
70 _asm mov eax,val
71 _asm bswap eax
72 }
73# define BSWAP4(x) _bswap4(x)
74# endif
75#endif
76#endif
77
78#if defined(BSWAP4) && !defined(STRICT_ALIGNMENT)
79#define GETU32(p) BSWAP4(*(const u32 *)(p))
80#define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v)
81#else
82#define GETU32(p) ((u32)(p)[0]<<24|(u32)(p)[1]<<16|(u32)(p)[2]<<8|(u32)(p)[3])
83#define PUTU32(p,v) ((p)[0]=(u8)((v)>>24),(p)[1]=(u8)((v)>>16),(p)[2]=(u8)((v)>>8),(p)[3]=(u8)(v))
84#endif
85
86/* GCM definitions */
87
88typedef struct { u64 hi,lo; } u128;
89
90#ifdef TABLE_BITS
91#undef TABLE_BITS
92#endif
93/*
94 * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
95 * never be set to 8 [or 1]. For further information see gcm128.c.
96 */
97#define TABLE_BITS 4
98
99struct gcm128_context {
100 /* The following six names match those used in the GCM specification */
101 union { u64 u[2]; u32 d[4]; u8 c[16]; size_t t[16/sizeof(size_t)]; }
102 Yi,EKi,EK0,len,Xi,H;
103 /* Relative position of Xi, H and pre-computed Htable is used
104 * in some assembler modules, i.e. don't change the order! */
105#if TABLE_BITS==8
106 u128 Htable[256];
107#else
108 u128 Htable[16];
109 void (*gmult)(u64 Xi[2],const u128 Htable[16]);
110 void (*ghash)(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
111#endif
112 unsigned int mres, ares;
113 block128_f block;
114 void *key;
115};
116
117struct xts128_context {
118 void *key1, *key2;
119 block128_f block1,block2;
120};
121
122struct ccm128_context {
123 union { u64 u[2]; u8 c[16]; } nonce, cmac;
124 u64 blocks;
125 block128_f block;
126 void *key;
127};
128
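
The GETU32/PUTU32 fallbacks above spell out big-endian loads and stores byte by byte, and gcm128.c and xts128.c detect host byte order at run time with a one-word union. A small self-contained sketch of both idioms; the helper name is illustrative.

	#include <stdio.h>

	typedef unsigned int u32;
	typedef unsigned char u8;

	/* Portable big-endian 32-bit load, as in the GETU32 fallback above. */
	static u32 getu32_be(const u8 *p)
	{
		return (u32)p[0] << 24 | (u32)p[1] << 16 | (u32)p[2] << 8 | (u32)p[3];
	}

	int main(void)
	{
		/* Run-time endianness probe, as used in gcm128.c and xts128.c. */
		const union { long one; char little; } is_endian = {1};
		const u8 ctr[4] = {0x00, 0x00, 0x00, 0x2a};

		printf("little-endian host: %d\n", (int)is_endian.little);
		printf("counter value: %u\n", getu32_be(ctr));	/* prints 42 */
		return 0;
	}
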
diff --git a/src/lib/libcrypto/modes/ofb128.c b/src/lib/libcrypto/modes/ofb128.c
deleted file mode 100644
index 01c01702c4..0000000000
--- a/src/lib/libcrypto/modes/ofb128.c
+++ /dev/null
@@ -1,121 +0,0 @@
1/* ====================================================================
2 * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in
13 * the documentation and/or other materials provided with the
14 * distribution.
15 *
16 * 3. All advertising materials mentioning features or use of this
17 * software must display the following acknowledgment:
18 * "This product includes software developed by the OpenSSL Project
19 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
20 *
21 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
22 * endorse or promote products derived from this software without
23 * prior written permission. For written permission, please contact
24 * openssl-core@openssl.org.
25 *
26 * 5. Products derived from this software may not be called "OpenSSL"
27 * nor may "OpenSSL" appear in their names without prior written
28 * permission of the OpenSSL Project.
29 *
30 * 6. Redistributions of any form whatsoever must retain the following
31 * acknowledgment:
32 * "This product includes software developed by the OpenSSL Project
33 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
34 *
35 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
36 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
39 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
41 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
42 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
43 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
44 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
45 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
46 * OF THE POSSIBILITY OF SUCH DAMAGE.
47 * ====================================================================
48 *
49 */
50
51#include <openssl/crypto.h>
52#include "modes_lcl.h"
53#include <string.h>
54
55#ifndef MODES_DEBUG
56# ifndef NDEBUG
57# define NDEBUG
58# endif
59#endif
60#include <assert.h>
61
62/* The input and output are encrypted as though 128-bit OFB mode is
63 * being used. The extra state information recording how much of the
64 * 128-bit block has been used is contained in *num.
65 */
66void CRYPTO_ofb128_encrypt(const unsigned char *in, unsigned char *out,
67 size_t len, const void *key,
68 unsigned char ivec[16], int *num,
69 block128_f block)
70{
71 unsigned int n;
72 size_t l=0;
73
74 assert(in && out && key && ivec && num);
75
76 n = *num;
77
78#if !defined(OPENSSL_SMALL_FOOTPRINT)
79 if (16%sizeof(size_t) == 0) do { /* always true actually */
80 while (n && len) {
81 *(out++) = *(in++) ^ ivec[n];
82 --len;
83 n = (n+1) % 16;
84 }
85#if defined(STRICT_ALIGNMENT)
86 if (((size_t)in|(size_t)out|(size_t)ivec)%sizeof(size_t) != 0)
87 break;
88#endif
89 while (len>=16) {
90 (*block)(ivec, ivec, key);
91 for (; n<16; n+=sizeof(size_t))
92 *(size_t*)(out+n) =
93 *(size_t*)(in+n) ^ *(size_t*)(ivec+n);
94 len -= 16;
95 out += 16;
96 in += 16;
97 n = 0;
98 }
99 if (len) {
100 (*block)(ivec, ivec, key);
101 while (len--) {
102 out[n] = in[n] ^ ivec[n];
103 ++n;
104 }
105 }
106 *num = n;
107 return;
108 } while(0);
109 /* the rest is commonly eliminated by x86* compilers */
110#endif
111 while (l<len) {
112 if (n==0) {
113 (*block)(ivec, ivec, key);
114 }
115 out[l] = in[l] ^ ivec[n];
116 ++l;
117 n = (n+1) % 16;
118 }
119
120 *num=n;
121}
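
Because OFB XORs the data with a keystream that does not depend on the data itself, the routine above serves for both encryption and decryption. A hedged usage sketch with AES as the block cipher; the wrapper name is illustrative and the header path is assumed.

	#include <openssl/aes.h>
	#include <openssl/modes.h>

	/* Hedged sketch, not from the file: one OFB pass; the same call
	 * decrypts data that was encrypted with the same ivec and key. */
	static void ofb_crypt_example(const AES_KEY *aes, unsigned char ivec[16],
	    const unsigned char *in, unsigned char *out, size_t len)
	{
		int num = 0;	/* keystream bytes of the current block already used */

		CRYPTO_ofb128_encrypt(in, out, len, aes, ivec, &num,
		    (block128_f)AES_encrypt);
	}
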
diff --git a/src/lib/libcrypto/modes/xts128.c b/src/lib/libcrypto/modes/xts128.c
deleted file mode 100644
index 9cf27a25e9..0000000000
--- a/src/lib/libcrypto/modes/xts128.c
+++ /dev/null
@@ -1,187 +0,0 @@
1/* ====================================================================
2 * Copyright (c) 2011 The OpenSSL Project. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in
13 * the documentation and/or other materials provided with the
14 * distribution.
15 *
16 * 3. All advertising materials mentioning features or use of this
17 * software must display the following acknowledgment:
18 * "This product includes software developed by the OpenSSL Project
19 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
20 *
21 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
22 * endorse or promote products derived from this software without
23 * prior written permission. For written permission, please contact
24 * openssl-core@openssl.org.
25 *
26 * 5. Products derived from this software may not be called "OpenSSL"
27 * nor may "OpenSSL" appear in their names without prior written
28 * permission of the OpenSSL Project.
29 *
30 * 6. Redistributions of any form whatsoever must retain the following
31 * acknowledgment:
32 * "This product includes software developed by the OpenSSL Project
33 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
34 *
35 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
36 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
39 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
41 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
42 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
43 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
44 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
45 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
46 * OF THE POSSIBILITY OF SUCH DAMAGE.
47 * ====================================================================
48 */
49
50#include <openssl/crypto.h>
51#include "modes_lcl.h"
52#include <string.h>
53
54#ifndef MODES_DEBUG
55# ifndef NDEBUG
56# define NDEBUG
57# endif
58#endif
59#include <assert.h>
60
61int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx, const unsigned char iv[16],
62 const unsigned char *inp, unsigned char *out,
63 size_t len, int enc)
64{
65 const union { long one; char little; } is_endian = {1};
66 union { u64 u[2]; u32 d[4]; u8 c[16]; } tweak, scratch;
67 unsigned int i;
68
69 if (len<16) return -1;
70
71 memcpy(tweak.c, iv, 16);
72
73 (*ctx->block2)(tweak.c,tweak.c,ctx->key2);
74
75 if (!enc && (len%16)) len-=16;
76
77 while (len>=16) {
78#if defined(STRICT_ALIGNMENT)
79 memcpy(scratch.c,inp,16);
80 scratch.u[0] ^= tweak.u[0];
81 scratch.u[1] ^= tweak.u[1];
82#else
83 scratch.u[0] = ((u64*)inp)[0]^tweak.u[0];
84 scratch.u[1] = ((u64*)inp)[1]^tweak.u[1];
85#endif
86 (*ctx->block1)(scratch.c,scratch.c,ctx->key1);
87#if defined(STRICT_ALIGNMENT)
88 scratch.u[0] ^= tweak.u[0];
89 scratch.u[1] ^= tweak.u[1];
90 memcpy(out,scratch.c,16);
91#else
92 ((u64*)out)[0] = scratch.u[0]^=tweak.u[0];
93 ((u64*)out)[1] = scratch.u[1]^=tweak.u[1];
94#endif
95 inp += 16;
96 out += 16;
97 len -= 16;
98
99 if (len==0) return 0;
100
101 if (is_endian.little) {
102 unsigned int carry,res;
103
104 res = 0x87&(((int)tweak.d[3])>>31);
105 carry = (unsigned int)(tweak.u[0]>>63);
106 tweak.u[0] = (tweak.u[0]<<1)^res;
107 tweak.u[1] = (tweak.u[1]<<1)|carry;
108 }
109 else {
110 size_t c;
111
112 for (c=0,i=0;i<16;++i) {
113 /*+ substitutes for |, because c is 1 bit */
114 c += ((size_t)tweak.c[i])<<1;
115 tweak.c[i] = (u8)c;
116 c = c>>8;
117 }
118 tweak.c[0] ^= (u8)(0x87&(0-c));
119 }
120 }
121 if (enc) {
122 for (i=0;i<len;++i) {
123 u8 c = inp[i];
124 out[i] = scratch.c[i];
125 scratch.c[i] = c;
126 }
127 scratch.u[0] ^= tweak.u[0];
128 scratch.u[1] ^= tweak.u[1];
129 (*ctx->block1)(scratch.c,scratch.c,ctx->key1);
130 scratch.u[0] ^= tweak.u[0];
131 scratch.u[1] ^= tweak.u[1];
132 memcpy(out-16,scratch.c,16);
133 }
134 else {
135 union { u64 u[2]; u8 c[16]; } tweak1;
136
137 if (is_endian.little) {
138 unsigned int carry,res;
139
140 res = 0x87&(((int)tweak.d[3])>>31);
141 carry = (unsigned int)(tweak.u[0]>>63);
142 tweak1.u[0] = (tweak.u[0]<<1)^res;
143 tweak1.u[1] = (tweak.u[1]<<1)|carry;
144 }
145 else {
146 size_t c;
147
148 for (c=0,i=0;i<16;++i) {
149 /*+ substitutes for |, because c is 1 bit */
150 c += ((size_t)tweak.c[i])<<1;
151 tweak1.c[i] = (u8)c;
152 c = c>>8;
153 }
154 tweak1.c[0] ^= (u8)(0x87&(0-c));
155 }
156#if defined(STRICT_ALIGNMENT)
157 memcpy(scratch.c,inp,16);
158 scratch.u[0] ^= tweak1.u[0];
159 scratch.u[1] ^= tweak1.u[1];
160#else
161 scratch.u[0] = ((u64*)inp)[0]^tweak1.u[0];
162 scratch.u[1] = ((u64*)inp)[1]^tweak1.u[1];
163#endif
164 (*ctx->block1)(scratch.c,scratch.c,ctx->key1);
165 scratch.u[0] ^= tweak1.u[0];
166 scratch.u[1] ^= tweak1.u[1];
167
168 for (i=0;i<len;++i) {
169 u8 c = inp[16+i];
170 out[16+i] = scratch.c[i];
171 scratch.c[i] = c;
172 }
173 scratch.u[0] ^= tweak.u[0];
174 scratch.u[1] ^= tweak.u[1];
175 (*ctx->block1)(scratch.c,scratch.c,ctx->key1);
176#if defined(STRICT_ALIGNMENT)
177 scratch.u[0] ^= tweak.u[0];
178 scratch.u[1] ^= tweak.u[1];
179 memcpy (out,scratch.c,16);
180#else
181 ((u64*)out)[0] = scratch.u[0]^tweak.u[0];
182 ((u64*)out)[1] = scratch.u[1]^tweak.u[1];
183#endif
184 }
185
186 return 0;
187}
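
Between blocks the loop above multiplies the tweak by x in GF(2^128), folding the bit shifted out of the top back in as the constant 0x87 (the low terms of the reduction polynomial x^128 + x^7 + x^2 + x + 1). A standalone sketch of the little-endian branch of that update, with illustrative names; the big-endian byte-by-byte branch is not shown.

	typedef unsigned long long u64;

	/* Hedged sketch of the tweak update above: multiply the 128-bit
	 * tweak by x, reducing modulo x^128 + x^7 + x^2 + x + 1.
	 * tweak[0] holds the low 64 bits, tweak[1] the high 64 bits. */
	static void xts_double_tweak_le(u64 tweak[2])
	{
		u64 mask = 0 - (tweak[1] >> 63);	/* all ones iff bit 127 was set */
		u64 carry = tweak[0] >> 63;		/* bit moving from low to high half */

		tweak[0] = (tweak[0] << 1) ^ (mask & 0x87);
		tweak[1] = (tweak[1] << 1) | carry;
	}
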