path: root/src/lib/libcrypto/modes
author    djm <> 2012-10-13 21:23:50 +0000
committer djm <> 2012-10-13 21:23:50 +0000
commit    228cae30b117c2493f69ad3c195341cd6ec8d430 (patch)
tree      29ff00b10d52c0978077c4fd83c33b065bade73e /src/lib/libcrypto/modes
parent    731838c66b52c0ae5888333005b74115a620aa96 (diff)
import OpenSSL-1.0.1c
Diffstat (limited to 'src/lib/libcrypto/modes')
-rw-r--r--  src/lib/libcrypto/modes/asm/ghash-alpha.pl     451
-rw-r--r--  src/lib/libcrypto/modes/asm/ghash-armv4.pl     429
-rwxr-xr-x  src/lib/libcrypto/modes/asm/ghash-ia64.pl      463
-rw-r--r--  src/lib/libcrypto/modes/asm/ghash-parisc.pl    730
-rw-r--r--  src/lib/libcrypto/modes/asm/ghash-s390x.pl     262
-rw-r--r--  src/lib/libcrypto/modes/asm/ghash-sparcv9.pl   330
-rw-r--r--  src/lib/libcrypto/modes/asm/ghash-x86.pl      1342
-rw-r--r--  src/lib/libcrypto/modes/asm/ghash-x86_64.pl    805
-rw-r--r--  src/lib/libcrypto/modes/cbc128.c                10
-rw-r--r--  src/lib/libcrypto/modes/ccm128.c               441
-rw-r--r--  src/lib/libcrypto/modes/cfb128.c                11
-rw-r--r--  src/lib/libcrypto/modes/ctr128.c                92
-rw-r--r--  src/lib/libcrypto/modes/cts128.c               226
-rw-r--r--  src/lib/libcrypto/modes/gcm128.c              1757
-rw-r--r--  src/lib/libcrypto/modes/modes.h                 76
-rw-r--r--  src/lib/libcrypto/modes/modes_lcl.h            131
-rw-r--r--  src/lib/libcrypto/modes/ofb128.c                11
-rw-r--r--  src/lib/libcrypto/modes/xts128.c               187
18 files changed, 7707 insertions, 47 deletions
diff --git a/src/lib/libcrypto/modes/asm/ghash-alpha.pl b/src/lib/libcrypto/modes/asm/ghash-alpha.pl
new file mode 100644
index 0000000000..6358b2750f
--- /dev/null
+++ b/src/lib/libcrypto/modes/asm/ghash-alpha.pl
@@ -0,0 +1,451 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# March 2010
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that it
14# uses 256 bytes per-key table [+128 bytes shared table]. Even though
15# loops are aggressively modulo-scheduled with respect to references to
16# Htbl and Z.hi updates for 8 cycles per byte, measured performance is
17# ~12 cycles per processed byte on 21264 CPU. It seems to be a dynamic
18# scheduling "glitch," because uprofile(1) indicates uniform sample
19# distribution, as if all instruction bundles execute in 1.5 cycles.
20# Meaning that it could have been even faster, yet 12 cycles is ~60%
21# better than gcc-generated code and ~80% better than code generated by
22# the vendor compiler.
23
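[Editorial note] The rem_4bit table these modules share (the .quad block at the end of this file, and the .short block in ghash-armv4.pl below) holds the reduction constants folded back into Z.hi for each possible 4-bit remainder; numerically, entry i works out to the XOR of 0x1C2<<(4+j) over the set bits j of i. A minimal Perl sketch that regenerates the sixteen 16-bit constants for cross-checking (the helper name is mine, not part of the module):

#!/usr/bin/env perl
# Regenerate the 16-bit rem_4bit constants: 0x0000, 0x1C20, 0x3840, ..., 0xB5E0.
sub rem_4bit_consts {
    my @tab;
    for my $i (0 .. 15) {
        my $v = 0;
        for my $j (0 .. 3) {
            $v ^= (0x1C2 << (4 + $j)) if ($i >> $j) & 1;   # GF(2): add = XOR
        }
        push @tab, $v;
    }
    return @tab;
}
printf "0x%04X ", $_ for rem_4bit_consts();   # matches the .quad/.short tables below
print "\n";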
24$cnt="v0"; # $0
25$t0="t0";
26$t1="t1";
27$t2="t2";
28$Thi0="t3"; # $4
29$Tlo0="t4";
30$Thi1="t5";
31$Tlo1="t6";
32$rem="t7"; # $8
33#################
34$Xi="a0"; # $16, input argument block
35$Htbl="a1";
36$inp="a2";
37$len="a3";
38$nlo="a4"; # $20
39$nhi="a5";
40$Zhi="t8";
41$Zlo="t9";
42$Xhi="t10"; # $24
43$Xlo="t11";
44$remp="t12";
45$rem_4bit="AT"; # $28
46
47{ my $N;
48 sub loop() {
49
50 $N++;
51$code.=<<___;
52.align 4
53 extbl $Xlo,7,$nlo
54 and $nlo,0xf0,$nhi
55 sll $nlo,4,$nlo
56 and $nlo,0xf0,$nlo
57
58 addq $nlo,$Htbl,$nlo
59 ldq $Zlo,8($nlo)
60 addq $nhi,$Htbl,$nhi
61 ldq $Zhi,0($nlo)
62
63 and $Zlo,0x0f,$remp
64 sll $Zhi,60,$t0
65 lda $cnt,6(zero)
66 extbl $Xlo,6,$nlo
67
68 ldq $Tlo1,8($nhi)
69 s8addq $remp,$rem_4bit,$remp
70 ldq $Thi1,0($nhi)
71 srl $Zlo,4,$Zlo
72
73 ldq $rem,0($remp)
74 srl $Zhi,4,$Zhi
75 xor $t0,$Zlo,$Zlo
76 and $nlo,0xf0,$nhi
77
78 xor $Tlo1,$Zlo,$Zlo
79 sll $nlo,4,$nlo
80 xor $Thi1,$Zhi,$Zhi
81 and $nlo,0xf0,$nlo
82
83 addq $nlo,$Htbl,$nlo
84 ldq $Tlo0,8($nlo)
85 addq $nhi,$Htbl,$nhi
86 ldq $Thi0,0($nlo)
87
88.Looplo$N:
89 and $Zlo,0x0f,$remp
90 sll $Zhi,60,$t0
91 subq $cnt,1,$cnt
92 srl $Zlo,4,$Zlo
93
94 ldq $Tlo1,8($nhi)
95 xor $rem,$Zhi,$Zhi
96 ldq $Thi1,0($nhi)
97 s8addq $remp,$rem_4bit,$remp
98
99 ldq $rem,0($remp)
100 srl $Zhi,4,$Zhi
101 xor $t0,$Zlo,$Zlo
102 extbl $Xlo,$cnt,$nlo
103
104 and $nlo,0xf0,$nhi
105 xor $Thi0,$Zhi,$Zhi
106 xor $Tlo0,$Zlo,$Zlo
107 sll $nlo,4,$nlo
108
109
110 and $Zlo,0x0f,$remp
111 sll $Zhi,60,$t0
112 and $nlo,0xf0,$nlo
113 srl $Zlo,4,$Zlo
114
115 s8addq $remp,$rem_4bit,$remp
116 xor $rem,$Zhi,$Zhi
117 addq $nlo,$Htbl,$nlo
118 addq $nhi,$Htbl,$nhi
119
120 ldq $rem,0($remp)
121 srl $Zhi,4,$Zhi
122 ldq $Tlo0,8($nlo)
123 xor $t0,$Zlo,$Zlo
124
125 xor $Tlo1,$Zlo,$Zlo
126 xor $Thi1,$Zhi,$Zhi
127 ldq $Thi0,0($nlo)
128 bne $cnt,.Looplo$N
129
130
131 and $Zlo,0x0f,$remp
132 sll $Zhi,60,$t0
133 lda $cnt,7(zero)
134 srl $Zlo,4,$Zlo
135
136 ldq $Tlo1,8($nhi)
137 xor $rem,$Zhi,$Zhi
138 ldq $Thi1,0($nhi)
139 s8addq $remp,$rem_4bit,$remp
140
141 ldq $rem,0($remp)
142 srl $Zhi,4,$Zhi
143 xor $t0,$Zlo,$Zlo
144 extbl $Xhi,$cnt,$nlo
145
146 and $nlo,0xf0,$nhi
147 xor $Thi0,$Zhi,$Zhi
148 xor $Tlo0,$Zlo,$Zlo
149 sll $nlo,4,$nlo
150
151 and $Zlo,0x0f,$remp
152 sll $Zhi,60,$t0
153 and $nlo,0xf0,$nlo
154 srl $Zlo,4,$Zlo
155
156 s8addq $remp,$rem_4bit,$remp
157 xor $rem,$Zhi,$Zhi
158 addq $nlo,$Htbl,$nlo
159 addq $nhi,$Htbl,$nhi
160
161 ldq $rem,0($remp)
162 srl $Zhi,4,$Zhi
163 ldq $Tlo0,8($nlo)
164 xor $t0,$Zlo,$Zlo
165
166 xor $Tlo1,$Zlo,$Zlo
167 xor $Thi1,$Zhi,$Zhi
168 ldq $Thi0,0($nlo)
169 unop
170
171
172.Loophi$N:
173 and $Zlo,0x0f,$remp
174 sll $Zhi,60,$t0
175 subq $cnt,1,$cnt
176 srl $Zlo,4,$Zlo
177
178 ldq $Tlo1,8($nhi)
179 xor $rem,$Zhi,$Zhi
180 ldq $Thi1,0($nhi)
181 s8addq $remp,$rem_4bit,$remp
182
183 ldq $rem,0($remp)
184 srl $Zhi,4,$Zhi
185 xor $t0,$Zlo,$Zlo
186 extbl $Xhi,$cnt,$nlo
187
188 and $nlo,0xf0,$nhi
189 xor $Thi0,$Zhi,$Zhi
190 xor $Tlo0,$Zlo,$Zlo
191 sll $nlo,4,$nlo
192
193
194 and $Zlo,0x0f,$remp
195 sll $Zhi,60,$t0
196 and $nlo,0xf0,$nlo
197 srl $Zlo,4,$Zlo
198
199 s8addq $remp,$rem_4bit,$remp
200 xor $rem,$Zhi,$Zhi
201 addq $nlo,$Htbl,$nlo
202 addq $nhi,$Htbl,$nhi
203
204 ldq $rem,0($remp)
205 srl $Zhi,4,$Zhi
206 ldq $Tlo0,8($nlo)
207 xor $t0,$Zlo,$Zlo
208
209 xor $Tlo1,$Zlo,$Zlo
210 xor $Thi1,$Zhi,$Zhi
211 ldq $Thi0,0($nlo)
212 bne $cnt,.Loophi$N
213
214
215 and $Zlo,0x0f,$remp
216 sll $Zhi,60,$t0
217 srl $Zlo,4,$Zlo
218
219 ldq $Tlo1,8($nhi)
220 xor $rem,$Zhi,$Zhi
221 ldq $Thi1,0($nhi)
222 s8addq $remp,$rem_4bit,$remp
223
224 ldq $rem,0($remp)
225 srl $Zhi,4,$Zhi
226 xor $t0,$Zlo,$Zlo
227
228 xor $Tlo0,$Zlo,$Zlo
229 xor $Thi0,$Zhi,$Zhi
230
231 and $Zlo,0x0f,$remp
232 sll $Zhi,60,$t0
233 srl $Zlo,4,$Zlo
234
235 s8addq $remp,$rem_4bit,$remp
236 xor $rem,$Zhi,$Zhi
237
238 ldq $rem,0($remp)
239 srl $Zhi,4,$Zhi
240 xor $Tlo1,$Zlo,$Zlo
241 xor $Thi1,$Zhi,$Zhi
242 xor $t0,$Zlo,$Zlo
243 xor $rem,$Zhi,$Zhi
244___
245}}
246
247$code=<<___;
248#ifdef __linux__
249#include <asm/regdef.h>
250#else
251#include <asm.h>
252#include <regdef.h>
253#endif
254
255.text
256
257.set noat
258.set noreorder
259.globl gcm_gmult_4bit
260.align 4
261.ent gcm_gmult_4bit
262gcm_gmult_4bit:
263 .frame sp,0,ra
264 .prologue 0
265
266 ldq $Xlo,8($Xi)
267 ldq $Xhi,0($Xi)
268
269 br $rem_4bit,.Lpic1
270.Lpic1: lda $rem_4bit,rem_4bit-.Lpic1($rem_4bit)
271___
272
273 &loop();
274
275$code.=<<___;
276 srl $Zlo,24,$t0 # byte swap
277 srl $Zlo,8,$t1
278
279 sll $Zlo,8,$t2
280 sll $Zlo,24,$Zlo
281 zapnot $t0,0x11,$t0
282 zapnot $t1,0x22,$t1
283
284 zapnot $Zlo,0x88,$Zlo
285 or $t0,$t1,$t0
286 zapnot $t2,0x44,$t2
287
288 or $Zlo,$t0,$Zlo
289 srl $Zhi,24,$t0
290 srl $Zhi,8,$t1
291
292 or $Zlo,$t2,$Zlo
293 sll $Zhi,8,$t2
294 sll $Zhi,24,$Zhi
295
296 srl $Zlo,32,$Xlo
297 sll $Zlo,32,$Zlo
298
299 zapnot $t0,0x11,$t0
300 zapnot $t1,0x22,$t1
301 or $Zlo,$Xlo,$Xlo
302
303 zapnot $Zhi,0x88,$Zhi
304 or $t0,$t1,$t0
305 zapnot $t2,0x44,$t2
306
307 or $Zhi,$t0,$Zhi
308 or $Zhi,$t2,$Zhi
309
310 srl $Zhi,32,$Xhi
311 sll $Zhi,32,$Zhi
312
313 or $Zhi,$Xhi,$Xhi
314 stq $Xlo,8($Xi)
315 stq $Xhi,0($Xi)
316
317 ret (ra)
318.end gcm_gmult_4bit
319___
320
321$inhi="s0";
322$inlo="s1";
323
324$code.=<<___;
325.globl gcm_ghash_4bit
326.align 4
327.ent gcm_ghash_4bit
328gcm_ghash_4bit:
329 lda sp,-32(sp)
330 stq ra,0(sp)
331 stq s0,8(sp)
332 stq s1,16(sp)
333 .mask 0x04000600,-32
334 .frame sp,32,ra
335 .prologue 0
336
337 ldq_u $inhi,0($inp)
338 ldq_u $Thi0,7($inp)
339 ldq_u $inlo,8($inp)
340 ldq_u $Tlo0,15($inp)
341 ldq $Xhi,0($Xi)
342 ldq $Xlo,8($Xi)
343
344 br $rem_4bit,.Lpic2
345.Lpic2: lda $rem_4bit,rem_4bit-.Lpic2($rem_4bit)
346
347.Louter:
348 extql $inhi,$inp,$inhi
349 extqh $Thi0,$inp,$Thi0
350 or $inhi,$Thi0,$inhi
351 lda $inp,16($inp)
352
353 extql $inlo,$inp,$inlo
354 extqh $Tlo0,$inp,$Tlo0
355 or $inlo,$Tlo0,$inlo
356 subq $len,16,$len
357
358 xor $Xlo,$inlo,$Xlo
359 xor $Xhi,$inhi,$Xhi
360___
361
362 &loop();
363
364$code.=<<___;
365 srl $Zlo,24,$t0 # byte swap
366 srl $Zlo,8,$t1
367
368 sll $Zlo,8,$t2
369 sll $Zlo,24,$Zlo
370 zapnot $t0,0x11,$t0
371 zapnot $t1,0x22,$t1
372
373 zapnot $Zlo,0x88,$Zlo
374 or $t0,$t1,$t0
375 zapnot $t2,0x44,$t2
376
377 or $Zlo,$t0,$Zlo
378 srl $Zhi,24,$t0
379 srl $Zhi,8,$t1
380
381 or $Zlo,$t2,$Zlo
382 sll $Zhi,8,$t2
383 sll $Zhi,24,$Zhi
384
385 srl $Zlo,32,$Xlo
386 sll $Zlo,32,$Zlo
387 beq $len,.Ldone
388
389 zapnot $t0,0x11,$t0
390 zapnot $t1,0x22,$t1
391 or $Zlo,$Xlo,$Xlo
392 ldq_u $inhi,0($inp)
393
394 zapnot $Zhi,0x88,$Zhi
395 or $t0,$t1,$t0
396 zapnot $t2,0x44,$t2
397 ldq_u $Thi0,7($inp)
398
399 or $Zhi,$t0,$Zhi
400 or $Zhi,$t2,$Zhi
401 ldq_u $inlo,8($inp)
402 ldq_u $Tlo0,15($inp)
403
404 srl $Zhi,32,$Xhi
405 sll $Zhi,32,$Zhi
406
407 or $Zhi,$Xhi,$Xhi
408 br zero,.Louter
409
410.Ldone:
411 zapnot $t0,0x11,$t0
412 zapnot $t1,0x22,$t1
413 or $Zlo,$Xlo,$Xlo
414
415 zapnot $Zhi,0x88,$Zhi
416 or $t0,$t1,$t0
417 zapnot $t2,0x44,$t2
418
419 or $Zhi,$t0,$Zhi
420 or $Zhi,$t2,$Zhi
421
422 srl $Zhi,32,$Xhi
423 sll $Zhi,32,$Zhi
424
425 or $Zhi,$Xhi,$Xhi
426
427 stq $Xlo,8($Xi)
428 stq $Xhi,0($Xi)
429
430 .set noreorder
431 /*ldq ra,0(sp)*/
432 ldq s0,8(sp)
433 ldq s1,16(sp)
434 lda sp,32(sp)
435 ret (ra)
436.end gcm_ghash_4bit
437
438.align 4
439rem_4bit:
440 .quad 0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
441 .quad 0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
442 .quad 0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
443 .quad 0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
444.ascii "GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
445.align 4
446
447___
448$output=shift and open STDOUT,">$output";
449print $code;
450close STDOUT;
451
diff --git a/src/lib/libcrypto/modes/asm/ghash-armv4.pl b/src/lib/libcrypto/modes/asm/ghash-armv4.pl
new file mode 100644
index 0000000000..d91586ee29
--- /dev/null
+++ b/src/lib/libcrypto/modes/asm/ghash-armv4.pl
@@ -0,0 +1,429 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# April 2010
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that it
14# uses 256 bytes per-key table [+32 bytes shared table]. There is no
15# experimental performance data available yet. The only approximation
16# that can be made at this point is based on code size. Inner loop is
17# 32 instructions long and on single-issue core should execute in <40
18# cycles. Having verified that gcc 3.4 didn't unroll corresponding
19# loop, this assembler loop body was found to be ~3x smaller than
20# compiler-generated one...
21#
22# July 2010
23#
24# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on
25# Cortex A8 core and ~25 cycles per processed byte (which was observed
26# to be ~3 times faster than gcc-generated code:-)
27#
28# February 2011
29#
30# Profiler-assisted and platform-specific optimization resulted in 7%
31# improvement on Cortex A8 core and ~23.5 cycles per byte.
32#
33# March 2011
34#
35# Add NEON implementation featuring polynomial multiplication, i.e. no
36# lookup tables involved. On Cortex A8 it was measured to process one
37# byte in 15 cycles or 55% faster than integer-only code.
38
39# ====================================================================
40# Note about "528B" variant. In the ARM case it makes less sense to
41# implement it, for the following reasons:
42#
43# - performance improvement won't be anywhere near 50%, because 128-
44# bit shift operation is neatly fused with 128-bit xor here, and
45# "528B" variant would eliminate only 4-5 instructions out of 32
46# in the inner loop (meaning that estimated improvement is ~15%);
47# - ARM-based systems are often embedded ones and extra memory
48# consumption might be unappreciated (for so little improvement);
49#
50# Byte order [in]dependence. =========================================
51#
52# Caller is expected to maintain specific *dword* order in Htable,
53# namely with *least* significant dword of 128-bit value at *lower*
54# address. This differs completely from C code and has everything to
55# do with ldm instruction and order in which dwords are "consumed" by
56# algorithm. *Byte* order within these dwords in turn is whatever
57# *native* byte order on current platform. See gcm128.c for working
58# example...
59
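[Editorial note] Since the table-driven paths and the NEON path below all compute the same GF(2^128) product Xi·H, a bit-level reference multiply is convenient when sanity-checking their output against test vectors. The following is a self-contained sketch of the shift-and-reduce multiplication from the GCM specification (reduction constant 0xE1 followed by 120 zero bits); gf128_mul and its hex-string interface are illustrative inventions, not part of this module:

#!/usr/bin/env perl
# Bit-level GF(2^128) multiply in GHASH bit order, for cross-checking gcm_gmult output.
use strict;
use warnings;
use Math::BigInt;

sub gf128_mul {
    my ($x_hex, $y_hex) = @_;                 # 32 hex digits each, big-endian
    my $R = Math::BigInt->new("0xE1000000000000000000000000000000");
    my $x = Math::BigInt->new("0x$x_hex");
    my $y = Math::BigInt->new("0x$y_hex");
    my $z = Math::BigInt->bzero();
    my $v = $y->copy();
    for my $i (0 .. 127) {                    # scan bits of X, leftmost bit first
        $z->bxor($v) if $x->copy()->brsft(127 - $i)->band(1)->is_one();
        my $carry = $v->copy()->band(1)->is_one();
        $v->brsft(1);                         # shift right; fold 0xE1||0^120 back in
        $v->bxor($R) if $carry;
    }
    my $hex = substr($z->as_hex(), 2);        # strip "0x", pad to 128 bits
    return ("0" x (32 - length $hex)) . $hex;
}

# e.g. multiplying by the field's identity (0x8000...0 in GHASH bit order)
# returns the other operand unchanged.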
60while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
61open STDOUT,">$output";
62
63$Xi="r0"; # argument block
64$Htbl="r1";
65$inp="r2";
66$len="r3";
67
68$Zll="r4"; # variables
69$Zlh="r5";
70$Zhl="r6";
71$Zhh="r7";
72$Tll="r8";
73$Tlh="r9";
74$Thl="r10";
75$Thh="r11";
76$nlo="r12";
77################# r13 is stack pointer
78$nhi="r14";
79################# r15 is program counter
80
81$rem_4bit=$inp; # used in gcm_gmult_4bit
82$cnt=$len;
83
84sub Zsmash() {
85 my $i=12;
86 my @args=@_;
87 for ($Zll,$Zlh,$Zhl,$Zhh) {
88 $code.=<<___;
89#if __ARM_ARCH__>=7 && defined(__ARMEL__)
90 rev $_,$_
91 str $_,[$Xi,#$i]
92#elif defined(__ARMEB__)
93 str $_,[$Xi,#$i]
94#else
95 mov $Tlh,$_,lsr#8
96 strb $_,[$Xi,#$i+3]
97 mov $Thl,$_,lsr#16
98 strb $Tlh,[$Xi,#$i+2]
99 mov $Thh,$_,lsr#24
100 strb $Thl,[$Xi,#$i+1]
101 strb $Thh,[$Xi,#$i]
102#endif
103___
104 $code.="\t".shift(@args)."\n";
105 $i-=4;
106 }
107}
108
109$code=<<___;
110#include "arm_arch.h"
111
112.text
113.code 32
114
115.type rem_4bit,%object
116.align 5
117rem_4bit:
118.short 0x0000,0x1C20,0x3840,0x2460
119.short 0x7080,0x6CA0,0x48C0,0x54E0
120.short 0xE100,0xFD20,0xD940,0xC560
121.short 0x9180,0x8DA0,0xA9C0,0xB5E0
122.size rem_4bit,.-rem_4bit
123
124.type rem_4bit_get,%function
125rem_4bit_get:
126 sub $rem_4bit,pc,#8
127 sub $rem_4bit,$rem_4bit,#32 @ &rem_4bit
128 b .Lrem_4bit_got
129 nop
130.size rem_4bit_get,.-rem_4bit_get
131
132.global gcm_ghash_4bit
133.type gcm_ghash_4bit,%function
134gcm_ghash_4bit:
135 sub r12,pc,#8
136 add $len,$inp,$len @ $len to point at the end
137 stmdb sp!,{r3-r11,lr} @ save $len/end too
138 sub r12,r12,#48 @ &rem_4bit
139
140 ldmia r12,{r4-r11} @ copy rem_4bit ...
141 stmdb sp!,{r4-r11} @ ... to stack
142
143 ldrb $nlo,[$inp,#15]
144 ldrb $nhi,[$Xi,#15]
145.Louter:
146 eor $nlo,$nlo,$nhi
147 and $nhi,$nlo,#0xf0
148 and $nlo,$nlo,#0x0f
149 mov $cnt,#14
150
151 add $Zhh,$Htbl,$nlo,lsl#4
152 ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo]
153 add $Thh,$Htbl,$nhi
154 ldrb $nlo,[$inp,#14]
155
156 and $nhi,$Zll,#0xf @ rem
157 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
158 add $nhi,$nhi,$nhi
159 eor $Zll,$Tll,$Zll,lsr#4
160 ldrh $Tll,[sp,$nhi] @ rem_4bit[rem]
161 eor $Zll,$Zll,$Zlh,lsl#28
162 ldrb $nhi,[$Xi,#14]
163 eor $Zlh,$Tlh,$Zlh,lsr#4
164 eor $Zlh,$Zlh,$Zhl,lsl#28
165 eor $Zhl,$Thl,$Zhl,lsr#4
166 eor $Zhl,$Zhl,$Zhh,lsl#28
167 eor $Zhh,$Thh,$Zhh,lsr#4
168 eor $nlo,$nlo,$nhi
169 and $nhi,$nlo,#0xf0
170 and $nlo,$nlo,#0x0f
171 eor $Zhh,$Zhh,$Tll,lsl#16
172
173.Linner:
174 add $Thh,$Htbl,$nlo,lsl#4
175 and $nlo,$Zll,#0xf @ rem
176 subs $cnt,$cnt,#1
177 add $nlo,$nlo,$nlo
178 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
179 eor $Zll,$Tll,$Zll,lsr#4
180 eor $Zll,$Zll,$Zlh,lsl#28
181 eor $Zlh,$Tlh,$Zlh,lsr#4
182 eor $Zlh,$Zlh,$Zhl,lsl#28
183 ldrh $Tll,[sp,$nlo] @ rem_4bit[rem]
184 eor $Zhl,$Thl,$Zhl,lsr#4
185 ldrplb $nlo,[$inp,$cnt]
186 eor $Zhl,$Zhl,$Zhh,lsl#28
187 eor $Zhh,$Thh,$Zhh,lsr#4
188
189 add $Thh,$Htbl,$nhi
190 and $nhi,$Zll,#0xf @ rem
191 eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
192 add $nhi,$nhi,$nhi
193 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
194 eor $Zll,$Tll,$Zll,lsr#4
195 ldrplb $Tll,[$Xi,$cnt]
196 eor $Zll,$Zll,$Zlh,lsl#28
197 eor $Zlh,$Tlh,$Zlh,lsr#4
198 ldrh $Tlh,[sp,$nhi]
199 eor $Zlh,$Zlh,$Zhl,lsl#28
200 eor $Zhl,$Thl,$Zhl,lsr#4
201 eor $Zhl,$Zhl,$Zhh,lsl#28
202 eorpl $nlo,$nlo,$Tll
203 eor $Zhh,$Thh,$Zhh,lsr#4
204 andpl $nhi,$nlo,#0xf0
205 andpl $nlo,$nlo,#0x0f
206 eor $Zhh,$Zhh,$Tlh,lsl#16 @ ^= rem_4bit[rem]
207 bpl .Linner
208
209 ldr $len,[sp,#32] @ re-load $len/end
210 add $inp,$inp,#16
211 mov $nhi,$Zll
212___
213 &Zsmash("cmp\t$inp,$len","ldrneb\t$nlo,[$inp,#15]");
214$code.=<<___;
215 bne .Louter
216
217 add sp,sp,#36
218#if __ARM_ARCH__>=5
219 ldmia sp!,{r4-r11,pc}
220#else
221 ldmia sp!,{r4-r11,lr}
222 tst lr,#1
223 moveq pc,lr @ be binary compatible with V4, yet
224 bx lr @ interoperable with Thumb ISA:-)
225#endif
226.size gcm_ghash_4bit,.-gcm_ghash_4bit
227
228.global gcm_gmult_4bit
229.type gcm_gmult_4bit,%function
230gcm_gmult_4bit:
231 stmdb sp!,{r4-r11,lr}
232 ldrb $nlo,[$Xi,#15]
233 b rem_4bit_get
234.Lrem_4bit_got:
235 and $nhi,$nlo,#0xf0
236 and $nlo,$nlo,#0x0f
237 mov $cnt,#14
238
239 add $Zhh,$Htbl,$nlo,lsl#4
240 ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo]
241 ldrb $nlo,[$Xi,#14]
242
243 add $Thh,$Htbl,$nhi
244 and $nhi,$Zll,#0xf @ rem
245 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
246 add $nhi,$nhi,$nhi
247 eor $Zll,$Tll,$Zll,lsr#4
248 ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
249 eor $Zll,$Zll,$Zlh,lsl#28
250 eor $Zlh,$Tlh,$Zlh,lsr#4
251 eor $Zlh,$Zlh,$Zhl,lsl#28
252 eor $Zhl,$Thl,$Zhl,lsr#4
253 eor $Zhl,$Zhl,$Zhh,lsl#28
254 eor $Zhh,$Thh,$Zhh,lsr#4
255 and $nhi,$nlo,#0xf0
256 eor $Zhh,$Zhh,$Tll,lsl#16
257 and $nlo,$nlo,#0x0f
258
259.Loop:
260 add $Thh,$Htbl,$nlo,lsl#4
261 and $nlo,$Zll,#0xf @ rem
262 subs $cnt,$cnt,#1
263 add $nlo,$nlo,$nlo
264 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
265 eor $Zll,$Tll,$Zll,lsr#4
266 eor $Zll,$Zll,$Zlh,lsl#28
267 eor $Zlh,$Tlh,$Zlh,lsr#4
268 eor $Zlh,$Zlh,$Zhl,lsl#28
269 ldrh $Tll,[$rem_4bit,$nlo] @ rem_4bit[rem]
270 eor $Zhl,$Thl,$Zhl,lsr#4
271 ldrplb $nlo,[$Xi,$cnt]
272 eor $Zhl,$Zhl,$Zhh,lsl#28
273 eor $Zhh,$Thh,$Zhh,lsr#4
274
275 add $Thh,$Htbl,$nhi
276 and $nhi,$Zll,#0xf @ rem
277 eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
278 add $nhi,$nhi,$nhi
279 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
280 eor $Zll,$Tll,$Zll,lsr#4
281 eor $Zll,$Zll,$Zlh,lsl#28
282 eor $Zlh,$Tlh,$Zlh,lsr#4
283 ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
284 eor $Zlh,$Zlh,$Zhl,lsl#28
285 eor $Zhl,$Thl,$Zhl,lsr#4
286 eor $Zhl,$Zhl,$Zhh,lsl#28
287 eor $Zhh,$Thh,$Zhh,lsr#4
288 andpl $nhi,$nlo,#0xf0
289 andpl $nlo,$nlo,#0x0f
290 eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
291 bpl .Loop
292___
293 &Zsmash();
294$code.=<<___;
295#if __ARM_ARCH__>=5
296 ldmia sp!,{r4-r11,pc}
297#else
298 ldmia sp!,{r4-r11,lr}
299 tst lr,#1
300 moveq pc,lr @ be binary compatible with V4, yet
301 bx lr @ interoperable with Thumb ISA:-)
302#endif
303.size gcm_gmult_4bit,.-gcm_gmult_4bit
304___
305{
306my $cnt=$Htbl; # $Htbl is used once in the very beginning
307
308my ($Hhi, $Hlo, $Zo, $T, $xi, $mod) = map("d$_",(0..7));
309my ($Qhi, $Qlo, $Z, $R, $zero, $Qpost, $IN) = map("q$_",(8..15));
310
311# Z:Zo keeps 128-bit result shifted by 1 to the right, with bottom bit
312# in Zo. Or should I say "top bit", because GHASH is specified in
313# reverse bit order? Otherwise straightforward 128-bit H by one input
314# byte multiplication and modulo-reduction, times 16.
315
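[Editorial note] The vmull.p8 instructions used below perform eight independent carry-less (polynomial) 8x8->16-bit multiplications per issue, which is what lets this path avoid lookup tables altogether. A one-lane Perl model of that primitive, purely illustrative (clmul8 is a made-up name):

# Carry-less multiply of two bytes: what one vmull.p8 lane computes.
sub clmul8 {
    my ($a, $b) = @_;
    my $r = 0;
    for my $i (0 .. 7) {
        $r ^= ($a << $i) if ($b >> $i) & 1;   # XOR instead of add: GF(2) arithmetic
    }
    return $r;                                # product is at most 15 bits wide
}
printf "0x%04x\n", clmul8(0xE1, 0x02);        # prints 0x01c2, i.e. 0xE1 times x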
316sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
317sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
318sub Q() { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; }
319
320$code.=<<___;
321#if __ARM_ARCH__>=7
322.fpu neon
323
324.global gcm_gmult_neon
325.type gcm_gmult_neon,%function
326.align 4
327gcm_gmult_neon:
328 sub $Htbl,#16 @ point at H in GCM128_CTX
329 vld1.64 `&Dhi("$IN")`,[$Xi,:64]!@ load Xi
330 vmov.i32 $mod,#0xe1 @ our irreducible polynomial
331 vld1.64 `&Dlo("$IN")`,[$Xi,:64]!
332 vshr.u64 $mod,#32
333 vldmia $Htbl,{$Hhi-$Hlo} @ load H
334 veor $zero,$zero
335#ifdef __ARMEL__
336 vrev64.8 $IN,$IN
337#endif
338 veor $Qpost,$Qpost
339 veor $R,$R
340 mov $cnt,#16
341 veor $Z,$Z
342 mov $len,#16
343 veor $Zo,$Zo
344 vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
345 b .Linner_neon
346.size gcm_gmult_neon,.-gcm_gmult_neon
347
348.global gcm_ghash_neon
349.type gcm_ghash_neon,%function
350.align 4
351gcm_ghash_neon:
352 vld1.64 `&Dhi("$Z")`,[$Xi,:64]! @ load Xi
353 vmov.i32 $mod,#0xe1 @ our irreducible polynomial
354 vld1.64 `&Dlo("$Z")`,[$Xi,:64]!
355 vshr.u64 $mod,#32
356 vldmia $Xi,{$Hhi-$Hlo} @ load H
357 veor $zero,$zero
358 nop
359#ifdef __ARMEL__
360 vrev64.8 $Z,$Z
361#endif
362.Louter_neon:
363 vld1.64 `&Dhi($IN)`,[$inp]! @ load inp
364 veor $Qpost,$Qpost
365 vld1.64 `&Dlo($IN)`,[$inp]!
366 veor $R,$R
367 mov $cnt,#16
368#ifdef __ARMEL__
369 vrev64.8 $IN,$IN
370#endif
371 veor $Zo,$Zo
372 veor $IN,$Z @ inp^=Xi
373 veor $Z,$Z
374 vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
375.Linner_neon:
376 subs $cnt,$cnt,#1
377 vmull.p8 $Qlo,$Hlo,$xi @ H.lo·Xi[i]
378 vmull.p8 $Qhi,$Hhi,$xi @ H.hi·Xi[i]
379 vext.8 $IN,$zero,#1 @ IN>>=8
380
381 veor $Z,$Qpost @ modulo-scheduled part
382 vshl.i64 `&Dlo("$R")`,#48
383 vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
384 veor $T,`&Dlo("$Qlo")`,`&Dlo("$Z")`
385
386 veor `&Dhi("$Z")`,`&Dlo("$R")`
387 vuzp.8 $Qlo,$Qhi
388 vsli.8 $Zo,$T,#1 @ compose the "carry" byte
389 vext.8 $Z,$zero,#1 @ Z>>=8
390
391 vmull.p8 $R,$Zo,$mod @ "carry"·0xe1
392 vshr.u8 $Zo,$T,#7 @ save Z's bottom bit
393 vext.8 $Qpost,$Qlo,$zero,#1 @ Qlo>>=8
394 veor $Z,$Qhi
395 bne .Linner_neon
396
397 veor $Z,$Qpost @ modulo-scheduled artefact
398 vshl.i64 `&Dlo("$R")`,#48
399 veor `&Dhi("$Z")`,`&Dlo("$R")`
400
401 @ finalization, normalize Z:Zo
402 vand $Zo,$mod @ suffices to mask the bit
403 vshr.u64 `&Dhi(&Q("$Zo"))`,`&Dlo("$Z")`,#63
404 vshl.i64 $Z,#1
405 subs $len,#16
406 vorr $Z,`&Q("$Zo")` @ Z=Z:Zo<<1
407 bne .Louter_neon
408
409#ifdef __ARMEL__
410 vrev64.8 $Z,$Z
411#endif
412 sub $Xi,#16
413 vst1.64 `&Dhi("$Z")`,[$Xi,:64]! @ write out Xi
414 vst1.64 `&Dlo("$Z")`,[$Xi,:64]
415
416 bx lr
417.size gcm_ghash_neon,.-gcm_ghash_neon
418#endif
419___
420}
421$code.=<<___;
422.asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
423.align 2
424___
425
426$code =~ s/\`([^\`]*)\`/eval $1/gem;
427$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
428print $code;
429close STDOUT; # enforce flush
diff --git a/src/lib/libcrypto/modes/asm/ghash-ia64.pl b/src/lib/libcrypto/modes/asm/ghash-ia64.pl
new file mode 100755
index 0000000000..0354c95444
--- /dev/null
+++ b/src/lib/libcrypto/modes/asm/ghash-ia64.pl
@@ -0,0 +1,463 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# March 2010
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that it
14# uses 256 bytes per-key table [+128 bytes shared table]. Streamed
15# GHASH performance was measured to be 6.67 cycles per processed byte
16# on Itanium 2, which is >90% better than Microsoft compiler generated
17# code. To anchor to something else sha1-ia64.pl module processes one
18# byte in 5.7 cycles. On Itanium GHASH should run at ~8.5 cycles per
19# byte.
20
21# September 2010
22#
23# It was originally thought that it makes less sense to implement the
24# "528B" variant on Itanium 2 for the following reason. Because the number of
25# functional units is naturally limited, it appeared impossible to
26# implement "528B" loop in 4 cycles, only in 5. This would mean that
27# theoretically performance improvement couldn't be more than 20%.
28# But occasionally you prove yourself wrong:-) I figured out a way to
29# fold couple of instructions and having freed yet another instruction
30# slot by unrolling the loop... Resulting performance is 4.45 cycles
31# per processed byte and 50% better than "256B" version. On original
32# Itanium performance should remain the same as the "256B" version,
33# i.e. ~8.5 cycles.
34
35$output=shift and (open STDOUT,">$output" or die "can't open $output: $!");
36
37if ($^O eq "hpux") {
38 $ADDP="addp4";
39 for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
40} else { $ADDP="add"; }
41for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/);
42 $big_endian=0 if (/\-DL_ENDIAN/); }
43if (!defined($big_endian))
44 { $big_endian=(unpack('L',pack('N',1))==1); }
45
46sub loop() {
47my $label=shift;
48my ($p16,$p17)=(shift)?("p63","p63"):("p16","p17"); # mask references to inp
49
50# Loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium, i.e.
51# in scalable manner;-) Naturally assuming data in L1 cache...
52# Special note about 'dep' instruction, which is used to construct
53# &rem_4bit[Zlo&0xf]. It works, because rem_4bit is aligned at 128
54# bytes boundary and lower 7 bits of its address are guaranteed to
55# be zero.
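[Editorial note] In other words, with the table aligned to 128 bytes the dep simply drops the 4-bit remainder into bits 3..6 of the base address, yielding &rem_4bit[Zlo&0xf] without an add (each entry is 8 bytes). A tiny Perl illustration of that address arithmetic (base address and variable names are made-up examples):

# dep rem=Zlo,rem_4bitp,3,4: deposit the low 4 bits of Zlo at bit position 3 of the base.
my $rem_4bitp = 0x40000380;          # example base, 128-byte aligned (low 7 bits zero)
my $Zlo       = 0x1234567B;
my $index     = $Zlo & 0xf;          # low nibble selects the table entry
my $entry     = ($rem_4bitp & ~0x78) | ($index << 3);
printf "&rem_4bit[%d] = 0x%08x\n", $index, $entry;   # base + 8*index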
56$code.=<<___;
57$label:
58{ .mfi; (p18) ld8 Hlo=[Hi[1]],-8
59 (p19) dep rem=Zlo,rem_4bitp,3,4 }
60{ .mfi; (p19) xor Zhi=Zhi,Hhi
61 ($p17) xor xi[1]=xi[1],in[1] };;
62{ .mfi; (p18) ld8 Hhi=[Hi[1]]
63 (p19) shrp Zlo=Zhi,Zlo,4 }
64{ .mfi; (p19) ld8 rem=[rem]
65 (p18) and Hi[1]=mask0xf0,xi[2] };;
66{ .mmi; ($p16) ld1 in[0]=[inp],-1
67 (p18) xor Zlo=Zlo,Hlo
68 (p19) shr.u Zhi=Zhi,4 }
69{ .mib; (p19) xor Hhi=Hhi,rem
70 (p18) add Hi[1]=Htbl,Hi[1] };;
71
72{ .mfi; (p18) ld8 Hlo=[Hi[1]],-8
73 (p18) dep rem=Zlo,rem_4bitp,3,4 }
74{ .mfi; (p17) shladd Hi[0]=xi[1],4,r0
75 (p18) xor Zhi=Zhi,Hhi };;
76{ .mfi; (p18) ld8 Hhi=[Hi[1]]
77 (p18) shrp Zlo=Zhi,Zlo,4 }
78{ .mfi; (p18) ld8 rem=[rem]
79 (p17) and Hi[0]=mask0xf0,Hi[0] };;
80{ .mmi; (p16) ld1 xi[0]=[Xi],-1
81 (p18) xor Zlo=Zlo,Hlo
82 (p18) shr.u Zhi=Zhi,4 }
83{ .mib; (p18) xor Hhi=Hhi,rem
84 (p17) add Hi[0]=Htbl,Hi[0]
85 br.ctop.sptk $label };;
86___
87}
88
89$code=<<___;
90.explicit
91.text
92
93prevfs=r2; prevlc=r3; prevpr=r8;
94mask0xf0=r21;
95rem=r22; rem_4bitp=r23;
96Xi=r24; Htbl=r25;
97inp=r26; end=r27;
98Hhi=r28; Hlo=r29;
99Zhi=r30; Zlo=r31;
100
101.align 128
102.skip 16 // aligns loop body
103.global gcm_gmult_4bit#
104.proc gcm_gmult_4bit#
105gcm_gmult_4bit:
106 .prologue
107{ .mmi; .save ar.pfs,prevfs
108 alloc prevfs=ar.pfs,2,6,0,8
109 $ADDP Xi=15,in0 // &Xi[15]
110 mov rem_4bitp=ip }
111{ .mii; $ADDP Htbl=8,in1 // &Htbl[0].lo
112 .save ar.lc,prevlc
113 mov prevlc=ar.lc
114 .save pr,prevpr
115 mov prevpr=pr };;
116
117 .body
118 .rotr in[3],xi[3],Hi[2]
119
120{ .mib; ld1 xi[2]=[Xi],-1 // Xi[15]
121 mov mask0xf0=0xf0
122 brp.loop.imp .Loop1,.Lend1-16};;
123{ .mmi; ld1 xi[1]=[Xi],-1 // Xi[14]
124 };;
125{ .mii; shladd Hi[1]=xi[2],4,r0
126 mov pr.rot=0x7<<16
127 mov ar.lc=13 };;
128{ .mii; and Hi[1]=mask0xf0,Hi[1]
129 mov ar.ec=3
130 xor Zlo=Zlo,Zlo };;
131{ .mii; add Hi[1]=Htbl,Hi[1] // &Htbl[nlo].lo
132 add rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp
133 xor Zhi=Zhi,Zhi };;
134___
135 &loop (".Loop1",1);
136$code.=<<___;
137.Lend1:
138{ .mib; xor Zhi=Zhi,Hhi };; // modulo-scheduling artefact
139{ .mib; mux1 Zlo=Zlo,\@rev };;
140{ .mib; mux1 Zhi=Zhi,\@rev };;
141{ .mmi; add Hlo=9,Xi;; // ;; is here to prevent
142 add Hhi=1,Xi };; // pipeline flush on Itanium
143{ .mib; st8 [Hlo]=Zlo
144 mov pr=prevpr,0x1ffff };;
145{ .mib; st8 [Hhi]=Zhi
146 mov ar.lc=prevlc
147 br.ret.sptk.many b0 };;
148.endp gcm_gmult_4bit#
149___
150
151######################################################################
152# "528B" (well, "512B" actually) streamed GHASH
153#
154$Xip="in0";
155$Htbl="in1";
156$inp="in2";
157$len="in3";
158$rem_8bit="loc0";
159$mask0xff="loc1";
160($sum,$rum) = $big_endian ? ("nop.m","nop.m") : ("sum","rum");
161
162sub load_htable() {
163 for (my $i=0;$i<8;$i++) {
164 $code.=<<___;
165{ .mmi; ld8 r`16+2*$i+1`=[r8],16 // Htable[$i].hi
166 ld8 r`16+2*$i`=[r9],16 } // Htable[$i].lo
167{ .mmi; ldf8 f`32+2*$i+1`=[r10],16 // Htable[`8+$i`].hi
168 ldf8 f`32+2*$i`=[r11],16 // Htable[`8+$i`].lo
169___
170 $code.=shift if (($i+$#_)==7);
171 $code.="\t};;\n"
172 }
173}
174
175$code.=<<___;
176prevsp=r3;
177
178.align 32
179.skip 16 // aligns loop body
180.global gcm_ghash_4bit#
181.proc gcm_ghash_4bit#
182gcm_ghash_4bit:
183 .prologue
184{ .mmi; .save ar.pfs,prevfs
185 alloc prevfs=ar.pfs,4,2,0,0
186 .vframe prevsp
187 mov prevsp=sp
188 mov $rem_8bit=ip };;
189 .body
190{ .mfi; $ADDP r8=0+0,$Htbl
191 $ADDP r9=0+8,$Htbl }
192{ .mfi; $ADDP r10=128+0,$Htbl
193 $ADDP r11=128+8,$Htbl };;
194___
195 &load_htable(
196 " $ADDP $Xip=15,$Xip", # &Xi[15]
197 " $ADDP $len=$len,$inp", # &inp[len]
198 " $ADDP $inp=15,$inp", # &inp[15]
199 " mov $mask0xff=0xff",
200 " add sp=-512,sp",
201 " andcm sp=sp,$mask0xff", # align stack frame
202 " add r14=0,sp",
203 " add r15=8,sp");
204$code.=<<___;
205{ .mmi; $sum 1<<1 // go big-endian
206 add r8=256+0,sp
207 add r9=256+8,sp }
208{ .mmi; add r10=256+128+0,sp
209 add r11=256+128+8,sp
210 add $len=-17,$len };;
211___
212for($i=0;$i<8;$i++) { # generate first half of Hshr4[]
213my ($rlo,$rhi)=("r".eval(16+2*$i),"r".eval(16+2*$i+1));
214$code.=<<___;
215{ .mmi; st8 [r8]=$rlo,16 // Htable[$i].lo
216 st8 [r9]=$rhi,16 // Htable[$i].hi
217 shrp $rlo=$rhi,$rlo,4 }//;;
218{ .mmi; stf8 [r10]=f`32+2*$i`,16 // Htable[`8+$i`].lo
219 stf8 [r11]=f`32+2*$i+1`,16 // Htable[`8+$i`].hi
220 shr.u $rhi=$rhi,4 };;
221{ .mmi; st8 [r14]=$rlo,16 // Htable[$i].lo>>4
222 st8 [r15]=$rhi,16 }//;; // Htable[$i].hi>>4
223___
224}
225$code.=<<___;
226{ .mmi; ld8 r16=[r8],16 // Htable[8].lo
227 ld8 r17=[r9],16 };; // Htable[8].hi
228{ .mmi; ld8 r18=[r8],16 // Htable[9].lo
229 ld8 r19=[r9],16 } // Htable[9].hi
230{ .mmi; rum 1<<5 // clear um.mfh
231 shrp r16=r17,r16,4 };;
232___
233for($i=0;$i<6;$i++) { # generate second half of Hshr4[]
234$code.=<<___;
235{ .mmi; ld8 r`20+2*$i`=[r8],16 // Htable[`10+$i`].lo
236 ld8 r`20+2*$i+1`=[r9],16 // Htable[`10+$i`].hi
237 shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };;
238{ .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4
239 st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4
240 shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 }
241___
242}
243$code.=<<___;
244{ .mmi; shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };;
245{ .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4
246 st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4
247 shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 }
248{ .mmi; add $Htbl=256,sp // &Htable[0]
249 add $rem_8bit=rem_8bit#-gcm_ghash_4bit#,$rem_8bit
250 shr.u r`18+2*$i+1`=r`18+2*$i+1`,4 };;
251{ .mmi; st8 [r14]=r`18+2*$i` // Htable[`8+$i`].lo>>4
252 st8 [r15]=r`18+2*$i+1` } // Htable[`8+$i`].hi>>4
253___
254
255$in="r15";
256@xi=("r16","r17");
257@rem=("r18","r19");
258($Alo,$Ahi,$Blo,$Bhi,$Zlo,$Zhi)=("r20","r21","r22","r23","r24","r25");
259($Atbl,$Btbl)=("r26","r27");
260
261$code.=<<___; # (p16)
262{ .mmi; ld1 $in=[$inp],-1 //(p16) *inp--
263 ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
264 cmp.eq p0,p6=r0,r0 };; // clear p6
265___
266push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
267
268$code.=<<___; # (p16),(p17)
269{ .mmi; ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
270 xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
271{ .mii; ld1 $in=[$inp],-1 //(p16) *inp--
272 dep $Atbl=$xi[1],$Htbl,4,4 //(p17) &Htable[nlo].lo
273 and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
274.align 32
275.LOOP:
276{ .mmi;
277(p6) st8 [$Xip]=$Zhi,13
278 xor $Zlo=$Zlo,$Zlo
279 add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi].lo
280___
281push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
282
283$code.=<<___; # (p16),(p17),(p18)
284{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
285 ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
286 xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
287{ .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
288 dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo
289{ .mfi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
290 xor $Zlo=$Zlo,$Alo };; //(p18) Z.lo^=Htable[nlo].lo
291{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
292 ld1 $in=[$inp],-1 } //(p16) *inp--
293{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
294 mov $Zhi=$Ahi //(p18) Z.hi^=Htable[nlo].hi
295 and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
296{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
297 ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
298 shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
299{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
300 add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
301___
302push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
303
304for ($i=1;$i<14;$i++) {
305# Above and below fragments are derived from this one by removing
306# unsuitable (p??) instructions.
307$code.=<<___; # (p16),(p17),(p18),(p19)
308{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
309 ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
310 shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
311{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
312 xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
313 xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
314{ .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
315 ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
316 dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo
317{ .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
318 xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo
319 xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
320{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
321 ld1 $in=[$inp],-1 //(p16) *inp--
322 shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
323{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
324 xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi
325 and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
326{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
327 ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
328 shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
329{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
330 xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
331 add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
332___
333push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
334}
335
336$code.=<<___; # (p17),(p18),(p19)
337{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
338 ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
339 shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
340{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
341 xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
342 xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
343{ .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
344 ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
345 dep $Atbl=$xi[1],$Htbl,4,4 };; //(p17) &Htable[nlo].lo
346{ .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
347 xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo
348 xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
349{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
350 shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
351{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
352 xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi
353 and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
354{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
355 shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
356{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
357 xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
358 add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
359___
360push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
361
362$code.=<<___; # (p18),(p19)
363{ .mfi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
364 shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
365{ .mfi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
366 xor $Zlo=$Zlo,$Blo };; //(p19) Z.lo^=Hshr4[nhi].lo
367{ .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
368 xor $Zlo=$Zlo,$Alo } //(p18) Z.lo^=Htable[nlo].lo
369{ .mfi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
370 xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
371{ .mfi; ld8 $Blo=[$Btbl],8 //(p18) Htable[nhi].lo,&Htable[nhi].hi
372 shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
373{ .mfi; shladd $rem[0]=$Zlo,4,r0 //(p18) Z.lo<<4
374 xor $Zhi=$Zhi,$Ahi };; //(p18) Z.hi^=Htable[nlo].hi
375{ .mfi; ld8 $Bhi=[$Btbl] //(p18) Htable[nhi].hi
376 shrp $Zlo=$Zhi,$Zlo,4 } //(p18) Z.lo=(Z.hi<<60)|(Z.lo>>4)
377{ .mfi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
378 xor $Zhi=$Zhi,$rem[1] };; //(p19) Z.hi^=rem_8bit[rem]<<48
379___
380push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
381
382$code.=<<___; # (p19)
383{ .mmi; cmp.ltu p6,p0=$inp,$len
384 add $inp=32,$inp
385 shr.u $Zhi=$Zhi,4 } //(p19) Z.hi>>=4
386{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
387 xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
388 add $Xip=9,$Xip };; // &Xi.lo
389{ .mmi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
390(p6) ld1 $in=[$inp],-1 //[p16] *inp--
391(p6) extr.u $xi[1]=$Zlo,8,8 } //[p17] Xi[14]
392{ .mmi; xor $Zhi=$Zhi,$Bhi //(p19) Z.hi^=Hshr4[nhi].hi
393(p6) and $xi[0]=$Zlo,$mask0xff };; //[p16] Xi[15]
394{ .mmi; st8 [$Xip]=$Zlo,-8
395(p6) xor $xi[0]=$xi[0],$in //[p17] xi=$xi[i]^inp[i]
396 shl $rem[1]=$rem[1],48 };; //(p19) rem_8bit[rem]<<48
397{ .mmi;
398(p6) ld1 $in=[$inp],-1 //[p16] *inp--
399 xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
400(p6) dep $Atbl=$xi[0],$Htbl,4,4 } //[p17] &Htable[nlo].lo
401{ .mib;
402(p6) and $xi[0]=-16,$xi[0] //[p17] nhi=xi&0xf0
403(p6) br.cond.dptk.many .LOOP };;
404
405{ .mib; st8 [$Xip]=$Zhi };;
406{ .mib; $rum 1<<1 // return to little-endian
407 .restore sp
408 mov sp=prevsp
409 br.ret.sptk.many b0 };;
410.endp gcm_ghash_4bit#
411___
412$code.=<<___;
413.align 128
414.type rem_4bit#,\@object
415rem_4bit:
416 data8 0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
417 data8 0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
418 data8 0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
419 data8 0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
420.size rem_4bit#,128
421.type rem_8bit#,\@object
422rem_8bit:
423 data1 0x00,0x00, 0x01,0xC2, 0x03,0x84, 0x02,0x46, 0x07,0x08, 0x06,0xCA, 0x04,0x8C, 0x05,0x4E
424 data1 0x0E,0x10, 0x0F,0xD2, 0x0D,0x94, 0x0C,0x56, 0x09,0x18, 0x08,0xDA, 0x0A,0x9C, 0x0B,0x5E
425 data1 0x1C,0x20, 0x1D,0xE2, 0x1F,0xA4, 0x1E,0x66, 0x1B,0x28, 0x1A,0xEA, 0x18,0xAC, 0x19,0x6E
426 data1 0x12,0x30, 0x13,0xF2, 0x11,0xB4, 0x10,0x76, 0x15,0x38, 0x14,0xFA, 0x16,0xBC, 0x17,0x7E
427 data1 0x38,0x40, 0x39,0x82, 0x3B,0xC4, 0x3A,0x06, 0x3F,0x48, 0x3E,0x8A, 0x3C,0xCC, 0x3D,0x0E
428 data1 0x36,0x50, 0x37,0x92, 0x35,0xD4, 0x34,0x16, 0x31,0x58, 0x30,0x9A, 0x32,0xDC, 0x33,0x1E
429 data1 0x24,0x60, 0x25,0xA2, 0x27,0xE4, 0x26,0x26, 0x23,0x68, 0x22,0xAA, 0x20,0xEC, 0x21,0x2E
430 data1 0x2A,0x70, 0x2B,0xB2, 0x29,0xF4, 0x28,0x36, 0x2D,0x78, 0x2C,0xBA, 0x2E,0xFC, 0x2F,0x3E
431 data1 0x70,0x80, 0x71,0x42, 0x73,0x04, 0x72,0xC6, 0x77,0x88, 0x76,0x4A, 0x74,0x0C, 0x75,0xCE
432 data1 0x7E,0x90, 0x7F,0x52, 0x7D,0x14, 0x7C,0xD6, 0x79,0x98, 0x78,0x5A, 0x7A,0x1C, 0x7B,0xDE
433 data1 0x6C,0xA0, 0x6D,0x62, 0x6F,0x24, 0x6E,0xE6, 0x6B,0xA8, 0x6A,0x6A, 0x68,0x2C, 0x69,0xEE
434 data1 0x62,0xB0, 0x63,0x72, 0x61,0x34, 0x60,0xF6, 0x65,0xB8, 0x64,0x7A, 0x66,0x3C, 0x67,0xFE
435 data1 0x48,0xC0, 0x49,0x02, 0x4B,0x44, 0x4A,0x86, 0x4F,0xC8, 0x4E,0x0A, 0x4C,0x4C, 0x4D,0x8E
436 data1 0x46,0xD0, 0x47,0x12, 0x45,0x54, 0x44,0x96, 0x41,0xD8, 0x40,0x1A, 0x42,0x5C, 0x43,0x9E
437 data1 0x54,0xE0, 0x55,0x22, 0x57,0x64, 0x56,0xA6, 0x53,0xE8, 0x52,0x2A, 0x50,0x6C, 0x51,0xAE
438 data1 0x5A,0xF0, 0x5B,0x32, 0x59,0x74, 0x58,0xB6, 0x5D,0xF8, 0x5C,0x3A, 0x5E,0x7C, 0x5F,0xBE
439 data1 0xE1,0x00, 0xE0,0xC2, 0xE2,0x84, 0xE3,0x46, 0xE6,0x08, 0xE7,0xCA, 0xE5,0x8C, 0xE4,0x4E
440 data1 0xEF,0x10, 0xEE,0xD2, 0xEC,0x94, 0xED,0x56, 0xE8,0x18, 0xE9,0xDA, 0xEB,0x9C, 0xEA,0x5E
441 data1 0xFD,0x20, 0xFC,0xE2, 0xFE,0xA4, 0xFF,0x66, 0xFA,0x28, 0xFB,0xEA, 0xF9,0xAC, 0xF8,0x6E
442 data1 0xF3,0x30, 0xF2,0xF2, 0xF0,0xB4, 0xF1,0x76, 0xF4,0x38, 0xF5,0xFA, 0xF7,0xBC, 0xF6,0x7E
443 data1 0xD9,0x40, 0xD8,0x82, 0xDA,0xC4, 0xDB,0x06, 0xDE,0x48, 0xDF,0x8A, 0xDD,0xCC, 0xDC,0x0E
444 data1 0xD7,0x50, 0xD6,0x92, 0xD4,0xD4, 0xD5,0x16, 0xD0,0x58, 0xD1,0x9A, 0xD3,0xDC, 0xD2,0x1E
445 data1 0xC5,0x60, 0xC4,0xA2, 0xC6,0xE4, 0xC7,0x26, 0xC2,0x68, 0xC3,0xAA, 0xC1,0xEC, 0xC0,0x2E
446 data1 0xCB,0x70, 0xCA,0xB2, 0xC8,0xF4, 0xC9,0x36, 0xCC,0x78, 0xCD,0xBA, 0xCF,0xFC, 0xCE,0x3E
447 data1 0x91,0x80, 0x90,0x42, 0x92,0x04, 0x93,0xC6, 0x96,0x88, 0x97,0x4A, 0x95,0x0C, 0x94,0xCE
448 data1 0x9F,0x90, 0x9E,0x52, 0x9C,0x14, 0x9D,0xD6, 0x98,0x98, 0x99,0x5A, 0x9B,0x1C, 0x9A,0xDE
449 data1 0x8D,0xA0, 0x8C,0x62, 0x8E,0x24, 0x8F,0xE6, 0x8A,0xA8, 0x8B,0x6A, 0x89,0x2C, 0x88,0xEE
450 data1 0x83,0xB0, 0x82,0x72, 0x80,0x34, 0x81,0xF6, 0x84,0xB8, 0x85,0x7A, 0x87,0x3C, 0x86,0xFE
451 data1 0xA9,0xC0, 0xA8,0x02, 0xAA,0x44, 0xAB,0x86, 0xAE,0xC8, 0xAF,0x0A, 0xAD,0x4C, 0xAC,0x8E
452 data1 0xA7,0xD0, 0xA6,0x12, 0xA4,0x54, 0xA5,0x96, 0xA0,0xD8, 0xA1,0x1A, 0xA3,0x5C, 0xA2,0x9E
453 data1 0xB5,0xE0, 0xB4,0x22, 0xB6,0x64, 0xB7,0xA6, 0xB2,0xE8, 0xB3,0x2A, 0xB1,0x6C, 0xB0,0xAE
454 data1 0xBB,0xF0, 0xBA,0x32, 0xB8,0x74, 0xB9,0xB6, 0xBC,0xF8, 0xBD,0x3A, 0xBF,0x7C, 0xBE,0xBE
455.size rem_8bit#,512
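[Editorial note] rem_8bit plays the same role as rem_4bit but for a whole byte shifted out of Z; numerically, entry i works out to the XOR of 0x1C2<<j over the set bits j of i, emitted above as big-endian byte pairs. A small Perl cross-check (the helper name is mine, not part of the module):

# Regenerate the 16-bit rem_8bit entries: 0x0000, 0x01C2, 0x0384, 0x0246, ...
sub rem_8bit_consts {
    my @tab;
    for my $i (0 .. 255) {
        my $v = 0;
        for my $j (0 .. 7) {
            $v ^= (0x1C2 << $j) if ($i >> $j) & 1;
        }
        push @tab, $v;
    }
    return @tab;
}
my @t = rem_8bit_consts();
printf "rem_8bit[0x%02x] = 0x%04X\n", $_, $t[$_] for (0x01, 0x03, 0x80, 0xff);
# 0x01C2, 0x0246, 0xE100, 0xBEBE -- matching the data1 table above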
456stringz "GHASH for IA64, CRYPTOGAMS by <appro\@openssl.org>"
457___
458
459$code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm if ($big_endian);
460$code =~ s/\`([^\`]*)\`/eval $1/gem;
461
462print $code;
463close STDOUT;
diff --git a/src/lib/libcrypto/modes/asm/ghash-parisc.pl b/src/lib/libcrypto/modes/asm/ghash-parisc.pl
new file mode 100644
index 0000000000..8c7454ee93
--- /dev/null
+++ b/src/lib/libcrypto/modes/asm/ghash-parisc.pl
@@ -0,0 +1,730 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# April 2010
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that it
14# uses 256 bytes per-key table [+128 bytes shared table]. On PA-7100LC
15# it processes one byte in 19.6 cycles, which is more than twice as
16# fast as code generated by gcc 3.2. PA-RISC 2.0 loop is scheduled for
17# 8 cycles, but measured performance on PA-8600 system is ~9 cycles per
18# processed byte. This is ~2.2x faster than 64-bit code generated by
19# vendor compiler (which used to be very hard to beat:-).
20#
21# Special thanks to polarhome.com for providing HP-UX account.
22
23$flavour = shift;
24$output = shift;
25open STDOUT,">$output";
26
27if ($flavour =~ /64/) {
28 $LEVEL ="2.0W";
29 $SIZE_T =8;
30 $FRAME_MARKER =80;
31 $SAVED_RP =16;
32 $PUSH ="std";
33 $PUSHMA ="std,ma";
34 $POP ="ldd";
35 $POPMB ="ldd,mb";
36 $NREGS =6;
37} else {
38 $LEVEL ="1.0"; #"\n\t.ALLOW\t2.0";
39 $SIZE_T =4;
40 $FRAME_MARKER =48;
41 $SAVED_RP =20;
42 $PUSH ="stw";
43 $PUSHMA ="stwm";
44 $POP ="ldw";
45 $POPMB ="ldwm";
46 $NREGS =11;
47}
48
49$FRAME=10*$SIZE_T+$FRAME_MARKER;# NREGS saved regs + frame marker
50 # [+ argument transfer]
51
52################# volatile registers
53$Xi="%r26"; # argument block
54$Htbl="%r25";
55$inp="%r24";
56$len="%r23";
57$Hhh=$Htbl; # variables
58$Hll="%r22";
59$Zhh="%r21";
60$Zll="%r20";
61$cnt="%r19";
62$rem_4bit="%r28";
63$rem="%r29";
64$mask0xf0="%r31";
65
66################# preserved registers
67$Thh="%r1";
68$Tll="%r2";
69$nlo="%r3";
70$nhi="%r4";
71$byte="%r5";
72if ($SIZE_T==4) {
73 $Zhl="%r6";
74 $Zlh="%r7";
75 $Hhl="%r8";
76 $Hlh="%r9";
77 $Thl="%r10";
78 $Tlh="%r11";
79}
80$rem2="%r6"; # used in PA-RISC 2.0 code
81
82$code.=<<___;
83 .LEVEL $LEVEL
84 .SPACE \$TEXT\$
85 .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
86
87 .EXPORT gcm_gmult_4bit,ENTRY,ARGW0=GR,ARGW1=GR
88 .ALIGN 64
89gcm_gmult_4bit
90 .PROC
91 .CALLINFO FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS
92 .ENTRY
93 $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
94 $PUSHMA %r3,$FRAME(%sp)
95 $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
96 $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
97 $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
98___
99$code.=<<___ if ($SIZE_T==4);
100 $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
101 $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
102 $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
103 $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
104 $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
105___
106$code.=<<___;
107 blr %r0,$rem_4bit
108 ldi 3,$rem
109L\$pic_gmult
110 andcm $rem_4bit,$rem,$rem_4bit
111 addl $inp,$len,$len
112 ldo L\$rem_4bit-L\$pic_gmult($rem_4bit),$rem_4bit
113 ldi 0xf0,$mask0xf0
114___
115$code.=<<___ if ($SIZE_T==4);
116 ldi 31,$rem
117 mtctl $rem,%cr11
118 extrd,u,*= $rem,%sar,1,$rem ; executes on PA-RISC 1.0
119 b L\$parisc1_gmult
120 nop
121___
122
123$code.=<<___;
124 ldb 15($Xi),$nlo
125 ldo 8($Htbl),$Hll
126
127 and $mask0xf0,$nlo,$nhi
128 depd,z $nlo,59,4,$nlo
129
130 ldd $nlo($Hll),$Zll
131 ldd $nlo($Hhh),$Zhh
132
133 depd,z $Zll,60,4,$rem
134 shrpd $Zhh,$Zll,4,$Zll
135 extrd,u $Zhh,59,60,$Zhh
136 ldb 14($Xi),$nlo
137
138 ldd $nhi($Hll),$Tll
139 ldd $nhi($Hhh),$Thh
140 and $mask0xf0,$nlo,$nhi
141 depd,z $nlo,59,4,$nlo
142
143 xor $Tll,$Zll,$Zll
144 xor $Thh,$Zhh,$Zhh
145 ldd $rem($rem_4bit),$rem
146 b L\$oop_gmult_pa2
147 ldi 13,$cnt
148
149 .ALIGN 8
150L\$oop_gmult_pa2
151 xor $rem,$Zhh,$Zhh ; moved here to work around gas bug
152 depd,z $Zll,60,4,$rem
153
154 shrpd $Zhh,$Zll,4,$Zll
155 extrd,u $Zhh,59,60,$Zhh
156 ldd $nlo($Hll),$Tll
157 ldd $nlo($Hhh),$Thh
158
159 xor $Tll,$Zll,$Zll
160 xor $Thh,$Zhh,$Zhh
161 ldd $rem($rem_4bit),$rem
162
163 xor $rem,$Zhh,$Zhh
164 depd,z $Zll,60,4,$rem
165 ldbx $cnt($Xi),$nlo
166
167 shrpd $Zhh,$Zll,4,$Zll
168 extrd,u $Zhh,59,60,$Zhh
169 ldd $nhi($Hll),$Tll
170 ldd $nhi($Hhh),$Thh
171
172 and $mask0xf0,$nlo,$nhi
173 depd,z $nlo,59,4,$nlo
174 ldd $rem($rem_4bit),$rem
175
176 xor $Tll,$Zll,$Zll
177 addib,uv -1,$cnt,L\$oop_gmult_pa2
178 xor $Thh,$Zhh,$Zhh
179
180 xor $rem,$Zhh,$Zhh
181 depd,z $Zll,60,4,$rem
182
183 shrpd $Zhh,$Zll,4,$Zll
184 extrd,u $Zhh,59,60,$Zhh
185 ldd $nlo($Hll),$Tll
186 ldd $nlo($Hhh),$Thh
187
188 xor $Tll,$Zll,$Zll
189 xor $Thh,$Zhh,$Zhh
190 ldd $rem($rem_4bit),$rem
191
192 xor $rem,$Zhh,$Zhh
193 depd,z $Zll,60,4,$rem
194
195 shrpd $Zhh,$Zll,4,$Zll
196 extrd,u $Zhh,59,60,$Zhh
197 ldd $nhi($Hll),$Tll
198 ldd $nhi($Hhh),$Thh
199
200 xor $Tll,$Zll,$Zll
201 xor $Thh,$Zhh,$Zhh
202 ldd $rem($rem_4bit),$rem
203
204 xor $rem,$Zhh,$Zhh
205 std $Zll,8($Xi)
206 std $Zhh,0($Xi)
207___
208
209$code.=<<___ if ($SIZE_T==4);
210 b L\$done_gmult
211 nop
212
213L\$parisc1_gmult
214 ldb 15($Xi),$nlo
215 ldo 12($Htbl),$Hll
216 ldo 8($Htbl),$Hlh
217 ldo 4($Htbl),$Hhl
218
219 and $mask0xf0,$nlo,$nhi
220 zdep $nlo,27,4,$nlo
221
222 ldwx $nlo($Hll),$Zll
223 ldwx $nlo($Hlh),$Zlh
224 ldwx $nlo($Hhl),$Zhl
225 ldwx $nlo($Hhh),$Zhh
226 zdep $Zll,28,4,$rem
227 ldb 14($Xi),$nlo
228 ldwx $rem($rem_4bit),$rem
229 shrpw $Zlh,$Zll,4,$Zll
230 ldwx $nhi($Hll),$Tll
231 shrpw $Zhl,$Zlh,4,$Zlh
232 ldwx $nhi($Hlh),$Tlh
233 shrpw $Zhh,$Zhl,4,$Zhl
234 ldwx $nhi($Hhl),$Thl
235 extru $Zhh,27,28,$Zhh
236 ldwx $nhi($Hhh),$Thh
237 xor $rem,$Zhh,$Zhh
238 and $mask0xf0,$nlo,$nhi
239 zdep $nlo,27,4,$nlo
240
241 xor $Tll,$Zll,$Zll
242 ldwx $nlo($Hll),$Tll
243 xor $Tlh,$Zlh,$Zlh
244 ldwx $nlo($Hlh),$Tlh
245 xor $Thl,$Zhl,$Zhl
246 b L\$oop_gmult_pa1
247 ldi 13,$cnt
248
249 .ALIGN 8
250L\$oop_gmult_pa1
251 zdep $Zll,28,4,$rem
252 ldwx $nlo($Hhl),$Thl
253 xor $Thh,$Zhh,$Zhh
254 ldwx $rem($rem_4bit),$rem
255 shrpw $Zlh,$Zll,4,$Zll
256 ldwx $nlo($Hhh),$Thh
257 shrpw $Zhl,$Zlh,4,$Zlh
258 ldbx $cnt($Xi),$nlo
259 xor $Tll,$Zll,$Zll
260 ldwx $nhi($Hll),$Tll
261 shrpw $Zhh,$Zhl,4,$Zhl
262 xor $Tlh,$Zlh,$Zlh
263 ldwx $nhi($Hlh),$Tlh
264 extru $Zhh,27,28,$Zhh
265 xor $Thl,$Zhl,$Zhl
266 ldwx $nhi($Hhl),$Thl
267 xor $rem,$Zhh,$Zhh
268 zdep $Zll,28,4,$rem
269 xor $Thh,$Zhh,$Zhh
270 ldwx $nhi($Hhh),$Thh
271 shrpw $Zlh,$Zll,4,$Zll
272 ldwx $rem($rem_4bit),$rem
273 shrpw $Zhl,$Zlh,4,$Zlh
274 shrpw $Zhh,$Zhl,4,$Zhl
275 and $mask0xf0,$nlo,$nhi
276 extru $Zhh,27,28,$Zhh
277 zdep $nlo,27,4,$nlo
278 xor $Tll,$Zll,$Zll
279 ldwx $nlo($Hll),$Tll
280 xor $Tlh,$Zlh,$Zlh
281 ldwx $nlo($Hlh),$Tlh
282 xor $rem,$Zhh,$Zhh
283 addib,uv -1,$cnt,L\$oop_gmult_pa1
284 xor $Thl,$Zhl,$Zhl
285
286 zdep $Zll,28,4,$rem
287 ldwx $nlo($Hhl),$Thl
288 xor $Thh,$Zhh,$Zhh
289 ldwx $rem($rem_4bit),$rem
290 shrpw $Zlh,$Zll,4,$Zll
291 ldwx $nlo($Hhh),$Thh
292 shrpw $Zhl,$Zlh,4,$Zlh
293 xor $Tll,$Zll,$Zll
294 ldwx $nhi($Hll),$Tll
295 shrpw $Zhh,$Zhl,4,$Zhl
296 xor $Tlh,$Zlh,$Zlh
297 ldwx $nhi($Hlh),$Tlh
298 extru $Zhh,27,28,$Zhh
299 xor $rem,$Zhh,$Zhh
300 xor $Thl,$Zhl,$Zhl
301 ldwx $nhi($Hhl),$Thl
302 xor $Thh,$Zhh,$Zhh
303 ldwx $nhi($Hhh),$Thh
304 zdep $Zll,28,4,$rem
305 ldwx $rem($rem_4bit),$rem
306 shrpw $Zlh,$Zll,4,$Zll
307 shrpw $Zhl,$Zlh,4,$Zlh
308 shrpw $Zhh,$Zhl,4,$Zhl
309 extru $Zhh,27,28,$Zhh
310 xor $Tll,$Zll,$Zll
311 xor $Tlh,$Zlh,$Zlh
312 xor $rem,$Zhh,$Zhh
313 stw $Zll,12($Xi)
314 xor $Thl,$Zhl,$Zhl
315 stw $Zlh,8($Xi)
316 xor $Thh,$Zhh,$Zhh
317 stw $Zhl,4($Xi)
318 stw $Zhh,0($Xi)
319___
320$code.=<<___;
321L\$done_gmult
322 $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
323 $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
324 $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
325 $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
326___
327$code.=<<___ if ($SIZE_T==4);
328 $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
329 $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
330 $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
331 $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
332 $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
333___
334$code.=<<___;
335 bv (%r2)
336 .EXIT
337 $POPMB -$FRAME(%sp),%r3
338 .PROCEND
339
340 .EXPORT gcm_ghash_4bit,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
341 .ALIGN 64
342gcm_ghash_4bit
343 .PROC
344 .CALLINFO FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=11
345 .ENTRY
346 $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
347 $PUSHMA %r3,$FRAME(%sp)
348 $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
349 $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
350 $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
351___
352$code.=<<___ if ($SIZE_T==4);
353 $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
354 $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
355 $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
356 $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
357 $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
358___
359$code.=<<___;
360 blr %r0,$rem_4bit
361 ldi 3,$rem
362L\$pic_ghash
363 andcm $rem_4bit,$rem,$rem_4bit
364 addl $inp,$len,$len
365 ldo L\$rem_4bit-L\$pic_ghash($rem_4bit),$rem_4bit
366 ldi 0xf0,$mask0xf0
367___
368$code.=<<___ if ($SIZE_T==4);
369 ldi 31,$rem
370 mtctl $rem,%cr11
371 extrd,u,*= $rem,%sar,1,$rem ; executes on PA-RISC 1.0
372 b L\$parisc1_ghash
373 nop
374___
375
376$code.=<<___;
377 ldb 15($Xi),$nlo
378 ldo 8($Htbl),$Hll
379
380L\$outer_ghash_pa2
381 ldb 15($inp),$nhi
382 xor $nhi,$nlo,$nlo
383 and $mask0xf0,$nlo,$nhi
384 depd,z $nlo,59,4,$nlo
385
386 ldd $nlo($Hll),$Zll
387 ldd $nlo($Hhh),$Zhh
388
389 depd,z $Zll,60,4,$rem
390 shrpd $Zhh,$Zll,4,$Zll
391 extrd,u $Zhh,59,60,$Zhh
392 ldb 14($Xi),$nlo
393 ldb 14($inp),$byte
394
395 ldd $nhi($Hll),$Tll
396 ldd $nhi($Hhh),$Thh
397 xor $byte,$nlo,$nlo
398 and $mask0xf0,$nlo,$nhi
399 depd,z $nlo,59,4,$nlo
400
401 xor $Tll,$Zll,$Zll
402 xor $Thh,$Zhh,$Zhh
403 ldd $rem($rem_4bit),$rem
404 b L\$oop_ghash_pa2
405 ldi 13,$cnt
406
407 .ALIGN 8
408L\$oop_ghash_pa2
409 xor $rem,$Zhh,$Zhh ; moved here to work around gas bug
410 depd,z $Zll,60,4,$rem2
411
412 shrpd $Zhh,$Zll,4,$Zll
413 extrd,u $Zhh,59,60,$Zhh
414 ldd $nlo($Hll),$Tll
415 ldd $nlo($Hhh),$Thh
416
417 xor $Tll,$Zll,$Zll
418 xor $Thh,$Zhh,$Zhh
419 ldbx $cnt($Xi),$nlo
420 ldbx $cnt($inp),$byte
421
422 depd,z $Zll,60,4,$rem
423 shrpd $Zhh,$Zll,4,$Zll
424 ldd $rem2($rem_4bit),$rem2
425
426 xor $rem2,$Zhh,$Zhh
427 xor $byte,$nlo,$nlo
428 ldd $nhi($Hll),$Tll
429 ldd $nhi($Hhh),$Thh
430
431 and $mask0xf0,$nlo,$nhi
432 depd,z $nlo,59,4,$nlo
433
434 extrd,u $Zhh,59,60,$Zhh
435 xor $Tll,$Zll,$Zll
436
437 ldd $rem($rem_4bit),$rem
438 addib,uv -1,$cnt,L\$oop_ghash_pa2
439 xor $Thh,$Zhh,$Zhh
440
441 xor $rem,$Zhh,$Zhh
442 depd,z $Zll,60,4,$rem2
443
444 shrpd $Zhh,$Zll,4,$Zll
445 extrd,u $Zhh,59,60,$Zhh
446 ldd $nlo($Hll),$Tll
447 ldd $nlo($Hhh),$Thh
448
449 xor $Tll,$Zll,$Zll
450 xor $Thh,$Zhh,$Zhh
451
452 depd,z $Zll,60,4,$rem
453 shrpd $Zhh,$Zll,4,$Zll
454 ldd $rem2($rem_4bit),$rem2
455
456 xor $rem2,$Zhh,$Zhh
457 ldd $nhi($Hll),$Tll
458 ldd $nhi($Hhh),$Thh
459
460 extrd,u $Zhh,59,60,$Zhh
461 xor $Tll,$Zll,$Zll
462 xor $Thh,$Zhh,$Zhh
463 ldd $rem($rem_4bit),$rem
464
465 xor $rem,$Zhh,$Zhh
466 std $Zll,8($Xi)
467 ldo 16($inp),$inp
468 std $Zhh,0($Xi)
469 cmpb,*<> $inp,$len,L\$outer_ghash_pa2
470 copy $Zll,$nlo
471___
472
473$code.=<<___ if ($SIZE_T==4);
474 b L\$done_ghash
475 nop
476
477L\$parisc1_ghash
478 ldb 15($Xi),$nlo
479 ldo 12($Htbl),$Hll
480 ldo 8($Htbl),$Hlh
481 ldo 4($Htbl),$Hhl
482
483L\$outer_ghash_pa1
484 ldb 15($inp),$byte
485 xor $byte,$nlo,$nlo
486 and $mask0xf0,$nlo,$nhi
487 zdep $nlo,27,4,$nlo
488
489 ldwx $nlo($Hll),$Zll
490 ldwx $nlo($Hlh),$Zlh
491 ldwx $nlo($Hhl),$Zhl
492 ldwx $nlo($Hhh),$Zhh
493 zdep $Zll,28,4,$rem
494 ldb 14($Xi),$nlo
495 ldb 14($inp),$byte
496 ldwx $rem($rem_4bit),$rem
497 shrpw $Zlh,$Zll,4,$Zll
498 ldwx $nhi($Hll),$Tll
499 shrpw $Zhl,$Zlh,4,$Zlh
500 ldwx $nhi($Hlh),$Tlh
501 shrpw $Zhh,$Zhl,4,$Zhl
502 ldwx $nhi($Hhl),$Thl
503 extru $Zhh,27,28,$Zhh
504 ldwx $nhi($Hhh),$Thh
505 xor $byte,$nlo,$nlo
506 xor $rem,$Zhh,$Zhh
507 and $mask0xf0,$nlo,$nhi
508 zdep $nlo,27,4,$nlo
509
510 xor $Tll,$Zll,$Zll
511 ldwx $nlo($Hll),$Tll
512 xor $Tlh,$Zlh,$Zlh
513 ldwx $nlo($Hlh),$Tlh
514 xor $Thl,$Zhl,$Zhl
515 b L\$oop_ghash_pa1
516 ldi 13,$cnt
517
518 .ALIGN 8
519L\$oop_ghash_pa1
520 zdep $Zll,28,4,$rem
521 ldwx $nlo($Hhl),$Thl
522 xor $Thh,$Zhh,$Zhh
523 ldwx $rem($rem_4bit),$rem
524 shrpw $Zlh,$Zll,4,$Zll
525 ldwx $nlo($Hhh),$Thh
526 shrpw $Zhl,$Zlh,4,$Zlh
527 ldbx $cnt($Xi),$nlo
528 xor $Tll,$Zll,$Zll
529 ldwx $nhi($Hll),$Tll
530 shrpw $Zhh,$Zhl,4,$Zhl
531 ldbx $cnt($inp),$byte
532 xor $Tlh,$Zlh,$Zlh
533 ldwx $nhi($Hlh),$Tlh
534 extru $Zhh,27,28,$Zhh
535 xor $Thl,$Zhl,$Zhl
536 ldwx $nhi($Hhl),$Thl
537 xor $rem,$Zhh,$Zhh
538 zdep $Zll,28,4,$rem
539 xor $Thh,$Zhh,$Zhh
540 ldwx $nhi($Hhh),$Thh
541 shrpw $Zlh,$Zll,4,$Zll
542 ldwx $rem($rem_4bit),$rem
543 shrpw $Zhl,$Zlh,4,$Zlh
544 xor $byte,$nlo,$nlo
545 shrpw $Zhh,$Zhl,4,$Zhl
546 and $mask0xf0,$nlo,$nhi
547 extru $Zhh,27,28,$Zhh
548 zdep $nlo,27,4,$nlo
549 xor $Tll,$Zll,$Zll
550 ldwx $nlo($Hll),$Tll
551 xor $Tlh,$Zlh,$Zlh
552 ldwx $nlo($Hlh),$Tlh
553 xor $rem,$Zhh,$Zhh
554 addib,uv -1,$cnt,L\$oop_ghash_pa1
555 xor $Thl,$Zhl,$Zhl
556
557 zdep $Zll,28,4,$rem
558 ldwx $nlo($Hhl),$Thl
559 xor $Thh,$Zhh,$Zhh
560 ldwx $rem($rem_4bit),$rem
561 shrpw $Zlh,$Zll,4,$Zll
562 ldwx $nlo($Hhh),$Thh
563 shrpw $Zhl,$Zlh,4,$Zlh
564 xor $Tll,$Zll,$Zll
565 ldwx $nhi($Hll),$Tll
566 shrpw $Zhh,$Zhl,4,$Zhl
567 xor $Tlh,$Zlh,$Zlh
568 ldwx $nhi($Hlh),$Tlh
569 extru $Zhh,27,28,$Zhh
570 xor $rem,$Zhh,$Zhh
571 xor $Thl,$Zhl,$Zhl
572 ldwx $nhi($Hhl),$Thl
573 xor $Thh,$Zhh,$Zhh
574 ldwx $nhi($Hhh),$Thh
575 zdep $Zll,28,4,$rem
576 ldwx $rem($rem_4bit),$rem
577 shrpw $Zlh,$Zll,4,$Zll
578 shrpw $Zhl,$Zlh,4,$Zlh
579 shrpw $Zhh,$Zhl,4,$Zhl
580 extru $Zhh,27,28,$Zhh
581 xor $Tll,$Zll,$Zll
582 xor $Tlh,$Zlh,$Zlh
583 xor $rem,$Zhh,$Zhh
584 stw $Zll,12($Xi)
585 xor $Thl,$Zhl,$Zhl
586 stw $Zlh,8($Xi)
587 xor $Thh,$Zhh,$Zhh
588 stw $Zhl,4($Xi)
589 ldo 16($inp),$inp
590 stw $Zhh,0($Xi)
591 comb,<> $inp,$len,L\$outer_ghash_pa1
592 copy $Zll,$nlo
593___
594$code.=<<___;
595L\$done_ghash
596 $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
597 $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
598 $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
599 $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
600___
601$code.=<<___ if ($SIZE_T==4);
602 $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
603 $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
604 $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
605 $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
606 $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
607___
608$code.=<<___;
609 bv (%r2)
610 .EXIT
611 $POPMB -$FRAME(%sp),%r3
612 .PROCEND
613
614 .ALIGN 64
615L\$rem_4bit
616 .WORD `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
617 .WORD `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
618 .WORD `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
619 .WORD `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
620	.STRINGZ "GHASH for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
621 .ALIGN 64
622___
623
624# Explicitly encode PA-RISC 2.0 instructions used in this module, so
625# that it can be compiled with .LEVEL 1.0. It should be noted that I
626# wouldn't have to do this if the GNU assembler understood the .ALLOW 2.0
627# directive...
628
629my $ldd = sub {
630 my ($mod,$args) = @_;
631 my $orig = "ldd$mod\t$args";
632
633 if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 4
634 { my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
635 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
636 }
637 elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 5
638 { my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3;
639 $opcode|=(($1&0xF)<<17)|(($1&0x10)<<12); # encode offset
640 $opcode|=(1<<5) if ($mod =~ /^,m/);
641 $opcode|=(1<<13) if ($mod =~ /^,mb/);
642 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
643 }
644 else { "\t".$orig; }
645};
646
647my $std = sub {
648 my ($mod,$args) = @_;
649 my $orig = "std$mod\t$args";
650
651 if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices
652 { my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1);
653 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
654 }
655 else { "\t".$orig; }
656};
657
658my $extrd = sub {
659 my ($mod,$args) = @_;
660 my $orig = "extrd$mod\t$args";
661
662 # I only have ",u" completer, it's implicitly encoded...
663 if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15
664 { my $opcode=(0x36<<26)|($1<<21)|($4<<16);
665 my $len=32-$3;
666 $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos
667 $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
668 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
669 }
670 elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12
671 { my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
672 my $len=32-$2;
673 $opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len
674 $opcode |= (1<<13) if ($mod =~ /,\**=/);
675 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
676 }
677 else { "\t".$orig; }
678};
679
680my $shrpd = sub {
681 my ($mod,$args) = @_;
682 my $orig = "shrpd$mod\t$args";
683
684 if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14
685 { my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
686 my $cpos=63-$3;
687 $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa
688 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
689 }
690 elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) # format 11
691 { sprintf "\t.WORD\t0x%08x\t; %s",
692 (0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig;
693 }
694 else { "\t".$orig; }
695};
696
697my $depd = sub {
698 my ($mod,$args) = @_;
699 my $orig = "depd$mod\t$args";
700
701	# I only have ",z" completer, it's implicitly encoded...
702 if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 16
703 { my $opcode=(0x3c<<26)|($4<<21)|($1<<16);
704 my $cpos=63-$2;
705 my $len=32-$3;
706 $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode pos
707 $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
708 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
709 }
710 else { "\t".$orig; }
711};
712
713sub assemble {
714 my ($mnemonic,$mod,$args)=@_;
715 my $opcode = eval("\$$mnemonic");
716
717 ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
718}
719
720foreach (split("\n",$code)) {
721 s/\`([^\`]*)\`/eval $1/ge;
722 if ($SIZE_T==4) {
723 s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e;
724 s/cmpb,\*/comb,/;
725 s/,\*/,/;
726 }
727 print $_,"\n";
728}
729
730close STDOUT;
diff --git a/src/lib/libcrypto/modes/asm/ghash-s390x.pl b/src/lib/libcrypto/modes/asm/ghash-s390x.pl
new file mode 100644
index 0000000000..6a40d5d89c
--- /dev/null
+++ b/src/lib/libcrypto/modes/asm/ghash-s390x.pl
@@ -0,0 +1,262 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# September 2010.
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that it
14# uses 256 bytes per-key table [+128 bytes shared table]. Performance
15# was measured to be ~18 cycles per processed byte on z10, which is
16# almost 40% better than gcc-generated code. It should be noted that
17# 18 cycles is a worse result than expected: the loop is scheduled for 12
18# and the result should be close to 12. In the absence of instruction-
19# level profiling data it's impossible to tell why...
20
21# November 2010.
22#
23# Adapt for -m31 build. If kernel supports what's called "highgprs"
24# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
25# instructions and achieve "64-bit" performance even in 31-bit legacy
26# application context. The feature is not specific to any particular
27# processor, as long as it's a "z-CPU". The latter implies that the code
28# remains z/Architecture specific. On z990 it was measured to perform
29# 2.8x better than 32-bit code generated by gcc 4.3.
30
31# March 2011.
32#
33# Support for hardware KIMD-GHASH is verified to produce the correct
34# result and therefore is engaged. On z196 it was measured to process
35# an 8KB buffer ~7x faster than the software implementation. It's not as
36# impressive for smaller buffer sizes, and for the smallest 16-byte buffer
37# it's actually almost 2 times slower, which is the reason why
38# KIMD-GHASH is not used in gcm_gmult_4bit.
39
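
For orientation, here is a minimal C sketch of the table-driven "4-bit" multiplication that this and the other ghash-*.pl modules implement in assembler. It is modelled on the reference gcm_gmult_4bit in gcm128.c from the same import and assumes the 64-bit code path; the function name is illustrative. Htable[] is the 256-byte per-key table of nibble multiples of H, and rem_4bit[] is the 128-byte shared reduction table (the same 0x0000, 0x1C20, ... constants tabulated in each module, placed at whatever shift the particular code path expects).

#include <stdint.h>
#include <stddef.h>

typedef struct { uint64_t hi, lo; } u128;

/* shared reduction table, entries pre-shifted for 64-bit words */
static const uint64_t rem_4bit[16] = {
	0x0000ULL << 48, 0x1C20ULL << 48, 0x3840ULL << 48, 0x2460ULL << 48,
	0x7080ULL << 48, 0x6CA0ULL << 48, 0x48C0ULL << 48, 0x54E0ULL << 48,
	0xE100ULL << 48, 0xFD20ULL << 48, 0xD940ULL << 48, 0xC560ULL << 48,
	0x9180ULL << 48, 0x8DA0ULL << 48, 0xA9C0ULL << 48, 0xB5E0ULL << 48
};

/* Xi[2] holds the 16-byte hash value, Htable[16] the per-key table */
static void
gmult_4bit_sketch(uint64_t Xi[2], const u128 Htable[16])
{
	const uint8_t *xi = (const uint8_t *)Xi;
	u128 Z;
	size_t rem, nlo, nhi;
	int cnt = 15;

	nlo = xi[15];
	nhi = nlo >> 4;
	nlo &= 0xf;

	Z.hi = Htable[nlo].hi;
	Z.lo = Htable[nlo].lo;

	for (;;) {
		/* Z >>= 4, folding the dropped nibble back in via rem_4bit */
		rem  = (size_t)Z.lo & 0xf;
		Z.lo = (Z.hi << 60) | (Z.lo >> 4);
		Z.hi = (Z.hi >> 4) ^ rem_4bit[rem];
		Z.hi ^= Htable[nhi].hi;		/* fold in high nibble */
		Z.lo ^= Htable[nhi].lo;

		if (--cnt < 0)
			break;

		nlo = xi[cnt];
		nhi = nlo >> 4;
		nlo &= 0xf;

		rem  = (size_t)Z.lo & 0xf;
		Z.lo = (Z.hi << 60) | (Z.lo >> 4);
		Z.hi = (Z.hi >> 4) ^ rem_4bit[rem];
		Z.hi ^= Htable[nlo].hi;		/* fold in low nibble of next byte */
		Z.lo ^= Htable[nlo].lo;
	}

	/* the real code byte-swaps here on little-endian targets */
	Xi[0] = Z.hi;
	Xi[1] = Z.lo;
}
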
40$flavour = shift;
41
42if ($flavour =~ /3[12]/) {
43 $SIZE_T=4;
44 $g="";
45} else {
46 $SIZE_T=8;
47 $g="g";
48}
49
50while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
51open STDOUT,">$output";
52
53$softonly=0;
54
55$Zhi="%r0";
56$Zlo="%r1";
57
58$Xi="%r2"; # argument block
59$Htbl="%r3";
60$inp="%r4";
61$len="%r5";
62
63$rem0="%r6"; # variables
64$rem1="%r7";
65$nlo="%r8";
66$nhi="%r9";
67$xi="%r10";
68$cnt="%r11";
69$tmp="%r12";
70$x78="%r13";
71$rem_4bit="%r14";
72
73$sp="%r15";
74
75$code.=<<___;
76.text
77
78.globl gcm_gmult_4bit
79.align 32
80gcm_gmult_4bit:
81___
82$code.=<<___ if(!$softonly && 0); # hardware is slow for single block...
83 larl %r1,OPENSSL_s390xcap_P
84 lg %r0,0(%r1)
85 tmhl %r0,0x4000 # check for message-security-assist
86 jz .Lsoft_gmult
87 lghi %r0,0
88 la %r1,16($sp)
89 .long 0xb93e0004 # kimd %r0,%r4
90 lg %r1,24($sp)
91 tmhh %r1,0x4000 # check for function 65
92 jz .Lsoft_gmult
93 stg %r0,16($sp) # arrange 16 bytes of zero input
94 stg %r0,24($sp)
95 lghi %r0,65 # function 65
96 la %r1,0($Xi) # H lies right after Xi in gcm128_context
97 la $inp,16($sp)
98 lghi $len,16
99 .long 0xb93e0004 # kimd %r0,$inp
100 brc 1,.-4 # pay attention to "partial completion"
101 br %r14
102.align 32
103.Lsoft_gmult:
104___
105$code.=<<___;
106 stm${g} %r6,%r14,6*$SIZE_T($sp)
107
108 aghi $Xi,-1
109 lghi $len,1
110 lghi $x78,`0xf<<3`
111 larl $rem_4bit,rem_4bit
112
113 lg $Zlo,8+1($Xi) # Xi
114 j .Lgmult_shortcut
115.type gcm_gmult_4bit,\@function
116.size gcm_gmult_4bit,(.-gcm_gmult_4bit)
117
118.globl gcm_ghash_4bit
119.align 32
120gcm_ghash_4bit:
121___
122$code.=<<___ if(!$softonly);
123 larl %r1,OPENSSL_s390xcap_P
124 lg %r0,0(%r1)
125 tmhl %r0,0x4000 # check for message-security-assist
126 jz .Lsoft_ghash
127 lghi %r0,0
128 la %r1,16($sp)
129 .long 0xb93e0004 # kimd %r0,%r4
130 lg %r1,24($sp)
131 tmhh %r1,0x4000 # check for function 65
132 jz .Lsoft_ghash
133 lghi %r0,65 # function 65
134 la %r1,0($Xi) # H lies right after Xi in gcm128_context
135 .long 0xb93e0004 # kimd %r0,$inp
136 brc 1,.-4 # pay attention to "partial completion"
137 br %r14
138.align 32
139.Lsoft_ghash:
140___
141$code.=<<___ if ($flavour =~ /3[12]/);
142 llgfr $len,$len
143___
144$code.=<<___;
145 stm${g} %r6,%r14,6*$SIZE_T($sp)
146
147 aghi $Xi,-1
148 srlg $len,$len,4
149 lghi $x78,`0xf<<3`
150 larl $rem_4bit,rem_4bit
151
152 lg $Zlo,8+1($Xi) # Xi
153 lg $Zhi,0+1($Xi)
154 lghi $tmp,0
155.Louter:
156 xg $Zhi,0($inp) # Xi ^= inp
157 xg $Zlo,8($inp)
158 xgr $Zhi,$tmp
159 stg $Zlo,8+1($Xi)
160 stg $Zhi,0+1($Xi)
161
162.Lgmult_shortcut:
163 lghi $tmp,0xf0
164 sllg $nlo,$Zlo,4
165 srlg $xi,$Zlo,8 # extract second byte
166 ngr $nlo,$tmp
167 lgr $nhi,$Zlo
168 lghi $cnt,14
169 ngr $nhi,$tmp
170
171 lg $Zlo,8($nlo,$Htbl)
172 lg $Zhi,0($nlo,$Htbl)
173
174 sllg $nlo,$xi,4
175 sllg $rem0,$Zlo,3
176 ngr $nlo,$tmp
177 ngr $rem0,$x78
178 ngr $xi,$tmp
179
180 sllg $tmp,$Zhi,60
181 srlg $Zlo,$Zlo,4
182 srlg $Zhi,$Zhi,4
183 xg $Zlo,8($nhi,$Htbl)
184 xg $Zhi,0($nhi,$Htbl)
185 lgr $nhi,$xi
186 sllg $rem1,$Zlo,3
187 xgr $Zlo,$tmp
188 ngr $rem1,$x78
189 j .Lghash_inner
190.align 16
191.Lghash_inner:
192 srlg $Zlo,$Zlo,4
193 sllg $tmp,$Zhi,60
194 xg $Zlo,8($nlo,$Htbl)
195 srlg $Zhi,$Zhi,4
196 llgc $xi,0($cnt,$Xi)
197 xg $Zhi,0($nlo,$Htbl)
198 sllg $nlo,$xi,4
199 xg $Zhi,0($rem0,$rem_4bit)
200 nill $nlo,0xf0
201 sllg $rem0,$Zlo,3
202 xgr $Zlo,$tmp
203 ngr $rem0,$x78
204 nill $xi,0xf0
205
206 sllg $tmp,$Zhi,60
207 srlg $Zlo,$Zlo,4
208 srlg $Zhi,$Zhi,4
209 xg $Zlo,8($nhi,$Htbl)
210 xg $Zhi,0($nhi,$Htbl)
211 lgr $nhi,$xi
212 xg $Zhi,0($rem1,$rem_4bit)
213 sllg $rem1,$Zlo,3
214 xgr $Zlo,$tmp
215 ngr $rem1,$x78
216 brct $cnt,.Lghash_inner
217
218 sllg $tmp,$Zhi,60
219 srlg $Zlo,$Zlo,4
220 srlg $Zhi,$Zhi,4
221 xg $Zlo,8($nlo,$Htbl)
222 xg $Zhi,0($nlo,$Htbl)
223 sllg $xi,$Zlo,3
224 xg $Zhi,0($rem0,$rem_4bit)
225 xgr $Zlo,$tmp
226 ngr $xi,$x78
227
228 sllg $tmp,$Zhi,60
229 srlg $Zlo,$Zlo,4
230 srlg $Zhi,$Zhi,4
231 xg $Zlo,8($nhi,$Htbl)
232 xg $Zhi,0($nhi,$Htbl)
233 xgr $Zlo,$tmp
234 xg $Zhi,0($rem1,$rem_4bit)
235
236 lg $tmp,0($xi,$rem_4bit)
237 la $inp,16($inp)
238 sllg $tmp,$tmp,4 # correct last rem_4bit[rem]
239 brctg $len,.Louter
240
241 xgr $Zhi,$tmp
242 stg $Zlo,8+1($Xi)
243 stg $Zhi,0+1($Xi)
244 lm${g} %r6,%r14,6*$SIZE_T($sp)
245 br %r14
246.type gcm_ghash_4bit,\@function
247.size gcm_ghash_4bit,(.-gcm_ghash_4bit)
248
249.align 64
250rem_4bit:
251 .long `0x0000<<12`,0,`0x1C20<<12`,0,`0x3840<<12`,0,`0x2460<<12`,0
252 .long `0x7080<<12`,0,`0x6CA0<<12`,0,`0x48C0<<12`,0,`0x54E0<<12`,0
253 .long `0xE100<<12`,0,`0xFD20<<12`,0,`0xD940<<12`,0,`0xC560<<12`,0
254 .long `0x9180<<12`,0,`0x8DA0<<12`,0,`0xA9C0<<12`,0,`0xB5E0<<12`,0
255.type rem_4bit,\@object
256.size rem_4bit,(.-rem_4bit)
257.string "GHASH for s390x, CRYPTOGAMS by <appro\@openssl.org>"
258___
259
260$code =~ s/\`([^\`]*)\`/eval $1/gem;
261print $code;
262close STDOUT;
diff --git a/src/lib/libcrypto/modes/asm/ghash-sparcv9.pl b/src/lib/libcrypto/modes/asm/ghash-sparcv9.pl
new file mode 100644
index 0000000000..70e7b044a3
--- /dev/null
+++ b/src/lib/libcrypto/modes/asm/ghash-sparcv9.pl
@@ -0,0 +1,330 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# March 2010
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that it
14# uses 256 bytes per-key table [+128 bytes shared table]. Performance
15# results are for streamed GHASH subroutine on UltraSPARC pre-Tx CPU
16# and are expressed in cycles per processed byte, less is better:
17#
18# gcc 3.3.x cc 5.2 this assembler
19#
20# 32-bit build 81.4 43.3 12.6 (+546%/+244%)
21# 64-bit build 20.2 21.2 12.6 (+60%/+68%)
22#
23# Here is data collected on UltraSPARC T1 system running Linux:
24#
25# gcc 4.4.1 this assembler
26#
27# 32-bit build 566 50 (+1000%)
28# 64-bit build 56 50 (+12%)
29#
30# I don't quite understand why the difference between 32-bit and 64-bit
31# compiler-generated code is so big. Compilers *were* instructed to
32# generate code for UltraSPARC and should have used 64-bit registers
33# for Z vector (see C code) even in 32-bit build... Oh well, it only
34# means more impressive improvement coefficients for this assembler
35# module;-) Loops are aggressively modulo-scheduled with respect to
36# references to input data and Z.hi updates to achieve the 12-cycle
37# timing. To anchor it to something else, sha1-sparcv9.pl spends 11.6
38# cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1.
39
40$bits=32;
41for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
42if ($bits==64) { $bias=2047; $frame=192; }
43else { $bias=0; $frame=112; }
44
45$output=shift;
46open STDOUT,">$output";
47
48$Zhi="%o0"; # 64-bit values
49$Zlo="%o1";
50$Thi="%o2";
51$Tlo="%o3";
52$rem="%o4";
53$tmp="%o5";
54
55$nhi="%l0"; # small values and pointers
56$nlo="%l1";
57$xi0="%l2";
58$xi1="%l3";
59$rem_4bit="%l4";
60$remi="%l5";
61$Htblo="%l6";
62$cnt="%l7";
63
64$Xi="%i0"; # input argument block
65$Htbl="%i1";
66$inp="%i2";
67$len="%i3";
68
69$code.=<<___;
70.section ".text",#alloc,#execinstr
71
72.align 64
73rem_4bit:
74 .long `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
75 .long `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
76 .long `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
77 .long `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
78.type rem_4bit,#object
79.size rem_4bit,(.-rem_4bit)
80
81.globl gcm_ghash_4bit
82.align 32
83gcm_ghash_4bit:
84 save %sp,-$frame,%sp
85 ldub [$inp+15],$nlo
86 ldub [$Xi+15],$xi0
87 ldub [$Xi+14],$xi1
88 add $len,$inp,$len
89 add $Htbl,8,$Htblo
90
911: call .+8
92 add %o7,rem_4bit-1b,$rem_4bit
93
94.Louter:
95 xor $xi0,$nlo,$nlo
96 and $nlo,0xf0,$nhi
97 and $nlo,0x0f,$nlo
98 sll $nlo,4,$nlo
99 ldx [$Htblo+$nlo],$Zlo
100 ldx [$Htbl+$nlo],$Zhi
101
102 ldub [$inp+14],$nlo
103
104 ldx [$Htblo+$nhi],$Tlo
105 and $Zlo,0xf,$remi
106 ldx [$Htbl+$nhi],$Thi
107 sll $remi,3,$remi
108 ldx [$rem_4bit+$remi],$rem
109 srlx $Zlo,4,$Zlo
110 mov 13,$cnt
111 sllx $Zhi,60,$tmp
112 xor $Tlo,$Zlo,$Zlo
113 srlx $Zhi,4,$Zhi
114 xor $Zlo,$tmp,$Zlo
115
116 xor $xi1,$nlo,$nlo
117 and $Zlo,0xf,$remi
118 and $nlo,0xf0,$nhi
119 and $nlo,0x0f,$nlo
120 ba .Lghash_inner
121 sll $nlo,4,$nlo
122.align 32
123.Lghash_inner:
124 ldx [$Htblo+$nlo],$Tlo
125 sll $remi,3,$remi
126 xor $Thi,$Zhi,$Zhi
127 ldx [$Htbl+$nlo],$Thi
128 srlx $Zlo,4,$Zlo
129 xor $rem,$Zhi,$Zhi
130 ldx [$rem_4bit+$remi],$rem
131 sllx $Zhi,60,$tmp
132 xor $Tlo,$Zlo,$Zlo
133 ldub [$inp+$cnt],$nlo
134 srlx $Zhi,4,$Zhi
135 xor $Zlo,$tmp,$Zlo
136 ldub [$Xi+$cnt],$xi1
137 xor $Thi,$Zhi,$Zhi
138 and $Zlo,0xf,$remi
139
140 ldx [$Htblo+$nhi],$Tlo
141 sll $remi,3,$remi
142 xor $rem,$Zhi,$Zhi
143 ldx [$Htbl+$nhi],$Thi
144 srlx $Zlo,4,$Zlo
145 ldx [$rem_4bit+$remi],$rem
146 sllx $Zhi,60,$tmp
147 xor $xi1,$nlo,$nlo
148 srlx $Zhi,4,$Zhi
149 and $nlo,0xf0,$nhi
150 addcc $cnt,-1,$cnt
151 xor $Zlo,$tmp,$Zlo
152 and $nlo,0x0f,$nlo
153 xor $Tlo,$Zlo,$Zlo
154 sll $nlo,4,$nlo
155 blu .Lghash_inner
156 and $Zlo,0xf,$remi
157
158 ldx [$Htblo+$nlo],$Tlo
159 sll $remi,3,$remi
160 xor $Thi,$Zhi,$Zhi
161 ldx [$Htbl+$nlo],$Thi
162 srlx $Zlo,4,$Zlo
163 xor $rem,$Zhi,$Zhi
164 ldx [$rem_4bit+$remi],$rem
165 sllx $Zhi,60,$tmp
166 xor $Tlo,$Zlo,$Zlo
167 srlx $Zhi,4,$Zhi
168 xor $Zlo,$tmp,$Zlo
169 xor $Thi,$Zhi,$Zhi
170
171 add $inp,16,$inp
172 cmp $inp,$len
173 be,pn `$bits==64?"%xcc":"%icc"`,.Ldone
174 and $Zlo,0xf,$remi
175
176 ldx [$Htblo+$nhi],$Tlo
177 sll $remi,3,$remi
178 xor $rem,$Zhi,$Zhi
179 ldx [$Htbl+$nhi],$Thi
180 srlx $Zlo,4,$Zlo
181 ldx [$rem_4bit+$remi],$rem
182 sllx $Zhi,60,$tmp
183 xor $Tlo,$Zlo,$Zlo
184 ldub [$inp+15],$nlo
185 srlx $Zhi,4,$Zhi
186 xor $Zlo,$tmp,$Zlo
187 xor $Thi,$Zhi,$Zhi
188 stx $Zlo,[$Xi+8]
189 xor $rem,$Zhi,$Zhi
190 stx $Zhi,[$Xi]
191 srl $Zlo,8,$xi1
192 and $Zlo,0xff,$xi0
193 ba .Louter
194 and $xi1,0xff,$xi1
195.align 32
196.Ldone:
197 ldx [$Htblo+$nhi],$Tlo
198 sll $remi,3,$remi
199 xor $rem,$Zhi,$Zhi
200 ldx [$Htbl+$nhi],$Thi
201 srlx $Zlo,4,$Zlo
202 ldx [$rem_4bit+$remi],$rem
203 sllx $Zhi,60,$tmp
204 xor $Tlo,$Zlo,$Zlo
205 srlx $Zhi,4,$Zhi
206 xor $Zlo,$tmp,$Zlo
207 xor $Thi,$Zhi,$Zhi
208 stx $Zlo,[$Xi+8]
209 xor $rem,$Zhi,$Zhi
210 stx $Zhi,[$Xi]
211
212 ret
213 restore
214.type gcm_ghash_4bit,#function
215.size gcm_ghash_4bit,(.-gcm_ghash_4bit)
216___
217
218undef $inp;
219undef $len;
220
221$code.=<<___;
222.globl gcm_gmult_4bit
223.align 32
224gcm_gmult_4bit:
225 save %sp,-$frame,%sp
226 ldub [$Xi+15],$nlo
227 add $Htbl,8,$Htblo
228
2291: call .+8
230 add %o7,rem_4bit-1b,$rem_4bit
231
232 and $nlo,0xf0,$nhi
233 and $nlo,0x0f,$nlo
234 sll $nlo,4,$nlo
235 ldx [$Htblo+$nlo],$Zlo
236 ldx [$Htbl+$nlo],$Zhi
237
238 ldub [$Xi+14],$nlo
239
240 ldx [$Htblo+$nhi],$Tlo
241 and $Zlo,0xf,$remi
242 ldx [$Htbl+$nhi],$Thi
243 sll $remi,3,$remi
244 ldx [$rem_4bit+$remi],$rem
245 srlx $Zlo,4,$Zlo
246 mov 13,$cnt
247 sllx $Zhi,60,$tmp
248 xor $Tlo,$Zlo,$Zlo
249 srlx $Zhi,4,$Zhi
250 xor $Zlo,$tmp,$Zlo
251
252 and $Zlo,0xf,$remi
253 and $nlo,0xf0,$nhi
254 and $nlo,0x0f,$nlo
255 ba .Lgmult_inner
256 sll $nlo,4,$nlo
257.align 32
258.Lgmult_inner:
259 ldx [$Htblo+$nlo],$Tlo
260 sll $remi,3,$remi
261 xor $Thi,$Zhi,$Zhi
262 ldx [$Htbl+$nlo],$Thi
263 srlx $Zlo,4,$Zlo
264 xor $rem,$Zhi,$Zhi
265 ldx [$rem_4bit+$remi],$rem
266 sllx $Zhi,60,$tmp
267 xor $Tlo,$Zlo,$Zlo
268 ldub [$Xi+$cnt],$nlo
269 srlx $Zhi,4,$Zhi
270 xor $Zlo,$tmp,$Zlo
271 xor $Thi,$Zhi,$Zhi
272 and $Zlo,0xf,$remi
273
274 ldx [$Htblo+$nhi],$Tlo
275 sll $remi,3,$remi
276 xor $rem,$Zhi,$Zhi
277 ldx [$Htbl+$nhi],$Thi
278 srlx $Zlo,4,$Zlo
279 ldx [$rem_4bit+$remi],$rem
280 sllx $Zhi,60,$tmp
281 srlx $Zhi,4,$Zhi
282 and $nlo,0xf0,$nhi
283 addcc $cnt,-1,$cnt
284 xor $Zlo,$tmp,$Zlo
285 and $nlo,0x0f,$nlo
286 xor $Tlo,$Zlo,$Zlo
287 sll $nlo,4,$nlo
288 blu .Lgmult_inner
289 and $Zlo,0xf,$remi
290
291 ldx [$Htblo+$nlo],$Tlo
292 sll $remi,3,$remi
293 xor $Thi,$Zhi,$Zhi
294 ldx [$Htbl+$nlo],$Thi
295 srlx $Zlo,4,$Zlo
296 xor $rem,$Zhi,$Zhi
297 ldx [$rem_4bit+$remi],$rem
298 sllx $Zhi,60,$tmp
299 xor $Tlo,$Zlo,$Zlo
300 srlx $Zhi,4,$Zhi
301 xor $Zlo,$tmp,$Zlo
302 xor $Thi,$Zhi,$Zhi
303 and $Zlo,0xf,$remi
304
305 ldx [$Htblo+$nhi],$Tlo
306 sll $remi,3,$remi
307 xor $rem,$Zhi,$Zhi
308 ldx [$Htbl+$nhi],$Thi
309 srlx $Zlo,4,$Zlo
310 ldx [$rem_4bit+$remi],$rem
311 sllx $Zhi,60,$tmp
312 xor $Tlo,$Zlo,$Zlo
313 srlx $Zhi,4,$Zhi
314 xor $Zlo,$tmp,$Zlo
315 xor $Thi,$Zhi,$Zhi
316 stx $Zlo,[$Xi+8]
317 xor $rem,$Zhi,$Zhi
318 stx $Zhi,[$Xi]
319
320 ret
321 restore
322.type gcm_gmult_4bit,#function
323.size gcm_gmult_4bit,(.-gcm_gmult_4bit)
324.asciz "GHASH for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
325.align 4
326___
327
328$code =~ s/\`([^\`]*)\`/eval $1/gem;
329print $code;
330close STDOUT;
diff --git a/src/lib/libcrypto/modes/asm/ghash-x86.pl b/src/lib/libcrypto/modes/asm/ghash-x86.pl
new file mode 100644
index 0000000000..6b09669d47
--- /dev/null
+++ b/src/lib/libcrypto/modes/asm/ghash-x86.pl
@@ -0,0 +1,1342 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# March, May, June 2010
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that it
14# uses 256 bytes per-key table [+64/128 bytes fixed table]. It has two
15# code paths: vanilla x86 and vanilla MMX. The former will be executed on
16# 486 and Pentium, the latter on all others. MMX GHASH features the so-called
17# "528B" variant of the "4-bit" method, utilizing an additional 256+16 bytes
18# of per-key storage, i.e. 256+256+16=528 bytes in total [+512 bytes shared table]. Performance results
19# are for streamed GHASH subroutine and are expressed in cycles per
20# processed byte, less is better:
21#
22# gcc 2.95.3(*) MMX assembler x86 assembler
23#
24# Pentium 105/111(**) - 50
25# PIII 68 /75 12.2 24
26# P4 125/125 17.8 84(***)
27# Opteron 66 /70 10.1 30
28# Core2 54 /67 8.4 18
29#
30# (*)	gcc 3.4.x was observed to generate a few percent slower code,
31#	which is one of the reasons why 2.95.3 results were chosen;
32#	another reason is the lack of 3.4.x results for older CPUs;
33# comparison with MMX results is not completely fair, because C
34# results are for vanilla "256B" implementation, while
35# assembler results are for "528B";-)
36# (**) second number is result for code compiled with -fPIC flag,
37# which is actually more relevant, because assembler code is
38# position-independent;
39# (***) see comment in non-MMX routine for further details;
40#
41# To summarize, it's >2-5 times faster than gcc-generated code. To
42# anchor it to something else, the SHA1 assembler processes one byte in
43# 11-13 cycles on contemporary x86 cores. As for the choice of MMX in
44# particular, see comment at the end of the file...
45
46# May 2010
47#
48# Add PCLMULQDQ version performing at 2.10 cycles per processed byte.
49# The question is how close it is to the theoretical limit. The pclmulqdq
50# instruction latency appears to be 14 cycles and there can't be more
51# than 2 of them executing at any given time. This means that a single
52# Karatsuba multiplication would take 28 cycles *plus* a few cycles for
53# pre- and post-processing. Then multiplication has to be followed by
54# modulo-reduction. Given that the aggregated reduction method [see
55# "Carry-less Multiplication and Its Usage for Computing the GCM Mode"
56# white paper by Intel] allows you to perform reduction only once in
57# a while, we can assume that asymptotic performance can be estimated
58# as (28+Tmod/Naggr)/16, where Tmod is the time to perform reduction
59# and Naggr is the aggregation factor.
60#
61# Before we proceed to this implementation let's have a closer look at
62# the best-performing code suggested by Intel in their white paper.
63# By tracing inter-register dependencies Tmod is estimated as ~19
64# cycles and Naggr chosen by Intel is 4, resulting in 2.05 cycles per
65# processed byte. As implied, this is quite an optimistic estimate,
66# because it does not account for Karatsuba pre- and post-processing,
67# which for a single multiplication is ~5 cycles. Unfortunately Intel
68# does not provide performance data for GHASH alone. But benchmarking
69# AES_GCM_encrypt ripped out of Fig. 15 of the white paper with aadt
70# alone resulted in 2.46 cycles per byte out of a 16KB buffer. Note that
71# the result accounts even for pre-computing of degrees of the hash
72# key H, but its portion is negligible at 16KB buffer size.
73#
74# Moving on to the implementation in question. Tmod is estimated as
75# ~13 cycles and Naggr is 2, giving asymptotic performance of ...
76# 2.16. How is it possible that measured performance is better than
77# optimistic theoretical estimate? There is one thing Intel failed
78# to recognize. By serializing GHASH with CTR in the same subroutine,
79# the former's performance is really limited by the above (Tmul + Tmod/Naggr)
80# equation. But if the GHASH procedure is detached, the modulo-reduction
81# can be interleaved with Naggr-1 multiplications at instruction level
82# and under ideal conditions even disappear from the equation. So that
83# optimistic theoretical estimate for this implementation is ...
84# 28/16=1.75, and not 2.16. Well, it's probably way too optimistic,
85# at least for such a small Naggr. I'd argue that (28+Tproc/Naggr),
86# where Tproc is the time required for Karatsuba pre- and post-processing,
87# is a more realistic estimate. In this case it gives ... 1.91 cycles.
88# Or in other words, depending on how well we can interleave reduction
89# and one of the two multiplications the performance should be between
90# 1.91 and 2.16. As already mentioned, this implementation processes
91# one byte out of an 8KB buffer in 2.10 cycles, while the x86_64 counterpart
92# does it in 2.02. x86_64 performance is better, because the larger register
93# bank allows reduction and multiplication to be interleaved better.
94#
95# Does it make sense to increase Naggr? To start with, it's virtually
96# impossible in 32-bit mode, because of limited register bank
97# capacity. Otherwise the improvement has to be weighed against slower
98# setup, as well as code size and complexity increase. As even the
99# optimistic estimate doesn't promise a 30% performance improvement,
100# there are currently no plans to increase Naggr.
101#
102# Special thanks to David Woodhouse <dwmw2@infradead.org> for
103# providing access to a Westmere-based system on behalf of Intel
104# Open Source Technology Centre.
105
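
For convenience, the per-byte figures quoted above can be reproduced from the (28+T/Naggr)/16 estimate with a few lines of throwaway C; the 28, 19, 13 and 5 cycle inputs are the ones stated in the preceding paragraphs, nothing here is measured.

#include <stdio.h>

/* asymptotic cycles/byte: one 28-cycle Karatsuba multiply per 16-byte
 * block plus a T-cycle term amortized over Naggr aggregated blocks */
static double
est(double t, double naggr)
{
	return (28.0 + t / naggr) / 16.0;
}

int
main(void)
{
	printf("Intel wp,  Tmod=19, Naggr=4: %.2f\n", est(19, 4));	/* ~2.05 */
	printf("this code, Tmod=13, Naggr=2: %.2f\n", est(13, 2));	/* ~2.16 */
	printf("detached,  Tproc=5, Naggr=2: %.2f\n", est(5, 2));	/* ~1.91 */
	printf("reduction fully hidden:      %.2f\n", 28.0 / 16.0);	/* 1.75 */
	return 0;
}
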
106# January 2010
107#
108# Tweaked to optimize transitions between integer and FP operations
109# on the same XMM register, the PCLMULQDQ subroutine was measured to
110# process one byte in 2.07 cycles on Sandy Bridge, and in 2.12 on Westmere.
111# The minor regression on Westmere is outweighed by the ~15% improvement
112# on Sandy Bridge. Strangely enough, an attempt to modify the 64-bit code
113# in a similar manner resulted in almost 20% degradation on Sandy Bridge,
114# where the original 64-bit code processes one byte in 1.95 cycles.
115
116$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
117push(@INC,"${dir}","${dir}../../perlasm");
118require "x86asm.pl";
119
120&asm_init($ARGV[0],"ghash-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");
121
122$sse2=0;
123for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
124
125($Zhh,$Zhl,$Zlh,$Zll) = ("ebp","edx","ecx","ebx");
126$inp = "edi";
127$Htbl = "esi";
128
129$unroll = 0; # Affects x86 loop. Folded loop performs ~7% worse
130			# than unrolled, which has to be weighed against
131 # 2.5x x86-specific code size reduction.
132
133sub x86_loop {
134 my $off = shift;
135 my $rem = "eax";
136
137 &mov ($Zhh,&DWP(4,$Htbl,$Zll));
138 &mov ($Zhl,&DWP(0,$Htbl,$Zll));
139 &mov ($Zlh,&DWP(12,$Htbl,$Zll));
140 &mov ($Zll,&DWP(8,$Htbl,$Zll));
141 &xor ($rem,$rem); # avoid partial register stalls on PIII
142
143 # shrd practically kills P4, 2.5x deterioration, but P4 has
144	# MMX code-path to execute. shrd runs a tad faster [than twice
145	# the shifts, moves and ors] on pre-MMX Pentium (as well as
146	# PIII and Core2), *but* minimizes code size, spares a register
147	# and thus allows the loop to be folded...
148 if (!$unroll) {
149 my $cnt = $inp;
150 &mov ($cnt,15);
151 &jmp (&label("x86_loop"));
152 &set_label("x86_loop",16);
153 for($i=1;$i<=2;$i++) {
154 &mov (&LB($rem),&LB($Zll));
155 &shrd ($Zll,$Zlh,4);
156 &and (&LB($rem),0xf);
157 &shrd ($Zlh,$Zhl,4);
158 &shrd ($Zhl,$Zhh,4);
159 &shr ($Zhh,4);
160 &xor ($Zhh,&DWP($off+16,"esp",$rem,4));
161
162 &mov (&LB($rem),&BP($off,"esp",$cnt));
163 if ($i&1) {
164 &and (&LB($rem),0xf0);
165 } else {
166 &shl (&LB($rem),4);
167 }
168
169 &xor ($Zll,&DWP(8,$Htbl,$rem));
170 &xor ($Zlh,&DWP(12,$Htbl,$rem));
171 &xor ($Zhl,&DWP(0,$Htbl,$rem));
172 &xor ($Zhh,&DWP(4,$Htbl,$rem));
173
174 if ($i&1) {
175 &dec ($cnt);
176 &js (&label("x86_break"));
177 } else {
178 &jmp (&label("x86_loop"));
179 }
180 }
181 &set_label("x86_break",16);
182 } else {
183 for($i=1;$i<32;$i++) {
184 &comment($i);
185 &mov (&LB($rem),&LB($Zll));
186 &shrd ($Zll,$Zlh,4);
187 &and (&LB($rem),0xf);
188 &shrd ($Zlh,$Zhl,4);
189 &shrd ($Zhl,$Zhh,4);
190 &shr ($Zhh,4);
191 &xor ($Zhh,&DWP($off+16,"esp",$rem,4));
192
193 if ($i&1) {
194 &mov (&LB($rem),&BP($off+15-($i>>1),"esp"));
195 &and (&LB($rem),0xf0);
196 } else {
197 &mov (&LB($rem),&BP($off+15-($i>>1),"esp"));
198 &shl (&LB($rem),4);
199 }
200
201 &xor ($Zll,&DWP(8,$Htbl,$rem));
202 &xor ($Zlh,&DWP(12,$Htbl,$rem));
203 &xor ($Zhl,&DWP(0,$Htbl,$rem));
204 &xor ($Zhh,&DWP(4,$Htbl,$rem));
205 }
206 }
207 &bswap ($Zll);
208 &bswap ($Zlh);
209 &bswap ($Zhl);
210 if (!$x86only) {
211 &bswap ($Zhh);
212 } else {
213 &mov ("eax",$Zhh);
214 &bswap ("eax");
215 &mov ($Zhh,"eax");
216 }
217}
218
219if ($unroll) {
220 &function_begin_B("_x86_gmult_4bit_inner");
221 &x86_loop(4);
222 &ret ();
223 &function_end_B("_x86_gmult_4bit_inner");
224}
225
226sub deposit_rem_4bit {
227 my $bias = shift;
228
229 &mov (&DWP($bias+0, "esp"),0x0000<<16);
230 &mov (&DWP($bias+4, "esp"),0x1C20<<16);
231 &mov (&DWP($bias+8, "esp"),0x3840<<16);
232 &mov (&DWP($bias+12,"esp"),0x2460<<16);
233 &mov (&DWP($bias+16,"esp"),0x7080<<16);
234 &mov (&DWP($bias+20,"esp"),0x6CA0<<16);
235 &mov (&DWP($bias+24,"esp"),0x48C0<<16);
236 &mov (&DWP($bias+28,"esp"),0x54E0<<16);
237 &mov (&DWP($bias+32,"esp"),0xE100<<16);
238 &mov (&DWP($bias+36,"esp"),0xFD20<<16);
239 &mov (&DWP($bias+40,"esp"),0xD940<<16);
240 &mov (&DWP($bias+44,"esp"),0xC560<<16);
241 &mov (&DWP($bias+48,"esp"),0x9180<<16);
242 &mov (&DWP($bias+52,"esp"),0x8DA0<<16);
243 &mov (&DWP($bias+56,"esp"),0xA9C0<<16);
244 &mov (&DWP($bias+60,"esp"),0xB5E0<<16);
245}
246
247$suffix = $x86only ? "" : "_x86";
248
249&function_begin("gcm_gmult_4bit".$suffix);
250 &stack_push(16+4+1); # +1 for stack alignment
251 &mov ($inp,&wparam(0)); # load Xi
252 &mov ($Htbl,&wparam(1)); # load Htable
253
254 &mov ($Zhh,&DWP(0,$inp)); # load Xi[16]
255 &mov ($Zhl,&DWP(4,$inp));
256 &mov ($Zlh,&DWP(8,$inp));
257 &mov ($Zll,&DWP(12,$inp));
258
259 &deposit_rem_4bit(16);
260
261 &mov (&DWP(0,"esp"),$Zhh); # copy Xi[16] on stack
262 &mov (&DWP(4,"esp"),$Zhl);
263 &mov (&DWP(8,"esp"),$Zlh);
264 &mov (&DWP(12,"esp"),$Zll);
265 &shr ($Zll,20);
266 &and ($Zll,0xf0);
267
268 if ($unroll) {
269 &call ("_x86_gmult_4bit_inner");
270 } else {
271 &x86_loop(0);
272 &mov ($inp,&wparam(0));
273 }
274
275 &mov (&DWP(12,$inp),$Zll);
276 &mov (&DWP(8,$inp),$Zlh);
277 &mov (&DWP(4,$inp),$Zhl);
278 &mov (&DWP(0,$inp),$Zhh);
279 &stack_pop(16+4+1);
280&function_end("gcm_gmult_4bit".$suffix);
281
282&function_begin("gcm_ghash_4bit".$suffix);
283 &stack_push(16+4+1); # +1 for 64-bit alignment
284 &mov ($Zll,&wparam(0)); # load Xi
285 &mov ($Htbl,&wparam(1)); # load Htable
286 &mov ($inp,&wparam(2)); # load in
287 &mov ("ecx",&wparam(3)); # load len
288 &add ("ecx",$inp);
289 &mov (&wparam(3),"ecx");
290
291 &mov ($Zhh,&DWP(0,$Zll)); # load Xi[16]
292 &mov ($Zhl,&DWP(4,$Zll));
293 &mov ($Zlh,&DWP(8,$Zll));
294 &mov ($Zll,&DWP(12,$Zll));
295
296 &deposit_rem_4bit(16);
297
298 &set_label("x86_outer_loop",16);
299 &xor ($Zll,&DWP(12,$inp)); # xor with input
300 &xor ($Zlh,&DWP(8,$inp));
301 &xor ($Zhl,&DWP(4,$inp));
302 &xor ($Zhh,&DWP(0,$inp));
303 &mov (&DWP(12,"esp"),$Zll); # dump it on stack
304 &mov (&DWP(8,"esp"),$Zlh);
305 &mov (&DWP(4,"esp"),$Zhl);
306 &mov (&DWP(0,"esp"),$Zhh);
307
308 &shr ($Zll,20);
309 &and ($Zll,0xf0);
310
311 if ($unroll) {
312 &call ("_x86_gmult_4bit_inner");
313 } else {
314 &x86_loop(0);
315 &mov ($inp,&wparam(2));
316 }
317 &lea ($inp,&DWP(16,$inp));
318 &cmp ($inp,&wparam(3));
319 &mov (&wparam(2),$inp) if (!$unroll);
320 &jb (&label("x86_outer_loop"));
321
322 &mov ($inp,&wparam(0)); # load Xi
323 &mov (&DWP(12,$inp),$Zll);
324 &mov (&DWP(8,$inp),$Zlh);
325 &mov (&DWP(4,$inp),$Zhl);
326 &mov (&DWP(0,$inp),$Zhh);
327 &stack_pop(16+4+1);
328&function_end("gcm_ghash_4bit".$suffix);
329
330if (!$x86only) {{{
331
332&static_label("rem_4bit");
333
334if (!$sse2) {{ # pure-MMX "May" version...
335
336$S=12; # shift factor for rem_4bit
337
338&function_begin_B("_mmx_gmult_4bit_inner");
339# MMX version performs 3.5 times better on P4 (see comment in non-MMX
340# routine for further details), 100% better on Opteron, ~70% better
341# on Core2 and PIII... In other words the effort is considered to be well
342# spent... Since the initial release the loop was unrolled in order to
343# "liberate" the register previously used as the loop counter. Instead it's
344# used to optimize the critical path in 'Z.hi ^= rem_4bit[Z.lo&0xf]'.
345# The path involves a move of Z.lo from MMX to an integer register,
346# effective address calculation and finally a merge of the value into Z.hi.
347# The reference to rem_4bit is scheduled so late that I had to >>4 the
348# rem_4bit elements. This resulted in a 20-45% improvement
349# on contemporary µ-archs.
350{
351 my $cnt;
352 my $rem_4bit = "eax";
353 my @rem = ($Zhh,$Zll);
354 my $nhi = $Zhl;
355 my $nlo = $Zlh;
356
357 my ($Zlo,$Zhi) = ("mm0","mm1");
358 my $tmp = "mm2";
359
360 &xor ($nlo,$nlo); # avoid partial register stalls on PIII
361 &mov ($nhi,$Zll);
362 &mov (&LB($nlo),&LB($nhi));
363 &shl (&LB($nlo),4);
364 &and ($nhi,0xf0);
365 &movq ($Zlo,&QWP(8,$Htbl,$nlo));
366 &movq ($Zhi,&QWP(0,$Htbl,$nlo));
367 &movd ($rem[0],$Zlo);
368
369 for ($cnt=28;$cnt>=-2;$cnt--) {
370 my $odd = $cnt&1;
371 my $nix = $odd ? $nlo : $nhi;
372
373 &shl (&LB($nlo),4) if ($odd);
374 &psrlq ($Zlo,4);
375 &movq ($tmp,$Zhi);
376 &psrlq ($Zhi,4);
377 &pxor ($Zlo,&QWP(8,$Htbl,$nix));
378 &mov (&LB($nlo),&BP($cnt/2,$inp)) if (!$odd && $cnt>=0);
379 &psllq ($tmp,60);
380 &and ($nhi,0xf0) if ($odd);
381 &pxor ($Zhi,&QWP(0,$rem_4bit,$rem[1],8)) if ($cnt<28);
382 &and ($rem[0],0xf);
383 &pxor ($Zhi,&QWP(0,$Htbl,$nix));
384 &mov ($nhi,$nlo) if (!$odd && $cnt>=0);
385 &movd ($rem[1],$Zlo);
386 &pxor ($Zlo,$tmp);
387
388 push (@rem,shift(@rem)); # "rotate" registers
389 }
390
391 &mov ($inp,&DWP(4,$rem_4bit,$rem[1],8)); # last rem_4bit[rem]
392
393 &psrlq ($Zlo,32); # lower part of Zlo is already there
394 &movd ($Zhl,$Zhi);
395 &psrlq ($Zhi,32);
396 &movd ($Zlh,$Zlo);
397 &movd ($Zhh,$Zhi);
398 &shl ($inp,4); # compensate for rem_4bit[i] being >>4
399
400 &bswap ($Zll);
401 &bswap ($Zhl);
402 &bswap ($Zlh);
403 &xor ($Zhh,$inp);
404 &bswap ($Zhh);
405
406 &ret ();
407}
408&function_end_B("_mmx_gmult_4bit_inner");
409
410&function_begin("gcm_gmult_4bit_mmx");
411 &mov ($inp,&wparam(0)); # load Xi
412 &mov ($Htbl,&wparam(1)); # load Htable
413
414 &call (&label("pic_point"));
415 &set_label("pic_point");
416 &blindpop("eax");
417 &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
418
419 &movz ($Zll,&BP(15,$inp));
420
421 &call ("_mmx_gmult_4bit_inner");
422
423 &mov ($inp,&wparam(0)); # load Xi
424 &emms ();
425 &mov (&DWP(12,$inp),$Zll);
426 &mov (&DWP(4,$inp),$Zhl);
427 &mov (&DWP(8,$inp),$Zlh);
428 &mov (&DWP(0,$inp),$Zhh);
429&function_end("gcm_gmult_4bit_mmx");
430
431# Streamed version performs 20% better on P4, 7% on Opteron,
432# 10% on Core2 and PIII...
433&function_begin("gcm_ghash_4bit_mmx");
434 &mov ($Zhh,&wparam(0)); # load Xi
435 &mov ($Htbl,&wparam(1)); # load Htable
436 &mov ($inp,&wparam(2)); # load in
437 &mov ($Zlh,&wparam(3)); # load len
438
439 &call (&label("pic_point"));
440 &set_label("pic_point");
441 &blindpop("eax");
442 &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
443
444 &add ($Zlh,$inp);
445 &mov (&wparam(3),$Zlh); # len to point at the end of input
446 &stack_push(4+1); # +1 for stack alignment
447
448 &mov ($Zll,&DWP(12,$Zhh)); # load Xi[16]
449 &mov ($Zhl,&DWP(4,$Zhh));
450 &mov ($Zlh,&DWP(8,$Zhh));
451 &mov ($Zhh,&DWP(0,$Zhh));
452 &jmp (&label("mmx_outer_loop"));
453
454 &set_label("mmx_outer_loop",16);
455 &xor ($Zll,&DWP(12,$inp));
456 &xor ($Zhl,&DWP(4,$inp));
457 &xor ($Zlh,&DWP(8,$inp));
458 &xor ($Zhh,&DWP(0,$inp));
459 &mov (&wparam(2),$inp);
460 &mov (&DWP(12,"esp"),$Zll);
461 &mov (&DWP(4,"esp"),$Zhl);
462 &mov (&DWP(8,"esp"),$Zlh);
463 &mov (&DWP(0,"esp"),$Zhh);
464
465 &mov ($inp,"esp");
466 &shr ($Zll,24);
467
468 &call ("_mmx_gmult_4bit_inner");
469
470 &mov ($inp,&wparam(2));
471 &lea ($inp,&DWP(16,$inp));
472 &cmp ($inp,&wparam(3));
473 &jb (&label("mmx_outer_loop"));
474
475 &mov ($inp,&wparam(0)); # load Xi
476 &emms ();
477 &mov (&DWP(12,$inp),$Zll);
478 &mov (&DWP(4,$inp),$Zhl);
479 &mov (&DWP(8,$inp),$Zlh);
480 &mov (&DWP(0,$inp),$Zhh);
481
482 &stack_pop(4+1);
483&function_end("gcm_ghash_4bit_mmx");
484
485}} else {{ # "June" MMX version...
486	# ... has a slower "April" gcm_gmult_4bit_mmx with a folded
487	# loop. This is done to conserve code size...
488$S=16; # shift factor for rem_4bit
489
490sub mmx_loop() {
491# MMX version performs 2.8 times better on P4 (see comment in non-MMX
492# routine for further details), 40% better on Opteron and Core2, 50%
493# better on PIII... In other words effort is considered to be well
494# spent...
495 my $inp = shift;
496 my $rem_4bit = shift;
497 my $cnt = $Zhh;
498 my $nhi = $Zhl;
499 my $nlo = $Zlh;
500 my $rem = $Zll;
501
502 my ($Zlo,$Zhi) = ("mm0","mm1");
503 my $tmp = "mm2";
504
505 &xor ($nlo,$nlo); # avoid partial register stalls on PIII
506 &mov ($nhi,$Zll);
507 &mov (&LB($nlo),&LB($nhi));
508 &mov ($cnt,14);
509 &shl (&LB($nlo),4);
510 &and ($nhi,0xf0);
511 &movq ($Zlo,&QWP(8,$Htbl,$nlo));
512 &movq ($Zhi,&QWP(0,$Htbl,$nlo));
513 &movd ($rem,$Zlo);
514 &jmp (&label("mmx_loop"));
515
516 &set_label("mmx_loop",16);
517 &psrlq ($Zlo,4);
518 &and ($rem,0xf);
519 &movq ($tmp,$Zhi);
520 &psrlq ($Zhi,4);
521 &pxor ($Zlo,&QWP(8,$Htbl,$nhi));
522 &mov (&LB($nlo),&BP(0,$inp,$cnt));
523 &psllq ($tmp,60);
524 &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
525 &dec ($cnt);
526 &movd ($rem,$Zlo);
527 &pxor ($Zhi,&QWP(0,$Htbl,$nhi));
528 &mov ($nhi,$nlo);
529 &pxor ($Zlo,$tmp);
530 &js (&label("mmx_break"));
531
532 &shl (&LB($nlo),4);
533 &and ($rem,0xf);
534 &psrlq ($Zlo,4);
535 &and ($nhi,0xf0);
536 &movq ($tmp,$Zhi);
537 &psrlq ($Zhi,4);
538 &pxor ($Zlo,&QWP(8,$Htbl,$nlo));
539 &psllq ($tmp,60);
540 &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
541 &movd ($rem,$Zlo);
542 &pxor ($Zhi,&QWP(0,$Htbl,$nlo));
543 &pxor ($Zlo,$tmp);
544 &jmp (&label("mmx_loop"));
545
546 &set_label("mmx_break",16);
547 &shl (&LB($nlo),4);
548 &and ($rem,0xf);
549 &psrlq ($Zlo,4);
550 &and ($nhi,0xf0);
551 &movq ($tmp,$Zhi);
552 &psrlq ($Zhi,4);
553 &pxor ($Zlo,&QWP(8,$Htbl,$nlo));
554 &psllq ($tmp,60);
555 &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
556 &movd ($rem,$Zlo);
557 &pxor ($Zhi,&QWP(0,$Htbl,$nlo));
558 &pxor ($Zlo,$tmp);
559
560 &psrlq ($Zlo,4);
561 &and ($rem,0xf);
562 &movq ($tmp,$Zhi);
563 &psrlq ($Zhi,4);
564 &pxor ($Zlo,&QWP(8,$Htbl,$nhi));
565 &psllq ($tmp,60);
566 &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
567 &movd ($rem,$Zlo);
568 &pxor ($Zhi,&QWP(0,$Htbl,$nhi));
569 &pxor ($Zlo,$tmp);
570
571 &psrlq ($Zlo,32); # lower part of Zlo is already there
572 &movd ($Zhl,$Zhi);
573 &psrlq ($Zhi,32);
574 &movd ($Zlh,$Zlo);
575 &movd ($Zhh,$Zhi);
576
577 &bswap ($Zll);
578 &bswap ($Zhl);
579 &bswap ($Zlh);
580 &bswap ($Zhh);
581}
582
583&function_begin("gcm_gmult_4bit_mmx");
584 &mov ($inp,&wparam(0)); # load Xi
585 &mov ($Htbl,&wparam(1)); # load Htable
586
587 &call (&label("pic_point"));
588 &set_label("pic_point");
589 &blindpop("eax");
590 &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
591
592 &movz ($Zll,&BP(15,$inp));
593
594 &mmx_loop($inp,"eax");
595
596 &emms ();
597 &mov (&DWP(12,$inp),$Zll);
598 &mov (&DWP(4,$inp),$Zhl);
599 &mov (&DWP(8,$inp),$Zlh);
600 &mov (&DWP(0,$inp),$Zhh);
601&function_end("gcm_gmult_4bit_mmx");
602
603######################################################################
604# Below subroutine is "528B" variant of "4-bit" GCM GHASH function
605# (see gcm128.c for details). It provides a further 20-40% performance
606# improvement over the above-mentioned "May" version.
607
608&static_label("rem_8bit");
609
610&function_begin("gcm_ghash_4bit_mmx");
611{ my ($Zlo,$Zhi) = ("mm7","mm6");
612 my $rem_8bit = "esi";
613 my $Htbl = "ebx";
614
615 # parameter block
616 &mov ("eax",&wparam(0)); # Xi
617 &mov ("ebx",&wparam(1)); # Htable
618 &mov ("ecx",&wparam(2)); # inp
619 &mov ("edx",&wparam(3)); # len
620 &mov ("ebp","esp"); # original %esp
621 &call (&label("pic_point"));
622 &set_label ("pic_point");
623 &blindpop ($rem_8bit);
624 &lea ($rem_8bit,&DWP(&label("rem_8bit")."-".&label("pic_point"),$rem_8bit));
625
626 &sub ("esp",512+16+16); # allocate stack frame...
627 &and ("esp",-64); # ...and align it
628 &sub ("esp",16); # place for (u8)(H[]<<4)
629
630 &add ("edx","ecx"); # pointer to the end of input
631 &mov (&DWP(528+16+0,"esp"),"eax"); # save Xi
632 &mov (&DWP(528+16+8,"esp"),"edx"); # save inp+len
633 &mov (&DWP(528+16+12,"esp"),"ebp"); # save original %esp
634
635 { my @lo = ("mm0","mm1","mm2");
636 my @hi = ("mm3","mm4","mm5");
637 my @tmp = ("mm6","mm7");
638    my ($off1,$off2,$i)=(0,0,0);
639
640 &add ($Htbl,128); # optimize for size
641 &lea ("edi",&DWP(16+128,"esp"));
642 &lea ("ebp",&DWP(16+256+128,"esp"));
643
644 # decompose Htable (low and high parts are kept separately),
645 # generate Htable[]>>4, (u8)(Htable[]<<4), save to stack...
646 for ($i=0;$i<18;$i++) {
647
648 &mov ("edx",&DWP(16*$i+8-128,$Htbl)) if ($i<16);
649 &movq ($lo[0],&QWP(16*$i+8-128,$Htbl)) if ($i<16);
650 &psllq ($tmp[1],60) if ($i>1);
651 &movq ($hi[0],&QWP(16*$i+0-128,$Htbl)) if ($i<16);
652 &por ($lo[2],$tmp[1]) if ($i>1);
653 &movq (&QWP($off1-128,"edi"),$lo[1]) if ($i>0 && $i<17);
654 &psrlq ($lo[1],4) if ($i>0 && $i<17);
655 &movq (&QWP($off1,"edi"),$hi[1]) if ($i>0 && $i<17);
656 &movq ($tmp[0],$hi[1]) if ($i>0 && $i<17);
657 &movq (&QWP($off2-128,"ebp"),$lo[2]) if ($i>1);
658 &psrlq ($hi[1],4) if ($i>0 && $i<17);
659 &movq (&QWP($off2,"ebp"),$hi[2]) if ($i>1);
660 &shl ("edx",4) if ($i<16);
661 &mov (&BP($i,"esp"),&LB("edx")) if ($i<16);
662
663 unshift (@lo,pop(@lo)); # "rotate" registers
664 unshift (@hi,pop(@hi));
665 unshift (@tmp,pop(@tmp));
666 $off1 += 8 if ($i>0);
667 $off2 += 8 if ($i>1);
668 }
669 }
670
671 &movq ($Zhi,&QWP(0,"eax"));
672 &mov ("ebx",&DWP(8,"eax"));
673 &mov ("edx",&DWP(12,"eax")); # load Xi
674
675&set_label("outer",16);
676 { my $nlo = "eax";
677 my $dat = "edx";
678 my @nhi = ("edi","ebp");
679 my @rem = ("ebx","ecx");
680 my @red = ("mm0","mm1","mm2");
681 my $tmp = "mm3";
682
683 &xor ($dat,&DWP(12,"ecx")); # merge input data
684 &xor ("ebx",&DWP(8,"ecx"));
685 &pxor ($Zhi,&QWP(0,"ecx"));
686 &lea ("ecx",&DWP(16,"ecx")); # inp+=16
687 #&mov (&DWP(528+12,"esp"),$dat); # save inp^Xi
688 &mov (&DWP(528+8,"esp"),"ebx");
689 &movq (&QWP(528+0,"esp"),$Zhi);
690 &mov (&DWP(528+16+4,"esp"),"ecx"); # save inp
691
692 &xor ($nlo,$nlo);
693 &rol ($dat,8);
694 &mov (&LB($nlo),&LB($dat));
695 &mov ($nhi[1],$nlo);
696 &and (&LB($nlo),0x0f);
697 &shr ($nhi[1],4);
698 &pxor ($red[0],$red[0]);
699 &rol ($dat,8); # next byte
700 &pxor ($red[1],$red[1]);
701 &pxor ($red[2],$red[2]);
702
703	# Just like in the "May" version, modulo-schedule for the critical path in
704	# 'Z.hi ^= rem_8bit[Z.lo&0xff^((u8)H[nhi]<<4)]<<48'. The final 'pxor'
705	# is scheduled so late that rem_8bit[] has to be shifted *right*
706	# by 16, which is why the last argument to pinsrw is 2, which
707 # corresponds to <<32=<<48>>16...
708 for ($j=11,$i=0;$i<15;$i++) {
709
710 if ($i>0) {
711 &pxor ($Zlo,&QWP(16,"esp",$nlo,8)); # Z^=H[nlo]
712 &rol ($dat,8); # next byte
713 &pxor ($Zhi,&QWP(16+128,"esp",$nlo,8));
714
715 &pxor ($Zlo,$tmp);
716 &pxor ($Zhi,&QWP(16+256+128,"esp",$nhi[0],8));
717 &xor (&LB($rem[1]),&BP(0,"esp",$nhi[0])); # rem^(H[nhi]<<4)
718 } else {
719 &movq ($Zlo,&QWP(16,"esp",$nlo,8));
720 &movq ($Zhi,&QWP(16+128,"esp",$nlo,8));
721 }
722
723 &mov (&LB($nlo),&LB($dat));
724 &mov ($dat,&DWP(528+$j,"esp")) if (--$j%4==0);
725
726 &movd ($rem[0],$Zlo);
727 &movz ($rem[1],&LB($rem[1])) if ($i>0);
728 &psrlq ($Zlo,8); # Z>>=8
729
730 &movq ($tmp,$Zhi);
731 &mov ($nhi[0],$nlo);
732 &psrlq ($Zhi,8);
733
734 &pxor ($Zlo,&QWP(16+256+0,"esp",$nhi[1],8)); # Z^=H[nhi]>>4
735 &and (&LB($nlo),0x0f);
736 &psllq ($tmp,56);
737
738 &pxor ($Zhi,$red[1]) if ($i>1);
739 &shr ($nhi[0],4);
740 &pinsrw ($red[0],&WP(0,$rem_8bit,$rem[1],2),2) if ($i>0);
741
742 unshift (@red,pop(@red)); # "rotate" registers
743 unshift (@rem,pop(@rem));
744 unshift (@nhi,pop(@nhi));
745 }
746
747 &pxor ($Zlo,&QWP(16,"esp",$nlo,8)); # Z^=H[nlo]
748 &pxor ($Zhi,&QWP(16+128,"esp",$nlo,8));
749 &xor (&LB($rem[1]),&BP(0,"esp",$nhi[0])); # rem^(H[nhi]<<4)
750
751 &pxor ($Zlo,$tmp);
752 &pxor ($Zhi,&QWP(16+256+128,"esp",$nhi[0],8));
753 &movz ($rem[1],&LB($rem[1]));
754
755 &pxor ($red[2],$red[2]); # clear 2nd word
756 &psllq ($red[1],4);
757
758 &movd ($rem[0],$Zlo);
759 &psrlq ($Zlo,4); # Z>>=4
760
761 &movq ($tmp,$Zhi);
762 &psrlq ($Zhi,4);
763 &shl ($rem[0],4); # rem<<4
764
765 &pxor ($Zlo,&QWP(16,"esp",$nhi[1],8)); # Z^=H[nhi]
766 &psllq ($tmp,60);
767 &movz ($rem[0],&LB($rem[0]));
768
769 &pxor ($Zlo,$tmp);
770 &pxor ($Zhi,&QWP(16+128,"esp",$nhi[1],8));
771
772 &pinsrw ($red[0],&WP(0,$rem_8bit,$rem[1],2),2);
773 &pxor ($Zhi,$red[1]);
774
775 &movd ($dat,$Zlo);
776 &pinsrw ($red[2],&WP(0,$rem_8bit,$rem[0],2),3); # last is <<48
777
778 &psllq ($red[0],12); # correct by <<16>>4
779 &pxor ($Zhi,$red[0]);
780 &psrlq ($Zlo,32);
781 &pxor ($Zhi,$red[2]);
782
783 &mov ("ecx",&DWP(528+16+4,"esp")); # restore inp
784 &movd ("ebx",$Zlo);
785 &movq ($tmp,$Zhi); # 01234567
786 &psllw ($Zhi,8); # 1.3.5.7.
787 &psrlw ($tmp,8); # .0.2.4.6
788 &por ($Zhi,$tmp); # 10325476
789 &bswap ($dat);
790 &pshufw ($Zhi,$Zhi,0b00011011); # 76543210
791 &bswap ("ebx");
792
793 &cmp ("ecx",&DWP(528+16+8,"esp")); # are we done?
794 &jne (&label("outer"));
795 }
796
797 &mov ("eax",&DWP(528+16+0,"esp")); # restore Xi
798 &mov (&DWP(12,"eax"),"edx");
799 &mov (&DWP(8,"eax"),"ebx");
800 &movq (&QWP(0,"eax"),$Zhi);
801
802 &mov ("esp",&DWP(528+16+12,"esp")); # restore original %esp
803 &emms ();
804}
805&function_end("gcm_ghash_4bit_mmx");
806}}
807
808if ($sse2) {{
809######################################################################
810# PCLMULQDQ version.
811
812$Xip="eax";
813$Htbl="edx";
814$const="ecx";
815$inp="esi";
816$len="ebx";
817
818($Xi,$Xhi)=("xmm0","xmm1"); $Hkey="xmm2";
819($T1,$T2,$T3)=("xmm3","xmm4","xmm5");
820($Xn,$Xhn)=("xmm6","xmm7");
821
822&static_label("bswap");
823
824sub clmul64x64_T2 { # minimal "register" pressure
825my ($Xhi,$Xi,$Hkey)=@_;
826
827 &movdqa ($Xhi,$Xi); #
828 &pshufd ($T1,$Xi,0b01001110);
829 &pshufd ($T2,$Hkey,0b01001110);
830 &pxor ($T1,$Xi); #
831 &pxor ($T2,$Hkey);
832
833 &pclmulqdq ($Xi,$Hkey,0x00); #######
834 &pclmulqdq ($Xhi,$Hkey,0x11); #######
835 &pclmulqdq ($T1,$T2,0x00); #######
836 &xorps ($T1,$Xi); #
837 &xorps ($T1,$Xhi); #
838
839 &movdqa ($T2,$T1); #
840 &psrldq ($T1,8);
841 &pslldq ($T2,8); #
842 &pxor ($Xhi,$T1);
843 &pxor ($Xi,$T2); #
844}
845
846sub clmul64x64_T3 {
847# Even though this subroutine offers visually better ILP, it
848# was empirically found to be a tad slower than the above version,
849# at least in gcm_ghash_clmul context. But it's just as well,
850# because loop modulo-scheduling is possible only thanks to
851# minimized "register" pressure...
852my ($Xhi,$Xi,$Hkey)=@_;
853
854 &movdqa ($T1,$Xi); #
855 &movdqa ($Xhi,$Xi);
856 &pclmulqdq ($Xi,$Hkey,0x00); #######
857 &pclmulqdq ($Xhi,$Hkey,0x11); #######
858 &pshufd ($T2,$T1,0b01001110); #
859 &pshufd ($T3,$Hkey,0b01001110);
860 &pxor ($T2,$T1); #
861 &pxor ($T3,$Hkey);
862 &pclmulqdq ($T2,$T3,0x00); #######
863 &pxor ($T2,$Xi); #
864 &pxor ($T2,$Xhi); #
865
866 &movdqa ($T3,$T2); #
867 &psrldq ($T2,8);
868 &pslldq ($T3,8); #
869 &pxor ($Xhi,$T2);
870 &pxor ($Xi,$T3); #
871}
872
873if (1) { # Algorithm 9 with <<1 twist.
874 # Reduction is shorter and uses only two
875					# temporary registers, which makes it a better
876					# candidate for interleaving with 64x64
877					# multiplication. The pre-modulo-scheduled loop
878 # was found to be ~20% faster than Algorithm 5
879 # below. Algorithm 9 was therefore chosen for
880 # further optimization...
881
882sub reduction_alg9 { # 17/13 times faster than Intel version
883my ($Xhi,$Xi) = @_;
884
885 # 1st phase
886	&movdqa	($T1,$Xi);		#
887 &psllq ($Xi,1);
888 &pxor ($Xi,$T1); #
889 &psllq ($Xi,5); #
890 &pxor ($Xi,$T1); #
891 &psllq ($Xi,57); #
892 &movdqa ($T2,$Xi); #
893 &pslldq ($Xi,8);
894 &psrldq ($T2,8); #
895 &pxor ($Xi,$T1);
896 &pxor ($Xhi,$T2); #
897
898 # 2nd phase
899 &movdqa ($T2,$Xi);
900 &psrlq ($Xi,5);
901 &pxor ($Xi,$T2); #
902 &psrlq ($Xi,1); #
903 &pxor ($Xi,$T2); #
904 &pxor ($T2,$Xhi);
905 &psrlq ($Xi,1); #
906 &pxor ($Xi,$T2); #
907}
908
909&function_begin_B("gcm_init_clmul");
910 &mov ($Htbl,&wparam(0));
911 &mov ($Xip,&wparam(1));
912
913 &call (&label("pic"));
914&set_label("pic");
915 &blindpop ($const);
916 &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
917
918 &movdqu ($Hkey,&QWP(0,$Xip));
919 &pshufd ($Hkey,$Hkey,0b01001110);# dword swap
920
921 # <<1 twist
922 &pshufd ($T2,$Hkey,0b11111111); # broadcast uppermost dword
923 &movdqa ($T1,$Hkey);
924 &psllq ($Hkey,1);
925 &pxor ($T3,$T3); #
926 &psrlq ($T1,63);
927 &pcmpgtd ($T3,$T2); # broadcast carry bit
928 &pslldq ($T1,8);
929 &por ($Hkey,$T1); # H<<=1
930
931 # magic reduction
932 &pand ($T3,&QWP(16,$const)); # 0x1c2_polynomial
933 &pxor ($Hkey,$T3); # if(carry) H^=0x1c2_polynomial
934
935 # calculate H^2
936 &movdqa ($Xi,$Hkey);
937 &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
938 &reduction_alg9 ($Xhi,$Xi);
939
940 &movdqu (&QWP(0,$Htbl),$Hkey); # save H
941 &movdqu (&QWP(16,$Htbl),$Xi); # save H^2
942
943 &ret ();
944&function_end_B("gcm_init_clmul");
945
946&function_begin_B("gcm_gmult_clmul");
947 &mov ($Xip,&wparam(0));
948 &mov ($Htbl,&wparam(1));
949
950 &call (&label("pic"));
951&set_label("pic");
952 &blindpop ($const);
953 &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
954
955 &movdqu ($Xi,&QWP(0,$Xip));
956 &movdqa ($T3,&QWP(0,$const));
957 &movups ($Hkey,&QWP(0,$Htbl));
958 &pshufb ($Xi,$T3);
959
960 &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
961 &reduction_alg9 ($Xhi,$Xi);
962
963 &pshufb ($Xi,$T3);
964 &movdqu (&QWP(0,$Xip),$Xi);
965
966 &ret ();
967&function_end_B("gcm_gmult_clmul");
968
969&function_begin("gcm_ghash_clmul");
970 &mov ($Xip,&wparam(0));
971 &mov ($Htbl,&wparam(1));
972 &mov ($inp,&wparam(2));
973 &mov ($len,&wparam(3));
974
975 &call (&label("pic"));
976&set_label("pic");
977 &blindpop ($const);
978 &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
979
980 &movdqu ($Xi,&QWP(0,$Xip));
981 &movdqa ($T3,&QWP(0,$const));
982 &movdqu ($Hkey,&QWP(0,$Htbl));
983 &pshufb ($Xi,$T3);
984
985 &sub ($len,0x10);
986 &jz (&label("odd_tail"));
987
988 #######
989 # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
990 # [(H*Ii+1) + (H*Xi+1)] mod P =
991 # [(H*Ii+1) + H^2*(Ii+Xi)] mod P
992 #
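	# (In other words: with H^2 precomputed by gcm_init_clmul, two input
	# blocks are folded per iteration and a single modulo-reduction
	# serves the pair - the Naggr=2 aggregation discussed in the header
	# comment.)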
993 &movdqu ($T1,&QWP(0,$inp)); # Ii
994 &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
995 &pshufb ($T1,$T3);
996 &pshufb ($Xn,$T3);
997 &pxor ($Xi,$T1); # Ii+Xi
998
999 &clmul64x64_T2 ($Xhn,$Xn,$Hkey); # H*Ii+1
1000 &movups ($Hkey,&QWP(16,$Htbl)); # load H^2
1001
1002 &lea ($inp,&DWP(32,$inp)); # i+=2
1003 &sub ($len,0x20);
1004 &jbe (&label("even_tail"));
1005
1006&set_label("mod_loop");
1007 &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
1008 &movdqu ($T1,&QWP(0,$inp)); # Ii
1009 &movups ($Hkey,&QWP(0,$Htbl)); # load H
1010
1011 &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
1012 &pxor ($Xhi,$Xhn);
1013
1014 &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
1015 &pshufb ($T1,$T3);
1016 &pshufb ($Xn,$T3);
1017
1018 &movdqa ($T3,$Xn); #&clmul64x64_TX ($Xhn,$Xn,$Hkey); H*Ii+1
1019 &movdqa ($Xhn,$Xn);
1020 &pxor ($Xhi,$T1); # "Ii+Xi", consume early
1021
1022	&movdqa	($T1,$Xi);	#&reduction_alg9($Xhi,$Xi); 1st phase
1023 &psllq ($Xi,1);
1024 &pxor ($Xi,$T1); #
1025 &psllq ($Xi,5); #
1026 &pxor ($Xi,$T1); #
1027 &pclmulqdq ($Xn,$Hkey,0x00); #######
1028 &psllq ($Xi,57); #
1029 &movdqa ($T2,$Xi); #
1030 &pslldq ($Xi,8);
1031 &psrldq ($T2,8); #
1032 &pxor ($Xi,$T1);
1033 &pshufd ($T1,$T3,0b01001110);
1034 &pxor ($Xhi,$T2); #
1035 &pxor ($T1,$T3);
1036 &pshufd ($T3,$Hkey,0b01001110);
1037 &pxor ($T3,$Hkey); #
1038
1039 &pclmulqdq ($Xhn,$Hkey,0x11); #######
1040 &movdqa ($T2,$Xi); # 2nd phase
1041 &psrlq ($Xi,5);
1042 &pxor ($Xi,$T2); #
1043 &psrlq ($Xi,1); #
1044 &pxor ($Xi,$T2); #
1045 &pxor ($T2,$Xhi);
1046 &psrlq ($Xi,1); #
1047 &pxor ($Xi,$T2); #
1048
1049 &pclmulqdq ($T1,$T3,0x00); #######
1050 &movups ($Hkey,&QWP(16,$Htbl)); # load H^2
1051 &xorps ($T1,$Xn); #
1052 &xorps ($T1,$Xhn); #
1053
1054 &movdqa ($T3,$T1); #
1055 &psrldq ($T1,8);
1056 &pslldq ($T3,8); #
1057 &pxor ($Xhn,$T1);
1058 &pxor ($Xn,$T3); #
1059 &movdqa ($T3,&QWP(0,$const));
1060
1061 &lea ($inp,&DWP(32,$inp));
1062 &sub ($len,0x20);
1063 &ja (&label("mod_loop"));
1064
1065&set_label("even_tail");
1066 &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
1067
1068 &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
1069 &pxor ($Xhi,$Xhn);
1070
1071 &reduction_alg9 ($Xhi,$Xi);
1072
1073 &test ($len,$len);
1074 &jnz (&label("done"));
1075
1076 &movups ($Hkey,&QWP(0,$Htbl)); # load H
1077&set_label("odd_tail");
1078 &movdqu ($T1,&QWP(0,$inp)); # Ii
1079 &pshufb ($T1,$T3);
1080 &pxor ($Xi,$T1); # Ii+Xi
1081
1082 &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
1083 &reduction_alg9 ($Xhi,$Xi);
1084
1085&set_label("done");
1086 &pshufb ($Xi,$T3);
1087 &movdqu (&QWP(0,$Xip),$Xi);
1088&function_end("gcm_ghash_clmul");
1089
1090} else {				# Algorithm 5. Kept for reference purposes.
1091
1092sub reduction_alg5 { # 19/16 times faster than Intel version
1093my ($Xhi,$Xi)=@_;
1094
1095 # <<1
1096 &movdqa ($T1,$Xi); #
1097 &movdqa ($T2,$Xhi);
1098 &pslld ($Xi,1);
1099 &pslld ($Xhi,1); #
1100 &psrld ($T1,31);
1101 &psrld ($T2,31); #
1102 &movdqa ($T3,$T1);
1103 &pslldq ($T1,4);
1104 &psrldq ($T3,12); #
1105 &pslldq ($T2,4);
1106 &por ($Xhi,$T3); #
1107 &por ($Xi,$T1);
1108 &por ($Xhi,$T2); #
1109
1110 # 1st phase
1111 &movdqa ($T1,$Xi);
1112 &movdqa ($T2,$Xi);
1113 &movdqa ($T3,$Xi); #
1114 &pslld ($T1,31);
1115 &pslld ($T2,30);
1116 &pslld ($Xi,25); #
1117 &pxor ($T1,$T2);
1118 &pxor ($T1,$Xi); #
1119 &movdqa ($T2,$T1); #
1120 &pslldq ($T1,12);
1121 &psrldq ($T2,4); #
1122 &pxor ($T3,$T1);
1123
1124 # 2nd phase
1125 &pxor ($Xhi,$T3); #
1126 &movdqa ($Xi,$T3);
1127 &movdqa ($T1,$T3);
1128 &psrld ($Xi,1); #
1129 &psrld ($T1,2);
1130 &psrld ($T3,7); #
1131 &pxor ($Xi,$T1);
1132 &pxor ($Xhi,$T2);
1133 &pxor ($Xi,$T3); #
1134 &pxor ($Xi,$Xhi); #
1135}
1136
1137&function_begin_B("gcm_init_clmul");
1138 &mov ($Htbl,&wparam(0));
1139 &mov ($Xip,&wparam(1));
1140
1141 &call (&label("pic"));
1142&set_label("pic");
1143 &blindpop ($const);
1144 &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
1145
1146 &movdqu ($Hkey,&QWP(0,$Xip));
1147 &pshufd ($Hkey,$Hkey,0b01001110);# dword swap
1148
1149 # calculate H^2
1150 &movdqa ($Xi,$Hkey);
1151 &clmul64x64_T3 ($Xhi,$Xi,$Hkey);
1152 &reduction_alg5 ($Xhi,$Xi);
1153
1154 &movdqu (&QWP(0,$Htbl),$Hkey); # save H
1155 &movdqu (&QWP(16,$Htbl),$Xi); # save H^2
1156
1157 &ret ();
1158&function_end_B("gcm_init_clmul");
1159
1160&function_begin_B("gcm_gmult_clmul");
1161 &mov ($Xip,&wparam(0));
1162 &mov ($Htbl,&wparam(1));
1163
1164 &call (&label("pic"));
1165&set_label("pic");
1166 &blindpop ($const);
1167 &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
1168
1169 &movdqu ($Xi,&QWP(0,$Xip));
1170 &movdqa ($Xn,&QWP(0,$const));
1171 &movdqu ($Hkey,&QWP(0,$Htbl));
1172 &pshufb ($Xi,$Xn);
1173
1174 &clmul64x64_T3 ($Xhi,$Xi,$Hkey);
1175 &reduction_alg5 ($Xhi,$Xi);
1176
1177 &pshufb ($Xi,$Xn);
1178 &movdqu (&QWP(0,$Xip),$Xi);
1179
1180 &ret ();
1181&function_end_B("gcm_gmult_clmul");
1182
1183&function_begin("gcm_ghash_clmul");
1184 &mov ($Xip,&wparam(0));
1185 &mov ($Htbl,&wparam(1));
1186 &mov ($inp,&wparam(2));
1187 &mov ($len,&wparam(3));
1188
1189 &call (&label("pic"));
1190&set_label("pic");
1191 &blindpop ($const);
1192 &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
1193
1194 &movdqu ($Xi,&QWP(0,$Xip));
1195 &movdqa ($T3,&QWP(0,$const));
1196 &movdqu ($Hkey,&QWP(0,$Htbl));
1197 &pshufb ($Xi,$T3);
1198
1199 &sub ($len,0x10);
1200 &jz (&label("odd_tail"));
1201
1202 #######
1203 # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
1204 # [(H*Ii+1) + (H*Xi+1)] mod P =
1205 # [(H*Ii+1) + H^2*(Ii+Xi)] mod P
1206 #
1207 &movdqu ($T1,&QWP(0,$inp)); # Ii
1208 &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
1209 &pshufb ($T1,$T3);
1210 &pshufb ($Xn,$T3);
1211 &pxor ($Xi,$T1); # Ii+Xi
1212
1213 &clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1
1214 &movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2
1215
1216 &sub ($len,0x20);
1217 &lea ($inp,&DWP(32,$inp)); # i+=2
1218 &jbe (&label("even_tail"));
1219
1220&set_label("mod_loop");
1221 &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
1222 &movdqu ($Hkey,&QWP(0,$Htbl)); # load H
1223
1224 &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
1225 &pxor ($Xhi,$Xhn);
1226
1227 &reduction_alg5 ($Xhi,$Xi);
1228
1229 #######
1230 &movdqa ($T3,&QWP(0,$const));
1231 &movdqu ($T1,&QWP(0,$inp)); # Ii
1232 &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
1233 &pshufb ($T1,$T3);
1234 &pshufb ($Xn,$T3);
1235 &pxor ($Xi,$T1); # Ii+Xi
1236
1237 &clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1
1238 &movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2
1239
1240 &sub ($len,0x20);
1241 &lea ($inp,&DWP(32,$inp));
1242 &ja (&label("mod_loop"));
1243
1244&set_label("even_tail");
1245 &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
1246
1247 &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
1248 &pxor ($Xhi,$Xhn);
1249
1250 &reduction_alg5 ($Xhi,$Xi);
1251
1252 &movdqa ($T3,&QWP(0,$const));
1253 &test ($len,$len);
1254 &jnz (&label("done"));
1255
1256 &movdqu ($Hkey,&QWP(0,$Htbl)); # load H
1257&set_label("odd_tail");
1258 &movdqu ($T1,&QWP(0,$inp)); # Ii
1259 &pshufb ($T1,$T3);
1260 &pxor ($Xi,$T1); # Ii+Xi
1261
1262 &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
1263 &reduction_alg5 ($Xhi,$Xi);
1264
1265 &movdqa ($T3,&QWP(0,$const));
1266&set_label("done");
1267 &pshufb ($Xi,$T3);
1268 &movdqu (&QWP(0,$Xip),$Xi);
1269&function_end("gcm_ghash_clmul");
1270
1271}
1272
1273&set_label("bswap",64);
1274 &data_byte(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
1275 &data_byte(1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2); # 0x1c2_polynomial
1276}} # $sse2
1277
1278&set_label("rem_4bit",64);
1279 &data_word(0,0x0000<<$S,0,0x1C20<<$S,0,0x3840<<$S,0,0x2460<<$S);
1280 &data_word(0,0x7080<<$S,0,0x6CA0<<$S,0,0x48C0<<$S,0,0x54E0<<$S);
1281 &data_word(0,0xE100<<$S,0,0xFD20<<$S,0,0xD940<<$S,0,0xC560<<$S);
1282 &data_word(0,0x9180<<$S,0,0x8DA0<<$S,0,0xA9C0<<$S,0,0xB5E0<<$S);
1283&set_label("rem_8bit",64);
1284 &data_short(0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E);
1285 &data_short(0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E);
1286 &data_short(0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E);
1287 &data_short(0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E);
1288 &data_short(0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E);
1289 &data_short(0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E);
1290 &data_short(0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E);
1291 &data_short(0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E);
1292 &data_short(0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE);
1293 &data_short(0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE);
1294 &data_short(0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE);
1295 &data_short(0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE);
1296 &data_short(0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E);
1297 &data_short(0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E);
1298 &data_short(0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE);
1299 &data_short(0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE);
1300 &data_short(0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E);
1301 &data_short(0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E);
1302 &data_short(0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E);
1303 &data_short(0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E);
1304 &data_short(0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E);
1305 &data_short(0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E);
1306 &data_short(0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E);
1307 &data_short(0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E);
1308 &data_short(0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE);
1309 &data_short(0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE);
1310 &data_short(0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE);
1311 &data_short(0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE);
1312 &data_short(0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E);
1313 &data_short(0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E);
1314 &data_short(0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE);
1315 &data_short(0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE);
1316}}} # !$x86only
1317
1318&asciz("GHASH for x86, CRYPTOGAMS by <appro\@openssl.org>");
1319&asm_finish();
1320
1321# A question was raised about the choice of vanilla MMX. Or rather why wasn't
1322# SSE2 chosen instead? In addition to the fact that MMX runs on legacy
1323# CPUs such as PIII, "4-bit" MMX version was observed to provide better
1324# performance than *corresponding* SSE2 one even on contemporary CPUs.
1325# SSE2 results were provided by Peter-Michael Hager. He maintains SSE2
1326# implementation featuring full range of lookup-table sizes, but with
1327# per-invocation lookup table setup. Latter means that table size is
1328# chosen depending on how much data is to be hashed in every given call,
1329# more data - larger table. Best reported result for Core2 is ~4 cycles
1330# per processed byte out of 64KB block. This number accounts even for
1331# 64KB table setup overhead. As discussed in gcm128.c we choose to be
1332# more conservative in respect to lookup table sizes, but how do the
1333# results compare? Minimalistic "256B" MMX version delivers ~11 cycles
1334# on same platform. As also discussed in gcm128.c, next in line "8-bit
1335# Shoup's" or "4KB" method should deliver twice the performance of
1336# "256B" one, in other words not worse than ~6 cycles per byte. It
1337# should also be noted that in the SSE2 case the improvement can be "super-
1338# linear," i.e. more than twice, mostly because >>8 maps to single
1339# instruction on SSE2 register. This is unlike "4-bit" case when >>4
1340# maps to same amount of instructions in both MMX and SSE2 cases.
1341# Bottom line is that switch to SSE2 is considered to be justifiable
1342# only in case we choose to implement "8-bit" method...
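For reference, the two-block aggregation that the clmul code above relies on (the "Xi+2" comments in gcm_ghash_clmul) falls out of unrolling the GHASH recurrence once; this is a restatement, in equation form, of what the code comments already say:

	X_{i+1} = (X_i \oplus I_i) \cdot H                                  (mod P)
	X_{i+2} = (X_{i+1} \oplus I_{i+1}) \cdot H
	        = (X_i \oplus I_i) \cdot H^2 \oplus I_{i+1} \cdot H         (mod P)

The two carry-less products are independent, so the loop computes both, XORs them while still unreduced, and reduces modulo P = x^128 + x^7 + x^2 + x + 1 only once per pair of blocks; this is why gcm_init_clmul stores both H and H^2 in the table.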
diff --git a/src/lib/libcrypto/modes/asm/ghash-x86_64.pl b/src/lib/libcrypto/modes/asm/ghash-x86_64.pl
new file mode 100644
index 0000000000..a5ae180882
--- /dev/null
+++ b/src/lib/libcrypto/modes/asm/ghash-x86_64.pl
@@ -0,0 +1,805 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# March, June 2010
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that
14# it uses 256 bytes per-key table [+128 bytes shared table]. GHASH
15# function features so called "528B" variant utilizing additional
16# 256+16 bytes of per-key storage [+512 bytes shared table].
17# Performance results are for this streamed GHASH subroutine and are
18# expressed in cycles per processed byte, less is better:
19#
20# gcc 3.4.x(*) assembler
21#
22# P4 28.6 14.0 +100%
23# Opteron 19.3 7.7 +150%
24# Core2 17.8 8.1(**) +120%
25#
26# (*) comparison is not completely fair, because C results are
27# for vanilla "256B" implementation, while assembler results
28# are for "528B";-)
29# (**)	it's a mystery [to me] why the Core2 result is not the same as for
30# Opteron;
31
32# May 2010
33#
34# Add PCLMULQDQ version performing at 2.02 cycles per processed byte.
35# See ghash-x86.pl for background information and details about coding
36# techniques.
37#
38# Special thanks to David Woodhouse <dwmw2@infradead.org> for
39# providing access to a Westmere-based system on behalf of Intel
40# Open Source Technology Centre.
41
42$flavour = shift;
43$output = shift;
44if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
45
46$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
47
48$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
49( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
50( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
51die "can't locate x86_64-xlate.pl";
52
53open STDOUT,"| $^X $xlate $flavour $output";
54
55# common register layout
56$nlo="%rax";
57$nhi="%rbx";
58$Zlo="%r8";
59$Zhi="%r9";
60$tmp="%r10";
61$rem_4bit = "%r11";
62
63$Xi="%rdi";
64$Htbl="%rsi";
65
66# per-function register layout
67$cnt="%rcx";
68$rem="%rdx";
69
70sub LB() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/ or
71 $r =~ s/%[er]([sd]i)/%\1l/ or
72 $r =~ s/%[er](bp)/%\1l/ or
73 $r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; }
74
75sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
76{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
77 my $arg = pop;
78 $arg = "\$$arg" if ($arg*1 eq $arg);
79 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
80}
81
82{ my $N;
83 sub loop() {
84 my $inp = shift;
85
86 $N++;
87$code.=<<___;
88 xor $nlo,$nlo
89 xor $nhi,$nhi
90 mov `&LB("$Zlo")`,`&LB("$nlo")`
91 mov `&LB("$Zlo")`,`&LB("$nhi")`
92 shl \$4,`&LB("$nlo")`
93 mov \$14,$cnt
94 mov 8($Htbl,$nlo),$Zlo
95 mov ($Htbl,$nlo),$Zhi
96 and \$0xf0,`&LB("$nhi")`
97 mov $Zlo,$rem
98 jmp .Loop$N
99
100.align 16
101.Loop$N:
102 shr \$4,$Zlo
103 and \$0xf,$rem
104 mov $Zhi,$tmp
105 mov ($inp,$cnt),`&LB("$nlo")`
106 shr \$4,$Zhi
107 xor 8($Htbl,$nhi),$Zlo
108 shl \$60,$tmp
109 xor ($Htbl,$nhi),$Zhi
110 mov `&LB("$nlo")`,`&LB("$nhi")`
111 xor ($rem_4bit,$rem,8),$Zhi
112 mov $Zlo,$rem
113 shl \$4,`&LB("$nlo")`
114 xor $tmp,$Zlo
115 dec $cnt
116 js .Lbreak$N
117
118 shr \$4,$Zlo
119 and \$0xf,$rem
120 mov $Zhi,$tmp
121 shr \$4,$Zhi
122 xor 8($Htbl,$nlo),$Zlo
123 shl \$60,$tmp
124 xor ($Htbl,$nlo),$Zhi
125 and \$0xf0,`&LB("$nhi")`
126 xor ($rem_4bit,$rem,8),$Zhi
127 mov $Zlo,$rem
128 xor $tmp,$Zlo
129 jmp .Loop$N
130
131.align 16
132.Lbreak$N:
133 shr \$4,$Zlo
134 and \$0xf,$rem
135 mov $Zhi,$tmp
136 shr \$4,$Zhi
137 xor 8($Htbl,$nlo),$Zlo
138 shl \$60,$tmp
139 xor ($Htbl,$nlo),$Zhi
140 and \$0xf0,`&LB("$nhi")`
141 xor ($rem_4bit,$rem,8),$Zhi
142 mov $Zlo,$rem
143 xor $tmp,$Zlo
144
145 shr \$4,$Zlo
146 and \$0xf,$rem
147 mov $Zhi,$tmp
148 shr \$4,$Zhi
149 xor 8($Htbl,$nhi),$Zlo
150 shl \$60,$tmp
151 xor ($Htbl,$nhi),$Zhi
152 xor $tmp,$Zlo
153 xor ($rem_4bit,$rem,8),$Zhi
154
155 bswap $Zlo
156 bswap $Zhi
157___
158}}
159
160$code=<<___;
161.text
162
163.globl gcm_gmult_4bit
164.type gcm_gmult_4bit,\@function,2
165.align 16
166gcm_gmult_4bit:
167 push %rbx
168 push %rbp # %rbp and %r12 are pushed exclusively in
169 push %r12 # order to reuse Win64 exception handler...
170.Lgmult_prologue:
171
172 movzb 15($Xi),$Zlo
173 lea .Lrem_4bit(%rip),$rem_4bit
174___
175 &loop ($Xi);
176$code.=<<___;
177 mov $Zlo,8($Xi)
178 mov $Zhi,($Xi)
179
180 mov 16(%rsp),%rbx
181 lea 24(%rsp),%rsp
182.Lgmult_epilogue:
183 ret
184.size gcm_gmult_4bit,.-gcm_gmult_4bit
185___
186
187# per-function register layout
188$inp="%rdx";
189$len="%rcx";
190$rem_8bit=$rem_4bit;
191
192$code.=<<___;
193.globl gcm_ghash_4bit
194.type gcm_ghash_4bit,\@function,4
195.align 16
196gcm_ghash_4bit:
197 push %rbx
198 push %rbp
199 push %r12
200 push %r13
201 push %r14
202 push %r15
203 sub \$280,%rsp
204.Lghash_prologue:
205 mov $inp,%r14 # reassign couple of args
206 mov $len,%r15
207___
208{ my $inp="%r14";
209 my $dat="%edx";
210 my $len="%r15";
211 my @nhi=("%ebx","%ecx");
212 my @rem=("%r12","%r13");
213 my $Hshr4="%rbp";
214
215 &sub ($Htbl,-128); # size optimization
216 &lea ($Hshr4,"16+128(%rsp)");
217 { my @lo =($nlo,$nhi);
218 my @hi =($Zlo,$Zhi);
219
220 &xor ($dat,$dat);
221 for ($i=0,$j=-2;$i<18;$i++,$j++) {
222 &mov ("$j(%rsp)",&LB($dat)) if ($i>1);
223 &or ($lo[0],$tmp) if ($i>1);
224 &mov (&LB($dat),&LB($lo[1])) if ($i>0 && $i<17);
225 &shr ($lo[1],4) if ($i>0 && $i<17);
226 &mov ($tmp,$hi[1]) if ($i>0 && $i<17);
227 &shr ($hi[1],4) if ($i>0 && $i<17);
228 &mov ("8*$j($Hshr4)",$hi[0]) if ($i>1);
229 &mov ($hi[0],"16*$i+0-128($Htbl)") if ($i<16);
230 &shl (&LB($dat),4) if ($i>0 && $i<17);
231 &mov ("8*$j-128($Hshr4)",$lo[0]) if ($i>1);
232 &mov ($lo[0],"16*$i+8-128($Htbl)") if ($i<16);
233 &shl ($tmp,60) if ($i>0 && $i<17);
234
235 push (@lo,shift(@lo));
236 push (@hi,shift(@hi));
237 }
238 }
239 &add ($Htbl,-128);
240 &mov ($Zlo,"8($Xi)");
241 &mov ($Zhi,"0($Xi)");
242 &add ($len,$inp); # pointer to the end of data
243 &lea ($rem_8bit,".Lrem_8bit(%rip)");
244 &jmp (".Louter_loop");
245
246$code.=".align 16\n.Louter_loop:\n";
247 &xor ($Zhi,"($inp)");
248 &mov ("%rdx","8($inp)");
249 &lea ($inp,"16($inp)");
250 &xor ("%rdx",$Zlo);
251 &mov ("($Xi)",$Zhi);
252 &mov ("8($Xi)","%rdx");
253 &shr ("%rdx",32);
254
255 &xor ($nlo,$nlo);
256 &rol ($dat,8);
257 &mov (&LB($nlo),&LB($dat));
258 &movz ($nhi[0],&LB($dat));
259 &shl (&LB($nlo),4);
260 &shr ($nhi[0],4);
261
262 for ($j=11,$i=0;$i<15;$i++) {
263 &rol ($dat,8);
264 &xor ($Zlo,"8($Htbl,$nlo)") if ($i>0);
265 &xor ($Zhi,"($Htbl,$nlo)") if ($i>0);
266 &mov ($Zlo,"8($Htbl,$nlo)") if ($i==0);
267 &mov ($Zhi,"($Htbl,$nlo)") if ($i==0);
268
269 &mov (&LB($nlo),&LB($dat));
270 &xor ($Zlo,$tmp) if ($i>0);
271 &movzw ($rem[1],"($rem_8bit,$rem[1],2)") if ($i>0);
272
273 &movz ($nhi[1],&LB($dat));
274 &shl (&LB($nlo),4);
275 &movzb ($rem[0],"(%rsp,$nhi[0])");
276
277 &shr ($nhi[1],4) if ($i<14);
278 &and ($nhi[1],0xf0) if ($i==14);
279 &shl ($rem[1],48) if ($i>0);
280 &xor ($rem[0],$Zlo);
281
282 &mov ($tmp,$Zhi);
283 &xor ($Zhi,$rem[1]) if ($i>0);
284 &shr ($Zlo,8);
285
286 &movz ($rem[0],&LB($rem[0]));
287 &mov ($dat,"$j($Xi)") if (--$j%4==0);
288 &shr ($Zhi,8);
289
290 &xor ($Zlo,"-128($Hshr4,$nhi[0],8)");
291 &shl ($tmp,56);
292 &xor ($Zhi,"($Hshr4,$nhi[0],8)");
293
294 unshift (@nhi,pop(@nhi)); # "rotate" registers
295 unshift (@rem,pop(@rem));
296 }
297 &movzw ($rem[1],"($rem_8bit,$rem[1],2)");
298 &xor ($Zlo,"8($Htbl,$nlo)");
299 &xor ($Zhi,"($Htbl,$nlo)");
300
301 &shl ($rem[1],48);
302 &xor ($Zlo,$tmp);
303
304 &xor ($Zhi,$rem[1]);
305 &movz ($rem[0],&LB($Zlo));
306 &shr ($Zlo,4);
307
308 &mov ($tmp,$Zhi);
309 &shl (&LB($rem[0]),4);
310 &shr ($Zhi,4);
311
312 &xor ($Zlo,"8($Htbl,$nhi[0])");
313 &movzw ($rem[0],"($rem_8bit,$rem[0],2)");
314 &shl ($tmp,60);
315
316 &xor ($Zhi,"($Htbl,$nhi[0])");
317 &xor ($Zlo,$tmp);
318 &shl ($rem[0],48);
319
320 &bswap ($Zlo);
321 &xor ($Zhi,$rem[0]);
322
323 &bswap ($Zhi);
324 &cmp ($inp,$len);
325 &jb (".Louter_loop");
326}
327$code.=<<___;
328 mov $Zlo,8($Xi)
329 mov $Zhi,($Xi)
330
331 lea 280(%rsp),%rsi
332 mov 0(%rsi),%r15
333 mov 8(%rsi),%r14
334 mov 16(%rsi),%r13
335 mov 24(%rsi),%r12
336 mov 32(%rsi),%rbp
337 mov 40(%rsi),%rbx
338 lea 48(%rsi),%rsp
339.Lghash_epilogue:
340 ret
341.size gcm_ghash_4bit,.-gcm_ghash_4bit
342___
343
344######################################################################
345# PCLMULQDQ version.
346
347@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
348 ("%rdi","%rsi","%rdx","%rcx"); # Unix order
349
350($Xi,$Xhi)=("%xmm0","%xmm1"); $Hkey="%xmm2";
351($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");
352
353sub clmul64x64_T2 { # minimal register pressure
354my ($Xhi,$Xi,$Hkey,$modulo)=@_;
355
356$code.=<<___ if (!defined($modulo));
357 movdqa $Xi,$Xhi #
358 pshufd \$0b01001110,$Xi,$T1
359 pshufd \$0b01001110,$Hkey,$T2
360 pxor $Xi,$T1 #
361 pxor $Hkey,$T2
362___
363$code.=<<___;
364 pclmulqdq \$0x00,$Hkey,$Xi #######
365 pclmulqdq \$0x11,$Hkey,$Xhi #######
366 pclmulqdq \$0x00,$T2,$T1 #######
367 pxor $Xi,$T1 #
368 pxor $Xhi,$T1 #
369
370 movdqa $T1,$T2 #
371 psrldq \$8,$T1
372 pslldq \$8,$T2 #
373 pxor $T1,$Xhi
374 pxor $T2,$Xi #
375___
376}
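# For reference: clmul64x64_T2 above is the Karatsuba form of a carry-less
# 64x64->128-bit multiplication. With X = Xh*x^64 + Xl and H = Hh*x^64 + Hl
# (all arithmetic in GF(2)[x]):
#
#	X*H = Xh*Hh*x^128
#	    + [ (Xh+Xl)*(Hh+Hl) + Xh*Hh + Xl*Hl ]*x^64
#	    + Xl*Hl
#
# Three PCLMULQDQ products therefore suffice instead of four: the 0x00 and
# 0x11 selectors deliver Xl*Hl and Xh*Hh, the pshufd/pxor pairs prepare the
# xor-ed halves for the middle product, and the final psrldq/pslldq step
# folds the x^64-shifted middle term into the high and low 128-bit halves.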
377
378sub reduction_alg9 { # 17/13 times faster than Intel version
379my ($Xhi,$Xi) = @_;
380
381$code.=<<___;
382 # 1st phase
383 movdqa $Xi,$T1 #
384 psllq \$1,$Xi
385 pxor $T1,$Xi #
386 psllq \$5,$Xi #
387 pxor $T1,$Xi #
388 psllq \$57,$Xi #
389 movdqa $Xi,$T2 #
390 pslldq \$8,$Xi
391 psrldq \$8,$T2 #
392 pxor $T1,$Xi
393 pxor $T2,$Xhi #
394
395 # 2nd phase
396 movdqa $Xi,$T2
397 psrlq \$5,$Xi
398 pxor $T2,$Xi #
399 psrlq \$1,$Xi #
400 pxor $T2,$Xi #
401 pxor $Xhi,$T2
402 psrlq \$1,$Xi #
403 pxor $T2,$Xi #
404___
405}
406
407{ my ($Htbl,$Xip)=@_4args;
408
409$code.=<<___;
410.globl gcm_init_clmul
411.type gcm_init_clmul,\@abi-omnipotent
412.align 16
413gcm_init_clmul:
414 movdqu ($Xip),$Hkey
415 pshufd \$0b01001110,$Hkey,$Hkey # dword swap
416
417 # <<1 twist
418 pshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword
419 movdqa $Hkey,$T1
420 psllq \$1,$Hkey
421 pxor $T3,$T3 #
422 psrlq \$63,$T1
423 pcmpgtd $T2,$T3 # broadcast carry bit
424 pslldq \$8,$T1
425 por $T1,$Hkey # H<<=1
426
427 # magic reduction
428 pand .L0x1c2_polynomial(%rip),$T3
429 pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial
430
431 # calculate H^2
432 movdqa $Hkey,$Xi
433___
434 &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
435 &reduction_alg9 ($Xhi,$Xi);
436$code.=<<___;
437 movdqu $Hkey,($Htbl) # save H
438 movdqu $Xi,16($Htbl) # save H^2
439 ret
440.size gcm_init_clmul,.-gcm_init_clmul
441___
442}
443
444{ my ($Xip,$Htbl)=@_4args;
445
446$code.=<<___;
447.globl gcm_gmult_clmul
448.type gcm_gmult_clmul,\@abi-omnipotent
449.align 16
450gcm_gmult_clmul:
451 movdqu ($Xip),$Xi
452 movdqa .Lbswap_mask(%rip),$T3
453 movdqu ($Htbl),$Hkey
454 pshufb $T3,$Xi
455___
456 &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
457 &reduction_alg9 ($Xhi,$Xi);
458$code.=<<___;
459 pshufb $T3,$Xi
460 movdqu $Xi,($Xip)
461 ret
462.size gcm_gmult_clmul,.-gcm_gmult_clmul
463___
464}
465
466{ my ($Xip,$Htbl,$inp,$len)=@_4args;
467 my $Xn="%xmm6";
468 my $Xhn="%xmm7";
469 my $Hkey2="%xmm8";
470 my $T1n="%xmm9";
471 my $T2n="%xmm10";
472
473$code.=<<___;
474.globl gcm_ghash_clmul
475.type gcm_ghash_clmul,\@abi-omnipotent
476.align 16
477gcm_ghash_clmul:
478___
479$code.=<<___ if ($win64);
480.LSEH_begin_gcm_ghash_clmul:
481 # I can't trust assembler to use specific encoding:-(
482 .byte 0x48,0x83,0xec,0x58 #sub \$0x58,%rsp
483 .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
484 .byte 0x0f,0x29,0x7c,0x24,0x10 #movdqa %xmm7,0x10(%rsp)
485 .byte 0x44,0x0f,0x29,0x44,0x24,0x20 #movaps %xmm8,0x20(%rsp)
486 .byte 0x44,0x0f,0x29,0x4c,0x24,0x30 #movaps %xmm9,0x30(%rsp)
487 .byte 0x44,0x0f,0x29,0x54,0x24,0x40 #movaps %xmm10,0x40(%rsp)
488___
489$code.=<<___;
490 movdqa .Lbswap_mask(%rip),$T3
491
492 movdqu ($Xip),$Xi
493 movdqu ($Htbl),$Hkey
494 pshufb $T3,$Xi
495
496 sub \$0x10,$len
497 jz .Lodd_tail
498
499 movdqu 16($Htbl),$Hkey2
500 #######
501 # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
502 # [(H*Ii+1) + (H*Xi+1)] mod P =
503 # [(H*Ii+1) + H^2*(Ii+Xi)] mod P
504 #
505 movdqu ($inp),$T1 # Ii
506 movdqu 16($inp),$Xn # Ii+1
507 pshufb $T3,$T1
508 pshufb $T3,$Xn
509 pxor $T1,$Xi # Ii+Xi
510___
511 &clmul64x64_T2 ($Xhn,$Xn,$Hkey); # H*Ii+1
512$code.=<<___;
513 movdqa $Xi,$Xhi #
514 pshufd \$0b01001110,$Xi,$T1
515 pshufd \$0b01001110,$Hkey2,$T2
516 pxor $Xi,$T1 #
517 pxor $Hkey2,$T2
518
519 lea 32($inp),$inp # i+=2
520 sub \$0x20,$len
521 jbe .Leven_tail
522
523.Lmod_loop:
524___
525 &clmul64x64_T2 ($Xhi,$Xi,$Hkey2,1); # H^2*(Ii+Xi)
526$code.=<<___;
527 movdqu ($inp),$T1 # Ii
528 pxor $Xn,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
529 pxor $Xhn,$Xhi
530
531 movdqu 16($inp),$Xn # Ii+1
532 pshufb $T3,$T1
533 pshufb $T3,$Xn
534
535 movdqa $Xn,$Xhn #
536 pshufd \$0b01001110,$Xn,$T1n
537 pshufd \$0b01001110,$Hkey,$T2n
538 pxor $Xn,$T1n #
539 pxor $Hkey,$T2n
540 pxor $T1,$Xhi # "Ii+Xi", consume early
541
542 movdqa $Xi,$T1 # 1st phase
543 psllq \$1,$Xi
544 pxor $T1,$Xi #
545 psllq \$5,$Xi #
546 pxor $T1,$Xi #
547 pclmulqdq \$0x00,$Hkey,$Xn #######
548 psllq \$57,$Xi #
549 movdqa $Xi,$T2 #
550 pslldq \$8,$Xi
551 psrldq \$8,$T2 #
552 pxor $T1,$Xi
553 pxor $T2,$Xhi #
554
555 pclmulqdq \$0x11,$Hkey,$Xhn #######
556 movdqa $Xi,$T2 # 2nd phase
557 psrlq \$5,$Xi
558 pxor $T2,$Xi #
559 psrlq \$1,$Xi #
560 pxor $T2,$Xi #
561 pxor $Xhi,$T2
562 psrlq \$1,$Xi #
563 pxor $T2,$Xi #
564
565 pclmulqdq \$0x00,$T2n,$T1n #######
566 movdqa $Xi,$Xhi #
567 pshufd \$0b01001110,$Xi,$T1
568 pshufd \$0b01001110,$Hkey2,$T2
569 pxor $Xi,$T1 #
570 pxor $Hkey2,$T2
571
572 pxor $Xn,$T1n #
573 pxor $Xhn,$T1n #
574 movdqa $T1n,$T2n #
575 psrldq \$8,$T1n
576 pslldq \$8,$T2n #
577 pxor $T1n,$Xhn
578 pxor $T2n,$Xn #
579
580 lea 32($inp),$inp
581 sub \$0x20,$len
582 ja .Lmod_loop
583
584.Leven_tail:
585___
586 &clmul64x64_T2 ($Xhi,$Xi,$Hkey2,1); # H^2*(Ii+Xi)
587$code.=<<___;
588 pxor $Xn,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
589 pxor $Xhn,$Xhi
590___
591 &reduction_alg9 ($Xhi,$Xi);
592$code.=<<___;
593 test $len,$len
594 jnz .Ldone
595
596.Lodd_tail:
597 movdqu ($inp),$T1 # Ii
598 pshufb $T3,$T1
599 pxor $T1,$Xi # Ii+Xi
600___
601 &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
602 &reduction_alg9 ($Xhi,$Xi);
603$code.=<<___;
604.Ldone:
605 pshufb $T3,$Xi
606 movdqu $Xi,($Xip)
607___
608$code.=<<___ if ($win64);
609 movaps (%rsp),%xmm6
610 movaps 0x10(%rsp),%xmm7
611 movaps 0x20(%rsp),%xmm8
612 movaps 0x30(%rsp),%xmm9
613 movaps 0x40(%rsp),%xmm10
614 add \$0x58,%rsp
615___
616$code.=<<___;
617 ret
618.LSEH_end_gcm_ghash_clmul:
619.size gcm_ghash_clmul,.-gcm_ghash_clmul
620___
621}
622
623$code.=<<___;
624.align 64
625.Lbswap_mask:
626 .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
627.L0x1c2_polynomial:
628 .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
629.align 64
630.type .Lrem_4bit,\@object
631.Lrem_4bit:
632 .long 0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`
633 .long 0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`
634 .long 0,`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`
635 .long 0,`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`
636.type .Lrem_8bit,\@object
637.Lrem_8bit:
638 .value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
639 .value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
640 .value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
641 .value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
642 .value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
643 .value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
644 .value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
645 .value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
646 .value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
647 .value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
648 .value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
649 .value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
650 .value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
651 .value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
652 .value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
653 .value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
654 .value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
655 .value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
656 .value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
657 .value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
658 .value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
659 .value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
660 .value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
661 .value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
662 .value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
663 .value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
664 .value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
665 .value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
666 .value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
667 .value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
668 .value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
669 .value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE
670
671.asciz "GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
672.align 64
673___
674
675# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
676# CONTEXT *context,DISPATCHER_CONTEXT *disp)
677if ($win64) {
678$rec="%rcx";
679$frame="%rdx";
680$context="%r8";
681$disp="%r9";
682
683$code.=<<___;
684.extern __imp_RtlVirtualUnwind
685.type se_handler,\@abi-omnipotent
686.align 16
687se_handler:
688 push %rsi
689 push %rdi
690 push %rbx
691 push %rbp
692 push %r12
693 push %r13
694 push %r14
695 push %r15
696 pushfq
697 sub \$64,%rsp
698
699 mov 120($context),%rax # pull context->Rax
700 mov 248($context),%rbx # pull context->Rip
701
702 mov 8($disp),%rsi # disp->ImageBase
703 mov 56($disp),%r11 # disp->HandlerData
704
705 mov 0(%r11),%r10d # HandlerData[0]
706 lea (%rsi,%r10),%r10 # prologue label
707 cmp %r10,%rbx # context->Rip<prologue label
708 jb .Lin_prologue
709
710 mov 152($context),%rax # pull context->Rsp
711
712 mov 4(%r11),%r10d # HandlerData[1]
713 lea (%rsi,%r10),%r10 # epilogue label
714 cmp %r10,%rbx # context->Rip>=epilogue label
715 jae .Lin_prologue
716
717 lea 24(%rax),%rax # adjust "rsp"
718
719 mov -8(%rax),%rbx
720 mov -16(%rax),%rbp
721 mov -24(%rax),%r12
722 mov %rbx,144($context) # restore context->Rbx
723 mov %rbp,160($context) # restore context->Rbp
724 mov %r12,216($context) # restore context->R12
725
726.Lin_prologue:
727 mov 8(%rax),%rdi
728 mov 16(%rax),%rsi
729 mov %rax,152($context) # restore context->Rsp
730 mov %rsi,168($context) # restore context->Rsi
731 mov %rdi,176($context) # restore context->Rdi
732
733 mov 40($disp),%rdi # disp->ContextRecord
734 mov $context,%rsi # context
735 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
736 .long 0xa548f3fc # cld; rep movsq
737
738 mov $disp,%rsi
739 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
740 mov 8(%rsi),%rdx # arg2, disp->ImageBase
741 mov 0(%rsi),%r8 # arg3, disp->ControlPc
742 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
743 mov 40(%rsi),%r10 # disp->ContextRecord
744 lea 56(%rsi),%r11 # &disp->HandlerData
745 lea 24(%rsi),%r12 # &disp->EstablisherFrame
746 mov %r10,32(%rsp) # arg5
747 mov %r11,40(%rsp) # arg6
748 mov %r12,48(%rsp) # arg7
749 mov %rcx,56(%rsp) # arg8, (NULL)
750 call *__imp_RtlVirtualUnwind(%rip)
751
752 mov \$1,%eax # ExceptionContinueSearch
753 add \$64,%rsp
754 popfq
755 pop %r15
756 pop %r14
757 pop %r13
758 pop %r12
759 pop %rbp
760 pop %rbx
761 pop %rdi
762 pop %rsi
763 ret
764.size se_handler,.-se_handler
765
766.section .pdata
767.align 4
768 .rva .LSEH_begin_gcm_gmult_4bit
769 .rva .LSEH_end_gcm_gmult_4bit
770 .rva .LSEH_info_gcm_gmult_4bit
771
772 .rva .LSEH_begin_gcm_ghash_4bit
773 .rva .LSEH_end_gcm_ghash_4bit
774 .rva .LSEH_info_gcm_ghash_4bit
775
776 .rva .LSEH_begin_gcm_ghash_clmul
777 .rva .LSEH_end_gcm_ghash_clmul
778 .rva .LSEH_info_gcm_ghash_clmul
779
780.section .xdata
781.align 8
782.LSEH_info_gcm_gmult_4bit:
783 .byte 9,0,0,0
784 .rva se_handler
785 .rva .Lgmult_prologue,.Lgmult_epilogue # HandlerData
786.LSEH_info_gcm_ghash_4bit:
787 .byte 9,0,0,0
788 .rva se_handler
789 .rva .Lghash_prologue,.Lghash_epilogue # HandlerData
790.LSEH_info_gcm_ghash_clmul:
791 .byte 0x01,0x1f,0x0b,0x00
792 .byte 0x1f,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10
793 .byte 0x19,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9
794 .byte 0x13,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8
795 .byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
796 .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6
797 .byte 0x04,0xa2,0x00,0x00 #sub rsp,0x58
798___
799}
800
801$code =~ s/\`([^\`]*)\`/eval($1)/gem;
802
803print $code;
804
805close STDOUT;
diff --git a/src/lib/libcrypto/modes/cbc128.c b/src/lib/libcrypto/modes/cbc128.c
index 8f8bd563b9..3d3782cbe1 100644
--- a/src/lib/libcrypto/modes/cbc128.c
+++ b/src/lib/libcrypto/modes/cbc128.c
@@ -48,7 +48,8 @@
48 * 48 *
49 */ 49 */
50 50
51#include "modes.h" 51#include <openssl/crypto.h>
52#include "modes_lcl.h"
52#include <string.h> 53#include <string.h>
53 54
54#ifndef MODES_DEBUG 55#ifndef MODES_DEBUG
@@ -58,12 +59,7 @@
58#endif 59#endif
59#include <assert.h> 60#include <assert.h>
60 61
61#define STRICT_ALIGNMENT 1 62#ifndef STRICT_ALIGNMENT
62#if defined(__i386) || defined(__i386__) || \
63 defined(__x86_64) || defined(__x86_64__) || \
64 defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \
65 defined(__s390__) || defined(__s390x__)
66# undef STRICT_ALIGNMENT
67# define STRICT_ALIGNMENT 0 63# define STRICT_ALIGNMENT 0
68#endif 64#endif
69 65
diff --git a/src/lib/libcrypto/modes/ccm128.c b/src/lib/libcrypto/modes/ccm128.c
new file mode 100644
index 0000000000..c9b35e5b35
--- /dev/null
+++ b/src/lib/libcrypto/modes/ccm128.c
@@ -0,0 +1,441 @@
1/* ====================================================================
2 * Copyright (c) 2011 The OpenSSL Project. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in
13 * the documentation and/or other materials provided with the
14 * distribution.
15 *
16 * 3. All advertising materials mentioning features or use of this
17 * software must display the following acknowledgment:
18 * "This product includes software developed by the OpenSSL Project
19 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
20 *
21 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
22 * endorse or promote products derived from this software without
23 * prior written permission. For written permission, please contact
24 * openssl-core@openssl.org.
25 *
26 * 5. Products derived from this software may not be called "OpenSSL"
27 * nor may "OpenSSL" appear in their names without prior written
28 * permission of the OpenSSL Project.
29 *
30 * 6. Redistributions of any form whatsoever must retain the following
31 * acknowledgment:
32 * "This product includes software developed by the OpenSSL Project
33 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
34 *
35 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
36 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
39 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
41 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
42 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
43 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
44 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
45 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
46 * OF THE POSSIBILITY OF SUCH DAMAGE.
47 * ====================================================================
48 */
49
50#include <openssl/crypto.h>
51#include "modes_lcl.h"
52#include <string.h>
53
54#ifndef MODES_DEBUG
55# ifndef NDEBUG
56# define NDEBUG
57# endif
58#endif
59#include <assert.h>
60
61/* First you set up the M and L parameters and pass the key schedule.
62 * This is called once per session setup... */
63void CRYPTO_ccm128_init(CCM128_CONTEXT *ctx,
64 unsigned int M,unsigned int L,void *key,block128_f block)
65{
66 memset(ctx->nonce.c,0,sizeof(ctx->nonce.c));
67 ctx->nonce.c[0] = ((u8)(L-1)&7) | (u8)(((M-2)/2)&7)<<3;
68 ctx->blocks = 0;
69 ctx->block = block;
70 ctx->key = key;
71}
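/*
 * Worked example (values purely illustrative): with a 16-byte tag, M=16,
 * and L=2, the flags byte assembled above becomes
 *
 *	((u8)(L-1)&7) | ((u8)(((M-2)/2)&7)<<3)  =  0x01 | (0x07<<3)  =  0x39
 *
 * The Adata bit (0x40) is OR-ed in later by CRYPTO_ccm128_aad when
 * additional authenticated data is present.
 */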
72
73/* !!! Following interfaces are to be called *once* per packet !!! */
74
75/* Then you set up the per-message nonce and pass the length of the message */
76int CRYPTO_ccm128_setiv(CCM128_CONTEXT *ctx,
77 const unsigned char *nonce,size_t nlen,size_t mlen)
78{
79 unsigned int L = ctx->nonce.c[0]&7; /* the L parameter */
80
81 if (nlen<(14-L)) return -1; /* nonce is too short */
82
83 if (sizeof(mlen)==8 && L>=3) {
84 ctx->nonce.c[8] = (u8)(mlen>>(56%(sizeof(mlen)*8)));
85 ctx->nonce.c[9] = (u8)(mlen>>(48%(sizeof(mlen)*8)));
86 ctx->nonce.c[10] = (u8)(mlen>>(40%(sizeof(mlen)*8)));
87 ctx->nonce.c[11] = (u8)(mlen>>(32%(sizeof(mlen)*8)));
88 }
89 else
90 *(u32*)(&ctx->nonce.c[8]) = 0;
91
92 ctx->nonce.c[12] = (u8)(mlen>>24);
93 ctx->nonce.c[13] = (u8)(mlen>>16);
94 ctx->nonce.c[14] = (u8)(mlen>>8);
95 ctx->nonce.c[15] = (u8)mlen;
96
97 ctx->nonce.c[0] &= ~0x40; /* clear Adata flag */
98 memcpy(&ctx->nonce.c[1],nonce,14-L);
99
100 return 0;
101}
102
103/* Then you pass additional authentication data, this is optional */
104void CRYPTO_ccm128_aad(CCM128_CONTEXT *ctx,
105 const unsigned char *aad,size_t alen)
106{ unsigned int i;
107 block128_f block = ctx->block;
108
109 if (alen==0) return;
110
111 ctx->nonce.c[0] |= 0x40; /* set Adata flag */
112 (*block)(ctx->nonce.c,ctx->cmac.c,ctx->key),
113 ctx->blocks++;
114
115 if (alen<(0x10000-0x100)) {
116 ctx->cmac.c[0] ^= (u8)(alen>>8);
117 ctx->cmac.c[1] ^= (u8)alen;
118 i=2;
119 }
120 else if (sizeof(alen)==8 && alen>=(size_t)1<<(32%(sizeof(alen)*8))) {
121 ctx->cmac.c[0] ^= 0xFF;
122 ctx->cmac.c[1] ^= 0xFF;
123 ctx->cmac.c[2] ^= (u8)(alen>>(56%(sizeof(alen)*8)));
124 ctx->cmac.c[3] ^= (u8)(alen>>(48%(sizeof(alen)*8)));
125 ctx->cmac.c[4] ^= (u8)(alen>>(40%(sizeof(alen)*8)));
126 ctx->cmac.c[5] ^= (u8)(alen>>(32%(sizeof(alen)*8)));
127 ctx->cmac.c[6] ^= (u8)(alen>>24);
128 ctx->cmac.c[7] ^= (u8)(alen>>16);
129 ctx->cmac.c[8] ^= (u8)(alen>>8);
130 ctx->cmac.c[9] ^= (u8)alen;
131 i=10;
132 }
133 else {
134 ctx->cmac.c[0] ^= 0xFF;
135 ctx->cmac.c[1] ^= 0xFE;
136 ctx->cmac.c[2] ^= (u8)(alen>>24);
137 ctx->cmac.c[3] ^= (u8)(alen>>16);
138 ctx->cmac.c[4] ^= (u8)(alen>>8);
139 ctx->cmac.c[5] ^= (u8)alen;
140 i=6;
141 }
142
143 do {
144 for(;i<16 && alen;++i,++aad,--alen)
145 ctx->cmac.c[i] ^= *aad;
146 (*block)(ctx->cmac.c,ctx->cmac.c,ctx->key),
147 ctx->blocks++;
148 i=0;
149 } while (alen);
150}
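/*
 * Worked example (alen chosen purely for illustration): for alen = 32
 * bytes of additional data the short, 2-byte length encoding above is
 * selected, so the first CMAC block absorbs
 *
 *	ctx->cmac.c[0] ^= 0x00;		(that is, (u8)(alen>>8))
 *	ctx->cmac.c[1] ^= 0x20;		(that is, (u8)alen)
 *
 * after which the AAD bytes themselves are XOR-ed in starting at i=2,
 * with the block cipher applied each time 16 bytes have been absorbed.
 */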
151
152/* Finally you encrypt or decrypt the message */
153
154/* the counter part of the nonce may not be larger than L*8 bits;
155 * L is not larger than 8, therefore a 64-bit counter... */
156static void ctr64_inc(unsigned char *counter) {
157 unsigned int n=8;
158 u8 c;
159
160 counter += 8;
161 do {
162 --n;
163 c = counter[n];
164 ++c;
165 counter[n] = c;
166 if (c) return;
167 } while (n);
168}
169
170int CRYPTO_ccm128_encrypt(CCM128_CONTEXT *ctx,
171 const unsigned char *inp, unsigned char *out,
172 size_t len)
173{
174 size_t n;
175 unsigned int i,L;
176 unsigned char flags0 = ctx->nonce.c[0];
177 block128_f block = ctx->block;
178 void * key = ctx->key;
179 union { u64 u[2]; u8 c[16]; } scratch;
180
181 if (!(flags0&0x40))
182 (*block)(ctx->nonce.c,ctx->cmac.c,key),
183 ctx->blocks++;
184
185 ctx->nonce.c[0] = L = flags0&7;
186 for (n=0,i=15-L;i<15;++i) {
187 n |= ctx->nonce.c[i];
188 ctx->nonce.c[i]=0;
189 n <<= 8;
190 }
191 n |= ctx->nonce.c[15]; /* reconstructed length */
192 ctx->nonce.c[15]=1;
193
194 if (n!=len) return -1; /* length mismatch */
195
196 ctx->blocks += ((len+15)>>3)|1;
197 if (ctx->blocks > (U64(1)<<61)) return -2; /* too much data */
198
199 while (len>=16) {
200#if defined(STRICT_ALIGNMENT)
201 union { u64 u[2]; u8 c[16]; } temp;
202
203 memcpy (temp.c,inp,16);
204 ctx->cmac.u[0] ^= temp.u[0];
205 ctx->cmac.u[1] ^= temp.u[1];
206#else
207 ctx->cmac.u[0] ^= ((u64*)inp)[0];
208 ctx->cmac.u[1] ^= ((u64*)inp)[1];
209#endif
210 (*block)(ctx->cmac.c,ctx->cmac.c,key);
211 (*block)(ctx->nonce.c,scratch.c,key);
212 ctr64_inc(ctx->nonce.c);
213#if defined(STRICT_ALIGNMENT)
214 temp.u[0] ^= scratch.u[0];
215 temp.u[1] ^= scratch.u[1];
216 memcpy(out,temp.c,16);
217#else
218 ((u64*)out)[0] = scratch.u[0]^((u64*)inp)[0];
219 ((u64*)out)[1] = scratch.u[1]^((u64*)inp)[1];
220#endif
221 inp += 16;
222 out += 16;
223 len -= 16;
224 }
225
226 if (len) {
227 for (i=0; i<len; ++i) ctx->cmac.c[i] ^= inp[i];
228 (*block)(ctx->cmac.c,ctx->cmac.c,key);
229 (*block)(ctx->nonce.c,scratch.c,key);
230 for (i=0; i<len; ++i) out[i] = scratch.c[i]^inp[i];
231 }
232
233 for (i=15-L;i<16;++i)
234 ctx->nonce.c[i]=0;
235
236 (*block)(ctx->nonce.c,scratch.c,key);
237 ctx->cmac.u[0] ^= scratch.u[0];
238 ctx->cmac.u[1] ^= scratch.u[1];
239
240 ctx->nonce.c[0] = flags0;
241
242 return 0;
243}
244
245int CRYPTO_ccm128_decrypt(CCM128_CONTEXT *ctx,
246 const unsigned char *inp, unsigned char *out,
247 size_t len)
248{
249 size_t n;
250 unsigned int i,L;
251 unsigned char flags0 = ctx->nonce.c[0];
252 block128_f block = ctx->block;
253 void * key = ctx->key;
254 union { u64 u[2]; u8 c[16]; } scratch;
255
256 if (!(flags0&0x40))
257 (*block)(ctx->nonce.c,ctx->cmac.c,key);
258
259 ctx->nonce.c[0] = L = flags0&7;
260 for (n=0,i=15-L;i<15;++i) {
261 n |= ctx->nonce.c[i];
262 ctx->nonce.c[i]=0;
263 n <<= 8;
264 }
265 n |= ctx->nonce.c[15]; /* reconstructed length */
266 ctx->nonce.c[15]=1;
267
268 if (n!=len) return -1;
269
270 while (len>=16) {
271#if defined(STRICT_ALIGNMENT)
272 union { u64 u[2]; u8 c[16]; } temp;
273#endif
274 (*block)(ctx->nonce.c,scratch.c,key);
275 ctr64_inc(ctx->nonce.c);
276#if defined(STRICT_ALIGNMENT)
277 memcpy (temp.c,inp,16);
278 ctx->cmac.u[0] ^= (scratch.u[0] ^= temp.u[0]);
279 ctx->cmac.u[1] ^= (scratch.u[1] ^= temp.u[1]);
280 memcpy (out,scratch.c,16);
281#else
282 ctx->cmac.u[0] ^= (((u64*)out)[0] = scratch.u[0]^((u64*)inp)[0]);
283 ctx->cmac.u[1] ^= (((u64*)out)[1] = scratch.u[1]^((u64*)inp)[1]);
284#endif
285 (*block)(ctx->cmac.c,ctx->cmac.c,key);
286
287 inp += 16;
288 out += 16;
289 len -= 16;
290 }
291
292 if (len) {
293 (*block)(ctx->nonce.c,scratch.c,key);
294 for (i=0; i<len; ++i)
295 ctx->cmac.c[i] ^= (out[i] = scratch.c[i]^inp[i]);
296 (*block)(ctx->cmac.c,ctx->cmac.c,key);
297 }
298
299 for (i=15-L;i<16;++i)
300 ctx->nonce.c[i]=0;
301
302 (*block)(ctx->nonce.c,scratch.c,key);
303 ctx->cmac.u[0] ^= scratch.u[0];
304 ctx->cmac.u[1] ^= scratch.u[1];
305
306 ctx->nonce.c[0] = flags0;
307
308 return 0;
309}
310
311static void ctr64_add (unsigned char *counter,size_t inc)
312{ size_t n=8, val=0;
313
314 counter += 8;
315 do {
316 --n;
317 val += counter[n] + (inc&0xff);
318 counter[n] = (unsigned char)val;
319 val >>= 8; /* carry bit */
320 inc >>= 8;
321 } while(n && (inc || val));
322}
323
324int CRYPTO_ccm128_encrypt_ccm64(CCM128_CONTEXT *ctx,
325 const unsigned char *inp, unsigned char *out,
326 size_t len,ccm128_f stream)
327{
328 size_t n;
329 unsigned int i,L;
330 unsigned char flags0 = ctx->nonce.c[0];
331 block128_f block = ctx->block;
332 void * key = ctx->key;
333 union { u64 u[2]; u8 c[16]; } scratch;
334
335 if (!(flags0&0x40))
336 (*block)(ctx->nonce.c,ctx->cmac.c,key),
337 ctx->blocks++;
338
339 ctx->nonce.c[0] = L = flags0&7;
340 for (n=0,i=15-L;i<15;++i) {
341 n |= ctx->nonce.c[i];
342 ctx->nonce.c[i]=0;
343 n <<= 8;
344 }
345 n |= ctx->nonce.c[15]; /* reconstructed length */
346 ctx->nonce.c[15]=1;
347
348 if (n!=len) return -1; /* length mismatch */
349
350 ctx->blocks += ((len+15)>>3)|1;
351 if (ctx->blocks > (U64(1)<<61)) return -2; /* too much data */
352
353 if ((n=len/16)) {
354 (*stream)(inp,out,n,key,ctx->nonce.c,ctx->cmac.c);
355 n *= 16;
356 inp += n;
357 out += n;
358 len -= n;
359 if (len) ctr64_add(ctx->nonce.c,n/16);
360 }
361
362 if (len) {
363 for (i=0; i<len; ++i) ctx->cmac.c[i] ^= inp[i];
364 (*block)(ctx->cmac.c,ctx->cmac.c,key);
365 (*block)(ctx->nonce.c,scratch.c,key);
366 for (i=0; i<len; ++i) out[i] = scratch.c[i]^inp[i];
367 }
368
369 for (i=15-L;i<16;++i)
370 ctx->nonce.c[i]=0;
371
372 (*block)(ctx->nonce.c,scratch.c,key);
373 ctx->cmac.u[0] ^= scratch.u[0];
374 ctx->cmac.u[1] ^= scratch.u[1];
375
376 ctx->nonce.c[0] = flags0;
377
378 return 0;
379}
380
381int CRYPTO_ccm128_decrypt_ccm64(CCM128_CONTEXT *ctx,
382 const unsigned char *inp, unsigned char *out,
383 size_t len,ccm128_f stream)
384{
385 size_t n;
386 unsigned int i,L;
387 unsigned char flags0 = ctx->nonce.c[0];
388 block128_f block = ctx->block;
389 void * key = ctx->key;
390 union { u64 u[2]; u8 c[16]; } scratch;
391
392 if (!(flags0&0x40))
393 (*block)(ctx->nonce.c,ctx->cmac.c,key);
394
395 ctx->nonce.c[0] = L = flags0&7;
396 for (n=0,i=15-L;i<15;++i) {
397 n |= ctx->nonce.c[i];
398 ctx->nonce.c[i]=0;
399 n <<= 8;
400 }
401 n |= ctx->nonce.c[15]; /* reconstructed length */
402 ctx->nonce.c[15]=1;
403
404 if (n!=len) return -1;
405
406 if ((n=len/16)) {
407 (*stream)(inp,out,n,key,ctx->nonce.c,ctx->cmac.c);
408 n *= 16;
409 inp += n;
410 out += n;
411 len -= n;
412 if (len) ctr64_add(ctx->nonce.c,n/16);
413 }
414
415 if (len) {
416 (*block)(ctx->nonce.c,scratch.c,key);
417 for (i=0; i<len; ++i)
418 ctx->cmac.c[i] ^= (out[i] = scratch.c[i]^inp[i]);
419 (*block)(ctx->cmac.c,ctx->cmac.c,key);
420 }
421
422 for (i=15-L;i<16;++i)
423 ctx->nonce.c[i]=0;
424
425 (*block)(ctx->nonce.c,scratch.c,key);
426 ctx->cmac.u[0] ^= scratch.u[0];
427 ctx->cmac.u[1] ^= scratch.u[1];
428
429 ctx->nonce.c[0] = flags0;
430
431 return 0;
432}
433
434size_t CRYPTO_ccm128_tag(CCM128_CONTEXT *ctx,unsigned char *tag,size_t len)
435{ unsigned int M = (ctx->nonce.c[0]>>3)&7; /* the M parameter */
436
437 M *= 2; M += 2;
438 if (len<M) return 0;
439 memcpy(tag,ctx->cmac.c,M);
440 return M;
441}
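Taken together, the interfaces above are meant to be driven in the order init (once per key), then setiv, optional aad, encrypt or decrypt, and tag (once per packet). The following is only an illustrative sketch, not part of the import: the function name ccm_seal and its parameters are invented for the example, AES is assumed as the underlying block128_f (the same cast the cts128.c selftest uses for AES), and the full CCM128_CONTEXT definition is assumed to be visible, e.g. by compiling within the library tree where the internal modes_lcl.h header is available. Error handling is reduced to return codes.

	#include <openssl/aes.h>
	#include <openssl/crypto.h>
	#include <openssl/modes.h>
	#include "modes_lcl.h"	/* full CCM128_CONTEXT definition (internal header) */

	/* Illustrative only: 16-byte tag (M = 16) and L = 2. */
	static int ccm_seal(const unsigned char key[16],
	                    const unsigned char *nonce, size_t nlen,
	                    const unsigned char *aad, size_t alen,
	                    const unsigned char *pt, unsigned char *ct, size_t len,
	                    unsigned char tag[16])
	{
		AES_KEY aes;
		CCM128_CONTEXT ccm;

		if (AES_set_encrypt_key(key, 128, &aes) != 0)
			return -1;
		CRYPTO_ccm128_init(&ccm, 16, 2, &aes, (block128_f)AES_encrypt);

		if (CRYPTO_ccm128_setiv(&ccm, nonce, nlen, len))	/* per packet */
			return -1;
		if (alen)
			CRYPTO_ccm128_aad(&ccm, aad, alen);
		if (CRYPTO_ccm128_encrypt(&ccm, pt, ct, len))
			return -1;
		return CRYPTO_ccm128_tag(&ccm, tag, 16) == 16 ? 0 : -1;
	}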
diff --git a/src/lib/libcrypto/modes/cfb128.c b/src/lib/libcrypto/modes/cfb128.c
index e5938c6137..4e6f5d35e1 100644
--- a/src/lib/libcrypto/modes/cfb128.c
+++ b/src/lib/libcrypto/modes/cfb128.c
@@ -48,7 +48,8 @@
48 * 48 *
49 */ 49 */
50 50
51#include "modes.h" 51#include <openssl/crypto.h>
52#include "modes_lcl.h"
52#include <string.h> 53#include <string.h>
53 54
54#ifndef MODES_DEBUG 55#ifndef MODES_DEBUG
@@ -58,14 +59,6 @@
58#endif 59#endif
59#include <assert.h> 60#include <assert.h>
60 61
61#define STRICT_ALIGNMENT
62#if defined(__i386) || defined(__i386__) || \
63 defined(__x86_64) || defined(__x86_64__) || \
64 defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \
65 defined(__s390__) || defined(__s390x__)
66# undef STRICT_ALIGNMENT
67#endif
68
69/* The input and output encrypted as though 128bit cfb mode is being 62/* The input and output encrypted as though 128bit cfb mode is being
70 * used. The extra state information to record how much of the 63 * used. The extra state information to record how much of the
71 * 128bit block we have used is contained in *num; 64 * 128bit block we have used is contained in *num;
diff --git a/src/lib/libcrypto/modes/ctr128.c b/src/lib/libcrypto/modes/ctr128.c
index 932037f551..ee642c5863 100644
--- a/src/lib/libcrypto/modes/ctr128.c
+++ b/src/lib/libcrypto/modes/ctr128.c
@@ -48,7 +48,8 @@
48 * 48 *
49 */ 49 */
50 50
51#include "modes.h" 51#include <openssl/crypto.h>
52#include "modes_lcl.h"
52#include <string.h> 53#include <string.h>
53 54
54#ifndef MODES_DEBUG 55#ifndef MODES_DEBUG
@@ -58,17 +59,6 @@
58#endif 59#endif
59#include <assert.h> 60#include <assert.h>
60 61
61typedef unsigned int u32;
62typedef unsigned char u8;
63
64#define STRICT_ALIGNMENT
65#if defined(__i386) || defined(__i386__) || \
66 defined(__x86_64) || defined(__x86_64__) || \
67 defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \
68 defined(__s390__) || defined(__s390x__)
69# undef STRICT_ALIGNMENT
70#endif
71
72/* NOTE: the IV/counter CTR mode is big-endian. The code itself 62/* NOTE: the IV/counter CTR mode is big-endian. The code itself
73 * is endian-neutral. */ 63 * is endian-neutral. */
74 64
@@ -182,3 +172,81 @@ void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out,
182 172
183 *num=n; 173 *num=n;
184} 174}
175
176/* increment upper 96 bits of 128-bit counter by 1 */
177static void ctr96_inc(unsigned char *counter) {
178 u32 n=12;
179 u8 c;
180
181 do {
182 --n;
183 c = counter[n];
184 ++c;
185 counter[n] = c;
186 if (c) return;
187 } while (n);
188}
189
190void CRYPTO_ctr128_encrypt_ctr32(const unsigned char *in, unsigned char *out,
191 size_t len, const void *key,
192 unsigned char ivec[16], unsigned char ecount_buf[16],
193 unsigned int *num, ctr128_f func)
194{
195 unsigned int n,ctr32;
196
197 assert(in && out && key && ecount_buf && num);
198 assert(*num < 16);
199
200 n = *num;
201
202 while (n && len) {
203 *(out++) = *(in++) ^ ecount_buf[n];
204 --len;
205 n = (n+1) % 16;
206 }
207
208 ctr32 = GETU32(ivec+12);
209 while (len>=16) {
210 size_t blocks = len/16;
211 /*
212 * 1<<28 is just a not-so-small yet not-so-large number...
213		 * The condition below is practically never met, but it has to
214 * be checked for code correctness.
215 */
216 if (sizeof(size_t)>sizeof(unsigned int) && blocks>(1U<<28))
217 blocks = (1U<<28);
218 /*
219		 * As (*func) operates on a 32-bit counter, the caller
220		 * has to handle overflow. The 'if' below detects the
221		 * overflow, which is then handled by limiting the
222		 * number of blocks to the exact overflow point...
223 */
224 ctr32 += (u32)blocks;
225 if (ctr32 < blocks) {
226 blocks -= ctr32;
227 ctr32 = 0;
228 }
229 (*func)(in,out,blocks,key,ivec);
230		/* (*func) does not update ivec, the caller does: */
231 PUTU32(ivec+12,ctr32);
232		/* ... overflow was detected, propagate carry. */
233 if (ctr32 == 0) ctr96_inc(ivec);
234 blocks *= 16;
235 len -= blocks;
236 out += blocks;
237 in += blocks;
238 }
239 if (len) {
240 memset(ecount_buf,0,16);
241 (*func)(ecount_buf,ecount_buf,1,key,ivec);
242 ++ctr32;
243 PUTU32(ivec+12,ctr32);
244 if (ctr32 == 0) ctr96_inc(ivec);
245 while (len--) {
246 out[n] = in[n] ^ ecount_buf[n];
247 ++n;
248 }
249 }
250
251 *num=n;
252}
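For comparison with the _ctr32 interface above, here is a minimal sketch of driving the byte-oriented CRYPTO_ctr128_encrypt entry point (declared in openssl/modes.h) with AES as the block cipher. The function name ctr_example and its parameters are invented for the example and error handling is omitted; the ivec/ecount/num triple carries the keystream position, so the call may be repeated over consecutive chunks of a stream.

	#include <string.h>
	#include <openssl/aes.h>
	#include <openssl/modes.h>

	/* Illustrative only: CTR encryption and decryption are the same operation. */
	static void ctr_example(const unsigned char key[16],
	                        const unsigned char iv[16],
	                        unsigned char *buf, size_t len)
	{
		AES_KEY aes;
		unsigned char ivec[16], ecount[16];
		unsigned int num = 0;

		AES_set_encrypt_key(key, 128, &aes);
		memcpy(ivec, iv, 16);	/* big-endian counter block */
		memset(ecount, 0, 16);	/* keystream scratch buffer */

		CRYPTO_ctr128_encrypt(buf, buf, len, &aes, ivec, ecount, &num,
		                      (block128_f)AES_encrypt);
	}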
diff --git a/src/lib/libcrypto/modes/cts128.c b/src/lib/libcrypto/modes/cts128.c
index e0430f9fdc..c0e1f3696c 100644
--- a/src/lib/libcrypto/modes/cts128.c
+++ b/src/lib/libcrypto/modes/cts128.c
@@ -5,7 +5,8 @@
5 * forms are granted according to the OpenSSL license. 5 * forms are granted according to the OpenSSL license.
6 */ 6 */
7 7
8#include "modes.h" 8#include <openssl/crypto.h>
9#include "modes_lcl.h"
9#include <string.h> 10#include <string.h>
10 11
11#ifndef MODES_DEBUG 12#ifndef MODES_DEBUG
@@ -23,8 +24,9 @@
23 * deviates from mentioned RFCs. Most notably it allows input to be 24 * deviates from mentioned RFCs. Most notably it allows input to be
24 * of block length and it doesn't flip the order of the last two 25 * of block length and it doesn't flip the order of the last two
25 * blocks. CTS is being discussed even in ECB context, but it's not 26 * blocks. CTS is being discussed even in ECB context, but it's not
26 * adopted for any known application. This implementation complies 27 * adopted for any known application. This implementation provides
27 * with mentioned RFCs and [as such] extends CBC mode. 28 * two interfaces: one compliant with above mentioned RFCs and one
29 * compliant with the NIST proposal, both extending CBC mode.
28 */ 30 */
29 31
30size_t CRYPTO_cts128_encrypt_block(const unsigned char *in, unsigned char *out, 32size_t CRYPTO_cts128_encrypt_block(const unsigned char *in, unsigned char *out,
@@ -54,6 +56,34 @@ size_t CRYPTO_cts128_encrypt_block(const unsigned char *in, unsigned char *out,
54 return len+residue; 56 return len+residue;
55} 57}
56 58
59size_t CRYPTO_nistcts128_encrypt_block(const unsigned char *in, unsigned char *out,
60 size_t len, const void *key,
61 unsigned char ivec[16], block128_f block)
62{ size_t residue, n;
63
64 assert (in && out && key && ivec);
65
66 if (len < 16) return 0;
67
68 residue=len%16;
69
70 len -= residue;
71
72 CRYPTO_cbc128_encrypt(in,out,len,key,ivec,block);
73
74 if (residue==0) return len;
75
76 in += len;
77 out += len;
78
79 for (n=0; n<residue; ++n)
80 ivec[n] ^= in[n];
81 (*block)(ivec,ivec,key);
82 memcpy(out-16+residue,ivec,16);
83
84 return len+residue;
85}
86
57size_t CRYPTO_cts128_encrypt(const unsigned char *in, unsigned char *out, 87size_t CRYPTO_cts128_encrypt(const unsigned char *in, unsigned char *out,
58 size_t len, const void *key, 88 size_t len, const void *key,
59 unsigned char ivec[16], cbc128_f cbc) 89 unsigned char ivec[16], cbc128_f cbc)
@@ -90,6 +120,41 @@ size_t CRYPTO_cts128_encrypt(const unsigned char *in, unsigned char *out,
90 return len+residue; 120 return len+residue;
91} 121}
92 122
123size_t CRYPTO_nistcts128_encrypt(const unsigned char *in, unsigned char *out,
124 size_t len, const void *key,
125 unsigned char ivec[16], cbc128_f cbc)
126{ size_t residue;
127 union { size_t align; unsigned char c[16]; } tmp;
128
129 assert (in && out && key && ivec);
130
131 if (len < 16) return 0;
132
133 residue=len%16;
134
135 len -= residue;
136
137 (*cbc)(in,out,len,key,ivec,1);
138
139 if (residue==0) return len;
140
141 in += len;
142 out += len;
143
144#if defined(CBC_HANDLES_TRUNCATED_IO)
145 (*cbc)(in,out-16+residue,residue,key,ivec,1);
146#else
147 {
148 size_t n;
149 for (n=0; n<16; n+=sizeof(size_t))
150 *(size_t *)(tmp.c+n) = 0;
151 memcpy(tmp.c,in,residue);
152 }
153 (*cbc)(tmp.c,out-16+residue,16,key,ivec,1);
154#endif
155 return len+residue;
156}
157
93size_t CRYPTO_cts128_decrypt_block(const unsigned char *in, unsigned char *out, 158size_t CRYPTO_cts128_decrypt_block(const unsigned char *in, unsigned char *out,
94 size_t len, const void *key, 159 size_t len, const void *key,
95 unsigned char ivec[16], block128_f block) 160 unsigned char ivec[16], block128_f block)
@@ -125,7 +190,51 @@ size_t CRYPTO_cts128_decrypt_block(const unsigned char *in, unsigned char *out,
125 for(residue+=16; n<residue; ++n) 190 for(residue+=16; n<residue; ++n)
126 out[n] = tmp.c[n] ^ in[n]; 191 out[n] = tmp.c[n] ^ in[n];
127 192
128 return len+residue-16; 193 return 16+len+residue;
194}
195
196size_t CRYPTO_nistcts128_decrypt_block(const unsigned char *in, unsigned char *out,
197 size_t len, const void *key,
198 unsigned char ivec[16], block128_f block)
199{ size_t residue, n;
200 union { size_t align; unsigned char c[32]; } tmp;
201
202 assert (in && out && key && ivec);
203
204 if (len<16) return 0;
205
206 residue=len%16;
207
208 if (residue==0) {
209 CRYPTO_cbc128_decrypt(in,out,len,key,ivec,block);
210 return len;
211 }
212
213 len -= 16+residue;
214
215 if (len) {
216 CRYPTO_cbc128_decrypt(in,out,len,key,ivec,block);
217 in += len;
218 out += len;
219 }
220
221 (*block)(in+residue,tmp.c+16,key);
222
223 for (n=0; n<16; n+=sizeof(size_t))
224 *(size_t *)(tmp.c+n) = *(size_t *)(tmp.c+16+n);
225 memcpy(tmp.c,in,residue);
226 (*block)(tmp.c,tmp.c,key);
227
228 for(n=0; n<16; ++n) {
229 unsigned char c = in[n];
230 out[n] = tmp.c[n] ^ ivec[n];
231 ivec[n] = in[n+residue];
232 tmp.c[n] = c;
233 }
234 for(residue+=16; n<residue; ++n)
235 out[n] = tmp.c[n] ^ tmp.c[n-16];
236
237 return 16+len+residue;
129} 238}
130 239
131size_t CRYPTO_cts128_decrypt(const unsigned char *in, unsigned char *out, 240size_t CRYPTO_cts128_decrypt(const unsigned char *in, unsigned char *out,
@@ -160,7 +269,47 @@ size_t CRYPTO_cts128_decrypt(const unsigned char *in, unsigned char *out,
160 (*cbc)(tmp.c,tmp.c,32,key,ivec,0); 269 (*cbc)(tmp.c,tmp.c,32,key,ivec,0);
161 memcpy(out,tmp.c,16+residue); 270 memcpy(out,tmp.c,16+residue);
162#endif 271#endif
163 return len+residue; 272 return 16+len+residue;
273}
274
275size_t CRYPTO_nistcts128_decrypt(const unsigned char *in, unsigned char *out,
276 size_t len, const void *key,
277 unsigned char ivec[16], cbc128_f cbc)
278{ size_t residue, n;
279 union { size_t align; unsigned char c[32]; } tmp;
280
281 assert (in && out && key && ivec);
282
283 if (len<16) return 0;
284
285 residue=len%16;
286
287 if (residue==0) {
288 (*cbc)(in,out,len,key,ivec,0);
289 return len;
290 }
291
292 len -= 16+residue;
293
294 if (len) {
295 (*cbc)(in,out,len,key,ivec,0);
296 in += len;
297 out += len;
298 }
299
300 for (n=16; n<32; n+=sizeof(size_t))
301 *(size_t *)(tmp.c+n) = 0;
302 /* this places in[16] at &tmp.c[16] and decrypted block at &tmp.c[0] */
303 (*cbc)(in+residue,tmp.c,16,key,tmp.c+16,0);
304
305 memcpy(tmp.c,in,residue);
306#if defined(CBC_HANDLES_TRUNCATED_IO)
307 (*cbc)(tmp.c,out,16+residue,key,ivec,0);
308#else
309 (*cbc)(tmp.c,tmp.c,32,key,ivec,0);
310 memcpy(out,tmp.c,16+residue);
311#endif
312 return 16+len+residue;
164} 313}
165 314
166#if defined(SELFTEST) 315#if defined(SELFTEST)
@@ -200,9 +349,8 @@ static const unsigned char vector_64[64] =
200static AES_KEY encks, decks; 349static AES_KEY encks, decks;
201 350
202void test_vector(const unsigned char *vector,size_t len) 351void test_vector(const unsigned char *vector,size_t len)
203{ unsigned char cleartext[64]; 352{ unsigned char iv[sizeof(test_iv)];
204 unsigned char iv[sizeof(test_iv)]; 353 unsigned char cleartext[64],ciphertext[64];
205 unsigned char ciphertext[64];
206 size_t tail; 354 size_t tail;
207 355
208 printf("vector_%d\n",len); fflush(stdout); 356 printf("vector_%d\n",len); fflush(stdout);
@@ -243,7 +391,57 @@ void test_vector(const unsigned char *vector,size_t len)
243 fprintf(stderr,"iv_%d mismatch\n",len), exit(4); 391 fprintf(stderr,"iv_%d mismatch\n",len), exit(4);
244} 392}
245 393
246main() 394void test_nistvector(const unsigned char *vector,size_t len)
395{ unsigned char iv[sizeof(test_iv)];
396 unsigned char cleartext[64],ciphertext[64],nistvector[64];
397 size_t tail;
398
399 printf("nistvector_%d\n",len); fflush(stdout);
400
401 if ((tail=len%16) == 0) tail = 16;
402
403 len -= 16 + tail;
404 memcpy(nistvector,vector,len);
405 /* flip two last blocks */
406 memcpy(nistvector+len,vector+len+16,tail);
407 memcpy(nistvector+len+tail,vector+len,16);
408 len += 16 + tail;
409 tail = 16;
410
411 /* test block-based encryption */
412 memcpy(iv,test_iv,sizeof(test_iv));
413 CRYPTO_nistcts128_encrypt_block(test_input,ciphertext,len,&encks,iv,(block128_f)AES_encrypt);
414 if (memcmp(ciphertext,nistvector,len))
415 fprintf(stderr,"output_%d mismatch\n",len), exit(1);
416 if (memcmp(iv,nistvector+len-tail,sizeof(iv)))
417 fprintf(stderr,"iv_%d mismatch\n",len), exit(1);
418
419 /* test block-based decryption */
420 memcpy(iv,test_iv,sizeof(test_iv));
421 CRYPTO_nistcts128_decrypt_block(ciphertext,cleartext,len,&decks,iv,(block128_f)AES_decrypt);
422 if (memcmp(cleartext,test_input,len))
423 fprintf(stderr,"input_%d mismatch\n",len), exit(2);
424 if (memcmp(iv,nistvector+len-tail,sizeof(iv)))
425 fprintf(stderr,"iv_%d mismatch\n",len), exit(2);
426
427 /* test streamed encryption */
428 memcpy(iv,test_iv,sizeof(test_iv));
429 CRYPTO_nistcts128_encrypt(test_input,ciphertext,len,&encks,iv,(cbc128_f)AES_cbc_encrypt);
430 if (memcmp(ciphertext,nistvector,len))
431 fprintf(stderr,"output_%d mismatch\n",len), exit(3);
432 if (memcmp(iv,nistvector+len-tail,sizeof(iv)))
433 fprintf(stderr,"iv_%d mismatch\n",len), exit(3);
434
435 /* test streamed decryption */
436 memcpy(iv,test_iv,sizeof(test_iv));
437 CRYPTO_nistcts128_decrypt(ciphertext,cleartext,len,&decks,iv,(cbc128_f)AES_cbc_encrypt);
438 if (memcmp(cleartext,test_input,len))
439 fprintf(stderr,"input_%d mismatch\n",len), exit(4);
440 if (memcmp(iv,nistvector+len-tail,sizeof(iv)))
441 fprintf(stderr,"iv_%d mismatch\n",len), exit(4);
442}
443
444int main()
247{ 445{
248 AES_set_encrypt_key(test_key,128,&encks); 446 AES_set_encrypt_key(test_key,128,&encks);
249 AES_set_decrypt_key(test_key,128,&decks); 447 AES_set_decrypt_key(test_key,128,&decks);
@@ -254,6 +452,14 @@ main()
254 test_vector(vector_47,sizeof(vector_47)); 452 test_vector(vector_47,sizeof(vector_47));
255 test_vector(vector_48,sizeof(vector_48)); 453 test_vector(vector_48,sizeof(vector_48));
256 test_vector(vector_64,sizeof(vector_64)); 454 test_vector(vector_64,sizeof(vector_64));
257 exit(0); 455
456 test_nistvector(vector_17,sizeof(vector_17));
457 test_nistvector(vector_31,sizeof(vector_31));
458 test_nistvector(vector_32,sizeof(vector_32));
459 test_nistvector(vector_47,sizeof(vector_47));
460 test_nistvector(vector_48,sizeof(vector_48));
461 test_nistvector(vector_64,sizeof(vector_64));
462
463 return 0;
258} 464}
259#endif 465#endif
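A minimal usage sketch of the NIST-style interface added above, illustrative only: it assumes the CRYPTO_nistcts128_* prototypes are exported through the modes.h update accompanying this import, reuses the (cbc128_f)AES_cbc_encrypt cast already used by the selftest, and the function name nistcts_encrypt is invented for the example. The input must be at least one block (16 bytes).

	#include <openssl/aes.h>
	#include <openssl/modes.h>

	/* Illustrative only: CS-3 ("NIST") ciphertext stealing over AES-CBC. */
	static size_t nistcts_encrypt(const unsigned char key[16],
	                              unsigned char ivec[16],
	                              const unsigned char *in,
	                              unsigned char *out, size_t len)
	{
		AES_KEY aes;

		AES_set_encrypt_key(key, 128, &aes);
		/* returns the number of bytes written (== len), or 0 if len < 16 */
		return CRYPTO_nistcts128_encrypt(in, out, len, &aes, ivec,
		                                 (cbc128_f)AES_cbc_encrypt);
	}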
diff --git a/src/lib/libcrypto/modes/gcm128.c b/src/lib/libcrypto/modes/gcm128.c
new file mode 100644
index 0000000000..7d6d034970
--- /dev/null
+++ b/src/lib/libcrypto/modes/gcm128.c
@@ -0,0 +1,1757 @@
1/* ====================================================================
2 * Copyright (c) 2010 The OpenSSL Project. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in
13 * the documentation and/or other materials provided with the
14 * distribution.
15 *
16 * 3. All advertising materials mentioning features or use of this
17 * software must display the following acknowledgment:
18 * "This product includes software developed by the OpenSSL Project
19 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
20 *
21 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
22 * endorse or promote products derived from this software without
23 * prior written permission. For written permission, please contact
24 * openssl-core@openssl.org.
25 *
26 * 5. Products derived from this software may not be called "OpenSSL"
27 * nor may "OpenSSL" appear in their names without prior written
28 * permission of the OpenSSL Project.
29 *
30 * 6. Redistributions of any form whatsoever must retain the following
31 * acknowledgment:
32 * "This product includes software developed by the OpenSSL Project
33 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
34 *
35 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
36 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
39 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
41 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
42 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
43 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
44 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
45 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
46 * OF THE POSSIBILITY OF SUCH DAMAGE.
47 * ====================================================================
48 */
49
50#define OPENSSL_FIPSAPI
51
52#include <openssl/crypto.h>
53#include "modes_lcl.h"
54#include <string.h>
55
56#ifndef MODES_DEBUG
57# ifndef NDEBUG
58# define NDEBUG
59# endif
60#endif
61#include <assert.h>
62
63#if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
64/* redefine, because alignment is ensured */
65#undef GETU32
66#define GETU32(p) BSWAP4(*(const u32 *)(p))
67#undef PUTU32
68#define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v)
69#endif
70
71#define PACK(s) ((size_t)(s)<<(sizeof(size_t)*8-16))
72#define REDUCE1BIT(V) do { \
73 if (sizeof(size_t)==8) { \
74 u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
75 V.lo = (V.hi<<63)|(V.lo>>1); \
76 V.hi = (V.hi>>1 )^T; \
77 } \
78 else { \
79 u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
80 V.lo = (V.hi<<63)|(V.lo>>1); \
81 V.hi = (V.hi>>1 )^((u64)T<<32); \
82 } \
83} while(0)
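The REDUCE1BIT macro above multiplies V by x modulo the GCM polynomial x^128 + x^7 + x^2 + x + 1; in GCM's bit-reflected representation that reduction shows up as a one-bit right shift plus a conditional XOR with 0xe1 in the top byte. A standalone restatement of the 64-bit branch, as an editorial sketch only (the fixed-width type names below are illustrative, not from this file):

#include <stdint.h>

typedef struct { uint64_t hi, lo; } u128_sketch;

static void reduce1bit_sketch(u128_sketch *V)
{
	/* fold in 0xe1<<120 iff the bit shifted out (lsb of lo) is set */
	uint64_t T = UINT64_C(0xe100000000000000) & (0 - (V->lo & 1));

	V->lo = (V->hi << 63) | (V->lo >> 1);
	V->hi = (V->hi >> 1) ^ T;
}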
84
85/*
86 * Even though the permitted values for TABLE_BITS are 8, 4 and 1, it
87 * should never be set to 8: 8 is effectively reserved for testing
88 * purposes. TABLE_BITS>1 selects the lookup-table-driven implementations
89 * referred to as "Shoup's" in the GCM specification, so OpenSSL does not
90 * cover the whole spectrum of possible table-driven implementations.
91 * Why? In the non-"Shoup's" case the memory access pattern is segmented
92 * in such a manner that cache-timing information can trivially reveal a
93 * fair portion of the intermediate hash value. Given that the ciphertext
94 * is always available to an attacker, this would allow an attempt to
95 * deduce the secret parameter H and, if successful, to tamper with
96 * messages [which is trivial in CTR mode]. In the "Shoup's" case it is
97 * not as easy, but there is no reason to believe that it is resistant
98 * to cache-timing attacks either. As for the "8-bit" implementation, it
99 * consumes 16 (sixteen) times more memory, 4KB per individual key plus
100 * 1KB shared. On the pro side it should be roughly twice as fast as the
101 * "4-bit" version; for gcc-generated x86[_64] code the "8-bit" version
102 * was observed to run ~75% faster, closer to 100% with commercial
103 * compilers... Yet the "4-bit" procedure is preferred, because it is
104 * believed to provide a better security-performance balance and adequate
105 * all-round performance. "All-round" refers to things like:
106 *
107 * - shorter setup time, which effectively improves overall timing for
108 * handling short messages;
109 * - larger table allocation can become unbearable because of VM
110 * subsystem penalties (for example on Windows a large enough free
111 * operation results in VM working set trimming, meaning that a
112 * subsequent malloc would immediately incur working set expansion);
113 * - a larger table has a larger cache footprint, which can affect the
114 * performance of other code paths (not necessarily even from the same
115 * thread in a Hyper-Threading world);
116 *
117 * A value of 1 is not appropriate, for performance reasons.
118 */
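For reference, the table sizes quoted above follow directly from the parameters: a "4-bit" table has 2^4 = 16 entries of 16 bytes each, i.e. 16*16 = 256 bytes per key (plus the 16-entry shared rem_4bit table), while an "8-bit" table has 2^8 = 256 entries, i.e. 256*16 = 4096 bytes per key (plus the 256-entry shared rem_8bit table) — hence the sixteen-fold memory difference.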
119#if TABLE_BITS==8
120
121static void gcm_init_8bit(u128 Htable[256], u64 H[2])
122{
123 int i, j;
124 u128 V;
125
126 Htable[0].hi = 0;
127 Htable[0].lo = 0;
128 V.hi = H[0];
129 V.lo = H[1];
130
131 for (Htable[128]=V, i=64; i>0; i>>=1) {
132 REDUCE1BIT(V);
133 Htable[i] = V;
134 }
135
136 for (i=2; i<256; i<<=1) {
137 u128 *Hi = Htable+i, H0 = *Hi;
138 for (j=1; j<i; ++j) {
139 Hi[j].hi = H0.hi^Htable[j].hi;
140 Hi[j].lo = H0.lo^Htable[j].lo;
141 }
142 }
143}
144
145static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
146{
147 u128 Z = { 0, 0};
148 const u8 *xi = (const u8 *)Xi+15;
149 size_t rem, n = *xi;
150 const union { long one; char little; } is_endian = {1};
151 static const size_t rem_8bit[256] = {
152 PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
153 PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
154 PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
155 PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
156 PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
157 PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
158 PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
159 PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
160 PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
161 PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
162 PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
163 PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
164 PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
165 PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
166 PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
167 PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
168 PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
169 PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
170 PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
171 PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
172 PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
173 PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
174 PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
175 PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
176 PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
177 PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
178 PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
179 PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
180 PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
181 PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
182 PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
183 PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
184 PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
185 PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
186 PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
187 PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
188 PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
189 PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
190 PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
191 PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
192 PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
193 PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
194 PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
195 PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
196 PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
197 PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
198 PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
199 PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
200 PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
201 PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
202 PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
203 PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
204 PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
205 PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
206 PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
207 PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
208 PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
209 PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
210 PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
211 PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
212 PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
213 PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
214 PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
215 PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
216
217 while (1) {
218 Z.hi ^= Htable[n].hi;
219 Z.lo ^= Htable[n].lo;
220
221 if ((u8 *)Xi==xi) break;
222
223 n = *(--xi);
224
225 rem = (size_t)Z.lo&0xff;
226 Z.lo = (Z.hi<<56)|(Z.lo>>8);
227 Z.hi = (Z.hi>>8);
228 if (sizeof(size_t)==8)
229 Z.hi ^= rem_8bit[rem];
230 else
231 Z.hi ^= (u64)rem_8bit[rem]<<32;
232 }
233
234 if (is_endian.little) {
235#ifdef BSWAP8
236 Xi[0] = BSWAP8(Z.hi);
237 Xi[1] = BSWAP8(Z.lo);
238#else
239 u8 *p = (u8 *)Xi;
240 u32 v;
241 v = (u32)(Z.hi>>32); PUTU32(p,v);
242 v = (u32)(Z.hi); PUTU32(p+4,v);
243 v = (u32)(Z.lo>>32); PUTU32(p+8,v);
244 v = (u32)(Z.lo); PUTU32(p+12,v);
245#endif
246 }
247 else {
248 Xi[0] = Z.hi;
249 Xi[1] = Z.lo;
250 }
251}
252#define GCM_MUL(ctx,Xi) gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
253
254#elif TABLE_BITS==4
255
256static void gcm_init_4bit(u128 Htable[16], u64 H[2])
257{
258 u128 V;
259#if defined(OPENSSL_SMALL_FOOTPRINT)
260 int i;
261#endif
262
263 Htable[0].hi = 0;
264 Htable[0].lo = 0;
265 V.hi = H[0];
266 V.lo = H[1];
267
268#if defined(OPENSSL_SMALL_FOOTPRINT)
269 for (Htable[8]=V, i=4; i>0; i>>=1) {
270 REDUCE1BIT(V);
271 Htable[i] = V;
272 }
273
274 for (i=2; i<16; i<<=1) {
275 u128 *Hi = Htable+i;
276 int j;
277 for (V=*Hi, j=1; j<i; ++j) {
278 Hi[j].hi = V.hi^Htable[j].hi;
279 Hi[j].lo = V.lo^Htable[j].lo;
280 }
281 }
282#else
283 Htable[8] = V;
284 REDUCE1BIT(V);
285 Htable[4] = V;
286 REDUCE1BIT(V);
287 Htable[2] = V;
288 REDUCE1BIT(V);
289 Htable[1] = V;
290 Htable[3].hi = V.hi^Htable[2].hi, Htable[3].lo = V.lo^Htable[2].lo;
291 V=Htable[4];
292 Htable[5].hi = V.hi^Htable[1].hi, Htable[5].lo = V.lo^Htable[1].lo;
293 Htable[6].hi = V.hi^Htable[2].hi, Htable[6].lo = V.lo^Htable[2].lo;
294 Htable[7].hi = V.hi^Htable[3].hi, Htable[7].lo = V.lo^Htable[3].lo;
295 V=Htable[8];
296 Htable[9].hi = V.hi^Htable[1].hi, Htable[9].lo = V.lo^Htable[1].lo;
297 Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
298 Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
299 Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
300 Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
301 Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
302 Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
303#endif
304#if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
305 /*
306 * ARM assembler expects specific dword order in Htable.
307 */
308 {
309 int j;
310 const union { long one; char little; } is_endian = {1};
311
312 if (is_endian.little)
313 for (j=0;j<16;++j) {
314 V = Htable[j];
315 Htable[j].hi = V.lo;
316 Htable[j].lo = V.hi;
317 }
318 else
319 for (j=0;j<16;++j) {
320 V = Htable[j];
321 Htable[j].hi = V.lo<<32|V.lo>>32;
322 Htable[j].lo = V.hi<<32|V.hi>>32;
323 }
324 }
325#endif
326}
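Note that the non-small-footprint construction above fills only the power-of-two slots by repeated multiplication by x and derives every other entry by XOR, so the finished table is XOR-linear in its index: Htable[i ^ j] == Htable[i] ^ Htable[j] for all 4-bit i, j. A small editorial self-check sketch (u128_sketch stands in for this file's u128):

#include <stdint.h>

typedef struct { uint64_t hi, lo; } u128_sketch;

/* returns 1 iff Htable[i^j] == Htable[i] ^ Htable[j] for all i, j */
static int htable_is_linear(const u128_sketch Htable[16])
{
	int i, j;

	for (i = 0; i < 16; ++i)
		for (j = 0; j < 16; ++j)
			if (Htable[i ^ j].hi != (Htable[i].hi ^ Htable[j].hi) ||
			    Htable[i ^ j].lo != (Htable[i].lo ^ Htable[j].lo))
				return 0;
	return 1;
}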
327
328#ifndef GHASH_ASM
329static const size_t rem_4bit[16] = {
330 PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
331 PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
332 PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
333 PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
334
335static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
336{
337 u128 Z;
338 int cnt = 15;
339 size_t rem, nlo, nhi;
340 const union { long one; char little; } is_endian = {1};
341
342 nlo = ((const u8 *)Xi)[15];
343 nhi = nlo>>4;
344 nlo &= 0xf;
345
346 Z.hi = Htable[nlo].hi;
347 Z.lo = Htable[nlo].lo;
348
349 while (1) {
350 rem = (size_t)Z.lo&0xf;
351 Z.lo = (Z.hi<<60)|(Z.lo>>4);
352 Z.hi = (Z.hi>>4);
353 if (sizeof(size_t)==8)
354 Z.hi ^= rem_4bit[rem];
355 else
356 Z.hi ^= (u64)rem_4bit[rem]<<32;
357
358 Z.hi ^= Htable[nhi].hi;
359 Z.lo ^= Htable[nhi].lo;
360
361 if (--cnt<0) break;
362
363 nlo = ((const u8 *)Xi)[cnt];
364 nhi = nlo>>4;
365 nlo &= 0xf;
366
367 rem = (size_t)Z.lo&0xf;
368 Z.lo = (Z.hi<<60)|(Z.lo>>4);
369 Z.hi = (Z.hi>>4);
370 if (sizeof(size_t)==8)
371 Z.hi ^= rem_4bit[rem];
372 else
373 Z.hi ^= (u64)rem_4bit[rem]<<32;
374
375 Z.hi ^= Htable[nlo].hi;
376 Z.lo ^= Htable[nlo].lo;
377 }
378
379 if (is_endian.little) {
380#ifdef BSWAP8
381 Xi[0] = BSWAP8(Z.hi);
382 Xi[1] = BSWAP8(Z.lo);
383#else
384 u8 *p = (u8 *)Xi;
385 u32 v;
386 v = (u32)(Z.hi>>32); PUTU32(p,v);
387 v = (u32)(Z.hi); PUTU32(p+4,v);
388 v = (u32)(Z.lo>>32); PUTU32(p+8,v);
389 v = (u32)(Z.lo); PUTU32(p+12,v);
390#endif
391 }
392 else {
393 Xi[0] = Z.hi;
394 Xi[1] = Z.lo;
395 }
396}
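Together, gcm_init_4bit() and gcm_gmult_4bit() implement one GHASH multiplication. A usage sketch in terms of this file's types (H is assumed to already hold E_K(0^128) in host byte order, which is how CRYPTO_gcm128_init() below prepares it):

u64  H[2];		/* hash subkey, host byte order */
u128 Htable[16];	/* per-key table, 256 bytes */
u64  Xi[2];		/* running hash value, updated in place */

gcm_init_4bit(Htable, H);
/* ... XOR a 16-byte block into Xi, then: */
gcm_gmult_4bit(Xi, Htable);	/* Xi = Xi * H in GF(2^128) */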
397
398#if !defined(OPENSSL_SMALL_FOOTPRINT)
399/*
400 * Streamed variant of gcm_gmult_4bit; see CRYPTO_gcm128_[en|de]crypt
401 * for details... Compiler-generated code doesn't seem to give any
402 * performance improvement, at least not on x86[_64]. It's here
403 * mostly as a reference and a placeholder for possible future
404 * non-trivial optimization[s]...
405 */
406static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
407 const u8 *inp,size_t len)
408{
409 u128 Z;
410 int cnt;
411 size_t rem, nlo, nhi;
412 const union { long one; char little; } is_endian = {1};
413
414#if 1
415 do {
416 cnt = 15;
417 nlo = ((const u8 *)Xi)[15];
418 nlo ^= inp[15];
419 nhi = nlo>>4;
420 nlo &= 0xf;
421
422 Z.hi = Htable[nlo].hi;
423 Z.lo = Htable[nlo].lo;
424
425 while (1) {
426 rem = (size_t)Z.lo&0xf;
427 Z.lo = (Z.hi<<60)|(Z.lo>>4);
428 Z.hi = (Z.hi>>4);
429 if (sizeof(size_t)==8)
430 Z.hi ^= rem_4bit[rem];
431 else
432 Z.hi ^= (u64)rem_4bit[rem]<<32;
433
434 Z.hi ^= Htable[nhi].hi;
435 Z.lo ^= Htable[nhi].lo;
436
437 if (--cnt<0) break;
438
439 nlo = ((const u8 *)Xi)[cnt];
440 nlo ^= inp[cnt];
441 nhi = nlo>>4;
442 nlo &= 0xf;
443
444 rem = (size_t)Z.lo&0xf;
445 Z.lo = (Z.hi<<60)|(Z.lo>>4);
446 Z.hi = (Z.hi>>4);
447 if (sizeof(size_t)==8)
448 Z.hi ^= rem_4bit[rem];
449 else
450 Z.hi ^= (u64)rem_4bit[rem]<<32;
451
452 Z.hi ^= Htable[nlo].hi;
453 Z.lo ^= Htable[nlo].lo;
454 }
455#else
456		/*
457		 * The extra 256+16 bytes per key plus 512 bytes of shared
458		 * tables [should] give a ~50% improvement... One could have
459		 * PACK()-ed rem_8bit even here, but the priority is to
460		 * minimize the cache footprint...
461		 */
462 u128 Hshr4[16]; /* Htable shifted right by 4 bits */
463 u8 Hshl4[16]; /* Htable shifted left by 4 bits */
464 static const unsigned short rem_8bit[256] = {
465 0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
466 0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
467 0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
468 0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
469 0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
470 0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
471 0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
472 0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
473 0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
474 0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
475 0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
476 0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
477 0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
478 0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
479 0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
480 0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
481 0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
482 0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
483 0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
484 0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
485 0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
486 0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
487 0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
488 0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
489 0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
490 0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
491 0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
492 0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
493 0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
494 0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
495 0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
496 0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
497	/*
498	 * This pre-processing phase slows the procedure down by roughly
499	 * the same amount of time as it makes each loop spin faster, so
500	 * single-block performance matches the straightforward "4-bit"
501	 * implementation and anything longer only gets faster...
502	 */
503 for (cnt=0; cnt<16; ++cnt) {
504 Z.hi = Htable[cnt].hi;
505 Z.lo = Htable[cnt].lo;
506 Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
507 Hshr4[cnt].hi = (Z.hi>>4);
508 Hshl4[cnt] = (u8)(Z.lo<<4);
509 }
510
511 do {
512 for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
513 nlo = ((const u8 *)Xi)[cnt];
514 nlo ^= inp[cnt];
515 nhi = nlo>>4;
516 nlo &= 0xf;
517
518 Z.hi ^= Htable[nlo].hi;
519 Z.lo ^= Htable[nlo].lo;
520
521 rem = (size_t)Z.lo&0xff;
522
523 Z.lo = (Z.hi<<56)|(Z.lo>>8);
524 Z.hi = (Z.hi>>8);
525
526 Z.hi ^= Hshr4[nhi].hi;
527 Z.lo ^= Hshr4[nhi].lo;
528 Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
529 }
530
531 nlo = ((const u8 *)Xi)[0];
532 nlo ^= inp[0];
533 nhi = nlo>>4;
534 nlo &= 0xf;
535
536 Z.hi ^= Htable[nlo].hi;
537 Z.lo ^= Htable[nlo].lo;
538
539 rem = (size_t)Z.lo&0xf;
540
541 Z.lo = (Z.hi<<60)|(Z.lo>>4);
542 Z.hi = (Z.hi>>4);
543
544 Z.hi ^= Htable[nhi].hi;
545 Z.lo ^= Htable[nhi].lo;
546 Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
547#endif
548
549 if (is_endian.little) {
550#ifdef BSWAP8
551 Xi[0] = BSWAP8(Z.hi);
552 Xi[1] = BSWAP8(Z.lo);
553#else
554 u8 *p = (u8 *)Xi;
555 u32 v;
556 v = (u32)(Z.hi>>32); PUTU32(p,v);
557 v = (u32)(Z.hi); PUTU32(p+4,v);
558 v = (u32)(Z.lo>>32); PUTU32(p+8,v);
559 v = (u32)(Z.lo); PUTU32(p+12,v);
560#endif
561 }
562 else {
563 Xi[0] = Z.hi;
564 Xi[1] = Z.lo;
565 }
566 } while (inp+=16, len-=16);
567}
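The streamed routine above is an aggressively scheduled equivalent of folding each 16-byte input block into Xi and multiplying by H. A plain editorial restatement in terms of gcm_gmult_4bit() (sketch only; len is assumed to be a multiple of 16, as it is at every call site):

static void gcm_ghash_4bit_sketch(u64 Xi[2], const u128 Htable[16],
				  const u8 *inp, size_t len)
{
	size_t i;

	while (len >= 16) {
		for (i = 0; i < 16; ++i)
			((u8 *)Xi)[i] ^= inp[i];	/* Xi ^= block */
		gcm_gmult_4bit(Xi, Htable);		/* Xi *= H */
		inp += 16;
		len -= 16;
	}
}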
568#endif
569#else
570void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
571void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
572#endif
573
574#define GCM_MUL(ctx,Xi) gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
575#if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
576#define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
577/* GHASH_CHUNK is a "stride parameter" meant to mitigate the cache
578 * thrashing effect. In other words, the idea is to hash data while it
579 * is still in the L1 cache after the encryption pass... */
580#define GHASH_CHUNK (3*1024)
581#endif
582
583#else /* TABLE_BITS */
584
585static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
586{
587 u128 V,Z = { 0,0 };
588 long X;
589 int i,j;
590 const long *xi = (const long *)Xi;
591 const union { long one; char little; } is_endian = {1};
592
593 V.hi = H[0]; /* H is in host byte order, no byte swapping */
594 V.lo = H[1];
595
596 for (j=0; j<16/sizeof(long); ++j) {
597 if (is_endian.little) {
598 if (sizeof(long)==8) {
599#ifdef BSWAP8
600 X = (long)(BSWAP8(xi[j]));
601#else
602 const u8 *p = (const u8 *)(xi+j);
603 X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
604#endif
605 }
606 else {
607 const u8 *p = (const u8 *)(xi+j);
608 X = (long)GETU32(p);
609 }
610 }
611 else
612 X = xi[j];
613
614 for (i=0; i<8*sizeof(long); ++i, X<<=1) {
615 u64 M = (u64)(X>>(8*sizeof(long)-1));
616 Z.hi ^= V.hi&M;
617 Z.lo ^= V.lo&M;
618
619 REDUCE1BIT(V);
620 }
621 }
622
623 if (is_endian.little) {
624#ifdef BSWAP8
625 Xi[0] = BSWAP8(Z.hi);
626 Xi[1] = BSWAP8(Z.lo);
627#else
628 u8 *p = (u8 *)Xi;
629 u32 v;
630 v = (u32)(Z.hi>>32); PUTU32(p,v);
631 v = (u32)(Z.hi); PUTU32(p+4,v);
632 v = (u32)(Z.lo>>32); PUTU32(p+8,v);
633 v = (u32)(Z.lo); PUTU32(p+12,v);
634#endif
635 }
636 else {
637 Xi[0] = Z.hi;
638 Xi[1] = Z.lo;
639 }
640}
641#define GCM_MUL(ctx,Xi) gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
642
643#endif
644
645#if TABLE_BITS==4 && defined(GHASH_ASM)
646# if !defined(I386_ONLY) && \
647 (defined(__i386) || defined(__i386__) || \
648 defined(__x86_64) || defined(__x86_64__) || \
649 defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
650# define GHASH_ASM_X86_OR_64
651# define GCM_FUNCREF_4BIT
652extern unsigned int OPENSSL_ia32cap_P[2];
653
654void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
655void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
656void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
657
658# if defined(__i386) || defined(__i386__) || defined(_M_IX86)
659# define GHASH_ASM_X86
660void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
661void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
662
663void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
664void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
665# endif
666# elif defined(__arm__) || defined(__arm)
667# include "arm_arch.h"
668# if __ARM_ARCH__>=7
669# define GHASH_ASM_ARM
670# define GCM_FUNCREF_4BIT
671void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
672void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
673# endif
674# endif
675#endif
676
677#ifdef GCM_FUNCREF_4BIT
678# undef GCM_MUL
679# define GCM_MUL(ctx,Xi) (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
680# ifdef GHASH
681# undef GHASH
682# define GHASH(ctx,in,len) (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
683# endif
684#endif
685
686void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
687{
688 const union { long one; char little; } is_endian = {1};
689
690 memset(ctx,0,sizeof(*ctx));
691 ctx->block = block;
692 ctx->key = key;
693
694 (*block)(ctx->H.c,ctx->H.c,key);
695
696 if (is_endian.little) {
697 /* H is stored in host byte order */
698#ifdef BSWAP8
699 ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
700 ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
701#else
702 u8 *p = ctx->H.c;
703 u64 hi,lo;
704 hi = (u64)GETU32(p) <<32|GETU32(p+4);
705 lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
706 ctx->H.u[0] = hi;
707 ctx->H.u[1] = lo;
708#endif
709 }
710
711#if TABLE_BITS==8
712 gcm_init_8bit(ctx->Htable,ctx->H.u);
713#elif TABLE_BITS==4
714# if defined(GHASH_ASM_X86_OR_64)
715# if !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
716 if (OPENSSL_ia32cap_P[0]&(1<<24) && /* check FXSR bit */
717 OPENSSL_ia32cap_P[1]&(1<<1) ) { /* check PCLMULQDQ bit */
718 gcm_init_clmul(ctx->Htable,ctx->H.u);
719 ctx->gmult = gcm_gmult_clmul;
720 ctx->ghash = gcm_ghash_clmul;
721 return;
722 }
723# endif
724 gcm_init_4bit(ctx->Htable,ctx->H.u);
725# if defined(GHASH_ASM_X86) /* x86 only */
726# if defined(OPENSSL_IA32_SSE2)
727 if (OPENSSL_ia32cap_P[0]&(1<<25)) { /* check SSE bit */
728# else
729 if (OPENSSL_ia32cap_P[0]&(1<<23)) { /* check MMX bit */
730# endif
731 ctx->gmult = gcm_gmult_4bit_mmx;
732 ctx->ghash = gcm_ghash_4bit_mmx;
733 } else {
734 ctx->gmult = gcm_gmult_4bit_x86;
735 ctx->ghash = gcm_ghash_4bit_x86;
736 }
737# else
738 ctx->gmult = gcm_gmult_4bit;
739 ctx->ghash = gcm_ghash_4bit;
740# endif
741# elif defined(GHASH_ASM_ARM)
742 if (OPENSSL_armcap_P & ARMV7_NEON) {
743 ctx->gmult = gcm_gmult_neon;
744 ctx->ghash = gcm_ghash_neon;
745 } else {
746 gcm_init_4bit(ctx->Htable,ctx->H.u);
747 ctx->gmult = gcm_gmult_4bit;
748 ctx->ghash = gcm_ghash_4bit;
749 }
750# else
751 gcm_init_4bit(ctx->Htable,ctx->H.u);
752# endif
753#endif
754}
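CRYPTO_gcm128_init() derives the hash subkey as H = E_K(0^128) (the block cipher applied to the all-zero block), converts it to host byte order and then selects the fastest gmult/ghash pair available on the platform. A typical caller, mirroring the self-test at the end of this file (variable names are placeholders):

AES_KEY aes;
GCM128_CONTEXT gcm;

AES_set_encrypt_key(key_bytes, 128, &aes);
CRYPTO_gcm128_init(&gcm, &aes, (block128_f)AES_encrypt);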
755
756void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
757{
758 const union { long one; char little; } is_endian = {1};
759 unsigned int ctr;
760#ifdef GCM_FUNCREF_4BIT
761 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
762#endif
763
764 ctx->Yi.u[0] = 0;
765 ctx->Yi.u[1] = 0;
766 ctx->Xi.u[0] = 0;
767 ctx->Xi.u[1] = 0;
768 ctx->len.u[0] = 0; /* AAD length */
769 ctx->len.u[1] = 0; /* message length */
770 ctx->ares = 0;
771 ctx->mres = 0;
772
773 if (len==12) {
774 memcpy(ctx->Yi.c,iv,12);
775 ctx->Yi.c[15]=1;
776 ctr=1;
777 }
778 else {
779 size_t i;
780 u64 len0 = len;
781
782 while (len>=16) {
783 for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
784 GCM_MUL(ctx,Yi);
785 iv += 16;
786 len -= 16;
787 }
788 if (len) {
789 for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
790 GCM_MUL(ctx,Yi);
791 }
792 len0 <<= 3;
793 if (is_endian.little) {
794#ifdef BSWAP8
795 ctx->Yi.u[1] ^= BSWAP8(len0);
796#else
797 ctx->Yi.c[8] ^= (u8)(len0>>56);
798 ctx->Yi.c[9] ^= (u8)(len0>>48);
799 ctx->Yi.c[10] ^= (u8)(len0>>40);
800 ctx->Yi.c[11] ^= (u8)(len0>>32);
801 ctx->Yi.c[12] ^= (u8)(len0>>24);
802 ctx->Yi.c[13] ^= (u8)(len0>>16);
803 ctx->Yi.c[14] ^= (u8)(len0>>8);
804 ctx->Yi.c[15] ^= (u8)(len0);
805#endif
806 }
807 else
808 ctx->Yi.u[1] ^= len0;
809
810 GCM_MUL(ctx,Yi);
811
812 if (is_endian.little)
813 ctr = GETU32(ctx->Yi.c+12);
814 else
815 ctr = ctx->Yi.d[3];
816 }
817
818 (*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
819 ++ctr;
820 if (is_endian.little)
821 PUTU32(ctx->Yi.c+12,ctr);
822 else
823 ctx->Yi.d[3] = ctr;
824}
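The IV handling above matches the GCM specification; restated as equations (editorial note, not part of the imported file):

	Y0 = IV || 0^31 || 1				when |IV| = 96 bits
	Y0 = GHASH_H(IV || 0^s || 0^64 || [|IV|]_64)	otherwise

where 0^s pads the IV to a 16-byte boundary and [|IV|]_64 is the IV length in bits as a 64-bit big-endian integer. EK0 = E_K(Y0) is then precomputed for the final tag, and the 32-bit counter in the last four bytes of Y0 is incremented so that encryption starts with Y1.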
825
826int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
827{
828 size_t i;
829 unsigned int n;
830 u64 alen = ctx->len.u[0];
831#ifdef GCM_FUNCREF_4BIT
832 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
833# ifdef GHASH
834 void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
835 const u8 *inp,size_t len) = ctx->ghash;
836# endif
837#endif
838
839 if (ctx->len.u[1]) return -2;
840
841 alen += len;
842 if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
843 return -1;
844 ctx->len.u[0] = alen;
845
846 n = ctx->ares;
847 if (n) {
848 while (n && len) {
849 ctx->Xi.c[n] ^= *(aad++);
850 --len;
851 n = (n+1)%16;
852 }
853 if (n==0) GCM_MUL(ctx,Xi);
854 else {
855 ctx->ares = n;
856 return 0;
857 }
858 }
859
860#ifdef GHASH
861 if ((i = (len&(size_t)-16))) {
862 GHASH(ctx,aad,i);
863 aad += i;
864 len -= i;
865 }
866#else
867 while (len>=16) {
868 for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
869 GCM_MUL(ctx,Xi);
870 aad += 16;
871 len -= 16;
872 }
873#endif
874 if (len) {
875 n = (unsigned int)len;
876 for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
877 }
878
879 ctx->ares = n;
880 return 0;
881}
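AAD may be supplied in several calls of arbitrary length, but only before the first encrypt/decrypt call; partial blocks are buffered in ctx->ares and the last partial block is folded in by the first encryption call. A usage fragment (buffer names and lengths are placeholders):

CRYPTO_gcm128_setiv(&gcm, iv, sizeof(iv));
CRYPTO_gcm128_aad(&gcm, header, header_len);	/* returns 0 on success */
CRYPTO_gcm128_aad(&gcm, more_header, more_len);	/* may be any length */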
882
883int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
884 const unsigned char *in, unsigned char *out,
885 size_t len)
886{
887 const union { long one; char little; } is_endian = {1};
888 unsigned int n, ctr;
889 size_t i;
890 u64 mlen = ctx->len.u[1];
891 block128_f block = ctx->block;
892 void *key = ctx->key;
893#ifdef GCM_FUNCREF_4BIT
894 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
895# ifdef GHASH
896 void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
897 const u8 *inp,size_t len) = ctx->ghash;
898# endif
899#endif
900
901#if 0
902 n = (unsigned int)mlen%16; /* alternative to ctx->mres */
903#endif
904 mlen += len;
905 if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
906 return -1;
907 ctx->len.u[1] = mlen;
908
909 if (ctx->ares) {
910 /* First call to encrypt finalizes GHASH(AAD) */
911 GCM_MUL(ctx,Xi);
912 ctx->ares = 0;
913 }
914
915 if (is_endian.little)
916 ctr = GETU32(ctx->Yi.c+12);
917 else
918 ctr = ctx->Yi.d[3];
919
920 n = ctx->mres;
921#if !defined(OPENSSL_SMALL_FOOTPRINT)
922 if (16%sizeof(size_t) == 0) do { /* always true actually */
923 if (n) {
924 while (n && len) {
925 ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
926 --len;
927 n = (n+1)%16;
928 }
929 if (n==0) GCM_MUL(ctx,Xi);
930 else {
931 ctx->mres = n;
932 return 0;
933 }
934 }
935#if defined(STRICT_ALIGNMENT)
936 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
937 break;
938#endif
939#if defined(GHASH) && defined(GHASH_CHUNK)
940 while (len>=GHASH_CHUNK) {
941 size_t j=GHASH_CHUNK;
942
943 while (j) {
944 (*block)(ctx->Yi.c,ctx->EKi.c,key);
945 ++ctr;
946 if (is_endian.little)
947 PUTU32(ctx->Yi.c+12,ctr);
948 else
949 ctx->Yi.d[3] = ctr;
950 for (i=0; i<16; i+=sizeof(size_t))
951 *(size_t *)(out+i) =
952 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
953 out += 16;
954 in += 16;
955 j -= 16;
956 }
957 GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
958 len -= GHASH_CHUNK;
959 }
960 if ((i = (len&(size_t)-16))) {
961 size_t j=i;
962
963 while (len>=16) {
964 (*block)(ctx->Yi.c,ctx->EKi.c,key);
965 ++ctr;
966 if (is_endian.little)
967 PUTU32(ctx->Yi.c+12,ctr);
968 else
969 ctx->Yi.d[3] = ctr;
970 for (i=0; i<16; i+=sizeof(size_t))
971 *(size_t *)(out+i) =
972 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
973 out += 16;
974 in += 16;
975 len -= 16;
976 }
977 GHASH(ctx,out-j,j);
978 }
979#else
980 while (len>=16) {
981 (*block)(ctx->Yi.c,ctx->EKi.c,key);
982 ++ctr;
983 if (is_endian.little)
984 PUTU32(ctx->Yi.c+12,ctr);
985 else
986 ctx->Yi.d[3] = ctr;
987 for (i=0; i<16; i+=sizeof(size_t))
988 *(size_t *)(ctx->Xi.c+i) ^=
989 *(size_t *)(out+i) =
990 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
991 GCM_MUL(ctx,Xi);
992 out += 16;
993 in += 16;
994 len -= 16;
995 }
996#endif
997 if (len) {
998 (*block)(ctx->Yi.c,ctx->EKi.c,key);
999 ++ctr;
1000 if (is_endian.little)
1001 PUTU32(ctx->Yi.c+12,ctr);
1002 else
1003 ctx->Yi.d[3] = ctr;
1004 while (len--) {
1005 ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
1006 ++n;
1007 }
1008 }
1009
1010 ctx->mres = n;
1011 return 0;
1012 } while(0);
1013#endif
1014 for (i=0;i<len;++i) {
1015 if (n==0) {
1016 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1017 ++ctr;
1018 if (is_endian.little)
1019 PUTU32(ctx->Yi.c+12,ctr);
1020 else
1021 ctx->Yi.d[3] = ctr;
1022 }
1023 ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
1024 n = (n+1)%16;
1025 if (n==0)
1026 GCM_MUL(ctx,Xi);
1027 }
1028
1029 ctx->mres = n;
1030 return 0;
1031}
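The mlen check above enforces GCM's per-invocation plaintext limit of 2^39 - 256 bits, i.e. (2^39 - 256)/8 = 2^36 - 32 bytes, which is exactly what ((U64(1)<<36)-32) expresses; exceeding that limit, or overflowing the 64-bit length accumulator, makes the call return -1 without processing any data.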
1032
1033int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
1034 const unsigned char *in, unsigned char *out,
1035 size_t len)
1036{
1037 const union { long one; char little; } is_endian = {1};
1038 unsigned int n, ctr;
1039 size_t i;
1040 u64 mlen = ctx->len.u[1];
1041 block128_f block = ctx->block;
1042 void *key = ctx->key;
1043#ifdef GCM_FUNCREF_4BIT
1044 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
1045# ifdef GHASH
1046 void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1047 const u8 *inp,size_t len) = ctx->ghash;
1048# endif
1049#endif
1050
1051 mlen += len;
1052 if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1053 return -1;
1054 ctx->len.u[1] = mlen;
1055
1056 if (ctx->ares) {
1057 /* First call to decrypt finalizes GHASH(AAD) */
1058 GCM_MUL(ctx,Xi);
1059 ctx->ares = 0;
1060 }
1061
1062 if (is_endian.little)
1063 ctr = GETU32(ctx->Yi.c+12);
1064 else
1065 ctr = ctx->Yi.d[3];
1066
1067 n = ctx->mres;
1068#if !defined(OPENSSL_SMALL_FOOTPRINT)
1069 if (16%sizeof(size_t) == 0) do { /* always true actually */
1070 if (n) {
1071 while (n && len) {
1072 u8 c = *(in++);
1073 *(out++) = c^ctx->EKi.c[n];
1074 ctx->Xi.c[n] ^= c;
1075 --len;
1076 n = (n+1)%16;
1077 }
1078 if (n==0) GCM_MUL (ctx,Xi);
1079 else {
1080 ctx->mres = n;
1081 return 0;
1082 }
1083 }
1084#if defined(STRICT_ALIGNMENT)
1085 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
1086 break;
1087#endif
1088#if defined(GHASH) && defined(GHASH_CHUNK)
1089 while (len>=GHASH_CHUNK) {
1090 size_t j=GHASH_CHUNK;
1091
1092 GHASH(ctx,in,GHASH_CHUNK);
1093 while (j) {
1094 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1095 ++ctr;
1096 if (is_endian.little)
1097 PUTU32(ctx->Yi.c+12,ctr);
1098 else
1099 ctx->Yi.d[3] = ctr;
1100 for (i=0; i<16; i+=sizeof(size_t))
1101 *(size_t *)(out+i) =
1102 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
1103 out += 16;
1104 in += 16;
1105 j -= 16;
1106 }
1107 len -= GHASH_CHUNK;
1108 }
1109 if ((i = (len&(size_t)-16))) {
1110 GHASH(ctx,in,i);
1111 while (len>=16) {
1112 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1113 ++ctr;
1114 if (is_endian.little)
1115 PUTU32(ctx->Yi.c+12,ctr);
1116 else
1117 ctx->Yi.d[3] = ctr;
1118 for (i=0; i<16; i+=sizeof(size_t))
1119 *(size_t *)(out+i) =
1120 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
1121 out += 16;
1122 in += 16;
1123 len -= 16;
1124 }
1125 }
1126#else
1127 while (len>=16) {
1128 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1129 ++ctr;
1130 if (is_endian.little)
1131 PUTU32(ctx->Yi.c+12,ctr);
1132 else
1133 ctx->Yi.d[3] = ctr;
1134 for (i=0; i<16; i+=sizeof(size_t)) {
1135 size_t c = *(size_t *)(in+i);
1136 *(size_t *)(out+i) = c^*(size_t *)(ctx->EKi.c+i);
1137 *(size_t *)(ctx->Xi.c+i) ^= c;
1138 }
1139 GCM_MUL(ctx,Xi);
1140 out += 16;
1141 in += 16;
1142 len -= 16;
1143 }
1144#endif
1145 if (len) {
1146 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1147 ++ctr;
1148 if (is_endian.little)
1149 PUTU32(ctx->Yi.c+12,ctr);
1150 else
1151 ctx->Yi.d[3] = ctr;
1152 while (len--) {
1153 u8 c = in[n];
1154 ctx->Xi.c[n] ^= c;
1155 out[n] = c^ctx->EKi.c[n];
1156 ++n;
1157 }
1158 }
1159
1160 ctx->mres = n;
1161 return 0;
1162 } while(0);
1163#endif
1164 for (i=0;i<len;++i) {
1165 u8 c;
1166 if (n==0) {
1167 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1168 ++ctr;
1169 if (is_endian.little)
1170 PUTU32(ctx->Yi.c+12,ctr);
1171 else
1172 ctx->Yi.d[3] = ctr;
1173 }
1174 c = in[i];
1175 out[i] = c^ctx->EKi.c[n];
1176 ctx->Xi.c[n] ^= c;
1177 n = (n+1)%16;
1178 if (n==0)
1179 GCM_MUL(ctx,Xi);
1180 }
1181
1182 ctx->mres = n;
1183 return 0;
1184}
1185
1186int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
1187 const unsigned char *in, unsigned char *out,
1188 size_t len, ctr128_f stream)
1189{
1190 const union { long one; char little; } is_endian = {1};
1191 unsigned int n, ctr;
1192 size_t i;
1193 u64 mlen = ctx->len.u[1];
1194 void *key = ctx->key;
1195#ifdef GCM_FUNCREF_4BIT
1196 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
1197# ifdef GHASH
1198 void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1199 const u8 *inp,size_t len) = ctx->ghash;
1200# endif
1201#endif
1202
1203 mlen += len;
1204 if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1205 return -1;
1206 ctx->len.u[1] = mlen;
1207
1208 if (ctx->ares) {
1209 /* First call to encrypt finalizes GHASH(AAD) */
1210 GCM_MUL(ctx,Xi);
1211 ctx->ares = 0;
1212 }
1213
1214 if (is_endian.little)
1215 ctr = GETU32(ctx->Yi.c+12);
1216 else
1217 ctr = ctx->Yi.d[3];
1218
1219 n = ctx->mres;
1220 if (n) {
1221 while (n && len) {
1222 ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
1223 --len;
1224 n = (n+1)%16;
1225 }
1226 if (n==0) GCM_MUL(ctx,Xi);
1227 else {
1228 ctx->mres = n;
1229 return 0;
1230 }
1231 }
1232#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1233 while (len>=GHASH_CHUNK) {
1234 (*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
1235 ctr += GHASH_CHUNK/16;
1236 if (is_endian.little)
1237 PUTU32(ctx->Yi.c+12,ctr);
1238 else
1239 ctx->Yi.d[3] = ctr;
1240 GHASH(ctx,out,GHASH_CHUNK);
1241 out += GHASH_CHUNK;
1242 in += GHASH_CHUNK;
1243 len -= GHASH_CHUNK;
1244 }
1245#endif
1246 if ((i = (len&(size_t)-16))) {
1247 size_t j=i/16;
1248
1249 (*stream)(in,out,j,key,ctx->Yi.c);
1250 ctr += (unsigned int)j;
1251 if (is_endian.little)
1252 PUTU32(ctx->Yi.c+12,ctr);
1253 else
1254 ctx->Yi.d[3] = ctr;
1255 in += i;
1256 len -= i;
1257#if defined(GHASH)
1258 GHASH(ctx,out,i);
1259 out += i;
1260#else
1261 while (j--) {
1262 for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
1263 GCM_MUL(ctx,Xi);
1264 out += 16;
1265 }
1266#endif
1267 }
1268 if (len) {
1269 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
1270 ++ctr;
1271 if (is_endian.little)
1272 PUTU32(ctx->Yi.c+12,ctr);
1273 else
1274 ctx->Yi.d[3] = ctr;
1275 while (len--) {
1276 ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
1277 ++n;
1278 }
1279 }
1280
1281 ctx->mres = n;
1282 return 0;
1283}
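CRYPTO_gcm128_encrypt_ctr32() offloads the bulk counter-mode work to a ctr128_f (declared in modes.h in this same import). The callback processes whole 16-byte blocks, treating the last four bytes of ivec as a 32-bit big-endian counter; the caller itself writes the advanced counter back into Yi afterwards. A generic editorial sketch of such a callback built on an arbitrary block128_f (illustrative only — a real ctr128_f, e.g. hand-optimized AES counter code, has the cipher baked in rather than taking it as an extra argument; relies on <string.h>, already included above):

static void ctr32_sketch(const unsigned char *in, unsigned char *out,
			 size_t blocks, const void *key,
			 const unsigned char ivec[16], block128_f block)
{
	unsigned char Y[16], E[16];
	unsigned int ctr, i;

	memcpy(Y, ivec, 16);
	ctr = ((unsigned int)Y[12] << 24) | ((unsigned int)Y[13] << 16) |
	      ((unsigned int)Y[14] << 8)  |  (unsigned int)Y[15];

	while (blocks--) {
		(*block)(Y, E, key);		/* E = E_K(Y) */
		for (i = 0; i < 16; ++i)
			out[i] = in[i] ^ E[i];
		in  += 16;
		out += 16;
		++ctr;				/* 32-bit big-endian increment */
		Y[12] = (unsigned char)(ctr >> 24);
		Y[13] = (unsigned char)(ctr >> 16);
		Y[14] = (unsigned char)(ctr >> 8);
		Y[15] = (unsigned char)(ctr);
	}
}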
1284
1285int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
1286 const unsigned char *in, unsigned char *out,
1287 size_t len,ctr128_f stream)
1288{
1289 const union { long one; char little; } is_endian = {1};
1290 unsigned int n, ctr;
1291 size_t i;
1292 u64 mlen = ctx->len.u[1];
1293 void *key = ctx->key;
1294#ifdef GCM_FUNCREF_4BIT
1295 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
1296# ifdef GHASH
1297 void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1298 const u8 *inp,size_t len) = ctx->ghash;
1299# endif
1300#endif
1301
1302 mlen += len;
1303 if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1304 return -1;
1305 ctx->len.u[1] = mlen;
1306
1307 if (ctx->ares) {
1308 /* First call to decrypt finalizes GHASH(AAD) */
1309 GCM_MUL(ctx,Xi);
1310 ctx->ares = 0;
1311 }
1312
1313 if (is_endian.little)
1314 ctr = GETU32(ctx->Yi.c+12);
1315 else
1316 ctr = ctx->Yi.d[3];
1317
1318 n = ctx->mres;
1319 if (n) {
1320 while (n && len) {
1321 u8 c = *(in++);
1322 *(out++) = c^ctx->EKi.c[n];
1323 ctx->Xi.c[n] ^= c;
1324 --len;
1325 n = (n+1)%16;
1326 }
1327 if (n==0) GCM_MUL (ctx,Xi);
1328 else {
1329 ctx->mres = n;
1330 return 0;
1331 }
1332 }
1333#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1334 while (len>=GHASH_CHUNK) {
1335 GHASH(ctx,in,GHASH_CHUNK);
1336 (*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
1337 ctr += GHASH_CHUNK/16;
1338 if (is_endian.little)
1339 PUTU32(ctx->Yi.c+12,ctr);
1340 else
1341 ctx->Yi.d[3] = ctr;
1342 out += GHASH_CHUNK;
1343 in += GHASH_CHUNK;
1344 len -= GHASH_CHUNK;
1345 }
1346#endif
1347 if ((i = (len&(size_t)-16))) {
1348 size_t j=i/16;
1349
1350#if defined(GHASH)
1351 GHASH(ctx,in,i);
1352#else
1353 while (j--) {
1354 size_t k;
1355 for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
1356 GCM_MUL(ctx,Xi);
1357 in += 16;
1358 }
1359 j = i/16;
1360 in -= i;
1361#endif
1362 (*stream)(in,out,j,key,ctx->Yi.c);
1363 ctr += (unsigned int)j;
1364 if (is_endian.little)
1365 PUTU32(ctx->Yi.c+12,ctr);
1366 else
1367 ctx->Yi.d[3] = ctr;
1368 out += i;
1369 in += i;
1370 len -= i;
1371 }
1372 if (len) {
1373 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
1374 ++ctr;
1375 if (is_endian.little)
1376 PUTU32(ctx->Yi.c+12,ctr);
1377 else
1378 ctx->Yi.d[3] = ctr;
1379 while (len--) {
1380 u8 c = in[n];
1381 ctx->Xi.c[n] ^= c;
1382 out[n] = c^ctx->EKi.c[n];
1383 ++n;
1384 }
1385 }
1386
1387 ctx->mres = n;
1388 return 0;
1389}
1390
1391int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
1392 size_t len)
1393{
1394 const union { long one; char little; } is_endian = {1};
1395 u64 alen = ctx->len.u[0]<<3;
1396 u64 clen = ctx->len.u[1]<<3;
1397#ifdef GCM_FUNCREF_4BIT
1398 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
1399#endif
1400
1401 if (ctx->mres)
1402 GCM_MUL(ctx,Xi);
1403
1404 if (is_endian.little) {
1405#ifdef BSWAP8
1406 alen = BSWAP8(alen);
1407 clen = BSWAP8(clen);
1408#else
1409 u8 *p = ctx->len.c;
1410
1411 ctx->len.u[0] = alen;
1412 ctx->len.u[1] = clen;
1413
1414 alen = (u64)GETU32(p) <<32|GETU32(p+4);
1415 clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
1416#endif
1417 }
1418
1419 ctx->Xi.u[0] ^= alen;
1420 ctx->Xi.u[1] ^= clen;
1421 GCM_MUL(ctx,Xi);
1422
1423 ctx->Xi.u[0] ^= ctx->EK0.u[0];
1424 ctx->Xi.u[1] ^= ctx->EK0.u[1];
1425
1426 if (tag && len<=sizeof(ctx->Xi))
1427 return memcmp(ctx->Xi.c,tag,len);
1428 else
1429 return -1;
1430}
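CRYPTO_gcm128_finish() completes the authentication tag as the specification defines it:

	T = E_K(Y0) XOR GHASH_H(A, C)

where the final GHASH block folded in above is [len(A)]_64 || [len(C)]_64 (both bit lengths). With a non-NULL tag the return value is the memcmp() result, i.e. 0 on a match; called with tag == NULL, as CRYPTO_gcm128_tag() does, it only finalizes Xi so the computed tag can be copied out.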
1431
1432void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1433{
1434 CRYPTO_gcm128_finish(ctx, NULL, 0);
1435 memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
1436}
1437
1438GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
1439{
1440 GCM128_CONTEXT *ret;
1441
1442 if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT))))
1443 CRYPTO_gcm128_init(ret,key,block);
1444
1445 return ret;
1446}
1447
1448void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1449{
1450 if (ctx) {
1451 OPENSSL_cleanse(ctx,sizeof(*ctx));
1452 OPENSSL_free(ctx);
1453 }
1454}
1455
1456#if defined(SELFTEST)
1457#include <stdio.h>
1458#include <openssl/aes.h>
1459
1460/* Test Case 1 */
1461static const u8 K1[16],
1462 *P1=NULL,
1463 *A1=NULL,
1464 IV1[12],
1465 *C1=NULL,
1466 T1[]= {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};
1467
1468/* Test Case 2 */
1469#define K2 K1
1470#define A2 A1
1471#define IV2 IV1
1472static const u8 P2[16],
1473 C2[]= {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
1474 T2[]= {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};
1475
1476/* Test Case 3 */
1477#define A3 A2
1478static const u8 K3[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1479 P3[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1480 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1481 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1482 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1483 IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1484 C3[]= {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1485 0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1486 0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1487 0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
1488 T3[]= {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4};
1489
1490/* Test Case 4 */
1491#define K4 K3
1492#define IV4 IV3
1493static const u8 P4[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1494 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1495 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1496 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1497 A4[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1498 0xab,0xad,0xda,0xd2},
1499 C4[]= {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1500 0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1501 0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1502 0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
1503 T4[]= {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};
1504
1505/* Test Case 5 */
1506#define K5 K4
1507#define P5 P4
1508#define A5 A4
1509static const u8 IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1510 C5[]= {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
1511 0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
1512 0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
1513 0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
1514 T5[]= {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};
1515
1516/* Test Case 6 */
1517#define K6 K5
1518#define P6 P5
1519#define A6 A5
1520static const u8 IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1521 0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1522 0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1523 0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1524 C6[]= {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
1525 0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
1526 0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
1527 0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
1528 T6[]= {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};
1529
1530/* Test Case 7 */
1531static const u8 K7[24],
1532 *P7=NULL,
1533 *A7=NULL,
1534 IV7[12],
1535 *C7=NULL,
1536 T7[]= {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};
1537
1538/* Test Case 8 */
1539#define K8 K7
1540#define IV8 IV7
1541#define A8 A7
1542static const u8 P8[16],
1543 C8[]= {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
1544 T8[]= {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};
1545
1546/* Test Case 9 */
1547#define A9 A8
1548static const u8 K9[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1549 0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
1550 P9[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1551 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1552 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1553 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1554 IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1555 C9[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1556 0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1557 0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1558 0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
1559 T9[]= {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};
1560
1561/* Test Case 10 */
1562#define K10 K9
1563#define IV10 IV9
1564static const u8 P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1565 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1566 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1567 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1568 A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1569 0xab,0xad,0xda,0xd2},
1570 C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1571 0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1572 0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1573 0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
1574 T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};
1575
1576/* Test Case 11 */
1577#define K11 K10
1578#define P11 P10
1579#define A11 A10
1580static const u8 IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1581 C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
1582 0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
1583 0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
1584 0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
1585 T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};
1586
1587/* Test Case 12 */
1588#define K12 K11
1589#define P12 P11
1590#define A12 A11
1591static const u8 IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1592 0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1593 0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1594 0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1595 C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
1596 0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
1597 0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
1598 0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
1599 T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};
1600
1601/* Test Case 13 */
1602static const u8 K13[32],
1603 *P13=NULL,
1604 *A13=NULL,
1605 IV13[12],
1606 *C13=NULL,
1607 T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};
1608
1609/* Test Case 14 */
1610#define K14 K13
1611#define A14 A13
1612static const u8 P14[16],
1613 IV14[12],
1614 C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
1615 T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};
1616
1617/* Test Case 15 */
1618#define A15 A14
1619static const u8 K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1620 0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1621 P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1622 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1623 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1624 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1625 IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1626 C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1627 0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1628 0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1629 0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
1630 T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};
1631
1632/* Test Case 16 */
1633#define K16 K15
1634#define IV16 IV15
1635static const u8 P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1636 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1637 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1638 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1639 A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1640 0xab,0xad,0xda,0xd2},
1641 C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1642 0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1643 0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1644 0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
1645 T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};
1646
1647/* Test Case 17 */
1648#define K17 K16
1649#define P17 P16
1650#define A17 A16
1651static const u8 IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1652 C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
1653 0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
1654 0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
1655 0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
1656 T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};
1657
1658/* Test Case 18 */
1659#define K18 K17
1660#define P18 P17
1661#define A18 A17
1662static const u8 IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1663 0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1664 0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1665 0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1666 C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
1667 0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
1668 0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
1669 0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
1670 T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};
1671
1672#define TEST_CASE(n) do { \
1673 u8 out[sizeof(P##n)]; \
1674 AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key); \
1675 CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt); \
1676 CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n)); \
1677 memset(out,0,sizeof(out)); \
1678 if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n)); \
1679 if (P##n) CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out)); \
1680 if (CRYPTO_gcm128_finish(&ctx,T##n,16) || \
1681 (C##n && memcmp(out,C##n,sizeof(out)))) \
1682 ret++, printf ("encrypt test#%d failed.\n",n); \
1683 CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n)); \
1684 memset(out,0,sizeof(out)); \
1685 if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n)); \
1686 if (C##n) CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out)); \
1687 if (CRYPTO_gcm128_finish(&ctx,T##n,16) || \
1688 (P##n && memcmp(out,P##n,sizeof(out)))) \
1689 ret++, printf ("decrypt test#%d failed.\n",n); \
1690 } while(0)
1691
1692int main()
1693{
1694 GCM128_CONTEXT ctx;
1695 AES_KEY key;
1696 int ret=0;
1697
1698 TEST_CASE(1);
1699 TEST_CASE(2);
1700 TEST_CASE(3);
1701 TEST_CASE(4);
1702 TEST_CASE(5);
1703 TEST_CASE(6);
1704 TEST_CASE(7);
1705 TEST_CASE(8);
1706 TEST_CASE(9);
1707 TEST_CASE(10);
1708 TEST_CASE(11);
1709 TEST_CASE(12);
1710 TEST_CASE(13);
1711 TEST_CASE(14);
1712 TEST_CASE(15);
1713 TEST_CASE(16);
1714 TEST_CASE(17);
1715 TEST_CASE(18);
1716
1717#ifdef OPENSSL_CPUID_OBJ
1718 {
1719 size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc();
1720 union { u64 u; u8 c[1024]; } buf;
1721 int i;
1722
1723 AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
1724 CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
1725 CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1));
1726
1727 CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1728 start = OPENSSL_rdtsc();
1729 CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1730 gcm_t = OPENSSL_rdtsc() - start;
1731
1732 CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1733 &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1734 (block128_f)AES_encrypt);
1735 start = OPENSSL_rdtsc();
1736 CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1737 &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1738 (block128_f)AES_encrypt);
1739 ctr_t = OPENSSL_rdtsc() - start;
1740
1741 printf("%.2f-%.2f=%.2f\n",
1742 gcm_t/(double)sizeof(buf),
1743 ctr_t/(double)sizeof(buf),
1744 (gcm_t-ctr_t)/(double)sizeof(buf));
1745#ifdef GHASH
1746 GHASH(&ctx,buf.c,sizeof(buf));
1747 start = OPENSSL_rdtsc();
1748 for (i=0;i<100;++i) GHASH(&ctx,buf.c,sizeof(buf));
1749 gcm_t = OPENSSL_rdtsc() - start;
1750 printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i);
1751#endif
1752 }
1753#endif
1754
1755 return ret;
1756}
1757#endif
diff --git a/src/lib/libcrypto/modes/modes.h b/src/lib/libcrypto/modes/modes.h
index af8d97d795..f18215bb2b 100644
--- a/src/lib/libcrypto/modes/modes.h
+++ b/src/lib/libcrypto/modes/modes.h
@@ -15,6 +15,14 @@ typedef void (*cbc128_f)(const unsigned char *in, unsigned char *out,
15 15			size_t len, const void *key,
16 16			unsigned char ivec[16], int enc);
17 17
18typedef void (*ctr128_f)(const unsigned char *in, unsigned char *out,
19 size_t blocks, const void *key,
20 const unsigned char ivec[16]);
21
22typedef void (*ccm128_f)(const unsigned char *in, unsigned char *out,
23 size_t blocks, const void *key,
24 const unsigned char ivec[16],unsigned char cmac[16]);
25
18 26void CRYPTO_cbc128_encrypt(const unsigned char *in, unsigned char *out,
19 27			size_t len, const void *key,
20 28			unsigned char ivec[16], block128_f block);
@@ -27,6 +35,11 @@ void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out,
27 35			unsigned char ivec[16], unsigned char ecount_buf[16],
28 36			unsigned int *num, block128_f block);
29 37
38void CRYPTO_ctr128_encrypt_ctr32(const unsigned char *in, unsigned char *out,
39 size_t len, const void *key,
40 unsigned char ivec[16], unsigned char ecount_buf[16],
41 unsigned int *num, ctr128_f ctr);
42
30 43void CRYPTO_ofb128_encrypt(const unsigned char *in, unsigned char *out,
31 44			size_t len, const void *key,
32 45			unsigned char ivec[16], int *num,
@@ -57,3 +70,66 @@ size_t CRYPTO_cts128_decrypt_block(const unsigned char *in, unsigned char *out,
57 70size_t CRYPTO_cts128_decrypt(const unsigned char *in, unsigned char *out,
58 71			size_t len, const void *key,
59 72			unsigned char ivec[16], cbc128_f cbc);
73
74size_t CRYPTO_nistcts128_encrypt_block(const unsigned char *in, unsigned char *out,
75 size_t len, const void *key,
76 unsigned char ivec[16], block128_f block);
77size_t CRYPTO_nistcts128_encrypt(const unsigned char *in, unsigned char *out,
78 size_t len, const void *key,
79 unsigned char ivec[16], cbc128_f cbc);
80size_t CRYPTO_nistcts128_decrypt_block(const unsigned char *in, unsigned char *out,
81 size_t len, const void *key,
82 unsigned char ivec[16], block128_f block);
83size_t CRYPTO_nistcts128_decrypt(const unsigned char *in, unsigned char *out,
84 size_t len, const void *key,
85 unsigned char ivec[16], cbc128_f cbc);
86
87typedef struct gcm128_context GCM128_CONTEXT;
88
89GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block);
90void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block);
91void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const unsigned char *iv,
92 size_t len);
93int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const unsigned char *aad,
94 size_t len);
95int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
96 const unsigned char *in, unsigned char *out,
97 size_t len);
98int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
99 const unsigned char *in, unsigned char *out,
100 size_t len);
101int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
102 const unsigned char *in, unsigned char *out,
103 size_t len, ctr128_f stream);
104int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
105 const unsigned char *in, unsigned char *out,
106 size_t len, ctr128_f stream);
107int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
108 size_t len);
109void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len);
110void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx);
111
112typedef struct ccm128_context CCM128_CONTEXT;
113
114void CRYPTO_ccm128_init(CCM128_CONTEXT *ctx,
115 unsigned int M, unsigned int L, void *key,block128_f block);
116int CRYPTO_ccm128_setiv(CCM128_CONTEXT *ctx,
117 const unsigned char *nonce, size_t nlen, size_t mlen);
118void CRYPTO_ccm128_aad(CCM128_CONTEXT *ctx,
119 const unsigned char *aad, size_t alen);
120int CRYPTO_ccm128_encrypt(CCM128_CONTEXT *ctx,
121 const unsigned char *inp, unsigned char *out, size_t len);
122int CRYPTO_ccm128_decrypt(CCM128_CONTEXT *ctx,
123 const unsigned char *inp, unsigned char *out, size_t len);
124int CRYPTO_ccm128_encrypt_ccm64(CCM128_CONTEXT *ctx,
125 const unsigned char *inp, unsigned char *out, size_t len,
126 ccm128_f stream);
127int CRYPTO_ccm128_decrypt_ccm64(CCM128_CONTEXT *ctx,
128 const unsigned char *inp, unsigned char *out, size_t len,
129 ccm128_f stream);
130size_t CRYPTO_ccm128_tag(CCM128_CONTEXT *ctx, unsigned char *tag, size_t len);
131
132typedef struct xts128_context XTS128_CONTEXT;
133
134int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx, const unsigned char iv[16],
135 const unsigned char *inp, unsigned char *out, size_t len, int enc);
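
The declarations above expose a block-cipher-agnostic one-shot interface: the caller supplies the raw block function and its key schedule, and optionally a counter (ctr128_f) or CCM (ccm128_f) stream routine. A minimal GCM sketch under that assumption, reusing the (block128_f)AES_encrypt cast seen in the gcm128.c self-test; gcm_seal_example() and the 96-bit IV length are this example's choices, not part of the header:

#include <openssl/aes.h>
#include <openssl/modes.h>

/* Hedged sketch: authenticated encryption with the GCM interface declared
 * above, given an AES key schedule prepared with AES_set_encrypt_key().
 * Error handling is minimal. */
static int
gcm_seal_example(const AES_KEY *aes, const unsigned char iv[12],
    const unsigned char *aad, size_t alen,
    const unsigned char *in, unsigned char *out, size_t len,
    unsigned char tag[16])
{
	GCM128_CONTEXT *gcm;

	gcm = CRYPTO_gcm128_new((void *)aes, (block128_f)AES_encrypt);
	if (gcm == NULL)
		return -1;
	CRYPTO_gcm128_setiv(gcm, iv, 12);		/* 96-bit IV */
	if (CRYPTO_gcm128_aad(gcm, aad, alen) ||	/* header, authenticated only */
	    CRYPTO_gcm128_encrypt(gcm, in, out, len)) {	/* encrypt and hash payload */
		CRYPTO_gcm128_release(gcm);
		return -1;
	}
	CRYPTO_gcm128_tag(gcm, tag, 16);		/* emit 128-bit tag */
	CRYPTO_gcm128_release(gcm);
	return 0;
}

Decryption follows the same sequence with CRYPTO_gcm128_decrypt(), after which CRYPTO_gcm128_finish() returns nonzero if the received tag does not match.
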
diff --git a/src/lib/libcrypto/modes/modes_lcl.h b/src/lib/libcrypto/modes/modes_lcl.h
new file mode 100644
index 0000000000..b6dc3c336f
--- /dev/null
+++ b/src/lib/libcrypto/modes/modes_lcl.h
@@ -0,0 +1,131 @@
1/* ====================================================================
2 * Copyright (c) 2010 The OpenSSL Project. All rights reserved.
3 *
4 * Redistribution and use is governed by OpenSSL license.
5 * ====================================================================
6 */
7
8#include <openssl/modes.h>
9
10
11#if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__)
12typedef __int64 i64;
13typedef unsigned __int64 u64;
14#define U64(C) C##UI64
15#elif defined(__arch64__)
16typedef long i64;
17typedef unsigned long u64;
18#define U64(C) C##UL
19#else
20typedef long long i64;
21typedef unsigned long long u64;
22#define U64(C) C##ULL
23#endif
24
25typedef unsigned int u32;
26typedef unsigned char u8;
27
28#define STRICT_ALIGNMENT 1
29#if defined(__i386) || defined(__i386__) || \
30 defined(__x86_64) || defined(__x86_64__) || \
31 defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \
32 defined(__s390__) || defined(__s390x__) || \
33 ( (defined(__arm__) || defined(__arm)) && \
34 (defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \
35 defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__)) )
36# undef STRICT_ALIGNMENT
37#endif
38
39#if !defined(PEDANTIC) && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM)
40#if defined(__GNUC__) && __GNUC__>=2
41# if defined(__x86_64) || defined(__x86_64__)
42# define BSWAP8(x) ({ u64 ret=(x); \
43 asm ("bswapq %0" \
44 : "+r"(ret)); ret; })
45# define BSWAP4(x) ({ u32 ret=(x); \
46 asm ("bswapl %0" \
47 : "+r"(ret)); ret; })
48# elif (defined(__i386) || defined(__i386__)) && !defined(I386_ONLY)
49# define BSWAP8(x) ({ u32 lo=(u64)(x)>>32,hi=(x); \
50 asm ("bswapl %0; bswapl %1" \
51 : "+r"(hi),"+r"(lo)); \
52 (u64)hi<<32|lo; })
53# define BSWAP4(x) ({ u32 ret=(x); \
54 asm ("bswapl %0" \
55 : "+r"(ret)); ret; })
56# elif (defined(__arm__) || defined(__arm)) && !defined(STRICT_ALIGNMENT)
57# define BSWAP8(x) ({ u32 lo=(u64)(x)>>32,hi=(x); \
58 asm ("rev %0,%0; rev %1,%1" \
59 : "+r"(hi),"+r"(lo)); \
60 (u64)hi<<32|lo; })
61# define BSWAP4(x) ({ u32 ret; \
62 asm ("rev %0,%1" \
63 : "=r"(ret) : "r"((u32)(x))); \
64 ret; })
65# endif
66#elif defined(_MSC_VER)
67# if _MSC_VER>=1300
68# pragma intrinsic(_byteswap_uint64,_byteswap_ulong)
69# define BSWAP8(x) _byteswap_uint64((u64)(x))
70# define BSWAP4(x) _byteswap_ulong((u32)(x))
71# elif defined(_M_IX86)
72 __inline u32 _bswap4(u32 val) {
73 _asm mov eax,val
74 _asm bswap eax
75 }
76# define BSWAP4(x) _bswap4(x)
77# endif
78#endif
79#endif
80
81#if defined(BSWAP4) && !defined(STRICT_ALIGNMENT)
82#define GETU32(p) BSWAP4(*(const u32 *)(p))
83#define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v)
84#else
85#define GETU32(p) ((u32)(p)[0]<<24|(u32)(p)[1]<<16|(u32)(p)[2]<<8|(u32)(p)[3])
86#define PUTU32(p,v) ((p)[0]=(u8)((v)>>24),(p)[1]=(u8)((v)>>16),(p)[2]=(u8)((v)>>8),(p)[3]=(u8)(v))
87#endif
88
89/* GCM definitions */
90
91typedef struct { u64 hi,lo; } u128;
92
93#ifdef TABLE_BITS
94#undef TABLE_BITS
95#endif
96/*
97 * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
98 * never be set to 8 [or 1]. For further information see gcm128.c.
99 */
100#define TABLE_BITS 4
101
102struct gcm128_context {
103 /* Following 6 names follow names in GCM specification */
104 union { u64 u[2]; u32 d[4]; u8 c[16]; } Yi,EKi,EK0,len,
105 Xi,H;
106 /* Relative position of Xi, H and pre-computed Htable is used
107 * in some assembler modules, i.e. don't change the order! */
108#if TABLE_BITS==8
109 u128 Htable[256];
110#else
111 u128 Htable[16];
112 void (*gmult)(u64 Xi[2],const u128 Htable[16]);
113 void (*ghash)(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
114#endif
115 unsigned int mres, ares;
116 block128_f block;
117 void *key;
118};
119
120struct xts128_context {
121 void *key1, *key2;
122 block128_f block1,block2;
123};
124
125struct ccm128_context {
126 union { u64 u[2]; u8 c[16]; } nonce, cmac;
127 u64 blocks;
128 block128_f block;
129 void *key;
130};
131
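
GETU32/PUTU32 above give every mode a big-endian view of byte strings: a byte-swapped word access where BSWAP4 exists and alignment allows it, and a byte-by-byte fallback everywhere else. A small standalone check of the fallback, with the two macros copied verbatim from the header (main() and the test value are only for illustration):

#include <stdio.h>

typedef unsigned int u32;
typedef unsigned char u8;

/* Portable fallback copied from modes_lcl.h: read/write the four bytes at p
 * as a big-endian 32-bit value, independent of host byte order. */
#define GETU32(p) ((u32)(p)[0]<<24|(u32)(p)[1]<<16|(u32)(p)[2]<<8|(u32)(p)[3])
#define PUTU32(p,v) ((p)[0]=(u8)((v)>>24),(p)[1]=(u8)((v)>>16),(p)[2]=(u8)((v)>>8),(p)[3]=(u8)(v))

int
main(void)
{
	u8 buf[4];

	PUTU32(buf, 0x01020304);	/* stores 01 02 03 04, MSB first */
	printf("%02x %02x %02x %02x -> %08x\n",
	    buf[0], buf[1], buf[2], buf[3], GETU32(buf));
	return 0;
}
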
diff --git a/src/lib/libcrypto/modes/ofb128.c b/src/lib/libcrypto/modes/ofb128.c
index c732e2ec58..01c01702c4 100644
--- a/src/lib/libcrypto/modes/ofb128.c
+++ b/src/lib/libcrypto/modes/ofb128.c
@@ -48,7 +48,8 @@
48 48 *
49 49 */
50 50
51#include "modes.h"
51#include <openssl/crypto.h>
52#include "modes_lcl.h"
52 53 #include <string.h>
53 54
54 55 #ifndef MODES_DEBUG
@@ -58,14 +59,6 @@
58 59 #endif
59 60 #include <assert.h>
60 61
61#define STRICT_ALIGNMENT
62#if defined(__i386) || defined(__i386__) || \
63 defined(__x86_64) || defined(__x86_64__) || \
64 defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \
65 defined(__s390__) || defined(__s390x__)
66# undef STRICT_ALIGNMENT
67#endif
68
69 62 /* The input and output encrypted as though 128bit ofb mode is being
70 63  * used. The extra state information to record how much of the
71 64  * 128bit block we have used is contained in *num;
diff --git a/src/lib/libcrypto/modes/xts128.c b/src/lib/libcrypto/modes/xts128.c
new file mode 100644
index 0000000000..9cf27a25e9
--- /dev/null
+++ b/src/lib/libcrypto/modes/xts128.c
@@ -0,0 +1,187 @@
1/* ====================================================================
2 * Copyright (c) 2011 The OpenSSL Project. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in
13 * the documentation and/or other materials provided with the
14 * distribution.
15 *
16 * 3. All advertising materials mentioning features or use of this
17 * software must display the following acknowledgment:
18 * "This product includes software developed by the OpenSSL Project
19 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
20 *
21 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
22 * endorse or promote products derived from this software without
23 * prior written permission. For written permission, please contact
24 * openssl-core@openssl.org.
25 *
26 * 5. Products derived from this software may not be called "OpenSSL"
27 * nor may "OpenSSL" appear in their names without prior written
28 * permission of the OpenSSL Project.
29 *
30 * 6. Redistributions of any form whatsoever must retain the following
31 * acknowledgment:
32 * "This product includes software developed by the OpenSSL Project
33 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
34 *
35 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
36 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
39 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
41 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
42 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
43 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
44 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
45 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
46 * OF THE POSSIBILITY OF SUCH DAMAGE.
47 * ====================================================================
48 */
49
50#include <openssl/crypto.h>
51#include "modes_lcl.h"
52#include <string.h>
53
54#ifndef MODES_DEBUG
55# ifndef NDEBUG
56# define NDEBUG
57# endif
58#endif
59#include <assert.h>
60
61int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx, const unsigned char iv[16],
62 const unsigned char *inp, unsigned char *out,
63 size_t len, int enc)
64{
65 const union { long one; char little; } is_endian = {1};
66 union { u64 u[2]; u32 d[4]; u8 c[16]; } tweak, scratch;
67 unsigned int i;
68
69 if (len<16) return -1;
70
71 memcpy(tweak.c, iv, 16);
72
73 (*ctx->block2)(tweak.c,tweak.c,ctx->key2);
74
75 if (!enc && (len%16)) len-=16;
76
77 while (len>=16) {
78#if defined(STRICT_ALIGNMENT)
79 memcpy(scratch.c,inp,16);
80 scratch.u[0] ^= tweak.u[0];
81 scratch.u[1] ^= tweak.u[1];
82#else
83 scratch.u[0] = ((u64*)inp)[0]^tweak.u[0];
84 scratch.u[1] = ((u64*)inp)[1]^tweak.u[1];
85#endif
86 (*ctx->block1)(scratch.c,scratch.c,ctx->key1);
87#if defined(STRICT_ALIGNMENT)
88 scratch.u[0] ^= tweak.u[0];
89 scratch.u[1] ^= tweak.u[1];
90 memcpy(out,scratch.c,16);
91#else
92 ((u64*)out)[0] = scratch.u[0]^=tweak.u[0];
93 ((u64*)out)[1] = scratch.u[1]^=tweak.u[1];
94#endif
95 inp += 16;
96 out += 16;
97 len -= 16;
98
99 if (len==0) return 0;
100
101 if (is_endian.little) {
102 unsigned int carry,res;
103
104 res = 0x87&(((int)tweak.d[3])>>31);
105 carry = (unsigned int)(tweak.u[0]>>63);
106 tweak.u[0] = (tweak.u[0]<<1)^res;
107 tweak.u[1] = (tweak.u[1]<<1)|carry;
108 }
109 else {
110 size_t c;
111
112 for (c=0,i=0;i<16;++i) {
113 /*+ substitutes for |, because c is 1 bit */
114 c += ((size_t)tweak.c[i])<<1;
115 tweak.c[i] = (u8)c;
116 c = c>>8;
117 }
118 tweak.c[0] ^= (u8)(0x87&(0-c));
119 }
120 }
121 if (enc) {
122 for (i=0;i<len;++i) {
123 u8 c = inp[i];
124 out[i] = scratch.c[i];
125 scratch.c[i] = c;
126 }
127 scratch.u[0] ^= tweak.u[0];
128 scratch.u[1] ^= tweak.u[1];
129 (*ctx->block1)(scratch.c,scratch.c,ctx->key1);
130 scratch.u[0] ^= tweak.u[0];
131 scratch.u[1] ^= tweak.u[1];
132 memcpy(out-16,scratch.c,16);
133 }
134 else {
135 union { u64 u[2]; u8 c[16]; } tweak1;
136
137 if (is_endian.little) {
138 unsigned int carry,res;
139
140 res = 0x87&(((int)tweak.d[3])>>31);
141 carry = (unsigned int)(tweak.u[0]>>63);
142 tweak1.u[0] = (tweak.u[0]<<1)^res;
143 tweak1.u[1] = (tweak.u[1]<<1)|carry;
144 }
145 else {
146 size_t c;
147
148 for (c=0,i=0;i<16;++i) {
149 /*+ substitutes for |, because c is 1 bit */
150 c += ((size_t)tweak.c[i])<<1;
151 tweak1.c[i] = (u8)c;
152 c = c>>8;
153 }
154 tweak1.c[0] ^= (u8)(0x87&(0-c));
155 }
156#if defined(STRICT_ALIGNMENT)
157 memcpy(scratch.c,inp,16);
158 scratch.u[0] ^= tweak1.u[0];
159 scratch.u[1] ^= tweak1.u[1];
160#else
161 scratch.u[0] = ((u64*)inp)[0]^tweak1.u[0];
162 scratch.u[1] = ((u64*)inp)[1]^tweak1.u[1];
163#endif
164 (*ctx->block1)(scratch.c,scratch.c,ctx->key1);
165 scratch.u[0] ^= tweak1.u[0];
166 scratch.u[1] ^= tweak1.u[1];
167
168 for (i=0;i<len;++i) {
169 u8 c = inp[16+i];
170 out[16+i] = scratch.c[i];
171 scratch.c[i] = c;
172 }
173 scratch.u[0] ^= tweak.u[0];
174 scratch.u[1] ^= tweak.u[1];
175 (*ctx->block1)(scratch.c,scratch.c,ctx->key1);
176#if defined(STRICT_ALIGNMENT)
177 scratch.u[0] ^= tweak.u[0];
178 scratch.u[1] ^= tweak.u[1];
179 memcpy (out,scratch.c,16);
180#else
181 ((u64*)out)[0] = scratch.u[0]^tweak.u[0];
182 ((u64*)out)[1] = scratch.u[1]^tweak.u[1];
183#endif
184 }
185
186 return 0;
187}
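
Between blocks the code above multiplies the tweak by x in GF(2^128): a 128-bit left shift, with the field polynomial x^128 + x^7 + x^2 + x + 1 folded back in as the constant 0x87 whenever the top bit is shifted out. A sketch of the little-endian branch as a standalone helper, for illustration only; the function name and use of uint64_t are this sketch's assumptions, and t[0]/t[1] are taken to hold the low and high eight tweak bytes as they would on a little-endian host:

#include <stdint.h>

/* Hedged sketch of the little-endian tweak update in CRYPTO_xts128_encrypt():
 * multiply the 128-bit tweak by x, reducing modulo x^128 + x^7 + x^2 + x + 1. */
static void
xts_tweak_times_x_le(uint64_t t[2])
{
	uint64_t res = 0x87 & (0 - (t[1] >> 63));	/* 0x87 if bit 127 was set */
	uint64_t carry = t[0] >> 63;			/* bit moving into the high word */

	t[0] = (t[0] << 1) ^ res;
	t[1] = (t[1] << 1) | carry;
}

The big-endian branch in the source achieves the same result byte by byte, propagating a carry through the 16 tweak bytes and XORing 0x87 into tweak.c[0] when the final carry is set.
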