path: root/src/lib/libcrypto/modes
Diffstat (limited to 'src/lib/libcrypto/modes')
-rw-r--r--  src/lib/libcrypto/modes/asm/ghash-alpha.pl     455
-rw-r--r--  src/lib/libcrypto/modes/asm/ghash-armv4.pl     429
-rwxr-xr-x  src/lib/libcrypto/modes/asm/ghash-ia64.pl      463
-rw-r--r--  src/lib/libcrypto/modes/asm/ghash-parisc.pl    741
-rw-r--r--  src/lib/libcrypto/modes/asm/ghash-s390x.pl     262
-rw-r--r--  src/lib/libcrypto/modes/asm/ghash-sparcv9.pl   330
-rw-r--r--  src/lib/libcrypto/modes/asm/ghash-x86.pl      1342
-rw-r--r--  src/lib/libcrypto/modes/asm/ghash-x86_64.pl    806
-rw-r--r--  src/lib/libcrypto/modes/cbc128.c               202
-rw-r--r--  src/lib/libcrypto/modes/ccm128.c               441
-rw-r--r--  src/lib/libcrypto/modes/cfb128.c               234
-rw-r--r--  src/lib/libcrypto/modes/ctr128.c               252
-rw-r--r--  src/lib/libcrypto/modes/cts128.c               267
-rw-r--r--  src/lib/libcrypto/modes/gcm128.c              1539
-rw-r--r--  src/lib/libcrypto/modes/modes.h                136
-rw-r--r--  src/lib/libcrypto/modes/modes_lcl.h            108
-rw-r--r--  src/lib/libcrypto/modes/ofb128.c               119
-rw-r--r--  src/lib/libcrypto/modes/xts128.c               187
18 files changed, 0 insertions, 8313 deletions
diff --git a/src/lib/libcrypto/modes/asm/ghash-alpha.pl b/src/lib/libcrypto/modes/asm/ghash-alpha.pl
deleted file mode 100644
index b6d6ea5a62..0000000000
--- a/src/lib/libcrypto/modes/asm/ghash-alpha.pl
+++ /dev/null
@@ -1,455 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# March 2010
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that it
14# uses 256 bytes per-key table [+128 bytes shared table]. Even though
15# loops are aggressively modulo-scheduled with respect to references to
16# Htbl and Z.hi updates for 8 cycles per byte, measured performance is
17# ~12 cycles per processed byte on a 21264 CPU. It seems to be a dynamic
18# scheduling "glitch," because uprofile(1) indicates uniform sample
19# distribution, as if all instruction bundles execute in 1.5 cycles.
20# Meaning that it could have been even faster, yet 12 cycles is ~60%
21# better than gcc-generated code and ~80% better than code generated by
22# the vendor compiler.
23
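For reference, the table-driven scheme this module (and the other assembler modules below) implements mirrors the portable "4-bit" code in gcm128.c, removed by this same commit: Htable[i] holds H multiplied by the nibble value i, and each input byte costs two table lookups plus a 4-bit shift-and-reduce step driven by the rem_4bit constants seen further down. A minimal C sketch, with illustrative names and layout (u128, gmult_4bit) rather than the exact gcm128.c declarations:

#include <stdint.h>

typedef struct { uint64_t hi, lo; } u128;   /* illustrative 128-bit pair */

/* rem_4bit[i] folds the 4 bits shifted out of Z back into the field;
 * these are the same constants as in the rem_4bit tables emitted below. */
static const uint64_t rem_4bit[16] = {
    0x0000ULL << 48, 0x1C20ULL << 48, 0x3840ULL << 48, 0x2460ULL << 48,
    0x7080ULL << 48, 0x6CA0ULL << 48, 0x48C0ULL << 48, 0x54E0ULL << 48,
    0xE100ULL << 48, 0xFD20ULL << 48, 0xD940ULL << 48, 0xC560ULL << 48,
    0x9180ULL << 48, 0x8DA0ULL << 48, 0xA9C0ULL << 48, 0xB5E0ULL << 48
};

/* Xi is the 16-byte GHASH accumulator (big-endian bytes); Htable[i] holds
 * H multiplied by the 4-bit value i, as prepared by the table setup in
 * gcm128.c. */
static void gmult_4bit(uint8_t Xi[16], const u128 Htable[16])
{
    u128 Z;
    unsigned int nlo, nhi, rem;
    int cnt = 15;

    nlo = Xi[15];
    nhi = nlo >> 4;
    nlo &= 0x0f;
    Z = Htable[nlo];                            /* start with H * (low nibble) */

    for (;;) {
        rem  = (unsigned int)Z.lo & 0x0f;
        Z.lo = (Z.hi << 60) | (Z.lo >> 4);      /* Z *= x^4 ...           */
        Z.hi = (Z.hi >> 4) ^ rem_4bit[rem];     /* ... with reduction     */
        Z.hi ^= Htable[nhi].hi;                 /* add H * (high nibble)  */
        Z.lo ^= Htable[nhi].lo;

        if (--cnt < 0)
            break;

        nlo = Xi[cnt];
        nhi = nlo >> 4;
        nlo &= 0x0f;

        rem  = (unsigned int)Z.lo & 0x0f;
        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
        Z.hi = (Z.hi >> 4) ^ rem_4bit[rem];
        Z.hi ^= Htable[nlo].hi;                 /* add H * (low nibble)   */
        Z.lo ^= Htable[nlo].lo;
    }

    for (cnt = 0; cnt < 8; cnt++) {             /* store Z back big-endian */
        Xi[7 - cnt]  = (uint8_t)(Z.hi >> (8 * cnt));
        Xi[15 - cnt] = (uint8_t)(Z.lo >> (8 * cnt));
    }
}

The assembly below is essentially this loop, modulo-scheduled, with the final byte swap done with native instructions (extbl/zapnot on Alpha).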
24$cnt="v0"; # $0
25$t0="t0";
26$t1="t1";
27$t2="t2";
28$Thi0="t3"; # $4
29$Tlo0="t4";
30$Thi1="t5";
31$Tlo1="t6";
32$rem="t7"; # $8
33#################
34$Xi="a0"; # $16, input argument block
35$Htbl="a1";
36$inp="a2";
37$len="a3";
38$nlo="a4"; # $20
39$nhi="a5";
40$Zhi="t8";
41$Zlo="t9";
42$Xhi="t10"; # $24
43$Xlo="t11";
44$remp="t12";
45$rem_4bit="AT"; # $28
46
47{ my $N;
48 sub loop() {
49
50 $N++;
51$code.=<<___;
52.align 4
53 extbl $Xlo,7,$nlo
54 and $nlo,0xf0,$nhi
55 sll $nlo,4,$nlo
56 and $nlo,0xf0,$nlo
57
58 addq $nlo,$Htbl,$nlo
59 ldq $Zlo,8($nlo)
60 addq $nhi,$Htbl,$nhi
61 ldq $Zhi,0($nlo)
62
63 and $Zlo,0x0f,$remp
64 sll $Zhi,60,$t0
65 lda $cnt,6(zero)
66 extbl $Xlo,6,$nlo
67
68 ldq $Tlo1,8($nhi)
69 s8addq $remp,$rem_4bit,$remp
70 ldq $Thi1,0($nhi)
71 srl $Zlo,4,$Zlo
72
73 ldq $rem,0($remp)
74 srl $Zhi,4,$Zhi
75 xor $t0,$Zlo,$Zlo
76 and $nlo,0xf0,$nhi
77
78 xor $Tlo1,$Zlo,$Zlo
79 sll $nlo,4,$nlo
80 xor $Thi1,$Zhi,$Zhi
81 and $nlo,0xf0,$nlo
82
83 addq $nlo,$Htbl,$nlo
84 ldq $Tlo0,8($nlo)
85 addq $nhi,$Htbl,$nhi
86 ldq $Thi0,0($nlo)
87
88.Looplo$N:
89 and $Zlo,0x0f,$remp
90 sll $Zhi,60,$t0
91 subq $cnt,1,$cnt
92 srl $Zlo,4,$Zlo
93
94 ldq $Tlo1,8($nhi)
95 xor $rem,$Zhi,$Zhi
96 ldq $Thi1,0($nhi)
97 s8addq $remp,$rem_4bit,$remp
98
99 ldq $rem,0($remp)
100 srl $Zhi,4,$Zhi
101 xor $t0,$Zlo,$Zlo
102 extbl $Xlo,$cnt,$nlo
103
104 and $nlo,0xf0,$nhi
105 xor $Thi0,$Zhi,$Zhi
106 xor $Tlo0,$Zlo,$Zlo
107 sll $nlo,4,$nlo
108
109
110 and $Zlo,0x0f,$remp
111 sll $Zhi,60,$t0
112 and $nlo,0xf0,$nlo
113 srl $Zlo,4,$Zlo
114
115 s8addq $remp,$rem_4bit,$remp
116 xor $rem,$Zhi,$Zhi
117 addq $nlo,$Htbl,$nlo
118 addq $nhi,$Htbl,$nhi
119
120 ldq $rem,0($remp)
121 srl $Zhi,4,$Zhi
122 ldq $Tlo0,8($nlo)
123 xor $t0,$Zlo,$Zlo
124
125 xor $Tlo1,$Zlo,$Zlo
126 xor $Thi1,$Zhi,$Zhi
127 ldq $Thi0,0($nlo)
128 bne $cnt,.Looplo$N
129
130
131 and $Zlo,0x0f,$remp
132 sll $Zhi,60,$t0
133 lda $cnt,7(zero)
134 srl $Zlo,4,$Zlo
135
136 ldq $Tlo1,8($nhi)
137 xor $rem,$Zhi,$Zhi
138 ldq $Thi1,0($nhi)
139 s8addq $remp,$rem_4bit,$remp
140
141 ldq $rem,0($remp)
142 srl $Zhi,4,$Zhi
143 xor $t0,$Zlo,$Zlo
144 extbl $Xhi,$cnt,$nlo
145
146 and $nlo,0xf0,$nhi
147 xor $Thi0,$Zhi,$Zhi
148 xor $Tlo0,$Zlo,$Zlo
149 sll $nlo,4,$nlo
150
151 and $Zlo,0x0f,$remp
152 sll $Zhi,60,$t0
153 and $nlo,0xf0,$nlo
154 srl $Zlo,4,$Zlo
155
156 s8addq $remp,$rem_4bit,$remp
157 xor $rem,$Zhi,$Zhi
158 addq $nlo,$Htbl,$nlo
159 addq $nhi,$Htbl,$nhi
160
161 ldq $rem,0($remp)
162 srl $Zhi,4,$Zhi
163 ldq $Tlo0,8($nlo)
164 xor $t0,$Zlo,$Zlo
165
166 xor $Tlo1,$Zlo,$Zlo
167 xor $Thi1,$Zhi,$Zhi
168 ldq $Thi0,0($nlo)
169 unop
170
171
172.Loophi$N:
173 and $Zlo,0x0f,$remp
174 sll $Zhi,60,$t0
175 subq $cnt,1,$cnt
176 srl $Zlo,4,$Zlo
177
178 ldq $Tlo1,8($nhi)
179 xor $rem,$Zhi,$Zhi
180 ldq $Thi1,0($nhi)
181 s8addq $remp,$rem_4bit,$remp
182
183 ldq $rem,0($remp)
184 srl $Zhi,4,$Zhi
185 xor $t0,$Zlo,$Zlo
186 extbl $Xhi,$cnt,$nlo
187
188 and $nlo,0xf0,$nhi
189 xor $Thi0,$Zhi,$Zhi
190 xor $Tlo0,$Zlo,$Zlo
191 sll $nlo,4,$nlo
192
193
194 and $Zlo,0x0f,$remp
195 sll $Zhi,60,$t0
196 and $nlo,0xf0,$nlo
197 srl $Zlo,4,$Zlo
198
199 s8addq $remp,$rem_4bit,$remp
200 xor $rem,$Zhi,$Zhi
201 addq $nlo,$Htbl,$nlo
202 addq $nhi,$Htbl,$nhi
203
204 ldq $rem,0($remp)
205 srl $Zhi,4,$Zhi
206 ldq $Tlo0,8($nlo)
207 xor $t0,$Zlo,$Zlo
208
209 xor $Tlo1,$Zlo,$Zlo
210 xor $Thi1,$Zhi,$Zhi
211 ldq $Thi0,0($nlo)
212 bne $cnt,.Loophi$N
213
214
215 and $Zlo,0x0f,$remp
216 sll $Zhi,60,$t0
217 srl $Zlo,4,$Zlo
218
219 ldq $Tlo1,8($nhi)
220 xor $rem,$Zhi,$Zhi
221 ldq $Thi1,0($nhi)
222 s8addq $remp,$rem_4bit,$remp
223
224 ldq $rem,0($remp)
225 srl $Zhi,4,$Zhi
226 xor $t0,$Zlo,$Zlo
227
228 xor $Tlo0,$Zlo,$Zlo
229 xor $Thi0,$Zhi,$Zhi
230
231 and $Zlo,0x0f,$remp
232 sll $Zhi,60,$t0
233 srl $Zlo,4,$Zlo
234
235 s8addq $remp,$rem_4bit,$remp
236 xor $rem,$Zhi,$Zhi
237
238 ldq $rem,0($remp)
239 srl $Zhi,4,$Zhi
240 xor $Tlo1,$Zlo,$Zlo
241 xor $Thi1,$Zhi,$Zhi
242 xor $t0,$Zlo,$Zlo
243 xor $rem,$Zhi,$Zhi
244___
245}}
246
247$code=<<___;
248#include <machine/asm.h>
249
250.text
251
252.set noat
253.set noreorder
254.globl gcm_gmult_4bit
255.align 4
256.ent gcm_gmult_4bit
257gcm_gmult_4bit:
258 .frame sp,0,ra
259 .prologue 0
260
261 ldq $Xlo,8($Xi)
262 ldq $Xhi,0($Xi)
263
264 bsr $t0,picmeup
265 nop
266___
267
268 &loop();
269
270$code.=<<___;
271 srl $Zlo,24,$t0 # byte swap
272 srl $Zlo,8,$t1
273
274 sll $Zlo,8,$t2
275 sll $Zlo,24,$Zlo
276 zapnot $t0,0x11,$t0
277 zapnot $t1,0x22,$t1
278
279 zapnot $Zlo,0x88,$Zlo
280 or $t0,$t1,$t0
281 zapnot $t2,0x44,$t2
282
283 or $Zlo,$t0,$Zlo
284 srl $Zhi,24,$t0
285 srl $Zhi,8,$t1
286
287 or $Zlo,$t2,$Zlo
288 sll $Zhi,8,$t2
289 sll $Zhi,24,$Zhi
290
291 srl $Zlo,32,$Xlo
292 sll $Zlo,32,$Zlo
293
294 zapnot $t0,0x11,$t0
295 zapnot $t1,0x22,$t1
296 or $Zlo,$Xlo,$Xlo
297
298 zapnot $Zhi,0x88,$Zhi
299 or $t0,$t1,$t0
300 zapnot $t2,0x44,$t2
301
302 or $Zhi,$t0,$Zhi
303 or $Zhi,$t2,$Zhi
304
305 srl $Zhi,32,$Xhi
306 sll $Zhi,32,$Zhi
307
308 or $Zhi,$Xhi,$Xhi
309 stq $Xlo,8($Xi)
310 stq $Xhi,0($Xi)
311
312 ret (ra)
313.end gcm_gmult_4bit
314___
315
316$inhi="s0";
317$inlo="s1";
318
319$code.=<<___;
320.globl gcm_ghash_4bit
321.align 4
322.ent gcm_ghash_4bit
323gcm_ghash_4bit:
324 lda sp,-32(sp)
325 stq ra,0(sp)
326 stq s0,8(sp)
327 stq s1,16(sp)
328 .mask 0x04000600,-32
329 .frame sp,32,ra
330 .prologue 0
331
332 ldq_u $inhi,0($inp)
333 ldq_u $Thi0,7($inp)
334 ldq_u $inlo,8($inp)
335 ldq_u $Tlo0,15($inp)
336 ldq $Xhi,0($Xi)
337 ldq $Xlo,8($Xi)
338
339 bsr $t0,picmeup
340 nop
341
342.Louter:
343 extql $inhi,$inp,$inhi
344 extqh $Thi0,$inp,$Thi0
345 or $inhi,$Thi0,$inhi
346 lda $inp,16($inp)
347
348 extql $inlo,$inp,$inlo
349 extqh $Tlo0,$inp,$Tlo0
350 or $inlo,$Tlo0,$inlo
351 subq $len,16,$len
352
353 xor $Xlo,$inlo,$Xlo
354 xor $Xhi,$inhi,$Xhi
355___
356
357 &loop();
358
359$code.=<<___;
360 srl $Zlo,24,$t0 # byte swap
361 srl $Zlo,8,$t1
362
363 sll $Zlo,8,$t2
364 sll $Zlo,24,$Zlo
365 zapnot $t0,0x11,$t0
366 zapnot $t1,0x22,$t1
367
368 zapnot $Zlo,0x88,$Zlo
369 or $t0,$t1,$t0
370 zapnot $t2,0x44,$t2
371
372 or $Zlo,$t0,$Zlo
373 srl $Zhi,24,$t0
374 srl $Zhi,8,$t1
375
376 or $Zlo,$t2,$Zlo
377 sll $Zhi,8,$t2
378 sll $Zhi,24,$Zhi
379
380 srl $Zlo,32,$Xlo
381 sll $Zlo,32,$Zlo
382 beq $len,.Ldone
383
384 zapnot $t0,0x11,$t0
385 zapnot $t1,0x22,$t1
386 or $Zlo,$Xlo,$Xlo
387 ldq_u $inhi,0($inp)
388
389 zapnot $Zhi,0x88,$Zhi
390 or $t0,$t1,$t0
391 zapnot $t2,0x44,$t2
392 ldq_u $Thi0,7($inp)
393
394 or $Zhi,$t0,$Zhi
395 or $Zhi,$t2,$Zhi
396 ldq_u $inlo,8($inp)
397 ldq_u $Tlo0,15($inp)
398
399 srl $Zhi,32,$Xhi
400 sll $Zhi,32,$Zhi
401
402 or $Zhi,$Xhi,$Xhi
403 br zero,.Louter
404
405.Ldone:
406 zapnot $t0,0x11,$t0
407 zapnot $t1,0x22,$t1
408 or $Zlo,$Xlo,$Xlo
409
410 zapnot $Zhi,0x88,$Zhi
411 or $t0,$t1,$t0
412 zapnot $t2,0x44,$t2
413
414 or $Zhi,$t0,$Zhi
415 or $Zhi,$t2,$Zhi
416
417 srl $Zhi,32,$Xhi
418 sll $Zhi,32,$Zhi
419
420 or $Zhi,$Xhi,$Xhi
421
422 stq $Xlo,8($Xi)
423 stq $Xhi,0($Xi)
424
425 .set noreorder
426 /*ldq ra,0(sp)*/
427 ldq s0,8(sp)
428 ldq s1,16(sp)
429 lda sp,32(sp)
430 ret (ra)
431.end gcm_ghash_4bit
432
433.align 4
434.ent picmeup
435picmeup:
436 .frame sp,0,$t0
437 .prologue 0
438 br $rem_4bit,.Lpic
439.Lpic: lda $rem_4bit,12($rem_4bit)
440 ret ($t0)
441.end picmeup
442 nop
443rem_4bit:
444 .long 0,0x0000<<16, 0,0x1C20<<16, 0,0x3840<<16, 0,0x2460<<16
445 .long 0,0x7080<<16, 0,0x6CA0<<16, 0,0x48C0<<16, 0,0x54E0<<16
446 .long 0,0xE100<<16, 0,0xFD20<<16, 0,0xD940<<16, 0,0xC560<<16
447 .long 0,0x9180<<16, 0,0x8DA0<<16, 0,0xA9C0<<16, 0,0xB5E0<<16
448.ascii "GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
449.align 4
450
451___
452$output=shift and open STDOUT,">$output";
453print $code;
454close STDOUT;
455
diff --git a/src/lib/libcrypto/modes/asm/ghash-armv4.pl b/src/lib/libcrypto/modes/asm/ghash-armv4.pl
deleted file mode 100644
index d91586ee29..0000000000
--- a/src/lib/libcrypto/modes/asm/ghash-armv4.pl
+++ /dev/null
@@ -1,429 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# April 2010
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that it
14# uses 256 bytes per-key table [+32 bytes shared table]. There is no
15# experimental performance data available yet. The only approximation
16# that can be made at this point is based on code size. The inner loop is
17# 32 instructions long and on a single-issue core should execute in <40
18# cycles. Having verified that gcc 3.4 didn't unroll the corresponding
19# loop, this assembler loop body was found to be ~3x smaller than the
20# compiler-generated one...
21#
22# July 2010
23#
24# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on
25# Cortex A8 core and ~25 cycles per processed byte (which was observed
26# to be ~3 times faster than gcc-generated code:-)
27#
28# February 2011
29#
30# Profiler-assisted and platform-specific optimization resulted in 7%
31# improvement on Cortex A8 core and ~23.5 cycles per byte.
32#
33# March 2011
34#
35# Add NEON implementation featuring polynomial multiplication, i.e. no
36# lookup tables involved. On Cortex A8 it was measured to process one
37# byte in 15 cycles or 55% faster than integer-only code.
38
39# ====================================================================
40# Note about the "528B" variant. In the ARM case it makes less sense to
41# implement it, for the following reasons:
42#
43# - performance improvement won't be anywhere near 50%, because 128-
44# bit shift operation is neatly fused with 128-bit xor here, and
45# "538B" variant would eliminate only 4-5 instructions out of 32
46# in the inner loop (meaning that estimated improvement is ~15%);
47# - ARM-based systems are often embedded ones and extra memory
48# consumption might be unappreciated (for so little improvement);
49#
50# Byte order [in]dependence. =========================================
51#
52# The caller is expected to maintain a specific *dword* order in Htable,
53# namely with the *least* significant dword of the 128-bit value at the
54# *lower* address. This differs completely from the C code and has
55# everything to do with the ldm instruction and the order in which dwords
56# are "consumed" by the algorithm. *Byte* order within these dwords is in
57# turn whatever the *native* byte order is on the current platform. See
58# gcm128.c for a working example...
59
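A small C sketch of the convention described above (the u128 pair is an illustrative type, not the declaration used by gcm128.c): each 128-bit Htable entry is stored with its low 64-bit half first, with no per-byte swapping.

#include <stdint.h>

typedef struct { uint64_t hi, lo; } u128;

/* Lay out one Htable entry the way this module's ldm-based loops expect:
 * least-significant dword at the lower address, native byte order within
 * each dword. */
static void
store_htable_entry(uint64_t dst[2], const u128 *h)
{
    dst[0] = h->lo;
    dst[1] = h->hi;
}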
60while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
61open STDOUT,">$output";
62
63$Xi="r0"; # argument block
64$Htbl="r1";
65$inp="r2";
66$len="r3";
67
68$Zll="r4"; # variables
69$Zlh="r5";
70$Zhl="r6";
71$Zhh="r7";
72$Tll="r8";
73$Tlh="r9";
74$Thl="r10";
75$Thh="r11";
76$nlo="r12";
77################# r13 is stack pointer
78$nhi="r14";
79################# r15 is program counter
80
81$rem_4bit=$inp; # used in gcm_gmult_4bit
82$cnt=$len;
83
84sub Zsmash() {
85 my $i=12;
86 my @args=@_;
87 for ($Zll,$Zlh,$Zhl,$Zhh) {
88 $code.=<<___;
89#if __ARM_ARCH__>=7 && defined(__ARMEL__)
90 rev $_,$_
91 str $_,[$Xi,#$i]
92#elif defined(__ARMEB__)
93 str $_,[$Xi,#$i]
94#else
95 mov $Tlh,$_,lsr#8
96 strb $_,[$Xi,#$i+3]
97 mov $Thl,$_,lsr#16
98 strb $Tlh,[$Xi,#$i+2]
99 mov $Thh,$_,lsr#24
100 strb $Thl,[$Xi,#$i+1]
101 strb $Thh,[$Xi,#$i]
102#endif
103___
104 $code.="\t".shift(@args)."\n";
105 $i-=4;
106 }
107}
108
109$code=<<___;
110#include "arm_arch.h"
111
112.text
113.code 32
114
115.type rem_4bit,%object
116.align 5
117rem_4bit:
118.short 0x0000,0x1C20,0x3840,0x2460
119.short 0x7080,0x6CA0,0x48C0,0x54E0
120.short 0xE100,0xFD20,0xD940,0xC560
121.short 0x9180,0x8DA0,0xA9C0,0xB5E0
122.size rem_4bit,.-rem_4bit
123
124.type rem_4bit_get,%function
125rem_4bit_get:
126 sub $rem_4bit,pc,#8
127 sub $rem_4bit,$rem_4bit,#32 @ &rem_4bit
128 b .Lrem_4bit_got
129 nop
130.size rem_4bit_get,.-rem_4bit_get
131
132.global gcm_ghash_4bit
133.type gcm_ghash_4bit,%function
134gcm_ghash_4bit:
135 sub r12,pc,#8
136 add $len,$inp,$len @ $len to point at the end
137 stmdb sp!,{r3-r11,lr} @ save $len/end too
138 sub r12,r12,#48 @ &rem_4bit
139
140 ldmia r12,{r4-r11} @ copy rem_4bit ...
141 stmdb sp!,{r4-r11} @ ... to stack
142
143 ldrb $nlo,[$inp,#15]
144 ldrb $nhi,[$Xi,#15]
145.Louter:
146 eor $nlo,$nlo,$nhi
147 and $nhi,$nlo,#0xf0
148 and $nlo,$nlo,#0x0f
149 mov $cnt,#14
150
151 add $Zhh,$Htbl,$nlo,lsl#4
152 ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo]
153 add $Thh,$Htbl,$nhi
154 ldrb $nlo,[$inp,#14]
155
156 and $nhi,$Zll,#0xf @ rem
157 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
158 add $nhi,$nhi,$nhi
159 eor $Zll,$Tll,$Zll,lsr#4
160 ldrh $Tll,[sp,$nhi] @ rem_4bit[rem]
161 eor $Zll,$Zll,$Zlh,lsl#28
162 ldrb $nhi,[$Xi,#14]
163 eor $Zlh,$Tlh,$Zlh,lsr#4
164 eor $Zlh,$Zlh,$Zhl,lsl#28
165 eor $Zhl,$Thl,$Zhl,lsr#4
166 eor $Zhl,$Zhl,$Zhh,lsl#28
167 eor $Zhh,$Thh,$Zhh,lsr#4
168 eor $nlo,$nlo,$nhi
169 and $nhi,$nlo,#0xf0
170 and $nlo,$nlo,#0x0f
171 eor $Zhh,$Zhh,$Tll,lsl#16
172
173.Linner:
174 add $Thh,$Htbl,$nlo,lsl#4
175 and $nlo,$Zll,#0xf @ rem
176 subs $cnt,$cnt,#1
177 add $nlo,$nlo,$nlo
178 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
179 eor $Zll,$Tll,$Zll,lsr#4
180 eor $Zll,$Zll,$Zlh,lsl#28
181 eor $Zlh,$Tlh,$Zlh,lsr#4
182 eor $Zlh,$Zlh,$Zhl,lsl#28
183 ldrh $Tll,[sp,$nlo] @ rem_4bit[rem]
184 eor $Zhl,$Thl,$Zhl,lsr#4
185 ldrplb $nlo,[$inp,$cnt]
186 eor $Zhl,$Zhl,$Zhh,lsl#28
187 eor $Zhh,$Thh,$Zhh,lsr#4
188
189 add $Thh,$Htbl,$nhi
190 and $nhi,$Zll,#0xf @ rem
191 eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
192 add $nhi,$nhi,$nhi
193 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
194 eor $Zll,$Tll,$Zll,lsr#4
195 ldrplb $Tll,[$Xi,$cnt]
196 eor $Zll,$Zll,$Zlh,lsl#28
197 eor $Zlh,$Tlh,$Zlh,lsr#4
198 ldrh $Tlh,[sp,$nhi]
199 eor $Zlh,$Zlh,$Zhl,lsl#28
200 eor $Zhl,$Thl,$Zhl,lsr#4
201 eor $Zhl,$Zhl,$Zhh,lsl#28
202 eorpl $nlo,$nlo,$Tll
203 eor $Zhh,$Thh,$Zhh,lsr#4
204 andpl $nhi,$nlo,#0xf0
205 andpl $nlo,$nlo,#0x0f
206 eor $Zhh,$Zhh,$Tlh,lsl#16 @ ^= rem_4bit[rem]
207 bpl .Linner
208
209 ldr $len,[sp,#32] @ re-load $len/end
210 add $inp,$inp,#16
211 mov $nhi,$Zll
212___
213 &Zsmash("cmp\t$inp,$len","ldrneb\t$nlo,[$inp,#15]");
214$code.=<<___;
215 bne .Louter
216
217 add sp,sp,#36
218#if __ARM_ARCH__>=5
219 ldmia sp!,{r4-r11,pc}
220#else
221 ldmia sp!,{r4-r11,lr}
222 tst lr,#1
223 moveq pc,lr @ be binary compatible with V4, yet
224 bx lr @ interoperable with Thumb ISA:-)
225#endif
226.size gcm_ghash_4bit,.-gcm_ghash_4bit
227
228.global gcm_gmult_4bit
229.type gcm_gmult_4bit,%function
230gcm_gmult_4bit:
231 stmdb sp!,{r4-r11,lr}
232 ldrb $nlo,[$Xi,#15]
233 b rem_4bit_get
234.Lrem_4bit_got:
235 and $nhi,$nlo,#0xf0
236 and $nlo,$nlo,#0x0f
237 mov $cnt,#14
238
239 add $Zhh,$Htbl,$nlo,lsl#4
240 ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo]
241 ldrb $nlo,[$Xi,#14]
242
243 add $Thh,$Htbl,$nhi
244 and $nhi,$Zll,#0xf @ rem
245 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
246 add $nhi,$nhi,$nhi
247 eor $Zll,$Tll,$Zll,lsr#4
248 ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
249 eor $Zll,$Zll,$Zlh,lsl#28
250 eor $Zlh,$Tlh,$Zlh,lsr#4
251 eor $Zlh,$Zlh,$Zhl,lsl#28
252 eor $Zhl,$Thl,$Zhl,lsr#4
253 eor $Zhl,$Zhl,$Zhh,lsl#28
254 eor $Zhh,$Thh,$Zhh,lsr#4
255 and $nhi,$nlo,#0xf0
256 eor $Zhh,$Zhh,$Tll,lsl#16
257 and $nlo,$nlo,#0x0f
258
259.Loop:
260 add $Thh,$Htbl,$nlo,lsl#4
261 and $nlo,$Zll,#0xf @ rem
262 subs $cnt,$cnt,#1
263 add $nlo,$nlo,$nlo
264 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
265 eor $Zll,$Tll,$Zll,lsr#4
266 eor $Zll,$Zll,$Zlh,lsl#28
267 eor $Zlh,$Tlh,$Zlh,lsr#4
268 eor $Zlh,$Zlh,$Zhl,lsl#28
269 ldrh $Tll,[$rem_4bit,$nlo] @ rem_4bit[rem]
270 eor $Zhl,$Thl,$Zhl,lsr#4
271 ldrplb $nlo,[$Xi,$cnt]
272 eor $Zhl,$Zhl,$Zhh,lsl#28
273 eor $Zhh,$Thh,$Zhh,lsr#4
274
275 add $Thh,$Htbl,$nhi
276 and $nhi,$Zll,#0xf @ rem
277 eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
278 add $nhi,$nhi,$nhi
279 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
280 eor $Zll,$Tll,$Zll,lsr#4
281 eor $Zll,$Zll,$Zlh,lsl#28
282 eor $Zlh,$Tlh,$Zlh,lsr#4
283 ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
284 eor $Zlh,$Zlh,$Zhl,lsl#28
285 eor $Zhl,$Thl,$Zhl,lsr#4
286 eor $Zhl,$Zhl,$Zhh,lsl#28
287 eor $Zhh,$Thh,$Zhh,lsr#4
288 andpl $nhi,$nlo,#0xf0
289 andpl $nlo,$nlo,#0x0f
290 eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
291 bpl .Loop
292___
293 &Zsmash();
294$code.=<<___;
295#if __ARM_ARCH__>=5
296 ldmia sp!,{r4-r11,pc}
297#else
298 ldmia sp!,{r4-r11,lr}
299 tst lr,#1
300 moveq pc,lr @ be binary compatible with V4, yet
301 bx lr @ interoperable with Thumb ISA:-)
302#endif
303.size gcm_gmult_4bit,.-gcm_gmult_4bit
304___
305{
306my $cnt=$Htbl; # $Htbl is used once in the very beginning
307
308my ($Hhi, $Hlo, $Zo, $T, $xi, $mod) = map("d$_",(0..7));
309my ($Qhi, $Qlo, $Z, $R, $zero, $Qpost, $IN) = map("q$_",(8..15));
310
311# Z:Zo keeps 128-bit result shifted by 1 to the right, with bottom bit
312# in Zo. Or should I say "top bit", because GHASH is specified in
313# reverse bit order? Otherwise straightforward 128-bit H by one input
314# byte multiplication and modulo-reduction, times 16.
315
316sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
317sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
318sub Q() { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; }
319
320$code.=<<___;
321#if __ARM_ARCH__>=7
322.fpu neon
323
324.global gcm_gmult_neon
325.type gcm_gmult_neon,%function
326.align 4
327gcm_gmult_neon:
328 sub $Htbl,#16 @ point at H in GCM128_CTX
329 vld1.64 `&Dhi("$IN")`,[$Xi,:64]!@ load Xi
330 vmov.i32 $mod,#0xe1 @ our irreducible polynomial
331 vld1.64 `&Dlo("$IN")`,[$Xi,:64]!
332 vshr.u64 $mod,#32
333 vldmia $Htbl,{$Hhi-$Hlo} @ load H
334 veor $zero,$zero
335#ifdef __ARMEL__
336 vrev64.8 $IN,$IN
337#endif
338 veor $Qpost,$Qpost
339 veor $R,$R
340 mov $cnt,#16
341 veor $Z,$Z
342 mov $len,#16
343 veor $Zo,$Zo
344 vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
345 b .Linner_neon
346.size gcm_gmult_neon,.-gcm_gmult_neon
347
348.global gcm_ghash_neon
349.type gcm_ghash_neon,%function
350.align 4
351gcm_ghash_neon:
352 vld1.64 `&Dhi("$Z")`,[$Xi,:64]! @ load Xi
353 vmov.i32 $mod,#0xe1 @ our irreducible polynomial
354 vld1.64 `&Dlo("$Z")`,[$Xi,:64]!
355 vshr.u64 $mod,#32
356 vldmia $Xi,{$Hhi-$Hlo} @ load H
357 veor $zero,$zero
358 nop
359#ifdef __ARMEL__
360 vrev64.8 $Z,$Z
361#endif
362.Louter_neon:
363 vld1.64 `&Dhi($IN)`,[$inp]! @ load inp
364 veor $Qpost,$Qpost
365 vld1.64 `&Dlo($IN)`,[$inp]!
366 veor $R,$R
367 mov $cnt,#16
368#ifdef __ARMEL__
369 vrev64.8 $IN,$IN
370#endif
371 veor $Zo,$Zo
372 veor $IN,$Z @ inp^=Xi
373 veor $Z,$Z
374 vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
375.Linner_neon:
376 subs $cnt,$cnt,#1
377 vmull.p8 $Qlo,$Hlo,$xi @ H.lo·Xi[i]
378 vmull.p8 $Qhi,$Hhi,$xi @ H.hi·Xi[i]
379 vext.8 $IN,$zero,#1 @ IN>>=8
380
381 veor $Z,$Qpost @ modulo-scheduled part
382 vshl.i64 `&Dlo("$R")`,#48
383 vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
384 veor $T,`&Dlo("$Qlo")`,`&Dlo("$Z")`
385
386 veor `&Dhi("$Z")`,`&Dlo("$R")`
387 vuzp.8 $Qlo,$Qhi
388 vsli.8 $Zo,$T,#1 @ compose the "carry" byte
389 vext.8 $Z,$zero,#1 @ Z>>=8
390
391 vmull.p8 $R,$Zo,$mod @ "carry"·0xe1
392 vshr.u8 $Zo,$T,#7 @ save Z's bottom bit
393 vext.8 $Qpost,$Qlo,$zero,#1 @ Qlo>>=8
394 veor $Z,$Qhi
395 bne .Linner_neon
396
397 veor $Z,$Qpost @ modulo-scheduled artefact
398 vshl.i64 `&Dlo("$R")`,#48
399 veor `&Dhi("$Z")`,`&Dlo("$R")`
400
401 @ finalization, normalize Z:Zo
402 vand $Zo,$mod @ suffices to mask the bit
403 vshr.u64 `&Dhi(&Q("$Zo"))`,`&Dlo("$Z")`,#63
404 vshl.i64 $Z,#1
405 subs $len,#16
406 vorr $Z,`&Q("$Zo")` @ Z=Z:Zo<<1
407 bne .Louter_neon
408
409#ifdef __ARMEL__
410 vrev64.8 $Z,$Z
411#endif
412 sub $Xi,#16
413 vst1.64 `&Dhi("$Z")`,[$Xi,:64]! @ write out Xi
414 vst1.64 `&Dlo("$Z")`,[$Xi,:64]
415
416 bx lr
417.size gcm_ghash_neon,.-gcm_ghash_neon
418#endif
419___
420}
421$code.=<<___;
422.asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
423.align 2
424___
425
426$code =~ s/\`([^\`]*)\`/eval $1/gem;
427$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
428print $code;
429close STDOUT; # enforce flush
diff --git a/src/lib/libcrypto/modes/asm/ghash-ia64.pl b/src/lib/libcrypto/modes/asm/ghash-ia64.pl
deleted file mode 100755
index 0354c95444..0000000000
--- a/src/lib/libcrypto/modes/asm/ghash-ia64.pl
+++ /dev/null
@@ -1,463 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# March 2010
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that it
14# uses 256 bytes per-key table [+128 bytes shared table]. Streamed
15# GHASH performance was measured to be 6.67 cycles per processed byte
16# on Itanium 2, which is >90% better than Microsoft compiler-generated
17# code. To anchor to something else, the sha1-ia64.pl module processes one
18# byte in 5.7 cycles. On Itanium GHASH should run at ~8.5 cycles per
19# byte.
20
21# September 2010
22#
23# It was originally thought that it made less sense to implement the
24# "528B" variant on Itanium 2, for the following reason: because the number
25# of functional units is naturally limited, it appeared impossible to
26# implement the "528B" loop in 4 cycles, only in 5. This would mean that
27# theoretically performance improvement couldn't be more than 20%.
28# But occasionally you prove yourself wrong:-) I figured out a way to
29# fold a couple of instructions and free yet another instruction
30# slot by unrolling the loop... The resulting performance is 4.45 cycles
31# per processed byte, 50% better than the "256B" version. On the original
32# Itanium, performance should remain the same as for the "256B" version,
33# i.e. ~8.5 cycles.
34
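The "528B" streamed variant described above keeps, next to Htable, a second 16-entry table of the same values pre-shifted right by 4 bits (called Hshr4 in the comments below), so the inner loop can consume a whole input byte per step; the bits shifted out are folded back through the rem_8bit table at the end of the file. A sketch of that auxiliary table, using the same illustrative u128 pair as in the earlier C fragment:

#include <stdint.h>

typedef struct { uint64_t hi, lo; } u128;

/* Hshr4[i] = Htable[i] >> 4 as a 128-bit shift; gcm_ghash_4bit below
 * builds this (together with a copy of Htable) in a 512-byte stack area
 * before entering its main loop. */
static void
build_hshr4(u128 Hshr4[16], const u128 Htable[16])
{
    int i;

    for (i = 0; i < 16; i++) {
        Hshr4[i].lo = (Htable[i].hi << 60) | (Htable[i].lo >> 4);
        Hshr4[i].hi =  Htable[i].hi >> 4;
    }
}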
35$output=shift and (open STDOUT,">$output" or die "can't open $output: $!");
36
37if ($^O eq "hpux") {
38 $ADDP="addp4";
39 for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
40} else { $ADDP="add"; }
41for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/);
42 $big_endian=0 if (/\-DL_ENDIAN/); }
43if (!defined($big_endian))
44 { $big_endian=(unpack('L',pack('N',1))==1); }
45
46sub loop() {
47my $label=shift;
48my ($p16,$p17)=(shift)?("p63","p63"):("p16","p17"); # mask references to inp
49
50# Loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium, i.e.
51# in scalable manner;-) Naturally assuming data in L1 cache...
52# Special note about 'dep' instruction, which is used to construct
53# &rem_4bit[Zlo&0xf]. It works, because rem_4bit is aligned at 128
54# bytes boundary and lower 7 bits of its address are guaranteed to
55# be zero.
56$code.=<<___;
57$label:
58{ .mfi; (p18) ld8 Hlo=[Hi[1]],-8
59 (p19) dep rem=Zlo,rem_4bitp,3,4 }
60{ .mfi; (p19) xor Zhi=Zhi,Hhi
61 ($p17) xor xi[1]=xi[1],in[1] };;
62{ .mfi; (p18) ld8 Hhi=[Hi[1]]
63 (p19) shrp Zlo=Zhi,Zlo,4 }
64{ .mfi; (p19) ld8 rem=[rem]
65 (p18) and Hi[1]=mask0xf0,xi[2] };;
66{ .mmi; ($p16) ld1 in[0]=[inp],-1
67 (p18) xor Zlo=Zlo,Hlo
68 (p19) shr.u Zhi=Zhi,4 }
69{ .mib; (p19) xor Hhi=Hhi,rem
70 (p18) add Hi[1]=Htbl,Hi[1] };;
71
72{ .mfi; (p18) ld8 Hlo=[Hi[1]],-8
73 (p18) dep rem=Zlo,rem_4bitp,3,4 }
74{ .mfi; (p17) shladd Hi[0]=xi[1],4,r0
75 (p18) xor Zhi=Zhi,Hhi };;
76{ .mfi; (p18) ld8 Hhi=[Hi[1]]
77 (p18) shrp Zlo=Zhi,Zlo,4 }
78{ .mfi; (p18) ld8 rem=[rem]
79 (p17) and Hi[0]=mask0xf0,Hi[0] };;
80{ .mmi; (p16) ld1 xi[0]=[Xi],-1
81 (p18) xor Zlo=Zlo,Hlo
82 (p18) shr.u Zhi=Zhi,4 }
83{ .mib; (p18) xor Hhi=Hhi,rem
84 (p17) add Hi[0]=Htbl,Hi[0]
85 br.ctop.sptk $label };;
86___
87}
88
89$code=<<___;
90.explicit
91.text
92
93prevfs=r2; prevlc=r3; prevpr=r8;
94mask0xf0=r21;
95rem=r22; rem_4bitp=r23;
96Xi=r24; Htbl=r25;
97inp=r26; end=r27;
98Hhi=r28; Hlo=r29;
99Zhi=r30; Zlo=r31;
100
101.align 128
102.skip 16 // aligns loop body
103.global gcm_gmult_4bit#
104.proc gcm_gmult_4bit#
105gcm_gmult_4bit:
106 .prologue
107{ .mmi; .save ar.pfs,prevfs
108 alloc prevfs=ar.pfs,2,6,0,8
109 $ADDP Xi=15,in0 // &Xi[15]
110 mov rem_4bitp=ip }
111{ .mii; $ADDP Htbl=8,in1 // &Htbl[0].lo
112 .save ar.lc,prevlc
113 mov prevlc=ar.lc
114 .save pr,prevpr
115 mov prevpr=pr };;
116
117 .body
118 .rotr in[3],xi[3],Hi[2]
119
120{ .mib; ld1 xi[2]=[Xi],-1 // Xi[15]
121 mov mask0xf0=0xf0
122 brp.loop.imp .Loop1,.Lend1-16};;
123{ .mmi; ld1 xi[1]=[Xi],-1 // Xi[14]
124 };;
125{ .mii; shladd Hi[1]=xi[2],4,r0
126 mov pr.rot=0x7<<16
127 mov ar.lc=13 };;
128{ .mii; and Hi[1]=mask0xf0,Hi[1]
129 mov ar.ec=3
130 xor Zlo=Zlo,Zlo };;
131{ .mii; add Hi[1]=Htbl,Hi[1] // &Htbl[nlo].lo
132 add rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp
133 xor Zhi=Zhi,Zhi };;
134___
135 &loop (".Loop1",1);
136$code.=<<___;
137.Lend1:
138{ .mib; xor Zhi=Zhi,Hhi };; // modulo-scheduling artefact
139{ .mib; mux1 Zlo=Zlo,\@rev };;
140{ .mib; mux1 Zhi=Zhi,\@rev };;
141{ .mmi; add Hlo=9,Xi;; // ;; is here to prevent
142 add Hhi=1,Xi };; // pipeline flush on Itanium
143{ .mib; st8 [Hlo]=Zlo
144 mov pr=prevpr,0x1ffff };;
145{ .mib; st8 [Hhi]=Zhi
146 mov ar.lc=prevlc
147 br.ret.sptk.many b0 };;
148.endp gcm_gmult_4bit#
149___
150
151######################################################################
152# "528B" (well, "512B" actualy) streamed GHASH
153#
154$Xip="in0";
155$Htbl="in1";
156$inp="in2";
157$len="in3";
158$rem_8bit="loc0";
159$mask0xff="loc1";
160($sum,$rum) = $big_endian ? ("nop.m","nop.m") : ("sum","rum");
161
162sub load_htable() {
163 for (my $i=0;$i<8;$i++) {
164 $code.=<<___;
165{ .mmi; ld8 r`16+2*$i+1`=[r8],16 // Htable[$i].hi
166 ld8 r`16+2*$i`=[r9],16 } // Htable[$i].lo
167{ .mmi; ldf8 f`32+2*$i+1`=[r10],16 // Htable[`8+$i`].hi
168 ldf8 f`32+2*$i`=[r11],16 // Htable[`8+$i`].lo
169___
170 $code.=shift if (($i+$#_)==7);
171 $code.="\t};;\n"
172 }
173}
174
175$code.=<<___;
176prevsp=r3;
177
178.align 32
179.skip 16 // aligns loop body
180.global gcm_ghash_4bit#
181.proc gcm_ghash_4bit#
182gcm_ghash_4bit:
183 .prologue
184{ .mmi; .save ar.pfs,prevfs
185 alloc prevfs=ar.pfs,4,2,0,0
186 .vframe prevsp
187 mov prevsp=sp
188 mov $rem_8bit=ip };;
189 .body
190{ .mfi; $ADDP r8=0+0,$Htbl
191 $ADDP r9=0+8,$Htbl }
192{ .mfi; $ADDP r10=128+0,$Htbl
193 $ADDP r11=128+8,$Htbl };;
194___
195 &load_htable(
196 " $ADDP $Xip=15,$Xip", # &Xi[15]
197 " $ADDP $len=$len,$inp", # &inp[len]
198 " $ADDP $inp=15,$inp", # &inp[15]
199 " mov $mask0xff=0xff",
200 " add sp=-512,sp",
201 " andcm sp=sp,$mask0xff", # align stack frame
202 " add r14=0,sp",
203 " add r15=8,sp");
204$code.=<<___;
205{ .mmi; $sum 1<<1 // go big-endian
206 add r8=256+0,sp
207 add r9=256+8,sp }
208{ .mmi; add r10=256+128+0,sp
209 add r11=256+128+8,sp
210 add $len=-17,$len };;
211___
212for($i=0;$i<8;$i++) { # generate first half of Hshr4[]
213my ($rlo,$rhi)=("r".eval(16+2*$i),"r".eval(16+2*$i+1));
214$code.=<<___;
215{ .mmi; st8 [r8]=$rlo,16 // Htable[$i].lo
216 st8 [r9]=$rhi,16 // Htable[$i].hi
217 shrp $rlo=$rhi,$rlo,4 }//;;
218{ .mmi; stf8 [r10]=f`32+2*$i`,16 // Htable[`8+$i`].lo
219 stf8 [r11]=f`32+2*$i+1`,16 // Htable[`8+$i`].hi
220 shr.u $rhi=$rhi,4 };;
221{ .mmi; st8 [r14]=$rlo,16 // Htable[$i].lo>>4
222 st8 [r15]=$rhi,16 }//;; // Htable[$i].hi>>4
223___
224}
225$code.=<<___;
226{ .mmi; ld8 r16=[r8],16 // Htable[8].lo
227 ld8 r17=[r9],16 };; // Htable[8].hi
228{ .mmi; ld8 r18=[r8],16 // Htable[9].lo
229 ld8 r19=[r9],16 } // Htable[9].hi
230{ .mmi; rum 1<<5 // clear um.mfh
231 shrp r16=r17,r16,4 };;
232___
233for($i=0;$i<6;$i++) { # generate second half of Hshr4[]
234$code.=<<___;
235{ .mmi; ld8 r`20+2*$i`=[r8],16 // Htable[`10+$i`].lo
236 ld8 r`20+2*$i+1`=[r9],16 // Htable[`10+$i`].hi
237 shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };;
238{ .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4
239 st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4
240 shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 }
241___
242}
243$code.=<<___;
244{ .mmi; shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };;
245{ .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4
246 st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4
247 shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 }
248{ .mmi; add $Htbl=256,sp // &Htable[0]
249 add $rem_8bit=rem_8bit#-gcm_ghash_4bit#,$rem_8bit
250 shr.u r`18+2*$i+1`=r`18+2*$i+1`,4 };;
251{ .mmi; st8 [r14]=r`18+2*$i` // Htable[`8+$i`].lo>>4
252 st8 [r15]=r`18+2*$i+1` } // Htable[`8+$i`].hi>>4
253___
254
255$in="r15";
256@xi=("r16","r17");
257@rem=("r18","r19");
258($Alo,$Ahi,$Blo,$Bhi,$Zlo,$Zhi)=("r20","r21","r22","r23","r24","r25");
259($Atbl,$Btbl)=("r26","r27");
260
261$code.=<<___; # (p16)
262{ .mmi; ld1 $in=[$inp],-1 //(p16) *inp--
263 ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
264 cmp.eq p0,p6=r0,r0 };; // clear p6
265___
266push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
267
268$code.=<<___; # (p16),(p17)
269{ .mmi; ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
270 xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
271{ .mii; ld1 $in=[$inp],-1 //(p16) *inp--
272 dep $Atbl=$xi[1],$Htbl,4,4 //(p17) &Htable[nlo].lo
273 and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
274.align 32
275.LOOP:
276{ .mmi;
277(p6) st8 [$Xip]=$Zhi,13
278 xor $Zlo=$Zlo,$Zlo
279 add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi].lo
280___
281push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
282
283$code.=<<___; # (p16),(p17),(p18)
284{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
285 ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
286 xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
287{ .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
288 dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo
289{ .mfi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
290 xor $Zlo=$Zlo,$Alo };; //(p18) Z.lo^=Htable[nlo].lo
291{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
292 ld1 $in=[$inp],-1 } //(p16) *inp--
293{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
294 mov $Zhi=$Ahi //(p18) Z.hi^=Htable[nlo].hi
295 and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
296{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
297 ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
298 shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
299{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
300 add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
301___
302push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
303
304for ($i=1;$i<14;$i++) {
305# Above and below fragments are derived from this one by removing
306# unsuitable (p??) instructions.
307$code.=<<___; # (p16),(p17),(p18),(p19)
308{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
309 ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
310 shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
311{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
312 xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
313 xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
314{ .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
315 ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
316 dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo
317{ .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
318 xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo
319 xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
320{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
321 ld1 $in=[$inp],-1 //(p16) *inp--
322 shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
323{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
324 xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi
325 and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
326{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
327 ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
328 shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
329{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
330 xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
331 add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
332___
333push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
334}
335
336$code.=<<___; # (p17),(p18),(p19)
337{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
338 ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
339 shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
340{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
341 xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
342 xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
343{ .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
344 ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
345 dep $Atbl=$xi[1],$Htbl,4,4 };; //(p17) &Htable[nlo].lo
346{ .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
347 xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo
348 xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
349{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
350 shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
351{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
352 xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi
353 and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
354{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
355 shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
356{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
357 xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
358 add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
359___
360push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
361
362$code.=<<___; # (p18),(p19)
363{ .mfi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
364 shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
365{ .mfi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
366 xor $Zlo=$Zlo,$Blo };; //(p19) Z.lo^=Hshr4[nhi].lo
367{ .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
368 xor $Zlo=$Zlo,$Alo } //(p18) Z.lo^=Htable[nlo].lo
369{ .mfi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
370 xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
371{ .mfi; ld8 $Blo=[$Btbl],8 //(p18) Htable[nhi].lo,&Htable[nhi].hi
372 shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
373{ .mfi; shladd $rem[0]=$Zlo,4,r0 //(p18) Z.lo<<4
374 xor $Zhi=$Zhi,$Ahi };; //(p18) Z.hi^=Htable[nlo].hi
375{ .mfi; ld8 $Bhi=[$Btbl] //(p18) Htable[nhi].hi
376 shrp $Zlo=$Zhi,$Zlo,4 } //(p18) Z.lo=(Z.hi<<60)|(Z.lo>>4)
377{ .mfi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
378 xor $Zhi=$Zhi,$rem[1] };; //(p19) Z.hi^=rem_8bit[rem]<<48
379___
380push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
381
382$code.=<<___; # (p19)
383{ .mmi; cmp.ltu p6,p0=$inp,$len
384 add $inp=32,$inp
385 shr.u $Zhi=$Zhi,4 } //(p19) Z.hi>>=4
386{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
387 xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
388 add $Xip=9,$Xip };; // &Xi.lo
389{ .mmi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
390(p6) ld1 $in=[$inp],-1 //[p16] *inp--
391(p6) extr.u $xi[1]=$Zlo,8,8 } //[p17] Xi[14]
392{ .mmi; xor $Zhi=$Zhi,$Bhi //(p19) Z.hi^=Hshr4[nhi].hi
393(p6) and $xi[0]=$Zlo,$mask0xff };; //[p16] Xi[15]
394{ .mmi; st8 [$Xip]=$Zlo,-8
395(p6) xor $xi[0]=$xi[0],$in //[p17] xi=$xi[i]^inp[i]
396 shl $rem[1]=$rem[1],48 };; //(p19) rem_8bit[rem]<<48
397{ .mmi;
398(p6) ld1 $in=[$inp],-1 //[p16] *inp--
399 xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
400(p6) dep $Atbl=$xi[0],$Htbl,4,4 } //[p17] &Htable[nlo].lo
401{ .mib;
402(p6) and $xi[0]=-16,$xi[0] //[p17] nhi=xi&0xf0
403(p6) br.cond.dptk.many .LOOP };;
404
405{ .mib; st8 [$Xip]=$Zhi };;
406{ .mib; $rum 1<<1 // return to little-endian
407 .restore sp
408 mov sp=prevsp
409 br.ret.sptk.many b0 };;
410.endp gcm_ghash_4bit#
411___
412$code.=<<___;
413.align 128
414.type rem_4bit#,\@object
415rem_4bit:
416 data8 0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
417 data8 0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
418 data8 0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
419 data8 0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
420.size rem_4bit#,128
421.type rem_8bit#,\@object
422rem_8bit:
423 data1 0x00,0x00, 0x01,0xC2, 0x03,0x84, 0x02,0x46, 0x07,0x08, 0x06,0xCA, 0x04,0x8C, 0x05,0x4E
424 data1 0x0E,0x10, 0x0F,0xD2, 0x0D,0x94, 0x0C,0x56, 0x09,0x18, 0x08,0xDA, 0x0A,0x9C, 0x0B,0x5E
425 data1 0x1C,0x20, 0x1D,0xE2, 0x1F,0xA4, 0x1E,0x66, 0x1B,0x28, 0x1A,0xEA, 0x18,0xAC, 0x19,0x6E
426 data1 0x12,0x30, 0x13,0xF2, 0x11,0xB4, 0x10,0x76, 0x15,0x38, 0x14,0xFA, 0x16,0xBC, 0x17,0x7E
427 data1 0x38,0x40, 0x39,0x82, 0x3B,0xC4, 0x3A,0x06, 0x3F,0x48, 0x3E,0x8A, 0x3C,0xCC, 0x3D,0x0E
428 data1 0x36,0x50, 0x37,0x92, 0x35,0xD4, 0x34,0x16, 0x31,0x58, 0x30,0x9A, 0x32,0xDC, 0x33,0x1E
429 data1 0x24,0x60, 0x25,0xA2, 0x27,0xE4, 0x26,0x26, 0x23,0x68, 0x22,0xAA, 0x20,0xEC, 0x21,0x2E
430 data1 0x2A,0x70, 0x2B,0xB2, 0x29,0xF4, 0x28,0x36, 0x2D,0x78, 0x2C,0xBA, 0x2E,0xFC, 0x2F,0x3E
431 data1 0x70,0x80, 0x71,0x42, 0x73,0x04, 0x72,0xC6, 0x77,0x88, 0x76,0x4A, 0x74,0x0C, 0x75,0xCE
432 data1 0x7E,0x90, 0x7F,0x52, 0x7D,0x14, 0x7C,0xD6, 0x79,0x98, 0x78,0x5A, 0x7A,0x1C, 0x7B,0xDE
433 data1 0x6C,0xA0, 0x6D,0x62, 0x6F,0x24, 0x6E,0xE6, 0x6B,0xA8, 0x6A,0x6A, 0x68,0x2C, 0x69,0xEE
434 data1 0x62,0xB0, 0x63,0x72, 0x61,0x34, 0x60,0xF6, 0x65,0xB8, 0x64,0x7A, 0x66,0x3C, 0x67,0xFE
435 data1 0x48,0xC0, 0x49,0x02, 0x4B,0x44, 0x4A,0x86, 0x4F,0xC8, 0x4E,0x0A, 0x4C,0x4C, 0x4D,0x8E
436 data1 0x46,0xD0, 0x47,0x12, 0x45,0x54, 0x44,0x96, 0x41,0xD8, 0x40,0x1A, 0x42,0x5C, 0x43,0x9E
437 data1 0x54,0xE0, 0x55,0x22, 0x57,0x64, 0x56,0xA6, 0x53,0xE8, 0x52,0x2A, 0x50,0x6C, 0x51,0xAE
438 data1 0x5A,0xF0, 0x5B,0x32, 0x59,0x74, 0x58,0xB6, 0x5D,0xF8, 0x5C,0x3A, 0x5E,0x7C, 0x5F,0xBE
439 data1 0xE1,0x00, 0xE0,0xC2, 0xE2,0x84, 0xE3,0x46, 0xE6,0x08, 0xE7,0xCA, 0xE5,0x8C, 0xE4,0x4E
440 data1 0xEF,0x10, 0xEE,0xD2, 0xEC,0x94, 0xED,0x56, 0xE8,0x18, 0xE9,0xDA, 0xEB,0x9C, 0xEA,0x5E
441 data1 0xFD,0x20, 0xFC,0xE2, 0xFE,0xA4, 0xFF,0x66, 0xFA,0x28, 0xFB,0xEA, 0xF9,0xAC, 0xF8,0x6E
442 data1 0xF3,0x30, 0xF2,0xF2, 0xF0,0xB4, 0xF1,0x76, 0xF4,0x38, 0xF5,0xFA, 0xF7,0xBC, 0xF6,0x7E
443 data1 0xD9,0x40, 0xD8,0x82, 0xDA,0xC4, 0xDB,0x06, 0xDE,0x48, 0xDF,0x8A, 0xDD,0xCC, 0xDC,0x0E
444 data1 0xD7,0x50, 0xD6,0x92, 0xD4,0xD4, 0xD5,0x16, 0xD0,0x58, 0xD1,0x9A, 0xD3,0xDC, 0xD2,0x1E
445 data1 0xC5,0x60, 0xC4,0xA2, 0xC6,0xE4, 0xC7,0x26, 0xC2,0x68, 0xC3,0xAA, 0xC1,0xEC, 0xC0,0x2E
446 data1 0xCB,0x70, 0xCA,0xB2, 0xC8,0xF4, 0xC9,0x36, 0xCC,0x78, 0xCD,0xBA, 0xCF,0xFC, 0xCE,0x3E
447 data1 0x91,0x80, 0x90,0x42, 0x92,0x04, 0x93,0xC6, 0x96,0x88, 0x97,0x4A, 0x95,0x0C, 0x94,0xCE
448 data1 0x9F,0x90, 0x9E,0x52, 0x9C,0x14, 0x9D,0xD6, 0x98,0x98, 0x99,0x5A, 0x9B,0x1C, 0x9A,0xDE
449 data1 0x8D,0xA0, 0x8C,0x62, 0x8E,0x24, 0x8F,0xE6, 0x8A,0xA8, 0x8B,0x6A, 0x89,0x2C, 0x88,0xEE
450 data1 0x83,0xB0, 0x82,0x72, 0x80,0x34, 0x81,0xF6, 0x84,0xB8, 0x85,0x7A, 0x87,0x3C, 0x86,0xFE
451 data1 0xA9,0xC0, 0xA8,0x02, 0xAA,0x44, 0xAB,0x86, 0xAE,0xC8, 0xAF,0x0A, 0xAD,0x4C, 0xAC,0x8E
452 data1 0xA7,0xD0, 0xA6,0x12, 0xA4,0x54, 0xA5,0x96, 0xA0,0xD8, 0xA1,0x1A, 0xA3,0x5C, 0xA2,0x9E
453 data1 0xB5,0xE0, 0xB4,0x22, 0xB6,0x64, 0xB7,0xA6, 0xB2,0xE8, 0xB3,0x2A, 0xB1,0x6C, 0xB0,0xAE
454 data1 0xBB,0xF0, 0xBA,0x32, 0xB8,0x74, 0xB9,0xB6, 0xBC,0xF8, 0xBD,0x3A, 0xBF,0x7C, 0xBE,0xBE
455.size rem_8bit#,512
456stringz "GHASH for IA64, CRYPTOGAMS by <appro\@openssl.org>"
457___
458
459$code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm if ($big_endian);
460$code =~ s/\`([^\`]*)\`/eval $1/gem;
461
462print $code;
463close STDOUT;
diff --git a/src/lib/libcrypto/modes/asm/ghash-parisc.pl b/src/lib/libcrypto/modes/asm/ghash-parisc.pl
deleted file mode 100644
index 965802d3fa..0000000000
--- a/src/lib/libcrypto/modes/asm/ghash-parisc.pl
+++ /dev/null
@@ -1,741 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# April 2010
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that it
14# uses 256 bytes per-key table [+128 bytes shared table]. On PA-7100LC
15# it processes one byte in 19.6 cycles, which is more than twice as
16# fast as code generated by gcc 3.2. The PA-RISC 2.0 loop is scheduled for
17# 8 cycles, but measured performance on a PA-8600 system is ~9 cycles per
18# processed byte. This is ~2.2x faster than 64-bit code generated by the
19# vendor compiler (which used to be very hard to beat:-).
20#
21# Special thanks to polarhome.com for providing HP-UX account.
22
23$flavour = shift;
24$output = shift;
25open STDOUT,">$output";
26
27if ($flavour =~ /64/) {
28 $LEVEL ="2.0W";
29 $SIZE_T =8;
30 $FRAME_MARKER =80;
31 $SAVED_RP =16;
32 $PUSH ="std";
33 $PUSHMA ="std,ma";
34 $POP ="ldd";
35 $POPMB ="ldd,mb";
36 $NREGS =6;
37} else {
38 $LEVEL ="1.0"; #"\n\t.ALLOW\t2.0";
39 $SIZE_T =4;
40 $FRAME_MARKER =48;
41 $SAVED_RP =20;
42 $PUSH ="stw";
43 $PUSHMA ="stwm";
44 $POP ="ldw";
45 $POPMB ="ldwm";
46 $NREGS =11;
47}
48
49$FRAME=10*$SIZE_T+$FRAME_MARKER;# NREGS saved regs + frame marker
50 # [+ argument transfer]
51
52################# volatile registers
53$Xi="%r26"; # argument block
54$Htbl="%r25";
55$inp="%r24";
56$len="%r23";
57$Hhh=$Htbl; # variables
58$Hll="%r22";
59$Zhh="%r21";
60$Zll="%r20";
61$cnt="%r19";
62$rem_4bit="%r28";
63$rem="%r29";
64$mask0xf0="%r31";
65
66################# preserved registers
67$Thh="%r1";
68$Tll="%r2";
69$nlo="%r3";
70$nhi="%r4";
71$byte="%r5";
72if ($SIZE_T==4) {
73 $Zhl="%r6";
74 $Zlh="%r7";
75 $Hhl="%r8";
76 $Hlh="%r9";
77 $Thl="%r10";
78 $Tlh="%r11";
79}
80$rem2="%r6"; # used in PA-RISC 2.0 code
81
82$code.=<<___;
83 .LEVEL $LEVEL
84#if 0
85 .SPACE \$TEXT\$
86 .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
87#else
88 .text
89#endif
90
91 .EXPORT gcm_gmult_4bit,ENTRY,ARGW0=GR,ARGW1=GR
92 .ALIGN 64
93gcm_gmult_4bit
94 .PROC
95 .CALLINFO FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS
96 .ENTRY
97 $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
98 $PUSHMA %r3,$FRAME(%sp)
99 $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
100 $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
101 $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
102___
103$code.=<<___ if ($SIZE_T==4);
104 $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
105 $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
106 $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
107 $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
108 $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
109___
110$code.=<<___;
111 blr %r0,$rem_4bit
112 ldi 3,$rem
113L\$pic_gmult
114 andcm $rem_4bit,$rem,$rem_4bit
115 addl $inp,$len,$len
116 ldo L\$rem_4bit-L\$pic_gmult($rem_4bit),$rem_4bit
117 ldi 0xf0,$mask0xf0
118___
119$code.=<<___ if ($SIZE_T==4);
120#ifndef __OpenBSD__
121 ldi 31,$rem
122 mtctl $rem,%cr11
123 extrd,u,*= $rem,%sar,1,$rem ; executes on PA-RISC 1.0
124 b L\$parisc1_gmult
125 nop
126___
127
128$code.=<<___;
129 ldb 15($Xi),$nlo
130 ldo 8($Htbl),$Hll
131
132 and $mask0xf0,$nlo,$nhi
133 depd,z $nlo,59,4,$nlo
134
135 ldd $nlo($Hll),$Zll
136 ldd $nlo($Hhh),$Zhh
137
138 depd,z $Zll,60,4,$rem
139 shrpd $Zhh,$Zll,4,$Zll
140 extrd,u $Zhh,59,60,$Zhh
141 ldb 14($Xi),$nlo
142
143 ldd $nhi($Hll),$Tll
144 ldd $nhi($Hhh),$Thh
145 and $mask0xf0,$nlo,$nhi
146 depd,z $nlo,59,4,$nlo
147
148 xor $Tll,$Zll,$Zll
149 xor $Thh,$Zhh,$Zhh
150 ldd $rem($rem_4bit),$rem
151 b L\$oop_gmult_pa2
152 ldi 13,$cnt
153
154 .ALIGN 8
155L\$oop_gmult_pa2
156 xor $rem,$Zhh,$Zhh ; moved here to work around gas bug
157 depd,z $Zll,60,4,$rem
158
159 shrpd $Zhh,$Zll,4,$Zll
160 extrd,u $Zhh,59,60,$Zhh
161 ldd $nlo($Hll),$Tll
162 ldd $nlo($Hhh),$Thh
163
164 xor $Tll,$Zll,$Zll
165 xor $Thh,$Zhh,$Zhh
166 ldd $rem($rem_4bit),$rem
167
168 xor $rem,$Zhh,$Zhh
169 depd,z $Zll,60,4,$rem
170 ldbx $cnt($Xi),$nlo
171
172 shrpd $Zhh,$Zll,4,$Zll
173 extrd,u $Zhh,59,60,$Zhh
174 ldd $nhi($Hll),$Tll
175 ldd $nhi($Hhh),$Thh
176
177 and $mask0xf0,$nlo,$nhi
178 depd,z $nlo,59,4,$nlo
179 ldd $rem($rem_4bit),$rem
180
181 xor $Tll,$Zll,$Zll
182 addib,uv -1,$cnt,L\$oop_gmult_pa2
183 xor $Thh,$Zhh,$Zhh
184
185 xor $rem,$Zhh,$Zhh
186 depd,z $Zll,60,4,$rem
187
188 shrpd $Zhh,$Zll,4,$Zll
189 extrd,u $Zhh,59,60,$Zhh
190 ldd $nlo($Hll),$Tll
191 ldd $nlo($Hhh),$Thh
192
193 xor $Tll,$Zll,$Zll
194 xor $Thh,$Zhh,$Zhh
195 ldd $rem($rem_4bit),$rem
196
197 xor $rem,$Zhh,$Zhh
198 depd,z $Zll,60,4,$rem
199
200 shrpd $Zhh,$Zll,4,$Zll
201 extrd,u $Zhh,59,60,$Zhh
202 ldd $nhi($Hll),$Tll
203 ldd $nhi($Hhh),$Thh
204
205 xor $Tll,$Zll,$Zll
206 xor $Thh,$Zhh,$Zhh
207 ldd $rem($rem_4bit),$rem
208
209 xor $rem,$Zhh,$Zhh
210 std $Zll,8($Xi)
211 std $Zhh,0($Xi)
212___
213
214$code.=<<___ if ($SIZE_T==4);
215 b L\$done_gmult
216 nop
217
218L\$parisc1_gmult
219#endif
220 ldb 15($Xi),$nlo
221 ldo 12($Htbl),$Hll
222 ldo 8($Htbl),$Hlh
223 ldo 4($Htbl),$Hhl
224
225 and $mask0xf0,$nlo,$nhi
226 zdep $nlo,27,4,$nlo
227
228 ldwx $nlo($Hll),$Zll
229 ldwx $nlo($Hlh),$Zlh
230 ldwx $nlo($Hhl),$Zhl
231 ldwx $nlo($Hhh),$Zhh
232 zdep $Zll,28,4,$rem
233 ldb 14($Xi),$nlo
234 ldwx $rem($rem_4bit),$rem
235 shrpw $Zlh,$Zll,4,$Zll
236 ldwx $nhi($Hll),$Tll
237 shrpw $Zhl,$Zlh,4,$Zlh
238 ldwx $nhi($Hlh),$Tlh
239 shrpw $Zhh,$Zhl,4,$Zhl
240 ldwx $nhi($Hhl),$Thl
241 extru $Zhh,27,28,$Zhh
242 ldwx $nhi($Hhh),$Thh
243 xor $rem,$Zhh,$Zhh
244 and $mask0xf0,$nlo,$nhi
245 zdep $nlo,27,4,$nlo
246
247 xor $Tll,$Zll,$Zll
248 ldwx $nlo($Hll),$Tll
249 xor $Tlh,$Zlh,$Zlh
250 ldwx $nlo($Hlh),$Tlh
251 xor $Thl,$Zhl,$Zhl
252 b L\$oop_gmult_pa1
253 ldi 13,$cnt
254
255 .ALIGN 8
256L\$oop_gmult_pa1
257 zdep $Zll,28,4,$rem
258 ldwx $nlo($Hhl),$Thl
259 xor $Thh,$Zhh,$Zhh
260 ldwx $rem($rem_4bit),$rem
261 shrpw $Zlh,$Zll,4,$Zll
262 ldwx $nlo($Hhh),$Thh
263 shrpw $Zhl,$Zlh,4,$Zlh
264 ldbx $cnt($Xi),$nlo
265 xor $Tll,$Zll,$Zll
266 ldwx $nhi($Hll),$Tll
267 shrpw $Zhh,$Zhl,4,$Zhl
268 xor $Tlh,$Zlh,$Zlh
269 ldwx $nhi($Hlh),$Tlh
270 extru $Zhh,27,28,$Zhh
271 xor $Thl,$Zhl,$Zhl
272 ldwx $nhi($Hhl),$Thl
273 xor $rem,$Zhh,$Zhh
274 zdep $Zll,28,4,$rem
275 xor $Thh,$Zhh,$Zhh
276 ldwx $nhi($Hhh),$Thh
277 shrpw $Zlh,$Zll,4,$Zll
278 ldwx $rem($rem_4bit),$rem
279 shrpw $Zhl,$Zlh,4,$Zlh
280 shrpw $Zhh,$Zhl,4,$Zhl
281 and $mask0xf0,$nlo,$nhi
282 extru $Zhh,27,28,$Zhh
283 zdep $nlo,27,4,$nlo
284 xor $Tll,$Zll,$Zll
285 ldwx $nlo($Hll),$Tll
286 xor $Tlh,$Zlh,$Zlh
287 ldwx $nlo($Hlh),$Tlh
288 xor $rem,$Zhh,$Zhh
289 addib,uv -1,$cnt,L\$oop_gmult_pa1
290 xor $Thl,$Zhl,$Zhl
291
292 zdep $Zll,28,4,$rem
293 ldwx $nlo($Hhl),$Thl
294 xor $Thh,$Zhh,$Zhh
295 ldwx $rem($rem_4bit),$rem
296 shrpw $Zlh,$Zll,4,$Zll
297 ldwx $nlo($Hhh),$Thh
298 shrpw $Zhl,$Zlh,4,$Zlh
299 xor $Tll,$Zll,$Zll
300 ldwx $nhi($Hll),$Tll
301 shrpw $Zhh,$Zhl,4,$Zhl
302 xor $Tlh,$Zlh,$Zlh
303 ldwx $nhi($Hlh),$Tlh
304 extru $Zhh,27,28,$Zhh
305 xor $rem,$Zhh,$Zhh
306 xor $Thl,$Zhl,$Zhl
307 ldwx $nhi($Hhl),$Thl
308 xor $Thh,$Zhh,$Zhh
309 ldwx $nhi($Hhh),$Thh
310 zdep $Zll,28,4,$rem
311 ldwx $rem($rem_4bit),$rem
312 shrpw $Zlh,$Zll,4,$Zll
313 shrpw $Zhl,$Zlh,4,$Zlh
314 shrpw $Zhh,$Zhl,4,$Zhl
315 extru $Zhh,27,28,$Zhh
316 xor $Tll,$Zll,$Zll
317 xor $Tlh,$Zlh,$Zlh
318 xor $rem,$Zhh,$Zhh
319 stw $Zll,12($Xi)
320 xor $Thl,$Zhl,$Zhl
321 stw $Zlh,8($Xi)
322 xor $Thh,$Zhh,$Zhh
323 stw $Zhl,4($Xi)
324 stw $Zhh,0($Xi)
325___
326$code.=<<___;
327L\$done_gmult
328 $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
329 $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
330 $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
331 $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
332___
333$code.=<<___ if ($SIZE_T==4);
334 $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
335 $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
336 $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
337 $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
338 $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
339___
340$code.=<<___;
341 bv (%r2)
342 .EXIT
343 $POPMB -$FRAME(%sp),%r3
344 .PROCEND
345
346 .EXPORT gcm_ghash_4bit,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
347 .ALIGN 64
348gcm_ghash_4bit
349 .PROC
350 .CALLINFO FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=11
351 .ENTRY
352 $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
353 $PUSHMA %r3,$FRAME(%sp)
354 $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
355 $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
356 $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
357___
358$code.=<<___ if ($SIZE_T==4);
359 $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
360 $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
361 $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
362 $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
363 $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
364___
365$code.=<<___;
366 blr %r0,$rem_4bit
367 ldi 3,$rem
368L\$pic_ghash
369 andcm $rem_4bit,$rem,$rem_4bit
370 addl $inp,$len,$len
371 ldo L\$rem_4bit-L\$pic_ghash($rem_4bit),$rem_4bit
372 ldi 0xf0,$mask0xf0
373___
374$code.=<<___ if ($SIZE_T==4);
375#ifndef __OpenBSD__
376 ldi 31,$rem
377 mtctl $rem,%cr11
378 extrd,u,*= $rem,%sar,1,$rem ; executes on PA-RISC 1.0
379 b L\$parisc1_ghash
380 nop
381___
382
383$code.=<<___;
384 ldb 15($Xi),$nlo
385 ldo 8($Htbl),$Hll
386
387L\$outer_ghash_pa2
388 ldb 15($inp),$nhi
389 xor $nhi,$nlo,$nlo
390 and $mask0xf0,$nlo,$nhi
391 depd,z $nlo,59,4,$nlo
392
393 ldd $nlo($Hll),$Zll
394 ldd $nlo($Hhh),$Zhh
395
396 depd,z $Zll,60,4,$rem
397 shrpd $Zhh,$Zll,4,$Zll
398 extrd,u $Zhh,59,60,$Zhh
399 ldb 14($Xi),$nlo
400 ldb 14($inp),$byte
401
402 ldd $nhi($Hll),$Tll
403 ldd $nhi($Hhh),$Thh
404 xor $byte,$nlo,$nlo
405 and $mask0xf0,$nlo,$nhi
406 depd,z $nlo,59,4,$nlo
407
408 xor $Tll,$Zll,$Zll
409 xor $Thh,$Zhh,$Zhh
410 ldd $rem($rem_4bit),$rem
411 b L\$oop_ghash_pa2
412 ldi 13,$cnt
413
414 .ALIGN 8
415L\$oop_ghash_pa2
416 xor $rem,$Zhh,$Zhh ; moved here to work around gas bug
417 depd,z $Zll,60,4,$rem2
418
419 shrpd $Zhh,$Zll,4,$Zll
420 extrd,u $Zhh,59,60,$Zhh
421 ldd $nlo($Hll),$Tll
422 ldd $nlo($Hhh),$Thh
423
424 xor $Tll,$Zll,$Zll
425 xor $Thh,$Zhh,$Zhh
426 ldbx $cnt($Xi),$nlo
427 ldbx $cnt($inp),$byte
428
429 depd,z $Zll,60,4,$rem
430 shrpd $Zhh,$Zll,4,$Zll
431 ldd $rem2($rem_4bit),$rem2
432
433 xor $rem2,$Zhh,$Zhh
434 xor $byte,$nlo,$nlo
435 ldd $nhi($Hll),$Tll
436 ldd $nhi($Hhh),$Thh
437
438 and $mask0xf0,$nlo,$nhi
439 depd,z $nlo,59,4,$nlo
440
441 extrd,u $Zhh,59,60,$Zhh
442 xor $Tll,$Zll,$Zll
443
444 ldd $rem($rem_4bit),$rem
445 addib,uv -1,$cnt,L\$oop_ghash_pa2
446 xor $Thh,$Zhh,$Zhh
447
448 xor $rem,$Zhh,$Zhh
449 depd,z $Zll,60,4,$rem2
450
451 shrpd $Zhh,$Zll,4,$Zll
452 extrd,u $Zhh,59,60,$Zhh
453 ldd $nlo($Hll),$Tll
454 ldd $nlo($Hhh),$Thh
455
456 xor $Tll,$Zll,$Zll
457 xor $Thh,$Zhh,$Zhh
458
459 depd,z $Zll,60,4,$rem
460 shrpd $Zhh,$Zll,4,$Zll
461 ldd $rem2($rem_4bit),$rem2
462
463 xor $rem2,$Zhh,$Zhh
464 ldd $nhi($Hll),$Tll
465 ldd $nhi($Hhh),$Thh
466
467 extrd,u $Zhh,59,60,$Zhh
468 xor $Tll,$Zll,$Zll
469 xor $Thh,$Zhh,$Zhh
470 ldd $rem($rem_4bit),$rem
471
472 xor $rem,$Zhh,$Zhh
473 std $Zll,8($Xi)
474 ldo 16($inp),$inp
475 std $Zhh,0($Xi)
476 cmpb,*<> $inp,$len,L\$outer_ghash_pa2
477 copy $Zll,$nlo
478___
479
480$code.=<<___ if ($SIZE_T==4);
481 b L\$done_ghash
482 nop
483
484L\$parisc1_ghash
485#endif
486 ldb 15($Xi),$nlo
487 ldo 12($Htbl),$Hll
488 ldo 8($Htbl),$Hlh
489 ldo 4($Htbl),$Hhl
490
491L\$outer_ghash_pa1
492 ldb 15($inp),$byte
493 xor $byte,$nlo,$nlo
494 and $mask0xf0,$nlo,$nhi
495 zdep $nlo,27,4,$nlo
496
497 ldwx $nlo($Hll),$Zll
498 ldwx $nlo($Hlh),$Zlh
499 ldwx $nlo($Hhl),$Zhl
500 ldwx $nlo($Hhh),$Zhh
501 zdep $Zll,28,4,$rem
502 ldb 14($Xi),$nlo
503 ldb 14($inp),$byte
504 ldwx $rem($rem_4bit),$rem
505 shrpw $Zlh,$Zll,4,$Zll
506 ldwx $nhi($Hll),$Tll
507 shrpw $Zhl,$Zlh,4,$Zlh
508 ldwx $nhi($Hlh),$Tlh
509 shrpw $Zhh,$Zhl,4,$Zhl
510 ldwx $nhi($Hhl),$Thl
511 extru $Zhh,27,28,$Zhh
512 ldwx $nhi($Hhh),$Thh
513 xor $byte,$nlo,$nlo
514 xor $rem,$Zhh,$Zhh
515 and $mask0xf0,$nlo,$nhi
516 zdep $nlo,27,4,$nlo
517
518 xor $Tll,$Zll,$Zll
519 ldwx $nlo($Hll),$Tll
520 xor $Tlh,$Zlh,$Zlh
521 ldwx $nlo($Hlh),$Tlh
522 xor $Thl,$Zhl,$Zhl
523 b L\$oop_ghash_pa1
524 ldi 13,$cnt
525
526 .ALIGN 8
527L\$oop_ghash_pa1
528 zdep $Zll,28,4,$rem
529 ldwx $nlo($Hhl),$Thl
530 xor $Thh,$Zhh,$Zhh
531 ldwx $rem($rem_4bit),$rem
532 shrpw $Zlh,$Zll,4,$Zll
533 ldwx $nlo($Hhh),$Thh
534 shrpw $Zhl,$Zlh,4,$Zlh
535 ldbx $cnt($Xi),$nlo
536 xor $Tll,$Zll,$Zll
537 ldwx $nhi($Hll),$Tll
538 shrpw $Zhh,$Zhl,4,$Zhl
539 ldbx $cnt($inp),$byte
540 xor $Tlh,$Zlh,$Zlh
541 ldwx $nhi($Hlh),$Tlh
542 extru $Zhh,27,28,$Zhh
543 xor $Thl,$Zhl,$Zhl
544 ldwx $nhi($Hhl),$Thl
545 xor $rem,$Zhh,$Zhh
546 zdep $Zll,28,4,$rem
547 xor $Thh,$Zhh,$Zhh
548 ldwx $nhi($Hhh),$Thh
549 shrpw $Zlh,$Zll,4,$Zll
550 ldwx $rem($rem_4bit),$rem
551 shrpw $Zhl,$Zlh,4,$Zlh
552 xor $byte,$nlo,$nlo
553 shrpw $Zhh,$Zhl,4,$Zhl
554 and $mask0xf0,$nlo,$nhi
555 extru $Zhh,27,28,$Zhh
556 zdep $nlo,27,4,$nlo
557 xor $Tll,$Zll,$Zll
558 ldwx $nlo($Hll),$Tll
559 xor $Tlh,$Zlh,$Zlh
560 ldwx $nlo($Hlh),$Tlh
561 xor $rem,$Zhh,$Zhh
562 addib,uv -1,$cnt,L\$oop_ghash_pa1
563 xor $Thl,$Zhl,$Zhl
564
565 zdep $Zll,28,4,$rem
566 ldwx $nlo($Hhl),$Thl
567 xor $Thh,$Zhh,$Zhh
568 ldwx $rem($rem_4bit),$rem
569 shrpw $Zlh,$Zll,4,$Zll
570 ldwx $nlo($Hhh),$Thh
571 shrpw $Zhl,$Zlh,4,$Zlh
572 xor $Tll,$Zll,$Zll
573 ldwx $nhi($Hll),$Tll
574 shrpw $Zhh,$Zhl,4,$Zhl
575 xor $Tlh,$Zlh,$Zlh
576 ldwx $nhi($Hlh),$Tlh
577 extru $Zhh,27,28,$Zhh
578 xor $rem,$Zhh,$Zhh
579 xor $Thl,$Zhl,$Zhl
580 ldwx $nhi($Hhl),$Thl
581 xor $Thh,$Zhh,$Zhh
582 ldwx $nhi($Hhh),$Thh
583 zdep $Zll,28,4,$rem
584 ldwx $rem($rem_4bit),$rem
585 shrpw $Zlh,$Zll,4,$Zll
586 shrpw $Zhl,$Zlh,4,$Zlh
587 shrpw $Zhh,$Zhl,4,$Zhl
588 extru $Zhh,27,28,$Zhh
589 xor $Tll,$Zll,$Zll
590 xor $Tlh,$Zlh,$Zlh
591 xor $rem,$Zhh,$Zhh
592 stw $Zll,12($Xi)
593 xor $Thl,$Zhl,$Zhl
594 stw $Zlh,8($Xi)
595 xor $Thh,$Zhh,$Zhh
596 stw $Zhl,4($Xi)
597 ldo 16($inp),$inp
598 stw $Zhh,0($Xi)
599 comb,<> $inp,$len,L\$outer_ghash_pa1
600 copy $Zll,$nlo
601___
602$code.=<<___;
603L\$done_ghash
604 $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
605 $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
606 $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
607 $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
608___
609$code.=<<___ if ($SIZE_T==4);
610 $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
611 $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
612 $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
613 $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
614 $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
615___
616$code.=<<___;
617 bv (%r2)
618 .EXIT
619 $POPMB -$FRAME(%sp),%r3
620 .PROCEND
621
622 .ALIGN 64
623L\$rem_4bit
624 .WORD `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
625 .WORD `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
626 .WORD `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
627 .WORD `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
628
629 .data
 630	.STRINGZ "GHASH for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
631 .ALIGN 64
632___
633
634# Explicitly encode PA-RISC 2.0 instructions used in this module, so
635# that it can be compiled with .LEVEL 1.0. It should be noted that I
636# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
637# directive...
638
639my $ldd = sub {
640 my ($mod,$args) = @_;
641 my $orig = "ldd$mod\t$args";
642
643 if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 4
644 { my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
645 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
646 }
647 elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 5
648 { my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3;
649 $opcode|=(($1&0xF)<<17)|(($1&0x10)<<12); # encode offset
650 $opcode|=(1<<5) if ($mod =~ /^,m/);
651 $opcode|=(1<<13) if ($mod =~ /^,mb/);
652 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
653 }
654 else { "\t".$orig; }
655};
656
657my $std = sub {
658 my ($mod,$args) = @_;
659 my $orig = "std$mod\t$args";
660
661 if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices
662 { my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1);
663 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
664 }
665 else { "\t".$orig; }
666};
667
668my $extrd = sub {
669 my ($mod,$args) = @_;
670 my $orig = "extrd$mod\t$args";
671
672 # I only have ",u" completer, it's implicitly encoded...
673 if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15
674 { my $opcode=(0x36<<26)|($1<<21)|($4<<16);
675 my $len=32-$3;
676 $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos
677 $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
678 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
679 }
680 elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12
681 { my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
682 my $len=32-$2;
683 $opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len
684 $opcode |= (1<<13) if ($mod =~ /,\**=/);
685 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
686 }
687 else { "\t".$orig; }
688};
689
690my $shrpd = sub {
691 my ($mod,$args) = @_;
692 my $orig = "shrpd$mod\t$args";
693
694 if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14
695 { my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
696 my $cpos=63-$3;
697 $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa
698 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
699 }
700 elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) # format 11
701 { sprintf "\t.WORD\t0x%08x\t; %s",
702 (0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig;
703 }
704 else { "\t".$orig; }
705};
706
707my $depd = sub {
708 my ($mod,$args) = @_;
709 my $orig = "depd$mod\t$args";
710
711 # I only have ",z" completer, it's implicitly encoded...
712 if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 16
713 { my $opcode=(0x3c<<26)|($4<<21)|($1<<16);
714 my $cpos=63-$2;
715 my $len=32-$3;
716 $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode pos
717 $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
718 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
719 }
720 else { "\t".$orig; }
721};
722
723sub assemble {
724 my ($mnemonic,$mod,$args)=@_;
725 my $opcode = eval("\$$mnemonic");
726
727 ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
728}
729
730foreach (split("\n",$code)) {
731 s/\`([^\`]*)\`/eval $1/ge;
732 if ($SIZE_T==4) {
733 s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e;
734 s/cmpb,\*/comb,/;
735 s/,\*/,/;
736 }
737 s/\bbv\b/bve/ if ($SIZE_T==8);
738 print $_,"\n";
739}
740
741close STDOUT;
diff --git a/src/lib/libcrypto/modes/asm/ghash-s390x.pl b/src/lib/libcrypto/modes/asm/ghash-s390x.pl
deleted file mode 100644
index 6a40d5d89c..0000000000
--- a/src/lib/libcrypto/modes/asm/ghash-s390x.pl
+++ /dev/null
@@ -1,262 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# September 2010.
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that it
14# uses 256 bytes per-key table [+128 bytes shared table]. Performance
15# was measured to be ~18 cycles per processed byte on z10, which is
16# almost 40% better than gcc-generated code. It should be noted that
 17# 18 cycles is a worse result than expected: the loop is scheduled for 12
 18# and the result should be close to 12. In the absence of instruction-
19# level profiling data it's impossible to tell why...
20
21# November 2010.
22#
23# Adapt for -m31 build. If kernel supports what's called "highgprs"
24# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
25# instructions and achieve "64-bit" performance even in 31-bit legacy
26# application context. The feature is not specific to any particular
27# processor, as long as it's "z-CPU". Latter implies that the code
28# remains z/Architecture specific. On z990 it was measured to perform
29# 2.8x better than 32-bit code generated by gcc 4.3.
30
31# March 2011.
32#
33# Support for hardware KIMD-GHASH is verified to produce correct
 34# results and is therefore engaged. On z196 it was measured to process
 35# an 8KB buffer ~7 times faster than the software implementation. It's not
 36# as impressive for smaller buffer sizes, and for the smallest 16-byte
 37# buffer it's actually almost 2 times slower, which is why
38# KIMD-GHASH is not used in gcm_gmult_4bit.
39
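# The bracketed table sizes quoted above follow from the 4-bit method's data
# layout: the per-key Htable holds the 16 nibble multiples of H at 16 bytes
# each (256 bytes), while the shared table is rem_4bit, 16 eight-byte
# reduction constants (128 bytes). The constants, defined at the bottom of
# this file, are the carry-less products of the index and the GCM constant
# 0x1C2, shifted into position (<<12 here, <<16 in the other ports in this
# diff). A standalone sketch reproducing them; the clmul helper name is
# purely illustrative and not part of the module:

use strict;

sub clmul {                             # carry-less multiplication in GF(2)[x]
        my ($a, $b) = @_;
        my $r = 0;
        for (my $i = 0; $b >> $i; $i++) {
                $r ^= $a << $i if (($b >> $i) & 1);
        }
        return $r;
}

my @rem_4bit = map { clmul($_, 0x1C2) << 4 } (0 .. 15);
printf "0x%04X\n", $_ for @rem_4bit;    # 0x0000, 0x1C20, 0x3840, 0x2460, ...
printf "per-key Htable: %d bytes, shared rem_4bit: %d bytes\n",
        16 * 16, 8 * @rem_4bit;         # 256 and 128
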
40$flavour = shift;
41
42if ($flavour =~ /3[12]/) {
43 $SIZE_T=4;
44 $g="";
45} else {
46 $SIZE_T=8;
47 $g="g";
48}
49
50while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
51open STDOUT,">$output";
52
53$softonly=0;
54
55$Zhi="%r0";
56$Zlo="%r1";
57
58$Xi="%r2"; # argument block
59$Htbl="%r3";
60$inp="%r4";
61$len="%r5";
62
63$rem0="%r6"; # variables
64$rem1="%r7";
65$nlo="%r8";
66$nhi="%r9";
67$xi="%r10";
68$cnt="%r11";
69$tmp="%r12";
70$x78="%r13";
71$rem_4bit="%r14";
72
73$sp="%r15";
74
75$code.=<<___;
76.text
77
78.globl gcm_gmult_4bit
79.align 32
80gcm_gmult_4bit:
81___
82$code.=<<___ if(!$softonly && 0); # hardware is slow for single block...
83 larl %r1,OPENSSL_s390xcap_P
84 lg %r0,0(%r1)
85 tmhl %r0,0x4000 # check for message-security-assist
86 jz .Lsoft_gmult
87 lghi %r0,0
88 la %r1,16($sp)
89 .long 0xb93e0004 # kimd %r0,%r4
90 lg %r1,24($sp)
91 tmhh %r1,0x4000 # check for function 65
92 jz .Lsoft_gmult
93 stg %r0,16($sp) # arrange 16 bytes of zero input
94 stg %r0,24($sp)
95 lghi %r0,65 # function 65
96 la %r1,0($Xi) # H lies right after Xi in gcm128_context
97 la $inp,16($sp)
98 lghi $len,16
99 .long 0xb93e0004 # kimd %r0,$inp
100 brc 1,.-4 # pay attention to "partial completion"
101 br %r14
102.align 32
103.Lsoft_gmult:
104___
105$code.=<<___;
106 stm${g} %r6,%r14,6*$SIZE_T($sp)
107
108 aghi $Xi,-1
109 lghi $len,1
110 lghi $x78,`0xf<<3`
111 larl $rem_4bit,rem_4bit
112
113 lg $Zlo,8+1($Xi) # Xi
114 j .Lgmult_shortcut
115.type gcm_gmult_4bit,\@function
116.size gcm_gmult_4bit,(.-gcm_gmult_4bit)
117
118.globl gcm_ghash_4bit
119.align 32
120gcm_ghash_4bit:
121___
122$code.=<<___ if(!$softonly);
123 larl %r1,OPENSSL_s390xcap_P
124 lg %r0,0(%r1)
125 tmhl %r0,0x4000 # check for message-security-assist
126 jz .Lsoft_ghash
127 lghi %r0,0
128 la %r1,16($sp)
129 .long 0xb93e0004 # kimd %r0,%r4
130 lg %r1,24($sp)
131 tmhh %r1,0x4000 # check for function 65
132 jz .Lsoft_ghash
133 lghi %r0,65 # function 65
134 la %r1,0($Xi) # H lies right after Xi in gcm128_context
135 .long 0xb93e0004 # kimd %r0,$inp
136 brc 1,.-4 # pay attention to "partial completion"
137 br %r14
138.align 32
139.Lsoft_ghash:
140___
141$code.=<<___ if ($flavour =~ /3[12]/);
142 llgfr $len,$len
143___
144$code.=<<___;
145 stm${g} %r6,%r14,6*$SIZE_T($sp)
146
147 aghi $Xi,-1
148 srlg $len,$len,4
149 lghi $x78,`0xf<<3`
150 larl $rem_4bit,rem_4bit
151
152 lg $Zlo,8+1($Xi) # Xi
153 lg $Zhi,0+1($Xi)
154 lghi $tmp,0
155.Louter:
156 xg $Zhi,0($inp) # Xi ^= inp
157 xg $Zlo,8($inp)
158 xgr $Zhi,$tmp
159 stg $Zlo,8+1($Xi)
160 stg $Zhi,0+1($Xi)
161
162.Lgmult_shortcut:
163 lghi $tmp,0xf0
164 sllg $nlo,$Zlo,4
165 srlg $xi,$Zlo,8 # extract second byte
166 ngr $nlo,$tmp
167 lgr $nhi,$Zlo
168 lghi $cnt,14
169 ngr $nhi,$tmp
170
171 lg $Zlo,8($nlo,$Htbl)
172 lg $Zhi,0($nlo,$Htbl)
173
174 sllg $nlo,$xi,4
175 sllg $rem0,$Zlo,3
176 ngr $nlo,$tmp
177 ngr $rem0,$x78
178 ngr $xi,$tmp
179
180 sllg $tmp,$Zhi,60
181 srlg $Zlo,$Zlo,4
182 srlg $Zhi,$Zhi,4
183 xg $Zlo,8($nhi,$Htbl)
184 xg $Zhi,0($nhi,$Htbl)
185 lgr $nhi,$xi
186 sllg $rem1,$Zlo,3
187 xgr $Zlo,$tmp
188 ngr $rem1,$x78
189 j .Lghash_inner
190.align 16
191.Lghash_inner:
192 srlg $Zlo,$Zlo,4
193 sllg $tmp,$Zhi,60
194 xg $Zlo,8($nlo,$Htbl)
195 srlg $Zhi,$Zhi,4
196 llgc $xi,0($cnt,$Xi)
197 xg $Zhi,0($nlo,$Htbl)
198 sllg $nlo,$xi,4
199 xg $Zhi,0($rem0,$rem_4bit)
200 nill $nlo,0xf0
201 sllg $rem0,$Zlo,3
202 xgr $Zlo,$tmp
203 ngr $rem0,$x78
204 nill $xi,0xf0
205
206 sllg $tmp,$Zhi,60
207 srlg $Zlo,$Zlo,4
208 srlg $Zhi,$Zhi,4
209 xg $Zlo,8($nhi,$Htbl)
210 xg $Zhi,0($nhi,$Htbl)
211 lgr $nhi,$xi
212 xg $Zhi,0($rem1,$rem_4bit)
213 sllg $rem1,$Zlo,3
214 xgr $Zlo,$tmp
215 ngr $rem1,$x78
216 brct $cnt,.Lghash_inner
217
218 sllg $tmp,$Zhi,60
219 srlg $Zlo,$Zlo,4
220 srlg $Zhi,$Zhi,4
221 xg $Zlo,8($nlo,$Htbl)
222 xg $Zhi,0($nlo,$Htbl)
223 sllg $xi,$Zlo,3
224 xg $Zhi,0($rem0,$rem_4bit)
225 xgr $Zlo,$tmp
226 ngr $xi,$x78
227
228 sllg $tmp,$Zhi,60
229 srlg $Zlo,$Zlo,4
230 srlg $Zhi,$Zhi,4
231 xg $Zlo,8($nhi,$Htbl)
232 xg $Zhi,0($nhi,$Htbl)
233 xgr $Zlo,$tmp
234 xg $Zhi,0($rem1,$rem_4bit)
235
236 lg $tmp,0($xi,$rem_4bit)
237 la $inp,16($inp)
238 sllg $tmp,$tmp,4 # correct last rem_4bit[rem]
239 brctg $len,.Louter
240
241 xgr $Zhi,$tmp
242 stg $Zlo,8+1($Xi)
243 stg $Zhi,0+1($Xi)
244 lm${g} %r6,%r14,6*$SIZE_T($sp)
245 br %r14
246.type gcm_ghash_4bit,\@function
247.size gcm_ghash_4bit,(.-gcm_ghash_4bit)
248
249.align 64
250rem_4bit:
251 .long `0x0000<<12`,0,`0x1C20<<12`,0,`0x3840<<12`,0,`0x2460<<12`,0
252 .long `0x7080<<12`,0,`0x6CA0<<12`,0,`0x48C0<<12`,0,`0x54E0<<12`,0
253 .long `0xE100<<12`,0,`0xFD20<<12`,0,`0xD940<<12`,0,`0xC560<<12`,0
254 .long `0x9180<<12`,0,`0x8DA0<<12`,0,`0xA9C0<<12`,0,`0xB5E0<<12`,0
255.type rem_4bit,\@object
256.size rem_4bit,(.-rem_4bit)
257.string "GHASH for s390x, CRYPTOGAMS by <appro\@openssl.org>"
258___
259
260$code =~ s/\`([^\`]*)\`/eval $1/gem;
261print $code;
262close STDOUT;
diff --git a/src/lib/libcrypto/modes/asm/ghash-sparcv9.pl b/src/lib/libcrypto/modes/asm/ghash-sparcv9.pl
deleted file mode 100644
index 70e7b044a3..0000000000
--- a/src/lib/libcrypto/modes/asm/ghash-sparcv9.pl
+++ /dev/null
@@ -1,330 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# March 2010
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that it
14# uses 256 bytes per-key table [+128 bytes shared table]. Performance
15# results are for streamed GHASH subroutine on UltraSPARC pre-Tx CPU
16# and are expressed in cycles per processed byte, less is better:
17#
18# gcc 3.3.x cc 5.2 this assembler
19#
20# 32-bit build 81.4 43.3 12.6 (+546%/+244%)
21# 64-bit build 20.2 21.2 12.6 (+60%/+68%)
22#
23# Here is data collected on UltraSPARC T1 system running Linux:
24#
25# gcc 4.4.1 this assembler
26#
27# 32-bit build 566 50 (+1000%)
28# 64-bit build 56 50 (+12%)
29#
 30# I don't quite understand why the difference between 32-bit and 64-bit
31# compiler-generated code is so big. Compilers *were* instructed to
32# generate code for UltraSPARC and should have used 64-bit registers
33# for Z vector (see C code) even in 32-bit build... Oh well, it only
34# means more impressive improvement coefficients for this assembler
35# module;-) Loops are aggressively modulo-scheduled in respect to
36# references to input data and Z.hi updates to achieve 12 cycles
37# timing. To anchor to something else, sha1-sparcv9.pl spends 11.6
38# cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1.
39
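# The improvement coefficients in the tables above are simply the ratio of
# compiler-generated to assembler cycle counts, minus one. A throwaway check,
# not part of the module, with the numbers copied from the tables (hash keys
# are just labels):

use strict;

my %runs = (                            # [compiler c/b, this assembler c/b]
        'gcc 3.3.x, 32-bit'    => [81.4, 12.6],         # quoted as +546%
        'cc 5.2, 32-bit'       => [43.3, 12.6],         # quoted as +244%
        'gcc 3.3.x, 64-bit'    => [20.2, 12.6],         # quoted as +60%
        'gcc 4.4.1, T1 64-bit' => [56,   50],           # quoted as +12%
);
for my $k (sort keys %runs) {
        my ($c, $asm) = @{$runs{$k}};
        printf "%-22s +%.0f%%\n", $k, 100 * ($c / $asm - 1);
}
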
40$bits=32;
41for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
42if ($bits==64) { $bias=2047; $frame=192; }
43else { $bias=0; $frame=112; }
44
45$output=shift;
46open STDOUT,">$output";
47
48$Zhi="%o0"; # 64-bit values
49$Zlo="%o1";
50$Thi="%o2";
51$Tlo="%o3";
52$rem="%o4";
53$tmp="%o5";
54
55$nhi="%l0"; # small values and pointers
56$nlo="%l1";
57$xi0="%l2";
58$xi1="%l3";
59$rem_4bit="%l4";
60$remi="%l5";
61$Htblo="%l6";
62$cnt="%l7";
63
64$Xi="%i0"; # input argument block
65$Htbl="%i1";
66$inp="%i2";
67$len="%i3";
68
69$code.=<<___;
70.section ".text",#alloc,#execinstr
71
72.align 64
73rem_4bit:
74 .long `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
75 .long `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
76 .long `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
77 .long `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
78.type rem_4bit,#object
79.size rem_4bit,(.-rem_4bit)
80
81.globl gcm_ghash_4bit
82.align 32
83gcm_ghash_4bit:
84 save %sp,-$frame,%sp
85 ldub [$inp+15],$nlo
86 ldub [$Xi+15],$xi0
87 ldub [$Xi+14],$xi1
88 add $len,$inp,$len
89 add $Htbl,8,$Htblo
90
911: call .+8
92 add %o7,rem_4bit-1b,$rem_4bit
93
94.Louter:
95 xor $xi0,$nlo,$nlo
96 and $nlo,0xf0,$nhi
97 and $nlo,0x0f,$nlo
98 sll $nlo,4,$nlo
99 ldx [$Htblo+$nlo],$Zlo
100 ldx [$Htbl+$nlo],$Zhi
101
102 ldub [$inp+14],$nlo
103
104 ldx [$Htblo+$nhi],$Tlo
105 and $Zlo,0xf,$remi
106 ldx [$Htbl+$nhi],$Thi
107 sll $remi,3,$remi
108 ldx [$rem_4bit+$remi],$rem
109 srlx $Zlo,4,$Zlo
110 mov 13,$cnt
111 sllx $Zhi,60,$tmp
112 xor $Tlo,$Zlo,$Zlo
113 srlx $Zhi,4,$Zhi
114 xor $Zlo,$tmp,$Zlo
115
116 xor $xi1,$nlo,$nlo
117 and $Zlo,0xf,$remi
118 and $nlo,0xf0,$nhi
119 and $nlo,0x0f,$nlo
120 ba .Lghash_inner
121 sll $nlo,4,$nlo
122.align 32
123.Lghash_inner:
124 ldx [$Htblo+$nlo],$Tlo
125 sll $remi,3,$remi
126 xor $Thi,$Zhi,$Zhi
127 ldx [$Htbl+$nlo],$Thi
128 srlx $Zlo,4,$Zlo
129 xor $rem,$Zhi,$Zhi
130 ldx [$rem_4bit+$remi],$rem
131 sllx $Zhi,60,$tmp
132 xor $Tlo,$Zlo,$Zlo
133 ldub [$inp+$cnt],$nlo
134 srlx $Zhi,4,$Zhi
135 xor $Zlo,$tmp,$Zlo
136 ldub [$Xi+$cnt],$xi1
137 xor $Thi,$Zhi,$Zhi
138 and $Zlo,0xf,$remi
139
140 ldx [$Htblo+$nhi],$Tlo
141 sll $remi,3,$remi
142 xor $rem,$Zhi,$Zhi
143 ldx [$Htbl+$nhi],$Thi
144 srlx $Zlo,4,$Zlo
145 ldx [$rem_4bit+$remi],$rem
146 sllx $Zhi,60,$tmp
147 xor $xi1,$nlo,$nlo
148 srlx $Zhi,4,$Zhi
149 and $nlo,0xf0,$nhi
150 addcc $cnt,-1,$cnt
151 xor $Zlo,$tmp,$Zlo
152 and $nlo,0x0f,$nlo
153 xor $Tlo,$Zlo,$Zlo
154 sll $nlo,4,$nlo
155 blu .Lghash_inner
156 and $Zlo,0xf,$remi
157
158 ldx [$Htblo+$nlo],$Tlo
159 sll $remi,3,$remi
160 xor $Thi,$Zhi,$Zhi
161 ldx [$Htbl+$nlo],$Thi
162 srlx $Zlo,4,$Zlo
163 xor $rem,$Zhi,$Zhi
164 ldx [$rem_4bit+$remi],$rem
165 sllx $Zhi,60,$tmp
166 xor $Tlo,$Zlo,$Zlo
167 srlx $Zhi,4,$Zhi
168 xor $Zlo,$tmp,$Zlo
169 xor $Thi,$Zhi,$Zhi
170
171 add $inp,16,$inp
172 cmp $inp,$len
173 be,pn `$bits==64?"%xcc":"%icc"`,.Ldone
174 and $Zlo,0xf,$remi
175
176 ldx [$Htblo+$nhi],$Tlo
177 sll $remi,3,$remi
178 xor $rem,$Zhi,$Zhi
179 ldx [$Htbl+$nhi],$Thi
180 srlx $Zlo,4,$Zlo
181 ldx [$rem_4bit+$remi],$rem
182 sllx $Zhi,60,$tmp
183 xor $Tlo,$Zlo,$Zlo
184 ldub [$inp+15],$nlo
185 srlx $Zhi,4,$Zhi
186 xor $Zlo,$tmp,$Zlo
187 xor $Thi,$Zhi,$Zhi
188 stx $Zlo,[$Xi+8]
189 xor $rem,$Zhi,$Zhi
190 stx $Zhi,[$Xi]
191 srl $Zlo,8,$xi1
192 and $Zlo,0xff,$xi0
193 ba .Louter
194 and $xi1,0xff,$xi1
195.align 32
196.Ldone:
197 ldx [$Htblo+$nhi],$Tlo
198 sll $remi,3,$remi
199 xor $rem,$Zhi,$Zhi
200 ldx [$Htbl+$nhi],$Thi
201 srlx $Zlo,4,$Zlo
202 ldx [$rem_4bit+$remi],$rem
203 sllx $Zhi,60,$tmp
204 xor $Tlo,$Zlo,$Zlo
205 srlx $Zhi,4,$Zhi
206 xor $Zlo,$tmp,$Zlo
207 xor $Thi,$Zhi,$Zhi
208 stx $Zlo,[$Xi+8]
209 xor $rem,$Zhi,$Zhi
210 stx $Zhi,[$Xi]
211
212 ret
213 restore
214.type gcm_ghash_4bit,#function
215.size gcm_ghash_4bit,(.-gcm_ghash_4bit)
216___
217
218undef $inp;
219undef $len;
220
221$code.=<<___;
222.globl gcm_gmult_4bit
223.align 32
224gcm_gmult_4bit:
225 save %sp,-$frame,%sp
226 ldub [$Xi+15],$nlo
227 add $Htbl,8,$Htblo
228
2291: call .+8
230 add %o7,rem_4bit-1b,$rem_4bit
231
232 and $nlo,0xf0,$nhi
233 and $nlo,0x0f,$nlo
234 sll $nlo,4,$nlo
235 ldx [$Htblo+$nlo],$Zlo
236 ldx [$Htbl+$nlo],$Zhi
237
238 ldub [$Xi+14],$nlo
239
240 ldx [$Htblo+$nhi],$Tlo
241 and $Zlo,0xf,$remi
242 ldx [$Htbl+$nhi],$Thi
243 sll $remi,3,$remi
244 ldx [$rem_4bit+$remi],$rem
245 srlx $Zlo,4,$Zlo
246 mov 13,$cnt
247 sllx $Zhi,60,$tmp
248 xor $Tlo,$Zlo,$Zlo
249 srlx $Zhi,4,$Zhi
250 xor $Zlo,$tmp,$Zlo
251
252 and $Zlo,0xf,$remi
253 and $nlo,0xf0,$nhi
254 and $nlo,0x0f,$nlo
255 ba .Lgmult_inner
256 sll $nlo,4,$nlo
257.align 32
258.Lgmult_inner:
259 ldx [$Htblo+$nlo],$Tlo
260 sll $remi,3,$remi
261 xor $Thi,$Zhi,$Zhi
262 ldx [$Htbl+$nlo],$Thi
263 srlx $Zlo,4,$Zlo
264 xor $rem,$Zhi,$Zhi
265 ldx [$rem_4bit+$remi],$rem
266 sllx $Zhi,60,$tmp
267 xor $Tlo,$Zlo,$Zlo
268 ldub [$Xi+$cnt],$nlo
269 srlx $Zhi,4,$Zhi
270 xor $Zlo,$tmp,$Zlo
271 xor $Thi,$Zhi,$Zhi
272 and $Zlo,0xf,$remi
273
274 ldx [$Htblo+$nhi],$Tlo
275 sll $remi,3,$remi
276 xor $rem,$Zhi,$Zhi
277 ldx [$Htbl+$nhi],$Thi
278 srlx $Zlo,4,$Zlo
279 ldx [$rem_4bit+$remi],$rem
280 sllx $Zhi,60,$tmp
281 srlx $Zhi,4,$Zhi
282 and $nlo,0xf0,$nhi
283 addcc $cnt,-1,$cnt
284 xor $Zlo,$tmp,$Zlo
285 and $nlo,0x0f,$nlo
286 xor $Tlo,$Zlo,$Zlo
287 sll $nlo,4,$nlo
288 blu .Lgmult_inner
289 and $Zlo,0xf,$remi
290
291 ldx [$Htblo+$nlo],$Tlo
292 sll $remi,3,$remi
293 xor $Thi,$Zhi,$Zhi
294 ldx [$Htbl+$nlo],$Thi
295 srlx $Zlo,4,$Zlo
296 xor $rem,$Zhi,$Zhi
297 ldx [$rem_4bit+$remi],$rem
298 sllx $Zhi,60,$tmp
299 xor $Tlo,$Zlo,$Zlo
300 srlx $Zhi,4,$Zhi
301 xor $Zlo,$tmp,$Zlo
302 xor $Thi,$Zhi,$Zhi
303 and $Zlo,0xf,$remi
304
305 ldx [$Htblo+$nhi],$Tlo
306 sll $remi,3,$remi
307 xor $rem,$Zhi,$Zhi
308 ldx [$Htbl+$nhi],$Thi
309 srlx $Zlo,4,$Zlo
310 ldx [$rem_4bit+$remi],$rem
311 sllx $Zhi,60,$tmp
312 xor $Tlo,$Zlo,$Zlo
313 srlx $Zhi,4,$Zhi
314 xor $Zlo,$tmp,$Zlo
315 xor $Thi,$Zhi,$Zhi
316 stx $Zlo,[$Xi+8]
317 xor $rem,$Zhi,$Zhi
318 stx $Zhi,[$Xi]
319
320 ret
321 restore
322.type gcm_gmult_4bit,#function
323.size gcm_gmult_4bit,(.-gcm_gmult_4bit)
324.asciz "GHASH for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
325.align 4
326___
327
328$code =~ s/\`([^\`]*)\`/eval $1/gem;
329print $code;
330close STDOUT;
diff --git a/src/lib/libcrypto/modes/asm/ghash-x86.pl b/src/lib/libcrypto/modes/asm/ghash-x86.pl
deleted file mode 100644
index 83c727e07f..0000000000
--- a/src/lib/libcrypto/modes/asm/ghash-x86.pl
+++ /dev/null
@@ -1,1342 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# March, May, June 2010
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that it
14# uses 256 bytes per-key table [+64/128 bytes fixed table]. It has two
15# code paths: vanilla x86 and vanilla MMX. Former will be executed on
16# 486 and Pentium, latter on all others. MMX GHASH features so called
17# "528B" variant of "4-bit" method utilizing additional 256+16 bytes
18# of per-key storage [+512 bytes shared table]. Performance results
19# are for streamed GHASH subroutine and are expressed in cycles per
20# processed byte, less is better:
21#
22# gcc 2.95.3(*) MMX assembler x86 assembler
23#
24# Pentium 105/111(**) - 50
25# PIII 68 /75 12.2 24
26# P4 125/125 17.8 84(***)
27# Opteron 66 /70 10.1 30
28# Core2 54 /67 8.4 18
29#
 30# (*)	gcc 3.4.x was observed to generate a few percent slower code,
 31#	which is one of the reasons why 2.95.3 results were chosen;
 32#	another reason is the lack of 3.4.x results for older CPUs;
33# comparison with MMX results is not completely fair, because C
34# results are for vanilla "256B" implementation, while
35# assembler results are for "528B";-)
36# (**) second number is result for code compiled with -fPIC flag,
37# which is actually more relevant, because assembler code is
38# position-independent;
39# (***) see comment in non-MMX routine for further details;
40#
41# To summarize, it's >2-5 times faster than gcc-generated code. To
42# anchor it to something else SHA1 assembler processes one byte in
43# 11-13 cycles on contemporary x86 cores. As for choice of MMX in
44# particular, see comment at the end of the file...
45
46# May 2010
47#
48# Add PCLMULQDQ version performing at 2.10 cycles per processed byte.
49# The question is how close is it to theoretical limit? The pclmulqdq
50# instruction latency appears to be 14 cycles and there can't be more
51# than 2 of them executing at any given time. This means that single
52# Karatsuba multiplication would take 28 cycles *plus* few cycles for
53# pre- and post-processing. Then multiplication has to be followed by
54# modulo-reduction. Given that aggregated reduction method [see
55# "Carry-less Multiplication and Its Usage for Computing the GCM Mode"
56# white paper by Intel] allows you to perform reduction only once in
 57# a while, we can assume that asymptotic performance can be estimated
58# as (28+Tmod/Naggr)/16, where Tmod is time to perform reduction
59# and Naggr is the aggregation factor.
60#
61# Before we proceed to this implementation let's have closer look at
62# the best-performing code suggested by Intel in their white paper.
63# By tracing inter-register dependencies Tmod is estimated as ~19
64# cycles and Naggr chosen by Intel is 4, resulting in 2.05 cycles per
65# processed byte. As implied, this is quite optimistic estimate,
66# because it does not account for Karatsuba pre- and post-processing,
67# which for a single multiplication is ~5 cycles. Unfortunately Intel
68# does not provide performance data for GHASH alone. But benchmarking
69# AES_GCM_encrypt ripped out of Fig. 15 of the white paper with aadt
 70# alone resulted in 2.46 cycles per byte out of a 16KB buffer. Note that
71# the result accounts even for pre-computing of degrees of the hash
72# key H, but its portion is negligible at 16KB buffer size.
73#
74# Moving on to the implementation in question. Tmod is estimated as
75# ~13 cycles and Naggr is 2, giving asymptotic performance of ...
76# 2.16. How is it possible that measured performance is better than
77# optimistic theoretical estimate? There is one thing Intel failed
78# to recognize. By serializing GHASH with CTR in same subroutine
79# former's performance is really limited to above (Tmul + Tmod/Naggr)
80# equation. But if GHASH procedure is detached, the modulo-reduction
81# can be interleaved with Naggr-1 multiplications at instruction level
82# and under ideal conditions even disappear from the equation. So that
83# optimistic theoretical estimate for this implementation is ...
84# 28/16=1.75, and not 2.16. Well, it's probably way too optimistic,
 85# at least for such small Naggr. I'd argue that (28+Tproc/Naggr)/16,
86# where Tproc is time required for Karatsuba pre- and post-processing,
87# is more realistic estimate. In this case it gives ... 1.91 cycles.
88# Or in other words, depending on how well we can interleave reduction
 89# and one of the two multiplications, the performance should be between
90# 1.91 and 2.16. As already mentioned, this implementation processes
91# one byte out of 8KB buffer in 2.10 cycles, while x86_64 counterpart
92# - in 2.02. x86_64 performance is better, because larger register
93# bank allows to interleave reduction and multiplication better.
94#
95# Does it make sense to increase Naggr? To start with it's virtually
96# impossible in 32-bit mode, because of limited register bank
 97# capacity. Otherwise improvement has to be weighed against slower
98# setup, as well as code size and complexity increase. As even
99# optimistic estimate doesn't promise 30% performance improvement,
100# there are currently no plans to increase Naggr.
101#
102# Special thanks to David Woodhouse <dwmw2@infradead.org> for
103# providing access to a Westmere-based system on behalf of Intel
104# Open Source Technology Centre.
105
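# The asymptotic estimates above can be reproduced directly from the quoted
# latencies: cycles per byte = (Tmul + Tx/Naggr)/16, with Tmul=28 for one
# Karatsuba multiplication (three pclmulqdq at 14-cycle latency, at most two
# in flight). A small standalone sketch, not part of the module:

use strict;

sub cpb { my ($tmul, $tx, $naggr) = @_; return ($tmul + $tx / $naggr) / 16; }

printf "Intel: Tmod=19, Naggr=4 -> %.2f c/b\n", cpb(28, 19, 4);   # ~2.05
printf "here:  Tmod=13, Naggr=2 -> %.2f c/b\n", cpb(28, 13, 2);   # ~2.16
printf "here:  Tproc=5, Naggr=2 -> %.2f c/b\n", cpb(28,  5, 2);   # ~1.91
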
106# January 2010
107#
108# Tweaked to optimize transitions between integer and FP operations
109# on same XMM register, PCLMULQDQ subroutine was measured to process
110# one byte in 2.07 cycles on Sandy Bridge, and in 2.12 - on Westmere.
111# The minor regression on Westmere is outweighed by ~15% improvement
112# on Sandy Bridge. Strangely enough attempt to modify 64-bit code in
113# similar manner resulted in almost 20% degradation on Sandy Bridge,
114# where original 64-bit code processes one byte in 1.95 cycles.
115
116$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
117push(@INC,"${dir}","${dir}../../perlasm");
118require "x86asm.pl";
119
120&asm_init($ARGV[0],"ghash-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");
121
122$sse2=0;
123for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
124
125($Zhh,$Zhl,$Zlh,$Zll) = ("ebp","edx","ecx","ebx");
126$inp = "edi";
127$Htbl = "esi";
128
129$unroll = 0; # Affects x86 loop. Folded loop performs ~7% worse
 130				# than unrolled, which has to be weighed against
131 # 2.5x x86-specific code size reduction.
132
133sub x86_loop {
134 my $off = shift;
135 my $rem = "eax";
136
137 &mov ($Zhh,&DWP(4,$Htbl,$Zll));
138 &mov ($Zhl,&DWP(0,$Htbl,$Zll));
139 &mov ($Zlh,&DWP(12,$Htbl,$Zll));
140 &mov ($Zll,&DWP(8,$Htbl,$Zll));
141 &xor ($rem,$rem); # avoid partial register stalls on PIII
142
143 # shrd practically kills P4, 2.5x deterioration, but P4 has
144 # MMX code-path to execute. shrd runs tad faster [than twice
145 # the shifts, move's and or's] on pre-MMX Pentium (as well as
146 # PIII and Core2), *but* minimizes code size, spares register
147 # and thus allows to fold the loop...
148 if (!$unroll) {
149 my $cnt = $inp;
150 &mov ($cnt,15);
151 &jmp (&label("x86_loop"));
152 &set_label("x86_loop",16);
153 for($i=1;$i<=2;$i++) {
154 &mov (&LB($rem),&LB($Zll));
155 &shrd ($Zll,$Zlh,4);
156 &and (&LB($rem),0xf);
157 &shrd ($Zlh,$Zhl,4);
158 &shrd ($Zhl,$Zhh,4);
159 &shr ($Zhh,4);
160 &xor ($Zhh,&DWP($off+16,"esp",$rem,4));
161
162 &mov (&LB($rem),&BP($off,"esp",$cnt));
163 if ($i&1) {
164 &and (&LB($rem),0xf0);
165 } else {
166 &shl (&LB($rem),4);
167 }
168
169 &xor ($Zll,&DWP(8,$Htbl,$rem));
170 &xor ($Zlh,&DWP(12,$Htbl,$rem));
171 &xor ($Zhl,&DWP(0,$Htbl,$rem));
172 &xor ($Zhh,&DWP(4,$Htbl,$rem));
173
174 if ($i&1) {
175 &dec ($cnt);
176 &js (&label("x86_break"));
177 } else {
178 &jmp (&label("x86_loop"));
179 }
180 }
181 &set_label("x86_break",16);
182 } else {
183 for($i=1;$i<32;$i++) {
184 &comment($i);
185 &mov (&LB($rem),&LB($Zll));
186 &shrd ($Zll,$Zlh,4);
187 &and (&LB($rem),0xf);
188 &shrd ($Zlh,$Zhl,4);
189 &shrd ($Zhl,$Zhh,4);
190 &shr ($Zhh,4);
191 &xor ($Zhh,&DWP($off+16,"esp",$rem,4));
192
193 if ($i&1) {
194 &mov (&LB($rem),&BP($off+15-($i>>1),"esp"));
195 &and (&LB($rem),0xf0);
196 } else {
197 &mov (&LB($rem),&BP($off+15-($i>>1),"esp"));
198 &shl (&LB($rem),4);
199 }
200
201 &xor ($Zll,&DWP(8,$Htbl,$rem));
202 &xor ($Zlh,&DWP(12,$Htbl,$rem));
203 &xor ($Zhl,&DWP(0,$Htbl,$rem));
204 &xor ($Zhh,&DWP(4,$Htbl,$rem));
205 }
206 }
207 &bswap ($Zll);
208 &bswap ($Zlh);
209 &bswap ($Zhl);
210 if (!$x86only) {
211 &bswap ($Zhh);
212 } else {
213 &mov ("eax",$Zhh);
214 &bswap ("eax");
215 &mov ($Zhh,"eax");
216 }
217}
218
219if ($unroll) {
220 &function_begin_B("_x86_gmult_4bit_inner");
221 &x86_loop(4);
222 &ret ();
223 &function_end_B("_x86_gmult_4bit_inner");
224}
225
226sub deposit_rem_4bit {
227 my $bias = shift;
228
229 &mov (&DWP($bias+0, "esp"),0x0000<<16);
230 &mov (&DWP($bias+4, "esp"),0x1C20<<16);
231 &mov (&DWP($bias+8, "esp"),0x3840<<16);
232 &mov (&DWP($bias+12,"esp"),0x2460<<16);
233 &mov (&DWP($bias+16,"esp"),0x7080<<16);
234 &mov (&DWP($bias+20,"esp"),0x6CA0<<16);
235 &mov (&DWP($bias+24,"esp"),0x48C0<<16);
236 &mov (&DWP($bias+28,"esp"),0x54E0<<16);
237 &mov (&DWP($bias+32,"esp"),0xE100<<16);
238 &mov (&DWP($bias+36,"esp"),0xFD20<<16);
239 &mov (&DWP($bias+40,"esp"),0xD940<<16);
240 &mov (&DWP($bias+44,"esp"),0xC560<<16);
241 &mov (&DWP($bias+48,"esp"),0x9180<<16);
242 &mov (&DWP($bias+52,"esp"),0x8DA0<<16);
243 &mov (&DWP($bias+56,"esp"),0xA9C0<<16);
244 &mov (&DWP($bias+60,"esp"),0xB5E0<<16);
245}
246
247$suffix = $x86only ? "" : "_x86";
248
249&function_begin("gcm_gmult_4bit".$suffix);
250 &stack_push(16+4+1); # +1 for stack alignment
251 &mov ($inp,&wparam(0)); # load Xi
252 &mov ($Htbl,&wparam(1)); # load Htable
253
254 &mov ($Zhh,&DWP(0,$inp)); # load Xi[16]
255 &mov ($Zhl,&DWP(4,$inp));
256 &mov ($Zlh,&DWP(8,$inp));
257 &mov ($Zll,&DWP(12,$inp));
258
259 &deposit_rem_4bit(16);
260
261 &mov (&DWP(0,"esp"),$Zhh); # copy Xi[16] on stack
262 &mov (&DWP(4,"esp"),$Zhl);
263 &mov (&DWP(8,"esp"),$Zlh);
264 &mov (&DWP(12,"esp"),$Zll);
265 &shr ($Zll,20);
266 &and ($Zll,0xf0);
267
268 if ($unroll) {
269 &call ("_x86_gmult_4bit_inner");
270 } else {
271 &x86_loop(0);
272 &mov ($inp,&wparam(0));
273 }
274
275 &mov (&DWP(12,$inp),$Zll);
276 &mov (&DWP(8,$inp),$Zlh);
277 &mov (&DWP(4,$inp),$Zhl);
278 &mov (&DWP(0,$inp),$Zhh);
279 &stack_pop(16+4+1);
280&function_end("gcm_gmult_4bit".$suffix);
281
282&function_begin("gcm_ghash_4bit".$suffix);
283 &stack_push(16+4+1); # +1 for 64-bit alignment
284 &mov ($Zll,&wparam(0)); # load Xi
285 &mov ($Htbl,&wparam(1)); # load Htable
286 &mov ($inp,&wparam(2)); # load in
287 &mov ("ecx",&wparam(3)); # load len
288 &add ("ecx",$inp);
289 &mov (&wparam(3),"ecx");
290
291 &mov ($Zhh,&DWP(0,$Zll)); # load Xi[16]
292 &mov ($Zhl,&DWP(4,$Zll));
293 &mov ($Zlh,&DWP(8,$Zll));
294 &mov ($Zll,&DWP(12,$Zll));
295
296 &deposit_rem_4bit(16);
297
298 &set_label("x86_outer_loop",16);
299 &xor ($Zll,&DWP(12,$inp)); # xor with input
300 &xor ($Zlh,&DWP(8,$inp));
301 &xor ($Zhl,&DWP(4,$inp));
302 &xor ($Zhh,&DWP(0,$inp));
303 &mov (&DWP(12,"esp"),$Zll); # dump it on stack
304 &mov (&DWP(8,"esp"),$Zlh);
305 &mov (&DWP(4,"esp"),$Zhl);
306 &mov (&DWP(0,"esp"),$Zhh);
307
308 &shr ($Zll,20);
309 &and ($Zll,0xf0);
310
311 if ($unroll) {
312 &call ("_x86_gmult_4bit_inner");
313 } else {
314 &x86_loop(0);
315 &mov ($inp,&wparam(2));
316 }
317 &lea ($inp,&DWP(16,$inp));
318 &cmp ($inp,&wparam(3));
319 &mov (&wparam(2),$inp) if (!$unroll);
320 &jb (&label("x86_outer_loop"));
321
322 &mov ($inp,&wparam(0)); # load Xi
323 &mov (&DWP(12,$inp),$Zll);
324 &mov (&DWP(8,$inp),$Zlh);
325 &mov (&DWP(4,$inp),$Zhl);
326 &mov (&DWP(0,$inp),$Zhh);
327 &stack_pop(16+4+1);
328&function_end("gcm_ghash_4bit".$suffix);
329
330if (!$x86only) {{{
331
332&static_label("rem_4bit");
333
334if (!$sse2) {{ # pure-MMX "May" version...
335
336$S=12; # shift factor for rem_4bit
337
338&function_begin_B("_mmx_gmult_4bit_inner");
339# MMX version performs 3.5 times better on P4 (see comment in non-MMX
340# routine for further details), 100% better on Opteron, ~70% better
341# on Core2 and PIII... In other words effort is considered to be well
342# spent... Since initial release the loop was unrolled in order to
343# "liberate" register previously used as loop counter. Instead it's
344# used to optimize critical path in 'Z.hi ^= rem_4bit[Z.lo&0xf]'.
345# The path involves move of Z.lo from MMX to integer register,
346# effective address calculation and finally merge of value to Z.hi.
347# Reference to rem_4bit is scheduled so late that I had to >>4
 348# rem_4bit elements. This resulted in 20-45% improvement
349# on contemporary µ-archs.
350{
351 my $cnt;
352 my $rem_4bit = "eax";
353 my @rem = ($Zhh,$Zll);
354 my $nhi = $Zhl;
355 my $nlo = $Zlh;
356
357 my ($Zlo,$Zhi) = ("mm0","mm1");
358 my $tmp = "mm2";
359
360 &xor ($nlo,$nlo); # avoid partial register stalls on PIII
361 &mov ($nhi,$Zll);
362 &mov (&LB($nlo),&LB($nhi));
363 &shl (&LB($nlo),4);
364 &and ($nhi,0xf0);
365 &movq ($Zlo,&QWP(8,$Htbl,$nlo));
366 &movq ($Zhi,&QWP(0,$Htbl,$nlo));
367 &movd ($rem[0],$Zlo);
368
369 for ($cnt=28;$cnt>=-2;$cnt--) {
370 my $odd = $cnt&1;
371 my $nix = $odd ? $nlo : $nhi;
372
373 &shl (&LB($nlo),4) if ($odd);
374 &psrlq ($Zlo,4);
375 &movq ($tmp,$Zhi);
376 &psrlq ($Zhi,4);
377 &pxor ($Zlo,&QWP(8,$Htbl,$nix));
378 &mov (&LB($nlo),&BP($cnt/2,$inp)) if (!$odd && $cnt>=0);
379 &psllq ($tmp,60);
380 &and ($nhi,0xf0) if ($odd);
381 &pxor ($Zhi,&QWP(0,$rem_4bit,$rem[1],8)) if ($cnt<28);
382 &and ($rem[0],0xf);
383 &pxor ($Zhi,&QWP(0,$Htbl,$nix));
384 &mov ($nhi,$nlo) if (!$odd && $cnt>=0);
385 &movd ($rem[1],$Zlo);
386 &pxor ($Zlo,$tmp);
387
388 push (@rem,shift(@rem)); # "rotate" registers
389 }
390
391 &mov ($inp,&DWP(4,$rem_4bit,$rem[1],8)); # last rem_4bit[rem]
392
393 &psrlq ($Zlo,32); # lower part of Zlo is already there
394 &movd ($Zhl,$Zhi);
395 &psrlq ($Zhi,32);
396 &movd ($Zlh,$Zlo);
397 &movd ($Zhh,$Zhi);
398 &shl ($inp,4); # compensate for rem_4bit[i] being >>4
399
400 &bswap ($Zll);
401 &bswap ($Zhl);
402 &bswap ($Zlh);
403 &xor ($Zhh,$inp);
404 &bswap ($Zhh);
405
406 &ret ();
407}
408&function_end_B("_mmx_gmult_4bit_inner");
409
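# As the comment above notes, with $S=12 the rem_4bit constants are deposited
# pre-shifted right by 4 so the table reference can be issued later on the
# critical path; the late "shl ($inp,4)" then restores the <<16 position used
# by the other ports. A quick standalone check that the round trip loses no
# bits for these 16-bit constants (not part of the module):

use strict;

for my $c (0x0000, 0x1C20, 0x3840, 0xB5E0) {    # sample rem_4bit constants
        die "bits lost" if ((($c << 12) << 4) != ($c << 16));
}
print "storing rem_4bit[] >>4 and compensating with shl 4 is lossless\n";
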
410&function_begin("gcm_gmult_4bit_mmx");
411 &mov ($inp,&wparam(0)); # load Xi
412 &mov ($Htbl,&wparam(1)); # load Htable
413
414 &call (&label("pic_point"));
415 &set_label("pic_point");
416 &blindpop("eax");
417 &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
418
419 &movz ($Zll,&BP(15,$inp));
420
421 &call ("_mmx_gmult_4bit_inner");
422
423 &mov ($inp,&wparam(0)); # load Xi
424 &emms ();
425 &mov (&DWP(12,$inp),$Zll);
426 &mov (&DWP(4,$inp),$Zhl);
427 &mov (&DWP(8,$inp),$Zlh);
428 &mov (&DWP(0,$inp),$Zhh);
429&function_end("gcm_gmult_4bit_mmx");
430
431# Streamed version performs 20% better on P4, 7% on Opteron,
432# 10% on Core2 and PIII...
433&function_begin("gcm_ghash_4bit_mmx");
434 &mov ($Zhh,&wparam(0)); # load Xi
435 &mov ($Htbl,&wparam(1)); # load Htable
436 &mov ($inp,&wparam(2)); # load in
437 &mov ($Zlh,&wparam(3)); # load len
438
439 &call (&label("pic_point"));
440 &set_label("pic_point");
441 &blindpop("eax");
442 &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
443
444 &add ($Zlh,$inp);
445 &mov (&wparam(3),$Zlh); # len to point at the end of input
446 &stack_push(4+1); # +1 for stack alignment
447
448 &mov ($Zll,&DWP(12,$Zhh)); # load Xi[16]
449 &mov ($Zhl,&DWP(4,$Zhh));
450 &mov ($Zlh,&DWP(8,$Zhh));
451 &mov ($Zhh,&DWP(0,$Zhh));
452 &jmp (&label("mmx_outer_loop"));
453
454 &set_label("mmx_outer_loop",16);
455 &xor ($Zll,&DWP(12,$inp));
456 &xor ($Zhl,&DWP(4,$inp));
457 &xor ($Zlh,&DWP(8,$inp));
458 &xor ($Zhh,&DWP(0,$inp));
459 &mov (&wparam(2),$inp);
460 &mov (&DWP(12,"esp"),$Zll);
461 &mov (&DWP(4,"esp"),$Zhl);
462 &mov (&DWP(8,"esp"),$Zlh);
463 &mov (&DWP(0,"esp"),$Zhh);
464
465 &mov ($inp,"esp");
466 &shr ($Zll,24);
467
468 &call ("_mmx_gmult_4bit_inner");
469
470 &mov ($inp,&wparam(2));
471 &lea ($inp,&DWP(16,$inp));
472 &cmp ($inp,&wparam(3));
473 &jb (&label("mmx_outer_loop"));
474
475 &mov ($inp,&wparam(0)); # load Xi
476 &emms ();
477 &mov (&DWP(12,$inp),$Zll);
478 &mov (&DWP(4,$inp),$Zhl);
479 &mov (&DWP(8,$inp),$Zlh);
480 &mov (&DWP(0,$inp),$Zhh);
481
482 &stack_pop(4+1);
483&function_end("gcm_ghash_4bit_mmx");
484
485}} else {{ # "June" MMX version...
486 # ... has slower "April" gcm_gmult_4bit_mmx with folded
487 # loop. This is done to conserve code size...
488$S=16; # shift factor for rem_4bit
489
490sub mmx_loop() {
491# MMX version performs 2.8 times better on P4 (see comment in non-MMX
492# routine for further details), 40% better on Opteron and Core2, 50%
493# better on PIII... In other words effort is considered to be well
494# spent...
495 my $inp = shift;
496 my $rem_4bit = shift;
497 my $cnt = $Zhh;
498 my $nhi = $Zhl;
499 my $nlo = $Zlh;
500 my $rem = $Zll;
501
502 my ($Zlo,$Zhi) = ("mm0","mm1");
503 my $tmp = "mm2";
504
505 &xor ($nlo,$nlo); # avoid partial register stalls on PIII
506 &mov ($nhi,$Zll);
507 &mov (&LB($nlo),&LB($nhi));
508 &mov ($cnt,14);
509 &shl (&LB($nlo),4);
510 &and ($nhi,0xf0);
511 &movq ($Zlo,&QWP(8,$Htbl,$nlo));
512 &movq ($Zhi,&QWP(0,$Htbl,$nlo));
513 &movd ($rem,$Zlo);
514 &jmp (&label("mmx_loop"));
515
516 &set_label("mmx_loop",16);
517 &psrlq ($Zlo,4);
518 &and ($rem,0xf);
519 &movq ($tmp,$Zhi);
520 &psrlq ($Zhi,4);
521 &pxor ($Zlo,&QWP(8,$Htbl,$nhi));
522 &mov (&LB($nlo),&BP(0,$inp,$cnt));
523 &psllq ($tmp,60);
524 &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
525 &dec ($cnt);
526 &movd ($rem,$Zlo);
527 &pxor ($Zhi,&QWP(0,$Htbl,$nhi));
528 &mov ($nhi,$nlo);
529 &pxor ($Zlo,$tmp);
530 &js (&label("mmx_break"));
531
532 &shl (&LB($nlo),4);
533 &and ($rem,0xf);
534 &psrlq ($Zlo,4);
535 &and ($nhi,0xf0);
536 &movq ($tmp,$Zhi);
537 &psrlq ($Zhi,4);
538 &pxor ($Zlo,&QWP(8,$Htbl,$nlo));
539 &psllq ($tmp,60);
540 &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
541 &movd ($rem,$Zlo);
542 &pxor ($Zhi,&QWP(0,$Htbl,$nlo));
543 &pxor ($Zlo,$tmp);
544 &jmp (&label("mmx_loop"));
545
546 &set_label("mmx_break",16);
547 &shl (&LB($nlo),4);
548 &and ($rem,0xf);
549 &psrlq ($Zlo,4);
550 &and ($nhi,0xf0);
551 &movq ($tmp,$Zhi);
552 &psrlq ($Zhi,4);
553 &pxor ($Zlo,&QWP(8,$Htbl,$nlo));
554 &psllq ($tmp,60);
555 &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
556 &movd ($rem,$Zlo);
557 &pxor ($Zhi,&QWP(0,$Htbl,$nlo));
558 &pxor ($Zlo,$tmp);
559
560 &psrlq ($Zlo,4);
561 &and ($rem,0xf);
562 &movq ($tmp,$Zhi);
563 &psrlq ($Zhi,4);
564 &pxor ($Zlo,&QWP(8,$Htbl,$nhi));
565 &psllq ($tmp,60);
566 &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
567 &movd ($rem,$Zlo);
568 &pxor ($Zhi,&QWP(0,$Htbl,$nhi));
569 &pxor ($Zlo,$tmp);
570
571 &psrlq ($Zlo,32); # lower part of Zlo is already there
572 &movd ($Zhl,$Zhi);
573 &psrlq ($Zhi,32);
574 &movd ($Zlh,$Zlo);
575 &movd ($Zhh,$Zhi);
576
577 &bswap ($Zll);
578 &bswap ($Zhl);
579 &bswap ($Zlh);
580 &bswap ($Zhh);
581}
582
583&function_begin("gcm_gmult_4bit_mmx");
584 &mov ($inp,&wparam(0)); # load Xi
585 &mov ($Htbl,&wparam(1)); # load Htable
586
587 &call (&label("pic_point"));
588 &set_label("pic_point");
589 &blindpop("eax");
590 &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
591
592 &movz ($Zll,&BP(15,$inp));
593
594 &mmx_loop($inp,"eax");
595
596 &emms ();
597 &mov (&DWP(12,$inp),$Zll);
598 &mov (&DWP(4,$inp),$Zhl);
599 &mov (&DWP(8,$inp),$Zlh);
600 &mov (&DWP(0,$inp),$Zhh);
601&function_end("gcm_gmult_4bit_mmx");
602
603######################################################################
604# Below subroutine is "528B" variant of "4-bit" GCM GHASH function
605# (see gcm128.c for details). It provides further 20-40% performance
606# improvement over above mentioned "May" version.
607
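# The rem_8bit table referenced below is the byte-wide analogue of rem_4bit:
# entry i is the carry-less product of i and 0x1C2 (compare the data_short
# values at the end of this file), and the "528B" name presumably counts the
# 256-byte base Htable plus the additional 256+16 bytes mentioned in the
# header. A standalone sketch, with an illustrative clmul helper that is not
# part of the module:

use strict;

sub clmul {
        my ($a, $b) = @_;
        my $r = 0;
        for (my $i = 0; $b >> $i; $i++) {
                $r ^= $a << $i if (($b >> $i) & 1);
        }
        return $r;
}

my @rem_8bit = map { clmul($_, 0x1C2) } (0 .. 255);
printf "rem_8bit[0x%02X] = 0x%04X\n", $_, $rem_8bit[$_] for (1, 2, 255);
                                        # 0x01C2, 0x0384, 0xBEBE
printf "shared table: %d bytes, per-key storage: %d bytes\n",
        2 * @rem_8bit, 256 + 256 + 16;  # 512 and 528
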
608&static_label("rem_8bit");
609
610&function_begin("gcm_ghash_4bit_mmx");
611{ my ($Zlo,$Zhi) = ("mm7","mm6");
612 my $rem_8bit = "esi";
613 my $Htbl = "ebx";
614
615 # parameter block
616 &mov ("eax",&wparam(0)); # Xi
617 &mov ("ebx",&wparam(1)); # Htable
618 &mov ("ecx",&wparam(2)); # inp
619 &mov ("edx",&wparam(3)); # len
620 &mov ("ebp","esp"); # original %esp
621 &call (&label("pic_point"));
622 &set_label ("pic_point");
623 &blindpop ($rem_8bit);
624 &lea ($rem_8bit,&DWP(&label("rem_8bit")."-".&label("pic_point"),$rem_8bit));
625
626 &sub ("esp",512+16+16); # allocate stack frame...
627 &and ("esp",-64); # ...and align it
628 &sub ("esp",16); # place for (u8)(H[]<<4)
629
630 &add ("edx","ecx"); # pointer to the end of input
631 &mov (&DWP(528+16+0,"esp"),"eax"); # save Xi
632 &mov (&DWP(528+16+8,"esp"),"edx"); # save inp+len
633 &mov (&DWP(528+16+12,"esp"),"ebp"); # save original %esp
634
635 { my @lo = ("mm0","mm1","mm2");
636 my @hi = ("mm3","mm4","mm5");
637 my @tmp = ("mm6","mm7");
638 my ($off1,$off2,$i) = (0,0,);
639
640 &add ($Htbl,128); # optimize for size
641 &lea ("edi",&DWP(16+128,"esp"));
642 &lea ("ebp",&DWP(16+256+128,"esp"));
643
644 # decompose Htable (low and high parts are kept separately),
645 # generate Htable[]>>4, (u8)(Htable[]<<4), save to stack...
646 for ($i=0;$i<18;$i++) {
647
648 &mov ("edx",&DWP(16*$i+8-128,$Htbl)) if ($i<16);
649 &movq ($lo[0],&QWP(16*$i+8-128,$Htbl)) if ($i<16);
650 &psllq ($tmp[1],60) if ($i>1);
651 &movq ($hi[0],&QWP(16*$i+0-128,$Htbl)) if ($i<16);
652 &por ($lo[2],$tmp[1]) if ($i>1);
653 &movq (&QWP($off1-128,"edi"),$lo[1]) if ($i>0 && $i<17);
654 &psrlq ($lo[1],4) if ($i>0 && $i<17);
655 &movq (&QWP($off1,"edi"),$hi[1]) if ($i>0 && $i<17);
656 &movq ($tmp[0],$hi[1]) if ($i>0 && $i<17);
657 &movq (&QWP($off2-128,"ebp"),$lo[2]) if ($i>1);
658 &psrlq ($hi[1],4) if ($i>0 && $i<17);
659 &movq (&QWP($off2,"ebp"),$hi[2]) if ($i>1);
660 &shl ("edx",4) if ($i<16);
661 &mov (&BP($i,"esp"),&LB("edx")) if ($i<16);
662
663 unshift (@lo,pop(@lo)); # "rotate" registers
664 unshift (@hi,pop(@hi));
665 unshift (@tmp,pop(@tmp));
666 $off1 += 8 if ($i>0);
667 $off2 += 8 if ($i>1);
668 }
669 }
670
671 &movq ($Zhi,&QWP(0,"eax"));
672 &mov ("ebx",&DWP(8,"eax"));
673 &mov ("edx",&DWP(12,"eax")); # load Xi
674
675&set_label("outer",16);
676 { my $nlo = "eax";
677 my $dat = "edx";
678 my @nhi = ("edi","ebp");
679 my @rem = ("ebx","ecx");
680 my @red = ("mm0","mm1","mm2");
681 my $tmp = "mm3";
682
683 &xor ($dat,&DWP(12,"ecx")); # merge input data
684 &xor ("ebx",&DWP(8,"ecx"));
685 &pxor ($Zhi,&QWP(0,"ecx"));
686 &lea ("ecx",&DWP(16,"ecx")); # inp+=16
687 #&mov (&DWP(528+12,"esp"),$dat); # save inp^Xi
688 &mov (&DWP(528+8,"esp"),"ebx");
689 &movq (&QWP(528+0,"esp"),$Zhi);
690 &mov (&DWP(528+16+4,"esp"),"ecx"); # save inp
691
692 &xor ($nlo,$nlo);
693 &rol ($dat,8);
694 &mov (&LB($nlo),&LB($dat));
695 &mov ($nhi[1],$nlo);
696 &and (&LB($nlo),0x0f);
697 &shr ($nhi[1],4);
698 &pxor ($red[0],$red[0]);
699 &rol ($dat,8); # next byte
700 &pxor ($red[1],$red[1]);
701 &pxor ($red[2],$red[2]);
702
 703	# Just like in "May" version, modulo-schedule for critical path in
704 # 'Z.hi ^= rem_8bit[Z.lo&0xff^((u8)H[nhi]<<4)]<<48'. Final 'pxor'
705 # is scheduled so late that rem_8bit[] has to be shifted *right*
706 # by 16, which is why last argument to pinsrw is 2, which
707 # corresponds to <<32=<<48>>16...
708 for ($j=11,$i=0;$i<15;$i++) {
709
710 if ($i>0) {
711 &pxor ($Zlo,&QWP(16,"esp",$nlo,8)); # Z^=H[nlo]
712 &rol ($dat,8); # next byte
713 &pxor ($Zhi,&QWP(16+128,"esp",$nlo,8));
714
715 &pxor ($Zlo,$tmp);
716 &pxor ($Zhi,&QWP(16+256+128,"esp",$nhi[0],8));
717 &xor (&LB($rem[1]),&BP(0,"esp",$nhi[0])); # rem^(H[nhi]<<4)
718 } else {
719 &movq ($Zlo,&QWP(16,"esp",$nlo,8));
720 &movq ($Zhi,&QWP(16+128,"esp",$nlo,8));
721 }
722
723 &mov (&LB($nlo),&LB($dat));
724 &mov ($dat,&DWP(528+$j,"esp")) if (--$j%4==0);
725
726 &movd ($rem[0],$Zlo);
727 &movz ($rem[1],&LB($rem[1])) if ($i>0);
728 &psrlq ($Zlo,8); # Z>>=8
729
730 &movq ($tmp,$Zhi);
731 &mov ($nhi[0],$nlo);
732 &psrlq ($Zhi,8);
733
734 &pxor ($Zlo,&QWP(16+256+0,"esp",$nhi[1],8)); # Z^=H[nhi]>>4
735 &and (&LB($nlo),0x0f);
736 &psllq ($tmp,56);
737
738 &pxor ($Zhi,$red[1]) if ($i>1);
739 &shr ($nhi[0],4);
740 &pinsrw ($red[0],&WP(0,$rem_8bit,$rem[1],2),2) if ($i>0);
741
742 unshift (@red,pop(@red)); # "rotate" registers
743 unshift (@rem,pop(@rem));
744 unshift (@nhi,pop(@nhi));
745 }
746
747 &pxor ($Zlo,&QWP(16,"esp",$nlo,8)); # Z^=H[nlo]
748 &pxor ($Zhi,&QWP(16+128,"esp",$nlo,8));
749 &xor (&LB($rem[1]),&BP(0,"esp",$nhi[0])); # rem^(H[nhi]<<4)
750
751 &pxor ($Zlo,$tmp);
752 &pxor ($Zhi,&QWP(16+256+128,"esp",$nhi[0],8));
753 &movz ($rem[1],&LB($rem[1]));
754
755 &pxor ($red[2],$red[2]); # clear 2nd word
756 &psllq ($red[1],4);
757
758 &movd ($rem[0],$Zlo);
759 &psrlq ($Zlo,4); # Z>>=4
760
761 &movq ($tmp,$Zhi);
762 &psrlq ($Zhi,4);
763 &shl ($rem[0],4); # rem<<4
764
765 &pxor ($Zlo,&QWP(16,"esp",$nhi[1],8)); # Z^=H[nhi]
766 &psllq ($tmp,60);
767 &movz ($rem[0],&LB($rem[0]));
768
769 &pxor ($Zlo,$tmp);
770 &pxor ($Zhi,&QWP(16+128,"esp",$nhi[1],8));
771
772 &pinsrw ($red[0],&WP(0,$rem_8bit,$rem[1],2),2);
773 &pxor ($Zhi,$red[1]);
774
775 &movd ($dat,$Zlo);
776 &pinsrw ($red[2],&WP(0,$rem_8bit,$rem[0],2),3); # last is <<48
777
778 &psllq ($red[0],12); # correct by <<16>>4
779 &pxor ($Zhi,$red[0]);
780 &psrlq ($Zlo,32);
781 &pxor ($Zhi,$red[2]);
782
783 &mov ("ecx",&DWP(528+16+4,"esp")); # restore inp
784 &movd ("ebx",$Zlo);
785 &movq ($tmp,$Zhi); # 01234567
786 &psllw ($Zhi,8); # 1.3.5.7.
787 &psrlw ($tmp,8); # .0.2.4.6
788 &por ($Zhi,$tmp); # 10325476
789 &bswap ($dat);
790 &pshufw ($Zhi,$Zhi,0b00011011); # 76543210
791 &bswap ("ebx");
792
793 &cmp ("ecx",&DWP(528+16+8,"esp")); # are we done?
794 &jne (&label("outer"));
795 }
796
797 &mov ("eax",&DWP(528+16+0,"esp")); # restore Xi
798 &mov (&DWP(12,"eax"),"edx");
799 &mov (&DWP(8,"eax"),"ebx");
800 &movq (&QWP(0,"eax"),$Zhi);
801
802 &mov ("esp",&DWP(528+16+12,"esp")); # restore original %esp
803 &emms ();
804}
805&function_end("gcm_ghash_4bit_mmx");
806}}
807
808if ($sse2) {{
809######################################################################
810# PCLMULQDQ version.
811
812$Xip="eax";
813$Htbl="edx";
814$const="ecx";
815$inp="esi";
816$len="ebx";
817
818($Xi,$Xhi)=("xmm0","xmm1"); $Hkey="xmm2";
819($T1,$T2,$T3)=("xmm3","xmm4","xmm5");
820($Xn,$Xhn)=("xmm6","xmm7");
821
822&static_label("bswap");
823
824sub clmul64x64_T2 { # minimal "register" pressure
825my ($Xhi,$Xi,$Hkey)=@_;
826
827 &movdqa ($Xhi,$Xi); #
828 &pshufd ($T1,$Xi,0b01001110);
829 &pshufd ($T2,$Hkey,0b01001110);
830 &pxor ($T1,$Xi); #
831 &pxor ($T2,$Hkey);
832
833 &pclmulqdq ($Xi,$Hkey,0x00); #######
834 &pclmulqdq ($Xhi,$Hkey,0x11); #######
835 &pclmulqdq ($T1,$T2,0x00); #######
836 &xorps ($T1,$Xi); #
837 &xorps ($T1,$Xhi); #
838
839 &movdqa ($T2,$T1); #
840 &psrldq ($T1,8);
841 &pslldq ($T2,8); #
842 &pxor ($Xhi,$T1);
843 &pxor ($Xi,$T2); #
844}
845
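# clmul64x64_T2 above is a Karatsuba-style carry-less multiplication: the
# pshufd/pxor pairs form (Xi.lo^Xi.hi) and (Hkey.lo^Hkey.hi), so the full
# 128x128-bit product needs three pclmulqdq instead of four, because in
# GF(2)[x] the middle term ah*bl ^ al*bh equals (ah^al)*(bh^bl) ^ ah*bh ^
# al*bl. A width-reduced standalone sketch checking that identity for all
# 8-bit operands (pure Perl, not part of the module):

use strict;

sub clmul {                             # carry-less (polynomial) multiply
        my ($a, $b) = @_;
        my $r = 0;
        for (my $i = 0; $b >> $i; $i++) {
                $r ^= $a << $i if (($b >> $i) & 1);
        }
        return $r;
}

for my $a (0 .. 255) {
        for my $b (0 .. 255) {
                my ($ah, $al) = ($a >> 4, $a & 0xf);
                my ($bh, $bl) = ($b >> 4, $b & 0xf);
                my $hi  = clmul($ah, $bh);
                my $lo  = clmul($al, $bl);
                my $mid = clmul($ah ^ $al, $bh ^ $bl) ^ $hi ^ $lo;
                die "Karatsuba mismatch"
                    if ((($hi << 8) ^ ($mid << 4) ^ $lo) != clmul($a, $b));
        }
}
print "Karatsuba identity holds for all 8-bit operands\n";
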
846sub clmul64x64_T3 {
847# Even though this subroutine offers visually better ILP, it
848# was empirically found to be a tad slower than above version.
849# At least in gcm_ghash_clmul context. But it's just as well,
850# because loop modulo-scheduling is possible only thanks to
851# minimized "register" pressure...
852my ($Xhi,$Xi,$Hkey)=@_;
853
854 &movdqa ($T1,$Xi); #
855 &movdqa ($Xhi,$Xi);
856 &pclmulqdq ($Xi,$Hkey,0x00); #######
857 &pclmulqdq ($Xhi,$Hkey,0x11); #######
858 &pshufd ($T2,$T1,0b01001110); #
859 &pshufd ($T3,$Hkey,0b01001110);
860 &pxor ($T2,$T1); #
861 &pxor ($T3,$Hkey);
862 &pclmulqdq ($T2,$T3,0x00); #######
863 &pxor ($T2,$Xi); #
864 &pxor ($T2,$Xhi); #
865
866 &movdqa ($T3,$T2); #
867 &psrldq ($T2,8);
868 &pslldq ($T3,8); #
869 &pxor ($Xhi,$T2);
870 &pxor ($Xi,$T3); #
871}
872
873if (1) { # Algorithm 9 with <<1 twist.
874 # Reduction is shorter and uses only two
 875		# temporary registers, which makes it a better
876 # candidate for interleaving with 64x64
877 # multiplication. Pre-modulo-scheduled loop
878 # was found to be ~20% faster than Algorithm 5
879 # below. Algorithm 9 was therefore chosen for
880 # further optimization...
881
882sub reduction_alg9 { # 17/13 times faster than Intel version
883my ($Xhi,$Xi) = @_;
884
885 # 1st phase
886 &movdqa ($T1,$Xi); #
887 &psllq ($Xi,1);
888 &pxor ($Xi,$T1); #
889 &psllq ($Xi,5); #
890 &pxor ($Xi,$T1); #
891 &psllq ($Xi,57); #
892 &movdqa ($T2,$Xi); #
893 &pslldq ($Xi,8);
894 &psrldq ($T2,8); #
895 &pxor ($Xi,$T1);
896 &pxor ($Xhi,$T2); #
897
898 # 2nd phase
899 &movdqa ($T2,$Xi);
900 &psrlq ($Xi,5);
901 &pxor ($Xi,$T2); #
902 &psrlq ($Xi,1); #
903 &pxor ($Xi,$T2); #
904 &pxor ($T2,$Xhi);
905 &psrlq ($Xi,1); #
906 &pxor ($Xi,$T2); #
907}
908
909&function_begin_B("gcm_init_clmul");
910 &mov ($Htbl,&wparam(0));
911 &mov ($Xip,&wparam(1));
912
913 &call (&label("pic"));
914&set_label("pic");
915 &blindpop ($const);
916 &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
917
918 &movdqu ($Hkey,&QWP(0,$Xip));
919 &pshufd ($Hkey,$Hkey,0b01001110);# dword swap
920
921 # <<1 twist
922 &pshufd ($T2,$Hkey,0b11111111); # broadcast uppermost dword
923 &movdqa ($T1,$Hkey);
924 &psllq ($Hkey,1);
925 &pxor ($T3,$T3); #
926 &psrlq ($T1,63);
927 &pcmpgtd ($T3,$T2); # broadcast carry bit
928 &pslldq ($T1,8);
929 &por ($Hkey,$T1); # H<<=1
930
931 # magic reduction
932 &pand ($T3,&QWP(16,$const)); # 0x1c2_polynomial
933 &pxor ($Hkey,$T3); # if(carry) H^=0x1c2_polynomial
934
935 # calculate H^2
936 &movdqa ($Xi,$Hkey);
937 &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
938 &reduction_alg9 ($Xhi,$Xi);
939
940 &movdqu (&QWP(0,$Htbl),$Hkey); # save H
941 &movdqu (&QWP(16,$Htbl),$Xi); # save H^2
942
943 &ret ();
944&function_end_B("gcm_init_clmul");
945
946&function_begin_B("gcm_gmult_clmul");
947 &mov ($Xip,&wparam(0));
948 &mov ($Htbl,&wparam(1));
949
950 &call (&label("pic"));
951&set_label("pic");
952 &blindpop ($const);
953 &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
954
955 &movdqu ($Xi,&QWP(0,$Xip));
956 &movdqa ($T3,&QWP(0,$const));
957 &movups ($Hkey,&QWP(0,$Htbl));
958 &pshufb ($Xi,$T3);
959
960 &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
961 &reduction_alg9 ($Xhi,$Xi);
962
963 &pshufb ($Xi,$T3);
964 &movdqu (&QWP(0,$Xip),$Xi);
965
966 &ret ();
967&function_end_B("gcm_gmult_clmul");
968
969&function_begin("gcm_ghash_clmul");
970 &mov ($Xip,&wparam(0));
971 &mov ($Htbl,&wparam(1));
972 &mov ($inp,&wparam(2));
973 &mov ($len,&wparam(3));
974
975 &call (&label("pic"));
976&set_label("pic");
977 &blindpop ($const);
978 &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
979
980 &movdqu ($Xi,&QWP(0,$Xip));
981 &movdqa ($T3,&QWP(0,$const));
982 &movdqu ($Hkey,&QWP(0,$Htbl));
983 &pshufb ($Xi,$T3);
984
985 &sub ($len,0x10);
986 &jz (&label("odd_tail"));
987
988 #######
989 # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
990 # [(H*Ii+1) + (H*Xi+1)] mod P =
991 # [(H*Ii+1) + H^2*(Ii+Xi)] mod P
992 #
993 &movdqu ($T1,&QWP(0,$inp)); # Ii
994 &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
995 &pshufb ($T1,$T3);
996 &pshufb ($Xn,$T3);
997 &pxor ($Xi,$T1); # Ii+Xi
998
999 &clmul64x64_T2 ($Xhn,$Xn,$Hkey); # H*Ii+1
1000 &movups ($Hkey,&QWP(16,$Htbl)); # load H^2
1001
1002 &lea ($inp,&DWP(32,$inp)); # i+=2
1003 &sub ($len,0x20);
1004 &jbe (&label("even_tail"));
1005
1006&set_label("mod_loop");
1007 &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
1008 &movdqu ($T1,&QWP(0,$inp)); # Ii
1009 &movups ($Hkey,&QWP(0,$Htbl)); # load H
1010
1011 &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
1012 &pxor ($Xhi,$Xhn);
1013
1014 &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
1015 &pshufb ($T1,$T3);
1016 &pshufb ($Xn,$T3);
1017
1018 &movdqa ($T3,$Xn); #&clmul64x64_TX ($Xhn,$Xn,$Hkey); H*Ii+1
1019 &movdqa ($Xhn,$Xn);
1020 &pxor ($Xhi,$T1); # "Ii+Xi", consume early
1021
1022 &movdqa ($T1,$Xi); #&reduction_alg9($Xhi,$Xi); 1st phase
1023 &psllq ($Xi,1);
1024 &pxor ($Xi,$T1); #
1025 &psllq ($Xi,5); #
1026 &pxor ($Xi,$T1); #
1027 &pclmulqdq ($Xn,$Hkey,0x00); #######
1028 &psllq ($Xi,57); #
1029 &movdqa ($T2,$Xi); #
1030 &pslldq ($Xi,8);
1031 &psrldq ($T2,8); #
1032 &pxor ($Xi,$T1);
1033 &pshufd ($T1,$T3,0b01001110);
1034 &pxor ($Xhi,$T2); #
1035 &pxor ($T1,$T3);
1036 &pshufd ($T3,$Hkey,0b01001110);
1037 &pxor ($T3,$Hkey); #
1038
1039 &pclmulqdq ($Xhn,$Hkey,0x11); #######
1040 &movdqa ($T2,$Xi); # 2nd phase
1041 &psrlq ($Xi,5);
1042 &pxor ($Xi,$T2); #
1043 &psrlq ($Xi,1); #
1044 &pxor ($Xi,$T2); #
1045 &pxor ($T2,$Xhi);
1046 &psrlq ($Xi,1); #
1047 &pxor ($Xi,$T2); #
1048
1049 &pclmulqdq ($T1,$T3,0x00); #######
1050 &movups ($Hkey,&QWP(16,$Htbl)); # load H^2
1051 &xorps ($T1,$Xn); #
1052 &xorps ($T1,$Xhn); #
1053
1054 &movdqa ($T3,$T1); #
1055 &psrldq ($T1,8);
1056 &pslldq ($T3,8); #
1057 &pxor ($Xhn,$T1);
1058 &pxor ($Xn,$T3); #
1059 &movdqa ($T3,&QWP(0,$const));
1060
1061 &lea ($inp,&DWP(32,$inp));
1062 &sub ($len,0x20);
1063 &ja (&label("mod_loop"));
1064
1065&set_label("even_tail");
1066 &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
1067
1068 &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
1069 &pxor ($Xhi,$Xhn);
1070
1071 &reduction_alg9 ($Xhi,$Xi);
1072
1073 &test ($len,$len);
1074 &jnz (&label("done"));
1075
1076 &movups ($Hkey,&QWP(0,$Htbl)); # load H
1077&set_label("odd_tail");
1078 &movdqu ($T1,&QWP(0,$inp)); # Ii
1079 &pshufb ($T1,$T3);
1080 &pxor ($Xi,$T1); # Ii+Xi
1081
1082 &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
1083 &reduction_alg9 ($Xhi,$Xi);
1084
1085&set_label("done");
1086 &pshufb ($Xi,$T3);
1087 &movdqu (&QWP(0,$Xip),$Xi);
1088&function_end("gcm_ghash_clmul");
1089
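# The comment before mod_loop above relies on the aggregation identity
# Xi+2 = [H*Ii+1 + H^2*(Ii+Xi)] mod P, which is what lets the loop defer one
# reduction across two blocks. The identity is generic for any binary field;
# a standalone sketch checking it in GF(2^8) with the AES polynomial 0x11B,
# chosen purely for illustration (gfmul and the test values are not part of
# the module):

use strict;

sub gfmul {                             # multiply in GF(2^8) mod x^8+x^4+x^3+x+1
        my ($a, $b) = @_;
        my $r = 0;
        while ($b) {
                $r ^= $a if ($b & 1);
                $a <<= 1;
                $a ^= 0x11B if ($a & 0x100);
                $b >>= 1;
        }
        return $r;
}

my ($H, $X, $I0, $I1) = (0x57, 0xC3, 0x3A, 0x8E);       # arbitrary test values
my $serial     = gfmul(gfmul($X ^ $I0, $H) ^ $I1, $H);  # ((Xi^Ii)*H ^ Ii+1)*H
my $aggregated = gfmul($X ^ $I0, gfmul($H, $H)) ^ gfmul($I1, $H);
die "identity broken" if ($serial != $aggregated);
print "serial and aggregated forms agree\n";
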
 1090} else {		# Algorithm 5. Kept for reference purposes.
1091
1092sub reduction_alg5 { # 19/16 times faster than Intel version
1093my ($Xhi,$Xi)=@_;
1094
1095 # <<1
1096 &movdqa ($T1,$Xi); #
1097 &movdqa ($T2,$Xhi);
1098 &pslld ($Xi,1);
1099 &pslld ($Xhi,1); #
1100 &psrld ($T1,31);
1101 &psrld ($T2,31); #
1102 &movdqa ($T3,$T1);
1103 &pslldq ($T1,4);
1104 &psrldq ($T3,12); #
1105 &pslldq ($T2,4);
1106 &por ($Xhi,$T3); #
1107 &por ($Xi,$T1);
1108 &por ($Xhi,$T2); #
1109
1110 # 1st phase
1111 &movdqa ($T1,$Xi);
1112 &movdqa ($T2,$Xi);
1113 &movdqa ($T3,$Xi); #
1114 &pslld ($T1,31);
1115 &pslld ($T2,30);
1116 &pslld ($Xi,25); #
1117 &pxor ($T1,$T2);
1118 &pxor ($T1,$Xi); #
1119 &movdqa ($T2,$T1); #
1120 &pslldq ($T1,12);
1121 &psrldq ($T2,4); #
1122 &pxor ($T3,$T1);
1123
1124 # 2nd phase
1125 &pxor ($Xhi,$T3); #
1126 &movdqa ($Xi,$T3);
1127 &movdqa ($T1,$T3);
1128 &psrld ($Xi,1); #
1129 &psrld ($T1,2);
1130 &psrld ($T3,7); #
1131 &pxor ($Xi,$T1);
1132 &pxor ($Xhi,$T2);
1133 &pxor ($Xi,$T3); #
1134 &pxor ($Xi,$Xhi); #
1135}
1136
1137&function_begin_B("gcm_init_clmul");
1138 &mov ($Htbl,&wparam(0));
1139 &mov ($Xip,&wparam(1));
1140
1141 &call (&label("pic"));
1142&set_label("pic");
1143 &blindpop ($const);
1144 &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
1145
1146 &movdqu ($Hkey,&QWP(0,$Xip));
1147 &pshufd ($Hkey,$Hkey,0b01001110);# dword swap
1148
1149 # calculate H^2
1150 &movdqa ($Xi,$Hkey);
1151 &clmul64x64_T3 ($Xhi,$Xi,$Hkey);
1152 &reduction_alg5 ($Xhi,$Xi);
1153
1154 &movdqu (&QWP(0,$Htbl),$Hkey); # save H
1155 &movdqu (&QWP(16,$Htbl),$Xi); # save H^2
1156
1157 &ret ();
1158&function_end_B("gcm_init_clmul");
1159
1160&function_begin_B("gcm_gmult_clmul");
1161 &mov ($Xip,&wparam(0));
1162 &mov ($Htbl,&wparam(1));
1163
1164 &call (&label("pic"));
1165&set_label("pic");
1166 &blindpop ($const);
1167 &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
1168
1169 &movdqu ($Xi,&QWP(0,$Xip));
1170 &movdqa ($Xn,&QWP(0,$const));
1171 &movdqu ($Hkey,&QWP(0,$Htbl));
1172 &pshufb ($Xi,$Xn);
1173
1174 &clmul64x64_T3 ($Xhi,$Xi,$Hkey);
1175 &reduction_alg5 ($Xhi,$Xi);
1176
1177 &pshufb ($Xi,$Xn);
1178 &movdqu (&QWP(0,$Xip),$Xi);
1179
1180 &ret ();
1181&function_end_B("gcm_gmult_clmul");
1182
1183&function_begin("gcm_ghash_clmul");
1184 &mov ($Xip,&wparam(0));
1185 &mov ($Htbl,&wparam(1));
1186 &mov ($inp,&wparam(2));
1187 &mov ($len,&wparam(3));
1188
1189 &call (&label("pic"));
1190&set_label("pic");
1191 &blindpop ($const);
1192 &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
1193
1194 &movdqu ($Xi,&QWP(0,$Xip));
1195 &movdqa ($T3,&QWP(0,$const));
1196 &movdqu ($Hkey,&QWP(0,$Htbl));
1197 &pshufb ($Xi,$T3);
1198
1199 &sub ($len,0x10);
1200 &jz (&label("odd_tail"));
1201
1202 #######
1203 # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
1204 # [(H*Ii+1) + (H*Xi+1)] mod P =
1205 # [(H*Ii+1) + H^2*(Ii+Xi)] mod P
1206 #
1207 &movdqu ($T1,&QWP(0,$inp)); # Ii
1208 &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
1209 &pshufb ($T1,$T3);
1210 &pshufb ($Xn,$T3);
1211 &pxor ($Xi,$T1); # Ii+Xi
1212
1213 &clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1
1214 &movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2
1215
1216 &sub ($len,0x20);
1217 &lea ($inp,&DWP(32,$inp)); # i+=2
1218 &jbe (&label("even_tail"));
1219
1220&set_label("mod_loop");
1221 &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
1222 &movdqu ($Hkey,&QWP(0,$Htbl)); # load H
1223
1224 &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
1225 &pxor ($Xhi,$Xhn);
1226
1227 &reduction_alg5 ($Xhi,$Xi);
1228
1229 #######
1230 &movdqa ($T3,&QWP(0,$const));
1231 &movdqu ($T1,&QWP(0,$inp)); # Ii
1232 &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
1233 &pshufb ($T1,$T3);
1234 &pshufb ($Xn,$T3);
1235 &pxor ($Xi,$T1); # Ii+Xi
1236
1237 &clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1
1238 &movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2
1239
1240 &sub ($len,0x20);
1241 &lea ($inp,&DWP(32,$inp));
1242 &ja (&label("mod_loop"));
1243
1244&set_label("even_tail");
1245 &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
1246
1247 &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
1248 &pxor ($Xhi,$Xhn);
1249
1250 &reduction_alg5 ($Xhi,$Xi);
1251
1252 &movdqa ($T3,&QWP(0,$const));
1253 &test ($len,$len);
1254 &jnz (&label("done"));
1255
1256 &movdqu ($Hkey,&QWP(0,$Htbl)); # load H
1257&set_label("odd_tail");
1258 &movdqu ($T1,&QWP(0,$inp)); # Ii
1259 &pshufb ($T1,$T3);
1260 &pxor ($Xi,$T1); # Ii+Xi
1261
1262 &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
1263 &reduction_alg5 ($Xhi,$Xi);
1264
1265 &movdqa ($T3,&QWP(0,$const));
1266&set_label("done");
1267 &pshufb ($Xi,$T3);
1268 &movdqu (&QWP(0,$Xip),$Xi);
1269&function_end("gcm_ghash_clmul");
1270
1271}
1272
1273&set_label("bswap",64);
1274 &data_byte(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
1275 &data_byte(1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2); # 0x1c2_polynomial
1276}} # $sse2
1277
1278&set_label("rem_4bit",64);
1279 &data_word(0,0x0000<<$S,0,0x1C20<<$S,0,0x3840<<$S,0,0x2460<<$S);
1280 &data_word(0,0x7080<<$S,0,0x6CA0<<$S,0,0x48C0<<$S,0,0x54E0<<$S);
1281 &data_word(0,0xE100<<$S,0,0xFD20<<$S,0,0xD940<<$S,0,0xC560<<$S);
1282 &data_word(0,0x9180<<$S,0,0x8DA0<<$S,0,0xA9C0<<$S,0,0xB5E0<<$S);
1283&set_label("rem_8bit",64);
1284 &data_short(0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E);
1285 &data_short(0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E);
1286 &data_short(0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E);
1287 &data_short(0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E);
1288 &data_short(0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E);
1289 &data_short(0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E);
1290 &data_short(0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E);
1291 &data_short(0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E);
1292 &data_short(0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE);
1293 &data_short(0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE);
1294 &data_short(0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE);
1295 &data_short(0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE);
1296 &data_short(0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E);
1297 &data_short(0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E);
1298 &data_short(0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE);
1299 &data_short(0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE);
1300 &data_short(0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E);
1301 &data_short(0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E);
1302 &data_short(0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E);
1303 &data_short(0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E);
1304 &data_short(0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E);
1305 &data_short(0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E);
1306 &data_short(0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E);
1307 &data_short(0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E);
1308 &data_short(0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE);
1309 &data_short(0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE);
1310 &data_short(0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE);
1311 &data_short(0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE);
1312 &data_short(0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E);
1313 &data_short(0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E);
1314 &data_short(0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE);
1315 &data_short(0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE);
1316}}} # !$x86only
1317
1318&asciz("GHASH for x86, CRYPTOGAMS by <appro\@openssl.org>");
1319&asm_finish();
1320
1321# A question was raised about the choice of vanilla MMX, or rather why wasn't
1322# SSE2 chosen instead? In addition to the fact that MMX runs on legacy
1323# CPUs such as PIII, "4-bit" MMX version was observed to provide better
1324# performance than *corresponding* SSE2 one even on contemporary CPUs.
1325# SSE2 results were provided by Peter-Michael Hager. He maintains SSE2
1326# implementation featuring full range of lookup-table sizes, but with
1327# per-invocation lookup table setup. The latter means that the table size is
1328# chosen depending on how much data is to be hashed in each call:
1329# more data, larger table. Best reported result for Core2 is ~4 cycles
1330# per processed byte out of 64KB block. This number accounts even for
1331# 64KB table setup overhead. As discussed in gcm128.c we choose to be
1332# more conservative in respect to lookup table sizes, but how do the
1333# results compare? Minimalistic "256B" MMX version delivers ~11 cycles
1334# on same platform. As also discussed in gcm128.c, next in line "8-bit
1335# Shoup's" or "4KB" method should deliver twice the performance of
1336# "256B" one, in other words not worse than ~6 cycles per byte. It
1337# should also be noted that in the SSE2 case the improvement can be "super-
1338# linear," i.e. more than twice, mostly because >>8 maps to single
1339# instruction on SSE2 register. This is unlike "4-bit" case when >>4
1340# maps to same amount of instructions in both MMX and SSE2 cases.
1341# Bottom line is that switch to SSE2 is considered to be justifiable
1342# only in case we choose to implement "8-bit" method...
diff --git a/src/lib/libcrypto/modes/asm/ghash-x86_64.pl b/src/lib/libcrypto/modes/asm/ghash-x86_64.pl
deleted file mode 100644
index 38d779edbc..0000000000
--- a/src/lib/libcrypto/modes/asm/ghash-x86_64.pl
+++ /dev/null
@@ -1,806 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# March, June 2010
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that
14# it uses 256 bytes per-key table [+128 bytes shared table]. GHASH
15# function features so called "528B" variant utilizing additional
16# 256+16 bytes of per-key storage [+512 bytes shared table].
17# Performance results are for this streamed GHASH subroutine and are
18# expressed in cycles per processed byte, less is better:
19#
20# gcc 3.4.x(*) assembler
21#
22# P4 28.6 14.0 +100%
23# Opteron 19.3 7.7 +150%
24# Core2 17.8 8.1(**) +120%
25#
26# (*) comparison is not completely fair, because C results are
27# for vanilla "256B" implementation, while assembler results
28# are for "528B";-)
29# (**) it's a mystery [to me] why the Core2 result is not the same as for
30# Opteron;
31
32# May 2010
33#
34# Add PCLMULQDQ version performing at 2.02 cycles per processed byte.
35# See ghash-x86.pl for background information and details about coding
36# techniques.
37#
38# Special thanks to David Woodhouse <dwmw2@infradead.org> for
39# providing access to a Westmere-based system on behalf of Intel
40# Open Source Technology Centre.
41
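
The per-key and shared tables mentioned above only accelerate a single multiplication in GF(2^128). For reference, the underlying operation in the bit-reflected GHASH convention can be sketched in portable C as below; gf128_mul is a hypothetical helper written purely for illustration, not code from this module, and it is far slower than the table-driven assembly:

    #include <stdint.h>

    /* Z = X * Y in GF(2^128), GHASH bit ordering (NIST SP 800-38D). */
    static void
    gf128_mul(uint8_t Z[16], const uint8_t X[16], const uint8_t Y[16])
    {
            uint64_t Vh = 0, Vl = 0, Zh = 0, Zl = 0;
            int i;

            for (i = 0; i < 8; i++) {       /* load X big-endian as V */
                    Vh = Vh << 8 | X[i];
                    Vl = Vl << 8 | X[i + 8];
            }
            for (i = 0; i < 128; i++) {
                    if (Y[i / 8] & (0x80 >> (i % 8))) { /* bit i of Y, MSB first */
                            Zh ^= Vh;
                            Zl ^= Vl;
                    }
                    /* V *= x: shift right one bit, reduce by 0xE1||0..0 on carry */
                    if (Vl & 1) {
                            Vl = Vl >> 1 | Vh << 63;
                            Vh = Vh >> 1 ^ 0xE100000000000000ULL;
                    } else {
                            Vl = Vl >> 1 | Vh << 63;
                            Vh = Vh >> 1;
                    }
            }
            for (i = 0; i < 8; i++) {       /* store Z big-endian */
                    Z[i] = (uint8_t)(Zh >> (56 - 8 * i));
                    Z[i + 8] = (uint8_t)(Zl >> (56 - 8 * i));
            }
    }

The "4-bit" method replaces these 128 bit-iterations per block with 32 nibble-indexed lookups of precomputed multiples of H, using the rem_4bit (or rem_8bit) table to fold back the bits shifted out of the low end at each step.
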
42$flavour = shift;
43$output = shift;
44if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
45
46$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
47
48$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
49( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
50( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
51die "can't locate x86_64-xlate.pl";
52
53open OUT,"| \"$^X\" $xlate $flavour $output";
54*STDOUT=*OUT;
55
56# common register layout
57$nlo="%rax";
58$nhi="%rbx";
59$Zlo="%r8";
60$Zhi="%r9";
61$tmp="%r10";
62$rem_4bit = "%r11";
63
64$Xi="%rdi";
65$Htbl="%rsi";
66
67# per-function register layout
68$cnt="%rcx";
69$rem="%rdx";
70
71sub LB() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/ or
72 $r =~ s/%[er]([sd]i)/%\1l/ or
73 $r =~ s/%[er](bp)/%\1l/ or
74 $r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; }
75
76sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
77{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
78 my $arg = pop;
79 $arg = "\$$arg" if ($arg*1 eq $arg);
80 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
81}
82
83{ my $N;
84 sub loop() {
85 my $inp = shift;
86
87 $N++;
88$code.=<<___;
89 xor $nlo,$nlo
90 xor $nhi,$nhi
91 mov `&LB("$Zlo")`,`&LB("$nlo")`
92 mov `&LB("$Zlo")`,`&LB("$nhi")`
93 shl \$4,`&LB("$nlo")`
94 mov \$14,$cnt
95 mov 8($Htbl,$nlo),$Zlo
96 mov ($Htbl,$nlo),$Zhi
97 and \$0xf0,`&LB("$nhi")`
98 mov $Zlo,$rem
99 jmp .Loop$N
100
101.align 16
102.Loop$N:
103 shr \$4,$Zlo
104 and \$0xf,$rem
105 mov $Zhi,$tmp
106 mov ($inp,$cnt),`&LB("$nlo")`
107 shr \$4,$Zhi
108 xor 8($Htbl,$nhi),$Zlo
109 shl \$60,$tmp
110 xor ($Htbl,$nhi),$Zhi
111 mov `&LB("$nlo")`,`&LB("$nhi")`
112 xor ($rem_4bit,$rem,8),$Zhi
113 mov $Zlo,$rem
114 shl \$4,`&LB("$nlo")`
115 xor $tmp,$Zlo
116 dec $cnt
117 js .Lbreak$N
118
119 shr \$4,$Zlo
120 and \$0xf,$rem
121 mov $Zhi,$tmp
122 shr \$4,$Zhi
123 xor 8($Htbl,$nlo),$Zlo
124 shl \$60,$tmp
125 xor ($Htbl,$nlo),$Zhi
126 and \$0xf0,`&LB("$nhi")`
127 xor ($rem_4bit,$rem,8),$Zhi
128 mov $Zlo,$rem
129 xor $tmp,$Zlo
130 jmp .Loop$N
131
132.align 16
133.Lbreak$N:
134 shr \$4,$Zlo
135 and \$0xf,$rem
136 mov $Zhi,$tmp
137 shr \$4,$Zhi
138 xor 8($Htbl,$nlo),$Zlo
139 shl \$60,$tmp
140 xor ($Htbl,$nlo),$Zhi
141 and \$0xf0,`&LB("$nhi")`
142 xor ($rem_4bit,$rem,8),$Zhi
143 mov $Zlo,$rem
144 xor $tmp,$Zlo
145
146 shr \$4,$Zlo
147 and \$0xf,$rem
148 mov $Zhi,$tmp
149 shr \$4,$Zhi
150 xor 8($Htbl,$nhi),$Zlo
151 shl \$60,$tmp
152 xor ($Htbl,$nhi),$Zhi
153 xor $tmp,$Zlo
154 xor ($rem_4bit,$rem,8),$Zhi
155
156 bswap $Zlo
157 bswap $Zhi
158___
159}}
160
161$code=<<___;
162.text
163
164.globl gcm_gmult_4bit
165.type gcm_gmult_4bit,\@function,2
166.align 16
167gcm_gmult_4bit:
168 push %rbx
169 push %rbp # %rbp and %r12 are pushed exclusively in
170 push %r12 # order to reuse Win64 exception handler...
171.Lgmult_prologue:
172
173 movzb 15($Xi),$Zlo
174 lea .Lrem_4bit(%rip),$rem_4bit
175___
176 &loop ($Xi);
177$code.=<<___;
178 mov $Zlo,8($Xi)
179 mov $Zhi,($Xi)
180
181 mov 16(%rsp),%rbx
182 lea 24(%rsp),%rsp
183.Lgmult_epilogue:
184 ret
185.size gcm_gmult_4bit,.-gcm_gmult_4bit
186___
187
188# per-function register layout
189$inp="%rdx";
190$len="%rcx";
191$rem_8bit=$rem_4bit;
192
193$code.=<<___;
194.globl gcm_ghash_4bit
195.type gcm_ghash_4bit,\@function,4
196.align 16
197gcm_ghash_4bit:
198 push %rbx
199 push %rbp
200 push %r12
201 push %r13
202 push %r14
203 push %r15
204 sub \$280,%rsp
205.Lghash_prologue:
206 mov $inp,%r14 # reassign couple of args
207 mov $len,%r15
208___
209{ my $inp="%r14";
210 my $dat="%edx";
211 my $len="%r15";
212 my @nhi=("%ebx","%ecx");
213 my @rem=("%r12","%r13");
214 my $Hshr4="%rbp";
215
216 &sub ($Htbl,-128); # size optimization
217 &lea ($Hshr4,"16+128(%rsp)");
218 { my @lo =($nlo,$nhi);
219 my @hi =($Zlo,$Zhi);
220
221 &xor ($dat,$dat);
222 for ($i=0,$j=-2;$i<18;$i++,$j++) {
223 &mov ("$j(%rsp)",&LB($dat)) if ($i>1);
224 &or ($lo[0],$tmp) if ($i>1);
225 &mov (&LB($dat),&LB($lo[1])) if ($i>0 && $i<17);
226 &shr ($lo[1],4) if ($i>0 && $i<17);
227 &mov ($tmp,$hi[1]) if ($i>0 && $i<17);
228 &shr ($hi[1],4) if ($i>0 && $i<17);
229 &mov ("8*$j($Hshr4)",$hi[0]) if ($i>1);
230 &mov ($hi[0],"16*$i+0-128($Htbl)") if ($i<16);
231 &shl (&LB($dat),4) if ($i>0 && $i<17);
232 &mov ("8*$j-128($Hshr4)",$lo[0]) if ($i>1);
233 &mov ($lo[0],"16*$i+8-128($Htbl)") if ($i<16);
234 &shl ($tmp,60) if ($i>0 && $i<17);
235
236 push (@lo,shift(@lo));
237 push (@hi,shift(@hi));
238 }
239 }
240 &add ($Htbl,-128);
241 &mov ($Zlo,"8($Xi)");
242 &mov ($Zhi,"0($Xi)");
243 &add ($len,$inp); # pointer to the end of data
244 &lea ($rem_8bit,".Lrem_8bit(%rip)");
245 &jmp (".Louter_loop");
246
247$code.=".align 16\n.Louter_loop:\n";
248 &xor ($Zhi,"($inp)");
249 &mov ("%rdx","8($inp)");
250 &lea ($inp,"16($inp)");
251 &xor ("%rdx",$Zlo);
252 &mov ("($Xi)",$Zhi);
253 &mov ("8($Xi)","%rdx");
254 &shr ("%rdx",32);
255
256 &xor ($nlo,$nlo);
257 &rol ($dat,8);
258 &mov (&LB($nlo),&LB($dat));
259 &movz ($nhi[0],&LB($dat));
260 &shl (&LB($nlo),4);
261 &shr ($nhi[0],4);
262
263 for ($j=11,$i=0;$i<15;$i++) {
264 &rol ($dat,8);
265 &xor ($Zlo,"8($Htbl,$nlo)") if ($i>0);
266 &xor ($Zhi,"($Htbl,$nlo)") if ($i>0);
267 &mov ($Zlo,"8($Htbl,$nlo)") if ($i==0);
268 &mov ($Zhi,"($Htbl,$nlo)") if ($i==0);
269
270 &mov (&LB($nlo),&LB($dat));
271 &xor ($Zlo,$tmp) if ($i>0);
272 &movzw ($rem[1],"($rem_8bit,$rem[1],2)") if ($i>0);
273
274 &movz ($nhi[1],&LB($dat));
275 &shl (&LB($nlo),4);
276 &movzb ($rem[0],"(%rsp,$nhi[0])");
277
278 &shr ($nhi[1],4) if ($i<14);
279 &and ($nhi[1],0xf0) if ($i==14);
280 &shl ($rem[1],48) if ($i>0);
281 &xor ($rem[0],$Zlo);
282
283 &mov ($tmp,$Zhi);
284 &xor ($Zhi,$rem[1]) if ($i>0);
285 &shr ($Zlo,8);
286
287 &movz ($rem[0],&LB($rem[0]));
288 &mov ($dat,"$j($Xi)") if (--$j%4==0);
289 &shr ($Zhi,8);
290
291 &xor ($Zlo,"-128($Hshr4,$nhi[0],8)");
292 &shl ($tmp,56);
293 &xor ($Zhi,"($Hshr4,$nhi[0],8)");
294
295 unshift (@nhi,pop(@nhi)); # "rotate" registers
296 unshift (@rem,pop(@rem));
297 }
298 &movzw ($rem[1],"($rem_8bit,$rem[1],2)");
299 &xor ($Zlo,"8($Htbl,$nlo)");
300 &xor ($Zhi,"($Htbl,$nlo)");
301
302 &shl ($rem[1],48);
303 &xor ($Zlo,$tmp);
304
305 &xor ($Zhi,$rem[1]);
306 &movz ($rem[0],&LB($Zlo));
307 &shr ($Zlo,4);
308
309 &mov ($tmp,$Zhi);
310 &shl (&LB($rem[0]),4);
311 &shr ($Zhi,4);
312
313 &xor ($Zlo,"8($Htbl,$nhi[0])");
314 &movzw ($rem[0],"($rem_8bit,$rem[0],2)");
315 &shl ($tmp,60);
316
317 &xor ($Zhi,"($Htbl,$nhi[0])");
318 &xor ($Zlo,$tmp);
319 &shl ($rem[0],48);
320
321 &bswap ($Zlo);
322 &xor ($Zhi,$rem[0]);
323
324 &bswap ($Zhi);
325 &cmp ($inp,$len);
326 &jb (".Louter_loop");
327}
328$code.=<<___;
329 mov $Zlo,8($Xi)
330 mov $Zhi,($Xi)
331
332 lea 280(%rsp),%rsi
333 mov 0(%rsi),%r15
334 mov 8(%rsi),%r14
335 mov 16(%rsi),%r13
336 mov 24(%rsi),%r12
337 mov 32(%rsi),%rbp
338 mov 40(%rsi),%rbx
339 lea 48(%rsi),%rsp
340.Lghash_epilogue:
341 ret
342.size gcm_ghash_4bit,.-gcm_ghash_4bit
343___
344
345######################################################################
346# PCLMULQDQ version.
347
348@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
349 ("%rdi","%rsi","%rdx","%rcx"); # Unix order
350
351($Xi,$Xhi)=("%xmm0","%xmm1"); $Hkey="%xmm2";
352($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");
353
354sub clmul64x64_T2 { # minimal register pressure
355my ($Xhi,$Xi,$Hkey,$modulo)=@_;
356
357$code.=<<___ if (!defined($modulo));
358 movdqa $Xi,$Xhi #
359 pshufd \$0b01001110,$Xi,$T1
360 pshufd \$0b01001110,$Hkey,$T2
361 pxor $Xi,$T1 #
362 pxor $Hkey,$T2
363___
364$code.=<<___;
365 pclmulqdq \$0x00,$Hkey,$Xi #######
366 pclmulqdq \$0x11,$Hkey,$Xhi #######
367 pclmulqdq \$0x00,$T2,$T1 #######
368 pxor $Xi,$T1 #
369 pxor $Xhi,$T1 #
370
371 movdqa $T1,$T2 #
372 psrldq \$8,$T1
373 pslldq \$8,$T2 #
374 pxor $T1,$Xhi
375 pxor $T2,$Xi #
376___
377}
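
clmul64x64_T2 above is a Karatsuba-style split: writing H = H1*x^64 + H0 and X = X1*x^64 + X0, only three PCLMULQDQ operations are needed, because in GF(2) the middle term satisfies (H1+H0)*(X1+X0) + H1*X1 + H0*X0 = H1*X0 + H0*X1. A hedged C-intrinsics rendering of the same idea (a hypothetical helper, not output of this script) looks roughly like:

    #include <emmintrin.h>
    #include <wmmintrin.h>  /* _mm_clmulepi64_si128 (PCLMULQDQ) */

    /* 128x128 -> 256-bit carry-less multiply with three PCLMULQDQ. */
    static void
    clmul128(__m128i X, __m128i H, __m128i *lo, __m128i *hi)
    {
            __m128i tx = _mm_xor_si128(_mm_shuffle_epi32(X, 0x4e), X); /* X1^X0 */
            __m128i th = _mm_xor_si128(_mm_shuffle_epi32(H, 0x4e), H); /* H1^H0 */
            __m128i l = _mm_clmulepi64_si128(X, H, 0x00);   /* X0*H0 */
            __m128i h = _mm_clmulepi64_si128(X, H, 0x11);   /* X1*H1 */
            __m128i m = _mm_clmulepi64_si128(tx, th, 0x00); /* (X1^X0)*(H1^H0) */

            m = _mm_xor_si128(m, _mm_xor_si128(l, h));      /* X1*H0 ^ X0*H1 */
            *lo = _mm_xor_si128(l, _mm_slli_si128(m, 8));   /* bits 0..127 */
            *hi = _mm_xor_si128(h, _mm_srli_si128(m, 8));   /* bits 128..255 */
    }
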
378
379sub reduction_alg9 { # 17/13 times faster than Intel version
380my ($Xhi,$Xi) = @_;
381
382$code.=<<___;
383 # 1st phase
384 movdqa $Xi,$T1 #
385 psllq \$1,$Xi
386 pxor $T1,$Xi #
387 psllq \$5,$Xi #
388 pxor $T1,$Xi #
389 psllq \$57,$Xi #
390 movdqa $Xi,$T2 #
391 pslldq \$8,$Xi
392 psrldq \$8,$T2 #
393 pxor $T1,$Xi
394 pxor $T2,$Xhi #
395
396 # 2nd phase
397 movdqa $Xi,$T2
398 psrlq \$5,$Xi
399 pxor $T2,$Xi #
400 psrlq \$1,$Xi #
401 pxor $T2,$Xi #
402 pxor $Xhi,$T2
403 psrlq \$1,$Xi #
404 pxor $T2,$Xi #
405___
406}
407
408{ my ($Htbl,$Xip)=@_4args;
409
410$code.=<<___;
411.globl gcm_init_clmul
412.type gcm_init_clmul,\@abi-omnipotent
413.align 16
414gcm_init_clmul:
415 movdqu ($Xip),$Hkey
416 pshufd \$0b01001110,$Hkey,$Hkey # dword swap
417
418 # <<1 twist
419 pshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword
420 movdqa $Hkey,$T1
421 psllq \$1,$Hkey
422 pxor $T3,$T3 #
423 psrlq \$63,$T1
424 pcmpgtd $T2,$T3 # broadcast carry bit
425 pslldq \$8,$T1
426 por $T1,$Hkey # H<<=1
427
428 # magic reduction
429 pand .L0x1c2_polynomial(%rip),$T3
430 pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial
431
432 # calculate H^2
433 movdqa $Hkey,$Xi
434___
435 &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
436 &reduction_alg9 ($Xhi,$Xi);
437$code.=<<___;
438 movdqu $Hkey,($Htbl) # save H
439 movdqu $Xi,16($Htbl) # save H^2
440 ret
441.size gcm_init_clmul,.-gcm_init_clmul
442___
443}
444
445{ my ($Xip,$Htbl)=@_4args;
446
447$code.=<<___;
448.globl gcm_gmult_clmul
449.type gcm_gmult_clmul,\@abi-omnipotent
450.align 16
451gcm_gmult_clmul:
452 movdqu ($Xip),$Xi
453 movdqa .Lbswap_mask(%rip),$T3
454 movdqu ($Htbl),$Hkey
455 pshufb $T3,$Xi
456___
457 &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
458 &reduction_alg9 ($Xhi,$Xi);
459$code.=<<___;
460 pshufb $T3,$Xi
461 movdqu $Xi,($Xip)
462 ret
463.size gcm_gmult_clmul,.-gcm_gmult_clmul
464___
465}
466
467{ my ($Xip,$Htbl,$inp,$len)=@_4args;
468 my $Xn="%xmm6";
469 my $Xhn="%xmm7";
470 my $Hkey2="%xmm8";
471 my $T1n="%xmm9";
472 my $T2n="%xmm10";
473
474$code.=<<___;
475.globl gcm_ghash_clmul
476.type gcm_ghash_clmul,\@abi-omnipotent
477.align 16
478gcm_ghash_clmul:
479___
480$code.=<<___ if ($win64);
481.LSEH_begin_gcm_ghash_clmul:
482 # I can't trust assembler to use specific encoding:-(
483 .byte 0x48,0x83,0xec,0x58 #sub \$0x58,%rsp
484 .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
485 .byte 0x0f,0x29,0x7c,0x24,0x10 #movdqa %xmm7,0x10(%rsp)
486 .byte 0x44,0x0f,0x29,0x44,0x24,0x20 #movaps %xmm8,0x20(%rsp)
487 .byte 0x44,0x0f,0x29,0x4c,0x24,0x30 #movaps %xmm9,0x30(%rsp)
488 .byte 0x44,0x0f,0x29,0x54,0x24,0x40 #movaps %xmm10,0x40(%rsp)
489___
490$code.=<<___;
491 movdqa .Lbswap_mask(%rip),$T3
492
493 movdqu ($Xip),$Xi
494 movdqu ($Htbl),$Hkey
495 pshufb $T3,$Xi
496
497 sub \$0x10,$len
498 jz .Lodd_tail
499
500 movdqu 16($Htbl),$Hkey2
501 #######
502 # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
503 # [(H*Ii+1) + (H*Xi+1)] mod P =
504 # [(H*Ii+1) + H^2*(Ii+Xi)] mod P
505 #
506 movdqu ($inp),$T1 # Ii
507 movdqu 16($inp),$Xn # Ii+1
508 pshufb $T3,$T1
509 pshufb $T3,$Xn
510 pxor $T1,$Xi # Ii+Xi
511___
512 &clmul64x64_T2 ($Xhn,$Xn,$Hkey); # H*Ii+1
513$code.=<<___;
514 movdqa $Xi,$Xhi #
515 pshufd \$0b01001110,$Xi,$T1
516 pshufd \$0b01001110,$Hkey2,$T2
517 pxor $Xi,$T1 #
518 pxor $Hkey2,$T2
519
520 lea 32($inp),$inp # i+=2
521 sub \$0x20,$len
522 jbe .Leven_tail
523
524.Lmod_loop:
525___
526 &clmul64x64_T2 ($Xhi,$Xi,$Hkey2,1); # H^2*(Ii+Xi)
527$code.=<<___;
528 movdqu ($inp),$T1 # Ii
529 pxor $Xn,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
530 pxor $Xhn,$Xhi
531
532 movdqu 16($inp),$Xn # Ii+1
533 pshufb $T3,$T1
534 pshufb $T3,$Xn
535
536 movdqa $Xn,$Xhn #
537 pshufd \$0b01001110,$Xn,$T1n
538 pshufd \$0b01001110,$Hkey,$T2n
539 pxor $Xn,$T1n #
540 pxor $Hkey,$T2n
541 pxor $T1,$Xhi # "Ii+Xi", consume early
542
543 movdqa $Xi,$T1 # 1st phase
544 psllq \$1,$Xi
545 pxor $T1,$Xi #
546 psllq \$5,$Xi #
547 pxor $T1,$Xi #
548 pclmulqdq \$0x00,$Hkey,$Xn #######
549 psllq \$57,$Xi #
550 movdqa $Xi,$T2 #
551 pslldq \$8,$Xi
552 psrldq \$8,$T2 #
553 pxor $T1,$Xi
554 pxor $T2,$Xhi #
555
556 pclmulqdq \$0x11,$Hkey,$Xhn #######
557 movdqa $Xi,$T2 # 2nd phase
558 psrlq \$5,$Xi
559 pxor $T2,$Xi #
560 psrlq \$1,$Xi #
561 pxor $T2,$Xi #
562 pxor $Xhi,$T2
563 psrlq \$1,$Xi #
564 pxor $T2,$Xi #
565
566 pclmulqdq \$0x00,$T2n,$T1n #######
567 movdqa $Xi,$Xhi #
568 pshufd \$0b01001110,$Xi,$T1
569 pshufd \$0b01001110,$Hkey2,$T2
570 pxor $Xi,$T1 #
571 pxor $Hkey2,$T2
572
573 pxor $Xn,$T1n #
574 pxor $Xhn,$T1n #
575 movdqa $T1n,$T2n #
576 psrldq \$8,$T1n
577 pslldq \$8,$T2n #
578 pxor $T1n,$Xhn
579 pxor $T2n,$Xn #
580
581 lea 32($inp),$inp
582 sub \$0x20,$len
583 ja .Lmod_loop
584
585.Leven_tail:
586___
587 &clmul64x64_T2 ($Xhi,$Xi,$Hkey2,1); # H^2*(Ii+Xi)
588$code.=<<___;
589 pxor $Xn,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
590 pxor $Xhn,$Xhi
591___
592 &reduction_alg9 ($Xhi,$Xi);
593$code.=<<___;
594 test $len,$len
595 jnz .Ldone
596
597.Lodd_tail:
598 movdqu ($inp),$T1 # Ii
599 pshufb $T3,$T1
600 pxor $T1,$Xi # Ii+Xi
601___
602 &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
603 &reduction_alg9 ($Xhi,$Xi);
604$code.=<<___;
605.Ldone:
606 pshufb $T3,$Xi
607 movdqu $Xi,($Xip)
608___
609$code.=<<___ if ($win64);
610 movaps (%rsp),%xmm6
611 movaps 0x10(%rsp),%xmm7
612 movaps 0x20(%rsp),%xmm8
613 movaps 0x30(%rsp),%xmm9
614 movaps 0x40(%rsp),%xmm10
615 add \$0x58,%rsp
616___
617$code.=<<___;
618 ret
619.LSEH_end_gcm_ghash_clmul:
620.size gcm_ghash_clmul,.-gcm_ghash_clmul
621___
622}
623
624$code.=<<___;
625.align 64
626.Lbswap_mask:
627 .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
628.L0x1c2_polynomial:
629 .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
630.align 64
631.type .Lrem_4bit,\@object
632.Lrem_4bit:
633 .long 0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`
634 .long 0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`
635 .long 0,`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`
636 .long 0,`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`
637.type .Lrem_8bit,\@object
638.Lrem_8bit:
639 .value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
640 .value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
641 .value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
642 .value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
643 .value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
644 .value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
645 .value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
646 .value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
647 .value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
648 .value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
649 .value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
650 .value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
651 .value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
652 .value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
653 .value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
654 .value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
655 .value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
656 .value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
657 .value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
658 .value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
659 .value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
660 .value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
661 .value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
662 .value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
663 .value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
664 .value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
665 .value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
666 .value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
667 .value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
668 .value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
669 .value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
670 .value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE
671
672.asciz "GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
673.align 64
674___
675
676# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
677# CONTEXT *context,DISPATCHER_CONTEXT *disp)
678if ($win64) {
679$rec="%rcx";
680$frame="%rdx";
681$context="%r8";
682$disp="%r9";
683
684$code.=<<___;
685.extern __imp_RtlVirtualUnwind
686.type se_handler,\@abi-omnipotent
687.align 16
688se_handler:
689 push %rsi
690 push %rdi
691 push %rbx
692 push %rbp
693 push %r12
694 push %r13
695 push %r14
696 push %r15
697 pushfq
698 sub \$64,%rsp
699
700 mov 120($context),%rax # pull context->Rax
701 mov 248($context),%rbx # pull context->Rip
702
703 mov 8($disp),%rsi # disp->ImageBase
704 mov 56($disp),%r11 # disp->HandlerData
705
706 mov 0(%r11),%r10d # HandlerData[0]
707 lea (%rsi,%r10),%r10 # prologue label
708 cmp %r10,%rbx # context->Rip<prologue label
709 jb .Lin_prologue
710
711 mov 152($context),%rax # pull context->Rsp
712
713 mov 4(%r11),%r10d # HandlerData[1]
714 lea (%rsi,%r10),%r10 # epilogue label
715 cmp %r10,%rbx # context->Rip>=epilogue label
716 jae .Lin_prologue
717
718 lea 24(%rax),%rax # adjust "rsp"
719
720 mov -8(%rax),%rbx
721 mov -16(%rax),%rbp
722 mov -24(%rax),%r12
723 mov %rbx,144($context) # restore context->Rbx
724 mov %rbp,160($context) # restore context->Rbp
725 mov %r12,216($context) # restore context->R12
726
727.Lin_prologue:
728 mov 8(%rax),%rdi
729 mov 16(%rax),%rsi
730 mov %rax,152($context) # restore context->Rsp
731 mov %rsi,168($context) # restore context->Rsi
732 mov %rdi,176($context) # restore context->Rdi
733
734 mov 40($disp),%rdi # disp->ContextRecord
735 mov $context,%rsi # context
736 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
737 .long 0xa548f3fc # cld; rep movsq
738
739 mov $disp,%rsi
740 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
741 mov 8(%rsi),%rdx # arg2, disp->ImageBase
742 mov 0(%rsi),%r8 # arg3, disp->ControlPc
743 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
744 mov 40(%rsi),%r10 # disp->ContextRecord
745 lea 56(%rsi),%r11 # &disp->HandlerData
746 lea 24(%rsi),%r12 # &disp->EstablisherFrame
747 mov %r10,32(%rsp) # arg5
748 mov %r11,40(%rsp) # arg6
749 mov %r12,48(%rsp) # arg7
750 mov %rcx,56(%rsp) # arg8, (NULL)
751 call *__imp_RtlVirtualUnwind(%rip)
752
753 mov \$1,%eax # ExceptionContinueSearch
754 add \$64,%rsp
755 popfq
756 pop %r15
757 pop %r14
758 pop %r13
759 pop %r12
760 pop %rbp
761 pop %rbx
762 pop %rdi
763 pop %rsi
764 ret
765.size se_handler,.-se_handler
766
767.section .pdata
768.align 4
769 .rva .LSEH_begin_gcm_gmult_4bit
770 .rva .LSEH_end_gcm_gmult_4bit
771 .rva .LSEH_info_gcm_gmult_4bit
772
773 .rva .LSEH_begin_gcm_ghash_4bit
774 .rva .LSEH_end_gcm_ghash_4bit
775 .rva .LSEH_info_gcm_ghash_4bit
776
777 .rva .LSEH_begin_gcm_ghash_clmul
778 .rva .LSEH_end_gcm_ghash_clmul
779 .rva .LSEH_info_gcm_ghash_clmul
780
781.section .xdata
782.align 8
783.LSEH_info_gcm_gmult_4bit:
784 .byte 9,0,0,0
785 .rva se_handler
786 .rva .Lgmult_prologue,.Lgmult_epilogue # HandlerData
787.LSEH_info_gcm_ghash_4bit:
788 .byte 9,0,0,0
789 .rva se_handler
790 .rva .Lghash_prologue,.Lghash_epilogue # HandlerData
791.LSEH_info_gcm_ghash_clmul:
792 .byte 0x01,0x1f,0x0b,0x00
793 .byte 0x1f,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10
794 .byte 0x19,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9
795 .byte 0x13,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8
796 .byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
797 .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6
798 .byte 0x04,0xa2,0x00,0x00 #sub rsp,0x58
799___
800}
801
802$code =~ s/\`([^\`]*)\`/eval($1)/gem;
803
804print $code;
805
806close STDOUT;
diff --git a/src/lib/libcrypto/modes/cbc128.c b/src/lib/libcrypto/modes/cbc128.c
deleted file mode 100644
index fe45103b0c..0000000000
--- a/src/lib/libcrypto/modes/cbc128.c
+++ /dev/null
@@ -1,202 +0,0 @@
1/* $OpenBSD: cbc128.c,v 1.4 2015/02/10 09:46:30 miod Exp $ */
2/* ====================================================================
3 * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 *
50 */
51
52#include <openssl/crypto.h>
53#include "modes_lcl.h"
54#include <string.h>
55
56#ifndef MODES_DEBUG
57# ifndef NDEBUG
58# define NDEBUG
59# endif
60#endif
61
62#undef STRICT_ALIGNMENT
63#ifdef __STRICT_ALIGNMENT
64#define STRICT_ALIGNMENT 1
65#else
66#define STRICT_ALIGNMENT 0
67#endif
68
69void CRYPTO_cbc128_encrypt(const unsigned char *in, unsigned char *out,
70 size_t len, const void *key,
71 unsigned char ivec[16], block128_f block)
72{
73 size_t n;
74 const unsigned char *iv = ivec;
75
76#if !defined(OPENSSL_SMALL_FOOTPRINT)
77 if (STRICT_ALIGNMENT &&
78 ((size_t)in|(size_t)out|(size_t)ivec)%sizeof(size_t) != 0) {
79 while (len>=16) {
80 for(n=0; n<16; ++n)
81 out[n] = in[n] ^ iv[n];
82 (*block)(out, out, key);
83 iv = out;
84 len -= 16;
85 in += 16;
86 out += 16;
87 }
88 } else {
89 while (len>=16) {
90 for(n=0; n<16; n+=sizeof(size_t))
91 *(size_t*)(out+n) =
92 *(size_t*)(in+n) ^ *(size_t*)(iv+n);
93 (*block)(out, out, key);
94 iv = out;
95 len -= 16;
96 in += 16;
97 out += 16;
98 }
99 }
100#endif
101 while (len) {
102 for(n=0; n<16 && n<len; ++n)
103 out[n] = in[n] ^ iv[n];
104 for(; n<16; ++n)
105 out[n] = iv[n];
106 (*block)(out, out, key);
107 iv = out;
108 if (len<=16) break;
109 len -= 16;
110 in += 16;
111 out += 16;
112 }
113 memcpy(ivec,iv,16);
114}
115
116void CRYPTO_cbc128_decrypt(const unsigned char *in, unsigned char *out,
117 size_t len, const void *key,
118 unsigned char ivec[16], block128_f block)
119{
120 size_t n;
121 union { size_t t[16/sizeof(size_t)]; unsigned char c[16]; } tmp;
122
123#if !defined(OPENSSL_SMALL_FOOTPRINT)
124 if (in != out) {
125 const unsigned char *iv = ivec;
126
127 if (STRICT_ALIGNMENT &&
128 ((size_t)in|(size_t)out|(size_t)ivec)%sizeof(size_t) != 0) {
129 while (len>=16) {
130 (*block)(in, out, key);
131 for(n=0; n<16; ++n)
132 out[n] ^= iv[n];
133 iv = in;
134 len -= 16;
135 in += 16;
136 out += 16;
137 }
138 } else if (16%sizeof(size_t) == 0) { /* always true */
139 while (len>=16) {
140 size_t *out_t=(size_t *)out, *iv_t=(size_t *)iv;
141
142 (*block)(in, out, key);
143 for(n=0; n<16/sizeof(size_t); n++)
144 out_t[n] ^= iv_t[n];
145 iv = in;
146 len -= 16;
147 in += 16;
148 out += 16;
149 }
150 }
151 memcpy(ivec,iv,16);
152 } else {
153 if (STRICT_ALIGNMENT &&
154 ((size_t)in|(size_t)out|(size_t)ivec)%sizeof(size_t) != 0) {
155 unsigned char c;
156 while (len>=16) {
157 (*block)(in, tmp.c, key);
158 for(n=0; n<16; ++n) {
159 c = in[n];
160 out[n] = tmp.c[n] ^ ivec[n];
161 ivec[n] = c;
162 }
163 len -= 16;
164 in += 16;
165 out += 16;
166 }
167 } else if (16%sizeof(size_t) == 0) { /* always true */
168 while (len>=16) {
169 size_t c, *out_t=(size_t *)out, *ivec_t=(size_t *)ivec;
170 const size_t *in_t=(const size_t *)in;
171
172 (*block)(in, tmp.c, key);
173 for(n=0; n<16/sizeof(size_t); n++) {
174 c = in_t[n];
175 out_t[n] = tmp.t[n] ^ ivec_t[n];
176 ivec_t[n] = c;
177 }
178 len -= 16;
179 in += 16;
180 out += 16;
181 }
182 }
183 }
184#endif
185 while (len) {
186 unsigned char c;
187 (*block)(in, tmp.c, key);
188 for(n=0; n<16 && n<len; ++n) {
189 c = in[n];
190 out[n] = tmp.c[n] ^ ivec[n];
191 ivec[n] = c;
192 }
193 if (len<=16) {
194 for (; n<16; ++n)
195 ivec[n] = in[n];
196 break;
197 }
198 len -= 16;
199 in += 16;
200 out += 16;
201 }
202}
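
CRYPTO_cbc128_encrypt and CRYPTO_cbc128_decrypt implement only the chaining (C[i] = E_K(P[i] ^ C[i-1]) with C[-1] = ivec); the cipher is supplied as a block128_f pointer and ivec is updated in place, so consecutive calls chain naturally. A hedged usage sketch with AES as the block function (illustrative only, not code from this tree):

    #include <openssl/aes.h>
    #include <openssl/modes.h>

    /* CBC-encrypt len bytes (a multiple of 16 in the usual case); iv is
     * overwritten with the last ciphertext block so the call can be repeated. */
    static int
    cbc_encrypt_example(const unsigned char *pt, unsigned char *ct, size_t len,
        const unsigned char key[16], unsigned char iv[16])
    {
            AES_KEY ks;

            if (AES_set_encrypt_key(key, 128, &ks) != 0)
                    return -1;
            CRYPTO_cbc128_encrypt(pt, ct, len, &ks, iv, (block128_f)AES_encrypt);
            return 0;
    }
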
diff --git a/src/lib/libcrypto/modes/ccm128.c b/src/lib/libcrypto/modes/ccm128.c
deleted file mode 100644
index 58cc4f44c6..0000000000
--- a/src/lib/libcrypto/modes/ccm128.c
+++ /dev/null
@@ -1,441 +0,0 @@
1/* $OpenBSD: ccm128.c,v 1.4 2015/02/10 09:46:30 miod Exp $ */
2/* ====================================================================
3 * Copyright (c) 2011 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 */
50
51#include <openssl/crypto.h>
52#include "modes_lcl.h"
53#include <string.h>
54
55#ifndef MODES_DEBUG
56# ifndef NDEBUG
57# define NDEBUG
58# endif
59#endif
60
61/* First you set up the M and L parameters and pass the key schedule.
62 * This is called once per session setup... */
63void CRYPTO_ccm128_init(CCM128_CONTEXT *ctx,
64 unsigned int M,unsigned int L,void *key,block128_f block)
65{
66 memset(ctx->nonce.c,0,sizeof(ctx->nonce.c));
67 ctx->nonce.c[0] = ((u8)(L-1)&7) | (u8)(((M-2)/2)&7)<<3;
68 ctx->blocks = 0;
69 ctx->block = block;
70 ctx->key = key;
71}
72
73/* !!! Following interfaces are to be called *once* per packet !!! */
74
75/* Then you set up the per-message nonce and pass the length of the message */
76int CRYPTO_ccm128_setiv(CCM128_CONTEXT *ctx,
77 const unsigned char *nonce,size_t nlen,size_t mlen)
78{
79 unsigned int L = ctx->nonce.c[0]&7; /* the L parameter */
80
81 if (nlen<(14-L)) return -1; /* nonce is too short */
82
83 if (sizeof(mlen)==8 && L>=3) {
84 ctx->nonce.c[8] = (u8)(mlen>>(56%(sizeof(mlen)*8)));
85 ctx->nonce.c[9] = (u8)(mlen>>(48%(sizeof(mlen)*8)));
86 ctx->nonce.c[10] = (u8)(mlen>>(40%(sizeof(mlen)*8)));
87 ctx->nonce.c[11] = (u8)(mlen>>(32%(sizeof(mlen)*8)));
88 }
89 else
90 ctx->nonce.u[1] = 0;
91
92 ctx->nonce.c[12] = (u8)(mlen>>24);
93 ctx->nonce.c[13] = (u8)(mlen>>16);
94 ctx->nonce.c[14] = (u8)(mlen>>8);
95 ctx->nonce.c[15] = (u8)mlen;
96
97 ctx->nonce.c[0] &= ~0x40; /* clear Adata flag */
98 memcpy(&ctx->nonce.c[1],nonce,14-L);
99
100 return 0;
101}
102
103/* Then you pass the additional authentication data; this is optional */
104void CRYPTO_ccm128_aad(CCM128_CONTEXT *ctx,
105 const unsigned char *aad,size_t alen)
106{ unsigned int i;
107 block128_f block = ctx->block;
108
109 if (alen==0) return;
110
111 ctx->nonce.c[0] |= 0x40; /* set Adata flag */
112 (*block)(ctx->nonce.c,ctx->cmac.c,ctx->key),
113 ctx->blocks++;
114
115 if (alen<(0x10000-0x100)) {
116 ctx->cmac.c[0] ^= (u8)(alen>>8);
117 ctx->cmac.c[1] ^= (u8)alen;
118 i=2;
119 }
120 else if (sizeof(alen)==8 && alen>=(size_t)1<<(32%(sizeof(alen)*8))) {
121 ctx->cmac.c[0] ^= 0xFF;
122 ctx->cmac.c[1] ^= 0xFF;
123 ctx->cmac.c[2] ^= (u8)(alen>>(56%(sizeof(alen)*8)));
124 ctx->cmac.c[3] ^= (u8)(alen>>(48%(sizeof(alen)*8)));
125 ctx->cmac.c[4] ^= (u8)(alen>>(40%(sizeof(alen)*8)));
126 ctx->cmac.c[5] ^= (u8)(alen>>(32%(sizeof(alen)*8)));
127 ctx->cmac.c[6] ^= (u8)(alen>>24);
128 ctx->cmac.c[7] ^= (u8)(alen>>16);
129 ctx->cmac.c[8] ^= (u8)(alen>>8);
130 ctx->cmac.c[9] ^= (u8)alen;
131 i=10;
132 }
133 else {
134 ctx->cmac.c[0] ^= 0xFF;
135 ctx->cmac.c[1] ^= 0xFE;
136 ctx->cmac.c[2] ^= (u8)(alen>>24);
137 ctx->cmac.c[3] ^= (u8)(alen>>16);
138 ctx->cmac.c[4] ^= (u8)(alen>>8);
139 ctx->cmac.c[5] ^= (u8)alen;
140 i=6;
141 }
142
143 do {
144 for(;i<16 && alen;++i,++aad,--alen)
145 ctx->cmac.c[i] ^= *aad;
146 (*block)(ctx->cmac.c,ctx->cmac.c,ctx->key),
147 ctx->blocks++;
148 i=0;
149 } while (alen);
150}
151
152/* Finally you encrypt or decrypt the message */
153
154/* The counter part of the nonce may not be larger than L*8 bits;
155 * L is not larger than 8, therefore a 64-bit counter suffices... */
156static void ctr64_inc(unsigned char *counter) {
157 unsigned int n=8;
158 u8 c;
159
160 counter += 8;
161 do {
162 --n;
163 c = counter[n];
164 ++c;
165 counter[n] = c;
166 if (c) return;
167 } while (n);
168}
169
170int CRYPTO_ccm128_encrypt(CCM128_CONTEXT *ctx,
171 const unsigned char *inp, unsigned char *out,
172 size_t len)
173{
174 size_t n;
175 unsigned int i,L;
176 unsigned char flags0 = ctx->nonce.c[0];
177 block128_f block = ctx->block;
178 void * key = ctx->key;
179 union { u64 u[2]; u8 c[16]; } scratch;
180
181 if (!(flags0&0x40))
182 (*block)(ctx->nonce.c,ctx->cmac.c,key),
183 ctx->blocks++;
184
185 ctx->nonce.c[0] = L = flags0&7;
186 for (n=0,i=15-L;i<15;++i) {
187 n |= ctx->nonce.c[i];
188 ctx->nonce.c[i]=0;
189 n <<= 8;
190 }
191 n |= ctx->nonce.c[15]; /* reconstructed length */
192 ctx->nonce.c[15]=1;
193
194 if (n!=len) return -1; /* length mismatch */
195
196 ctx->blocks += ((len+15)>>3)|1;
197 if (ctx->blocks > (U64(1)<<61)) return -2; /* too much data */
198
199 while (len>=16) {
200#ifdef __STRICT_ALIGNMENT
201 union { u64 u[2]; u8 c[16]; } temp;
202
203 memcpy (temp.c,inp,16);
204 ctx->cmac.u[0] ^= temp.u[0];
205 ctx->cmac.u[1] ^= temp.u[1];
206#else
207 ctx->cmac.u[0] ^= ((u64*)inp)[0];
208 ctx->cmac.u[1] ^= ((u64*)inp)[1];
209#endif
210 (*block)(ctx->cmac.c,ctx->cmac.c,key);
211 (*block)(ctx->nonce.c,scratch.c,key);
212 ctr64_inc(ctx->nonce.c);
213#ifdef __STRICT_ALIGNMENT
214 temp.u[0] ^= scratch.u[0];
215 temp.u[1] ^= scratch.u[1];
216 memcpy(out,temp.c,16);
217#else
218 ((u64*)out)[0] = scratch.u[0]^((u64*)inp)[0];
219 ((u64*)out)[1] = scratch.u[1]^((u64*)inp)[1];
220#endif
221 inp += 16;
222 out += 16;
223 len -= 16;
224 }
225
226 if (len) {
227 for (i=0; i<len; ++i) ctx->cmac.c[i] ^= inp[i];
228 (*block)(ctx->cmac.c,ctx->cmac.c,key);
229 (*block)(ctx->nonce.c,scratch.c,key);
230 for (i=0; i<len; ++i) out[i] = scratch.c[i]^inp[i];
231 }
232
233 for (i=15-L;i<16;++i)
234 ctx->nonce.c[i]=0;
235
236 (*block)(ctx->nonce.c,scratch.c,key);
237 ctx->cmac.u[0] ^= scratch.u[0];
238 ctx->cmac.u[1] ^= scratch.u[1];
239
240 ctx->nonce.c[0] = flags0;
241
242 return 0;
243}
244
245int CRYPTO_ccm128_decrypt(CCM128_CONTEXT *ctx,
246 const unsigned char *inp, unsigned char *out,
247 size_t len)
248{
249 size_t n;
250 unsigned int i,L;
251 unsigned char flags0 = ctx->nonce.c[0];
252 block128_f block = ctx->block;
253 void * key = ctx->key;
254 union { u64 u[2]; u8 c[16]; } scratch;
255
256 if (!(flags0&0x40))
257 (*block)(ctx->nonce.c,ctx->cmac.c,key);
258
259 ctx->nonce.c[0] = L = flags0&7;
260 for (n=0,i=15-L;i<15;++i) {
261 n |= ctx->nonce.c[i];
262 ctx->nonce.c[i]=0;
263 n <<= 8;
264 }
265 n |= ctx->nonce.c[15]; /* reconstructed length */
266 ctx->nonce.c[15]=1;
267
268 if (n!=len) return -1;
269
270 while (len>=16) {
271#ifdef __STRICT_ALIGNMENT
272 union { u64 u[2]; u8 c[16]; } temp;
273#endif
274 (*block)(ctx->nonce.c,scratch.c,key);
275 ctr64_inc(ctx->nonce.c);
276#ifdef __STRICT_ALIGNMENT
277 memcpy (temp.c,inp,16);
278 ctx->cmac.u[0] ^= (scratch.u[0] ^= temp.u[0]);
279 ctx->cmac.u[1] ^= (scratch.u[1] ^= temp.u[1]);
280 memcpy (out,scratch.c,16);
281#else
282 ctx->cmac.u[0] ^= (((u64*)out)[0] = scratch.u[0]^((u64*)inp)[0]);
283 ctx->cmac.u[1] ^= (((u64*)out)[1] = scratch.u[1]^((u64*)inp)[1]);
284#endif
285 (*block)(ctx->cmac.c,ctx->cmac.c,key);
286
287 inp += 16;
288 out += 16;
289 len -= 16;
290 }
291
292 if (len) {
293 (*block)(ctx->nonce.c,scratch.c,key);
294 for (i=0; i<len; ++i)
295 ctx->cmac.c[i] ^= (out[i] = scratch.c[i]^inp[i]);
296 (*block)(ctx->cmac.c,ctx->cmac.c,key);
297 }
298
299 for (i=15-L;i<16;++i)
300 ctx->nonce.c[i]=0;
301
302 (*block)(ctx->nonce.c,scratch.c,key);
303 ctx->cmac.u[0] ^= scratch.u[0];
304 ctx->cmac.u[1] ^= scratch.u[1];
305
306 ctx->nonce.c[0] = flags0;
307
308 return 0;
309}
310
311static void ctr64_add (unsigned char *counter,size_t inc)
312{ size_t n=8, val=0;
313
314 counter += 8;
315 do {
316 --n;
317 val += counter[n] + (inc&0xff);
318 counter[n] = (unsigned char)val;
319 val >>= 8; /* carry bit */
320 inc >>= 8;
321 } while(n && (inc || val));
322}
323
324int CRYPTO_ccm128_encrypt_ccm64(CCM128_CONTEXT *ctx,
325 const unsigned char *inp, unsigned char *out,
326 size_t len,ccm128_f stream)
327{
328 size_t n;
329 unsigned int i,L;
330 unsigned char flags0 = ctx->nonce.c[0];
331 block128_f block = ctx->block;
332 void * key = ctx->key;
333 union { u64 u[2]; u8 c[16]; } scratch;
334
335 if (!(flags0&0x40))
336 (*block)(ctx->nonce.c,ctx->cmac.c,key),
337 ctx->blocks++;
338
339 ctx->nonce.c[0] = L = flags0&7;
340 for (n=0,i=15-L;i<15;++i) {
341 n |= ctx->nonce.c[i];
342 ctx->nonce.c[i]=0;
343 n <<= 8;
344 }
345 n |= ctx->nonce.c[15]; /* reconstructed length */
346 ctx->nonce.c[15]=1;
347
348 if (n!=len) return -1; /* length mismatch */
349
350 ctx->blocks += ((len+15)>>3)|1;
351 if (ctx->blocks > (U64(1)<<61)) return -2; /* too much data */
352
353 if ((n=len/16)) {
354 (*stream)(inp,out,n,key,ctx->nonce.c,ctx->cmac.c);
355 n *= 16;
356 inp += n;
357 out += n;
358 len -= n;
359 if (len) ctr64_add(ctx->nonce.c,n/16);
360 }
361
362 if (len) {
363 for (i=0; i<len; ++i) ctx->cmac.c[i] ^= inp[i];
364 (*block)(ctx->cmac.c,ctx->cmac.c,key);
365 (*block)(ctx->nonce.c,scratch.c,key);
366 for (i=0; i<len; ++i) out[i] = scratch.c[i]^inp[i];
367 }
368
369 for (i=15-L;i<16;++i)
370 ctx->nonce.c[i]=0;
371
372 (*block)(ctx->nonce.c,scratch.c,key);
373 ctx->cmac.u[0] ^= scratch.u[0];
374 ctx->cmac.u[1] ^= scratch.u[1];
375
376 ctx->nonce.c[0] = flags0;
377
378 return 0;
379}
380
381int CRYPTO_ccm128_decrypt_ccm64(CCM128_CONTEXT *ctx,
382 const unsigned char *inp, unsigned char *out,
383 size_t len,ccm128_f stream)
384{
385 size_t n;
386 unsigned int i,L;
387 unsigned char flags0 = ctx->nonce.c[0];
388 block128_f block = ctx->block;
389 void * key = ctx->key;
390 union { u64 u[2]; u8 c[16]; } scratch;
391
392 if (!(flags0&0x40))
393 (*block)(ctx->nonce.c,ctx->cmac.c,key);
394
395 ctx->nonce.c[0] = L = flags0&7;
396 for (n=0,i=15-L;i<15;++i) {
397 n |= ctx->nonce.c[i];
398 ctx->nonce.c[i]=0;
399 n <<= 8;
400 }
401 n |= ctx->nonce.c[15]; /* reconstructed length */
402 ctx->nonce.c[15]=1;
403
404 if (n!=len) return -1;
405
406 if ((n=len/16)) {
407 (*stream)(inp,out,n,key,ctx->nonce.c,ctx->cmac.c);
408 n *= 16;
409 inp += n;
410 out += n;
411 len -= n;
412 if (len) ctr64_add(ctx->nonce.c,n/16);
413 }
414
415 if (len) {
416 (*block)(ctx->nonce.c,scratch.c,key);
417 for (i=0; i<len; ++i)
418 ctx->cmac.c[i] ^= (out[i] = scratch.c[i]^inp[i]);
419 (*block)(ctx->cmac.c,ctx->cmac.c,key);
420 }
421
422 for (i=15-L;i<16;++i)
423 ctx->nonce.c[i]=0;
424
425 (*block)(ctx->nonce.c,scratch.c,key);
426 ctx->cmac.u[0] ^= scratch.u[0];
427 ctx->cmac.u[1] ^= scratch.u[1];
428
429 ctx->nonce.c[0] = flags0;
430
431 return 0;
432}
433
434size_t CRYPTO_ccm128_tag(CCM128_CONTEXT *ctx,unsigned char *tag,size_t len)
435{ unsigned int M = (ctx->nonce.c[0]>>3)&7; /* the M parameter */
436
437 M *= 2; M += 2;
438 if (len<M) return 0;
439 memcpy(tag,ctx->cmac.c,M);
440 return M;
441}
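
Tying the per-session and per-packet interfaces above together: init is called once, then setiv, aad, encrypt (or decrypt) and tag once per packet. A hedged sealing sketch using AES as the block128_f, written as if it lived next to these files (it includes the private modes_lcl.h so that CCM128_CONTEXT is a complete type), assuming L = 2, i.e. a 13-byte nonce, and a 16-byte tag:

    #include <openssl/aes.h>
    #include "modes_lcl.h"  /* completes CCM128_CONTEXT; public modes.h leaves it opaque */

    static int
    ccm_seal_example(unsigned char *out, unsigned char tag[16],
        const unsigned char *msg, size_t mlen,
        const unsigned char *aad, size_t alen,
        const unsigned char nonce[13], const unsigned char key[16])
    {
            AES_KEY ks;
            CCM128_CONTEXT ccm;

            if (AES_set_encrypt_key(key, 128, &ks) != 0)
                    return -1;
            /* M = 16-byte tag, L = 2-byte length field, so mlen < 2^16 */
            CRYPTO_ccm128_init(&ccm, 16, 2, &ks, (block128_f)AES_encrypt);
            if (CRYPTO_ccm128_setiv(&ccm, nonce, 13, mlen) != 0)
                    return -1;      /* nonce too short */
            if (alen > 0)
                    CRYPTO_ccm128_aad(&ccm, aad, alen);
            if (CRYPTO_ccm128_encrypt(&ccm, msg, out, mlen) != 0)
                    return -1;      /* length mismatch or too much data */
            return CRYPTO_ccm128_tag(&ccm, tag, 16) == 16 ? 0 : -1;
    }
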
diff --git a/src/lib/libcrypto/modes/cfb128.c b/src/lib/libcrypto/modes/cfb128.c
deleted file mode 100644
index 8399f0c5be..0000000000
--- a/src/lib/libcrypto/modes/cfb128.c
+++ /dev/null
@@ -1,234 +0,0 @@
1/* $OpenBSD: cfb128.c,v 1.4 2015/02/10 09:46:30 miod Exp $ */
2/* ====================================================================
3 * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 *
50 */
51
52#include <openssl/crypto.h>
53#include "modes_lcl.h"
54#include <string.h>
55
56#ifndef MODES_DEBUG
57# ifndef NDEBUG
58# define NDEBUG
59# endif
60#endif
61
62/* The input and output are encrypted as though 128-bit cfb mode is being
63 * used. The extra state information to record how much of the
64 * 128bit block we have used is contained in *num;
65 */
66void CRYPTO_cfb128_encrypt(const unsigned char *in, unsigned char *out,
67 size_t len, const void *key,
68 unsigned char ivec[16], int *num,
69 int enc, block128_f block)
70{
71 unsigned int n;
72 size_t l = 0;
73
74 n = *num;
75
76 if (enc) {
77#if !defined(OPENSSL_SMALL_FOOTPRINT)
78 if (16%sizeof(size_t) == 0) do { /* always true actually */
79 while (n && len) {
80 *(out++) = ivec[n] ^= *(in++);
81 --len;
82 n = (n+1) % 16;
83 }
84#ifdef __STRICT_ALIGNMENT
85 if (((size_t)in|(size_t)out|(size_t)ivec)%sizeof(size_t) != 0)
86 break;
87#endif
88 while (len>=16) {
89 (*block)(ivec, ivec, key);
90 for (; n<16; n+=sizeof(size_t)) {
91 *(size_t*)(out+n) =
92 *(size_t*)(ivec+n) ^= *(size_t*)(in+n);
93 }
94 len -= 16;
95 out += 16;
96 in += 16;
97 n = 0;
98 }
99 if (len) {
100 (*block)(ivec, ivec, key);
101 while (len--) {
102 out[n] = ivec[n] ^= in[n];
103 ++n;
104 }
105 }
106 *num = n;
107 return;
108 } while (0);
109 /* the rest would be commonly eliminated by x86* compiler */
110#endif
111 while (l<len) {
112 if (n == 0) {
113 (*block)(ivec, ivec, key);
114 }
115 out[l] = ivec[n] ^= in[l];
116 ++l;
117 n = (n+1) % 16;
118 }
119 *num = n;
120 } else {
121#if !defined(OPENSSL_SMALL_FOOTPRINT)
122 if (16%sizeof(size_t) == 0) do { /* always true actually */
123 while (n && len) {
124 unsigned char c;
125 *(out++) = ivec[n] ^ (c = *(in++)); ivec[n] = c;
126 --len;
127 n = (n+1) % 16;
128 }
129#ifdef __STRICT_ALIGNMENT
130 if (((size_t)in|(size_t)out|(size_t)ivec)%sizeof(size_t) != 0)
131 break;
132#endif
133 while (len>=16) {
134 (*block)(ivec, ivec, key);
135 for (; n<16; n+=sizeof(size_t)) {
136 size_t t = *(size_t*)(in+n);
137 *(size_t*)(out+n) = *(size_t*)(ivec+n) ^ t;
138 *(size_t*)(ivec+n) = t;
139 }
140 len -= 16;
141 out += 16;
142 in += 16;
143 n = 0;
144 }
145 if (len) {
146 (*block)(ivec, ivec, key);
147 while (len--) {
148 unsigned char c;
149 out[n] = ivec[n] ^ (c = in[n]); ivec[n] = c;
150 ++n;
151 }
152 }
153 *num = n;
154 return;
155 } while (0);
156 /* the rest would be commonly eliminated by x86* compiler */
157#endif
158 while (l<len) {
159 unsigned char c;
160 if (n == 0) {
161 (*block)(ivec, ivec, key);
162 }
163 out[l] = ivec[n] ^ (c = in[l]); ivec[n] = c;
164 ++l;
165 n = (n+1) % 16;
166 }
167 *num=n;
168 }
169}
170
171/* This expects a single block of size nbits for both in and out. Note that
172 it corrupts any extra bits in the last byte of out */
173static void cfbr_encrypt_block(const unsigned char *in,unsigned char *out,
174 int nbits,const void *key,
175 unsigned char ivec[16],int enc,
176 block128_f block)
177{
178 int n,rem,num;
179 unsigned char ovec[16*2 + 1]; /* +1 because we dereference (but don't use) one byte off the end */
180
181 if (nbits<=0 || nbits>128) return;
182
183 /* fill in the first half of the new IV with the current IV */
184 memcpy(ovec,ivec,16);
185 /* construct the new IV */
186 (*block)(ivec,ivec,key);
187 num = (nbits+7)/8;
188 if (enc) /* encrypt the input */
189 for(n=0 ; n < num ; ++n)
190 out[n] = (ovec[16+n] = in[n] ^ ivec[n]);
191 else /* decrypt the input */
192 for(n=0 ; n < num ; ++n)
193 out[n] = (ovec[16+n] = in[n]) ^ ivec[n];
194 /* shift ovec left... */
195 rem = nbits%8;
196 num = nbits/8;
197 if(rem==0)
198 memcpy(ivec,ovec+num,16);
199 else
200 for(n=0 ; n < 16 ; ++n)
201 ivec[n] = ovec[n+num]<<rem | ovec[n+num+1]>>(8-rem);
202
203 /* it is not necessary to cleanse ovec, since the IV is not secret */
204}
205
206/* N.B. This expects the input to be packed, MS bit first */
207void CRYPTO_cfb128_1_encrypt(const unsigned char *in, unsigned char *out,
208 size_t bits, const void *key,
209 unsigned char ivec[16], int *num,
210 int enc, block128_f block)
211{
212 size_t n;
213 unsigned char c[1],d[1];
214
215 for(n=0 ; n<bits ; ++n)
216 {
217 c[0]=(in[n/8]&(1 << (7-n%8))) ? 0x80 : 0;
218 cfbr_encrypt_block(c,d,1,key,ivec,enc,block);
219 out[n/8]=(out[n/8]&~(1 << (unsigned int)(7-n%8))) |
220 ((d[0]&0x80) >> (unsigned int)(n%8));
221 }
222}
223
224void CRYPTO_cfb128_8_encrypt(const unsigned char *in, unsigned char *out,
225 size_t length, const void *key,
226 unsigned char ivec[16], int *num,
227 int enc, block128_f block)
228{
229 size_t n;
230
231 for(n=0 ; n<length ; ++n)
232 cfbr_encrypt_block(&in[n],&out[n],8,key,ivec,enc,block);
233}
234
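
As with CTR below, the CFB state (the shift register in ivec and the byte position in *num) is carried across calls, and encryption and decryption share one entry point, differing only in the enc flag. A hedged one-call sketch (illustrative only):

    #include <openssl/aes.h>
    #include <openssl/modes.h>

    /* CFB128-decrypt len bytes; pass enc = 1 to encrypt instead.
     * *num must be 0 before the first call for a given IV. */
    static void
    cfb_decrypt_example(const unsigned char *in, unsigned char *out, size_t len,
        const AES_KEY *ks, unsigned char iv[16], int *num)
    {
            CRYPTO_cfb128_encrypt(in, out, len, ks, iv, num, 0,
                (block128_f)AES_encrypt);
    }
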
diff --git a/src/lib/libcrypto/modes/ctr128.c b/src/lib/libcrypto/modes/ctr128.c
deleted file mode 100644
index 7fd0223701..0000000000
--- a/src/lib/libcrypto/modes/ctr128.c
+++ /dev/null
@@ -1,252 +0,0 @@
1/* $OpenBSD: ctr128.c,v 1.6 2015/02/10 09:46:30 miod Exp $ */
2/* ====================================================================
3 * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 *
50 */
51
52#include <openssl/crypto.h>
53#include "modes_lcl.h"
54#include <string.h>
55
56#ifndef MODES_DEBUG
57# ifndef NDEBUG
58# define NDEBUG
59# endif
60#endif
61#include <assert.h>
62
63/* NOTE: the IV/counter in CTR mode is big-endian. The code itself
64 * is endian-neutral. */
65
66/* increment counter (128-bit int) by 1 */
67static void ctr128_inc(unsigned char *counter) {
68 u32 n=16;
69 u8 c;
70
71 do {
72 --n;
73 c = counter[n];
74 ++c;
75 counter[n] = c;
76 if (c) return;
77 } while (n);
78}
79
80#if !defined(OPENSSL_SMALL_FOOTPRINT)
81static void
82ctr128_inc_aligned(unsigned char *counter)
83{
84 size_t *data,c,n;
85
86 if (BYTE_ORDER == LITTLE_ENDIAN) {
87 ctr128_inc(counter);
88 return;
89 }
90
91 data = (size_t *)counter;
92 n = 16/sizeof(size_t);
93 do {
94 --n;
95 c = data[n];
96 ++c;
97 data[n] = c;
98 if (c) return;
99 } while (n);
100}
101#endif
102
103/* The input is encrypted as though 128-bit counter mode is being
104 * used. The extra state information to record how much of the
105 * 128bit block we have used is contained in *num, and the
106 * encrypted counter is kept in ecount_buf. Both *num and
107 * ecount_buf must be initialised with zeros before the first
108 * call to CRYPTO_ctr128_encrypt().
109 *
110 * This algorithm assumes that the counter is in the x lower bits
111 * of the IV (ivec), and that the application has full control over
112 * overflow and the rest of the IV. This implementation takes NO
113 * responsability for checking that the counter doesn't overflow
114 * into the rest of the IV when incremented.
115 */
116void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out,
117 size_t len, const void *key,
118 unsigned char ivec[16], unsigned char ecount_buf[16],
119 unsigned int *num, block128_f block)
120{
121 unsigned int n;
122 size_t l=0;
123
124 assert(*num < 16);
125
126 n = *num;
127
128#if !defined(OPENSSL_SMALL_FOOTPRINT)
129 if (16%sizeof(size_t) == 0) do { /* always true actually */
130 while (n && len) {
131 *(out++) = *(in++) ^ ecount_buf[n];
132 --len;
133 n = (n+1) % 16;
134 }
135
136#ifdef __STRICT_ALIGNMENT
137 if (((size_t)in|(size_t)out|(size_t)ivec)%sizeof(size_t) != 0)
138 break;
139#endif
140 while (len>=16) {
141 (*block)(ivec, ecount_buf, key);
142 ctr128_inc_aligned(ivec);
143 for (; n<16; n+=sizeof(size_t))
144 *(size_t *)(out+n) =
145 *(size_t *)(in+n) ^ *(size_t *)(ecount_buf+n);
146 len -= 16;
147 out += 16;
148 in += 16;
149 n = 0;
150 }
151 if (len) {
152 (*block)(ivec, ecount_buf, key);
153 ctr128_inc_aligned(ivec);
154 while (len--) {
155 out[n] = in[n] ^ ecount_buf[n];
156 ++n;
157 }
158 }
159 *num = n;
160 return;
161 } while(0);
162	/* the rest would commonly be eliminated by an x86* compiler */
163#endif
164 while (l<len) {
165 if (n==0) {
166 (*block)(ivec, ecount_buf, key);
167 ctr128_inc(ivec);
168 }
169 out[l] = in[l] ^ ecount_buf[n];
170 ++l;
171 n = (n+1) % 16;
172 }
173
174 *num=n;
175}
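
A minimal usage sketch (illustrative only, with placeholder key/IV arguments) of driving CRYPTO_ctr128_encrypt with AES as the underlying block cipher, assuming <openssl/aes.h> is available; note that both *num and ecount_buf start out zeroed, as required above:

	#include <string.h>

	#include <openssl/aes.h>
	#include <openssl/modes.h>

	static void
	aes_ctr_example(const unsigned char key[16], unsigned char ivec[16],
	    const unsigned char *in, unsigned char *out, size_t len)
	{
		AES_KEY aes;
		unsigned char ecount_buf[16];
		unsigned int num = 0;

		AES_set_encrypt_key(key, 128, &aes);
		/* ecount_buf and num must be zero before the first call */
		memset(ecount_buf, 0, sizeof(ecount_buf));
		CRYPTO_ctr128_encrypt(in, out, len, &aes, ivec, ecount_buf,
		    &num, (block128_f)AES_encrypt);
	}
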
176
177/* increment upper 96 bits of 128-bit counter by 1 */
178static void ctr96_inc(unsigned char *counter) {
179 u32 n=12;
180 u8 c;
181
182 do {
183 --n;
184 c = counter[n];
185 ++c;
186 counter[n] = c;
187 if (c) return;
188 } while (n);
189}
190
191void CRYPTO_ctr128_encrypt_ctr32(const unsigned char *in, unsigned char *out,
192 size_t len, const void *key,
193 unsigned char ivec[16], unsigned char ecount_buf[16],
194 unsigned int *num, ctr128_f func)
195{
196 unsigned int n,ctr32;
197
198 assert(*num < 16);
199
200 n = *num;
201
202 while (n && len) {
203 *(out++) = *(in++) ^ ecount_buf[n];
204 --len;
205 n = (n+1) % 16;
206 }
207
208 ctr32 = GETU32(ivec+12);
209 while (len>=16) {
210 size_t blocks = len/16;
211 /*
212 * 1<<28 is just a not-so-small yet not-so-large number...
213		 * The condition below is practically never met, but it has
214		 * to be checked for code correctness.
215 */
216 if (sizeof(size_t)>sizeof(unsigned int) && blocks>(1U<<28))
217 blocks = (1U<<28);
218 /*
219		 * As (*func) operates on a 32-bit counter, the caller
220		 * has to handle overflow. The 'if' below detects the
221		 * overflow, which is then handled by limiting the
222		 * number of blocks to the exact overflow point...
223 */
224 ctr32 += (u32)blocks;
225 if (ctr32 < blocks) {
226 blocks -= ctr32;
227 ctr32 = 0;
228 }
229 (*func)(in,out,blocks,key,ivec);
230 /* (*ctr) does not update ivec, caller does: */
231 PUTU32(ivec+12,ctr32);
232		/* ... overflow was detected, propagate carry. */
233 if (ctr32 == 0) ctr96_inc(ivec);
234 blocks *= 16;
235 len -= blocks;
236 out += blocks;
237 in += blocks;
238 }
239 if (len) {
240 memset(ecount_buf,0,16);
241 (*func)(ecount_buf,ecount_buf,1,key,ivec);
242 ++ctr32;
243 PUTU32(ivec+12,ctr32);
244 if (ctr32 == 0) ctr96_inc(ivec);
245 while (len--) {
246 out[n] = in[n] ^ ecount_buf[n];
247 ++n;
248 }
249 }
250
251 *num=n;
252}
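
To illustrate the overflow clamp above with hypothetical numbers: starting the 32-bit counter at 0xfffffffe and requesting 5 blocks wraps ctr32 to 3, so only 2 blocks are processed before the carry is pushed into the upper 96 bits by ctr96_inc(); the remaining blocks are handled on the next pass through the loop. A compilable sketch of just that arithmetic:

	#include <assert.h>
	#include <stddef.h>

	static void
	ctr32_clamp_example(void)
	{
		unsigned int ctr32 = 0xfffffffeU;	/* counter taken from ivec+12 */
		size_t blocks = 5;			/* blocks requested by the caller */

		ctr32 += (unsigned int)blocks;		/* wraps modulo 2^32 to 3 */
		if (ctr32 < blocks) {
			blocks -= ctr32;		/* only 2 blocks fit before the wrap */
			ctr32 = 0;			/* ctr96_inc() then propagates the carry */
		}
		assert(blocks == 2 && ctr32 == 0);
	}
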
diff --git a/src/lib/libcrypto/modes/cts128.c b/src/lib/libcrypto/modes/cts128.c
deleted file mode 100644
index 802aa77cd5..0000000000
--- a/src/lib/libcrypto/modes/cts128.c
+++ /dev/null
@@ -1,267 +0,0 @@
1/* $OpenBSD: cts128.c,v 1.5 2015/07/19 18:27:26 miod Exp $ */
2/* ====================================================================
3 * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
4 *
5 * Rights for redistribution and usage in source and binary
6 * forms are granted according to the OpenSSL license.
7 */
8
9#include <openssl/crypto.h>
10#include "modes_lcl.h"
11#include <string.h>
12
13#ifndef MODES_DEBUG
14# ifndef NDEBUG
15# define NDEBUG
16# endif
17#endif
18
19/*
20 * The trouble with Ciphertext Stealing (CTS) mode is that there is no
21 * common official specification, only a couple of cipher/application
22 * specific ones: RFC 2040 and RFC 3962. Then there is the 'Proposal to
23 * Extend CBC Mode By "Ciphertext Stealing"' at the NIST site, which
24 * deviates from the mentioned RFCs. Most notably it allows the input
25 * to be of block length and it doesn't flip the order of the last two
26 * blocks. CTS has been discussed even in an ECB context, but it's not
27 * adopted for any known application. This implementation provides
28 * two interfaces: one compliant with the above mentioned RFCs and one
29 * compliant with the NIST proposal, both extending CBC mode.
30 */
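
A minimal usage sketch of the RFC-style interface below, assuming AES via <openssl/aes.h> as the block cipher and placeholder buffers; the input must be longer than one block and the ciphertext has the same length as the plaintext:

	#include <openssl/aes.h>
	#include <openssl/modes.h>

	static size_t
	aes_cts_example(const unsigned char key[16], unsigned char ivec[16],
	    const unsigned char *in, unsigned char *out, size_t len)
	{
		AES_KEY aes;

		AES_set_encrypt_key(key, 128, &aes);
		/* returns 0 if len <= 16, otherwise the number of bytes written (== len) */
		return CRYPTO_cts128_encrypt_block(in, out, len, &aes, ivec,
		    (block128_f)AES_encrypt);
	}
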
31
32size_t CRYPTO_cts128_encrypt_block(const unsigned char *in, unsigned char *out,
33 size_t len, const void *key,
34 unsigned char ivec[16], block128_f block)
35{ size_t residue, n;
36
37 if (len <= 16) return 0;
38
39 if ((residue=len%16) == 0) residue = 16;
40
41 len -= residue;
42
43 CRYPTO_cbc128_encrypt(in,out,len,key,ivec,block);
44
45 in += len;
46 out += len;
47
48 for (n=0; n<residue; ++n)
49 ivec[n] ^= in[n];
50 (*block)(ivec,ivec,key);
51 memcpy(out,out-16,residue);
52 memcpy(out-16,ivec,16);
53
54 return len+residue;
55}
56
57size_t CRYPTO_nistcts128_encrypt_block(const unsigned char *in, unsigned char *out,
58 size_t len, const void *key,
59 unsigned char ivec[16], block128_f block)
60{ size_t residue, n;
61
62 if (len < 16) return 0;
63
64 residue=len%16;
65
66 len -= residue;
67
68 CRYPTO_cbc128_encrypt(in,out,len,key,ivec,block);
69
70 if (residue==0) return len;
71
72 in += len;
73 out += len;
74
75 for (n=0; n<residue; ++n)
76 ivec[n] ^= in[n];
77 (*block)(ivec,ivec,key);
78 memcpy(out-16+residue,ivec,16);
79
80 return len+residue;
81}
82
83size_t CRYPTO_cts128_encrypt(const unsigned char *in, unsigned char *out,
84 size_t len, const void *key,
85 unsigned char ivec[16], cbc128_f cbc)
86{ size_t residue;
87 union { size_t align; unsigned char c[16]; } tmp;
88
89 if (len <= 16) return 0;
90
91 if ((residue=len%16) == 0) residue = 16;
92
93 len -= residue;
94
95 (*cbc)(in,out,len,key,ivec,1);
96
97 in += len;
98 out += len;
99
100 memset(tmp.c,0,sizeof(tmp));
101 memcpy(tmp.c,in,residue);
102 memcpy(out,out-16,residue);
103 (*cbc)(tmp.c,out-16,16,key,ivec,1);
104 return len+residue;
105}
106
107size_t CRYPTO_nistcts128_encrypt(const unsigned char *in, unsigned char *out,
108 size_t len, const void *key,
109 unsigned char ivec[16], cbc128_f cbc)
110{ size_t residue;
111 union { size_t align; unsigned char c[16]; } tmp;
112
113 if (len < 16) return 0;
114
115 residue=len%16;
116
117 len -= residue;
118
119 (*cbc)(in,out,len,key,ivec,1);
120
121 if (residue==0) return len;
122
123 in += len;
124 out += len;
125
126 memset(tmp.c,0,sizeof(tmp));
127 memcpy(tmp.c,in,residue);
128 (*cbc)(tmp.c,out-16+residue,16,key,ivec,1);
129 return len+residue;
130}
131
132size_t CRYPTO_cts128_decrypt_block(const unsigned char *in, unsigned char *out,
133 size_t len, const void *key,
134 unsigned char ivec[16], block128_f block)
135{ size_t residue, n;
136 union { size_t align; unsigned char c[32]; } tmp;
137
138 if (len<=16) return 0;
139
140 if ((residue=len%16) == 0) residue = 16;
141
142 len -= 16+residue;
143
144 if (len) {
145 CRYPTO_cbc128_decrypt(in,out,len,key,ivec,block);
146 in += len;
147 out += len;
148 }
149
150 (*block)(in,tmp.c+16,key);
151
152 memcpy(tmp.c,tmp.c+16,16);
153 memcpy(tmp.c,in+16,residue);
154 (*block)(tmp.c,tmp.c,key);
155
156 for(n=0; n<16; ++n) {
157 unsigned char c = in[n];
158 out[n] = tmp.c[n] ^ ivec[n];
159 ivec[n] = c;
160 }
161 for(residue+=16; n<residue; ++n)
162 out[n] = tmp.c[n] ^ in[n];
163
164 return 16+len+residue;
165}
166
167size_t CRYPTO_nistcts128_decrypt_block(const unsigned char *in, unsigned char *out,
168 size_t len, const void *key,
169 unsigned char ivec[16], block128_f block)
170{ size_t residue, n;
171 union { size_t align; unsigned char c[32]; } tmp;
172
173 if (len<16) return 0;
174
175 residue=len%16;
176
177 if (residue==0) {
178 CRYPTO_cbc128_decrypt(in,out,len,key,ivec,block);
179 return len;
180 }
181
182 len -= 16+residue;
183
184 if (len) {
185 CRYPTO_cbc128_decrypt(in,out,len,key,ivec,block);
186 in += len;
187 out += len;
188 }
189
190 (*block)(in+residue,tmp.c+16,key);
191
192 memcpy(tmp.c,tmp.c+16,16);
193 memcpy(tmp.c,in,residue);
194 (*block)(tmp.c,tmp.c,key);
195
196 for(n=0; n<16; ++n) {
197 unsigned char c = in[n];
198 out[n] = tmp.c[n] ^ ivec[n];
199 ivec[n] = in[n+residue];
200 tmp.c[n] = c;
201 }
202 for(residue+=16; n<residue; ++n)
203 out[n] = tmp.c[n] ^ tmp.c[n-16];
204
205 return 16+len+residue;
206}
207
208size_t CRYPTO_cts128_decrypt(const unsigned char *in, unsigned char *out,
209 size_t len, const void *key,
210 unsigned char ivec[16], cbc128_f cbc)
211{ size_t residue;
212 union { size_t align; unsigned char c[32]; } tmp;
213
214 if (len<=16) return 0;
215
216 if ((residue=len%16) == 0) residue = 16;
217
218 len -= 16+residue;
219
220 if (len) {
221 (*cbc)(in,out,len,key,ivec,0);
222 in += len;
223 out += len;
224 }
225
226 memset(tmp.c,0,sizeof(tmp));
227 /* this places in[16] at &tmp.c[16] and decrypted block at &tmp.c[0] */
228 (*cbc)(in,tmp.c,16,key,tmp.c+16,0);
229
230 memcpy(tmp.c,in+16,residue);
231 (*cbc)(tmp.c,tmp.c,32,key,ivec,0);
232 memcpy(out,tmp.c,16+residue);
233 return 16+len+residue;
234}
235
236size_t CRYPTO_nistcts128_decrypt(const unsigned char *in, unsigned char *out,
237 size_t len, const void *key,
238 unsigned char ivec[16], cbc128_f cbc)
239{ size_t residue;
240 union { size_t align; unsigned char c[32]; } tmp;
241
242 if (len<16) return 0;
243
244 residue=len%16;
245
246 if (residue==0) {
247 (*cbc)(in,out,len,key,ivec,0);
248 return len;
249 }
250
251 len -= 16+residue;
252
253 if (len) {
254 (*cbc)(in,out,len,key,ivec,0);
255 in += len;
256 out += len;
257 }
258
259 memset(tmp.c,0,sizeof(tmp));
260 /* this places in[16] at &tmp.c[16] and decrypted block at &tmp.c[0] */
261 (*cbc)(in+residue,tmp.c,16,key,tmp.c+16,0);
262
263 memcpy(tmp.c,in,residue);
264 (*cbc)(tmp.c,tmp.c,32,key,ivec,0);
265 memcpy(out,tmp.c,16+residue);
266 return 16+len+residue;
267}
diff --git a/src/lib/libcrypto/modes/gcm128.c b/src/lib/libcrypto/modes/gcm128.c
deleted file mode 100644
index dd6d91e880..0000000000
--- a/src/lib/libcrypto/modes/gcm128.c
+++ /dev/null
@@ -1,1539 +0,0 @@
1/* $OpenBSD: gcm128.c,v 1.13 2015/09/10 15:56:25 jsing Exp $ */
2/* ====================================================================
3 * Copyright (c) 2010 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 */
50
51#define OPENSSL_FIPSAPI
52
53#include <openssl/crypto.h>
54#include "modes_lcl.h"
55#include <string.h>
56
57#ifndef MODES_DEBUG
58# ifndef NDEBUG
59# define NDEBUG
60# endif
61#endif
62
63#if defined(BSWAP4) && defined(__STRICT_ALIGNMENT)
64/* redefine, because alignment is ensured */
65#undef GETU32
66#define GETU32(p) BSWAP4(*(const u32 *)(p))
67#undef PUTU32
68#define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v)
69#endif
70
71#define PACK(s) ((size_t)(s)<<(sizeof(size_t)*8-16))
72#define REDUCE1BIT(V) \
73 do { \
74 if (sizeof(size_t)==8) { \
75 u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
76 V.lo = (V.hi<<63)|(V.lo>>1); \
77 V.hi = (V.hi>>1 )^T; \
78 } else { \
79 u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
80 V.lo = (V.hi<<63)|(V.lo>>1); \
81 V.hi = (V.hi>>1 )^((u64)T<<32); \
82 } \
83 } while(0)
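
For reference, REDUCE1BIT performs one multiplication by x in the bit-reflected representation of GF(2^128) used by GCM: V is shifted right by one bit and, if the bit shifted out was set, reduced by the field polynomial, which shows up as the 0xe1 constant in the top byte. A sketch of the relation it implements:

	/*
	 * p(x) = x^128 + x^7 + x^2 + x + 1
	 *
	 * V*x mod p(x) = (V >> 1)                     if lsb(V) == 0
	 * V*x mod p(x) = (V >> 1) ^ (0xe1 || 0^120)   if lsb(V) == 1
	 */
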
84
85/*
86 * Even though the permitted values for TABLE_BITS are 8, 4 and 1, it
87 * should never be set to 8; 8 is effectively reserved for testing.
88 * TABLE_BITS>1 selects lookup-table-driven implementations referred
89 * to as "Shoup's" in the GCM specification. In other words OpenSSL
90 * does not cover the whole spectrum of possible table-driven
91 * implementations. Why? In the non-"Shoup's" case the memory access
92 * pattern is segmented in such a manner that it is trivial to see
93 * that cache-timing information can reveal a fair portion of the
94 * intermediate hash value. Given that the ciphertext is always
95 * available to an attacker, it is possible to attempt to deduce the
96 * secret parameter H and, if successful, tamper with messages
97 * [which is trivial in CTR mode]. In the "Shoup's" case it is not
98 * as easy, but there is no reason to believe that it is resistant
99 * to cache-timing attacks. The "8-bit" implementation also consumes
100 * 16 (sixteen) times more memory, 4KB per individual key + 1KB
101 * shared. On the pro side, it should be twice as fast as the
102 * "4-bit" version; for gcc-generated x86[_64] code the "8-bit"
103 * version was observed to run ~75% faster, closer to 100% for
104 * commercial compilers... Yet the "4-bit" procedure is preferred,
105 * because it is believed to provide a better security/performance
106 * balance and adequate all-round performance. "All-round" refers to:
107 *
108 * - shorter setup time effectively improves overall timing for
109 * handling short messages;
110 * - a larger table allocation can become unbearable because of VM
111 * subsystem penalties (for example on Windows a large enough free
112 * results in VM working set trimming, meaning that a subsequent
113 * malloc would immediately incur working set expansion);
114 * - a larger table has a larger cache footprint, which can affect
115 * the performance of other code paths (not necessarily even from
116 * the same thread in a Hyper-Threading world);
117 * A value of 1 is not appropriate for performance reasons.
118 */
119#if TABLE_BITS==8
120
121static void gcm_init_8bit(u128 Htable[256], u64 H[2])
122{
123 int i, j;
124 u128 V;
125
126 Htable[0].hi = 0;
127 Htable[0].lo = 0;
128 V.hi = H[0];
129 V.lo = H[1];
130
131 for (Htable[128]=V, i=64; i>0; i>>=1) {
132 REDUCE1BIT(V);
133 Htable[i] = V;
134 }
135
136 for (i=2; i<256; i<<=1) {
137 u128 *Hi = Htable+i, H0 = *Hi;
138 for (j=1; j<i; ++j) {
139 Hi[j].hi = H0.hi^Htable[j].hi;
140 Hi[j].lo = H0.lo^Htable[j].lo;
141 }
142 }
143}
144
145static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
146{
147 u128 Z = { 0, 0};
148 const u8 *xi = (const u8 *)Xi+15;
149 size_t rem, n = *xi;
150 static const size_t rem_8bit[256] = {
151 PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
152 PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
153 PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
154 PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
155 PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
156 PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
157 PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
158 PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
159 PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
160 PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
161 PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
162 PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
163 PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
164 PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
165 PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
166 PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
167 PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
168 PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
169 PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
170 PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
171 PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
172 PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
173 PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
174 PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
175 PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
176 PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
177 PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
178 PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
179 PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
180 PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
181 PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
182 PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
183 PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
184 PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
185 PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
186 PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
187 PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
188 PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
189 PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
190 PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
191 PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
192 PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
193 PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
194 PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
195 PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
196 PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
197 PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
198 PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
199 PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
200 PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
201 PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
202 PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
203 PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
204 PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
205 PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
206 PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
207 PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
208 PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
209 PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
210 PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
211 PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
212 PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
213 PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
214 PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
215
216 while (1) {
217 Z.hi ^= Htable[n].hi;
218 Z.lo ^= Htable[n].lo;
219
220 if ((u8 *)Xi==xi) break;
221
222 n = *(--xi);
223
224 rem = (size_t)Z.lo&0xff;
225 Z.lo = (Z.hi<<56)|(Z.lo>>8);
226 Z.hi = (Z.hi>>8);
227 if (sizeof(size_t)==8)
228 Z.hi ^= rem_8bit[rem];
229 else
230 Z.hi ^= (u64)rem_8bit[rem]<<32;
231 }
232
233 if (BYTE_ORDER == LITTLE_ENDIAN) {
234#ifdef BSWAP8
235 Xi[0] = BSWAP8(Z.hi);
236 Xi[1] = BSWAP8(Z.lo);
237#else
238 u8 *p = (u8 *)Xi;
239 u32 v;
240 v = (u32)(Z.hi>>32); PUTU32(p,v);
241 v = (u32)(Z.hi); PUTU32(p+4,v);
242 v = (u32)(Z.lo>>32); PUTU32(p+8,v);
243 v = (u32)(Z.lo); PUTU32(p+12,v);
244#endif
245 }
246 else {
247 Xi[0] = Z.hi;
248 Xi[1] = Z.lo;
249 }
250}
251#define GCM_MUL(ctx,Xi) gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
252
253#elif TABLE_BITS==4
254
255static void gcm_init_4bit(u128 Htable[16], u64 H[2])
256{
257 u128 V;
258#if defined(OPENSSL_SMALL_FOOTPRINT)
259 int i;
260#endif
261
262 Htable[0].hi = 0;
263 Htable[0].lo = 0;
264 V.hi = H[0];
265 V.lo = H[1];
266
267#if defined(OPENSSL_SMALL_FOOTPRINT)
268 for (Htable[8]=V, i=4; i>0; i>>=1) {
269 REDUCE1BIT(V);
270 Htable[i] = V;
271 }
272
273 for (i=2; i<16; i<<=1) {
274 u128 *Hi = Htable+i;
275 int j;
276 for (V=*Hi, j=1; j<i; ++j) {
277 Hi[j].hi = V.hi^Htable[j].hi;
278 Hi[j].lo = V.lo^Htable[j].lo;
279 }
280 }
281#else
282 Htable[8] = V;
283 REDUCE1BIT(V);
284 Htable[4] = V;
285 REDUCE1BIT(V);
286 Htable[2] = V;
287 REDUCE1BIT(V);
288 Htable[1] = V;
289 Htable[3].hi = V.hi^Htable[2].hi, Htable[3].lo = V.lo^Htable[2].lo;
290 V=Htable[4];
291 Htable[5].hi = V.hi^Htable[1].hi, Htable[5].lo = V.lo^Htable[1].lo;
292 Htable[6].hi = V.hi^Htable[2].hi, Htable[6].lo = V.lo^Htable[2].lo;
293 Htable[7].hi = V.hi^Htable[3].hi, Htable[7].lo = V.lo^Htable[3].lo;
294 V=Htable[8];
295 Htable[9].hi = V.hi^Htable[1].hi, Htable[9].lo = V.lo^Htable[1].lo;
296 Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
297 Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
298 Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
299 Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
300 Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
301 Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
302#endif
303#if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
304 /*
305 * ARM assembler expects specific dword order in Htable.
306 */
307 {
308 int j;
309
310 if (BYTE_ORDER == LITTLE_ENDIAN)
311 for (j=0;j<16;++j) {
312 V = Htable[j];
313 Htable[j].hi = V.lo;
314 Htable[j].lo = V.hi;
315 }
316 else
317 for (j=0;j<16;++j) {
318 V = Htable[j];
319 Htable[j].hi = V.lo<<32|V.lo>>32;
320 Htable[j].lo = V.hi<<32|V.hi>>32;
321 }
322 }
323#endif
324}
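
A sketch of the invariant established above (in the same bit-reflected representation): each table entry is H multiplied by its index read as a 4-term polynomial, which is what gcm_gmult_4bit and gcm_ghash_4bit below rely on when consuming Xi one nibble at a time:

	/*
	 * For n = 8*n3 + 4*n2 + 2*n1 + n0 (each ni in {0,1}):
	 *
	 *	Htable[n] = (n3 + n2*x + n1*x^2 + n0*x^3) * H
	 *
	 * e.g. Htable[8] = H, Htable[4] = H*x, Htable[2] = H*x^2,
	 * Htable[1] = H*x^3.
	 */
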
325
326#ifndef GHASH_ASM
327static const size_t rem_4bit[16] = {
328 PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
329 PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
330 PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
331 PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
332
333static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
334{
335 u128 Z;
336 int cnt = 15;
337 size_t rem, nlo, nhi;
338
339 nlo = ((const u8 *)Xi)[15];
340 nhi = nlo>>4;
341 nlo &= 0xf;
342
343 Z.hi = Htable[nlo].hi;
344 Z.lo = Htable[nlo].lo;
345
346 while (1) {
347 rem = (size_t)Z.lo&0xf;
348 Z.lo = (Z.hi<<60)|(Z.lo>>4);
349 Z.hi = (Z.hi>>4);
350 if (sizeof(size_t)==8)
351 Z.hi ^= rem_4bit[rem];
352 else
353 Z.hi ^= (u64)rem_4bit[rem]<<32;
354
355 Z.hi ^= Htable[nhi].hi;
356 Z.lo ^= Htable[nhi].lo;
357
358 if (--cnt<0) break;
359
360 nlo = ((const u8 *)Xi)[cnt];
361 nhi = nlo>>4;
362 nlo &= 0xf;
363
364 rem = (size_t)Z.lo&0xf;
365 Z.lo = (Z.hi<<60)|(Z.lo>>4);
366 Z.hi = (Z.hi>>4);
367 if (sizeof(size_t)==8)
368 Z.hi ^= rem_4bit[rem];
369 else
370 Z.hi ^= (u64)rem_4bit[rem]<<32;
371
372 Z.hi ^= Htable[nlo].hi;
373 Z.lo ^= Htable[nlo].lo;
374 }
375
376 if (BYTE_ORDER == LITTLE_ENDIAN) {
377#ifdef BSWAP8
378 Xi[0] = BSWAP8(Z.hi);
379 Xi[1] = BSWAP8(Z.lo);
380#else
381 u8 *p = (u8 *)Xi;
382 u32 v;
383 v = (u32)(Z.hi>>32); PUTU32(p,v);
384 v = (u32)(Z.hi); PUTU32(p+4,v);
385 v = (u32)(Z.lo>>32); PUTU32(p+8,v);
386 v = (u32)(Z.lo); PUTU32(p+12,v);
387#endif
388 }
389 else {
390 Xi[0] = Z.hi;
391 Xi[1] = Z.lo;
392 }
393}
394
395#if !defined(OPENSSL_SMALL_FOOTPRINT)
396/*
397 * Streamed version of gcm_gmult_4bit; see CRYPTO_gcm128_[en|de]crypt
398 * for details... Compiler-generated code doesn't seem to give any
399 * performance improvement, at least not on x86[_64]. It's here
400 * mostly as a reference and a placeholder for possible future
401 * non-trivial optimization[s]...
402 */
403static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
404 const u8 *inp,size_t len)
405{
406 u128 Z;
407 int cnt;
408 size_t rem, nlo, nhi;
409
410#if 1
411 do {
412 cnt = 15;
413 nlo = ((const u8 *)Xi)[15];
414 nlo ^= inp[15];
415 nhi = nlo>>4;
416 nlo &= 0xf;
417
418 Z.hi = Htable[nlo].hi;
419 Z.lo = Htable[nlo].lo;
420
421 while (1) {
422 rem = (size_t)Z.lo&0xf;
423 Z.lo = (Z.hi<<60)|(Z.lo>>4);
424 Z.hi = (Z.hi>>4);
425 if (sizeof(size_t)==8)
426 Z.hi ^= rem_4bit[rem];
427 else
428 Z.hi ^= (u64)rem_4bit[rem]<<32;
429
430 Z.hi ^= Htable[nhi].hi;
431 Z.lo ^= Htable[nhi].lo;
432
433 if (--cnt<0) break;
434
435 nlo = ((const u8 *)Xi)[cnt];
436 nlo ^= inp[cnt];
437 nhi = nlo>>4;
438 nlo &= 0xf;
439
440 rem = (size_t)Z.lo&0xf;
441 Z.lo = (Z.hi<<60)|(Z.lo>>4);
442 Z.hi = (Z.hi>>4);
443 if (sizeof(size_t)==8)
444 Z.hi ^= rem_4bit[rem];
445 else
446 Z.hi ^= (u64)rem_4bit[rem]<<32;
447
448 Z.hi ^= Htable[nlo].hi;
449 Z.lo ^= Htable[nlo].lo;
450 }
451#else
452 /*
453 * Extra 256+16 bytes per-key plus 512 bytes shared tables
454 * [should] give ~50% improvement... One could have PACK()-ed
455 * the rem_8bit even here, but the priority is to minimize
456 * cache footprint...
457 */
458 u128 Hshr4[16]; /* Htable shifted right by 4 bits */
459 u8 Hshl4[16]; /* Htable shifted left by 4 bits */
460 static const unsigned short rem_8bit[256] = {
461 0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
462 0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
463 0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
464 0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
465 0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
466 0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
467 0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
468 0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
469 0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
470 0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
471 0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
472 0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
473 0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
474 0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
475 0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
476 0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
477 0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
478 0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
479 0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
480 0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
481 0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
482 0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
483 0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
484 0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
485 0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
486 0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
487 0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
488 0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
489 0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
490 0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
491 0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
492 0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
493	/*
494	 * This pre-processing phase slows the procedure down by about as
495	 * much time as it makes each loop iteration faster. In other words,
496	 * single-block performance is approximately the same as for the
497	 * straightforward "4-bit" implementation; beyond that it only gets faster...
498	 */
499 for (cnt=0; cnt<16; ++cnt) {
500 Z.hi = Htable[cnt].hi;
501 Z.lo = Htable[cnt].lo;
502 Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
503 Hshr4[cnt].hi = (Z.hi>>4);
504 Hshl4[cnt] = (u8)(Z.lo<<4);
505 }
506
507 do {
508 for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
509 nlo = ((const u8 *)Xi)[cnt];
510 nlo ^= inp[cnt];
511 nhi = nlo>>4;
512 nlo &= 0xf;
513
514 Z.hi ^= Htable[nlo].hi;
515 Z.lo ^= Htable[nlo].lo;
516
517 rem = (size_t)Z.lo&0xff;
518
519 Z.lo = (Z.hi<<56)|(Z.lo>>8);
520 Z.hi = (Z.hi>>8);
521
522 Z.hi ^= Hshr4[nhi].hi;
523 Z.lo ^= Hshr4[nhi].lo;
524 Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
525 }
526
527 nlo = ((const u8 *)Xi)[0];
528 nlo ^= inp[0];
529 nhi = nlo>>4;
530 nlo &= 0xf;
531
532 Z.hi ^= Htable[nlo].hi;
533 Z.lo ^= Htable[nlo].lo;
534
535 rem = (size_t)Z.lo&0xf;
536
537 Z.lo = (Z.hi<<60)|(Z.lo>>4);
538 Z.hi = (Z.hi>>4);
539
540 Z.hi ^= Htable[nhi].hi;
541 Z.lo ^= Htable[nhi].lo;
542 Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
543#endif
544
545 if (BYTE_ORDER == LITTLE_ENDIAN) {
546#ifdef BSWAP8
547 Xi[0] = BSWAP8(Z.hi);
548 Xi[1] = BSWAP8(Z.lo);
549#else
550 u8 *p = (u8 *)Xi;
551 u32 v;
552 v = (u32)(Z.hi>>32); PUTU32(p,v);
553 v = (u32)(Z.hi); PUTU32(p+4,v);
554 v = (u32)(Z.lo>>32); PUTU32(p+8,v);
555 v = (u32)(Z.lo); PUTU32(p+12,v);
556#endif
557 }
558 else {
559 Xi[0] = Z.hi;
560 Xi[1] = Z.lo;
561 }
562 } while (inp+=16, len-=16);
563}
564#endif
565#else
566void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
567void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
568#endif
569
570#define GCM_MUL(ctx,Xi) gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
571#if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
572#define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
573/* GHASH_CHUNK is "stride parameter" missioned to mitigate cache
574 * trashing effect. In other words idea is to hash data while it's
575 * still in L1 cache after encryption pass... */
576#define GHASH_CHUNK (3*1024)
577#endif
578
579#else /* TABLE_BITS */
580
581static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
582{
583 u128 V,Z = { 0,0 };
584 long X;
585 int i,j;
586 const long *xi = (const long *)Xi;
587
588 V.hi = H[0]; /* H is in host byte order, no byte swapping */
589 V.lo = H[1];
590
591 for (j=0; j<16/sizeof(long); ++j) {
592 if (BYTE_ORDER == LITTLE_ENDIAN) {
593 if (sizeof(long)==8) {
594#ifdef BSWAP8
595 X = (long)(BSWAP8(xi[j]));
596#else
597 const u8 *p = (const u8 *)(xi+j);
598 X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
599#endif
600 }
601 else {
602 const u8 *p = (const u8 *)(xi+j);
603 X = (long)GETU32(p);
604 }
605 }
606 else
607 X = xi[j];
608
609 for (i=0; i<8*sizeof(long); ++i, X<<=1) {
610 u64 M = (u64)(X>>(8*sizeof(long)-1));
611 Z.hi ^= V.hi&M;
612 Z.lo ^= V.lo&M;
613
614 REDUCE1BIT(V);
615 }
616 }
617
618 if (BYTE_ORDER == LITTLE_ENDIAN) {
619#ifdef BSWAP8
620 Xi[0] = BSWAP8(Z.hi);
621 Xi[1] = BSWAP8(Z.lo);
622#else
623 u8 *p = (u8 *)Xi;
624 u32 v;
625 v = (u32)(Z.hi>>32); PUTU32(p,v);
626 v = (u32)(Z.hi); PUTU32(p+4,v);
627 v = (u32)(Z.lo>>32); PUTU32(p+8,v);
628 v = (u32)(Z.lo); PUTU32(p+12,v);
629#endif
630 }
631 else {
632 Xi[0] = Z.hi;
633 Xi[1] = Z.lo;
634 }
635}
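
The bit-serial loop above is the schoolbook shift-and-add multiplication in GF(2^128): each set bit of Xi contributes the corresponding x-power multiple of H to the product, roughly:

	/*
	 * Z = XOR over all i with X_i == 1 of (H * x^i),  i = 0..127,
	 *
	 * where V runs through H, H*x, H*x^2, ... via REDUCE1BIT and the
	 * mask M selects V whenever the next bit of X (taken in processing
	 * order) is set.
	 */
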
636#define GCM_MUL(ctx,Xi) gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
637
638#endif
639
640#if TABLE_BITS==4 && defined(GHASH_ASM)
641# if !defined(I386_ONLY) && \
642 (defined(__i386) || defined(__i386__) || \
643 defined(__x86_64) || defined(__x86_64__) || \
644 defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
645# define GHASH_ASM_X86_OR_64
646# define GCM_FUNCREF_4BIT
647extern unsigned int OPENSSL_ia32cap_P[2];
648
649void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
650void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
651void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
652
653# if defined(__i386) || defined(__i386__) || defined(_M_IX86)
654# define GHASH_ASM_X86
655void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
656void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
657
658void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
659void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
660# endif
661# elif defined(__arm__) || defined(__arm)
662# include "arm_arch.h"
663# if __ARM_ARCH__>=7
664# define GHASH_ASM_ARM
665# define GCM_FUNCREF_4BIT
666void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
667void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
668# endif
669# endif
670#endif
671
672#ifdef GCM_FUNCREF_4BIT
673# undef GCM_MUL
674# define GCM_MUL(ctx,Xi) (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
675# ifdef GHASH
676# undef GHASH
677# define GHASH(ctx,in,len) (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
678# endif
679#endif
680
681void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
682{
683 memset(ctx,0,sizeof(*ctx));
684 ctx->block = block;
685 ctx->key = key;
686
687 (*block)(ctx->H.c,ctx->H.c,key);
688
689 if (BYTE_ORDER == LITTLE_ENDIAN) {
690 /* H is stored in host byte order */
691#ifdef BSWAP8
692 ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
693 ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
694#else
695 u8 *p = ctx->H.c;
696 u64 hi,lo;
697 hi = (u64)GETU32(p) <<32|GETU32(p+4);
698 lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
699 ctx->H.u[0] = hi;
700 ctx->H.u[1] = lo;
701#endif
702 }
703
704#if TABLE_BITS==8
705 gcm_init_8bit(ctx->Htable,ctx->H.u);
706#elif TABLE_BITS==4
707# if defined(GHASH_ASM_X86_OR_64)
708# if !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
709 if (OPENSSL_ia32cap_P[0]&(1<<24) && /* check FXSR bit */
710 OPENSSL_ia32cap_P[1]&(1<<1) ) { /* check PCLMULQDQ bit */
711 gcm_init_clmul(ctx->Htable,ctx->H.u);
712 ctx->gmult = gcm_gmult_clmul;
713 ctx->ghash = gcm_ghash_clmul;
714 return;
715 }
716# endif
717 gcm_init_4bit(ctx->Htable,ctx->H.u);
718# if defined(GHASH_ASM_X86) /* x86 only */
719# if defined(OPENSSL_IA32_SSE2)
720 if (OPENSSL_ia32cap_P[0]&(1<<25)) { /* check SSE bit */
721# else
722 if (OPENSSL_ia32cap_P[0]&(1<<23)) { /* check MMX bit */
723# endif
724 ctx->gmult = gcm_gmult_4bit_mmx;
725 ctx->ghash = gcm_ghash_4bit_mmx;
726 } else {
727 ctx->gmult = gcm_gmult_4bit_x86;
728 ctx->ghash = gcm_ghash_4bit_x86;
729 }
730# else
731 ctx->gmult = gcm_gmult_4bit;
732 ctx->ghash = gcm_ghash_4bit;
733# endif
734# elif defined(GHASH_ASM_ARM)
735 if (OPENSSL_armcap_P & ARMV7_NEON) {
736 ctx->gmult = gcm_gmult_neon;
737 ctx->ghash = gcm_ghash_neon;
738 } else {
739 gcm_init_4bit(ctx->Htable,ctx->H.u);
740 ctx->gmult = gcm_gmult_4bit;
741 ctx->ghash = gcm_ghash_4bit;
742 }
743# else
744 gcm_init_4bit(ctx->Htable,ctx->H.u);
745# endif
746#endif
747}
748
749void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
750{
751 unsigned int ctr;
752#ifdef GCM_FUNCREF_4BIT
753 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
754#endif
755
756 ctx->Yi.u[0] = 0;
757 ctx->Yi.u[1] = 0;
758 ctx->Xi.u[0] = 0;
759 ctx->Xi.u[1] = 0;
760 ctx->len.u[0] = 0; /* AAD length */
761 ctx->len.u[1] = 0; /* message length */
762 ctx->ares = 0;
763 ctx->mres = 0;
764
765 if (len==12) {
766 memcpy(ctx->Yi.c,iv,12);
767 ctx->Yi.c[15]=1;
768 ctr=1;
769 }
770 else {
771 size_t i;
772 u64 len0 = len;
773
774 while (len>=16) {
775 for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
776 GCM_MUL(ctx,Yi);
777 iv += 16;
778 len -= 16;
779 }
780 if (len) {
781 for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
782 GCM_MUL(ctx,Yi);
783 }
784 len0 <<= 3;
785 if (BYTE_ORDER == LITTLE_ENDIAN) {
786#ifdef BSWAP8
787 ctx->Yi.u[1] ^= BSWAP8(len0);
788#else
789 ctx->Yi.c[8] ^= (u8)(len0>>56);
790 ctx->Yi.c[9] ^= (u8)(len0>>48);
791 ctx->Yi.c[10] ^= (u8)(len0>>40);
792 ctx->Yi.c[11] ^= (u8)(len0>>32);
793 ctx->Yi.c[12] ^= (u8)(len0>>24);
794 ctx->Yi.c[13] ^= (u8)(len0>>16);
795 ctx->Yi.c[14] ^= (u8)(len0>>8);
796 ctx->Yi.c[15] ^= (u8)(len0);
797#endif
798 }
799 else
800 ctx->Yi.u[1] ^= len0;
801
802 GCM_MUL(ctx,Yi);
803
804 if (BYTE_ORDER == LITTLE_ENDIAN)
805#ifdef BSWAP4
806 ctr = BSWAP4(ctx->Yi.d[3]);
807#else
808 ctr = GETU32(ctx->Yi.c+12);
809#endif
810 else
811 ctr = ctx->Yi.d[3];
812 }
813
814 (*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
815 ++ctr;
816 if (BYTE_ORDER == LITTLE_ENDIAN)
817#ifdef BSWAP4
818 ctx->Yi.d[3] = BSWAP4(ctr);
819#else
820 PUTU32(ctx->Yi.c+12,ctr);
821#endif
822 else
823 ctx->Yi.d[3] = ctr;
824}
825
826int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
827{
828 size_t i;
829 unsigned int n;
830 u64 alen = ctx->len.u[0];
831#ifdef GCM_FUNCREF_4BIT
832 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
833# ifdef GHASH
834 void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
835 const u8 *inp,size_t len) = ctx->ghash;
836# endif
837#endif
838
839 if (ctx->len.u[1]) return -2;
840
841 alen += len;
842 if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
843 return -1;
844 ctx->len.u[0] = alen;
845
846 n = ctx->ares;
847 if (n) {
848 while (n && len) {
849 ctx->Xi.c[n] ^= *(aad++);
850 --len;
851 n = (n+1)%16;
852 }
853 if (n==0) GCM_MUL(ctx,Xi);
854 else {
855 ctx->ares = n;
856 return 0;
857 }
858 }
859
860#ifdef GHASH
861 if ((i = (len&(size_t)-16))) {
862 GHASH(ctx,aad,i);
863 aad += i;
864 len -= i;
865 }
866#else
867 while (len>=16) {
868 for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
869 GCM_MUL(ctx,Xi);
870 aad += 16;
871 len -= 16;
872 }
873#endif
874 if (len) {
875 n = (unsigned int)len;
876 for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
877 }
878
879 ctx->ares = n;
880 return 0;
881}
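
The length checks in this function and in the encrypt/decrypt paths below correspond to the per-invocation limits from the GCM specification, expressed here in bytes:

	/*
	 * AAD:       alen <= 2^61 bytes       (~ 2^64 bits)
	 * plaintext: mlen <= 2^36 - 32 bytes  (= 2^39 - 256 bits)
	 */
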
882
883int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
884 const unsigned char *in, unsigned char *out,
885 size_t len)
886{
887 unsigned int n, ctr;
888 size_t i;
889 u64 mlen = ctx->len.u[1];
890 block128_f block = ctx->block;
891 void *key = ctx->key;
892#ifdef GCM_FUNCREF_4BIT
893 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
894# ifdef GHASH
895 void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
896 const u8 *inp,size_t len) = ctx->ghash;
897# endif
898#endif
899
900 mlen += len;
901 if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
902 return -1;
903 ctx->len.u[1] = mlen;
904
905 if (ctx->ares) {
906 /* First call to encrypt finalizes GHASH(AAD) */
907 GCM_MUL(ctx,Xi);
908 ctx->ares = 0;
909 }
910
911 if (BYTE_ORDER == LITTLE_ENDIAN)
912#ifdef BSWAP4
913 ctr = BSWAP4(ctx->Yi.d[3]);
914#else
915 ctr = GETU32(ctx->Yi.c+12);
916#endif
917 else
918 ctr = ctx->Yi.d[3];
919
920 n = ctx->mres;
921#if !defined(OPENSSL_SMALL_FOOTPRINT)
922 if (16%sizeof(size_t) == 0) do { /* always true actually */
923 if (n) {
924 while (n && len) {
925 ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
926 --len;
927 n = (n+1)%16;
928 }
929 if (n==0) GCM_MUL(ctx,Xi);
930 else {
931 ctx->mres = n;
932 return 0;
933 }
934 }
935#ifdef __STRICT_ALIGNMENT
936 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
937 break;
938#endif
939#if defined(GHASH) && defined(GHASH_CHUNK)
940 while (len>=GHASH_CHUNK) {
941 size_t j=GHASH_CHUNK;
942
943 while (j) {
944 size_t *out_t=(size_t *)out;
945 const size_t *in_t=(const size_t *)in;
946
947 (*block)(ctx->Yi.c,ctx->EKi.c,key);
948 ++ctr;
949 if (BYTE_ORDER == LITTLE_ENDIAN)
950#ifdef BSWAP4
951 ctx->Yi.d[3] = BSWAP4(ctr);
952#else
953 PUTU32(ctx->Yi.c+12,ctr);
954#endif
955 else
956 ctx->Yi.d[3] = ctr;
957 for (i=0; i<16/sizeof(size_t); ++i)
958 out_t[i] = in_t[i] ^ ctx->EKi.t[i];
959 out += 16;
960 in += 16;
961 j -= 16;
962 }
963 GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
964 len -= GHASH_CHUNK;
965 }
966 if ((i = (len&(size_t)-16))) {
967 size_t j=i;
968
969 while (len>=16) {
970 size_t *out_t=(size_t *)out;
971 const size_t *in_t=(const size_t *)in;
972
973 (*block)(ctx->Yi.c,ctx->EKi.c,key);
974 ++ctr;
975 if (BYTE_ORDER == LITTLE_ENDIAN)
976#ifdef BSWAP4
977 ctx->Yi.d[3] = BSWAP4(ctr);
978#else
979 PUTU32(ctx->Yi.c+12,ctr);
980#endif
981 else
982 ctx->Yi.d[3] = ctr;
983 for (i=0; i<16/sizeof(size_t); ++i)
984 out_t[i] = in_t[i] ^ ctx->EKi.t[i];
985 out += 16;
986 in += 16;
987 len -= 16;
988 }
989 GHASH(ctx,out-j,j);
990 }
991#else
992 while (len>=16) {
993 size_t *out_t=(size_t *)out;
994 const size_t *in_t=(const size_t *)in;
995
996 (*block)(ctx->Yi.c,ctx->EKi.c,key);
997 ++ctr;
998 if (BYTE_ORDER == LITTLE_ENDIAN)
999#ifdef BSWAP4
1000 ctx->Yi.d[3] = BSWAP4(ctr);
1001#else
1002 PUTU32(ctx->Yi.c+12,ctr);
1003#endif
1004 else
1005 ctx->Yi.d[3] = ctr;
1006 for (i=0; i<16/sizeof(size_t); ++i)
1007 ctx->Xi.t[i] ^=
1008 out_t[i] = in_t[i]^ctx->EKi.t[i];
1009 GCM_MUL(ctx,Xi);
1010 out += 16;
1011 in += 16;
1012 len -= 16;
1013 }
1014#endif
1015 if (len) {
1016 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1017 ++ctr;
1018 if (BYTE_ORDER == LITTLE_ENDIAN)
1019#ifdef BSWAP4
1020 ctx->Yi.d[3] = BSWAP4(ctr);
1021#else
1022 PUTU32(ctx->Yi.c+12,ctr);
1023#endif
1024 else
1025 ctx->Yi.d[3] = ctr;
1026 while (len--) {
1027 ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
1028 ++n;
1029 }
1030 }
1031
1032 ctx->mres = n;
1033 return 0;
1034 } while(0);
1035#endif
1036 for (i=0;i<len;++i) {
1037 if (n==0) {
1038 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1039 ++ctr;
1040 if (BYTE_ORDER == LITTLE_ENDIAN)
1041#ifdef BSWAP4
1042 ctx->Yi.d[3] = BSWAP4(ctr);
1043#else
1044 PUTU32(ctx->Yi.c+12,ctr);
1045#endif
1046 else
1047 ctx->Yi.d[3] = ctr;
1048 }
1049 ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
1050 n = (n+1)%16;
1051 if (n==0)
1052 GCM_MUL(ctx,Xi);
1053 }
1054
1055 ctx->mres = n;
1056 return 0;
1057}
1058
1059int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
1060 const unsigned char *in, unsigned char *out,
1061 size_t len)
1062{
1063 unsigned int n, ctr;
1064 size_t i;
1065 u64 mlen = ctx->len.u[1];
1066 block128_f block = ctx->block;
1067 void *key = ctx->key;
1068#ifdef GCM_FUNCREF_4BIT
1069 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
1070# ifdef GHASH
1071 void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1072 const u8 *inp,size_t len) = ctx->ghash;
1073# endif
1074#endif
1075
1076 mlen += len;
1077 if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1078 return -1;
1079 ctx->len.u[1] = mlen;
1080
1081 if (ctx->ares) {
1082 /* First call to decrypt finalizes GHASH(AAD) */
1083 GCM_MUL(ctx,Xi);
1084 ctx->ares = 0;
1085 }
1086
1087 if (BYTE_ORDER == LITTLE_ENDIAN)
1088#ifdef BSWAP4
1089 ctr = BSWAP4(ctx->Yi.d[3]);
1090#else
1091 ctr = GETU32(ctx->Yi.c+12);
1092#endif
1093 else
1094 ctr = ctx->Yi.d[3];
1095
1096 n = ctx->mres;
1097#if !defined(OPENSSL_SMALL_FOOTPRINT)
1098 if (16%sizeof(size_t) == 0) do { /* always true actually */
1099 if (n) {
1100 while (n && len) {
1101 u8 c = *(in++);
1102 *(out++) = c^ctx->EKi.c[n];
1103 ctx->Xi.c[n] ^= c;
1104 --len;
1105 n = (n+1)%16;
1106 }
1107 if (n==0) GCM_MUL (ctx,Xi);
1108 else {
1109 ctx->mres = n;
1110 return 0;
1111 }
1112 }
1113#ifdef __STRICT_ALIGNMENT
1114 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
1115 break;
1116#endif
1117#if defined(GHASH) && defined(GHASH_CHUNK)
1118 while (len>=GHASH_CHUNK) {
1119 size_t j=GHASH_CHUNK;
1120
1121 GHASH(ctx,in,GHASH_CHUNK);
1122 while (j) {
1123 size_t *out_t=(size_t *)out;
1124 const size_t *in_t=(const size_t *)in;
1125
1126 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1127 ++ctr;
1128 if (BYTE_ORDER == LITTLE_ENDIAN)
1129#ifdef BSWAP4
1130 ctx->Yi.d[3] = BSWAP4(ctr);
1131#else
1132 PUTU32(ctx->Yi.c+12,ctr);
1133#endif
1134 else
1135 ctx->Yi.d[3] = ctr;
1136 for (i=0; i<16/sizeof(size_t); ++i)
1137 out_t[i] = in_t[i]^ctx->EKi.t[i];
1138 out += 16;
1139 in += 16;
1140 j -= 16;
1141 }
1142 len -= GHASH_CHUNK;
1143 }
1144 if ((i = (len&(size_t)-16))) {
1145 GHASH(ctx,in,i);
1146 while (len>=16) {
1147 size_t *out_t=(size_t *)out;
1148 const size_t *in_t=(const size_t *)in;
1149
1150 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1151 ++ctr;
1152 if (BYTE_ORDER == LITTLE_ENDIAN)
1153#ifdef BSWAP4
1154 ctx->Yi.d[3] = BSWAP4(ctr);
1155#else
1156 PUTU32(ctx->Yi.c+12,ctr);
1157#endif
1158 else
1159 ctx->Yi.d[3] = ctr;
1160 for (i=0; i<16/sizeof(size_t); ++i)
1161 out_t[i] = in_t[i]^ctx->EKi.t[i];
1162 out += 16;
1163 in += 16;
1164 len -= 16;
1165 }
1166 }
1167#else
1168 while (len>=16) {
1169 size_t *out_t=(size_t *)out;
1170 const size_t *in_t=(const size_t *)in;
1171
1172 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1173 ++ctr;
1174 if (BYTE_ORDER == LITTLE_ENDIAN)
1175#ifdef BSWAP4
1176 ctx->Yi.d[3] = BSWAP4(ctr);
1177#else
1178 PUTU32(ctx->Yi.c+12,ctr);
1179#endif
1180 else
1181 ctx->Yi.d[3] = ctr;
1182 for (i=0; i<16/sizeof(size_t); ++i) {
1183 size_t c = in[i];
1184 out[i] = c^ctx->EKi.t[i];
1185 ctx->Xi.t[i] ^= c;
1186 }
1187 GCM_MUL(ctx,Xi);
1188 out += 16;
1189 in += 16;
1190 len -= 16;
1191 }
1192#endif
1193 if (len) {
1194 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1195 ++ctr;
1196 if (BYTE_ORDER == LITTLE_ENDIAN)
1197#ifdef BSWAP4
1198 ctx->Yi.d[3] = BSWAP4(ctr);
1199#else
1200 PUTU32(ctx->Yi.c+12,ctr);
1201#endif
1202 else
1203 ctx->Yi.d[3] = ctr;
1204 while (len--) {
1205 u8 c = in[n];
1206 ctx->Xi.c[n] ^= c;
1207 out[n] = c^ctx->EKi.c[n];
1208 ++n;
1209 }
1210 }
1211
1212 ctx->mres = n;
1213 return 0;
1214 } while(0);
1215#endif
1216 for (i=0;i<len;++i) {
1217 u8 c;
1218 if (n==0) {
1219 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1220 ++ctr;
1221 if (BYTE_ORDER == LITTLE_ENDIAN)
1222#ifdef BSWAP4
1223 ctx->Yi.d[3] = BSWAP4(ctr);
1224#else
1225 PUTU32(ctx->Yi.c+12,ctr);
1226#endif
1227 else
1228 ctx->Yi.d[3] = ctr;
1229 }
1230 c = in[i];
1231 out[i] = c^ctx->EKi.c[n];
1232 ctx->Xi.c[n] ^= c;
1233 n = (n+1)%16;
1234 if (n==0)
1235 GCM_MUL(ctx,Xi);
1236 }
1237
1238 ctx->mres = n;
1239 return 0;
1240}
1241
1242int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
1243 const unsigned char *in, unsigned char *out,
1244 size_t len, ctr128_f stream)
1245{
1246 unsigned int n, ctr;
1247 size_t i;
1248 u64 mlen = ctx->len.u[1];
1249 void *key = ctx->key;
1250#ifdef GCM_FUNCREF_4BIT
1251 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
1252# ifdef GHASH
1253 void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1254 const u8 *inp,size_t len) = ctx->ghash;
1255# endif
1256#endif
1257
1258 mlen += len;
1259 if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1260 return -1;
1261 ctx->len.u[1] = mlen;
1262
1263 if (ctx->ares) {
1264 /* First call to encrypt finalizes GHASH(AAD) */
1265 GCM_MUL(ctx,Xi);
1266 ctx->ares = 0;
1267 }
1268
1269 if (BYTE_ORDER == LITTLE_ENDIAN)
1270#ifdef BSWAP4
1271 ctr = BSWAP4(ctx->Yi.d[3]);
1272#else
1273 ctr = GETU32(ctx->Yi.c+12);
1274#endif
1275 else
1276 ctr = ctx->Yi.d[3];
1277
1278 n = ctx->mres;
1279 if (n) {
1280 while (n && len) {
1281 ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
1282 --len;
1283 n = (n+1)%16;
1284 }
1285 if (n==0) GCM_MUL(ctx,Xi);
1286 else {
1287 ctx->mres = n;
1288 return 0;
1289 }
1290 }
1291#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1292 while (len>=GHASH_CHUNK) {
1293 (*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
1294 ctr += GHASH_CHUNK/16;
1295 if (BYTE_ORDER == LITTLE_ENDIAN)
1296#ifdef BSWAP4
1297 ctx->Yi.d[3] = BSWAP4(ctr);
1298#else
1299 PUTU32(ctx->Yi.c+12,ctr);
1300#endif
1301 else
1302 ctx->Yi.d[3] = ctr;
1303 GHASH(ctx,out,GHASH_CHUNK);
1304 out += GHASH_CHUNK;
1305 in += GHASH_CHUNK;
1306 len -= GHASH_CHUNK;
1307 }
1308#endif
1309 if ((i = (len&(size_t)-16))) {
1310 size_t j=i/16;
1311
1312 (*stream)(in,out,j,key,ctx->Yi.c);
1313 ctr += (unsigned int)j;
1314 if (BYTE_ORDER == LITTLE_ENDIAN)
1315#ifdef BSWAP4
1316 ctx->Yi.d[3] = BSWAP4(ctr);
1317#else
1318 PUTU32(ctx->Yi.c+12,ctr);
1319#endif
1320 else
1321 ctx->Yi.d[3] = ctr;
1322 in += i;
1323 len -= i;
1324#if defined(GHASH)
1325 GHASH(ctx,out,i);
1326 out += i;
1327#else
1328 while (j--) {
1329 for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
1330 GCM_MUL(ctx,Xi);
1331 out += 16;
1332 }
1333#endif
1334 }
1335 if (len) {
1336 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
1337 ++ctr;
1338 if (BYTE_ORDER == LITTLE_ENDIAN)
1339#ifdef BSWAP4
1340 ctx->Yi.d[3] = BSWAP4(ctr);
1341#else
1342 PUTU32(ctx->Yi.c+12,ctr);
1343#endif
1344 else
1345 ctx->Yi.d[3] = ctr;
1346 while (len--) {
1347 ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
1348 ++n;
1349 }
1350 }
1351
1352 ctx->mres = n;
1353 return 0;
1354}
1355
1356int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
1357 const unsigned char *in, unsigned char *out,
1358 size_t len,ctr128_f stream)
1359{
1360 unsigned int n, ctr;
1361 size_t i;
1362 u64 mlen = ctx->len.u[1];
1363 void *key = ctx->key;
1364#ifdef GCM_FUNCREF_4BIT
1365 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
1366# ifdef GHASH
1367 void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1368 const u8 *inp,size_t len) = ctx->ghash;
1369# endif
1370#endif
1371
1372 mlen += len;
1373 if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1374 return -1;
1375 ctx->len.u[1] = mlen;
1376
1377 if (ctx->ares) {
1378 /* First call to decrypt finalizes GHASH(AAD) */
1379 GCM_MUL(ctx,Xi);
1380 ctx->ares = 0;
1381 }
1382
1383 if (BYTE_ORDER == LITTLE_ENDIAN)
1384#ifdef BSWAP4
1385 ctr = BSWAP4(ctx->Yi.d[3]);
1386#else
1387 ctr = GETU32(ctx->Yi.c+12);
1388#endif
1389 else
1390 ctr = ctx->Yi.d[3];
1391
1392 n = ctx->mres;
1393 if (n) {
1394 while (n && len) {
1395 u8 c = *(in++);
1396 *(out++) = c^ctx->EKi.c[n];
1397 ctx->Xi.c[n] ^= c;
1398 --len;
1399 n = (n+1)%16;
1400 }
1401 if (n==0) GCM_MUL (ctx,Xi);
1402 else {
1403 ctx->mres = n;
1404 return 0;
1405 }
1406 }
1407#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1408 while (len>=GHASH_CHUNK) {
1409 GHASH(ctx,in,GHASH_CHUNK);
1410 (*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
1411 ctr += GHASH_CHUNK/16;
1412 if (BYTE_ORDER == LITTLE_ENDIAN)
1413#ifdef BSWAP4
1414 ctx->Yi.d[3] = BSWAP4(ctr);
1415#else
1416 PUTU32(ctx->Yi.c+12,ctr);
1417#endif
1418 else
1419 ctx->Yi.d[3] = ctr;
1420 out += GHASH_CHUNK;
1421 in += GHASH_CHUNK;
1422 len -= GHASH_CHUNK;
1423 }
1424#endif
1425 if ((i = (len&(size_t)-16))) {
1426 size_t j=i/16;
1427
1428#if defined(GHASH)
1429 GHASH(ctx,in,i);
1430#else
1431 while (j--) {
1432 size_t k;
1433 for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
1434 GCM_MUL(ctx,Xi);
1435 in += 16;
1436 }
1437 j = i/16;
1438 in -= i;
1439#endif
1440 (*stream)(in,out,j,key,ctx->Yi.c);
1441 ctr += (unsigned int)j;
1442 if (BYTE_ORDER == LITTLE_ENDIAN)
1443#ifdef BSWAP4
1444 ctx->Yi.d[3] = BSWAP4(ctr);
1445#else
1446 PUTU32(ctx->Yi.c+12,ctr);
1447#endif
1448 else
1449 ctx->Yi.d[3] = ctr;
1450 out += i;
1451 in += i;
1452 len -= i;
1453 }
1454 if (len) {
1455 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
1456 ++ctr;
1457 if (BYTE_ORDER == LITTLE_ENDIAN)
1458#ifdef BSWAP4
1459 ctx->Yi.d[3] = BSWAP4(ctr);
1460#else
1461 PUTU32(ctx->Yi.c+12,ctr);
1462#endif
1463 else
1464 ctx->Yi.d[3] = ctr;
1465 while (len--) {
1466 u8 c = in[n];
1467 ctx->Xi.c[n] ^= c;
1468 out[n] = c^ctx->EKi.c[n];
1469 ++n;
1470 }
1471 }
1472
1473 ctx->mres = n;
1474 return 0;
1475}
1476
1477int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
1478 size_t len)
1479{
1480 u64 alen = ctx->len.u[0]<<3;
1481 u64 clen = ctx->len.u[1]<<3;
1482#ifdef GCM_FUNCREF_4BIT
1483 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
1484#endif
1485
1486 if (ctx->mres || ctx->ares)
1487 GCM_MUL(ctx,Xi);
1488
1489 if (BYTE_ORDER == LITTLE_ENDIAN) {
1490#ifdef BSWAP8
1491 alen = BSWAP8(alen);
1492 clen = BSWAP8(clen);
1493#else
1494 u8 *p = ctx->len.c;
1495
1496 ctx->len.u[0] = alen;
1497 ctx->len.u[1] = clen;
1498
1499 alen = (u64)GETU32(p) <<32|GETU32(p+4);
1500 clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
1501#endif
1502 }
1503
1504 ctx->Xi.u[0] ^= alen;
1505 ctx->Xi.u[1] ^= clen;
1506 GCM_MUL(ctx,Xi);
1507
1508 ctx->Xi.u[0] ^= ctx->EK0.u[0];
1509 ctx->Xi.u[1] ^= ctx->EK0.u[1];
1510
1511 if (tag && len<=sizeof(ctx->Xi))
1512 return memcmp(ctx->Xi.c,tag,len);
1513 else
1514 return -1;
1515}
1516
1517void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1518{
1519 CRYPTO_gcm128_finish(ctx, NULL, 0);
1520 memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
1521}
1522
1523GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
1524{
1525 GCM128_CONTEXT *ret;
1526
1527 if ((ret = malloc(sizeof(GCM128_CONTEXT))))
1528 CRYPTO_gcm128_init(ret,key,block);
1529
1530 return ret;
1531}
1532
1533void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1534{
1535 if (ctx) {
1536 explicit_bzero(ctx,sizeof(*ctx));
1537 free(ctx);
1538 }
1539}
diff --git a/src/lib/libcrypto/modes/modes.h b/src/lib/libcrypto/modes/modes.h
deleted file mode 100644
index a532cb3f41..0000000000
--- a/src/lib/libcrypto/modes/modes.h
+++ /dev/null
@@ -1,136 +0,0 @@
1/* $OpenBSD: modes.h,v 1.2 2014/06/12 15:49:30 deraadt Exp $ */
2/* ====================================================================
3 * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
4 *
5 * Rights for redistribution and usage in source and binary
6 * forms are granted according to the OpenSSL license.
7 */
8
9#include <stddef.h>
10
11typedef void (*block128_f)(const unsigned char in[16],
12 unsigned char out[16],
13 const void *key);
14
15typedef void (*cbc128_f)(const unsigned char *in, unsigned char *out,
16 size_t len, const void *key,
17 unsigned char ivec[16], int enc);
18
19typedef void (*ctr128_f)(const unsigned char *in, unsigned char *out,
20 size_t blocks, const void *key,
21 const unsigned char ivec[16]);
22
23typedef void (*ccm128_f)(const unsigned char *in, unsigned char *out,
24 size_t blocks, const void *key,
25 const unsigned char ivec[16],unsigned char cmac[16]);
26
27void CRYPTO_cbc128_encrypt(const unsigned char *in, unsigned char *out,
28 size_t len, const void *key,
29 unsigned char ivec[16], block128_f block);
30void CRYPTO_cbc128_decrypt(const unsigned char *in, unsigned char *out,
31 size_t len, const void *key,
32 unsigned char ivec[16], block128_f block);
33
34void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out,
35 size_t len, const void *key,
36 unsigned char ivec[16], unsigned char ecount_buf[16],
37 unsigned int *num, block128_f block);
38
39void CRYPTO_ctr128_encrypt_ctr32(const unsigned char *in, unsigned char *out,
40 size_t len, const void *key,
41 unsigned char ivec[16], unsigned char ecount_buf[16],
42 unsigned int *num, ctr128_f ctr);
43
44void CRYPTO_ofb128_encrypt(const unsigned char *in, unsigned char *out,
45 size_t len, const void *key,
46 unsigned char ivec[16], int *num,
47 block128_f block);
48
49void CRYPTO_cfb128_encrypt(const unsigned char *in, unsigned char *out,
50 size_t len, const void *key,
51 unsigned char ivec[16], int *num,
52 int enc, block128_f block);
53void CRYPTO_cfb128_8_encrypt(const unsigned char *in, unsigned char *out,
54 size_t length, const void *key,
55 unsigned char ivec[16], int *num,
56 int enc, block128_f block);
57void CRYPTO_cfb128_1_encrypt(const unsigned char *in, unsigned char *out,
58 size_t bits, const void *key,
59 unsigned char ivec[16], int *num,
60 int enc, block128_f block);
61
62size_t CRYPTO_cts128_encrypt_block(const unsigned char *in, unsigned char *out,
63 size_t len, const void *key,
64 unsigned char ivec[16], block128_f block);
65size_t CRYPTO_cts128_encrypt(const unsigned char *in, unsigned char *out,
66 size_t len, const void *key,
67 unsigned char ivec[16], cbc128_f cbc);
68size_t CRYPTO_cts128_decrypt_block(const unsigned char *in, unsigned char *out,
69 size_t len, const void *key,
70 unsigned char ivec[16], block128_f block);
71size_t CRYPTO_cts128_decrypt(const unsigned char *in, unsigned char *out,
72 size_t len, const void *key,
73 unsigned char ivec[16], cbc128_f cbc);
74
75size_t CRYPTO_nistcts128_encrypt_block(const unsigned char *in, unsigned char *out,
76 size_t len, const void *key,
77 unsigned char ivec[16], block128_f block);
78size_t CRYPTO_nistcts128_encrypt(const unsigned char *in, unsigned char *out,
79 size_t len, const void *key,
80 unsigned char ivec[16], cbc128_f cbc);
81size_t CRYPTO_nistcts128_decrypt_block(const unsigned char *in, unsigned char *out,
82 size_t len, const void *key,
83 unsigned char ivec[16], block128_f block);
84size_t CRYPTO_nistcts128_decrypt(const unsigned char *in, unsigned char *out,
85 size_t len, const void *key,
86 unsigned char ivec[16], cbc128_f cbc);
87
88typedef struct gcm128_context GCM128_CONTEXT;
89
90GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block);
91void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block);
92void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const unsigned char *iv,
93 size_t len);
94int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const unsigned char *aad,
95 size_t len);
96int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
97 const unsigned char *in, unsigned char *out,
98 size_t len);
99int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
100 const unsigned char *in, unsigned char *out,
101 size_t len);
102int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
103 const unsigned char *in, unsigned char *out,
104 size_t len, ctr128_f stream);
105int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
106 const unsigned char *in, unsigned char *out,
107 size_t len, ctr128_f stream);
108int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
109 size_t len);
110void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len);
111void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx);
112
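The GCM entry points above are meant to be called in a fixed sequence: new (or init), setiv, optionally aad, then encrypt or decrypt, and finally tag (when sealing) or finish (when verifying). A hedged sketch of one authenticated-encryption pass, reusing the illustrative aes_block128() wrapper and omitting most error handling:

static int gcm_seal_example(AES_KEY *aes,
    const unsigned char *iv, size_t ivlen,
    const unsigned char *aad, size_t aadlen,
    const unsigned char *pt, unsigned char *ct, size_t len,
    unsigned char tag[16])
{
	GCM128_CONTEXT *ctx;

	if ((ctx = CRYPTO_gcm128_new(aes, aes_block128)) == NULL)
		return -1;
	CRYPTO_gcm128_setiv(ctx, iv, ivlen);
	if ((aadlen && CRYPTO_gcm128_aad(ctx, aad, aadlen)) ||
	    CRYPTO_gcm128_encrypt(ctx, pt, ct, len)) {
		CRYPTO_gcm128_release(ctx);
		return -1;
	}
	CRYPTO_gcm128_tag(ctx, tag, 16);	/* caller transmits the 16-byte tag */
	CRYPTO_gcm128_release(ctx);
	return 0;
}

On the decrypting side the same sequence ends with CRYPTO_gcm128_finish(), which returns non-zero when the supplied tag does not match the computed one.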
113typedef struct ccm128_context CCM128_CONTEXT;
114
115void CRYPTO_ccm128_init(CCM128_CONTEXT *ctx,
116 unsigned int M, unsigned int L, void *key,block128_f block);
117int CRYPTO_ccm128_setiv(CCM128_CONTEXT *ctx,
118 const unsigned char *nonce, size_t nlen, size_t mlen);
119void CRYPTO_ccm128_aad(CCM128_CONTEXT *ctx,
120 const unsigned char *aad, size_t alen);
121int CRYPTO_ccm128_encrypt(CCM128_CONTEXT *ctx,
122 const unsigned char *inp, unsigned char *out, size_t len);
123int CRYPTO_ccm128_decrypt(CCM128_CONTEXT *ctx,
124 const unsigned char *inp, unsigned char *out, size_t len);
125int CRYPTO_ccm128_encrypt_ccm64(CCM128_CONTEXT *ctx,
126 const unsigned char *inp, unsigned char *out, size_t len,
127 ccm128_f stream);
128int CRYPTO_ccm128_decrypt_ccm64(CCM128_CONTEXT *ctx,
129 const unsigned char *inp, unsigned char *out, size_t len,
130 ccm128_f stream);
131size_t CRYPTO_ccm128_tag(CCM128_CONTEXT *ctx, unsigned char *tag, size_t len);
132
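CCM, unlike GCM, fixes the tag length M and the length-field width L at init time, and the total message length must be declared up front via setiv (the nonce length and L add up to 15). A hedged sketch of the call order; it assumes the struct definition from modes_lcl.h (further down in this diff) is visible so the context can live on the stack, and it reuses the illustrative aes_block128() wrapper:

static int ccm_seal_example(AES_KEY *aes,
    const unsigned char *nonce, size_t nlen,	/* e.g. 13 bytes -> L = 2 */
    const unsigned char *aad, size_t alen,
    const unsigned char *pt, unsigned char *ct, size_t len,
    unsigned char tag[16])
{
	CCM128_CONTEXT ctx;

	/* M = 16-byte tag, L = 15 - nlen bytes of message-length field. */
	CRYPTO_ccm128_init(&ctx, 16, 15 - nlen, aes, aes_block128);
	if (CRYPTO_ccm128_setiv(&ctx, nonce, nlen, len))
		return -1;
	if (alen)
		CRYPTO_ccm128_aad(&ctx, aad, alen);
	if (CRYPTO_ccm128_encrypt(&ctx, pt, ct, len))
		return -1;
	return CRYPTO_ccm128_tag(&ctx, tag, 16) ? 0 : -1;
}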
133typedef struct xts128_context XTS128_CONTEXT;
134
135int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx, const unsigned char iv[16],
136 const unsigned char *inp, unsigned char *out, size_t len, int enc);
diff --git a/src/lib/libcrypto/modes/modes_lcl.h b/src/lib/libcrypto/modes/modes_lcl.h
deleted file mode 100644
index 8e43e480fc..0000000000
--- a/src/lib/libcrypto/modes/modes_lcl.h
+++ /dev/null
@@ -1,108 +0,0 @@
1/* $OpenBSD: modes_lcl.h,v 1.8 2014/07/10 22:45:57 jsing Exp $ */
2/* ====================================================================
3 * Copyright (c) 2010 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use is governed by OpenSSL license.
6 * ====================================================================
7 */
8
9#include <machine/endian.h>
10
11#include <openssl/opensslconf.h>
12
13#include <openssl/modes.h>
14
15#if defined(_LP64)
16typedef long i64;
17typedef unsigned long u64;
18#define U64(C) C##UL
19#else
20typedef long long i64;
21typedef unsigned long long u64;
22#define U64(C) C##ULL
23#endif
24
25typedef unsigned int u32;
26typedef unsigned char u8;
27
28#if !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM)
29#if defined(__GNUC__) && __GNUC__>=2
30# if defined(__x86_64) || defined(__x86_64__)
31# define BSWAP8(x) ({ u64 ret=(x); \
32 asm ("bswapq %0" \
33 : "+r"(ret)); ret; })
34# define BSWAP4(x) ({ u32 ret=(x); \
35 asm ("bswapl %0" \
36 : "+r"(ret)); ret; })
37# elif (defined(__i386) || defined(__i386__)) && !defined(I386_ONLY)
38# define BSWAP8(x) ({ u32 lo=(u64)(x)>>32,hi=(x); \
39 asm ("bswapl %0; bswapl %1" \
40 : "+r"(hi),"+r"(lo)); \
41 (u64)hi<<32|lo; })
42# define BSWAP4(x) ({ u32 ret=(x); \
43 asm ("bswapl %0" \
44 : "+r"(ret)); ret; })
45# elif (defined(__arm__) || defined(__arm)) && !defined(__STRICT_ALIGNMENT)
46# define BSWAP8(x) ({ u32 lo=(u64)(x)>>32,hi=(x); \
47 asm ("rev %0,%0; rev %1,%1" \
48 : "+r"(hi),"+r"(lo)); \
49 (u64)hi<<32|lo; })
50# define BSWAP4(x) ({ u32 ret; \
51 asm ("rev %0,%1" \
52 : "=r"(ret) : "r"((u32)(x))); \
53 ret; })
54# endif
55#endif
56#endif
57
58#if defined(BSWAP4) && !defined(__STRICT_ALIGNMENT)
59#define GETU32(p) BSWAP4(*(const u32 *)(p))
60#define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v)
61#else
62#define GETU32(p) ((u32)(p)[0]<<24|(u32)(p)[1]<<16|(u32)(p)[2]<<8|(u32)(p)[3])
63#define PUTU32(p,v) ((p)[0]=(u8)((v)>>24),(p)[1]=(u8)((v)>>16),(p)[2]=(u8)((v)>>8),(p)[3]=(u8)(v))
64#endif
65
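Both branches above implement the same contract: GETU32 loads and PUTU32 stores a 32-bit value in big-endian byte order irrespective of host endianness; the BSWAP4 branch is merely the single-instruction version for machines that tolerate unaligned word accesses. A hedged restatement of the portable fallback as functions, using the u32/u8 typedefs from this header:

/* Big-endian 32-bit load/store, equivalent to the portable
 * GETU32/PUTU32 definitions above. */
static inline u32
getu32_be(const u8 *p)
{
	return (u32)p[0] << 24 | (u32)p[1] << 16 | (u32)p[2] << 8 | p[3];
}

static inline void
putu32_be(u8 *p, u32 v)
{
	p[0] = (u8)(v >> 24);
	p[1] = (u8)(v >> 16);
	p[2] = (u8)(v >> 8);
	p[3] = (u8)v;
}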
66/* GCM definitions */
67
68typedef struct { u64 hi,lo; } u128;
69
70#ifdef TABLE_BITS
71#undef TABLE_BITS
72#endif
73/*
74 * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
75 * never be set to 8 [or 1]. For further information see gcm128.c.
76 */
77#define TABLE_BITS 4
78
79struct gcm128_context {
80 /* The following 6 names match the names used in the GCM specification */
81 union { u64 u[2]; u32 d[4]; u8 c[16]; size_t t[16/sizeof(size_t)]; }
82 Yi,EKi,EK0,len,Xi,H;
83 /* Relative position of Xi, H and pre-computed Htable is used
84 * in some assembler modules, i.e. don't change the order! */
85#if TABLE_BITS==8
86 u128 Htable[256];
87#else
88 u128 Htable[16];
89 void (*gmult)(u64 Xi[2],const u128 Htable[16]);
90 void (*ghash)(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
91#endif
92 unsigned int mres, ares;
93 block128_f block;
94 void *key;
95};
96
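With TABLE_BITS fixed at 4 the per-key footprint is small: Htable is 16 u128 entries, i.e. 16 * 16 = 256 bytes, preceded by the six 16-byte state blocks named after the GCM specification. A hedged compile-time sketch of the layout assumptions the comment above alludes to, written with the negative-array-size trick so it needs only <stddef.h>:

#include <stddef.h>

/* Fails to compile if Xi and H are not adjacent 16-byte blocks,
 * or if the 4-bit Htable is not exactly 256 bytes. */
typedef char gcm_xi_h_adjacent_check[
    offsetof(struct gcm128_context, H) ==
    offsetof(struct gcm128_context, Xi) + 16 ? 1 : -1];
typedef char gcm_htable_size_check[
    sizeof(((struct gcm128_context *)0)->Htable) == 256 ? 1 : -1];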
97struct xts128_context {
98 void *key1, *key2;
99 block128_f block1,block2;
100};
101
102struct ccm128_context {
103 union { u64 u[2]; u8 c[16]; } nonce, cmac;
104 u64 blocks;
105 block128_f block;
106 void *key;
107};
108
diff --git a/src/lib/libcrypto/modes/ofb128.c b/src/lib/libcrypto/modes/ofb128.c
deleted file mode 100644
index 1b8a6fd500..0000000000
--- a/src/lib/libcrypto/modes/ofb128.c
+++ /dev/null
@@ -1,119 +0,0 @@
1/* $OpenBSD: ofb128.c,v 1.4 2015/02/10 09:46:30 miod Exp $ */
2/* ====================================================================
3 * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 *
50 */
51
52#include <openssl/crypto.h>
53#include "modes_lcl.h"
54#include <string.h>
55
56#ifndef MODES_DEBUG
57# ifndef NDEBUG
58# define NDEBUG
59# endif
60#endif
61
62/* The input and output are encrypted as though 128-bit OFB mode is
63 * being used. The extra state information recording how much of the
64 * 128-bit block we have used is contained in *num.
65 */
66void CRYPTO_ofb128_encrypt(const unsigned char *in, unsigned char *out,
67 size_t len, const void *key,
68 unsigned char ivec[16], int *num,
69 block128_f block)
70{
71 unsigned int n;
72 size_t l=0;
73
74 n = *num;
75
76#if !defined(OPENSSL_SMALL_FOOTPRINT)
77 if (16%sizeof(size_t) == 0) do { /* always true actually */
78 while (n && len) {
79 *(out++) = *(in++) ^ ivec[n];
80 --len;
81 n = (n+1) % 16;
82 }
83#ifdef __STRICT_ALIGNMENT
84 if (((size_t)in|(size_t)out|(size_t)ivec)%sizeof(size_t) != 0)
85 break;
86#endif
87 while (len>=16) {
88 (*block)(ivec, ivec, key);
89 for (; n<16; n+=sizeof(size_t))
90 *(size_t*)(out+n) =
91 *(size_t*)(in+n) ^ *(size_t*)(ivec+n);
92 len -= 16;
93 out += 16;
94 in += 16;
95 n = 0;
96 }
97 if (len) {
98 (*block)(ivec, ivec, key);
99 while (len--) {
100 out[n] = in[n] ^ ivec[n];
101 ++n;
102 }
103 }
104 *num = n;
105 return;
106 } while(0);
107 /* the rest is commonly eliminated by x86* compilers */
108#endif
109 while (l<len) {
110 if (n==0) {
111 (*block)(ivec, ivec, key);
112 }
113 out[l] = in[l] ^ ivec[n];
114 ++l;
115 n = (n+1) % 16;
116 }
117
118 *num=n;
119}
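Like the CTR routine, CRYPTO_ofb128_encrypt() is restartable: after each block-cipher call the keystream block lives in ivec, and *num records how many of its bytes have already been XORed out, so splitting a message across calls yields the same output as a single call. A hedged usage sketch, again with the illustrative aes_block128() wrapper:

static void ofb_stream_example(AES_KEY *aes, unsigned char ivec[16],
    const unsigned char *in, unsigned char *out, size_t len)
{
	int num = 0;	/* bytes of the current keystream block already used */

	/* Two calls, one result: ivec and num carry the state across. */
	CRYPTO_ofb128_encrypt(in, out, len / 2, aes, ivec, &num, aes_block128);
	CRYPTO_ofb128_encrypt(in + len / 2, out + len / 2, len - len / 2,
	    aes, ivec, &num, aes_block128);
}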
diff --git a/src/lib/libcrypto/modes/xts128.c b/src/lib/libcrypto/modes/xts128.c
deleted file mode 100644
index 3e2378379e..0000000000
--- a/src/lib/libcrypto/modes/xts128.c
+++ /dev/null
@@ -1,187 +0,0 @@
1/* $OpenBSD: xts128.c,v 1.6 2015/02/10 09:46:30 miod Exp $ */
2/* ====================================================================
3 * Copyright (c) 2011 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 */
50
51#include <machine/endian.h>
52#include <openssl/crypto.h>
53#include "modes_lcl.h"
54#include <string.h>
55
56#ifndef MODES_DEBUG
57# ifndef NDEBUG
58# define NDEBUG
59# endif
60#endif
61
62int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx, const unsigned char iv[16],
63 const unsigned char *inp, unsigned char *out,
64 size_t len, int enc)
65{
66 union { u64 u[2]; u32 d[4]; u8 c[16]; } tweak, scratch;
67 unsigned int i;
68
69 if (len<16) return -1;
70
71 memcpy(tweak.c, iv, 16);
72
73 (*ctx->block2)(tweak.c,tweak.c,ctx->key2);
74
75 if (!enc && (len%16)) len-=16;
76
77 while (len>=16) {
78#ifdef __STRICT_ALIGNMENT
79 memcpy(scratch.c,inp,16);
80 scratch.u[0] ^= tweak.u[0];
81 scratch.u[1] ^= tweak.u[1];
82#else
83 scratch.u[0] = ((u64*)inp)[0]^tweak.u[0];
84 scratch.u[1] = ((u64*)inp)[1]^tweak.u[1];
85#endif
86 (*ctx->block1)(scratch.c,scratch.c,ctx->key1);
87#ifdef __STRICT_ALIGNMENT
88 scratch.u[0] ^= tweak.u[0];
89 scratch.u[1] ^= tweak.u[1];
90 memcpy(out,scratch.c,16);
91#else
92 ((u64*)out)[0] = scratch.u[0]^=tweak.u[0];
93 ((u64*)out)[1] = scratch.u[1]^=tweak.u[1];
94#endif
95 inp += 16;
96 out += 16;
97 len -= 16;
98
99 if (len==0) return 0;
100
101 if (BYTE_ORDER == LITTLE_ENDIAN) {
102 unsigned int carry,res;
103
104 res = 0x87&(((int)tweak.d[3])>>31);
105 carry = (unsigned int)(tweak.u[0]>>63);
106 tweak.u[0] = (tweak.u[0]<<1)^res;
107 tweak.u[1] = (tweak.u[1]<<1)|carry;
108 }
109 else {
110 size_t c;
111
112 for (c=0,i=0;i<16;++i) {
113 /*+ substitutes for |, because c is 1 bit */
114 c += ((size_t)tweak.c[i])<<1;
115 tweak.c[i] = (u8)c;
116 c = c>>8;
117 }
118 tweak.c[0] ^= (u8)(0x87&(0-c));
119 }
120 }
121 if (enc) {
122 for (i=0;i<len;++i) {
123 u8 c = inp[i];
124 out[i] = scratch.c[i];
125 scratch.c[i] = c;
126 }
127 scratch.u[0] ^= tweak.u[0];
128 scratch.u[1] ^= tweak.u[1];
129 (*ctx->block1)(scratch.c,scratch.c,ctx->key1);
130 scratch.u[0] ^= tweak.u[0];
131 scratch.u[1] ^= tweak.u[1];
132 memcpy(out-16,scratch.c,16);
133 }
134 else {
135 union { u64 u[2]; u8 c[16]; } tweak1;
136
137 if (BYTE_ORDER == LITTLE_ENDIAN) {
138 unsigned int carry,res;
139
140 res = 0x87&(((int)tweak.d[3])>>31);
141 carry = (unsigned int)(tweak.u[0]>>63);
142 tweak1.u[0] = (tweak.u[0]<<1)^res;
143 tweak1.u[1] = (tweak.u[1]<<1)|carry;
144 }
145 else {
146 size_t c;
147
148 for (c=0,i=0;i<16;++i) {
149 /*+ substitutes for |, because c is 1 bit */
150 c += ((size_t)tweak.c[i])<<1;
151 tweak1.c[i] = (u8)c;
152 c = c>>8;
153 }
154 tweak1.c[0] ^= (u8)(0x87&(0-c));
155 }
156#ifdef __STRICT_ALIGNMENT
157 memcpy(scratch.c,inp,16);
158 scratch.u[0] ^= tweak1.u[0];
159 scratch.u[1] ^= tweak1.u[1];
160#else
161 scratch.u[0] = ((u64*)inp)[0]^tweak1.u[0];
162 scratch.u[1] = ((u64*)inp)[1]^tweak1.u[1];
163#endif
164 (*ctx->block1)(scratch.c,scratch.c,ctx->key1);
165 scratch.u[0] ^= tweak1.u[0];
166 scratch.u[1] ^= tweak1.u[1];
167
168 for (i=0;i<len;++i) {
169 u8 c = inp[16+i];
170 out[16+i] = scratch.c[i];
171 scratch.c[i] = c;
172 }
173 scratch.u[0] ^= tweak.u[0];
174 scratch.u[1] ^= tweak.u[1];
175 (*ctx->block1)(scratch.c,scratch.c,ctx->key1);
176#ifdef __STRICT_ALIGNMENT
177 scratch.u[0] ^= tweak.u[0];
178 scratch.u[1] ^= tweak.u[1];
179 memcpy (out,scratch.c,16);
180#else
181 ((u64*)out)[0] = scratch.u[0]^tweak.u[0];
182 ((u64*)out)[1] = scratch.u[1]^tweak.u[1];
183#endif
184 }
185
186 return 0;
187}
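The only non-obvious arithmetic in this file is the tweak update between blocks: the 128-bit tweak is multiplied by x in GF(2^128) under the XTS polynomial x^128 + x^7 + x^2 + x + 1, which amounts to a 1-bit left shift plus a conditional XOR of 0x87 into the low byte. In the little-endian branch, 0x87&(((int)tweak.d[3])>>31) is an arithmetic-shift way of saying "0x87 if the top bit is set, else 0". A hedged standalone restatement of that step (not part of the original file):

#include <stdint.h>

/* Multiply a 128-bit XTS tweak by x in GF(2^128), with lo holding
 * bits 0..63 and hi holding bits 64..127, mirroring the LITTLE_ENDIAN
 * branch of CRYPTO_xts128_encrypt() above. */
static void
xts_double_tweak(uint64_t *hi, uint64_t *lo)
{
	uint64_t res = 0x87 & (uint64_t)(0 - (*hi >> 63));	/* 0x87 or 0 */
	uint64_t carry = *lo >> 63;	/* bit 63 moves into the high word */

	*hi = (*hi << 1) | carry;
	*lo = (*lo << 1) ^ res;
}

The big-endian branch computes the same product byte by byte, propagating the carry from tweak.c[0] up through tweak.c[15] and folding 0x87 back into tweak.c[0] at the end.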