path: root/src/lib/libcrypto/modes
Diffstat (limited to 'src/lib/libcrypto/modes')
-rw-r--r--  src/lib/libcrypto/modes/asm/ghash-alpha.pl     444
-rw-r--r--  src/lib/libcrypto/modes/asm/ghash-armv4.pl     430
-rw-r--r--  src/lib/libcrypto/modes/asm/ghash-parisc.pl    740
-rw-r--r--  src/lib/libcrypto/modes/asm/ghash-sparcv9.pl   351
-rw-r--r--  src/lib/libcrypto/modes/asm/ghash-x86.pl      1326
-rw-r--r--  src/lib/libcrypto/modes/asm/ghash-x86_64.pl    812
-rw-r--r--  src/lib/libcrypto/modes/cbc128.c               214
-rw-r--r--  src/lib/libcrypto/modes/ccm128.c               498
-rw-r--r--  src/lib/libcrypto/modes/cfb128.c               251
-rw-r--r--  src/lib/libcrypto/modes/ctr128.c               267
-rw-r--r--  src/lib/libcrypto/modes/gcm128.c              1358
-rw-r--r--  src/lib/libcrypto/modes/modes.h                118
-rw-r--r--  src/lib/libcrypto/modes/modes_local.h          121
-rw-r--r--  src/lib/libcrypto/modes/ofb128.c               124
-rw-r--r--  src/lib/libcrypto/modes/xts128.c               197
15 files changed, 0 insertions, 7251 deletions
diff --git a/src/lib/libcrypto/modes/asm/ghash-alpha.pl b/src/lib/libcrypto/modes/asm/ghash-alpha.pl
deleted file mode 100644
index 9d847006c4..0000000000
--- a/src/lib/libcrypto/modes/asm/ghash-alpha.pl
+++ /dev/null
@@ -1,444 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# March 2010
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that it
14# uses 256 bytes per-key table [+128 bytes shared table]. Even though
15# loops are aggressively modulo-scheduled in respect to references to
16# Htbl and Z.hi updates for 8 cycles per byte, measured performance is
17# ~12 cycles per processed byte on 21264 CPU. It seems to be a dynamic
18# scheduling "glitch," because uprofile(1) indicates uniform sample
19# distribution, as if all instruction bundles execute in 1.5 cycles.
20# Meaning that it could have been even faster, yet 12 cycles is ~60%
21# better than gcc-generated code and ~80% better than code generated
22# by vendor compiler.
23
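# ----------------------------------------------------------------------
# For reference, the per-nibble update that this and the other "4-bit"
# assembler modules below implement can be sketched in a few lines of
# plain Perl. The sketch is not part of the original module; it assumes
# a 64-bit perl and an Htable of 16 [hi,lo] 64-bit pairs precomputed the
# way gcm_init_4bit() in gcm128.c builds them.

my @rem_4bit = map { $_ << 48 }
    (0x0000,0x1C20,0x3840,0x2460, 0x7080,0x6CA0,0x48C0,0x54E0,
     0xE100,0xFD20,0xD940,0xC560, 0x9180,0x8DA0,0xA9C0,0xB5E0);

sub gmult_4bit_ref {
    my ($Xi, $Htable) = @_;      # $Xi: 16-byte big-endian string, $Htable: ref to 16 [hi,lo]
    my @x = unpack("C16", $Xi);
    my @nib;                     # nibbles in processing order: byte 15 upward, low nibble first
    for (my $i = 15; $i >= 0; $i--) {
        push @nib, $x[$i] & 0xf, $x[$i] >> 4;
    }
    my ($Zhi, $Zlo) = @{$Htable->[shift @nib]};
    for my $n (@nib) {
        my $rem = $Zlo & 0xf;                       # bits about to fall off the bottom
        $Zlo = (($Zhi & 0xf) << 60) | ($Zlo >> 4);  # Z >>= 4
        $Zhi = ($Zhi >> 4) ^ $rem_4bit[$rem];       # fold them back in, reduced mod the polynomial
        $Zhi ^= $Htable->[$n][0];                   # Z ^= nibble * H
        $Zlo ^= $Htable->[$n][1];
    }
    return pack("Q>Q>", $Zhi, $Zlo);                # new Xi, big-endian, cf. the byte swap below
}
# ----------------------------------------------------------------------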
24$cnt="v0"; # $0
25$t0="t0";
26$t1="t1";
27$t2="t2";
28$Thi0="t3"; # $4
29$Tlo0="t4";
30$Thi1="t5";
31$Tlo1="t6";
32$rem="t7"; # $8
33#################
34$Xi="a0"; # $16, input argument block
35$Htbl="a1";
36$inp="a2";
37$len="a3";
38$nlo="a4"; # $20
39$nhi="a5";
40$Zhi="t8";
41$Zlo="t9";
42$Xhi="t10"; # $24
43$Xlo="t11";
44$remp="t12";
45$rem_4bit="AT"; # $28
46
47{ my $N;
48 sub loop() {
49
50 $N++;
51$code.=<<___;
52.align 4
53 extbl $Xlo,7,$nlo
54 and $nlo,0xf0,$nhi
55 sll $nlo,4,$nlo
56 and $nlo,0xf0,$nlo
57
58 addq $nlo,$Htbl,$nlo
59 ldq $Zlo,8($nlo)
60 addq $nhi,$Htbl,$nhi
61 ldq $Zhi,0($nlo)
62
63 and $Zlo,0x0f,$remp
64 sll $Zhi,60,$t0
65 lda $cnt,6(zero)
66 extbl $Xlo,6,$nlo
67
68 ldq $Tlo1,8($nhi)
69 s8addq $remp,$rem_4bit,$remp
70 ldq $Thi1,0($nhi)
71 srl $Zlo,4,$Zlo
72
73 ldq $rem,0($remp)
74 srl $Zhi,4,$Zhi
75 xor $t0,$Zlo,$Zlo
76 and $nlo,0xf0,$nhi
77
78 xor $Tlo1,$Zlo,$Zlo
79 sll $nlo,4,$nlo
80 xor $Thi1,$Zhi,$Zhi
81 and $nlo,0xf0,$nlo
82
83 addq $nlo,$Htbl,$nlo
84 ldq $Tlo0,8($nlo)
85 addq $nhi,$Htbl,$nhi
86 ldq $Thi0,0($nlo)
87
88.Looplo$N:
89 and $Zlo,0x0f,$remp
90 sll $Zhi,60,$t0
91 subq $cnt,1,$cnt
92 srl $Zlo,4,$Zlo
93
94 ldq $Tlo1,8($nhi)
95 xor $rem,$Zhi,$Zhi
96 ldq $Thi1,0($nhi)
97 s8addq $remp,$rem_4bit,$remp
98
99 ldq $rem,0($remp)
100 srl $Zhi,4,$Zhi
101 xor $t0,$Zlo,$Zlo
102 extbl $Xlo,$cnt,$nlo
103
104 and $nlo,0xf0,$nhi
105 xor $Thi0,$Zhi,$Zhi
106 xor $Tlo0,$Zlo,$Zlo
107 sll $nlo,4,$nlo
108
109
110 and $Zlo,0x0f,$remp
111 sll $Zhi,60,$t0
112 and $nlo,0xf0,$nlo
113 srl $Zlo,4,$Zlo
114
115 s8addq $remp,$rem_4bit,$remp
116 xor $rem,$Zhi,$Zhi
117 addq $nlo,$Htbl,$nlo
118 addq $nhi,$Htbl,$nhi
119
120 ldq $rem,0($remp)
121 srl $Zhi,4,$Zhi
122 ldq $Tlo0,8($nlo)
123 xor $t0,$Zlo,$Zlo
124
125 xor $Tlo1,$Zlo,$Zlo
126 xor $Thi1,$Zhi,$Zhi
127 ldq $Thi0,0($nlo)
128 bne $cnt,.Looplo$N
129
130
131 and $Zlo,0x0f,$remp
132 sll $Zhi,60,$t0
133 lda $cnt,7(zero)
134 srl $Zlo,4,$Zlo
135
136 ldq $Tlo1,8($nhi)
137 xor $rem,$Zhi,$Zhi
138 ldq $Thi1,0($nhi)
139 s8addq $remp,$rem_4bit,$remp
140
141 ldq $rem,0($remp)
142 srl $Zhi,4,$Zhi
143 xor $t0,$Zlo,$Zlo
144 extbl $Xhi,$cnt,$nlo
145
146 and $nlo,0xf0,$nhi
147 xor $Thi0,$Zhi,$Zhi
148 xor $Tlo0,$Zlo,$Zlo
149 sll $nlo,4,$nlo
150
151 and $Zlo,0x0f,$remp
152 sll $Zhi,60,$t0
153 and $nlo,0xf0,$nlo
154 srl $Zlo,4,$Zlo
155
156 s8addq $remp,$rem_4bit,$remp
157 xor $rem,$Zhi,$Zhi
158 addq $nlo,$Htbl,$nlo
159 addq $nhi,$Htbl,$nhi
160
161 ldq $rem,0($remp)
162 srl $Zhi,4,$Zhi
163 ldq $Tlo0,8($nlo)
164 xor $t0,$Zlo,$Zlo
165
166 xor $Tlo1,$Zlo,$Zlo
167 xor $Thi1,$Zhi,$Zhi
168 ldq $Thi0,0($nlo)
169 unop
170
171
172.Loophi$N:
173 and $Zlo,0x0f,$remp
174 sll $Zhi,60,$t0
175 subq $cnt,1,$cnt
176 srl $Zlo,4,$Zlo
177
178 ldq $Tlo1,8($nhi)
179 xor $rem,$Zhi,$Zhi
180 ldq $Thi1,0($nhi)
181 s8addq $remp,$rem_4bit,$remp
182
183 ldq $rem,0($remp)
184 srl $Zhi,4,$Zhi
185 xor $t0,$Zlo,$Zlo
186 extbl $Xhi,$cnt,$nlo
187
188 and $nlo,0xf0,$nhi
189 xor $Thi0,$Zhi,$Zhi
190 xor $Tlo0,$Zlo,$Zlo
191 sll $nlo,4,$nlo
192
193
194 and $Zlo,0x0f,$remp
195 sll $Zhi,60,$t0
196 and $nlo,0xf0,$nlo
197 srl $Zlo,4,$Zlo
198
199 s8addq $remp,$rem_4bit,$remp
200 xor $rem,$Zhi,$Zhi
201 addq $nlo,$Htbl,$nlo
202 addq $nhi,$Htbl,$nhi
203
204 ldq $rem,0($remp)
205 srl $Zhi,4,$Zhi
206 ldq $Tlo0,8($nlo)
207 xor $t0,$Zlo,$Zlo
208
209 xor $Tlo1,$Zlo,$Zlo
210 xor $Thi1,$Zhi,$Zhi
211 ldq $Thi0,0($nlo)
212 bne $cnt,.Loophi$N
213
214
215 and $Zlo,0x0f,$remp
216 sll $Zhi,60,$t0
217 srl $Zlo,4,$Zlo
218
219 ldq $Tlo1,8($nhi)
220 xor $rem,$Zhi,$Zhi
221 ldq $Thi1,0($nhi)
222 s8addq $remp,$rem_4bit,$remp
223
224 ldq $rem,0($remp)
225 srl $Zhi,4,$Zhi
226 xor $t0,$Zlo,$Zlo
227
228 xor $Tlo0,$Zlo,$Zlo
229 xor $Thi0,$Zhi,$Zhi
230
231 and $Zlo,0x0f,$remp
232 sll $Zhi,60,$t0
233 srl $Zlo,4,$Zlo
234
235 s8addq $remp,$rem_4bit,$remp
236 xor $rem,$Zhi,$Zhi
237
238 ldq $rem,0($remp)
239 srl $Zhi,4,$Zhi
240 xor $Tlo1,$Zlo,$Zlo
241 xor $Thi1,$Zhi,$Zhi
242 xor $t0,$Zlo,$Zlo
243 xor $rem,$Zhi,$Zhi
244___
245}}
246
247$code=<<___;
248#include <machine/asm.h>
249
250.text
251
252.set noat
253.set noreorder
254.globl gcm_gmult_4bit
255.align 4
256.ent gcm_gmult_4bit
257gcm_gmult_4bit:
258 .frame sp,0,ra
259 .prologue 0
260
261 ldq $Xlo,8($Xi)
262 ldq $Xhi,0($Xi)
263
264 lda $rem_4bit,rem_4bit
265___
266
267 &loop();
268
269$code.=<<___;
270 srl $Zlo,24,$t0 # byte swap
271 srl $Zlo,8,$t1
272
273 sll $Zlo,8,$t2
274 sll $Zlo,24,$Zlo
275 zapnot $t0,0x11,$t0
276 zapnot $t1,0x22,$t1
277
278 zapnot $Zlo,0x88,$Zlo
279 or $t0,$t1,$t0
280 zapnot $t2,0x44,$t2
281
282 or $Zlo,$t0,$Zlo
283 srl $Zhi,24,$t0
284 srl $Zhi,8,$t1
285
286 or $Zlo,$t2,$Zlo
287 sll $Zhi,8,$t2
288 sll $Zhi,24,$Zhi
289
290 srl $Zlo,32,$Xlo
291 sll $Zlo,32,$Zlo
292
293 zapnot $t0,0x11,$t0
294 zapnot $t1,0x22,$t1
295 or $Zlo,$Xlo,$Xlo
296
297 zapnot $Zhi,0x88,$Zhi
298 or $t0,$t1,$t0
299 zapnot $t2,0x44,$t2
300
301 or $Zhi,$t0,$Zhi
302 or $Zhi,$t2,$Zhi
303
304 srl $Zhi,32,$Xhi
305 sll $Zhi,32,$Zhi
306
307 or $Zhi,$Xhi,$Xhi
308 stq $Xlo,8($Xi)
309 stq $Xhi,0($Xi)
310
311 ret (ra)
312.end gcm_gmult_4bit
313___
314
315$inhi="s0";
316$inlo="s1";
317
318$code.=<<___;
319.globl gcm_ghash_4bit
320.align 4
321.ent gcm_ghash_4bit
322gcm_ghash_4bit:
323 lda sp,-32(sp)
324 stq ra,0(sp)
325 stq s0,8(sp)
326 stq s1,16(sp)
327 .mask 0x04000600,-32
328 .frame sp,32,ra
329 .prologue 0
330
331 ldq_u $inhi,0($inp)
332 ldq_u $Thi0,7($inp)
333 ldq_u $inlo,8($inp)
334 ldq_u $Tlo0,15($inp)
335 ldq $Xhi,0($Xi)
336 ldq $Xlo,8($Xi)
337
338 lda $rem_4bit,rem_4bit
339
340.Louter:
341 extql $inhi,$inp,$inhi
342 extqh $Thi0,$inp,$Thi0
343 or $inhi,$Thi0,$inhi
344 lda $inp,16($inp)
345
346 extql $inlo,$inp,$inlo
347 extqh $Tlo0,$inp,$Tlo0
348 or $inlo,$Tlo0,$inlo
349 subq $len,16,$len
350
351 xor $Xlo,$inlo,$Xlo
352 xor $Xhi,$inhi,$Xhi
353___
354
355 &loop();
356
357$code.=<<___;
358 srl $Zlo,24,$t0 # byte swap
359 srl $Zlo,8,$t1
360
361 sll $Zlo,8,$t2
362 sll $Zlo,24,$Zlo
363 zapnot $t0,0x11,$t0
364 zapnot $t1,0x22,$t1
365
366 zapnot $Zlo,0x88,$Zlo
367 or $t0,$t1,$t0
368 zapnot $t2,0x44,$t2
369
370 or $Zlo,$t0,$Zlo
371 srl $Zhi,24,$t0
372 srl $Zhi,8,$t1
373
374 or $Zlo,$t2,$Zlo
375 sll $Zhi,8,$t2
376 sll $Zhi,24,$Zhi
377
378 srl $Zlo,32,$Xlo
379 sll $Zlo,32,$Zlo
380 beq $len,.Ldone
381
382 zapnot $t0,0x11,$t0
383 zapnot $t1,0x22,$t1
384 or $Zlo,$Xlo,$Xlo
385 ldq_u $inhi,0($inp)
386
387 zapnot $Zhi,0x88,$Zhi
388 or $t0,$t1,$t0
389 zapnot $t2,0x44,$t2
390 ldq_u $Thi0,7($inp)
391
392 or $Zhi,$t0,$Zhi
393 or $Zhi,$t2,$Zhi
394 ldq_u $inlo,8($inp)
395 ldq_u $Tlo0,15($inp)
396
397 srl $Zhi,32,$Xhi
398 sll $Zhi,32,$Zhi
399
400 or $Zhi,$Xhi,$Xhi
401 br zero,.Louter
402
403.Ldone:
404 zapnot $t0,0x11,$t0
405 zapnot $t1,0x22,$t1
406 or $Zlo,$Xlo,$Xlo
407
408 zapnot $Zhi,0x88,$Zhi
409 or $t0,$t1,$t0
410 zapnot $t2,0x44,$t2
411
412 or $Zhi,$t0,$Zhi
413 or $Zhi,$t2,$Zhi
414
415 srl $Zhi,32,$Xhi
416 sll $Zhi,32,$Zhi
417
418 or $Zhi,$Xhi,$Xhi
419
420 stq $Xlo,8($Xi)
421 stq $Xhi,0($Xi)
422
423 .set noreorder
424 /*ldq ra,0(sp)*/
425 ldq s0,8(sp)
426 ldq s1,16(sp)
427 lda sp,32(sp)
428 ret (ra)
429.end gcm_ghash_4bit
430
431 .section .rodata
432 .align 4
433rem_4bit:
434 .long 0,0x0000<<16, 0,0x1C20<<16, 0,0x3840<<16, 0,0x2460<<16
435 .long 0,0x7080<<16, 0,0x6CA0<<16, 0,0x48C0<<16, 0,0x54E0<<16
436 .long 0,0xE100<<16, 0,0xFD20<<16, 0,0xD940<<16, 0,0xC560<<16
437 .long 0,0x9180<<16, 0,0x8DA0<<16, 0,0xA9C0<<16, 0,0xB5E0<<16
438 .previous
439
440___
441$output=shift and open STDOUT,">$output";
442print $code;
443close STDOUT;
444
diff --git a/src/lib/libcrypto/modes/asm/ghash-armv4.pl b/src/lib/libcrypto/modes/asm/ghash-armv4.pl
deleted file mode 100644
index 2d57806b46..0000000000
--- a/src/lib/libcrypto/modes/asm/ghash-armv4.pl
+++ /dev/null
@@ -1,430 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# April 2010
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that it
14# uses 256 bytes per-key table [+32 bytes shared table]. There is no
15# experimental performance data available yet. The only approximation
16# that can be made at this point is based on code size. Inner loop is
17# 32 instructions long and on single-issue core should execute in <40
18# cycles. Having verified that gcc 3.4 didn't unroll corresponding
19# loop, this assembler loop body was found to be ~3x smaller than
20# compiler-generated one...
21#
22# July 2010
23#
24# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on
25# Cortex A8 core and ~25 cycles per processed byte (which was observed
26# to be ~3 times faster than gcc-generated code:-)
27#
28# February 2011
29#
30# Profiler-assisted and platform-specific optimization resulted in 7%
31# improvement on Cortex A8 core and ~23.5 cycles per byte.
32#
33# March 2011
34#
35# Add NEON implementation featuring polynomial multiplication, i.e. no
36# lookup tables involved. On Cortex A8 it was measured to process one
37# byte in 15 cycles or 55% faster than integer-only code.
38
39# ====================================================================
40# Note about "528B" variant. In ARM case it makes lesser sense to
41# implement it for following reasons:
42#
43# - performance improvement won't be anywhere near 50%, because 128-
44# bit shift operation is neatly fused with 128-bit xor here, and
45# "538B" variant would eliminate only 4-5 instructions out of 32
46# in the inner loop (meaning that estimated improvement is ~15%);
47# - ARM-based systems are often embedded ones and extra memory
48# consumption might be unappreciated (for so little improvement);
49#
50# Byte order [in]dependence. =========================================
51#
52# Caller is expected to maintain specific *dword* order in Htable,
53# namely with *least* significant dword of 128-bit value at *lower*
54# address. This differs completely from C code and has everything to
55# do with ldm instruction and order in which dwords are "consumed" by
56# algorithm. *Byte* order within these dwords in turn is whatever
57# *native* byte order on current platform. See gcm128.c for working
58# example...
59
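# Not part of the original module: a minimal sketch of the caller-side
# conversion implied above. With Htable held as 16 [hi,lo] 64-bit pairs
# in the C layout (most significant dword first), the ldm-based code
# below wants the least significant dword at the lower address, so the
# caller swaps the halves once at key setup (gcm128.c, referenced above,
# performs the equivalent adjustment). @Htable here is an assumption for
# illustration only.

for my $entry (@Htable) {
    @$entry = reverse @$entry;   # [hi,lo] -> [lo,hi], low dword now at the lower address
}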
60while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
61open STDOUT,">$output";
62
63$Xi="r0"; # argument block
64$Htbl="r1";
65$inp="r2";
66$len="r3";
67
68$Zll="r4"; # variables
69$Zlh="r5";
70$Zhl="r6";
71$Zhh="r7";
72$Tll="r8";
73$Tlh="r9";
74$Thl="r10";
75$Thh="r11";
76$nlo="r12";
77################# r13 is stack pointer
78$nhi="r14";
79################# r15 is program counter
80
81$rem_4bit=$inp; # used in gcm_gmult_4bit
82$cnt=$len;
83
84sub Zsmash() {
85 my $i=12;
86 my @args=@_;
87 for ($Zll,$Zlh,$Zhl,$Zhh) {
88 $code.=<<___;
89#if __ARM_ARCH__>=7 && defined(__ARMEL__)
90 rev $_,$_
91 str $_,[$Xi,#$i]
92#elif defined(__ARMEB__)
93 str $_,[$Xi,#$i]
94#else
95 mov $Tlh,$_,lsr#8
96 strb $_,[$Xi,#$i+3]
97 mov $Thl,$_,lsr#16
98 strb $Tlh,[$Xi,#$i+2]
99 mov $Thh,$_,lsr#24
100 strb $Thl,[$Xi,#$i+1]
101 strb $Thh,[$Xi,#$i]
102#endif
103___
104 $code.="\t".shift(@args)."\n";
105 $i-=4;
106 }
107}
108
109$code=<<___;
110#include "arm_arch.h"
111
112.text
113.syntax unified
114.code 32
115
116.type rem_4bit,%object
117.align 5
118rem_4bit:
119.short 0x0000,0x1C20,0x3840,0x2460
120.short 0x7080,0x6CA0,0x48C0,0x54E0
121.short 0xE100,0xFD20,0xD940,0xC560
122.short 0x9180,0x8DA0,0xA9C0,0xB5E0
123.size rem_4bit,.-rem_4bit
124
125.type rem_4bit_get,%function
126rem_4bit_get:
127 sub $rem_4bit,pc,#8
128 sub $rem_4bit,$rem_4bit,#32 @ &rem_4bit
129 b .Lrem_4bit_got
130 nop
131.size rem_4bit_get,.-rem_4bit_get
132
133.global gcm_ghash_4bit
134.type gcm_ghash_4bit,%function
135gcm_ghash_4bit:
136 sub r12,pc,#8
137 add $len,$inp,$len @ $len to point at the end
138 stmdb sp!,{r3-r11,lr} @ save $len/end too
139 sub r12,r12,#48 @ &rem_4bit
140
141 ldmia r12,{r4-r11} @ copy rem_4bit ...
142 stmdb sp!,{r4-r11} @ ... to stack
143
144 ldrb $nlo,[$inp,#15]
145 ldrb $nhi,[$Xi,#15]
146.Louter:
147 eor $nlo,$nlo,$nhi
148 and $nhi,$nlo,#0xf0
149 and $nlo,$nlo,#0x0f
150 mov $cnt,#14
151
152 add $Zhh,$Htbl,$nlo,lsl#4
153 ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo]
154 add $Thh,$Htbl,$nhi
155 ldrb $nlo,[$inp,#14]
156
157 and $nhi,$Zll,#0xf @ rem
158 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
159 add $nhi,$nhi,$nhi
160 eor $Zll,$Tll,$Zll,lsr#4
161 ldrh $Tll,[sp,$nhi] @ rem_4bit[rem]
162 eor $Zll,$Zll,$Zlh,lsl#28
163 ldrb $nhi,[$Xi,#14]
164 eor $Zlh,$Tlh,$Zlh,lsr#4
165 eor $Zlh,$Zlh,$Zhl,lsl#28
166 eor $Zhl,$Thl,$Zhl,lsr#4
167 eor $Zhl,$Zhl,$Zhh,lsl#28
168 eor $Zhh,$Thh,$Zhh,lsr#4
169 eor $nlo,$nlo,$nhi
170 and $nhi,$nlo,#0xf0
171 and $nlo,$nlo,#0x0f
172 eor $Zhh,$Zhh,$Tll,lsl#16
173
174.Linner:
175 add $Thh,$Htbl,$nlo,lsl#4
176 and $nlo,$Zll,#0xf @ rem
177 subs $cnt,$cnt,#1
178 add $nlo,$nlo,$nlo
179 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
180 eor $Zll,$Tll,$Zll,lsr#4
181 eor $Zll,$Zll,$Zlh,lsl#28
182 eor $Zlh,$Tlh,$Zlh,lsr#4
183 eor $Zlh,$Zlh,$Zhl,lsl#28
184 ldrh $Tll,[sp,$nlo] @ rem_4bit[rem]
185 eor $Zhl,$Thl,$Zhl,lsr#4
186 ldrbpl $nlo,[$inp,$cnt]
187 eor $Zhl,$Zhl,$Zhh,lsl#28
188 eor $Zhh,$Thh,$Zhh,lsr#4
189
190 add $Thh,$Htbl,$nhi
191 and $nhi,$Zll,#0xf @ rem
192 eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
193 add $nhi,$nhi,$nhi
194 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
195 eor $Zll,$Tll,$Zll,lsr#4
196 ldrbpl $Tll,[$Xi,$cnt]
197 eor $Zll,$Zll,$Zlh,lsl#28
198 eor $Zlh,$Tlh,$Zlh,lsr#4
199 ldrh $Tlh,[sp,$nhi]
200 eor $Zlh,$Zlh,$Zhl,lsl#28
201 eor $Zhl,$Thl,$Zhl,lsr#4
202 eor $Zhl,$Zhl,$Zhh,lsl#28
203 eorpl $nlo,$nlo,$Tll
204 eor $Zhh,$Thh,$Zhh,lsr#4
205 andpl $nhi,$nlo,#0xf0
206 andpl $nlo,$nlo,#0x0f
207 eor $Zhh,$Zhh,$Tlh,lsl#16 @ ^= rem_4bit[rem]
208 bpl .Linner
209
210 ldr $len,[sp,#32] @ re-load $len/end
211 add $inp,$inp,#16
212 mov $nhi,$Zll
213___
214 &Zsmash("cmp\t$inp,$len","ldrbne\t$nlo,[$inp,#15]");
215$code.=<<___;
216 bne .Louter
217
218 add sp,sp,#36
219#if __ARM_ARCH__>=5
220 ldmia sp!,{r4-r11,pc}
221#else
222 ldmia sp!,{r4-r11,lr}
223 tst lr,#1
224 moveq pc,lr @ be binary compatible with V4, yet
225 bx lr @ interoperable with Thumb ISA:-)
226#endif
227.size gcm_ghash_4bit,.-gcm_ghash_4bit
228
229.global gcm_gmult_4bit
230.type gcm_gmult_4bit,%function
231gcm_gmult_4bit:
232 stmdb sp!,{r4-r11,lr}
233 ldrb $nlo,[$Xi,#15]
234 b rem_4bit_get
235.Lrem_4bit_got:
236 and $nhi,$nlo,#0xf0
237 and $nlo,$nlo,#0x0f
238 mov $cnt,#14
239
240 add $Zhh,$Htbl,$nlo,lsl#4
241 ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo]
242 ldrb $nlo,[$Xi,#14]
243
244 add $Thh,$Htbl,$nhi
245 and $nhi,$Zll,#0xf @ rem
246 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
247 add $nhi,$nhi,$nhi
248 eor $Zll,$Tll,$Zll,lsr#4
249 ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
250 eor $Zll,$Zll,$Zlh,lsl#28
251 eor $Zlh,$Tlh,$Zlh,lsr#4
252 eor $Zlh,$Zlh,$Zhl,lsl#28
253 eor $Zhl,$Thl,$Zhl,lsr#4
254 eor $Zhl,$Zhl,$Zhh,lsl#28
255 eor $Zhh,$Thh,$Zhh,lsr#4
256 and $nhi,$nlo,#0xf0
257 eor $Zhh,$Zhh,$Tll,lsl#16
258 and $nlo,$nlo,#0x0f
259
260.Loop:
261 add $Thh,$Htbl,$nlo,lsl#4
262 and $nlo,$Zll,#0xf @ rem
263 subs $cnt,$cnt,#1
264 add $nlo,$nlo,$nlo
265 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
266 eor $Zll,$Tll,$Zll,lsr#4
267 eor $Zll,$Zll,$Zlh,lsl#28
268 eor $Zlh,$Tlh,$Zlh,lsr#4
269 eor $Zlh,$Zlh,$Zhl,lsl#28
270 ldrh $Tll,[$rem_4bit,$nlo] @ rem_4bit[rem]
271 eor $Zhl,$Thl,$Zhl,lsr#4
272 ldrbpl $nlo,[$Xi,$cnt]
273 eor $Zhl,$Zhl,$Zhh,lsl#28
274 eor $Zhh,$Thh,$Zhh,lsr#4
275
276 add $Thh,$Htbl,$nhi
277 and $nhi,$Zll,#0xf @ rem
278 eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
279 add $nhi,$nhi,$nhi
280 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
281 eor $Zll,$Tll,$Zll,lsr#4
282 eor $Zll,$Zll,$Zlh,lsl#28
283 eor $Zlh,$Tlh,$Zlh,lsr#4
284 ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
285 eor $Zlh,$Zlh,$Zhl,lsl#28
286 eor $Zhl,$Thl,$Zhl,lsr#4
287 eor $Zhl,$Zhl,$Zhh,lsl#28
288 eor $Zhh,$Thh,$Zhh,lsr#4
289 andpl $nhi,$nlo,#0xf0
290 andpl $nlo,$nlo,#0x0f
291 eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
292 bpl .Loop
293___
294 &Zsmash();
295$code.=<<___;
296#if __ARM_ARCH__>=5
297 ldmia sp!,{r4-r11,pc}
298#else
299 ldmia sp!,{r4-r11,lr}
300 tst lr,#1
301 moveq pc,lr @ be binary compatible with V4, yet
302 bx lr @ interoperable with Thumb ISA:-)
303#endif
304.size gcm_gmult_4bit,.-gcm_gmult_4bit
305___
306{
307my $cnt=$Htbl; # $Htbl is used once in the very beginning
308
309my ($Hhi, $Hlo, $Zo, $T, $xi, $mod) = map("d$_",(0..7));
310my ($Qhi, $Qlo, $Z, $R, $zero, $Qpost, $IN) = map("q$_",(8..15));
311
312# Z:Zo keeps 128-bit result shifted by 1 to the right, with bottom bit
313# in Zo. Or should I say "top bit", because GHASH is specified in
314# reverse bit order? Otherwise straightforward 128-bit H by one input
315# byte multiplication and modulo-reduction, times 16.
316
317sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
318sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
319sub Q() { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; }
320
321$code.=<<___;
322#if __ARM_ARCH__>=7 && !defined(__STRICT_ALIGNMENT)
323.fpu neon
324
325.global gcm_gmult_neon
326.type gcm_gmult_neon,%function
327.align 4
328gcm_gmult_neon:
329 sub $Htbl,#16 @ point at H in GCM128_CTX
330 vld1.64 `&Dhi("$IN")`,[$Xi,:64]!@ load Xi
331 vmov.i32 $mod,#0xe1 @ our irreducible polynomial
332 vld1.64 `&Dlo("$IN")`,[$Xi,:64]!
333 vshr.u64 $mod,#32
334 vldmia $Htbl,{$Hhi-$Hlo} @ load H
335 veor $zero,$zero
336#ifdef __ARMEL__
337 vrev64.8 $IN,$IN
338#endif
339 veor $Qpost,$Qpost
340 veor $R,$R
341 mov $cnt,#16
342 veor $Z,$Z
343 mov $len,#16
344 veor $Zo,$Zo
345 vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
346 b .Linner_neon
347.size gcm_gmult_neon,.-gcm_gmult_neon
348
349.global gcm_ghash_neon
350.type gcm_ghash_neon,%function
351.align 4
352gcm_ghash_neon:
353 vld1.64 `&Dhi("$Z")`,[$Xi,:64]! @ load Xi
354 vmov.i32 $mod,#0xe1 @ our irreducible polynomial
355 vld1.64 `&Dlo("$Z")`,[$Xi,:64]!
356 vshr.u64 $mod,#32
357 vldmia $Xi,{$Hhi-$Hlo} @ load H
358 veor $zero,$zero
359 nop
360#ifdef __ARMEL__
361 vrev64.8 $Z,$Z
362#endif
363.Louter_neon:
364 vld1.64 `&Dhi($IN)`,[$inp]! @ load inp
365 veor $Qpost,$Qpost
366 vld1.64 `&Dlo($IN)`,[$inp]!
367 veor $R,$R
368 mov $cnt,#16
369#ifdef __ARMEL__
370 vrev64.8 $IN,$IN
371#endif
372 veor $Zo,$Zo
373 veor $IN,$Z @ inp^=Xi
374 veor $Z,$Z
375 vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
376.Linner_neon:
377 subs $cnt,$cnt,#1
378 vmull.p8 $Qlo,$Hlo,$xi @ H.lo·Xi[i]
379 vmull.p8 $Qhi,$Hhi,$xi @ H.hi·Xi[i]
380 vext.8 $IN,$zero,#1 @ IN>>=8
381
382 veor $Z,$Qpost @ modulo-scheduled part
383 vshl.i64 `&Dlo("$R")`,#48
384 vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
385 veor $T,`&Dlo("$Qlo")`,`&Dlo("$Z")`
386
387 veor `&Dhi("$Z")`,`&Dlo("$R")`
388 vuzp.8 $Qlo,$Qhi
389 vsli.8 $Zo,$T,#1 @ compose the "carry" byte
390 vext.8 $Z,$zero,#1 @ Z>>=8
391
392 vmull.p8 $R,$Zo,$mod @ "carry"·0xe1
393 vshr.u8 $Zo,$T,#7 @ save Z's bottom bit
394 vext.8 $Qpost,$Qlo,$zero,#1 @ Qlo>>=8
395 veor $Z,$Qhi
396 bne .Linner_neon
397
398 veor $Z,$Qpost @ modulo-scheduled artefact
399 vshl.i64 `&Dlo("$R")`,#48
400 veor `&Dhi("$Z")`,`&Dlo("$R")`
401
402 @ finalization, normalize Z:Zo
403 vand $Zo,$mod @ suffices to mask the bit
404 vshr.u64 `&Dhi(&Q("$Zo"))`,`&Dlo("$Z")`,#63
405 vshl.i64 $Z,#1
406 subs $len,#16
407 vorr $Z,`&Q("$Zo")` @ Z=Z:Zo<<1
408 bne .Louter_neon
409
410#ifdef __ARMEL__
411 vrev64.8 $Z,$Z
412#endif
413 sub $Xi,#16
414 vst1.64 `&Dhi("$Z")`,[$Xi,:64]! @ write out Xi
415 vst1.64 `&Dlo("$Z")`,[$Xi,:64]
416
417 bx lr
418.size gcm_ghash_neon,.-gcm_ghash_neon
419#endif
420___
421}
422$code.=<<___;
423.asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
424.align 2
425___
426
427$code =~ s/\`([^\`]*)\`/eval $1/gem;
428$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
429print $code;
430close STDOUT; # enforce flush
diff --git a/src/lib/libcrypto/modes/asm/ghash-parisc.pl b/src/lib/libcrypto/modes/asm/ghash-parisc.pl
deleted file mode 100644
index 3f98513105..0000000000
--- a/src/lib/libcrypto/modes/asm/ghash-parisc.pl
+++ /dev/null
@@ -1,740 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# April 2010
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that it
14# uses 256 bytes per-key table [+128 bytes shared table]. On PA-7100LC
15# it processes one byte in 19.6 cycles, which is more than twice as
16# fast as code generated by gcc 3.2. PA-RISC 2.0 loop is scheduled for
17# 8 cycles, but measured performance on PA-8600 system is ~9 cycles per
18# processed byte. This is ~2.2x faster than 64-bit code generated by
19# vendor compiler (which used to be very hard to beat:-).
20#
21# Special thanks to polarhome.com for providing HP-UX account.
22
23$flavour = shift;
24$output = shift;
25open STDOUT,">$output";
26
27if ($flavour =~ /64/) {
28 $LEVEL ="2.0W";
29 $SIZE_T =8;
30 $FRAME_MARKER =80;
31 $SAVED_RP =16;
32 $PUSH ="std";
33 $PUSHMA ="std,ma";
34 $POP ="ldd";
35 $POPMB ="ldd,mb";
36 $NREGS =6;
37} else {
38 $LEVEL ="1.0"; #"\n\t.ALLOW\t2.0";
39 $SIZE_T =4;
40 $FRAME_MARKER =48;
41 $SAVED_RP =20;
42 $PUSH ="stw";
43 $PUSHMA ="stwm";
44 $POP ="ldw";
45 $POPMB ="ldwm";
46 $NREGS =11;
47}
48
49$FRAME=10*$SIZE_T+$FRAME_MARKER;# NREGS saved regs + frame marker
50 # [+ argument transfer]
51
52################# volatile registers
53$Xi="%r26"; # argument block
54$Htbl="%r25";
55$inp="%r24";
56$len="%r23";
57$Hhh=$Htbl; # variables
58$Hll="%r22";
59$Zhh="%r21";
60$Zll="%r20";
61$cnt="%r19";
62$rem_4bit="%r28";
63$rem="%r29";
64$mask0xf0="%r31";
65
66################# preserved registers
67$Thh="%r1";
68$Tll="%r2";
69$nlo="%r3";
70$nhi="%r4";
71$byte="%r5";
72if ($SIZE_T==4) {
73 $Zhl="%r6";
74 $Zlh="%r7";
75 $Hhl="%r8";
76 $Hlh="%r9";
77 $Thl="%r10";
78 $Tlh="%r11";
79}
80$rem2="%r6"; # used in PA-RISC 2.0 code
81
82$code.=<<___;
83 .LEVEL $LEVEL
84 .text
85
86 .EXPORT gcm_gmult_4bit,ENTRY,ARGW0=GR,ARGW1=GR
87 .ALIGN 64
88gcm_gmult_4bit
89 .PROC
90 .CALLINFO FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS
91 .ENTRY
92 $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
93 $PUSHMA %r3,$FRAME(%sp)
94 $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
95 $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
96 $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
97___
98$code.=<<___ if ($SIZE_T==4);
99 $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
100 $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
101 $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
102 $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
103 $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
104___
105$code.=<<___;
106 addl $inp,$len,$len
107#ifdef __PIC__
108 addil LT'L\$rem_4bit, %r19
109 ldw RT'L\$rem_4bit(%r1), $rem_4bit
110#else
111 ldil L'L\$rem_4bit, %t1
112 ldo R'L\$rem_4bit(%t1), $rem_4bit
113#endif
114 ldi 0xf0,$mask0xf0
115___
116$code.=<<___ if ($SIZE_T==4);
117#ifndef __OpenBSD__
118 ldi 31,$rem
119 mtctl $rem,%cr11
120 extrd,u,*= $rem,%sar,1,$rem ; executes on PA-RISC 1.0
121 b L\$parisc1_gmult
122 nop
123___
124
125$code.=<<___;
126 ldb 15($Xi),$nlo
127 ldo 8($Htbl),$Hll
128
129 and $mask0xf0,$nlo,$nhi
130 depd,z $nlo,59,4,$nlo
131
132 ldd $nlo($Hll),$Zll
133 ldd $nlo($Hhh),$Zhh
134
135 depd,z $Zll,60,4,$rem
136 shrpd $Zhh,$Zll,4,$Zll
137 extrd,u $Zhh,59,60,$Zhh
138 ldb 14($Xi),$nlo
139
140 ldd $nhi($Hll),$Tll
141 ldd $nhi($Hhh),$Thh
142 and $mask0xf0,$nlo,$nhi
143 depd,z $nlo,59,4,$nlo
144
145 xor $Tll,$Zll,$Zll
146 xor $Thh,$Zhh,$Zhh
147 ldd $rem($rem_4bit),$rem
148 b L\$oop_gmult_pa2
149 ldi 13,$cnt
150
151 .ALIGN 8
152L\$oop_gmult_pa2
153 xor $rem,$Zhh,$Zhh ; moved here to work around gas bug
154 depd,z $Zll,60,4,$rem
155
156 shrpd $Zhh,$Zll,4,$Zll
157 extrd,u $Zhh,59,60,$Zhh
158 ldd $nlo($Hll),$Tll
159 ldd $nlo($Hhh),$Thh
160
161 xor $Tll,$Zll,$Zll
162 xor $Thh,$Zhh,$Zhh
163 ldd $rem($rem_4bit),$rem
164
165 xor $rem,$Zhh,$Zhh
166 depd,z $Zll,60,4,$rem
167 ldbx $cnt($Xi),$nlo
168
169 shrpd $Zhh,$Zll,4,$Zll
170 extrd,u $Zhh,59,60,$Zhh
171 ldd $nhi($Hll),$Tll
172 ldd $nhi($Hhh),$Thh
173
174 and $mask0xf0,$nlo,$nhi
175 depd,z $nlo,59,4,$nlo
176 ldd $rem($rem_4bit),$rem
177
178 xor $Tll,$Zll,$Zll
179 addib,uv -1,$cnt,L\$oop_gmult_pa2
180 xor $Thh,$Zhh,$Zhh
181
182 xor $rem,$Zhh,$Zhh
183 depd,z $Zll,60,4,$rem
184
185 shrpd $Zhh,$Zll,4,$Zll
186 extrd,u $Zhh,59,60,$Zhh
187 ldd $nlo($Hll),$Tll
188 ldd $nlo($Hhh),$Thh
189
190 xor $Tll,$Zll,$Zll
191 xor $Thh,$Zhh,$Zhh
192 ldd $rem($rem_4bit),$rem
193
194 xor $rem,$Zhh,$Zhh
195 depd,z $Zll,60,4,$rem
196
197 shrpd $Zhh,$Zll,4,$Zll
198 extrd,u $Zhh,59,60,$Zhh
199 ldd $nhi($Hll),$Tll
200 ldd $nhi($Hhh),$Thh
201
202 xor $Tll,$Zll,$Zll
203 xor $Thh,$Zhh,$Zhh
204 ldd $rem($rem_4bit),$rem
205
206 xor $rem,$Zhh,$Zhh
207 std $Zll,8($Xi)
208 std $Zhh,0($Xi)
209___
210
211$code.=<<___ if ($SIZE_T==4);
212 b L\$done_gmult
213 nop
214
215L\$parisc1_gmult
216#endif
217 ldb 15($Xi),$nlo
218 ldo 12($Htbl),$Hll
219 ldo 8($Htbl),$Hlh
220 ldo 4($Htbl),$Hhl
221
222 and $mask0xf0,$nlo,$nhi
223 zdep $nlo,27,4,$nlo
224
225 ldwx $nlo($Hll),$Zll
226 ldwx $nlo($Hlh),$Zlh
227 ldwx $nlo($Hhl),$Zhl
228 ldwx $nlo($Hhh),$Zhh
229 zdep $Zll,28,4,$rem
230 ldb 14($Xi),$nlo
231 ldwx $rem($rem_4bit),$rem
232 shrpw $Zlh,$Zll,4,$Zll
233 ldwx $nhi($Hll),$Tll
234 shrpw $Zhl,$Zlh,4,$Zlh
235 ldwx $nhi($Hlh),$Tlh
236 shrpw $Zhh,$Zhl,4,$Zhl
237 ldwx $nhi($Hhl),$Thl
238 extru $Zhh,27,28,$Zhh
239 ldwx $nhi($Hhh),$Thh
240 xor $rem,$Zhh,$Zhh
241 and $mask0xf0,$nlo,$nhi
242 zdep $nlo,27,4,$nlo
243
244 xor $Tll,$Zll,$Zll
245 ldwx $nlo($Hll),$Tll
246 xor $Tlh,$Zlh,$Zlh
247 ldwx $nlo($Hlh),$Tlh
248 xor $Thl,$Zhl,$Zhl
249 b L\$oop_gmult_pa1
250 ldi 13,$cnt
251
252 .ALIGN 8
253L\$oop_gmult_pa1
254 zdep $Zll,28,4,$rem
255 ldwx $nlo($Hhl),$Thl
256 xor $Thh,$Zhh,$Zhh
257 ldwx $rem($rem_4bit),$rem
258 shrpw $Zlh,$Zll,4,$Zll
259 ldwx $nlo($Hhh),$Thh
260 shrpw $Zhl,$Zlh,4,$Zlh
261 ldbx $cnt($Xi),$nlo
262 xor $Tll,$Zll,$Zll
263 ldwx $nhi($Hll),$Tll
264 shrpw $Zhh,$Zhl,4,$Zhl
265 xor $Tlh,$Zlh,$Zlh
266 ldwx $nhi($Hlh),$Tlh
267 extru $Zhh,27,28,$Zhh
268 xor $Thl,$Zhl,$Zhl
269 ldwx $nhi($Hhl),$Thl
270 xor $rem,$Zhh,$Zhh
271 zdep $Zll,28,4,$rem
272 xor $Thh,$Zhh,$Zhh
273 ldwx $nhi($Hhh),$Thh
274 shrpw $Zlh,$Zll,4,$Zll
275 ldwx $rem($rem_4bit),$rem
276 shrpw $Zhl,$Zlh,4,$Zlh
277 shrpw $Zhh,$Zhl,4,$Zhl
278 and $mask0xf0,$nlo,$nhi
279 extru $Zhh,27,28,$Zhh
280 zdep $nlo,27,4,$nlo
281 xor $Tll,$Zll,$Zll
282 ldwx $nlo($Hll),$Tll
283 xor $Tlh,$Zlh,$Zlh
284 ldwx $nlo($Hlh),$Tlh
285 xor $rem,$Zhh,$Zhh
286 addib,uv -1,$cnt,L\$oop_gmult_pa1
287 xor $Thl,$Zhl,$Zhl
288
289 zdep $Zll,28,4,$rem
290 ldwx $nlo($Hhl),$Thl
291 xor $Thh,$Zhh,$Zhh
292 ldwx $rem($rem_4bit),$rem
293 shrpw $Zlh,$Zll,4,$Zll
294 ldwx $nlo($Hhh),$Thh
295 shrpw $Zhl,$Zlh,4,$Zlh
296 xor $Tll,$Zll,$Zll
297 ldwx $nhi($Hll),$Tll
298 shrpw $Zhh,$Zhl,4,$Zhl
299 xor $Tlh,$Zlh,$Zlh
300 ldwx $nhi($Hlh),$Tlh
301 extru $Zhh,27,28,$Zhh
302 xor $rem,$Zhh,$Zhh
303 xor $Thl,$Zhl,$Zhl
304 ldwx $nhi($Hhl),$Thl
305 xor $Thh,$Zhh,$Zhh
306 ldwx $nhi($Hhh),$Thh
307 zdep $Zll,28,4,$rem
308 ldwx $rem($rem_4bit),$rem
309 shrpw $Zlh,$Zll,4,$Zll
310 shrpw $Zhl,$Zlh,4,$Zlh
311 shrpw $Zhh,$Zhl,4,$Zhl
312 extru $Zhh,27,28,$Zhh
313 xor $Tll,$Zll,$Zll
314 xor $Tlh,$Zlh,$Zlh
315 xor $rem,$Zhh,$Zhh
316 stw $Zll,12($Xi)
317 xor $Thl,$Zhl,$Zhl
318 stw $Zlh,8($Xi)
319 xor $Thh,$Zhh,$Zhh
320 stw $Zhl,4($Xi)
321 stw $Zhh,0($Xi)
322___
323$code.=<<___;
324L\$done_gmult
325 $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
326 $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
327 $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
328 $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
329___
330$code.=<<___ if ($SIZE_T==4);
331 $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
332 $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
333 $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
334 $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
335 $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
336___
337$code.=<<___;
338 bv (%r2)
339 .EXIT
340 $POPMB -$FRAME(%sp),%r3
341 .PROCEND
342
343 .EXPORT gcm_ghash_4bit,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
344 .ALIGN 64
345gcm_ghash_4bit
346 .PROC
347 .CALLINFO FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=11
348 .ENTRY
349 $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
350 $PUSHMA %r3,$FRAME(%sp)
351 $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
352 $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
353 $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
354___
355$code.=<<___ if ($SIZE_T==4);
356 $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
357 $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
358 $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
359 $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
360 $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
361___
362$code.=<<___;
363 addl $inp,$len,$len
364#ifdef __PIC__
365 addil LT'L\$rem_4bit, %r19
366 ldw RT'L\$rem_4bit(%r1), $rem_4bit
367#else
368 ldil L'L\$rem_4bit, %t1
369 ldo R'L\$rem_4bit(%t1), $rem_4bit
370#endif
371 ldi 0xf0,$mask0xf0
372___
373$code.=<<___ if ($SIZE_T==4);
374#ifndef __OpenBSD__
375 ldi 31,$rem
376 mtctl $rem,%cr11
377 extrd,u,*= $rem,%sar,1,$rem ; executes on PA-RISC 1.0
378 b L\$parisc1_ghash
379 nop
380___
381
382$code.=<<___;
383 ldb 15($Xi),$nlo
384 ldo 8($Htbl),$Hll
385
386L\$outer_ghash_pa2
387 ldb 15($inp),$nhi
388 xor $nhi,$nlo,$nlo
389 and $mask0xf0,$nlo,$nhi
390 depd,z $nlo,59,4,$nlo
391
392 ldd $nlo($Hll),$Zll
393 ldd $nlo($Hhh),$Zhh
394
395 depd,z $Zll,60,4,$rem
396 shrpd $Zhh,$Zll,4,$Zll
397 extrd,u $Zhh,59,60,$Zhh
398 ldb 14($Xi),$nlo
399 ldb 14($inp),$byte
400
401 ldd $nhi($Hll),$Tll
402 ldd $nhi($Hhh),$Thh
403 xor $byte,$nlo,$nlo
404 and $mask0xf0,$nlo,$nhi
405 depd,z $nlo,59,4,$nlo
406
407 xor $Tll,$Zll,$Zll
408 xor $Thh,$Zhh,$Zhh
409 ldd $rem($rem_4bit),$rem
410 b L\$oop_ghash_pa2
411 ldi 13,$cnt
412
413 .ALIGN 8
414L\$oop_ghash_pa2
415 xor $rem,$Zhh,$Zhh ; moved here to work around gas bug
416 depd,z $Zll,60,4,$rem2
417
418 shrpd $Zhh,$Zll,4,$Zll
419 extrd,u $Zhh,59,60,$Zhh
420 ldd $nlo($Hll),$Tll
421 ldd $nlo($Hhh),$Thh
422
423 xor $Tll,$Zll,$Zll
424 xor $Thh,$Zhh,$Zhh
425 ldbx $cnt($Xi),$nlo
426 ldbx $cnt($inp),$byte
427
428 depd,z $Zll,60,4,$rem
429 shrpd $Zhh,$Zll,4,$Zll
430 ldd $rem2($rem_4bit),$rem2
431
432 xor $rem2,$Zhh,$Zhh
433 xor $byte,$nlo,$nlo
434 ldd $nhi($Hll),$Tll
435 ldd $nhi($Hhh),$Thh
436
437 and $mask0xf0,$nlo,$nhi
438 depd,z $nlo,59,4,$nlo
439
440 extrd,u $Zhh,59,60,$Zhh
441 xor $Tll,$Zll,$Zll
442
443 ldd $rem($rem_4bit),$rem
444 addib,uv -1,$cnt,L\$oop_ghash_pa2
445 xor $Thh,$Zhh,$Zhh
446
447 xor $rem,$Zhh,$Zhh
448 depd,z $Zll,60,4,$rem2
449
450 shrpd $Zhh,$Zll,4,$Zll
451 extrd,u $Zhh,59,60,$Zhh
452 ldd $nlo($Hll),$Tll
453 ldd $nlo($Hhh),$Thh
454
455 xor $Tll,$Zll,$Zll
456 xor $Thh,$Zhh,$Zhh
457
458 depd,z $Zll,60,4,$rem
459 shrpd $Zhh,$Zll,4,$Zll
460 ldd $rem2($rem_4bit),$rem2
461
462 xor $rem2,$Zhh,$Zhh
463 ldd $nhi($Hll),$Tll
464 ldd $nhi($Hhh),$Thh
465
466 extrd,u $Zhh,59,60,$Zhh
467 xor $Tll,$Zll,$Zll
468 xor $Thh,$Zhh,$Zhh
469 ldd $rem($rem_4bit),$rem
470
471 xor $rem,$Zhh,$Zhh
472 std $Zll,8($Xi)
473 ldo 16($inp),$inp
474 std $Zhh,0($Xi)
475 cmpb,*<> $inp,$len,L\$outer_ghash_pa2
476 copy $Zll,$nlo
477___
478
479$code.=<<___ if ($SIZE_T==4);
480 b L\$done_ghash
481 nop
482
483L\$parisc1_ghash
484#endif
485 ldb 15($Xi),$nlo
486 ldo 12($Htbl),$Hll
487 ldo 8($Htbl),$Hlh
488 ldo 4($Htbl),$Hhl
489
490L\$outer_ghash_pa1
491 ldb 15($inp),$byte
492 xor $byte,$nlo,$nlo
493 and $mask0xf0,$nlo,$nhi
494 zdep $nlo,27,4,$nlo
495
496 ldwx $nlo($Hll),$Zll
497 ldwx $nlo($Hlh),$Zlh
498 ldwx $nlo($Hhl),$Zhl
499 ldwx $nlo($Hhh),$Zhh
500 zdep $Zll,28,4,$rem
501 ldb 14($Xi),$nlo
502 ldb 14($inp),$byte
503 ldwx $rem($rem_4bit),$rem
504 shrpw $Zlh,$Zll,4,$Zll
505 ldwx $nhi($Hll),$Tll
506 shrpw $Zhl,$Zlh,4,$Zlh
507 ldwx $nhi($Hlh),$Tlh
508 shrpw $Zhh,$Zhl,4,$Zhl
509 ldwx $nhi($Hhl),$Thl
510 extru $Zhh,27,28,$Zhh
511 ldwx $nhi($Hhh),$Thh
512 xor $byte,$nlo,$nlo
513 xor $rem,$Zhh,$Zhh
514 and $mask0xf0,$nlo,$nhi
515 zdep $nlo,27,4,$nlo
516
517 xor $Tll,$Zll,$Zll
518 ldwx $nlo($Hll),$Tll
519 xor $Tlh,$Zlh,$Zlh
520 ldwx $nlo($Hlh),$Tlh
521 xor $Thl,$Zhl,$Zhl
522 b L\$oop_ghash_pa1
523 ldi 13,$cnt
524
525 .ALIGN 8
526L\$oop_ghash_pa1
527 zdep $Zll,28,4,$rem
528 ldwx $nlo($Hhl),$Thl
529 xor $Thh,$Zhh,$Zhh
530 ldwx $rem($rem_4bit),$rem
531 shrpw $Zlh,$Zll,4,$Zll
532 ldwx $nlo($Hhh),$Thh
533 shrpw $Zhl,$Zlh,4,$Zlh
534 ldbx $cnt($Xi),$nlo
535 xor $Tll,$Zll,$Zll
536 ldwx $nhi($Hll),$Tll
537 shrpw $Zhh,$Zhl,4,$Zhl
538 ldbx $cnt($inp),$byte
539 xor $Tlh,$Zlh,$Zlh
540 ldwx $nhi($Hlh),$Tlh
541 extru $Zhh,27,28,$Zhh
542 xor $Thl,$Zhl,$Zhl
543 ldwx $nhi($Hhl),$Thl
544 xor $rem,$Zhh,$Zhh
545 zdep $Zll,28,4,$rem
546 xor $Thh,$Zhh,$Zhh
547 ldwx $nhi($Hhh),$Thh
548 shrpw $Zlh,$Zll,4,$Zll
549 ldwx $rem($rem_4bit),$rem
550 shrpw $Zhl,$Zlh,4,$Zlh
551 xor $byte,$nlo,$nlo
552 shrpw $Zhh,$Zhl,4,$Zhl
553 and $mask0xf0,$nlo,$nhi
554 extru $Zhh,27,28,$Zhh
555 zdep $nlo,27,4,$nlo
556 xor $Tll,$Zll,$Zll
557 ldwx $nlo($Hll),$Tll
558 xor $Tlh,$Zlh,$Zlh
559 ldwx $nlo($Hlh),$Tlh
560 xor $rem,$Zhh,$Zhh
561 addib,uv -1,$cnt,L\$oop_ghash_pa1
562 xor $Thl,$Zhl,$Zhl
563
564 zdep $Zll,28,4,$rem
565 ldwx $nlo($Hhl),$Thl
566 xor $Thh,$Zhh,$Zhh
567 ldwx $rem($rem_4bit),$rem
568 shrpw $Zlh,$Zll,4,$Zll
569 ldwx $nlo($Hhh),$Thh
570 shrpw $Zhl,$Zlh,4,$Zlh
571 xor $Tll,$Zll,$Zll
572 ldwx $nhi($Hll),$Tll
573 shrpw $Zhh,$Zhl,4,$Zhl
574 xor $Tlh,$Zlh,$Zlh
575 ldwx $nhi($Hlh),$Tlh
576 extru $Zhh,27,28,$Zhh
577 xor $rem,$Zhh,$Zhh
578 xor $Thl,$Zhl,$Zhl
579 ldwx $nhi($Hhl),$Thl
580 xor $Thh,$Zhh,$Zhh
581 ldwx $nhi($Hhh),$Thh
582 zdep $Zll,28,4,$rem
583 ldwx $rem($rem_4bit),$rem
584 shrpw $Zlh,$Zll,4,$Zll
585 shrpw $Zhl,$Zlh,4,$Zlh
586 shrpw $Zhh,$Zhl,4,$Zhl
587 extru $Zhh,27,28,$Zhh
588 xor $Tll,$Zll,$Zll
589 xor $Tlh,$Zlh,$Zlh
590 xor $rem,$Zhh,$Zhh
591 stw $Zll,12($Xi)
592 xor $Thl,$Zhl,$Zhl
593 stw $Zlh,8($Xi)
594 xor $Thh,$Zhh,$Zhh
595 stw $Zhl,4($Xi)
596 ldo 16($inp),$inp
597 stw $Zhh,0($Xi)
598 comb,<> $inp,$len,L\$outer_ghash_pa1
599 copy $Zll,$nlo
600___
601$code.=<<___;
602L\$done_ghash
603 $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
604 $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
605 $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
606 $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
607___
608$code.=<<___ if ($SIZE_T==4);
609 $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
610 $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
611 $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
612 $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
613 $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
614___
615$code.=<<___;
616 bv (%r2)
617 .EXIT
618 $POPMB -$FRAME(%sp),%r3
619 .PROCEND
620
621 .section .rodata
622 .ALIGN 64
623L\$rem_4bit
624 .WORD `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
625 .WORD `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
626 .WORD `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
627 .WORD `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
628 .previous
629
630 .ALIGN 64
631___
632
633# Explicitly encode PA-RISC 2.0 instructions used in this module, so
634# that it can be compiled with .LEVEL 1.0. It should be noted that I
635# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
636# directive...
637
638my $ldd = sub {
639 my ($mod,$args) = @_;
640 my $orig = "ldd$mod\t$args";
641
642 if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 4
643 { my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
644 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
645 }
646 elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 5
647 { my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3;
648 $opcode|=(($1&0xF)<<17)|(($1&0x10)<<12); # encode offset
649 $opcode|=(1<<5) if ($mod =~ /^,m/);
650 $opcode|=(1<<13) if ($mod =~ /^,mb/);
651 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
652 }
653 else { "\t".$orig; }
654};
655
656my $std = sub {
657 my ($mod,$args) = @_;
658 my $orig = "std$mod\t$args";
659
660 if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices
661 { my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1);
662 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
663 }
664 else { "\t".$orig; }
665};
666
667my $extrd = sub {
668 my ($mod,$args) = @_;
669 my $orig = "extrd$mod\t$args";
670
671 # I only have ",u" completer, it's implicitly encoded...
672 if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15
673 { my $opcode=(0x36<<26)|($1<<21)|($4<<16);
674 my $len=32-$3;
675 $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos
676 $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
677 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
678 }
679 elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12
680 { my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
681 my $len=32-$2;
682 $opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len
683 $opcode |= (1<<13) if ($mod =~ /,\**=/);
684 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
685 }
686 else { "\t".$orig; }
687};
688
689my $shrpd = sub {
690 my ($mod,$args) = @_;
691 my $orig = "shrpd$mod\t$args";
692
693 if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14
694 { my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
695 my $cpos=63-$3;
696 $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa
697 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
698 }
699 elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) # format 11
700 { sprintf "\t.WORD\t0x%08x\t; %s",
701 (0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig;
702 }
703 else { "\t".$orig; }
704};
705
706my $depd = sub {
707 my ($mod,$args) = @_;
708 my $orig = "depd$mod\t$args";
709
710 # I only have ",z" completer, it's implicitly encoded...
711 if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 16
712 { my $opcode=(0x3c<<26)|($4<<21)|($1<<16);
713 my $cpos=63-$2;
714 my $len=32-$3;
715 $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode pos
716 $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
717 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
718 }
719 else { "\t".$orig; }
720};
721
722sub assemble {
723 my ($mnemonic,$mod,$args)=@_;
724 my $opcode = eval("\$$mnemonic");
725
726 ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
727}
728
729foreach (split("\n",$code)) {
730 s/\`([^\`]*)\`/eval $1/ge;
731 if ($SIZE_T==4) {
732 s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e;
733 s/cmpb,\*/comb,/;
734 s/,\*/,/;
735 }
736 s/\bbv\b/bve/ if ($SIZE_T==8);
737 print $_,"\n";
738}
739
740close STDOUT;
diff --git a/src/lib/libcrypto/modes/asm/ghash-sparcv9.pl b/src/lib/libcrypto/modes/asm/ghash-sparcv9.pl
deleted file mode 100644
index ce75045f09..0000000000
--- a/src/lib/libcrypto/modes/asm/ghash-sparcv9.pl
+++ /dev/null
@@ -1,351 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# March 2010
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that it
14# uses 256 bytes per-key table [+128 bytes shared table]. Performance
15# results are for streamed GHASH subroutine on UltraSPARC pre-Tx CPU
16# and are expressed in cycles per processed byte, less is better:
17#
18# gcc 3.3.x cc 5.2 this assembler
19#
20# 32-bit build 81.4 43.3 12.6 (+546%/+244%)
21# 64-bit build 20.2 21.2 12.6 (+60%/+68%)
22#
23# Here is data collected on UltraSPARC T1 system running Linux:
24#
25# gcc 4.4.1 this assembler
26#
27# 32-bit build 566 50 (+1000%)
28# 64-bit build 56 50 (+12%)
29#
30# I don't quite understand why the difference between 32-bit and 64-bit
31# compiler-generated code is so big. Compilers *were* instructed to
32# generate code for UltraSPARC and should have used 64-bit registers
33# for Z vector (see C code) even in 32-bit build... Oh well, it only
34# means more impressive improvement coefficients for this assembler
35# module;-) Loops are aggressively modulo-scheduled in respect to
36# references to input data and Z.hi updates to achieve 12 cycles
37# timing. To anchor to something else, sha1-sparcv9.pl spends 11.6
38# cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1.
39
40$bits=32;
41for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
42if ($bits==64) { $bias=2047; $frame=192; }
43else { $bias=0; $frame=112; }
44
45$output=shift;
46open STDOUT,">$output";
47
48$Zhi="%o0"; # 64-bit values
49$Zlo="%o1";
50$Thi="%o2";
51$Tlo="%o3";
52$rem="%o4";
53$tmp="%o5";
54
55$nhi="%l0"; # small values and pointers
56$nlo="%l1";
57$xi0="%l2";
58$xi1="%l3";
59$rem_4bit="%l4";
60$remi="%l5";
61$Htblo="%l6";
62$cnt="%l7";
63
64$Xi="%i0"; # input argument block
65$Htbl="%i1";
66$inp="%i2";
67$len="%i3";
68
69$code.=<<___;
70.section ".rodata",#alloc
71
72.align 64
73rem_4bit:
74 .long `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
75 .long `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
76 .long `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
77 .long `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
78.type rem_4bit,#object
79.size rem_4bit,(.-rem_4bit)
80
81.section ".text",#alloc,#execinstr
82.globl gcm_ghash_4bit
83.align 32
84gcm_ghash_4bit:
85 save %sp,-$frame,%sp
86#ifdef __PIC__
87 sethi %hi(_GLOBAL_OFFSET_TABLE_-4), $tmp
88 rd %pc, $rem
89 or $tmp, %lo(_GLOBAL_OFFSET_TABLE_+4), $tmp
90 add $tmp, $rem, $tmp
91#endif
92
93 ldub [$inp+15],$nlo
94 ldub [$Xi+15],$xi0
95 ldub [$Xi+14],$xi1
96 add $len,$inp,$len
97 add $Htbl,8,$Htblo
98
99#ifdef __PIC__
100 set rem_4bit, $rem_4bit
101 ldx [$rem_4bit+$tmp], $rem_4bit
102#else
103 set rem_4bit, $rem_4bit
104#endif
105
106.Louter:
107 xor $xi0,$nlo,$nlo
108 and $nlo,0xf0,$nhi
109 and $nlo,0x0f,$nlo
110 sll $nlo,4,$nlo
111 ldx [$Htblo+$nlo],$Zlo
112 ldx [$Htbl+$nlo],$Zhi
113
114 ldub [$inp+14],$nlo
115
116 ldx [$Htblo+$nhi],$Tlo
117 and $Zlo,0xf,$remi
118 ldx [$Htbl+$nhi],$Thi
119 sll $remi,3,$remi
120 ldx [$rem_4bit+$remi],$rem
121 srlx $Zlo,4,$Zlo
122 mov 13,$cnt
123 sllx $Zhi,60,$tmp
124 xor $Tlo,$Zlo,$Zlo
125 srlx $Zhi,4,$Zhi
126 xor $Zlo,$tmp,$Zlo
127
128 xor $xi1,$nlo,$nlo
129 and $Zlo,0xf,$remi
130 and $nlo,0xf0,$nhi
131 and $nlo,0x0f,$nlo
132 ba .Lghash_inner
133 sll $nlo,4,$nlo
134.align 32
135.Lghash_inner:
136 ldx [$Htblo+$nlo],$Tlo
137 sll $remi,3,$remi
138 xor $Thi,$Zhi,$Zhi
139 ldx [$Htbl+$nlo],$Thi
140 srlx $Zlo,4,$Zlo
141 xor $rem,$Zhi,$Zhi
142 ldx [$rem_4bit+$remi],$rem
143 sllx $Zhi,60,$tmp
144 xor $Tlo,$Zlo,$Zlo
145 ldub [$inp+$cnt],$nlo
146 srlx $Zhi,4,$Zhi
147 xor $Zlo,$tmp,$Zlo
148 ldub [$Xi+$cnt],$xi1
149 xor $Thi,$Zhi,$Zhi
150 and $Zlo,0xf,$remi
151
152 ldx [$Htblo+$nhi],$Tlo
153 sll $remi,3,$remi
154 xor $rem,$Zhi,$Zhi
155 ldx [$Htbl+$nhi],$Thi
156 srlx $Zlo,4,$Zlo
157 ldx [$rem_4bit+$remi],$rem
158 sllx $Zhi,60,$tmp
159 xor $xi1,$nlo,$nlo
160 srlx $Zhi,4,$Zhi
161 and $nlo,0xf0,$nhi
162 addcc $cnt,-1,$cnt
163 xor $Zlo,$tmp,$Zlo
164 and $nlo,0x0f,$nlo
165 xor $Tlo,$Zlo,$Zlo
166 sll $nlo,4,$nlo
167 blu .Lghash_inner
168 and $Zlo,0xf,$remi
169
170 ldx [$Htblo+$nlo],$Tlo
171 sll $remi,3,$remi
172 xor $Thi,$Zhi,$Zhi
173 ldx [$Htbl+$nlo],$Thi
174 srlx $Zlo,4,$Zlo
175 xor $rem,$Zhi,$Zhi
176 ldx [$rem_4bit+$remi],$rem
177 sllx $Zhi,60,$tmp
178 xor $Tlo,$Zlo,$Zlo
179 srlx $Zhi,4,$Zhi
180 xor $Zlo,$tmp,$Zlo
181 xor $Thi,$Zhi,$Zhi
182
183 add $inp,16,$inp
184 cmp $inp,$len
185 be,pn `$bits==64?"%xcc":"%icc"`,.Ldone
186 and $Zlo,0xf,$remi
187
188 ldx [$Htblo+$nhi],$Tlo
189 sll $remi,3,$remi
190 xor $rem,$Zhi,$Zhi
191 ldx [$Htbl+$nhi],$Thi
192 srlx $Zlo,4,$Zlo
193 ldx [$rem_4bit+$remi],$rem
194 sllx $Zhi,60,$tmp
195 xor $Tlo,$Zlo,$Zlo
196 ldub [$inp+15],$nlo
197 srlx $Zhi,4,$Zhi
198 xor $Zlo,$tmp,$Zlo
199 xor $Thi,$Zhi,$Zhi
200 stx $Zlo,[$Xi+8]
201 xor $rem,$Zhi,$Zhi
202 stx $Zhi,[$Xi]
203 srl $Zlo,8,$xi1
204 and $Zlo,0xff,$xi0
205 ba .Louter
206 and $xi1,0xff,$xi1
207.align 32
208.Ldone:
209 ldx [$Htblo+$nhi],$Tlo
210 sll $remi,3,$remi
211 xor $rem,$Zhi,$Zhi
212 ldx [$Htbl+$nhi],$Thi
213 srlx $Zlo,4,$Zlo
214 ldx [$rem_4bit+$remi],$rem
215 sllx $Zhi,60,$tmp
216 xor $Tlo,$Zlo,$Zlo
217 srlx $Zhi,4,$Zhi
218 xor $Zlo,$tmp,$Zlo
219 xor $Thi,$Zhi,$Zhi
220 stx $Zlo,[$Xi+8]
221 xor $rem,$Zhi,$Zhi
222 stx $Zhi,[$Xi]
223
224 ret
225 restore
226.type gcm_ghash_4bit,#function
227.size gcm_ghash_4bit,(.-gcm_ghash_4bit)
228___
229
230undef $inp;
231undef $len;
232
233$code.=<<___;
234.globl gcm_gmult_4bit
235.align 32
236gcm_gmult_4bit:
237 save %sp,-$frame,%sp
238#ifdef __PIC__
239 sethi %hi(_GLOBAL_OFFSET_TABLE_-4), $tmp
240 rd %pc, $rem
241 or $tmp, %lo(_GLOBAL_OFFSET_TABLE_+4), $tmp
242 add $tmp, $rem, $tmp
243#endif
244
245 ldub [$Xi+15],$nlo
246 add $Htbl,8,$Htblo
247
248#ifdef __PIC__
249 set rem_4bit, $rem_4bit
250 ldx [$rem_4bit+$tmp], $rem_4bit
251#else
252 set rem_4bit, $rem_4bit
253#endif
254
255 and $nlo,0xf0,$nhi
256 and $nlo,0x0f,$nlo
257 sll $nlo,4,$nlo
258 ldx [$Htblo+$nlo],$Zlo
259 ldx [$Htbl+$nlo],$Zhi
260
261 ldub [$Xi+14],$nlo
262
263 ldx [$Htblo+$nhi],$Tlo
264 and $Zlo,0xf,$remi
265 ldx [$Htbl+$nhi],$Thi
266 sll $remi,3,$remi
267 ldx [$rem_4bit+$remi],$rem
268 srlx $Zlo,4,$Zlo
269 mov 13,$cnt
270 sllx $Zhi,60,$tmp
271 xor $Tlo,$Zlo,$Zlo
272 srlx $Zhi,4,$Zhi
273 xor $Zlo,$tmp,$Zlo
274
275 and $Zlo,0xf,$remi
276 and $nlo,0xf0,$nhi
277 and $nlo,0x0f,$nlo
278 ba .Lgmult_inner
279 sll $nlo,4,$nlo
280.align 32
281.Lgmult_inner:
282 ldx [$Htblo+$nlo],$Tlo
283 sll $remi,3,$remi
284 xor $Thi,$Zhi,$Zhi
285 ldx [$Htbl+$nlo],$Thi
286 srlx $Zlo,4,$Zlo
287 xor $rem,$Zhi,$Zhi
288 ldx [$rem_4bit+$remi],$rem
289 sllx $Zhi,60,$tmp
290 xor $Tlo,$Zlo,$Zlo
291 ldub [$Xi+$cnt],$nlo
292 srlx $Zhi,4,$Zhi
293 xor $Zlo,$tmp,$Zlo
294 xor $Thi,$Zhi,$Zhi
295 and $Zlo,0xf,$remi
296
297 ldx [$Htblo+$nhi],$Tlo
298 sll $remi,3,$remi
299 xor $rem,$Zhi,$Zhi
300 ldx [$Htbl+$nhi],$Thi
301 srlx $Zlo,4,$Zlo
302 ldx [$rem_4bit+$remi],$rem
303 sllx $Zhi,60,$tmp
304 srlx $Zhi,4,$Zhi
305 and $nlo,0xf0,$nhi
306 addcc $cnt,-1,$cnt
307 xor $Zlo,$tmp,$Zlo
308 and $nlo,0x0f,$nlo
309 xor $Tlo,$Zlo,$Zlo
310 sll $nlo,4,$nlo
311 blu .Lgmult_inner
312 and $Zlo,0xf,$remi
313
314 ldx [$Htblo+$nlo],$Tlo
315 sll $remi,3,$remi
316 xor $Thi,$Zhi,$Zhi
317 ldx [$Htbl+$nlo],$Thi
318 srlx $Zlo,4,$Zlo
319 xor $rem,$Zhi,$Zhi
320 ldx [$rem_4bit+$remi],$rem
321 sllx $Zhi,60,$tmp
322 xor $Tlo,$Zlo,$Zlo
323 srlx $Zhi,4,$Zhi
324 xor $Zlo,$tmp,$Zlo
325 xor $Thi,$Zhi,$Zhi
326 and $Zlo,0xf,$remi
327
328 ldx [$Htblo+$nhi],$Tlo
329 sll $remi,3,$remi
330 xor $rem,$Zhi,$Zhi
331 ldx [$Htbl+$nhi],$Thi
332 srlx $Zlo,4,$Zlo
333 ldx [$rem_4bit+$remi],$rem
334 sllx $Zhi,60,$tmp
335 xor $Tlo,$Zlo,$Zlo
336 srlx $Zhi,4,$Zhi
337 xor $Zlo,$tmp,$Zlo
338 xor $Thi,$Zhi,$Zhi
339 stx $Zlo,[$Xi+8]
340 xor $rem,$Zhi,$Zhi
341 stx $Zhi,[$Xi]
342
343 ret
344 restore
345.type gcm_gmult_4bit,#function
346.size gcm_gmult_4bit,(.-gcm_gmult_4bit)
347___
348
349$code =~ s/\`([^\`]*)\`/eval $1/gem;
350print $code;
351close STDOUT;
diff --git a/src/lib/libcrypto/modes/asm/ghash-x86.pl b/src/lib/libcrypto/modes/asm/ghash-x86.pl
deleted file mode 100644
index 47833582b6..0000000000
--- a/src/lib/libcrypto/modes/asm/ghash-x86.pl
+++ /dev/null
@@ -1,1326 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# March, May, June 2010
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that it
14# uses 256 bytes per-key table [+64/128 bytes fixed table]. It has two
15# code paths: vanilla x86 and vanilla MMX. The former will be executed on
16# 486 and Pentium, the latter on all others. MMX GHASH features the so-called
17# "528B" variant of "4-bit" method utilizing additional 256+16 bytes
18# of per-key storage [+512 bytes shared table]. Performance results
19# are for streamed GHASH subroutine and are expressed in cycles per
20# processed byte, less is better:
21#
22# gcc 2.95.3(*) MMX assembler x86 assembler
23#
24# Pentium 105/111(**) - 50
25# PIII 68 /75 12.2 24
26# P4 125/125 17.8 84(***)
27# Opteron 66 /70 10.1 30
28# Core2 54 /67 8.4 18
29#
30# (*) gcc 3.4.x was observed to generate a few percent slower code,
31# which is one of reasons why 2.95.3 results were chosen,
32# another reason is lack of 3.4.x results for older CPUs;
33# comparison with MMX results is not completely fair, because C
34# results are for vanilla "256B" implementation, while
35# assembler results are for "528B";-)
36# (**) second number is result for code compiled with -fPIC flag,
37# which is actually more relevant, because assembler code is
38# position-independent;
39# (***) see comment in non-MMX routine for further details;
40#
41# To summarize, it's >2-5 times faster than gcc-generated code. To
42# anchor it to something else SHA1 assembler processes one byte in
43# 11-13 cycles on contemporary x86 cores. As for choice of MMX in
44# particular, see comment at the end of the file...
45
46# May 2010
47#
48# Add PCLMULQDQ version performing at 2.10 cycles per processed byte.
49# The question is how close is it to theoretical limit? The pclmulqdq
50# instruction latency appears to be 14 cycles and there can't be more
51# than 2 of them executing at any given time. This means that single
52# Karatsuba multiplication would take 28 cycles *plus* few cycles for
53# pre- and post-processing. Then multiplication has to be followed by
54# modulo-reduction. Given that aggregated reduction method [see
55# "Carry-less Multiplication and Its Usage for Computing the GCM Mode"
56# white paper by Intel] allows you to perform reduction only once in
57# a while we can assume that asymptotic performance can be estimated
58# as (28+Tmod/Naggr)/16, where Tmod is time to perform reduction
59# and Naggr is the aggregation factor.
60#
61# Before we proceed to this implementation let's have closer look at
62# the best-performing code suggested by Intel in their white paper.
63# By tracing inter-register dependencies Tmod is estimated as ~19
64# cycles and Naggr chosen by Intel is 4, resulting in 2.05 cycles per
65# processed byte. As implied, this is quite optimistic estimate,
66# because it does not account for Karatsuba pre- and post-processing,
67# which for a single multiplication is ~5 cycles. Unfortunately Intel
68# does not provide performance data for GHASH alone. But benchmarking
69# AES_GCM_encrypt ripped out of Fig. 15 of the white paper with aadt
70# alone resulted in 2.46 cycles per byte out of a 16KB buffer. Note that
71# the result accounts even for pre-computing of degrees of the hash
72# key H, but its portion is negligible at 16KB buffer size.
73#
74# Moving on to the implementation in question. Tmod is estimated as
75# ~13 cycles and Naggr is 2, giving asymptotic performance of ...
76# 2.16. How is it possible that measured performance is better than
77# optimistic theoretical estimate? There is one thing Intel failed
78# to recognize. By serializing GHASH with CTR in the same subroutine, the
79# former's performance is really limited by the above (Tmul + Tmod/Naggr)
80# equation. But if GHASH procedure is detached, the modulo-reduction
81# can be interleaved with Naggr-1 multiplications at instruction level
82# and under ideal conditions even disappear from the equation. So that
83# optimistic theoretical estimate for this implementation is ...
84# 28/16=1.75, and not 2.16. Well, it's probably way too optimistic,
85# at least for such small Naggr. I'd argue that (28+Tproc/Naggr),
86# where Tproc is time required for Karatsuba pre- and post-processing,
87# is more realistic estimate. In this case it gives ... 1.91 cycles.
88# Or in other words, depending on how well we can interleave reduction
89# and one of the two multiplications the performance should be between
90# 1.91 and 2.16. As already mentioned, this implementation processes
91# one byte out of an 8KB buffer in 2.10 cycles, while the x86_64 counterpart
92# does it in 2.02. x86_64 performance is better, because the larger register
93# bank allows reduction and multiplication to be interleaved better.
94#
95# Does it make sense to increase Naggr? To start with it's virtually
96# impossible in 32-bit mode, because of limited register bank
97# capacity. Otherwise improvement has to be weighed against slower
98# setup, as well as code size and complexity increase. As even
99# optimistic estimate doesn't promise 30% performance improvement,
100# there are currently no plans to increase Naggr.
101#
102# Special thanks to David Woodhouse <dwmw2@infradead.org> for
103# providing access to a Westmere-based system on behalf of Intel
104# Open Source Technology Centre.
105
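# Not part of the original module: the cycles-per-byte figures quoted
# above all follow from the same (Tmul + Tmod/Naggr)/16 model, with
# Tmul = 2*14 = 28 cycles for one Karatsuba multiplication; a few
# throwaway lines of Perl reproduce them.

my $Tmul = 28;
printf "Intel white paper (Tmod~19, Naggr=4): %.2f cpb\n", ($Tmul + 19/4)/16;  # ~2.05
printf "this module       (Tmod~13, Naggr=2): %.2f cpb\n", ($Tmul + 13/2)/16;  # ~2.16
printf "reduction fully hidden:               %.2f cpb\n",  $Tmul/16;          # 1.75
printf "with Tproc~5 instead of Tmod:         %.2f cpb\n", ($Tmul +  5/2)/16;  # ~1.91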
106# January 2010
107#
108# Tweaked to optimize transitions between integer and FP operations
109# on same XMM register, PCLMULQDQ subroutine was measured to process
110# one byte in 2.07 cycles on Sandy Bridge, and in 2.12 - on Westmere.
111# The minor regression on Westmere is outweighed by ~15% improvement
112# on Sandy Bridge. Strangely enough attempt to modify 64-bit code in
113# similar manner resulted in almost 20% degradation on Sandy Bridge,
114# where original 64-bit code processes one byte in 1.95 cycles.
115
116$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
117push(@INC,"${dir}","${dir}../../perlasm");
118require "x86asm.pl";
119
120&asm_init($ARGV[0],"ghash-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");
121
122$sse2=0;
123for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
124
125($Zhh,$Zhl,$Zlh,$Zll) = ("ebp","edx","ecx","ebx");
126$inp = "edi";
127$Htbl = "esi";
128
129$unroll = 0; # Affects x86 loop. Folded loop performs ~7% worse
130 # than unrolled, which has to be weighed against
131 # 2.5x x86-specific code size reduction.
132
133sub x86_loop {
134 my $off = shift;
135 my $rem = "eax";
136
137 &mov ($Zhh,&DWP(4,$Htbl,$Zll));
138 &mov ($Zhl,&DWP(0,$Htbl,$Zll));
139 &mov ($Zlh,&DWP(12,$Htbl,$Zll));
140 &mov ($Zll,&DWP(8,$Htbl,$Zll));
141 &xor ($rem,$rem); # avoid partial register stalls on PIII
142
143 # shrd practically kills P4, 2.5x deterioration, but P4 has
144# MMX code-path to execute. shrd runs a tad faster [than twice
145# the shifts, moves and ors] on pre-MMX Pentium (as well as
146# PIII and Core2), *but* minimizes code size, spares a register and
147# thus allows the loop to be folded (see the C sketch after this sub)...
148 if (!$unroll) {
149 my $cnt = $inp;
150 &mov ($cnt,15);
151 &jmp (&label("x86_loop"));
152 &set_label("x86_loop",16);
153 for($i=1;$i<=2;$i++) {
154 &mov (&LB($rem),&LB($Zll));
155 &shrd ($Zll,$Zlh,4);
156 &and (&LB($rem),0xf);
157 &shrd ($Zlh,$Zhl,4);
158 &shrd ($Zhl,$Zhh,4);
159 &shr ($Zhh,4);
160 &xor ($Zhh,&DWP($off+16,"esp",$rem,4));
161
162 &mov (&LB($rem),&BP($off,"esp",$cnt));
163 if ($i&1) {
164 &and (&LB($rem),0xf0);
165 } else {
166 &shl (&LB($rem),4);
167 }
168
169 &xor ($Zll,&DWP(8,$Htbl,$rem));
170 &xor ($Zlh,&DWP(12,$Htbl,$rem));
171 &xor ($Zhl,&DWP(0,$Htbl,$rem));
172 &xor ($Zhh,&DWP(4,$Htbl,$rem));
173
174 if ($i&1) {
175 &dec ($cnt);
176 &js (&label("x86_break"));
177 } else {
178 &jmp (&label("x86_loop"));
179 }
180 }
181 &set_label("x86_break",16);
182 } else {
183 for($i=1;$i<32;$i++) {
184 &comment($i);
185 &mov (&LB($rem),&LB($Zll));
186 &shrd ($Zll,$Zlh,4);
187 &and (&LB($rem),0xf);
188 &shrd ($Zlh,$Zhl,4);
189 &shrd ($Zhl,$Zhh,4);
190 &shr ($Zhh,4);
191 &xor ($Zhh,&DWP($off+16,"esp",$rem,4));
192
193 if ($i&1) {
194 &mov (&LB($rem),&BP($off+15-($i>>1),"esp"));
195 &and (&LB($rem),0xf0);
196 } else {
197 &mov (&LB($rem),&BP($off+15-($i>>1),"esp"));
198 &shl (&LB($rem),4);
199 }
200
201 &xor ($Zll,&DWP(8,$Htbl,$rem));
202 &xor ($Zlh,&DWP(12,$Htbl,$rem));
203 &xor ($Zhl,&DWP(0,$Htbl,$rem));
204 &xor ($Zhh,&DWP(4,$Htbl,$rem));
205 }
206 }
207 &bswap ($Zll);
208 &bswap ($Zlh);
209 &bswap ($Zhl);
210 if (!$x86only) {
211 &bswap ($Zhh);
212 } else {
213 &mov ("eax",$Zhh);
214 &bswap ("eax");
215 &mov ($Zhh,"eax");
216 }
217}
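For readers following the shrd chain at the top of x86_loop(): on four 32-bit limbs the
per-nibble step is a 4-bit right shift of the whole 128-bit accumulator plus a
table-driven fix-up of the nibble that falls off the low end. A minimal C sketch
(function and parameter names here are made up for illustration; rem4[] stands for the
constants that deposit_rem_4bit() places on the stack):

    #include <stdint.h>

    /*
     * Shift the 128-bit accumulator Z = {zhh,zhl,zlh,zll} right by 4 bits
     * and fold the dropped nibble back in, mirroring the shrd/shr/xor
     * sequence above.
     */
    static void
    ghash_shift4(uint32_t *zhh, uint32_t *zhl, uint32_t *zlh, uint32_t *zll,
        const uint32_t rem4[16])
    {
            uint32_t rem = *zll & 0xf;              /* nibble shifted out */

            *zll = (*zll >> 4) | (*zlh << 28);      /* &shrd($Zll,$Zlh,4) */
            *zlh = (*zlh >> 4) | (*zhl << 28);      /* &shrd($Zlh,$Zhl,4) */
            *zhl = (*zhl >> 4) | (*zhh << 28);      /* &shrd($Zhl,$Zhh,4) */
            *zhh >>= 4;                             /* &shr($Zhh,4)       */

            *zhh ^= rem4[rem];                      /* &xor($Zhh, rem_4bit slot) */
    }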
218
219if ($unroll) {
220 &function_begin_B("_x86_gmult_4bit_inner");
221 &x86_loop(4);
222 &ret ();
223 &function_end_B("_x86_gmult_4bit_inner");
224}
225
226sub deposit_rem_4bit {
227 my $bias = shift;
228
229 &mov (&DWP($bias+0, "esp"),0x0000<<16);
230 &mov (&DWP($bias+4, "esp"),0x1C20<<16);
231 &mov (&DWP($bias+8, "esp"),0x3840<<16);
232 &mov (&DWP($bias+12,"esp"),0x2460<<16);
233 &mov (&DWP($bias+16,"esp"),0x7080<<16);
234 &mov (&DWP($bias+20,"esp"),0x6CA0<<16);
235 &mov (&DWP($bias+24,"esp"),0x48C0<<16);
236 &mov (&DWP($bias+28,"esp"),0x54E0<<16);
237 &mov (&DWP($bias+32,"esp"),0xE100<<16);
238 &mov (&DWP($bias+36,"esp"),0xFD20<<16);
239 &mov (&DWP($bias+40,"esp"),0xD940<<16);
240 &mov (&DWP($bias+44,"esp"),0xC560<<16);
241 &mov (&DWP($bias+48,"esp"),0x9180<<16);
242 &mov (&DWP($bias+52,"esp"),0x8DA0<<16);
243 &mov (&DWP($bias+56,"esp"),0xA9C0<<16);
244 &mov (&DWP($bias+60,"esp"),0xB5E0<<16);
245}
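The constants deposited above are not magic: each 16-bit value is the carry-less (GF(2))
product of the nibble index with 0x1C2 (cf. the 0x1c2_polynomial bytes in the rodata
below), shifted left by 4 for the nibble position; the <<16 merely parks it in the upper
half of each 32-bit stack slot. A small stand-alone C sketch, written here for
illustration only, reproduces the sixteen values:

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
            for (int n = 0; n < 16; n++) {
                    uint32_t v = 0;

                    for (int bit = 0; bit < 4; bit++)
                            if (n & (1 << bit))
                                    v ^= 0x1C2u << bit;     /* GF(2) multiply by 0x1C2 */
                    printf("0x%04X\n", (unsigned)(v << 4)); /* 0x0000, 0x1C20, 0x3840, ... */
            }
            return 0;
    }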
246
247$suffix = $x86only ? "" : "_x86";
248
249&function_begin("gcm_gmult_4bit".$suffix);
250 &stack_push(16+4+1); # +1 for stack alignment
251 &mov ($inp,&wparam(0)); # load Xi
252 &mov ($Htbl,&wparam(1)); # load Htable
253
254 &mov ($Zhh,&DWP(0,$inp)); # load Xi[16]
255 &mov ($Zhl,&DWP(4,$inp));
256 &mov ($Zlh,&DWP(8,$inp));
257 &mov ($Zll,&DWP(12,$inp));
258
259 &deposit_rem_4bit(16);
260
261 &mov (&DWP(0,"esp"),$Zhh); # copy Xi[16] on stack
262 &mov (&DWP(4,"esp"),$Zhl);
263 &mov (&DWP(8,"esp"),$Zlh);
264 &mov (&DWP(12,"esp"),$Zll);
265 &shr ($Zll,20);
266 &and ($Zll,0xf0);
267
268 if ($unroll) {
269 &call ("_x86_gmult_4bit_inner");
270 } else {
271 &x86_loop(0);
272 &mov ($inp,&wparam(0));
273 }
274
275 &mov (&DWP(12,$inp),$Zll);
276 &mov (&DWP(8,$inp),$Zlh);
277 &mov (&DWP(4,$inp),$Zhl);
278 &mov (&DWP(0,$inp),$Zhh);
279 &stack_pop(16+4+1);
280&function_end("gcm_gmult_4bit".$suffix);
281
282&function_begin("gcm_ghash_4bit".$suffix);
283 &stack_push(16+4+1); # +1 for 64-bit alignment
284 &mov ($Zll,&wparam(0)); # load Xi
285 &mov ($Htbl,&wparam(1)); # load Htable
286 &mov ($inp,&wparam(2)); # load in
287 &mov ("ecx",&wparam(3)); # load len
288 &add ("ecx",$inp);
289 &mov (&wparam(3),"ecx");
290
291 &mov ($Zhh,&DWP(0,$Zll)); # load Xi[16]
292 &mov ($Zhl,&DWP(4,$Zll));
293 &mov ($Zlh,&DWP(8,$Zll));
294 &mov ($Zll,&DWP(12,$Zll));
295
296 &deposit_rem_4bit(16);
297
298 &set_label("x86_outer_loop",16);
299 &xor ($Zll,&DWP(12,$inp)); # xor with input
300 &xor ($Zlh,&DWP(8,$inp));
301 &xor ($Zhl,&DWP(4,$inp));
302 &xor ($Zhh,&DWP(0,$inp));
303 &mov (&DWP(12,"esp"),$Zll); # dump it on stack
304 &mov (&DWP(8,"esp"),$Zlh);
305 &mov (&DWP(4,"esp"),$Zhl);
306 &mov (&DWP(0,"esp"),$Zhh);
307
308 &shr ($Zll,20);
309 &and ($Zll,0xf0);
310
311 if ($unroll) {
312 &call ("_x86_gmult_4bit_inner");
313 } else {
314 &x86_loop(0);
315 &mov ($inp,&wparam(2));
316 }
317 &lea ($inp,&DWP(16,$inp));
318 &cmp ($inp,&wparam(3));
319 &mov (&wparam(2),$inp) if (!$unroll);
320 &jb (&label("x86_outer_loop"));
321
322 &mov ($inp,&wparam(0)); # load Xi
323 &mov (&DWP(12,$inp),$Zll);
324 &mov (&DWP(8,$inp),$Zlh);
325 &mov (&DWP(4,$inp),$Zhl);
326 &mov (&DWP(0,$inp),$Zhh);
327 &stack_pop(16+4+1);
328&function_end("gcm_ghash_4bit".$suffix);
329
330if (!$x86only) {{{
331
332&static_label("rem_4bit");
333
334if (!$sse2) {{ # pure-MMX "May" version...
335
336$S=12; # shift factor for rem_4bit
337
338&function_begin_B("_mmx_gmult_4bit_inner");
339# MMX version performs 3.5 times better on P4 (see comment in non-MMX
340# routine for further details), 100% better on Opteron, ~70% better
341# on Core2 and PIII... In other words effort is considered to be well
342# spent... Since initial release the loop was unrolled in order to
343# "liberate" register previously used as loop counter. Instead it's
344# used to optimize critical path in 'Z.hi ^= rem_4bit[Z.lo&0xf]'.
345# The path involves move of Z.lo from MMX to integer register,
346# effective address calculation and finally merge of value to Z.hi.
347# Reference to rem_4bit is scheduled so late that I had to >>4
348# rem_4bit elements. This resulted in a 20-45% improvement on
349# contemporary µ-archs (a plain C sketch of the algorithm follows this routine).
350{
351 my $cnt;
352 my $rem_4bit = "eax";
353 my @rem = ($Zhh,$Zll);
354 my $nhi = $Zhl;
355 my $nlo = $Zlh;
356
357 my ($Zlo,$Zhi) = ("mm0","mm1");
358 my $tmp = "mm2";
359
360 &xor ($nlo,$nlo); # avoid partial register stalls on PIII
361 &mov ($nhi,$Zll);
362 &mov (&LB($nlo),&LB($nhi));
363 &shl (&LB($nlo),4);
364 &and ($nhi,0xf0);
365 &movq ($Zlo,&QWP(8,$Htbl,$nlo));
366 &movq ($Zhi,&QWP(0,$Htbl,$nlo));
367 &movd ($rem[0],$Zlo);
368
369 for ($cnt=28;$cnt>=-2;$cnt--) {
370 my $odd = $cnt&1;
371 my $nix = $odd ? $nlo : $nhi;
372
373 &shl (&LB($nlo),4) if ($odd);
374 &psrlq ($Zlo,4);
375 &movq ($tmp,$Zhi);
376 &psrlq ($Zhi,4);
377 &pxor ($Zlo,&QWP(8,$Htbl,$nix));
378 &mov (&LB($nlo),&BP($cnt/2,$inp)) if (!$odd && $cnt>=0);
379 &psllq ($tmp,60);
380 &and ($nhi,0xf0) if ($odd);
381 &pxor ($Zhi,&QWP(0,$rem_4bit,$rem[1],8)) if ($cnt<28);
382 &and ($rem[0],0xf);
383 &pxor ($Zhi,&QWP(0,$Htbl,$nix));
384 &mov ($nhi,$nlo) if (!$odd && $cnt>=0);
385 &movd ($rem[1],$Zlo);
386 &pxor ($Zlo,$tmp);
387
388 push (@rem,shift(@rem)); # "rotate" registers
389 }
390
391 &mov ($inp,&DWP(4,$rem_4bit,$rem[1],8)); # last rem_4bit[rem]
392
393 &psrlq ($Zlo,32); # lower part of Zlo is already there
394 &movd ($Zhl,$Zhi);
395 &psrlq ($Zhi,32);
396 &movd ($Zlh,$Zlo);
397 &movd ($Zhh,$Zhi);
398 &shl ($inp,4); # compensate for rem_4bit[i] being >>4
399
400 &bswap ($Zll);
401 &bswap ($Zhl);
402 &bswap ($Zlh);
403 &xor ($Zhh,$inp);
404 &bswap ($Zhh);
405
406 &ret ();
407}
408&function_end_B("_mmx_gmult_4bit_inner");
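For orientation, the routine above computes the same thing as the portable "4-bit"
multiplication in gcm128.c, just modulo-scheduled for MMX. A rough 64-bit C sketch of
that reference computation (the u128 struct, the function name and the byte-order
handling are simplified here for illustration and are not part of this file):

    #include <stdint.h>

    typedef struct { uint64_t hi, lo; } u128;

    static const uint16_t rem_4bit[16] = {
            0x0000, 0x1C20, 0x3840, 0x2460, 0x7080, 0x6CA0, 0x48C0, 0x54E0,
            0xE100, 0xFD20, 0xD940, 0xC560, 0x9180, 0x8DA0, 0xA9C0, 0xB5E0,
    };

    /*
     * Xi = Xi * H, one nibble at a time starting from Xi[15].
     * Htable[n] holds the multiple of H selected by nibble n,
     * precomputed at key setup.
     */
    static void
    gmult_4bit_ref(uint8_t Xi[16], const u128 Htable[16])
    {
            u128 Z;
            unsigned int nlo, nhi, rem;
            int cnt;

            nlo = Xi[15] & 0xf;
            nhi = Xi[15] >> 4;
            Z = Htable[nlo];

            for (cnt = 15;;) {
                    /* shift Z right by 4, reduce, fold in H[nhi] */
                    rem = Z.lo & 0xf;
                    Z.lo = (Z.hi << 60) | (Z.lo >> 4);
                    Z.hi = (Z.hi >> 4) ^ ((uint64_t)rem_4bit[rem] << 48);
                    Z.hi ^= Htable[nhi].hi;
                    Z.lo ^= Htable[nhi].lo;

                    if (--cnt < 0)
                            break;

                    nlo = Xi[cnt] & 0xf;
                    nhi = Xi[cnt] >> 4;

                    /* same again for the low nibble of the next byte */
                    rem = Z.lo & 0xf;
                    Z.lo = (Z.hi << 60) | (Z.lo >> 4);
                    Z.hi = (Z.hi >> 4) ^ ((uint64_t)rem_4bit[rem] << 48);
                    Z.hi ^= Htable[nlo].hi;
                    Z.lo ^= Htable[nlo].lo;
            }

            /* store Z back into Xi big-endian, as the bswap sequence above does */
            for (cnt = 0; cnt < 8; cnt++) {
                    Xi[cnt] = (uint8_t)(Z.hi >> (56 - 8 * cnt));
                    Xi[8 + cnt] = (uint8_t)(Z.lo >> (56 - 8 * cnt));
            }
    }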
409
410&function_begin("gcm_gmult_4bit_mmx");
411 &mov ($inp,&wparam(0)); # load Xi
412 &mov ($Htbl,&wparam(1)); # load Htable
413
414 &picsetup("eax");
415 &picsymbol("eax", &label("rem_4bit"), "eax");
416
417 &movz ($Zll,&BP(15,$inp));
418
419 &call ("_mmx_gmult_4bit_inner");
420
421 &mov ($inp,&wparam(0)); # load Xi
422 &emms ();
423 &mov (&DWP(12,$inp),$Zll);
424 &mov (&DWP(4,$inp),$Zhl);
425 &mov (&DWP(8,$inp),$Zlh);
426 &mov (&DWP(0,$inp),$Zhh);
427&function_end("gcm_gmult_4bit_mmx");
428
429# Streamed version performs 20% better on P4, 7% on Opteron,
430# 10% on Core2 and PIII...
431&function_begin("gcm_ghash_4bit_mmx");
432 &mov ($Zhh,&wparam(0)); # load Xi
433 &mov ($Htbl,&wparam(1)); # load Htable
434 &mov ($inp,&wparam(2)); # load in
435 &mov ($Zlh,&wparam(3)); # load len
436
437 &picsetup("eax");
438 &picsymbol("eax", &label("rem_4bit"), "eax");
439
440 &add ($Zlh,$inp);
441 &mov (&wparam(3),$Zlh); # len to point at the end of input
442 &stack_push(4+1); # +1 for stack alignment
443
444 &mov ($Zll,&DWP(12,$Zhh)); # load Xi[16]
445 &mov ($Zhl,&DWP(4,$Zhh));
446 &mov ($Zlh,&DWP(8,$Zhh));
447 &mov ($Zhh,&DWP(0,$Zhh));
448 &jmp (&label("mmx_outer_loop"));
449
450 &set_label("mmx_outer_loop",16);
451 &xor ($Zll,&DWP(12,$inp));
452 &xor ($Zhl,&DWP(4,$inp));
453 &xor ($Zlh,&DWP(8,$inp));
454 &xor ($Zhh,&DWP(0,$inp));
455 &mov (&wparam(2),$inp);
456 &mov (&DWP(12,"esp"),$Zll);
457 &mov (&DWP(4,"esp"),$Zhl);
458 &mov (&DWP(8,"esp"),$Zlh);
459 &mov (&DWP(0,"esp"),$Zhh);
460
461 &mov ($inp,"esp");
462 &shr ($Zll,24);
463
464 &call ("_mmx_gmult_4bit_inner");
465
466 &mov ($inp,&wparam(2));
467 &lea ($inp,&DWP(16,$inp));
468 &cmp ($inp,&wparam(3));
469 &jb (&label("mmx_outer_loop"));
470
471 &mov ($inp,&wparam(0)); # load Xi
472 &emms ();
473 &mov (&DWP(12,$inp),$Zll);
474 &mov (&DWP(4,$inp),$Zhl);
475 &mov (&DWP(8,$inp),$Zlh);
476 &mov (&DWP(0,$inp),$Zhh);
477
478 &stack_pop(4+1);
479&function_end("gcm_ghash_4bit_mmx");
480
481}} else {{ # "June" MMX version...
482 # ... has slower "April" gcm_gmult_4bit_mmx with folded
483 # loop. This is done to conserve code size...
484$S=16; # shift factor for rem_4bit
485
486sub mmx_loop() {
487# MMX version performs 2.8 times better on P4 (see comment in non-MMX
488# routine for further details), 40% better on Opteron and Core2, 50%
489# better on PIII... In other words effort is considered to be well
490# spent...
491 my $inp = shift;
492 my $rem_4bit = shift;
493 my $cnt = $Zhh;
494 my $nhi = $Zhl;
495 my $nlo = $Zlh;
496 my $rem = $Zll;
497
498 my ($Zlo,$Zhi) = ("mm0","mm1");
499 my $tmp = "mm2";
500
501 &xor ($nlo,$nlo); # avoid partial register stalls on PIII
502 &mov ($nhi,$Zll);
503 &mov (&LB($nlo),&LB($nhi));
504 &mov ($cnt,14);
505 &shl (&LB($nlo),4);
506 &and ($nhi,0xf0);
507 &movq ($Zlo,&QWP(8,$Htbl,$nlo));
508 &movq ($Zhi,&QWP(0,$Htbl,$nlo));
509 &movd ($rem,$Zlo);
510 &jmp (&label("mmx_loop"));
511
512 &set_label("mmx_loop",16);
513 &psrlq ($Zlo,4);
514 &and ($rem,0xf);
515 &movq ($tmp,$Zhi);
516 &psrlq ($Zhi,4);
517 &pxor ($Zlo,&QWP(8,$Htbl,$nhi));
518 &mov (&LB($nlo),&BP(0,$inp,$cnt));
519 &psllq ($tmp,60);
520 &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
521 &dec ($cnt);
522 &movd ($rem,$Zlo);
523 &pxor ($Zhi,&QWP(0,$Htbl,$nhi));
524 &mov ($nhi,$nlo);
525 &pxor ($Zlo,$tmp);
526 &js (&label("mmx_break"));
527
528 &shl (&LB($nlo),4);
529 &and ($rem,0xf);
530 &psrlq ($Zlo,4);
531 &and ($nhi,0xf0);
532 &movq ($tmp,$Zhi);
533 &psrlq ($Zhi,4);
534 &pxor ($Zlo,&QWP(8,$Htbl,$nlo));
535 &psllq ($tmp,60);
536 &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
537 &movd ($rem,$Zlo);
538 &pxor ($Zhi,&QWP(0,$Htbl,$nlo));
539 &pxor ($Zlo,$tmp);
540 &jmp (&label("mmx_loop"));
541
542 &set_label("mmx_break",16);
543 &shl (&LB($nlo),4);
544 &and ($rem,0xf);
545 &psrlq ($Zlo,4);
546 &and ($nhi,0xf0);
547 &movq ($tmp,$Zhi);
548 &psrlq ($Zhi,4);
549 &pxor ($Zlo,&QWP(8,$Htbl,$nlo));
550 &psllq ($tmp,60);
551 &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
552 &movd ($rem,$Zlo);
553 &pxor ($Zhi,&QWP(0,$Htbl,$nlo));
554 &pxor ($Zlo,$tmp);
555
556 &psrlq ($Zlo,4);
557 &and ($rem,0xf);
558 &movq ($tmp,$Zhi);
559 &psrlq ($Zhi,4);
560 &pxor ($Zlo,&QWP(8,$Htbl,$nhi));
561 &psllq ($tmp,60);
562 &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
563 &movd ($rem,$Zlo);
564 &pxor ($Zhi,&QWP(0,$Htbl,$nhi));
565 &pxor ($Zlo,$tmp);
566
567 &psrlq ($Zlo,32); # lower part of Zlo is already there
568 &movd ($Zhl,$Zhi);
569 &psrlq ($Zhi,32);
570 &movd ($Zlh,$Zlo);
571 &movd ($Zhh,$Zhi);
572
573 &bswap ($Zll);
574 &bswap ($Zhl);
575 &bswap ($Zlh);
576 &bswap ($Zhh);
577}
578
579&function_begin("gcm_gmult_4bit_mmx");
580 &mov ($inp,&wparam(0)); # load Xi
581 &mov ($Htbl,&wparam(1)); # load Htable
582
583 &picsetup("eax");
584 &picsymbol("eax", &label("rem_4bit"), "eax");
585
586 &movz ($Zll,&BP(15,$inp));
587
588 &mmx_loop($inp,"eax");
589
590 &emms ();
591 &mov (&DWP(12,$inp),$Zll);
592 &mov (&DWP(4,$inp),$Zhl);
593 &mov (&DWP(8,$inp),$Zlh);
594 &mov (&DWP(0,$inp),$Zhh);
595&function_end("gcm_gmult_4bit_mmx");
596
597######################################################################
598# Below subroutine is "528B" variant of "4-bit" GCM GHASH function
599# (see gcm128.c for details). It provides further 20-40% performance
600# improvement over above mentioned "May" version.
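(The name, as far as it can be inferred from the ghash-x86_64.pl header later in this
diff, counts 256 bytes of base Htable plus an additional 256+16 bytes -- the Htable[]>>4
copy and the (u8)(Htable[]<<4) bytes generated in the routine below -- i.e.
256 + 256 + 16 = 528 bytes, on top of the 512-byte shared rem_8bit table.)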
601
602&static_label("rem_8bit");
603
604&function_begin("gcm_ghash_4bit_mmx");
605{ my ($Zlo,$Zhi) = ("mm7","mm6");
606 my $rem_8bit = "esi";
607 my $Htbl = "ebx";
608
609 # parameter block
610 &mov ("eax",&wparam(0)); # Xi
611 &mov ("ebx",&wparam(1)); # Htable
612 &mov ("ecx",&wparam(2)); # inp
613 &mov ("edx",&wparam(3)); # len
614 &mov ("ebp","esp"); # original %esp
615
616 &picsetup($rem_8bit);
617 &picsymbol($rem_8bit, &label("rem_8bit"), $rem_8bit);
618
619 &sub ("esp",512+16+16); # allocate stack frame...
620 &and ("esp",-64); # ...and align it
621 &sub ("esp",16); # place for (u8)(H[]<<4)
622
623 &add ("edx","ecx"); # pointer to the end of input
624 &mov (&DWP(528+16+0,"esp"),"eax"); # save Xi
625 &mov (&DWP(528+16+8,"esp"),"edx"); # save inp+len
626 &mov (&DWP(528+16+12,"esp"),"ebp"); # save original %esp
627
628 { my @lo = ("mm0","mm1","mm2");
629 my @hi = ("mm3","mm4","mm5");
630 my @tmp = ("mm6","mm7");
631 my ($off1,$off2,$i) = (0,0,);
632
633 &add ($Htbl,128); # optimize for size
634 &lea ("edi",&DWP(16+128,"esp"));
635 &lea ("ebp",&DWP(16+256+128,"esp"));
636
637 # decompose Htable (low and high parts are kept separately),
638 # generate Htable[]>>4, (u8)(Htable[]<<4), save to stack...
639 for ($i=0;$i<18;$i++) {
640
641 &mov ("edx",&DWP(16*$i+8-128,$Htbl)) if ($i<16);
642 &movq ($lo[0],&QWP(16*$i+8-128,$Htbl)) if ($i<16);
643 &psllq ($tmp[1],60) if ($i>1);
644 &movq ($hi[0],&QWP(16*$i+0-128,$Htbl)) if ($i<16);
645 &por ($lo[2],$tmp[1]) if ($i>1);
646 &movq (&QWP($off1-128,"edi"),$lo[1]) if ($i>0 && $i<17);
647 &psrlq ($lo[1],4) if ($i>0 && $i<17);
648 &movq (&QWP($off1,"edi"),$hi[1]) if ($i>0 && $i<17);
649 &movq ($tmp[0],$hi[1]) if ($i>0 && $i<17);
650 &movq (&QWP($off2-128,"ebp"),$lo[2]) if ($i>1);
651 &psrlq ($hi[1],4) if ($i>0 && $i<17);
652 &movq (&QWP($off2,"ebp"),$hi[2]) if ($i>1);
653 &shl ("edx",4) if ($i<16);
654 &mov (&BP($i,"esp"),&LB("edx")) if ($i<16);
655
656 unshift (@lo,pop(@lo)); # "rotate" registers
657 unshift (@hi,pop(@hi));
658 unshift (@tmp,pop(@tmp));
659 $off1 += 8 if ($i>0);
660 $off2 += 8 if ($i>1);
661 }
662 }
663
664 &movq ($Zhi,&QWP(0,"eax"));
665 &mov ("ebx",&DWP(8,"eax"));
666 &mov ("edx",&DWP(12,"eax")); # load Xi
667
668&set_label("outer",16);
669 { my $nlo = "eax";
670 my $dat = "edx";
671 my @nhi = ("edi","ebp");
672 my @rem = ("ebx","ecx");
673 my @red = ("mm0","mm1","mm2");
674 my $tmp = "mm3";
675
676 &xor ($dat,&DWP(12,"ecx")); # merge input data
677 &xor ("ebx",&DWP(8,"ecx"));
678 &pxor ($Zhi,&QWP(0,"ecx"));
679 &lea ("ecx",&DWP(16,"ecx")); # inp+=16
680 #&mov (&DWP(528+12,"esp"),$dat); # save inp^Xi
681 &mov (&DWP(528+8,"esp"),"ebx");
682 &movq (&QWP(528+0,"esp"),$Zhi);
683 &mov (&DWP(528+16+4,"esp"),"ecx"); # save inp
684
685 &xor ($nlo,$nlo);
686 &rol ($dat,8);
687 &mov (&LB($nlo),&LB($dat));
688 &mov ($nhi[1],$nlo);
689 &and (&LB($nlo),0x0f);
690 &shr ($nhi[1],4);
691 &pxor ($red[0],$red[0]);
692 &rol ($dat,8); # next byte
693 &pxor ($red[1],$red[1]);
694 &pxor ($red[2],$red[2]);
695
696 # Just like in "May" version modulo-schedule for critical path in
697 # 'Z.hi ^= rem_8bit[Z.lo&0xff^((u8)H[nhi]<<4)]<<48'. Final 'pxor'
698 # is scheduled so late that rem_8bit[] has to be shifted *right*
699 # by 16, which is why last argument to pinsrw is 2, which
700 # corresponds to <<32=<<48>>16...
701 for ($j=11,$i=0;$i<15;$i++) {
702
703 if ($i>0) {
704 &pxor ($Zlo,&QWP(16,"esp",$nlo,8)); # Z^=H[nlo]
705 &rol ($dat,8); # next byte
706 &pxor ($Zhi,&QWP(16+128,"esp",$nlo,8));
707
708 &pxor ($Zlo,$tmp);
709 &pxor ($Zhi,&QWP(16+256+128,"esp",$nhi[0],8));
710 &xor (&LB($rem[1]),&BP(0,"esp",$nhi[0])); # rem^(H[nhi]<<4)
711 } else {
712 &movq ($Zlo,&QWP(16,"esp",$nlo,8));
713 &movq ($Zhi,&QWP(16+128,"esp",$nlo,8));
714 }
715
716 &mov (&LB($nlo),&LB($dat));
717 &mov ($dat,&DWP(528+$j,"esp")) if (--$j%4==0 && $j>=0);
718
719 &movd ($rem[0],$Zlo);
720 &movz ($rem[1],&LB($rem[1])) if ($i>0);
721 &psrlq ($Zlo,8); # Z>>=8
722
723 &movq ($tmp,$Zhi);
724 &mov ($nhi[0],$nlo);
725 &psrlq ($Zhi,8);
726
727 &pxor ($Zlo,&QWP(16+256+0,"esp",$nhi[1],8)); # Z^=H[nhi]>>4
728 &and (&LB($nlo),0x0f);
729 &psllq ($tmp,56);
730
731 &pxor ($Zhi,$red[1]) if ($i>1);
732 &shr ($nhi[0],4);
733 &pinsrw ($red[0],&WP(0,$rem_8bit,$rem[1],2),2) if ($i>0);
734
735 unshift (@red,pop(@red)); # "rotate" registers
736 unshift (@rem,pop(@rem));
737 unshift (@nhi,pop(@nhi));
738 }
739
740 &pxor ($Zlo,&QWP(16,"esp",$nlo,8)); # Z^=H[nlo]
741 &pxor ($Zhi,&QWP(16+128,"esp",$nlo,8));
742 &xor (&LB($rem[1]),&BP(0,"esp",$nhi[0])); # rem^(H[nhi]<<4)
743
744 &pxor ($Zlo,$tmp);
745 &pxor ($Zhi,&QWP(16+256+128,"esp",$nhi[0],8));
746 &movz ($rem[1],&LB($rem[1]));
747
748 &pxor ($red[2],$red[2]); # clear 2nd word
749 &psllq ($red[1],4);
750
751 &movd ($rem[0],$Zlo);
752 &psrlq ($Zlo,4); # Z>>=4
753
754 &movq ($tmp,$Zhi);
755 &psrlq ($Zhi,4);
756 &shl ($rem[0],4); # rem<<4
757
758 &pxor ($Zlo,&QWP(16,"esp",$nhi[1],8)); # Z^=H[nhi]
759 &psllq ($tmp,60);
760 &movz ($rem[0],&LB($rem[0]));
761
762 &pxor ($Zlo,$tmp);
763 &pxor ($Zhi,&QWP(16+128,"esp",$nhi[1],8));
764
765 &pinsrw ($red[0],&WP(0,$rem_8bit,$rem[1],2),2);
766 &pxor ($Zhi,$red[1]);
767
768 &movd ($dat,$Zlo);
769 &pinsrw ($red[2],&WP(0,$rem_8bit,$rem[0],2),3); # last is <<48
770
771 &psllq ($red[0],12); # correct by <<16>>4
772 &pxor ($Zhi,$red[0]);
773 &psrlq ($Zlo,32);
774 &pxor ($Zhi,$red[2]);
775
776 &mov ("ecx",&DWP(528+16+4,"esp")); # restore inp
777 &movd ("ebx",$Zlo);
778 &movq ($tmp,$Zhi); # 01234567
779 &psllw ($Zhi,8); # 1.3.5.7.
780 &psrlw ($tmp,8); # .0.2.4.6
781 &por ($Zhi,$tmp); # 10325476
782 &bswap ($dat);
783 &pshufw ($Zhi,$Zhi,0b00011011); # 76543210
784 &bswap ("ebx");
785
786 &cmp ("ecx",&DWP(528+16+8,"esp")); # are we done?
787 &jne (&label("outer"));
788 }
789
790 &mov ("eax",&DWP(528+16+0,"esp")); # restore Xi
791 &mov (&DWP(12,"eax"),"edx");
792 &mov (&DWP(8,"eax"),"ebx");
793 &movq (&QWP(0,"eax"),$Zhi);
794
795 &mov ("esp",&DWP(528+16+12,"esp")); # restore original %esp
796 &emms ();
797}
798&function_end("gcm_ghash_4bit_mmx");
799}}
800
801if ($sse2) {{
802######################################################################
803# PCLMULQDQ version.
804
805$Xip="eax";
806$Htbl="edx";
807$const="ecx";
808$inp="esi";
809$len="ebx";
810
811($Xi,$Xhi)=("xmm0","xmm1"); $Hkey="xmm2";
812($T1,$T2,$T3)=("xmm3","xmm4","xmm5");
813($Xn,$Xhn)=("xmm6","xmm7");
814
815&static_label("bswap");
816
817sub clmul64x64_T2 { # minimal "register" pressure
818my ($Xhi,$Xi,$Hkey)=@_;
819
820 &movdqa ($Xhi,$Xi); #
821 &pshufd ($T1,$Xi,0b01001110);
822 &pshufd ($T2,$Hkey,0b01001110);
823 &pxor ($T1,$Xi); #
824 &pxor ($T2,$Hkey);
825
826 &pclmulqdq ($Xi,$Hkey,0x00); #######
827 &pclmulqdq ($Xhi,$Hkey,0x11); #######
828 &pclmulqdq ($T1,$T2,0x00); #######
829 &xorps ($T1,$Xi); #
830 &xorps ($T1,$Xhi); #
831
832 &movdqa ($T2,$T1); #
833 &psrldq ($T1,8);
834 &pslldq ($T2,8); #
835 &pxor ($Xhi,$T1);
836 &pxor ($Xi,$T2); #
837}
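The three pclmulqdq's above are Karatsuba over GF(2): writing X = X_h*2^64 xor X_l and
H = H_h*2^64 xor H_l, the full 128x128-bit carry-less product needs only three 64x64-bit
multiplications because the middle term can be recovered from the outer two,

    X \otimes H = (X_h \otimes H_h)\,2^{128}
                  \oplus \left[ (X_h \oplus X_l)\otimes(H_h \oplus H_l)
                          \oplus X_h \otimes H_h \oplus X_l \otimes H_l \right] 2^{64}
                  \oplus (X_l \otimes H_l)

In the code, pclmulqdq 0x00 and 0x11 produce X_l*H_l (in $Xi) and X_h*H_h (in $Xhi), the
third pclmulqdq produces (X_h^X_l)*(H_h^H_l), the two xorps recover the middle term, and
the psrldq/pslldq pair splits it across the $Xi:$Xhi register pair.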
838
839sub clmul64x64_T3 {
840# Even though this subroutine offers visually better ILP, it
841# was empirically found to be a tad slower than above version.
842# At least in gcm_ghash_clmul context. But it's just as well,
843# because loop modulo-scheduling is possible only thanks to
844# minimized "register" pressure...
845my ($Xhi,$Xi,$Hkey)=@_;
846
847 &movdqa ($T1,$Xi); #
848 &movdqa ($Xhi,$Xi);
849 &pclmulqdq ($Xi,$Hkey,0x00); #######
850 &pclmulqdq ($Xhi,$Hkey,0x11); #######
851 &pshufd ($T2,$T1,0b01001110); #
852 &pshufd ($T3,$Hkey,0b01001110);
853 &pxor ($T2,$T1); #
854 &pxor ($T3,$Hkey);
855 &pclmulqdq ($T2,$T3,0x00); #######
856 &pxor ($T2,$Xi); #
857 &pxor ($T2,$Xhi); #
858
859 &movdqa ($T3,$T2); #
860 &psrldq ($T2,8);
861 &pslldq ($T3,8); #
862 &pxor ($Xhi,$T2);
863 &pxor ($Xi,$T3); #
864}
865
866if (1) { # Algorithm 9 with <<1 twist.
867 # Reduction is shorter and uses only two
868 # temporary registers, which makes it better
869 # candidate for interleaving with 64x64
870 # multiplication. Pre-modulo-scheduled loop
871 # was found to be ~20% faster than Algorithm 5
872 # below. Algorithm 9 was therefore chosen for
873 # further optimization...
874
875sub reduction_alg9 { # 17/13 times faster than Intel version
876my ($Xhi,$Xi) = @_;
877
878 # 1st phase
879 &movdqa ($T1,$Xi); #
880 &psllq ($Xi,1);
881 &pxor ($Xi,$T1); #
882 &psllq ($Xi,5); #
883 &pxor ($Xi,$T1); #
884 &psllq ($Xi,57); #
885 &movdqa ($T2,$Xi); #
886 &pslldq ($Xi,8);
887 &psrldq ($T2,8); #
888 &pxor ($Xi,$T1);
889 &pxor ($Xhi,$T2); #
890
891 # 2nd phase
892 &movdqa ($T2,$Xi);
893 &psrlq ($Xi,5);
894 &pxor ($Xi,$T2); #
895 &psrlq ($Xi,1); #
896 &pxor ($Xi,$T2); #
897 &pxor ($T2,$Xhi);
898 &psrlq ($Xi,1); #
899 &pxor ($Xi,$T2); #
900}
901
902&function_begin_B("gcm_init_clmul");
903 &mov ($Htbl,&wparam(0));
904 &mov ($Xip,&wparam(1));
905
906 &picsetup($const);
907 &picsymbol($const, &label("bswap"), $const);
908
909 &movdqu ($Hkey,&QWP(0,$Xip));
910 &pshufd ($Hkey,$Hkey,0b01001110);# dword swap
911
912 # <<1 twist (see the note after this function)
913 &pshufd ($T2,$Hkey,0b11111111); # broadcast uppermost dword
914 &movdqa ($T1,$Hkey);
915 &psllq ($Hkey,1);
916 &pxor ($T3,$T3); #
917 &psrlq ($T1,63);
918 &pcmpgtd ($T3,$T2); # broadcast carry bit
919 &pslldq ($T1,8);
920 &por ($Hkey,$T1); # H<<=1
921
922 # magic reduction
923 &pand ($T3,&QWP(16,$const)); # 0x1c2_polynomial
924 &pxor ($Hkey,$T3); # if(carry) H^=0x1c2_polynomial
925
926 # calculate H^2
927 &movdqa ($Xi,$Hkey);
928 &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
929 &reduction_alg9 ($Xhi,$Xi);
930
931 &movdqu (&QWP(0,$Htbl),$Hkey); # save H
932 &movdqu (&QWP(16,$Htbl),$Xi); # save H^2
933
934 &ret ();
935&function_end_B("gcm_init_clmul");
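A note on the "<<1 twist" above, sketching the usual rationale rather than quoting this
file: GHASH operands are bit-reflected, while pclmulqdq computes a plain carry-less
product, and for n-bit values

    rev_n(a) \otimes rev_n(b) = rev_{2n-1}(a \otimes b)

so the product of two reflected operands lands one bit short of the reflected 2n-bit
result. Rather than shifting every 255-bit product left by one before reduction, H is
multiplied by x once at key setup: psllq \$1 plus the cross-lane carry, with the bit
shifted out of the top folded back in through the 0x1c2_polynomial mask (the "magic
reduction"). reduction_alg9() is written for this twisted representation, hence the
"Algorithm 9 with <<1 twist" comment above.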
936
937&function_begin_B("gcm_gmult_clmul");
938 &mov ($Xip,&wparam(0));
939 &mov ($Htbl,&wparam(1));
940
941 &picsetup($const);
942 &picsymbol($const, &label("bswap"), $const);
943
944 &movdqu ($Xi,&QWP(0,$Xip));
945 &movdqa ($T3,&QWP(0,$const));
946 &movups ($Hkey,&QWP(0,$Htbl));
947 &pshufb ($Xi,$T3);
948
949 &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
950 &reduction_alg9 ($Xhi,$Xi);
951
952 &pshufb ($Xi,$T3);
953 &movdqu (&QWP(0,$Xip),$Xi);
954
955 &ret ();
956&function_end_B("gcm_gmult_clmul");
957
958&function_begin("gcm_ghash_clmul");
959 &mov ($Xip,&wparam(0));
960 &mov ($Htbl,&wparam(1));
961 &mov ($inp,&wparam(2));
962 &mov ($len,&wparam(3));
963
964 &picsetup($const);
965 &picsymbol($const, &label("bswap"), $const);
966
967 &movdqu ($Xi,&QWP(0,$Xip));
968 &movdqa ($T3,&QWP(0,$const));
969 &movdqu ($Hkey,&QWP(0,$Htbl));
970 &pshufb ($Xi,$T3);
971
972 &sub ($len,0x10);
973 &jz (&label("odd_tail"));
974
975 #######
976 # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
977 # [(H*Ii+1) + (H*Xi+1)] mod P =
978 # [(H*Ii+1) + H^2*(Ii+Xi)] mod P
979 #
980 &movdqu ($T1,&QWP(0,$inp)); # Ii
981 &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
982 &pshufb ($T1,$T3);
983 &pshufb ($Xn,$T3);
984 &pxor ($Xi,$T1); # Ii+Xi
985
986 &clmul64x64_T2 ($Xhn,$Xn,$Hkey); # H*Ii+1
987 &movups ($Hkey,&QWP(16,$Htbl)); # load H^2
988
989 &lea ($inp,&DWP(32,$inp)); # i+=2
990 &sub ($len,0x20);
991 &jbe (&label("even_tail"));
992
993&set_label("mod_loop");
994 &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
995 &movdqu ($T1,&QWP(0,$inp)); # Ii
996 &movups ($Hkey,&QWP(0,$Htbl)); # load H
997
998 &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
999 &pxor ($Xhi,$Xhn);
1000
1001 &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
1002 &pshufb ($T1,$T3);
1003 &pshufb ($Xn,$T3);
1004
1005 &movdqa ($T3,$Xn); #&clmul64x64_TX ($Xhn,$Xn,$Hkey); H*Ii+1
1006 &movdqa ($Xhn,$Xn);
1007 &pxor ($Xhi,$T1); # "Ii+Xi", consume early
1008
1009 &movdqa ($T1,$Xi); #&reduction_alg9($Xhi,$Xi); 1st phase
1010 &psllq ($Xi,1);
1011 &pxor ($Xi,$T1); #
1012 &psllq ($Xi,5); #
1013 &pxor ($Xi,$T1); #
1014 &pclmulqdq ($Xn,$Hkey,0x00); #######
1015 &psllq ($Xi,57); #
1016 &movdqa ($T2,$Xi); #
1017 &pslldq ($Xi,8);
1018 &psrldq ($T2,8); #
1019 &pxor ($Xi,$T1);
1020 &pshufd ($T1,$T3,0b01001110);
1021 &pxor ($Xhi,$T2); #
1022 &pxor ($T1,$T3);
1023 &pshufd ($T3,$Hkey,0b01001110);
1024 &pxor ($T3,$Hkey); #
1025
1026 &pclmulqdq ($Xhn,$Hkey,0x11); #######
1027 &movdqa ($T2,$Xi); # 2nd phase
1028 &psrlq ($Xi,5);
1029 &pxor ($Xi,$T2); #
1030 &psrlq ($Xi,1); #
1031 &pxor ($Xi,$T2); #
1032 &pxor ($T2,$Xhi);
1033 &psrlq ($Xi,1); #
1034 &pxor ($Xi,$T2); #
1035
1036 &pclmulqdq ($T1,$T3,0x00); #######
1037 &movups ($Hkey,&QWP(16,$Htbl)); # load H^2
1038 &xorps ($T1,$Xn); #
1039 &xorps ($T1,$Xhn); #
1040
1041 &movdqa ($T3,$T1); #
1042 &psrldq ($T1,8);
1043 &pslldq ($T3,8); #
1044 &pxor ($Xhn,$T1);
1045 &pxor ($Xn,$T3); #
1046 &movdqa ($T3,&QWP(0,$const));
1047
1048 &lea ($inp,&DWP(32,$inp));
1049 &sub ($len,0x20);
1050 &ja (&label("mod_loop"));
1051
1052&set_label("even_tail");
1053 &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
1054
1055 &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
1056 &pxor ($Xhi,$Xhn);
1057
1058 &reduction_alg9 ($Xhi,$Xi);
1059
1060 &test ($len,$len);
1061 &jnz (&label("done"));
1062
1063 &movups ($Hkey,&QWP(0,$Htbl)); # load H
1064&set_label("odd_tail");
1065 &movdqu ($T1,&QWP(0,$inp)); # Ii
1066 &pshufb ($T1,$T3);
1067 &pxor ($Xi,$T1); # Ii+Xi
1068
1069 &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
1070 &reduction_alg9 ($Xhi,$Xi);
1071
1072&set_label("done");
1073 &pshufb ($Xi,$T3);
1074 &movdqu (&QWP(0,$Xip),$Xi);
1075&function_end("gcm_ghash_clmul");
1076
1077} else { # Algorithm 5. Kept for reference purposes.
1078
1079sub reduction_alg5 { # 19/16 times faster than Intel version
1080my ($Xhi,$Xi)=@_;
1081
1082 # <<1
1083 &movdqa ($T1,$Xi); #
1084 &movdqa ($T2,$Xhi);
1085 &pslld ($Xi,1);
1086 &pslld ($Xhi,1); #
1087 &psrld ($T1,31);
1088 &psrld ($T2,31); #
1089 &movdqa ($T3,$T1);
1090 &pslldq ($T1,4);
1091 &psrldq ($T3,12); #
1092 &pslldq ($T2,4);
1093 &por ($Xhi,$T3); #
1094 &por ($Xi,$T1);
1095 &por ($Xhi,$T2); #
1096
1097 # 1st phase
1098 &movdqa ($T1,$Xi);
1099 &movdqa ($T2,$Xi);
1100 &movdqa ($T3,$Xi); #
1101 &pslld ($T1,31);
1102 &pslld ($T2,30);
1103 &pslld ($Xi,25); #
1104 &pxor ($T1,$T2);
1105 &pxor ($T1,$Xi); #
1106 &movdqa ($T2,$T1); #
1107 &pslldq ($T1,12);
1108 &psrldq ($T2,4); #
1109 &pxor ($T3,$T1);
1110
1111 # 2nd phase
1112 &pxor ($Xhi,$T3); #
1113 &movdqa ($Xi,$T3);
1114 &movdqa ($T1,$T3);
1115 &psrld ($Xi,1); #
1116 &psrld ($T1,2);
1117 &psrld ($T3,7); #
1118 &pxor ($Xi,$T1);
1119 &pxor ($Xhi,$T2);
1120 &pxor ($Xi,$T3); #
1121 &pxor ($Xi,$Xhi); #
1122}
1123
1124&function_begin_B("gcm_init_clmul");
1125 &mov ($Htbl,&wparam(0));
1126 &mov ($Xip,&wparam(1));
1127
1128 &picsetup($const);
1129 &picsymbol($const, &label("bswap"), $const);
1130
1131 &movdqu ($Hkey,&QWP(0,$Xip));
1132 &pshufd ($Hkey,$Hkey,0b01001110);# dword swap
1133
1134 # calculate H^2
1135 &movdqa ($Xi,$Hkey);
1136 &clmul64x64_T3 ($Xhi,$Xi,$Hkey);
1137 &reduction_alg5 ($Xhi,$Xi);
1138
1139 &movdqu (&QWP(0,$Htbl),$Hkey); # save H
1140 &movdqu (&QWP(16,$Htbl),$Xi); # save H^2
1141
1142 &ret ();
1143&function_end_B("gcm_init_clmul");
1144
1145&function_begin_B("gcm_gmult_clmul");
1146 &mov ($Xip,&wparam(0));
1147 &mov ($Htbl,&wparam(1));
1148
1149 &picsetup($const);
1150 &picsymbol($const, &label("bswap"), $const);
1151
1152 &movdqu ($Xi,&QWP(0,$Xip));
1153 &movdqa ($Xn,&QWP(0,$const));
1154 &movdqu ($Hkey,&QWP(0,$Htbl));
1155 &pshufb ($Xi,$Xn);
1156
1157 &clmul64x64_T3 ($Xhi,$Xi,$Hkey);
1158 &reduction_alg5 ($Xhi,$Xi);
1159
1160 &pshufb ($Xi,$Xn);
1161 &movdqu (&QWP(0,$Xip),$Xi);
1162
1163 &ret ();
1164&function_end_B("gcm_gmult_clmul");
1165
1166&function_begin("gcm_ghash_clmul");
1167 &mov ($Xip,&wparam(0));
1168 &mov ($Htbl,&wparam(1));
1169 &mov ($inp,&wparam(2));
1170 &mov ($len,&wparam(3));
1171
1172 &picsetup($const);
1173 &picsymbol($const, &label("bswap"), $const);
1174
1175 &movdqu ($Xi,&QWP(0,$Xip));
1176 &movdqa ($T3,&QWP(0,$const));
1177 &movdqu ($Hkey,&QWP(0,$Htbl));
1178 &pshufb ($Xi,$T3);
1179
1180 &sub ($len,0x10);
1181 &jz (&label("odd_tail"));
1182
1183 #######
1184 # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
1185 # [(H*Ii+1) + (H*Xi+1)] mod P =
1186 # [(H*Ii+1) + H^2*(Ii+Xi)] mod P
1187 #
1188 &movdqu ($T1,&QWP(0,$inp)); # Ii
1189 &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
1190 &pshufb ($T1,$T3);
1191 &pshufb ($Xn,$T3);
1192 &pxor ($Xi,$T1); # Ii+Xi
1193
1194 &clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1
1195 &movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2
1196
1197 &sub ($len,0x20);
1198 &lea ($inp,&DWP(32,$inp)); # i+=2
1199 &jbe (&label("even_tail"));
1200
1201&set_label("mod_loop");
1202 &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
1203 &movdqu ($Hkey,&QWP(0,$Htbl)); # load H
1204
1205 &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
1206 &pxor ($Xhi,$Xhn);
1207
1208 &reduction_alg5 ($Xhi,$Xi);
1209
1210 #######
1211 &movdqa ($T3,&QWP(0,$const));
1212 &movdqu ($T1,&QWP(0,$inp)); # Ii
1213 &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
1214 &pshufb ($T1,$T3);
1215 &pshufb ($Xn,$T3);
1216 &pxor ($Xi,$T1); # Ii+Xi
1217
1218 &clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1
1219 &movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2
1220
1221 &sub ($len,0x20);
1222 &lea ($inp,&DWP(32,$inp));
1223 &ja (&label("mod_loop"));
1224
1225&set_label("even_tail");
1226 &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
1227
1228 &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
1229 &pxor ($Xhi,$Xhn);
1230
1231 &reduction_alg5 ($Xhi,$Xi);
1232
1233 &movdqa ($T3,&QWP(0,$const));
1234 &test ($len,$len);
1235 &jnz (&label("done"));
1236
1237 &movdqu ($Hkey,&QWP(0,$Htbl)); # load H
1238&set_label("odd_tail");
1239 &movdqu ($T1,&QWP(0,$inp)); # Ii
1240 &pshufb ($T1,$T3);
1241 &pxor ($Xi,$T1); # Ii+Xi
1242
1243 &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
1244 &reduction_alg5 ($Xhi,$Xi);
1245
1246 &movdqa ($T3,&QWP(0,$const));
1247&set_label("done");
1248 &pshufb ($Xi,$T3);
1249 &movdqu (&QWP(0,$Xip),$Xi);
1250&function_end("gcm_ghash_clmul");
1251
1252}
1253
1254 &rodataseg();
1255&set_label("bswap",64);
1256 &data_byte(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
1257 &data_byte(1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2); # 0x1c2_polynomial
1258 &previous();
1259}} # $sse2
1260
1261 &rodataseg();
1262&set_label("rem_4bit",64);
1263 &data_word(0,0x0000<<$S,0,0x1C20<<$S,0,0x3840<<$S,0,0x2460<<$S);
1264 &data_word(0,0x7080<<$S,0,0x6CA0<<$S,0,0x48C0<<$S,0,0x54E0<<$S);
1265 &data_word(0,0xE100<<$S,0,0xFD20<<$S,0,0xD940<<$S,0,0xC560<<$S);
1266 &data_word(0,0x9180<<$S,0,0x8DA0<<$S,0,0xA9C0<<$S,0,0xB5E0<<$S);
1267&set_label("rem_8bit",64);
1268 &data_short(0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E);
1269 &data_short(0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E);
1270 &data_short(0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E);
1271 &data_short(0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E);
1272 &data_short(0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E);
1273 &data_short(0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E);
1274 &data_short(0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E);
1275 &data_short(0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E);
1276 &data_short(0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE);
1277 &data_short(0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE);
1278 &data_short(0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE);
1279 &data_short(0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE);
1280 &data_short(0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E);
1281 &data_short(0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E);
1282 &data_short(0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE);
1283 &data_short(0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE);
1284 &data_short(0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E);
1285 &data_short(0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E);
1286 &data_short(0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E);
1287 &data_short(0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E);
1288 &data_short(0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E);
1289 &data_short(0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E);
1290 &data_short(0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E);
1291 &data_short(0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E);
1292 &data_short(0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE);
1293 &data_short(0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE);
1294 &data_short(0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE);
1295 &data_short(0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE);
1296 &data_short(0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E);
1297 &data_short(0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E);
1298 &data_short(0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE);
1299 &data_short(0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE);
1300 &previous();
1301}}} # !$x86only
1302
1303&asm_finish();
1304
1305# A question was raised about the choice of vanilla MMX. Or rather, why wasn't
1306# SSE2 chosen instead? In addition to the fact that MMX runs on legacy
1307# CPUs such as PIII, "4-bit" MMX version was observed to provide better
1308# performance than *corresponding* SSE2 one even on contemporary CPUs.
1309# SSE2 results were provided by Peter-Michael Hager. He maintains SSE2
1310# implementation featuring full range of lookup-table sizes, but with
1311# per-invocation lookup table setup. The latter means that the table size
1312# is chosen depending on how much data is to be hashed in each call:
1313# more data, larger table. Best reported result for Core2 is ~4 cycles
1314# per processed byte out of 64KB block. This number accounts even for
1315# 64KB table setup overhead. As discussed in gcm128.c we choose to be
1316# more conservative in respect to lookup table sizes, but how do the
1317# results compare? Minimalistic "256B" MMX version delivers ~11 cycles
1318# on same platform. As also discussed in gcm128.c, next in line "8-bit
1319# Shoup's" or "4KB" method should deliver twice the performance of
1320# "256B" one, in other words not worse than ~6 cycles per byte. It
1321# should also be noted that in the SSE2 case the improvement can be "super-
1322# linear," i.e. more than twice, mostly because >>8 maps to a single
1323# instruction on an SSE2 register. This is unlike the "4-bit" case, where >>4
1324# maps to the same number of instructions in both MMX and SSE2 cases.
1325# Bottom line is that switch to SSE2 is considered to be justifiable
1326# only in case we choose to implement "8-bit" method...
diff --git a/src/lib/libcrypto/modes/asm/ghash-x86_64.pl b/src/lib/libcrypto/modes/asm/ghash-x86_64.pl
deleted file mode 100644
index bf547a041b..0000000000
--- a/src/lib/libcrypto/modes/asm/ghash-x86_64.pl
+++ /dev/null
@@ -1,812 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# March, June 2010
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that
14# it uses 256 bytes per-key table [+128 bytes shared table]. GHASH
15# function features so called "528B" variant utilizing additional
16# 256+16 bytes of per-key storage [+512 bytes shared table].
17# Performance results are for this streamed GHASH subroutine and are
18# expressed in cycles per processed byte, less is better:
19#
20# gcc 3.4.x(*) assembler
21#
22# P4 28.6 14.0 +100%
23# Opteron 19.3 7.7 +150%
24# Core2 17.8 8.1(**) +120%
25#
26# (*) comparison is not completely fair, because C results are
27# for vanilla "256B" implementation, while assembler results
28# are for "528B";-)
29# (**) it's a mystery [to me] why the Core2 result is not the same as for
30# Opteron;
31
32# May 2010
33#
34# Add PCLMULQDQ version performing at 2.02 cycles per processed byte.
35# See ghash-x86.pl for background information and details about coding
36# techniques.
37#
38# Special thanks to David Woodhouse <dwmw2@infradead.org> for
39# providing access to a Westmere-based system on behalf of Intel
40# Open Source Technology Centre.
41
42$flavour = shift;
43$output = shift;
44if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
45
46$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
47
48$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
49( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
50( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
51die "can't locate x86_64-xlate.pl";
52
53open OUT,"| \"$^X\" $xlate $flavour $output";
54*STDOUT=*OUT;
55
56# common register layout
57$nlo="%rax";
58$nhi="%rbx";
59$Zlo="%r8";
60$Zhi="%r9";
61$tmp="%r10";
62$rem_4bit = "%r11";
63
64$Xi="%rdi";
65$Htbl="%rsi";
66
67# per-function register layout
68$cnt="%rcx";
69$rem="%rdx";
70
71sub LB() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/ or
72 $r =~ s/%[er]([sd]i)/%\1l/ or
73 $r =~ s/%[er](bp)/%\1l/ or
74 $r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; }
75
76sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
77{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
78 my $arg = pop;
79 $arg = "\$$arg" if ($arg*1 eq $arg);
80 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
81}
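For readers unfamiliar with the perlasm idiom used in this file: any &opcode(...) call
without an explicit subroutine falls through to the AUTOLOAD thunk above, which appends
one instruction line to $code, reversing the operands into AT&T source,destination order
and prefixing a numeric last argument with '$'. Two examples taken from further down in
this file (shown after register interpolation, so illustrative rather than byte-exact):

    &sub  ($Htbl,-128);               # appends "sub  $-128,%rsi"
    &xor  ($Zlo,"8($Htbl,$nlo)");     # appends "xor  8(%rsi,%rax),%r8"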
82
83{ my $N;
84 sub loop() {
85 my $inp = shift;
86
87 $N++;
88$code.=<<___;
89 xor $nlo,$nlo
90 xor $nhi,$nhi
91 mov `&LB("$Zlo")`,`&LB("$nlo")`
92 mov `&LB("$Zlo")`,`&LB("$nhi")`
93 shl \$4,`&LB("$nlo")`
94 mov \$14,$cnt
95 mov 8($Htbl,$nlo),$Zlo
96 mov ($Htbl,$nlo),$Zhi
97 and \$0xf0,`&LB("$nhi")`
98 mov $Zlo,$rem
99 jmp .Loop$N
100
101.align 16
102.Loop$N:
103 shr \$4,$Zlo
104 and \$0xf,$rem
105 mov $Zhi,$tmp
106 mov ($inp,$cnt),`&LB("$nlo")`
107 shr \$4,$Zhi
108 xor 8($Htbl,$nhi),$Zlo
109 shl \$60,$tmp
110 xor ($Htbl,$nhi),$Zhi
111 mov `&LB("$nlo")`,`&LB("$nhi")`
112 xor ($rem_4bit,$rem,8),$Zhi
113 mov $Zlo,$rem
114 shl \$4,`&LB("$nlo")`
115 xor $tmp,$Zlo
116 dec $cnt
117 js .Lbreak$N
118
119 shr \$4,$Zlo
120 and \$0xf,$rem
121 mov $Zhi,$tmp
122 shr \$4,$Zhi
123 xor 8($Htbl,$nlo),$Zlo
124 shl \$60,$tmp
125 xor ($Htbl,$nlo),$Zhi
126 and \$0xf0,`&LB("$nhi")`
127 xor ($rem_4bit,$rem,8),$Zhi
128 mov $Zlo,$rem
129 xor $tmp,$Zlo
130 jmp .Loop$N
131
132.align 16
133.Lbreak$N:
134 shr \$4,$Zlo
135 and \$0xf,$rem
136 mov $Zhi,$tmp
137 shr \$4,$Zhi
138 xor 8($Htbl,$nlo),$Zlo
139 shl \$60,$tmp
140 xor ($Htbl,$nlo),$Zhi
141 and \$0xf0,`&LB("$nhi")`
142 xor ($rem_4bit,$rem,8),$Zhi
143 mov $Zlo,$rem
144 xor $tmp,$Zlo
145
146 shr \$4,$Zlo
147 and \$0xf,$rem
148 mov $Zhi,$tmp
149 shr \$4,$Zhi
150 xor 8($Htbl,$nhi),$Zlo
151 shl \$60,$tmp
152 xor ($Htbl,$nhi),$Zhi
153 xor $tmp,$Zlo
154 xor ($rem_4bit,$rem,8),$Zhi
155
156 bswap $Zlo
157 bswap $Zhi
158___
159}}
160
161$code=<<___;
162.text
163
164.globl gcm_gmult_4bit
165.type gcm_gmult_4bit,\@function,2
166.align 16
167gcm_gmult_4bit:
168 _CET_ENDBR
169 push %rbx
170 push %rbp # %rbp and %r12 are pushed exclusively in
171 push %r12 # order to reuse Win64 exception handler...
172.Lgmult_prologue:
173
174 movzb 15($Xi),$Zlo
175 lea .Lrem_4bit(%rip),$rem_4bit
176___
177 &loop ($Xi);
178$code.=<<___;
179 mov $Zlo,8($Xi)
180 mov $Zhi,($Xi)
181
182 mov 16(%rsp),%rbx
183 lea 24(%rsp),%rsp
184.Lgmult_epilogue:
185 ret
186.size gcm_gmult_4bit,.-gcm_gmult_4bit
187___
188
189# per-function register layout
190$inp="%rdx";
191$len="%rcx";
192$rem_8bit=$rem_4bit;
193
194$code.=<<___;
195.globl gcm_ghash_4bit
196.type gcm_ghash_4bit,\@function,4
197.align 16
198gcm_ghash_4bit:
199 _CET_ENDBR
200 push %rbx
201 push %rbp
202 push %r12
203 push %r13
204 push %r14
205 push %r15
206 sub \$280,%rsp
207.Lghash_prologue:
208 mov $inp,%r14 # reassign couple of args
209 mov $len,%r15
210___
211{ my $inp="%r14";
212 my $dat="%edx";
213 my $len="%r15";
214 my @nhi=("%ebx","%ecx");
215 my @rem=("%r12","%r13");
216 my $Hshr4="%rbp";
217
218 &sub ($Htbl,-128); # size optimization
219 &lea ($Hshr4,"16+128(%rsp)");
220 { my @lo =($nlo,$nhi);
221 my @hi =($Zlo,$Zhi);
222
223 &xor ($dat,$dat);
224 for ($i=0,$j=-2;$i<18;$i++,$j++) {
225 &mov ("$j(%rsp)",&LB($dat)) if ($i>1);
226 &or ($lo[0],$tmp) if ($i>1);
227 &mov (&LB($dat),&LB($lo[1])) if ($i>0 && $i<17);
228 &shr ($lo[1],4) if ($i>0 && $i<17);
229 &mov ($tmp,$hi[1]) if ($i>0 && $i<17);
230 &shr ($hi[1],4) if ($i>0 && $i<17);
231 &mov ("8*$j($Hshr4)",$hi[0]) if ($i>1);
232 &mov ($hi[0],"16*$i+0-128($Htbl)") if ($i<16);
233 &shl (&LB($dat),4) if ($i>0 && $i<17);
234 &mov ("8*$j-128($Hshr4)",$lo[0]) if ($i>1);
235 &mov ($lo[0],"16*$i+8-128($Htbl)") if ($i<16);
236 &shl ($tmp,60) if ($i>0 && $i<17);
237
238 push (@lo,shift(@lo));
239 push (@hi,shift(@hi));
240 }
241 }
242 &add ($Htbl,-128);
243 &mov ($Zlo,"8($Xi)");
244 &mov ($Zhi,"0($Xi)");
245 &add ($len,$inp); # pointer to the end of data
246 &lea ($rem_8bit,".Lrem_8bit(%rip)");
247 &jmp (".Louter_loop");
248
249$code.=".align 16\n.Louter_loop:\n";
250 &xor ($Zhi,"($inp)");
251 &mov ("%rdx","8($inp)");
252 &lea ($inp,"16($inp)");
253 &xor ("%rdx",$Zlo);
254 &mov ("($Xi)",$Zhi);
255 &mov ("8($Xi)","%rdx");
256 &shr ("%rdx",32);
257
258 &xor ($nlo,$nlo);
259 &rol ($dat,8);
260 &mov (&LB($nlo),&LB($dat));
261 &movz ($nhi[0],&LB($dat));
262 &shl (&LB($nlo),4);
263 &shr ($nhi[0],4);
264
265 for ($j=11,$i=0;$i<15;$i++) {
266 &rol ($dat,8);
267 &xor ($Zlo,"8($Htbl,$nlo)") if ($i>0);
268 &xor ($Zhi,"($Htbl,$nlo)") if ($i>0);
269 &mov ($Zlo,"8($Htbl,$nlo)") if ($i==0);
270 &mov ($Zhi,"($Htbl,$nlo)") if ($i==0);
271
272 &mov (&LB($nlo),&LB($dat));
273 &xor ($Zlo,$tmp) if ($i>0);
274 &movzw ($rem[1],"($rem_8bit,$rem[1],2)") if ($i>0);
275
276 &movz ($nhi[1],&LB($dat));
277 &shl (&LB($nlo),4);
278 &movzb ($rem[0],"(%rsp,$nhi[0])");
279
280 &shr ($nhi[1],4) if ($i<14);
281 &and ($nhi[1],0xf0) if ($i==14);
282 &shl ($rem[1],48) if ($i>0);
283 &xor ($rem[0],$Zlo);
284
285 &mov ($tmp,$Zhi);
286 &xor ($Zhi,$rem[1]) if ($i>0);
287 &shr ($Zlo,8);
288
289 &movz ($rem[0],&LB($rem[0]));
290 &mov ($dat,"$j($Xi)") if (--$j%4==0 && $j>=0);
291 &shr ($Zhi,8);
292
293 &xor ($Zlo,"-128($Hshr4,$nhi[0],8)");
294 &shl ($tmp,56);
295 &xor ($Zhi,"($Hshr4,$nhi[0],8)");
296
297 unshift (@nhi,pop(@nhi)); # "rotate" registers
298 unshift (@rem,pop(@rem));
299 }
300 &movzw ($rem[1],"($rem_8bit,$rem[1],2)");
301 &xor ($Zlo,"8($Htbl,$nlo)");
302 &xor ($Zhi,"($Htbl,$nlo)");
303
304 &shl ($rem[1],48);
305 &xor ($Zlo,$tmp);
306
307 &xor ($Zhi,$rem[1]);
308 &movz ($rem[0],&LB($Zlo));
309 &shr ($Zlo,4);
310
311 &mov ($tmp,$Zhi);
312 &shl (&LB($rem[0]),4);
313 &shr ($Zhi,4);
314
315 &xor ($Zlo,"8($Htbl,$nhi[0])");
316 &movzw ($rem[0],"($rem_8bit,$rem[0],2)");
317 &shl ($tmp,60);
318
319 &xor ($Zhi,"($Htbl,$nhi[0])");
320 &xor ($Zlo,$tmp);
321 &shl ($rem[0],48);
322
323 &bswap ($Zlo);
324 &xor ($Zhi,$rem[0]);
325
326 &bswap ($Zhi);
327 &cmp ($inp,$len);
328 &jb (".Louter_loop");
329}
330$code.=<<___;
331 mov $Zlo,8($Xi)
332 mov $Zhi,($Xi)
333
334 lea 280(%rsp),%rsi
335 mov 0(%rsi),%r15
336 mov 8(%rsi),%r14
337 mov 16(%rsi),%r13
338 mov 24(%rsi),%r12
339 mov 32(%rsi),%rbp
340 mov 40(%rsi),%rbx
341 lea 48(%rsi),%rsp
342.Lghash_epilogue:
343 ret
344.size gcm_ghash_4bit,.-gcm_ghash_4bit
345___
346
347######################################################################
348# PCLMULQDQ version.
349
350@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
351 ("%rdi","%rsi","%rdx","%rcx"); # Unix order
352
353($Xi,$Xhi)=("%xmm0","%xmm1"); $Hkey="%xmm2";
354($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");
355
356sub clmul64x64_T2 { # minimal register pressure
357my ($Xhi,$Xi,$Hkey,$modulo)=@_;
358
359$code.=<<___ if (!defined($modulo));
360 movdqa $Xi,$Xhi #
361 pshufd \$0b01001110,$Xi,$T1
362 pshufd \$0b01001110,$Hkey,$T2
363 pxor $Xi,$T1 #
364 pxor $Hkey,$T2
365___
366$code.=<<___;
367 pclmulqdq \$0x00,$Hkey,$Xi #######
368 pclmulqdq \$0x11,$Hkey,$Xhi #######
369 pclmulqdq \$0x00,$T2,$T1 #######
370 pxor $Xi,$T1 #
371 pxor $Xhi,$T1 #
372
373 movdqa $T1,$T2 #
374 psrldq \$8,$T1
375 pslldq \$8,$T2 #
376 pxor $T1,$Xhi
377 pxor $T2,$Xi #
378___
379}
380
381sub reduction_alg9 { # 17/13 times faster than Intel version
382my ($Xhi,$Xi) = @_;
383
384$code.=<<___;
385 # 1st phase
386 movdqa $Xi,$T1 #
387 psllq \$1,$Xi
388 pxor $T1,$Xi #
389 psllq \$5,$Xi #
390 pxor $T1,$Xi #
391 psllq \$57,$Xi #
392 movdqa $Xi,$T2 #
393 pslldq \$8,$Xi
394 psrldq \$8,$T2 #
395 pxor $T1,$Xi
396 pxor $T2,$Xhi #
397
398 # 2nd phase
399 movdqa $Xi,$T2
400 psrlq \$5,$Xi
401 pxor $T2,$Xi #
402 psrlq \$1,$Xi #
403 pxor $T2,$Xi #
404 pxor $Xhi,$T2
405 psrlq \$1,$Xi #
406 pxor $T2,$Xi #
407___
408}
409
410{ my ($Htbl,$Xip)=@_4args;
411
412$code.=<<___;
413.globl gcm_init_clmul
414.type gcm_init_clmul,\@abi-omnipotent
415.align 16
416gcm_init_clmul:
417 _CET_ENDBR
418 movdqu ($Xip),$Hkey
419 pshufd \$0b01001110,$Hkey,$Hkey # dword swap
420
421 # <<1 twist
422 pshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword
423 movdqa $Hkey,$T1
424 psllq \$1,$Hkey
425 pxor $T3,$T3 #
426 psrlq \$63,$T1
427 pcmpgtd $T2,$T3 # broadcast carry bit
428 pslldq \$8,$T1
429 por $T1,$Hkey # H<<=1
430
431 # magic reduction
432 pand .L0x1c2_polynomial(%rip),$T3
433 pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial
434
435 # calculate H^2
436 movdqa $Hkey,$Xi
437___
438 &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
439 &reduction_alg9 ($Xhi,$Xi);
440$code.=<<___;
441 movdqu $Hkey,($Htbl) # save H
442 movdqu $Xi,16($Htbl) # save H^2
443 ret
444.size gcm_init_clmul,.-gcm_init_clmul
445___
446}
447
448{ my ($Xip,$Htbl)=@_4args;
449
450$code.=<<___;
451.globl gcm_gmult_clmul
452.type gcm_gmult_clmul,\@abi-omnipotent
453.align 16
454gcm_gmult_clmul:
455 _CET_ENDBR
456 movdqu ($Xip),$Xi
457 movdqa .Lbswap_mask(%rip),$T3
458 movdqu ($Htbl),$Hkey
459 pshufb $T3,$Xi
460___
461 &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
462 &reduction_alg9 ($Xhi,$Xi);
463$code.=<<___;
464 pshufb $T3,$Xi
465 movdqu $Xi,($Xip)
466 ret
467.size gcm_gmult_clmul,.-gcm_gmult_clmul
468___
469}
470
471{ my ($Xip,$Htbl,$inp,$len)=@_4args;
472 my $Xn="%xmm6";
473 my $Xhn="%xmm7";
474 my $Hkey2="%xmm8";
475 my $T1n="%xmm9";
476 my $T2n="%xmm10";
477
478$code.=<<___;
479.globl gcm_ghash_clmul
480.type gcm_ghash_clmul,\@abi-omnipotent
481.align 16
482gcm_ghash_clmul:
483 _CET_ENDBR
484___
485$code.=<<___ if ($win64);
486.LSEH_begin_gcm_ghash_clmul:
487 # I can't trust assembler to use specific encoding:-(
488 .byte 0x48,0x83,0xec,0x58 #sub \$0x58,%rsp
489 .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
490 .byte 0x0f,0x29,0x7c,0x24,0x10 #movdqa %xmm7,0x10(%rsp)
491 .byte 0x44,0x0f,0x29,0x44,0x24,0x20 #movaps %xmm8,0x20(%rsp)
492 .byte 0x44,0x0f,0x29,0x4c,0x24,0x30 #movaps %xmm9,0x30(%rsp)
493 .byte 0x44,0x0f,0x29,0x54,0x24,0x40 #movaps %xmm10,0x40(%rsp)
494___
495$code.=<<___;
496 movdqa .Lbswap_mask(%rip),$T3
497
498 movdqu ($Xip),$Xi
499 movdqu ($Htbl),$Hkey
500 pshufb $T3,$Xi
501
502 sub \$0x10,$len
503 jz .Lodd_tail
504
505 movdqu 16($Htbl),$Hkey2
506 #######
507 # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
508 # [(H*Ii+1) + (H*Xi+1)] mod P =
509 # [(H*Ii+1) + H^2*(Ii+Xi)] mod P
510 #
511 movdqu ($inp),$T1 # Ii
512 movdqu 16($inp),$Xn # Ii+1
513 pshufb $T3,$T1
514 pshufb $T3,$Xn
515 pxor $T1,$Xi # Ii+Xi
516___
517 &clmul64x64_T2 ($Xhn,$Xn,$Hkey); # H*Ii+1
518$code.=<<___;
519 movdqa $Xi,$Xhi #
520 pshufd \$0b01001110,$Xi,$T1
521 pshufd \$0b01001110,$Hkey2,$T2
522 pxor $Xi,$T1 #
523 pxor $Hkey2,$T2
524
525 lea 32($inp),$inp # i+=2
526 sub \$0x20,$len
527 jbe .Leven_tail
528
529.Lmod_loop:
530___
531 &clmul64x64_T2 ($Xhi,$Xi,$Hkey2,1); # H^2*(Ii+Xi)
532$code.=<<___;
533 movdqu ($inp),$T1 # Ii
534 pxor $Xn,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
535 pxor $Xhn,$Xhi
536
537 movdqu 16($inp),$Xn # Ii+1
538 pshufb $T3,$T1
539 pshufb $T3,$Xn
540
541 movdqa $Xn,$Xhn #
542 pshufd \$0b01001110,$Xn,$T1n
543 pshufd \$0b01001110,$Hkey,$T2n
544 pxor $Xn,$T1n #
545 pxor $Hkey,$T2n
546 pxor $T1,$Xhi # "Ii+Xi", consume early
547
548 movdqa $Xi,$T1 # 1st phase
549 psllq \$1,$Xi
550 pxor $T1,$Xi #
551 psllq \$5,$Xi #
552 pxor $T1,$Xi #
553 pclmulqdq \$0x00,$Hkey,$Xn #######
554 psllq \$57,$Xi #
555 movdqa $Xi,$T2 #
556 pslldq \$8,$Xi
557 psrldq \$8,$T2 #
558 pxor $T1,$Xi
559 pxor $T2,$Xhi #
560
561 pclmulqdq \$0x11,$Hkey,$Xhn #######
562 movdqa $Xi,$T2 # 2nd phase
563 psrlq \$5,$Xi
564 pxor $T2,$Xi #
565 psrlq \$1,$Xi #
566 pxor $T2,$Xi #
567 pxor $Xhi,$T2
568 psrlq \$1,$Xi #
569 pxor $T2,$Xi #
570
571 pclmulqdq \$0x00,$T2n,$T1n #######
572 movdqa $Xi,$Xhi #
573 pshufd \$0b01001110,$Xi,$T1
574 pshufd \$0b01001110,$Hkey2,$T2
575 pxor $Xi,$T1 #
576 pxor $Hkey2,$T2
577
578 pxor $Xn,$T1n #
579 pxor $Xhn,$T1n #
580 movdqa $T1n,$T2n #
581 psrldq \$8,$T1n
582 pslldq \$8,$T2n #
583 pxor $T1n,$Xhn
584 pxor $T2n,$Xn #
585
586 lea 32($inp),$inp
587 sub \$0x20,$len
588 ja .Lmod_loop
589
590.Leven_tail:
591___
592 &clmul64x64_T2 ($Xhi,$Xi,$Hkey2,1); # H^2*(Ii+Xi)
593$code.=<<___;
594 pxor $Xn,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
595 pxor $Xhn,$Xhi
596___
597 &reduction_alg9 ($Xhi,$Xi);
598$code.=<<___;
599 test $len,$len
600 jnz .Ldone
601
602.Lodd_tail:
603 movdqu ($inp),$T1 # Ii
604 pshufb $T3,$T1
605 pxor $T1,$Xi # Ii+Xi
606___
607 &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
608 &reduction_alg9 ($Xhi,$Xi);
609$code.=<<___;
610.Ldone:
611 pshufb $T3,$Xi
612 movdqu $Xi,($Xip)
613___
614$code.=<<___ if ($win64);
615 movaps (%rsp),%xmm6
616 movaps 0x10(%rsp),%xmm7
617 movaps 0x20(%rsp),%xmm8
618 movaps 0x30(%rsp),%xmm9
619 movaps 0x40(%rsp),%xmm10
620 add \$0x58,%rsp
621___
622$code.=<<___;
623 ret
624.LSEH_end_gcm_ghash_clmul:
625.size gcm_ghash_clmul,.-gcm_ghash_clmul
626___
627}
628
629$code.=<<___;
630.section .rodata
631.align 64
632.Lbswap_mask:
633 .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
634.L0x1c2_polynomial:
635 .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
636.align 64
637.type .Lrem_4bit,\@object
638.Lrem_4bit:
639 .long 0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`
640 .long 0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`
641 .long 0,`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`
642 .long 0,`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`
643.type .Lrem_8bit,\@object
644.Lrem_8bit:
645 .value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
646 .value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
647 .value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
648 .value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
649 .value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
650 .value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
651 .value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
652 .value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
653 .value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
654 .value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
655 .value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
656 .value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
657 .value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
658 .value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
659 .value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
660 .value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
661 .value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
662 .value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
663 .value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
664 .value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
665 .value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
666 .value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
667 .value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
668 .value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
669 .value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
670 .value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
671 .value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
672 .value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
673 .value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
674 .value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
675 .value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
676 .value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE
677.align 64
678.text
679___
680
681# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
682# CONTEXT *context,DISPATCHER_CONTEXT *disp)
683if ($win64) {
684$rec="%rcx";
685$frame="%rdx";
686$context="%r8";
687$disp="%r9";
688
689$code.=<<___;
690.extern __imp_RtlVirtualUnwind
691.type se_handler,\@abi-omnipotent
692.align 16
693se_handler:
694 _CET_ENDBR
695 push %rsi
696 push %rdi
697 push %rbx
698 push %rbp
699 push %r12
700 push %r13
701 push %r14
702 push %r15
703 pushfq
704 sub \$64,%rsp
705
706 mov 120($context),%rax # pull context->Rax
707 mov 248($context),%rbx # pull context->Rip
708
709 mov 8($disp),%rsi # disp->ImageBase
710 mov 56($disp),%r11 # disp->HandlerData
711
712 mov 0(%r11),%r10d # HandlerData[0]
713 lea (%rsi,%r10),%r10 # prologue label
714 cmp %r10,%rbx # context->Rip<prologue label
715 jb .Lin_prologue
716
717 mov 152($context),%rax # pull context->Rsp
718
719 mov 4(%r11),%r10d # HandlerData[1]
720 lea (%rsi,%r10),%r10 # epilogue label
721 cmp %r10,%rbx # context->Rip>=epilogue label
722 jae .Lin_prologue
723
724 lea 24(%rax),%rax # adjust "rsp"
725
726 mov -8(%rax),%rbx
727 mov -16(%rax),%rbp
728 mov -24(%rax),%r12
729 mov %rbx,144($context) # restore context->Rbx
730 mov %rbp,160($context) # restore context->Rbp
731 mov %r12,216($context) # restore context->R12
732
733.Lin_prologue:
734 mov 8(%rax),%rdi
735 mov 16(%rax),%rsi
736 mov %rax,152($context) # restore context->Rsp
737 mov %rsi,168($context) # restore context->Rsi
738 mov %rdi,176($context) # restore context->Rdi
739
740 mov 40($disp),%rdi # disp->ContextRecord
741 mov $context,%rsi # context
742 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
743 .long 0xa548f3fc # cld; rep movsq
744
745 mov $disp,%rsi
746 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
747 mov 8(%rsi),%rdx # arg2, disp->ImageBase
748 mov 0(%rsi),%r8 # arg3, disp->ControlPc
749 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
750 mov 40(%rsi),%r10 # disp->ContextRecord
751 lea 56(%rsi),%r11 # &disp->HandlerData
752 lea 24(%rsi),%r12 # &disp->EstablisherFrame
753 mov %r10,32(%rsp) # arg5
754 mov %r11,40(%rsp) # arg6
755 mov %r12,48(%rsp) # arg7
756 mov %rcx,56(%rsp) # arg8, (NULL)
757 call *__imp_RtlVirtualUnwind(%rip)
758
759 mov \$1,%eax # ExceptionContinueSearch
760 add \$64,%rsp
761 popfq
762 pop %r15
763 pop %r14
764 pop %r13
765 pop %r12
766 pop %rbp
767 pop %rbx
768 pop %rdi
769 pop %rsi
770 ret
771.size se_handler,.-se_handler
772
773.section .pdata
774.align 4
775 .rva .LSEH_begin_gcm_gmult_4bit
776 .rva .LSEH_end_gcm_gmult_4bit
777 .rva .LSEH_info_gcm_gmult_4bit
778
779 .rva .LSEH_begin_gcm_ghash_4bit
780 .rva .LSEH_end_gcm_ghash_4bit
781 .rva .LSEH_info_gcm_ghash_4bit
782
783 .rva .LSEH_begin_gcm_ghash_clmul
784 .rva .LSEH_end_gcm_ghash_clmul
785 .rva .LSEH_info_gcm_ghash_clmul
786
787.section .xdata
788.align 8
789.LSEH_info_gcm_gmult_4bit:
790 .byte 9,0,0,0
791 .rva se_handler
792 .rva .Lgmult_prologue,.Lgmult_epilogue # HandlerData
793.LSEH_info_gcm_ghash_4bit:
794 .byte 9,0,0,0
795 .rva se_handler
796 .rva .Lghash_prologue,.Lghash_epilogue # HandlerData
797.LSEH_info_gcm_ghash_clmul:
798 .byte 0x01,0x1f,0x0b,0x00
799 .byte 0x1f,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10
800 .byte 0x19,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9
801 .byte 0x13,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8
802 .byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
803 .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6
804 .byte 0x04,0xa2,0x00,0x00 #sub rsp,0x58
805___
806}
807
808$code =~ s/\`([^\`]*)\`/eval($1)/gem;
809
810print $code;
811
812close STDOUT;
diff --git a/src/lib/libcrypto/modes/cbc128.c b/src/lib/libcrypto/modes/cbc128.c
deleted file mode 100644
index f8ebf79a87..0000000000
--- a/src/lib/libcrypto/modes/cbc128.c
+++ /dev/null
@@ -1,214 +0,0 @@
1/* $OpenBSD: cbc128.c,v 1.8 2023/07/08 14:56:54 beck Exp $ */
2/* ====================================================================
3 * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 *
50 */
51
52#include <openssl/crypto.h>
53#include "modes_local.h"
54#include <string.h>
55
56#ifndef MODES_DEBUG
57# ifndef NDEBUG
58# define NDEBUG
59# endif
60#endif
61
62#undef STRICT_ALIGNMENT
63#ifdef __STRICT_ALIGNMENT
64#define STRICT_ALIGNMENT 1
65#else
66#define STRICT_ALIGNMENT 0
67#endif
68
69void
70CRYPTO_cbc128_encrypt(const unsigned char *in, unsigned char *out,
71 size_t len, const void *key,
72 unsigned char ivec[16], block128_f block)
73{
74 size_t n;
75 const unsigned char *iv = ivec;
76
77#if !defined(OPENSSL_SMALL_FOOTPRINT)
78 if (STRICT_ALIGNMENT &&
79 ((size_t)in|(size_t)out|(size_t)ivec) % sizeof(size_t) != 0) {
80 while (len >= 16) {
81 for (n = 0; n < 16; ++n)
82 out[n] = in[n] ^ iv[n];
83 (*block)(out, out, key);
84 iv = out;
85 len -= 16;
86 in += 16;
87 out += 16;
88 }
89 } else {
90 while (len >= 16) {
91 for (n = 0; n < 16; n += sizeof(size_t))
92 *(size_t *)(out + n) =
93 *(size_t *)(in + n) ^ *(size_t *)(iv + n);
94 (*block)(out, out, key);
95 iv = out;
96 len -= 16;
97 in += 16;
98 out += 16;
99 }
100 }
101#endif
102 while (len) {
103 for (n = 0; n < 16 && n < len; ++n)
104 out[n] = in[n] ^ iv[n];
105 for (; n < 16; ++n)
106 out[n] = iv[n];
107 (*block)(out, out, key);
108 iv = out;
109 if (len <= 16)
110 break;
111 len -= 16;
112 in += 16;
113 out += 16;
114 }
115 memmove(ivec, iv, 16);
116}
117LCRYPTO_ALIAS(CRYPTO_cbc128_encrypt);
118
119void
120CRYPTO_cbc128_decrypt(const unsigned char *in, unsigned char *out,
121 size_t len, const void *key,
122 unsigned char ivec[16], block128_f block)
123{
124 size_t n;
125 union {
126 size_t t[16/sizeof(size_t)];
127 unsigned char c[16];
128 } tmp;
129
130#if !defined(OPENSSL_SMALL_FOOTPRINT)
131 if (in != out) {
132 const unsigned char *iv = ivec;
133
134 if (STRICT_ALIGNMENT &&
135 ((size_t)in|(size_t)out|(size_t)ivec) % sizeof(size_t) !=
136 0) {
137 while (len >= 16) {
138 (*block)(in, out, key);
139 for (n = 0; n < 16; ++n)
140 out[n] ^= iv[n];
141 iv = in;
142 len -= 16;
143 in += 16;
144 out += 16;
145 }
146 } else if (16 % sizeof(size_t) == 0) { /* always true */
147 while (len >= 16) {
148 size_t *out_t = (size_t *)out,
149 *iv_t = (size_t *)iv;
150
151 (*block)(in, out, key);
152 for (n = 0; n < 16/sizeof(size_t); n++)
153 out_t[n] ^= iv_t[n];
154 iv = in;
155 len -= 16;
156 in += 16;
157 out += 16;
158 }
159 }
160 memmove(ivec, iv, 16);
161 } else {
162 if (STRICT_ALIGNMENT &&
163 ((size_t)in|(size_t)out|(size_t)ivec) % sizeof(size_t) !=
164 0) {
165 unsigned char c;
166 while (len >= 16) {
167 (*block)(in, tmp.c, key);
168 for (n = 0; n < 16; ++n) {
169 c = in[n];
170 out[n] = tmp.c[n] ^ ivec[n];
171 ivec[n] = c;
172 }
173 len -= 16;
174 in += 16;
175 out += 16;
176 }
177 } else if (16 % sizeof(size_t) == 0) { /* always true */
178 while (len >= 16) {
179 size_t c, *out_t = (size_t *)out,
180 *ivec_t = (size_t *)ivec;
181 const size_t *in_t = (const size_t *)in;
182
183 (*block)(in, tmp.c, key);
184 for (n = 0; n < 16/sizeof(size_t); n++) {
185 c = in_t[n];
186 out_t[n] = tmp.t[n] ^ ivec_t[n];
187 ivec_t[n] = c;
188 }
189 len -= 16;
190 in += 16;
191 out += 16;
192 }
193 }
194 }
195#endif
196 while (len) {
197 unsigned char c;
198 (*block)(in, tmp.c, key);
199 for (n = 0; n < 16 && n < len; ++n) {
200 c = in[n];
201 out[n] = tmp.c[n] ^ ivec[n];
202 ivec[n] = c;
203 }
204 if (len <= 16) {
205 for (; n < 16; ++n)
206 ivec[n] = in[n];
207 break;
208 }
209 len -= 16;
210 in += 16;
211 out += 16;
212 }
213}
214LCRYPTO_ALIAS(CRYPTO_cbc128_decrypt);
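/*
 * Minimal usage sketch (illustrative; not part of the original file).  It
 * shows the block128_f convention with AES as an example cipher: the
 * (block128_f) cast of AES_encrypt is an assumption mirroring how the EVP
 * layer wires these modes up, and cbc_encrypt_example() is a hypothetical
 * helper.  Decryption is symmetric, using a decrypt key schedule,
 * AES_decrypt and CRYPTO_cbc128_decrypt().
 */
#include <openssl/aes.h>

static void
cbc_encrypt_example(const unsigned char key[16], unsigned char iv[16],
    const unsigned char *in, unsigned char *out, size_t len)
{
	AES_KEY aes;

	AES_set_encrypt_key(key, 128, &aes);
	/* len is normally a multiple of 16; no padding is applied here */
	CRYPTO_cbc128_encrypt(in, out, len, &aes, iv, (block128_f)AES_encrypt);
}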
diff --git a/src/lib/libcrypto/modes/ccm128.c b/src/lib/libcrypto/modes/ccm128.c
deleted file mode 100644
index 68c5cce5da..0000000000
--- a/src/lib/libcrypto/modes/ccm128.c
+++ /dev/null
@@ -1,498 +0,0 @@
1/* $OpenBSD: ccm128.c,v 1.8 2023/07/08 14:56:54 beck Exp $ */
2/* ====================================================================
3 * Copyright (c) 2011 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 */
50
51#include <openssl/crypto.h>
52#include "modes_local.h"
53#include <string.h>
54
55#ifndef MODES_DEBUG
56# ifndef NDEBUG
57# define NDEBUG
58# endif
59#endif
60
61/* First you set up the M and L parameters and pass the key schedule.
62 * This is called once per session setup... */
63void
64CRYPTO_ccm128_init(CCM128_CONTEXT *ctx,
65 unsigned int M, unsigned int L, void *key, block128_f block)
66{
67 memset(ctx->nonce.c, 0, sizeof(ctx->nonce.c));
68 ctx->nonce.c[0] = ((u8)(L - 1) & 7) | (u8)(((M - 2)/2) & 7) << 3;
69 ctx->blocks = 0;
70 ctx->block = block;
71 ctx->key = key;
72}
73LCRYPTO_ALIAS(CRYPTO_ccm128_init);
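/*
 * Worked example (illustrative; not in the original source): with a 16-byte
 * tag (M = 16) and an 8-byte length field (L = 8), the flags byte computed
 * above is ((8 - 1) & 7) | (((16 - 2) / 2) & 7) << 3 = 0x07 | 0x38 = 0x3f.
 */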
74
75/* !!! Following interfaces are to be called *once* per packet !!! */
76
77/* Then you set up the per-message nonce and pass the length of the message */
78int
79CRYPTO_ccm128_setiv(CCM128_CONTEXT *ctx,
80 const unsigned char *nonce, size_t nlen, size_t mlen)
81{
82 unsigned int L = ctx->nonce.c[0] & 7; /* the L parameter */
83
84 if (nlen < (14 - L))
85 return -1; /* nonce is too short */
86
87 if (sizeof(mlen) == 8 && L >= 3) {
88 ctx->nonce.c[8] = (u8)(mlen >> (56 % (sizeof(mlen)*8)));
89 ctx->nonce.c[9] = (u8)(mlen >> (48 % (sizeof(mlen)*8)));
90 ctx->nonce.c[10] = (u8)(mlen >> (40 % (sizeof(mlen)*8)));
91 ctx->nonce.c[11] = (u8)(mlen >> (32 % (sizeof(mlen)*8)));
92 } else
93 ctx->nonce.u[1] = 0;
94
95 ctx->nonce.c[12] = (u8)(mlen >> 24);
96 ctx->nonce.c[13] = (u8)(mlen >> 16);
97 ctx->nonce.c[14] = (u8)(mlen >> 8);
98 ctx->nonce.c[15] = (u8)mlen;
99
100 ctx->nonce.c[0] &= ~0x40; /* clear Adata flag */
101 memcpy(&ctx->nonce.c[1], nonce, 14 - L);
102
103 return 0;
104}
105LCRYPTO_ALIAS(CRYPTO_ccm128_setiv);
106
107/* Then you pass the additional authentication data; this is optional */
108void
109CRYPTO_ccm128_aad(CCM128_CONTEXT *ctx,
110 const unsigned char *aad, size_t alen)
111{
112 unsigned int i;
113 block128_f block = ctx->block;
114
115 if (alen == 0)
116 return;
117
118 ctx->nonce.c[0] |= 0x40; /* set Adata flag */
119 (*block)(ctx->nonce.c, ctx->cmac.c, ctx->key),
120 ctx->blocks++;
121
122 if (alen < (0x10000 - 0x100)) {
123 ctx->cmac.c[0] ^= (u8)(alen >> 8);
124 ctx->cmac.c[1] ^= (u8)alen;
125 i = 2;
126 } else if (sizeof(alen) == 8 &&
127 alen >= (size_t)1 << (32 % (sizeof(alen)*8))) {
128 ctx->cmac.c[0] ^= 0xFF;
129 ctx->cmac.c[1] ^= 0xFF;
130 ctx->cmac.c[2] ^= (u8)(alen >> (56 % (sizeof(alen)*8)));
131 ctx->cmac.c[3] ^= (u8)(alen >> (48 % (sizeof(alen)*8)));
132 ctx->cmac.c[4] ^= (u8)(alen >> (40 % (sizeof(alen)*8)));
133 ctx->cmac.c[5] ^= (u8)(alen >> (32 % (sizeof(alen)*8)));
134 ctx->cmac.c[6] ^= (u8)(alen >> 24);
135 ctx->cmac.c[7] ^= (u8)(alen >> 16);
136 ctx->cmac.c[8] ^= (u8)(alen >> 8);
137 ctx->cmac.c[9] ^= (u8)alen;
138 i = 10;
139 } else {
140 ctx->cmac.c[0] ^= 0xFF;
141 ctx->cmac.c[1] ^= 0xFE;
142 ctx->cmac.c[2] ^= (u8)(alen >> 24);
143 ctx->cmac.c[3] ^= (u8)(alen >> 16);
144 ctx->cmac.c[4] ^= (u8)(alen >> 8);
145 ctx->cmac.c[5] ^= (u8)alen;
146 i = 6;
147 }
148
149 do {
150 for (; i < 16 && alen; ++i, ++aad, --alen)
151 ctx->cmac.c[i] ^= *aad;
152 (*block)(ctx->cmac.c, ctx->cmac.c, ctx->key),
153 ctx->blocks++;
154 i = 0;
155 } while (alen);
156}
157LCRYPTO_ALIAS(CRYPTO_ccm128_aad);
158
159/* Finally you encrypt or decrypt the message */
160
161/* The counter part of the nonce may not be larger than L*8 bits;
162 * L is not larger than 8, therefore a 64-bit counter suffices... */
163static void
164ctr64_inc(unsigned char *counter)
165{
166 unsigned int n = 8;
167 u8 c;
168
169 counter += 8;
170 do {
171 --n;
172 c = counter[n];
173 ++c;
174 counter[n] = c;
175 if (c)
176 return;
177 } while (n);
178}
179
180int
181CRYPTO_ccm128_encrypt(CCM128_CONTEXT *ctx,
182 const unsigned char *inp, unsigned char *out,
183 size_t len)
184{
185 size_t n;
186 unsigned int i, L;
187 unsigned char flags0 = ctx->nonce.c[0];
188 block128_f block = ctx->block;
189 void *key = ctx->key;
190 union {
191 u64 u[2];
192 u8 c[16];
193 } scratch;
194
195 if (!(flags0 & 0x40))
196 (*block)(ctx->nonce.c, ctx->cmac.c, key),
197 ctx->blocks++;
198
199 ctx->nonce.c[0] = L = flags0 & 7;
200 for (n = 0, i = 15 - L; i < 15; ++i) {
201 n |= ctx->nonce.c[i];
202 ctx->nonce.c[i] = 0;
203 n <<= 8;
204 }
205 n |= ctx->nonce.c[15]; /* reconstructed length */
206 ctx->nonce.c[15] = 1;
207
208 if (n != len)
209 return -1; /* length mismatch */
210
211 ctx->blocks += ((len + 15) >> 3)|1;
212 if (ctx->blocks > (U64(1) << 61))
213 return -2; /* too much data */
214
215 while (len >= 16) {
216#ifdef __STRICT_ALIGNMENT
217 union {
218 u64 u[2];
219 u8 c[16];
220 } temp;
221
222 memcpy(temp.c, inp, 16);
223 ctx->cmac.u[0] ^= temp.u[0];
224 ctx->cmac.u[1] ^= temp.u[1];
225#else
226 ctx->cmac.u[0] ^= ((u64 *)inp)[0];
227 ctx->cmac.u[1] ^= ((u64 *)inp)[1];
228#endif
229 (*block)(ctx->cmac.c, ctx->cmac.c, key);
230 (*block)(ctx->nonce.c, scratch.c, key);
231 ctr64_inc(ctx->nonce.c);
232#ifdef __STRICT_ALIGNMENT
233 temp.u[0] ^= scratch.u[0];
234 temp.u[1] ^= scratch.u[1];
235 memcpy(out, temp.c, 16);
236#else
237 ((u64 *)out)[0] = scratch.u[0] ^ ((u64 *)inp)[0];
238 ((u64 *)out)[1] = scratch.u[1] ^ ((u64 *)inp)[1];
239#endif
240 inp += 16;
241 out += 16;
242 len -= 16;
243 }
244
245 if (len) {
246 for (i = 0; i < len; ++i)
247 ctx->cmac.c[i] ^= inp[i];
248 (*block)(ctx->cmac.c, ctx->cmac.c, key);
249 (*block)(ctx->nonce.c, scratch.c, key);
250 for (i = 0; i < len; ++i)
251 out[i] = scratch.c[i] ^ inp[i];
252 }
253
254 for (i = 15 - L; i < 16; ++i)
255 ctx->nonce.c[i] = 0;
256
257 (*block)(ctx->nonce.c, scratch.c, key);
258 ctx->cmac.u[0] ^= scratch.u[0];
259 ctx->cmac.u[1] ^= scratch.u[1];
260
261 ctx->nonce.c[0] = flags0;
262
263 return 0;
264}
265LCRYPTO_ALIAS(CRYPTO_ccm128_encrypt);
266
267int
268CRYPTO_ccm128_decrypt(CCM128_CONTEXT *ctx,
269 const unsigned char *inp, unsigned char *out,
270 size_t len)
271{
272 size_t n;
273 unsigned int i, L;
274 unsigned char flags0 = ctx->nonce.c[0];
275 block128_f block = ctx->block;
276 void *key = ctx->key;
277 union {
278 u64 u[2];
279 u8 c[16];
280 } scratch;
281
282 if (!(flags0 & 0x40))
283 (*block)(ctx->nonce.c, ctx->cmac.c, key);
284
285 ctx->nonce.c[0] = L = flags0 & 7;
286 for (n = 0, i = 15 - L; i < 15; ++i) {
287 n |= ctx->nonce.c[i];
288 ctx->nonce.c[i] = 0;
289 n <<= 8;
290 }
291 n |= ctx->nonce.c[15]; /* reconstructed length */
292 ctx->nonce.c[15] = 1;
293
294 if (n != len)
295 return -1;
296
297 while (len >= 16) {
298#ifdef __STRICT_ALIGNMENT
299 union {
300 u64 u[2];
301 u8 c[16];
302 } temp;
303#endif
304 (*block)(ctx->nonce.c, scratch.c, key);
305 ctr64_inc(ctx->nonce.c);
306#ifdef __STRICT_ALIGNMENT
307 memcpy(temp.c, inp, 16);
308 ctx->cmac.u[0] ^= (scratch.u[0] ^= temp.u[0]);
309 ctx->cmac.u[1] ^= (scratch.u[1] ^= temp.u[1]);
310 memcpy(out, scratch.c, 16);
311#else
312 ctx->cmac.u[0] ^= (((u64 *)out)[0] = scratch.u[0] ^
313 ((u64 *)inp)[0]);
314 ctx->cmac.u[1] ^= (((u64 *)out)[1] = scratch.u[1] ^
315 ((u64 *)inp)[1]);
316#endif
317 (*block)(ctx->cmac.c, ctx->cmac.c, key);
318
319 inp += 16;
320 out += 16;
321 len -= 16;
322 }
323
324 if (len) {
325 (*block)(ctx->nonce.c, scratch.c, key);
326 for (i = 0; i < len; ++i)
327 ctx->cmac.c[i] ^= (out[i] = scratch.c[i] ^ inp[i]);
328 (*block)(ctx->cmac.c, ctx->cmac.c, key);
329 }
330
331 for (i = 15 - L; i < 16; ++i)
332 ctx->nonce.c[i] = 0;
333
334 (*block)(ctx->nonce.c, scratch.c, key);
335 ctx->cmac.u[0] ^= scratch.u[0];
336 ctx->cmac.u[1] ^= scratch.u[1];
337
338 ctx->nonce.c[0] = flags0;
339
340 return 0;
341}
342LCRYPTO_ALIAS(CRYPTO_ccm128_decrypt);
343
344static void
345ctr64_add(unsigned char *counter, size_t inc)
346{
347 size_t n = 8, val = 0;
348
349 counter += 8;
350 do {
351 --n;
352 val += counter[n] + (inc & 0xff);
353 counter[n] = (unsigned char)val;
354 val >>= 8; /* carry bit */
355 inc >>= 8;
356 } while (n && (inc || val));
357}
358
359int
360CRYPTO_ccm128_encrypt_ccm64(CCM128_CONTEXT *ctx,
361 const unsigned char *inp, unsigned char *out,
362 size_t len, ccm128_f stream)
363{
364 size_t n;
365 unsigned int i, L;
366 unsigned char flags0 = ctx->nonce.c[0];
367 block128_f block = ctx->block;
368 void *key = ctx->key;
369 union {
370 u64 u[2];
371 u8 c[16];
372 } scratch;
373
374 if (!(flags0 & 0x40))
375 (*block)(ctx->nonce.c, ctx->cmac.c, key),
376 ctx->blocks++;
377
378 ctx->nonce.c[0] = L = flags0 & 7;
379 for (n = 0, i = 15 - L; i < 15; ++i) {
380 n |= ctx->nonce.c[i];
381 ctx->nonce.c[i] = 0;
382 n <<= 8;
383 }
384 n |= ctx->nonce.c[15]; /* reconstructed length */
385 ctx->nonce.c[15] = 1;
386
387 if (n != len)
388 return -1; /* length mismatch */
389
390 ctx->blocks += ((len + 15) >> 3)|1;
391 if (ctx->blocks > (U64(1) << 61))
392 return -2; /* too much data */
393
394 if ((n = len/16)) {
395 (*stream)(inp, out, n, key, ctx->nonce.c, ctx->cmac.c);
396 n *= 16;
397 inp += n;
398 out += n;
399 len -= n;
400 if (len)
401 ctr64_add(ctx->nonce.c, n/16);
402 }
403
404 if (len) {
405 for (i = 0; i < len; ++i)
406 ctx->cmac.c[i] ^= inp[i];
407 (*block)(ctx->cmac.c, ctx->cmac.c, key);
408 (*block)(ctx->nonce.c, scratch.c, key);
409 for (i = 0; i < len; ++i)
410 out[i] = scratch.c[i] ^ inp[i];
411 }
412
413 for (i = 15 - L; i < 16; ++i)
414 ctx->nonce.c[i] = 0;
415
416 (*block)(ctx->nonce.c, scratch.c, key);
417 ctx->cmac.u[0] ^= scratch.u[0];
418 ctx->cmac.u[1] ^= scratch.u[1];
419
420 ctx->nonce.c[0] = flags0;
421
422 return 0;
423}
424LCRYPTO_ALIAS(CRYPTO_ccm128_encrypt_ccm64);
425
426int
427CRYPTO_ccm128_decrypt_ccm64(CCM128_CONTEXT *ctx,
428 const unsigned char *inp, unsigned char *out,
429 size_t len, ccm128_f stream)
430{
431 size_t n;
432 unsigned int i, L;
433 unsigned char flags0 = ctx->nonce.c[0];
434 block128_f block = ctx->block;
435 void *key = ctx->key;
436 union {
437 u64 u[2];
438 u8 c[16];
439 } scratch;
440
441 if (!(flags0 & 0x40))
442 (*block)(ctx->nonce.c, ctx->cmac.c, key);
443
444 ctx->nonce.c[0] = L = flags0 & 7;
445 for (n = 0, i = 15 - L; i < 15; ++i) {
446 n |= ctx->nonce.c[i];
447 ctx->nonce.c[i] = 0;
448 n <<= 8;
449 }
450 n |= ctx->nonce.c[15]; /* reconstructed length */
451 ctx->nonce.c[15] = 1;
452
453 if (n != len)
454 return -1;
455
456 if ((n = len/16)) {
457 (*stream)(inp, out, n, key, ctx->nonce.c, ctx->cmac.c);
458 n *= 16;
459 inp += n;
460 out += n;
461 len -= n;
462 if (len)
463 ctr64_add(ctx->nonce.c, n/16);
464 }
465
466 if (len) {
467 (*block)(ctx->nonce.c, scratch.c, key);
468 for (i = 0; i < len; ++i)
469 ctx->cmac.c[i] ^= (out[i] = scratch.c[i] ^ inp[i]);
470 (*block)(ctx->cmac.c, ctx->cmac.c, key);
471 }
472
473 for (i = 15 - L; i < 16; ++i)
474 ctx->nonce.c[i] = 0;
475
476 (*block)(ctx->nonce.c, scratch.c, key);
477 ctx->cmac.u[0] ^= scratch.u[0];
478 ctx->cmac.u[1] ^= scratch.u[1];
479
480 ctx->nonce.c[0] = flags0;
481
482 return 0;
483}
484LCRYPTO_ALIAS(CRYPTO_ccm128_decrypt_ccm64);
485
486size_t
487CRYPTO_ccm128_tag(CCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
488{
489 unsigned int M = (ctx->nonce.c[0] >> 3) & 7; /* the M parameter */
490
491 M *= 2;
492 M += 2;
493 if (len != M)
494 return 0;
495 memcpy(tag, ctx->cmac.c, M);
496 return M;
497}
498LCRYPTO_ALIAS(CRYPTO_ccm128_tag);
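/*
 * Minimal usage sketch (illustrative; not part of the original file): the
 * call sequence follows the comments above -- init once per key, then
 * setiv/aad/encrypt/tag once per packet.  AES is only an example cipher,
 * the (block128_f) cast of AES_encrypt is an assumption mirroring how the
 * EVP layer wires these modes up, and ccm_seal_example() is a hypothetical
 * helper.  nlen is assumed to be a valid CCM nonce length (7..13 bytes).
 */
#include <openssl/aes.h>

static int
ccm_seal_example(const unsigned char key[16],
    const unsigned char *nonce, size_t nlen,
    const unsigned char *aad, size_t alen,
    const unsigned char *pt, unsigned char *ct, size_t len,
    unsigned char tag[16])
{
	AES_KEY aes;
	CCM128_CONTEXT ccm;

	if (AES_set_encrypt_key(key, 128, &aes) != 0)
		return -1;
	/* M = 16-byte tag, L = 15 - nlen length-field bytes */
	CRYPTO_ccm128_init(&ccm, 16, 15 - nlen, &aes, (block128_f)AES_encrypt);
	if (CRYPTO_ccm128_setiv(&ccm, nonce, nlen, len) != 0)
		return -1;
	if (alen > 0)
		CRYPTO_ccm128_aad(&ccm, aad, alen);
	if (CRYPTO_ccm128_encrypt(&ccm, pt, ct, len) != 0)
		return -1;
	return CRYPTO_ccm128_tag(&ccm, tag, 16) == 16 ? 0 : -1;
}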
diff --git a/src/lib/libcrypto/modes/cfb128.c b/src/lib/libcrypto/modes/cfb128.c
deleted file mode 100644
index 931353a620..0000000000
--- a/src/lib/libcrypto/modes/cfb128.c
+++ /dev/null
@@ -1,251 +0,0 @@
1/* $OpenBSD: cfb128.c,v 1.7 2023/07/08 14:56:54 beck Exp $ */
2/* ====================================================================
3 * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 *
50 */
51
52#include <openssl/crypto.h>
53#include "modes_local.h"
54#include <string.h>
55
56#ifndef MODES_DEBUG
57# ifndef NDEBUG
58# define NDEBUG
59# endif
60#endif
61
62/* The input and output are encrypted as though 128-bit CFB mode is being
63 * used. The extra state information that records how much of the
64 * 128-bit block we have used is contained in *num.
65 */
66void
67CRYPTO_cfb128_encrypt(const unsigned char *in, unsigned char *out,
68 size_t len, const void *key,
69 unsigned char ivec[16], int *num,
70 int enc, block128_f block)
71{
72 unsigned int n;
73 size_t l = 0;
74
75 n = *num;
76
77 if (enc) {
78#if !defined(OPENSSL_SMALL_FOOTPRINT)
79 if (16 % sizeof(size_t) == 0)
80 do { /* always true actually */
81 while (n && len) {
82 *(out++) = ivec[n] ^= *(in++);
83 --len;
84 n = (n + 1) % 16;
85 }
86#ifdef __STRICT_ALIGNMENT
87 if (((size_t)in|(size_t)out|(size_t)ivec) %
88 sizeof(size_t) != 0)
89 break;
90#endif
91 while (len >= 16) {
92 (*block)(ivec, ivec, key);
93 for (; n < 16; n += sizeof(size_t)) {
94 *(size_t *)(out + n) =
95 *(size_t *)(ivec + n) ^= *(size_t *)(in +
96 n);
97 }
98 len -= 16;
99 out += 16;
100 in += 16;
101 n = 0;
102 }
103 if (len) {
104 (*block)(ivec, ivec, key);
105 while (len--) {
106 out[n] = ivec[n] ^= in[n];
107 ++n;
108 }
109 }
110 *num = n;
111 return;
112 } while (0);
113 /* the rest would be commonly eliminated by x86* compiler */
114#endif
115 while (l < len) {
116 if (n == 0) {
117 (*block)(ivec, ivec, key);
118 }
119 out[l] = ivec[n] ^= in[l];
120 ++l;
121 n = (n + 1) % 16;
122 }
123 *num = n;
124 } else {
125#if !defined(OPENSSL_SMALL_FOOTPRINT)
126 if (16 % sizeof(size_t) == 0)
127 do { /* always true actually */
128 while (n && len) {
129 unsigned char c;
130 *(out++) = ivec[n] ^ (c = *(in++));
131 ivec[n] = c;
132 --len;
133 n = (n + 1) % 16;
134 }
135#ifdef __STRICT_ALIGNMENT
136 if (((size_t)in|(size_t)out|(size_t)ivec) %
137 sizeof(size_t) != 0)
138 break;
139#endif
140 while (len >= 16) {
141 (*block)(ivec, ivec, key);
142 for (; n < 16; n += sizeof(size_t)) {
143 size_t t = *(size_t *)(in + n);
144 *(size_t *)(out + n) = *(size_t *)(ivec +
145 n) ^ t;
146 *(size_t *)(ivec + n) = t;
147 }
148 len -= 16;
149 out += 16;
150 in += 16;
151 n = 0;
152 }
153 if (len) {
154 (*block)(ivec, ivec, key);
155 while (len--) {
156 unsigned char c;
157 out[n] = ivec[n] ^ (c = in[n]);
158 ivec[n] = c;
159 ++n;
160 }
161 }
162 *num = n;
163 return;
164 } while (0);
165 /* the rest would be commonly eliminated by x86* compiler */
166#endif
167 while (l < len) {
168 unsigned char c;
169 if (n == 0) {
170 (*block)(ivec, ivec, key);
171 }
172 out[l] = ivec[n] ^ (c = in[l]);
173 ivec[n] = c;
174 ++l;
175 n = (n + 1) % 16;
176 }
177 *num = n;
178 }
179}
180LCRYPTO_ALIAS(CRYPTO_cfb128_encrypt);
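/*
 * Minimal usage sketch (illustrative; not part of the original file): as the
 * comment above says, *num tracks how far into the current 16-byte keystream
 * block we are and must start at 0 for a fresh IV.  AES is only an example
 * cipher, the (block128_f) cast is an assumption, and cfb_encrypt_example()
 * is a hypothetical helper.
 */
#include <openssl/aes.h>

static void
cfb_encrypt_example(const unsigned char key[16], unsigned char iv[16],
    const unsigned char *in, unsigned char *out, size_t len)
{
	AES_KEY aes;
	int num = 0;	/* no partial keystream block consumed yet */

	AES_set_encrypt_key(key, 128, &aes);
	/* enc = 1 encrypts; the same call with enc = 0 decrypts */
	CRYPTO_cfb128_encrypt(in, out, len, &aes, iv, &num, 1,
	    (block128_f)AES_encrypt);
}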
181
182/* This expects a single block of size nbits for both in and out. Note that
183 it corrupts any extra bits in the last byte of out */
184static void
185cfbr_encrypt_block(const unsigned char *in, unsigned char *out,
186 int nbits, const void *key,
187 unsigned char ivec[16], int enc,
188 block128_f block)
189{
190 int n, rem, num;
191	unsigned char ovec[16*2 + 1]; /* +1 because we dereference (but don't use) one byte off the end */
192
193 if (nbits <= 0 || nbits > 128)
194 return;
195
196 /* fill in the first half of the new IV with the current IV */
197 memcpy(ovec, ivec, 16);
198 /* construct the new IV */
199 (*block)(ivec, ivec, key);
200 num = (nbits + 7)/8;
201 if (enc) /* encrypt the input */
202 for (n = 0; n < num; ++n)
203 out[n] = (ovec[16 + n] = in[n] ^ ivec[n]);
204 else /* decrypt the input */
205 for (n = 0; n < num; ++n)
206 out[n] = (ovec[16 + n] = in[n]) ^ ivec[n];
207 /* shift ovec left... */
208 rem = nbits % 8;
209 num = nbits/8;
210 if (rem == 0)
211 memcpy(ivec, ovec + num, 16);
212 else
213 for (n = 0; n < 16; ++n)
214 ivec[n] = ovec[n + num] << rem |
215 ovec[n + num + 1] >> (8 - rem);
216
217 /* it is not necessary to cleanse ovec, since the IV is not secret */
218}
219
220/* N.B. This expects the input to be packed, MS bit first */
221void
222CRYPTO_cfb128_1_encrypt(const unsigned char *in, unsigned char *out,
223 size_t bits, const void *key,
224 unsigned char ivec[16], int *num,
225 int enc, block128_f block)
226{
227 size_t n;
228 unsigned char c[1], d[1];
229
230 for (n = 0; n < bits; ++n)
231 {
232 c[0] = (in[n/8] & (1 << (7 - n % 8))) ? 0x80 : 0;
233 cfbr_encrypt_block(c, d, 1, key, ivec, enc, block);
234 out[n/8] = (out[n/8] & ~(1 << (unsigned int)(7 - n % 8))) |
235 ((d[0] & 0x80) >> (unsigned int)(n % 8));
236 }
237}
238LCRYPTO_ALIAS(CRYPTO_cfb128_1_encrypt);
239
240void
241CRYPTO_cfb128_8_encrypt(const unsigned char *in, unsigned char *out,
242 size_t length, const void *key,
243 unsigned char ivec[16], int *num,
244 int enc, block128_f block)
245{
246 size_t n;
247
248 for (n = 0; n < length; ++n)
249 cfbr_encrypt_block(&in[n], &out[n], 8, key, ivec, enc, block);
250}
251LCRYPTO_ALIAS(CRYPTO_cfb128_8_encrypt);
diff --git a/src/lib/libcrypto/modes/ctr128.c b/src/lib/libcrypto/modes/ctr128.c
deleted file mode 100644
index 6d507dfc3a..0000000000
--- a/src/lib/libcrypto/modes/ctr128.c
+++ /dev/null
@@ -1,267 +0,0 @@
1/* $OpenBSD: ctr128.c,v 1.11 2023/07/08 14:56:54 beck Exp $ */
2/* ====================================================================
3 * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 *
50 */
51
52#include <openssl/crypto.h>
53#include "modes_local.h"
54#include <string.h>
55
56#ifndef MODES_DEBUG
57# ifndef NDEBUG
58# define NDEBUG
59# endif
60#endif
61#include <assert.h>
62
63/* NOTE: the IV/counter in CTR mode is big-endian. The code itself
64 * is endian-neutral. */
65
66/* increment counter (128-bit int) by 1 */
67static void
68ctr128_inc(unsigned char *counter)
69{
70 u32 n = 16;
71 u8 c;
72
73 do {
74 --n;
75 c = counter[n];
76 ++c;
77 counter[n] = c;
78 if (c)
79 return;
80 } while (n);
81}
82
83#if !defined(OPENSSL_SMALL_FOOTPRINT)
84static void
85ctr128_inc_aligned(unsigned char *counter)
86{
87#if BYTE_ORDER == LITTLE_ENDIAN
88 ctr128_inc(counter);
89#else
90 size_t *data, c, n;
91 data = (size_t *)counter;
92 n = 16 / sizeof(size_t);
93 do {
94 --n;
95 c = data[n];
96 ++c;
97 data[n] = c;
98 if (c)
99 return;
100 } while (n);
101#endif
102}
103#endif
104
105/* The input is encrypted as though 128-bit counter mode is being
106 * used. The extra state information that records how much of the
107 * 128-bit block we have used is contained in *num, and the
108 * encrypted counter is kept in ecount_buf. Both *num and
109 * ecount_buf must be initialised with zeros before the first
110 * call to CRYPTO_ctr128_encrypt().
111 *
112 * This algorithm assumes that the counter is in the x lower bits
113 * of the IV (ivec), and that the application has full control over
114 * overflow and the rest of the IV. This implementation takes NO
115 * responsibility for checking that the counter doesn't overflow
116 * into the rest of the IV when incremented.
117 */
118void
119CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out,
120 size_t len, const void *key,
121 unsigned char ivec[16], unsigned char ecount_buf[16],
122 unsigned int *num, block128_f block)
123{
124 unsigned int n;
125 size_t l = 0;
126
127 assert(*num < 16);
128
129 n = *num;
130
131#if !defined(OPENSSL_SMALL_FOOTPRINT)
132 if (16 % sizeof(size_t) == 0)
133 do { /* always true actually */
134 while (n && len) {
135 *(out++) = *(in++) ^ ecount_buf[n];
136 --len;
137 n = (n + 1) % 16;
138 }
139
140#ifdef __STRICT_ALIGNMENT
141 if (((size_t)in|(size_t)out|(size_t)ivec) %
142 sizeof(size_t) != 0)
143 break;
144#endif
145 while (len >= 16) {
146 (*block)(ivec, ecount_buf, key);
147 ctr128_inc_aligned(ivec);
148 for (; n < 16; n += sizeof(size_t))
149 *(size_t *)(out + n) =
150 *(size_t *)(in + n) ^ *(size_t *)(ecount_buf +
151 n);
152 len -= 16;
153 out += 16;
154 in += 16;
155 n = 0;
156 }
157 if (len) {
158 (*block)(ivec, ecount_buf, key);
159 ctr128_inc_aligned(ivec);
160 while (len--) {
161 out[n] = in[n] ^ ecount_buf[n];
162 ++n;
163 }
164 }
165 *num = n;
166 return;
167 } while (0);
168 /* the rest would be commonly eliminated by x86* compiler */
169#endif
170 while (l < len) {
171 if (n == 0) {
172 (*block)(ivec, ecount_buf, key);
173 ctr128_inc(ivec);
174 }
175 out[l] = in[l] ^ ecount_buf[n];
176 ++l;
177 n = (n + 1) % 16;
178 }
179
180 *num = n;
181}
182LCRYPTO_ALIAS(CRYPTO_ctr128_encrypt);
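/*
 * Minimal usage sketch (illustrative; not part of the original file): per the
 * comment above, both *num and ecount_buf must be zeroed before the first
 * call, and the caller owns the IV/counter layout.  AES is only an example
 * cipher, the (block128_f) cast is an assumption, and ctr_encrypt_example()
 * is a hypothetical helper.
 */
#include <openssl/aes.h>

static void
ctr_encrypt_example(const unsigned char key[16], unsigned char ivec[16],
    const unsigned char *in, unsigned char *out, size_t len)
{
	AES_KEY aes;
	unsigned char ecount_buf[16] = { 0 };	/* encrypted counter block */
	unsigned int num = 0;			/* bytes of ecount_buf already used */

	AES_set_encrypt_key(key, 128, &aes);
	CRYPTO_ctr128_encrypt(in, out, len, &aes, ivec, ecount_buf, &num,
	    (block128_f)AES_encrypt);
}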
183
184/* increment upper 96 bits of 128-bit counter by 1 */
185static void
186ctr96_inc(unsigned char *counter)
187{
188 u32 n = 12;
189 u8 c;
190
191 do {
192 --n;
193 c = counter[n];
194 ++c;
195 counter[n] = c;
196 if (c)
197 return;
198 } while (n);
199}
200
201void
202CRYPTO_ctr128_encrypt_ctr32(const unsigned char *in, unsigned char *out,
203 size_t len, const void *key,
204 unsigned char ivec[16], unsigned char ecount_buf[16],
205 unsigned int *num, ctr128_f func)
206{
207 unsigned int n, ctr32;
208
209 assert(*num < 16);
210
211 n = *num;
212
213 while (n && len) {
214 *(out++) = *(in++) ^ ecount_buf[n];
215 --len;
216 n = (n + 1) % 16;
217 }
218
219 ctr32 = GETU32(ivec + 12);
220 while (len >= 16) {
221 size_t blocks = len/16;
222 /*
223 * 1<<28 is just a not-so-small yet not-so-large number...
224		 * The condition below is practically never met, but it has to
225		 * be checked for code correctness.
226 */
227 if (sizeof(size_t) > sizeof(unsigned int) &&
228 blocks > (1U << 28))
229 blocks = (1U << 28);
230 /*
231		 * As (*func) operates on a 32-bit counter, the caller
232		 * has to handle overflow. The 'if' below detects the
233		 * overflow, which is then handled by limiting the
234		 * number of blocks to the exact overflow point...
235 */
236 ctr32 += (u32)blocks;
237 if (ctr32 < blocks) {
238 blocks -= ctr32;
239 ctr32 = 0;
240 }
241 (*func)(in, out, blocks, key, ivec);
242		/* (*func) does not update ivec; the caller does: */
243 PUTU32(ivec + 12, ctr32);
244 /* ... overflow was detected, propagate carry. */
245 if (ctr32 == 0)
246 ctr96_inc(ivec);
247 blocks *= 16;
248 len -= blocks;
249 out += blocks;
250 in += blocks;
251 }
252 if (len) {
253 memset(ecount_buf, 0, 16);
254 (*func)(ecount_buf, ecount_buf, 1, key, ivec);
255 ++ctr32;
256 PUTU32(ivec + 12, ctr32);
257 if (ctr32 == 0)
258 ctr96_inc(ivec);
259 while (len--) {
260 out[n] = in[n] ^ ecount_buf[n];
261 ++n;
262 }
263 }
264
265 *num = n;
266}
267LCRYPTO_ALIAS(CRYPTO_ctr128_encrypt_ctr32);
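/*
 * Worked example of the overflow clamp above (illustrative; not in the
 * original source): if the 32-bit counter word is 0xfffffff0 and 0x20 blocks
 * remain, ctr32 wraps to 0x10 < 0x20, so blocks is clamped to
 * 0x20 - 0x10 = 0x10 and ctr32 is reset to 0.  Those 0x10 blocks consume
 * counters 0xfffffff0..0xffffffff, PUTU32() stores the wrapped value and
 * ctr96_inc() propagates the carry into the upper 96 bits before the loop
 * continues with the remaining 0x10 blocks.
 */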
diff --git a/src/lib/libcrypto/modes/gcm128.c b/src/lib/libcrypto/modes/gcm128.c
deleted file mode 100644
index 6c89bd44b7..0000000000
--- a/src/lib/libcrypto/modes/gcm128.c
+++ /dev/null
@@ -1,1358 +0,0 @@
1/* $OpenBSD: gcm128.c,v 1.27 2024/09/06 09:57:32 tb Exp $ */
2/* ====================================================================
3 * Copyright (c) 2010 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 */
50
51#define OPENSSL_FIPSAPI
52
53#include <string.h>
54
55#include <openssl/crypto.h>
56
57#include "crypto_internal.h"
58#include "modes_local.h"
59
60#ifndef MODES_DEBUG
61# ifndef NDEBUG
62# define NDEBUG
63# endif
64#endif
65
66#if defined(BSWAP4) && defined(__STRICT_ALIGNMENT)
67/* redefine, because alignment is ensured */
68#undef GETU32
69#define GETU32(p) BSWAP4(*(const u32 *)(p))
70#endif
71
72#define PACK(s) ((size_t)(s)<<(sizeof(size_t)*8-16))
73#define REDUCE1BIT(V) \
74 do { \
75 if (sizeof(size_t)==8) { \
76 u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
77 V.lo = (V.hi<<63)|(V.lo>>1); \
78 V.hi = (V.hi>>1 )^T; \
79 } else { \
80 u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
81 V.lo = (V.hi<<63)|(V.lo>>1); \
82 V.hi = (V.hi>>1 )^((u64)T<<32); \
83 } \
84 } while(0)
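/*
 * Illustrative note (not in the original source): REDUCE1BIT computes
 * V := V * x in the bit-reflected GF(2^128) representation used by GHASH,
 * i.e. V = (V >> 1) ^ (lsb(V) ? 0xe1000000000000000000000000000000 : 0),
 * where the 0xe1... constant encodes the reduction polynomial
 * x^128 + x^7 + x^2 + x + 1.
 */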
85
86/*
87 * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
88 * never be set to 8. 8 is effectively reserved for testing purposes.
89 * TABLE_BITS>1 are lookup-table-driven implementations referred to as
90 * "Shoup's" in the GCM specification. In other words, OpenSSL does not
91 * cover the whole spectrum of possible table-driven implementations. Why?
92 * In the non-"Shoup's" case the memory access pattern is segmented in such
93 * a manner that it's trivial to see that cache timing information can
94 * reveal a fair portion of the intermediate hash value. Given that the
95 * ciphertext is always available to an attacker, they can attempt to
96 * deduce the secret parameter H and, if successful, tamper with messages
97 * [which is trivial in CTR mode]. In "Shoup's" case it's
98 * not as trivial, but there is no reason to believe that it's resistant
99 * to cache-timing attacks. And the thing about the "8-bit" implementation
100 * is that it consumes 16 (sixteen) times more memory, 4KB per individual
101 * key + 1KB shared. Well, on the pro side it should be twice as fast as
102 * the "4-bit" version. And for gcc-generated x86[_64] code, the "8-bit"
103 * version was observed to run ~75% faster, closer to 100% for commercial
104 * compilers... Yet the "4-bit" procedure is preferred, because it's
105 * believed to provide better security-performance balance and adequate
106 * all-round performance. "All-round" refers to things like:
107 *
108 * - shorter setup time effectively improves overall timing for
109 * handling short messages;
110 * - larger table allocation can become unbearable because of VM
111 *   subsystem penalties (for example on Windows, freeing a large enough
112 *   allocation results in VM working-set trimming, meaning that a
113 *   subsequent malloc would immediately incur working-set expansion);
114 * - larger table has larger cache footprint, which can affect
115 * performance of other code paths (not necessarily even from same
116 * thread in Hyper-Threading world);
117 *
118 * A value of 1 is not appropriate for performance reasons.
119 */
120#if TABLE_BITS==8
121
122static void
123gcm_init_8bit(u128 Htable[256], u64 H[2])
124{
125 int i, j;
126 u128 V;
127
128 Htable[0].hi = 0;
129 Htable[0].lo = 0;
130 V.hi = H[0];
131 V.lo = H[1];
132
133 for (Htable[128] = V, i = 64; i > 0; i >>= 1) {
134 REDUCE1BIT(V);
135 Htable[i] = V;
136 }
137
138 for (i = 2; i < 256; i <<= 1) {
139 u128 *Hi = Htable + i, H0 = *Hi;
140 for (j = 1; j < i; ++j) {
141 Hi[j].hi = H0.hi ^ Htable[j].hi;
142 Hi[j].lo = H0.lo ^ Htable[j].lo;
143 }
144 }
145}
146
147static void
148gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
149{
150 u128 Z = { 0, 0};
151 const u8 *xi = (const u8 *)Xi + 15;
152 size_t rem, n = *xi;
153 static const size_t rem_8bit[256] = {
154 PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
155 PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
156 PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
157 PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
158 PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
159 PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
160 PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
161 PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
162 PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
163 PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
164 PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
165 PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
166 PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
167 PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
168 PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
169 PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
170 PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
171 PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
172 PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
173 PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
174 PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
175 PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
176 PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
177 PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
178 PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
179 PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
180 PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
181 PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
182 PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
183 PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
184 PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
185 PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
186 PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
187 PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
188 PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
189 PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
190 PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
191 PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
192 PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
193 PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
194 PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
195 PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
196 PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
197 PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
198 PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
199 PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
200 PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
201 PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
202 PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
203 PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
204 PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
205 PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
206 PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
207 PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
208 PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
209 PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
210 PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
211 PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
212 PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
213 PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
214 PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
215 PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
216 PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
217 PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
218
219 while (1) {
220 Z.hi ^= Htable[n].hi;
221 Z.lo ^= Htable[n].lo;
222
223 if ((u8 *)Xi == xi)
224 break;
225
226 n = *(--xi);
227
228 rem = (size_t)Z.lo & 0xff;
229 Z.lo = (Z.hi << 56)|(Z.lo >> 8);
230 Z.hi = (Z.hi >> 8);
231#if SIZE_MAX == 0xffffffffffffffff
232 Z.hi ^= rem_8bit[rem];
233#else
234 Z.hi ^= (u64)rem_8bit[rem] << 32;
235#endif
236 }
237
238 Xi[0] = htobe64(Z.hi);
239 Xi[1] = htobe64(Z.lo);
240}
241#define GCM_MUL(ctx,Xi) gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
242
243#elif TABLE_BITS==4
244
245static void
246gcm_init_4bit(u128 Htable[16], u64 H[2])
247{
248 u128 V;
249#if defined(OPENSSL_SMALL_FOOTPRINT)
250 int i;
251#endif
252
253 Htable[0].hi = 0;
254 Htable[0].lo = 0;
255 V.hi = H[0];
256 V.lo = H[1];
257
258#if defined(OPENSSL_SMALL_FOOTPRINT)
259 for (Htable[8] = V, i = 4; i > 0; i >>= 1) {
260 REDUCE1BIT(V);
261 Htable[i] = V;
262 }
263
264 for (i = 2; i < 16; i <<= 1) {
265 u128 *Hi = Htable + i;
266 int j;
267 for (V = *Hi, j = 1; j < i; ++j) {
268 Hi[j].hi = V.hi ^ Htable[j].hi;
269 Hi[j].lo = V.lo ^ Htable[j].lo;
270 }
271 }
272#else
273 Htable[8] = V;
274 REDUCE1BIT(V);
275 Htable[4] = V;
276 REDUCE1BIT(V);
277 Htable[2] = V;
278 REDUCE1BIT(V);
279 Htable[1] = V;
280 Htable[3].hi = V.hi ^ Htable[2].hi, Htable[3].lo = V.lo ^ Htable[2].lo;
281 V = Htable[4];
282 Htable[5].hi = V.hi ^ Htable[1].hi, Htable[5].lo = V.lo ^ Htable[1].lo;
283 Htable[6].hi = V.hi ^ Htable[2].hi, Htable[6].lo = V.lo ^ Htable[2].lo;
284 Htable[7].hi = V.hi ^ Htable[3].hi, Htable[7].lo = V.lo ^ Htable[3].lo;
285 V = Htable[8];
286 Htable[9].hi = V.hi ^ Htable[1].hi, Htable[9].lo = V.lo ^ Htable[1].lo;
287 Htable[10].hi = V.hi ^ Htable[2].hi,
288 Htable[10].lo = V.lo ^ Htable[2].lo;
289 Htable[11].hi = V.hi ^ Htable[3].hi,
290 Htable[11].lo = V.lo ^ Htable[3].lo;
291 Htable[12].hi = V.hi ^ Htable[4].hi,
292 Htable[12].lo = V.lo ^ Htable[4].lo;
293 Htable[13].hi = V.hi ^ Htable[5].hi,
294 Htable[13].lo = V.lo ^ Htable[5].lo;
295 Htable[14].hi = V.hi ^ Htable[6].hi,
296 Htable[14].lo = V.lo ^ Htable[6].lo;
297 Htable[15].hi = V.hi ^ Htable[7].hi,
298 Htable[15].lo = V.lo ^ Htable[7].lo;
299#endif
300#if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
301 /*
302 * ARM assembler expects specific dword order in Htable.
303 */
304 {
305 int j;
306#if BYTE_ORDER == LITTLE_ENDIAN
307 for (j = 0; j < 16; ++j) {
308 V = Htable[j];
309 Htable[j].hi = V.lo;
310 Htable[j].lo = V.hi;
311 }
312#else /* BIG_ENDIAN */
313 for (j = 0; j < 16; ++j) {
314 V = Htable[j];
315 Htable[j].hi = V.lo << 32|V.lo >> 32;
316 Htable[j].lo = V.hi << 32|V.hi >> 32;
317 }
318#endif
319 }
320#endif
321}
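/*
 * Illustrative note (not in the original source): after this setup,
 * Htable[n] for a nibble n with bits b3 b2 b1 b0 (b3 being the bit of
 * value 8) holds b3*H ^ b2*(H*x) ^ b1*(H*x^2) ^ b0*(H*x^3) in GF(2^128),
 * which is what lets gcm_gmult_4bit() below process Xi four bits at a
 * time with one table lookup per nibble.
 */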
322
323#ifndef GHASH_ASM
324static const size_t rem_4bit[16] = {
325 PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
326 PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
327 PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
328 PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
329
330static void
331gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
332{
333 u128 Z;
334 int cnt = 15;
335 size_t rem, nlo, nhi;
336
337 nlo = ((const u8 *)Xi)[15];
338 nhi = nlo >> 4;
339 nlo &= 0xf;
340
341 Z.hi = Htable[nlo].hi;
342 Z.lo = Htable[nlo].lo;
343
344 while (1) {
345 rem = (size_t)Z.lo & 0xf;
346 Z.lo = (Z.hi << 60)|(Z.lo >> 4);
347 Z.hi = (Z.hi >> 4);
348#if SIZE_MAX == 0xffffffffffffffff
349 Z.hi ^= rem_4bit[rem];
350#else
351 Z.hi ^= (u64)rem_4bit[rem] << 32;
352#endif
353 Z.hi ^= Htable[nhi].hi;
354 Z.lo ^= Htable[nhi].lo;
355
356 if (--cnt < 0)
357 break;
358
359 nlo = ((const u8 *)Xi)[cnt];
360 nhi = nlo >> 4;
361 nlo &= 0xf;
362
363 rem = (size_t)Z.lo & 0xf;
364 Z.lo = (Z.hi << 60)|(Z.lo >> 4);
365 Z.hi = (Z.hi >> 4);
366#if SIZE_MAX == 0xffffffffffffffff
367 Z.hi ^= rem_4bit[rem];
368#else
369 Z.hi ^= (u64)rem_4bit[rem] << 32;
370#endif
371 Z.hi ^= Htable[nlo].hi;
372 Z.lo ^= Htable[nlo].lo;
373 }
374
375 Xi[0] = htobe64(Z.hi);
376 Xi[1] = htobe64(Z.lo);
377}
378
379#if !defined(OPENSSL_SMALL_FOOTPRINT)
380/*
381 * Streamed variant of gcm_gmult_4bit; see CRYPTO_gcm128_[en|de]crypt for
382 * details... Compiler-generated code doesn't seem to give any
383 * performance improvement, at least not on x86[_64]. It's here
384 * mostly as reference and a placeholder for possible future
385 * non-trivial optimization[s]...
386 */
387static void
388gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
389 const u8 *inp, size_t len)
390{
391 u128 Z;
392 int cnt;
393 size_t rem, nlo, nhi;
394
395#if 1
396 do {
397 cnt = 15;
398 nlo = ((const u8 *)Xi)[15];
399 nlo ^= inp[15];
400 nhi = nlo >> 4;
401 nlo &= 0xf;
402
403 Z.hi = Htable[nlo].hi;
404 Z.lo = Htable[nlo].lo;
405
406 while (1) {
407 rem = (size_t)Z.lo & 0xf;
408 Z.lo = (Z.hi << 60)|(Z.lo >> 4);
409 Z.hi = (Z.hi >> 4);
410#if SIZE_MAX == 0xffffffffffffffff
411 Z.hi ^= rem_4bit[rem];
412#else
413 Z.hi ^= (u64)rem_4bit[rem] << 32;
414#endif
415 Z.hi ^= Htable[nhi].hi;
416 Z.lo ^= Htable[nhi].lo;
417
418 if (--cnt < 0)
419 break;
420
421 nlo = ((const u8 *)Xi)[cnt];
422 nlo ^= inp[cnt];
423 nhi = nlo >> 4;
424 nlo &= 0xf;
425
426 rem = (size_t)Z.lo & 0xf;
427 Z.lo = (Z.hi << 60)|(Z.lo >> 4);
428 Z.hi = (Z.hi >> 4);
429#if SIZE_MAX == 0xffffffffffffffff
430 Z.hi ^= rem_4bit[rem];
431#else
432 Z.hi ^= (u64)rem_4bit[rem] << 32;
433#endif
434 Z.hi ^= Htable[nlo].hi;
435 Z.lo ^= Htable[nlo].lo;
436 }
437#else
438 /*
439 * Extra 256+16 bytes per-key plus 512 bytes shared tables
440 * [should] give ~50% improvement... One could have PACK()-ed
441 * the rem_8bit even here, but the priority is to minimize
442 * cache footprint...
443 */
444 u128 Hshr4[16]; /* Htable shifted right by 4 bits */
445 u8 Hshl4[16]; /* Htable shifted left by 4 bits */
446 static const unsigned short rem_8bit[256] = {
447 0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
448 0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
449 0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
450 0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
451 0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
452 0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
453 0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
454 0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
455 0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
456 0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
457 0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
458 0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
459 0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
460 0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
461 0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
462 0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
463 0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
464 0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
465 0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
466 0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
467 0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
468 0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
469 0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
470 0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
471 0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
472 0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
473 0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
474 0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
475 0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
476 0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
477 0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
478 0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
479 /*
480	 * This pre-processing phase slows down the procedure by approximately
481	 * as much time as it saves by making each loop spin faster. In other
482	 * words, single-block performance is approximately the same as the
483	 * straightforward "4-bit" implementation, and from there it only goes faster...
484 */
485 for (cnt = 0; cnt < 16; ++cnt) {
486 Z.hi = Htable[cnt].hi;
487 Z.lo = Htable[cnt].lo;
488 Hshr4[cnt].lo = (Z.hi << 60)|(Z.lo >> 4);
489 Hshr4[cnt].hi = (Z.hi >> 4);
490 Hshl4[cnt] = (u8)(Z.lo << 4);
491 }
492
493 do {
494 for (Z.lo = 0, Z.hi = 0, cnt = 15; cnt; --cnt) {
495 nlo = ((const u8 *)Xi)[cnt];
496 nlo ^= inp[cnt];
497 nhi = nlo >> 4;
498 nlo &= 0xf;
499
500 Z.hi ^= Htable[nlo].hi;
501 Z.lo ^= Htable[nlo].lo;
502
503 rem = (size_t)Z.lo & 0xff;
504
505 Z.lo = (Z.hi << 56)|(Z.lo >> 8);
506 Z.hi = (Z.hi >> 8);
507
508 Z.hi ^= Hshr4[nhi].hi;
509 Z.lo ^= Hshr4[nhi].lo;
510 Z.hi ^= (u64)rem_8bit[rem ^ Hshl4[nhi]] << 48;
511 }
512
513 nlo = ((const u8 *)Xi)[0];
514 nlo ^= inp[0];
515 nhi = nlo >> 4;
516 nlo &= 0xf;
517
518 Z.hi ^= Htable[nlo].hi;
519 Z.lo ^= Htable[nlo].lo;
520
521 rem = (size_t)Z.lo & 0xf;
522
523 Z.lo = (Z.hi << 60)|(Z.lo >> 4);
524 Z.hi = (Z.hi >> 4);
525
526 Z.hi ^= Htable[nhi].hi;
527 Z.lo ^= Htable[nhi].lo;
528 Z.hi ^= ((u64)rem_8bit[rem << 4]) << 48;
529#endif
530
531 Xi[0] = htobe64(Z.hi);
532 Xi[1] = htobe64(Z.lo);
533 } while (inp += 16, len -= 16);
534}
535#endif
536#else
537void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]);
538void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16], const u8 *inp,
539 size_t len);
540#endif
541
542#define GCM_MUL(ctx,Xi) gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
543#if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
544#define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
545/* GHASH_CHUNK is a "stride parameter" meant to mitigate the cache
546 * thrashing effect. In other words, the idea is to hash data while it's
547 * still in L1 cache after the encryption pass... */
548#define GHASH_CHUNK (3*1024)
549#endif
550
551#else /* TABLE_BITS */
552
553static void
554gcm_gmult_1bit(u64 Xi[2], const u64 H[2])
555{
556 u128 V, Z = { 0,0 };
557 long X;
558 int i, j;
559 const long *xi = (const long *)Xi;
560
561 V.hi = H[0]; /* H is in host byte order, no byte swapping */
562 V.lo = H[1];
563
564 for (j = 0; j < 16/sizeof(long); ++j) {
565#if BYTE_ORDER == LITTLE_ENDIAN
566#if SIZE_MAX == 0xffffffffffffffff
567#ifdef BSWAP8
568 X = (long)(BSWAP8(xi[j]));
569#else
570 const u8 *p = (const u8 *)(xi + j);
571 X = (long)((u64)GETU32(p) << 32|GETU32(p + 4));
572#endif
573#else
574 const u8 *p = (const u8 *)(xi + j);
575 X = (long)GETU32(p);
576#endif
577#else /* BIG_ENDIAN */
578 X = xi[j];
579#endif
580
581 for (i = 0; i < 8*sizeof(long); ++i, X <<= 1) {
582 u64 M = (u64)(X >> (8*sizeof(long) - 1));
583 Z.hi ^= V.hi & M;
584 Z.lo ^= V.lo & M;
585
586 REDUCE1BIT(V);
587 }
588 }
589
590 Xi[0] = htobe64(Z.hi);
591 Xi[1] = htobe64(Z.lo);
592}
593#define GCM_MUL(ctx,Xi) gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
594
595#endif
596
597#if defined(GHASH_ASM) && \
598 (defined(__i386) || defined(__i386__) || \
599 defined(__x86_64) || defined(__x86_64__) || \
600 defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
601#include "x86_arch.h"
602#endif
603
604#if TABLE_BITS==4 && defined(GHASH_ASM)
605# if (defined(__i386) || defined(__i386__) || \
606 defined(__x86_64) || defined(__x86_64__) || \
607 defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
608# define GHASH_ASM_X86_OR_64
609# define GCM_FUNCREF_4BIT
610
611void gcm_init_clmul(u128 Htable[16], const u64 Xi[2]);
612void gcm_gmult_clmul(u64 Xi[2], const u128 Htable[16]);
613void gcm_ghash_clmul(u64 Xi[2], const u128 Htable[16], const u8 *inp,
614 size_t len);
615
616# if defined(__i386) || defined(__i386__) || defined(_M_IX86)
617# define GHASH_ASM_X86
618void gcm_gmult_4bit_mmx(u64 Xi[2], const u128 Htable[16]);
619void gcm_ghash_4bit_mmx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
620 size_t len);
621
622void gcm_gmult_4bit_x86(u64 Xi[2], const u128 Htable[16]);
623void gcm_ghash_4bit_x86(u64 Xi[2], const u128 Htable[16], const u8 *inp,
624 size_t len);
625# endif
626# elif defined(__arm__) || defined(__arm)
627# include "arm_arch.h"
628# if __ARM_ARCH__>=7 && !defined(__STRICT_ALIGNMENT)
629# define GHASH_ASM_ARM
630# define GCM_FUNCREF_4BIT
631void gcm_gmult_neon(u64 Xi[2], const u128 Htable[16]);
632void gcm_ghash_neon(u64 Xi[2], const u128 Htable[16], const u8 *inp,
633 size_t len);
634# endif
635# endif
636#endif
637
638#ifdef GCM_FUNCREF_4BIT
639# undef GCM_MUL
640# define GCM_MUL(ctx,Xi) (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
641# ifdef GHASH
642# undef GHASH
643# define GHASH(ctx,in,len) (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
644# endif
645#endif
646
647void
648CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, void *key, block128_f block)
649{
650 memset(ctx, 0, sizeof(*ctx));
651 ctx->block = block;
652 ctx->key = key;
653
654 (*block)(ctx->H.c, ctx->H.c, key);
655
656 /* H is stored in host byte order */
657 ctx->H.u[0] = be64toh(ctx->H.u[0]);
658 ctx->H.u[1] = be64toh(ctx->H.u[1]);
659
660#if TABLE_BITS==8
661 gcm_init_8bit(ctx->Htable, ctx->H.u);
662#elif TABLE_BITS==4
663# if defined(GHASH_ASM_X86_OR_64)
664# if !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
665 /* check FXSR and PCLMULQDQ bits */
666 if ((crypto_cpu_caps_ia32() & (CPUCAP_MASK_FXSR | CPUCAP_MASK_PCLMUL)) ==
667 (CPUCAP_MASK_FXSR | CPUCAP_MASK_PCLMUL)) {
668 gcm_init_clmul(ctx->Htable, ctx->H.u);
669 ctx->gmult = gcm_gmult_clmul;
670 ctx->ghash = gcm_ghash_clmul;
671 return;
672 }
673# endif
674 gcm_init_4bit(ctx->Htable, ctx->H.u);
675# if defined(GHASH_ASM_X86) /* x86 only */
676# if defined(OPENSSL_IA32_SSE2)
677 if (crypto_cpu_caps_ia32() & CPUCAP_MASK_SSE) { /* check SSE bit */
678# else
679 if (crypto_cpu_caps_ia32() & CPUCAP_MASK_MMX) { /* check MMX bit */
680# endif
681 ctx->gmult = gcm_gmult_4bit_mmx;
682 ctx->ghash = gcm_ghash_4bit_mmx;
683 } else {
684 ctx->gmult = gcm_gmult_4bit_x86;
685 ctx->ghash = gcm_ghash_4bit_x86;
686 }
687# else
688 ctx->gmult = gcm_gmult_4bit;
689 ctx->ghash = gcm_ghash_4bit;
690# endif
691# elif defined(GHASH_ASM_ARM)
692 if (OPENSSL_armcap_P & ARMV7_NEON) {
693 ctx->gmult = gcm_gmult_neon;
694 ctx->ghash = gcm_ghash_neon;
695 } else {
696 gcm_init_4bit(ctx->Htable, ctx->H.u);
697 ctx->gmult = gcm_gmult_4bit;
698 ctx->ghash = gcm_ghash_4bit;
699 }
700# else
701 gcm_init_4bit(ctx->Htable, ctx->H.u);
702# endif
703#endif
704}
705LCRYPTO_ALIAS(CRYPTO_gcm128_init);
706
707void
708CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const unsigned char *iv, size_t len)
709{
710 unsigned int ctr;
711#ifdef GCM_FUNCREF_4BIT
712 void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
713#endif
714
715 ctx->Yi.u[0] = 0;
716 ctx->Yi.u[1] = 0;
717 ctx->Xi.u[0] = 0;
718 ctx->Xi.u[1] = 0;
719 ctx->len.u[0] = 0; /* AAD length */
720 ctx->len.u[1] = 0; /* message length */
721 ctx->ares = 0;
722 ctx->mres = 0;
723
724 if (len == 12) {
725 memcpy(ctx->Yi.c, iv, 12);
726 ctx->Yi.c[15] = 1;
727 ctr = 1;
728 } else {
729 size_t i;
730 u64 len0 = len;
731
732 while (len >= 16) {
733 for (i = 0; i < 16; ++i)
734 ctx->Yi.c[i] ^= iv[i];
735 GCM_MUL(ctx, Yi);
736 iv += 16;
737 len -= 16;
738 }
739 if (len) {
740 for (i = 0; i < len; ++i)
741 ctx->Yi.c[i] ^= iv[i];
742 GCM_MUL(ctx, Yi);
743 }
744 len0 <<= 3;
745 ctx->Yi.u[1] ^= htobe64(len0);
746
747 GCM_MUL(ctx, Yi);
748
749 ctr = be32toh(ctx->Yi.d[3]);
750 }
751
752 (*ctx->block)(ctx->Yi.c, ctx->EK0.c, ctx->key);
753 ++ctr;
754 ctx->Yi.d[3] = htobe32(ctr);
755}
756LCRYPTO_ALIAS(CRYPTO_gcm128_setiv);
757
758int
759CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const unsigned char *aad, size_t len)
760{
761 size_t i;
762 unsigned int n;
763 u64 alen = ctx->len.u[0];
764#ifdef GCM_FUNCREF_4BIT
765 void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
766# ifdef GHASH
767 void (*gcm_ghash_p)(u64 Xi[2], const u128 Htable[16],
768 const u8 *inp, size_t len) = ctx->ghash;
769# endif
770#endif
771
772 if (ctx->len.u[1])
773 return -2;
774
775 alen += len;
776 if (alen > (U64(1) << 61) || (sizeof(len) == 8 && alen < len))
777 return -1;
778 ctx->len.u[0] = alen;
779
780 n = ctx->ares;
781 if (n) {
782 while (n && len) {
783 ctx->Xi.c[n] ^= *(aad++);
784 --len;
785 n = (n + 1) % 16;
786 }
787 if (n == 0)
788 GCM_MUL(ctx, Xi);
789 else {
790 ctx->ares = n;
791 return 0;
792 }
793 }
794
795#ifdef GHASH
796 if ((i = (len & (size_t)-16))) {
797 GHASH(ctx, aad, i);
798 aad += i;
799 len -= i;
800 }
801#else
802 while (len >= 16) {
803 for (i = 0; i < 16; ++i)
804 ctx->Xi.c[i] ^= aad[i];
805 GCM_MUL(ctx, Xi);
806 aad += 16;
807 len -= 16;
808 }
809#endif
810 if (len) {
811 n = (unsigned int)len;
812 for (i = 0; i < len; ++i)
813 ctx->Xi.c[i] ^= aad[i];
814 }
815
816 ctx->ares = n;
817 return 0;
818}
819LCRYPTO_ALIAS(CRYPTO_gcm128_aad);
820
821int
822CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
823 const unsigned char *in, unsigned char *out,
824 size_t len)
825{
826 unsigned int n, ctr;
827 size_t i;
828 u64 mlen = ctx->len.u[1];
829 block128_f block = ctx->block;
830 void *key = ctx->key;
831#ifdef GCM_FUNCREF_4BIT
832 void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
833# ifdef GHASH
834 void (*gcm_ghash_p)(u64 Xi[2], const u128 Htable[16],
835 const u8 *inp, size_t len) = ctx->ghash;
836# endif
837#endif
838
839 mlen += len;
840 if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
841 return -1;
842 ctx->len.u[1] = mlen;
843
844 if (ctx->ares) {
845 /* First call to encrypt finalizes GHASH(AAD) */
846 GCM_MUL(ctx, Xi);
847 ctx->ares = 0;
848 }
849
850 ctr = be32toh(ctx->Yi.d[3]);
851
852 n = ctx->mres;
853#if !defined(OPENSSL_SMALL_FOOTPRINT)
854 if (16 % sizeof(size_t) == 0)
855 do { /* always true actually */
856 if (n) {
857 while (n && len) {
858 ctx->Xi.c[n] ^= *(out++) = *(in++) ^
859 ctx->EKi.c[n];
860 --len;
861 n = (n + 1) % 16;
862 }
863 if (n == 0)
864 GCM_MUL(ctx, Xi);
865 else {
866 ctx->mres = n;
867 return 0;
868 }
869 }
870#ifdef __STRICT_ALIGNMENT
871 if (((size_t)in|(size_t)out) % sizeof(size_t) != 0)
872 break;
873#endif
874#if defined(GHASH) && defined(GHASH_CHUNK)
875 while (len >= GHASH_CHUNK) {
876 size_t j = GHASH_CHUNK;
877
878 while (j) {
879 size_t *out_t = (size_t *)out;
880 const size_t *in_t = (const size_t *)in;
881
882 (*block)(ctx->Yi.c, ctx->EKi.c, key);
883 ++ctr;
884 ctx->Yi.d[3] = htobe32(ctr);
885
886 for (i = 0; i < 16/sizeof(size_t); ++i)
887 out_t[i] = in_t[i] ^
888 ctx->EKi.t[i];
889 out += 16;
890 in += 16;
891 j -= 16;
892 }
893 GHASH(ctx, out - GHASH_CHUNK, GHASH_CHUNK);
894 len -= GHASH_CHUNK;
895 }
896 if ((i = (len & (size_t)-16))) {
897 size_t j = i;
898
899 while (len >= 16) {
900 size_t *out_t = (size_t *)out;
901 const size_t *in_t = (const size_t *)in;
902
903 (*block)(ctx->Yi.c, ctx->EKi.c, key);
904 ++ctr;
905 ctx->Yi.d[3] = htobe32(ctr);
906
907 for (i = 0; i < 16/sizeof(size_t); ++i)
908 out_t[i] = in_t[i] ^
909 ctx->EKi.t[i];
910 out += 16;
911 in += 16;
912 len -= 16;
913 }
914 GHASH(ctx, out - j, j);
915 }
916#else
917 while (len >= 16) {
918 size_t *out_t = (size_t *)out;
919 const size_t *in_t = (const size_t *)in;
920
921 (*block)(ctx->Yi.c, ctx->EKi.c, key);
922 ++ctr;
923 ctx->Yi.d[3] = htobe32(ctr);
924
925 for (i = 0; i < 16/sizeof(size_t); ++i)
926 ctx->Xi.t[i] ^=
927 out_t[i] = in_t[i] ^ ctx->EKi.t[i];
928 GCM_MUL(ctx, Xi);
929 out += 16;
930 in += 16;
931 len -= 16;
932 }
933#endif
934 if (len) {
935 (*block)(ctx->Yi.c, ctx->EKi.c, key);
936 ++ctr;
937 ctx->Yi.d[3] = htobe32(ctr);
938
939 while (len--) {
940 ctx->Xi.c[n] ^= out[n] = in[n] ^
941 ctx->EKi.c[n];
942 ++n;
943 }
944 }
945
946 ctx->mres = n;
947 return 0;
948 } while (0);
949#endif
950 for (i = 0; i < len; ++i) {
951 if (n == 0) {
952 (*block)(ctx->Yi.c, ctx->EKi.c, key);
953 ++ctr;
954 ctx->Yi.d[3] = htobe32(ctr);
955 }
956 ctx->Xi.c[n] ^= out[i] = in[i] ^ ctx->EKi.c[n];
957 n = (n + 1) % 16;
958 if (n == 0)
959 GCM_MUL(ctx, Xi);
960 }
961
962 ctx->mres = n;
963 return 0;
964}
965LCRYPTO_ALIAS(CRYPTO_gcm128_encrypt);
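For reference, the (U64(1) << 36) - 32 bound checked at the top of CRYPTO_gcm128_encrypt above (and of the decrypt and ctr32 variants below) is the NIST SP 800-38D per-invocation plaintext limit expressed in bytes: at most 2^39 - 256 bits, and (2^39 - 256) / 8 = 2^36 - 32 bytes. Likewise, the 2^61-byte bound in CRYPTO_gcm128_aad corresponds to the specification's 2^64-bit limit on additional authenticated data.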
966
967int
968CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
969 const unsigned char *in, unsigned char *out,
970 size_t len)
971{
972 unsigned int n, ctr;
973 size_t i;
974 u64 mlen = ctx->len.u[1];
975 block128_f block = ctx->block;
976 void *key = ctx->key;
977#ifdef GCM_FUNCREF_4BIT
978 void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
979# ifdef GHASH
980 void (*gcm_ghash_p)(u64 Xi[2], const u128 Htable[16],
981 const u8 *inp, size_t len) = ctx->ghash;
982# endif
983#endif
984
985 mlen += len;
986 if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
987 return -1;
988 ctx->len.u[1] = mlen;
989
990 if (ctx->ares) {
991 /* First call to decrypt finalizes GHASH(AAD) */
992 GCM_MUL(ctx, Xi);
993 ctx->ares = 0;
994 }
995
996 ctr = be32toh(ctx->Yi.d[3]);
997
998 n = ctx->mres;
999#if !defined(OPENSSL_SMALL_FOOTPRINT)
1000 if (16 % sizeof(size_t) == 0)
1001 do { /* always true actually */
1002 if (n) {
1003 while (n && len) {
1004 u8 c = *(in++);
1005 *(out++) = c ^ ctx->EKi.c[n];
1006 ctx->Xi.c[n] ^= c;
1007 --len;
1008 n = (n + 1) % 16;
1009 }
1010 if (n == 0)
1011 GCM_MUL(ctx, Xi);
1012 else {
1013 ctx->mres = n;
1014 return 0;
1015 }
1016 }
1017#ifdef __STRICT_ALIGNMENT
1018 if (((size_t)in|(size_t)out) % sizeof(size_t) != 0)
1019 break;
1020#endif
1021#if defined(GHASH) && defined(GHASH_CHUNK)
1022 while (len >= GHASH_CHUNK) {
1023 size_t j = GHASH_CHUNK;
1024
1025 GHASH(ctx, in, GHASH_CHUNK);
1026 while (j) {
1027 size_t *out_t = (size_t *)out;
1028 const size_t *in_t = (const size_t *)in;
1029
1030 (*block)(ctx->Yi.c, ctx->EKi.c, key);
1031 ++ctr;
1032 ctx->Yi.d[3] = htobe32(ctr);
1033
1034 for (i = 0; i < 16/sizeof(size_t); ++i)
1035 out_t[i] = in_t[i] ^
1036 ctx->EKi.t[i];
1037 out += 16;
1038 in += 16;
1039 j -= 16;
1040 }
1041 len -= GHASH_CHUNK;
1042 }
1043 if ((i = (len & (size_t)-16))) {
1044 GHASH(ctx, in, i);
1045 while (len >= 16) {
1046 size_t *out_t = (size_t *)out;
1047 const size_t *in_t = (const size_t *)in;
1048
1049 (*block)(ctx->Yi.c, ctx->EKi.c, key);
1050 ++ctr;
1051 ctx->Yi.d[3] = htobe32(ctr);
1052
1053 for (i = 0; i < 16/sizeof(size_t); ++i)
1054 out_t[i] = in_t[i] ^
1055 ctx->EKi.t[i];
1056 out += 16;
1057 in += 16;
1058 len -= 16;
1059 }
1060 }
1061#else
1062 while (len >= 16) {
1063 size_t *out_t = (size_t *)out;
1064 const size_t *in_t = (const size_t *)in;
1065
1066 (*block)(ctx->Yi.c, ctx->EKi.c, key);
1067 ++ctr;
1068 ctx->Yi.d[3] = htobe32(ctr);
1069
1070 for (i = 0; i < 16/sizeof(size_t); ++i) {
1071 size_t c = in[i];
1072 out[i] = c ^ ctx->EKi.t[i];
1073 ctx->Xi.t[i] ^= c;
1074 }
1075 GCM_MUL(ctx, Xi);
1076 out += 16;
1077 in += 16;
1078 len -= 16;
1079 }
1080#endif
1081 if (len) {
1082 (*block)(ctx->Yi.c, ctx->EKi.c, key);
1083 ++ctr;
1084 ctx->Yi.d[3] = htobe32(ctr);
1085
1086 while (len--) {
1087 u8 c = in[n];
1088 ctx->Xi.c[n] ^= c;
1089 out[n] = c ^ ctx->EKi.c[n];
1090 ++n;
1091 }
1092 }
1093
1094 ctx->mres = n;
1095 return 0;
1096 } while (0);
1097#endif
1098 for (i = 0; i < len; ++i) {
1099 u8 c;
1100 if (n == 0) {
1101 (*block)(ctx->Yi.c, ctx->EKi.c, key);
1102 ++ctr;
1103 ctx->Yi.d[3] = htobe32(ctr);
1104 }
1105 c = in[i];
1106 out[i] = c ^ ctx->EKi.c[n];
1107 ctx->Xi.c[n] ^= c;
1108 n = (n + 1) % 16;
1109 if (n == 0)
1110 GCM_MUL(ctx, Xi);
1111 }
1112
1113 ctx->mres = n;
1114 return 0;
1115}
1116LCRYPTO_ALIAS(CRYPTO_gcm128_decrypt);
1117
1118int
1119CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
1120 const unsigned char *in, unsigned char *out,
1121 size_t len, ctr128_f stream)
1122{
1123 unsigned int n, ctr;
1124 size_t i;
1125 u64 mlen = ctx->len.u[1];
1126 void *key = ctx->key;
1127#ifdef GCM_FUNCREF_4BIT
1128 void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
1129# ifdef GHASH
1130 void (*gcm_ghash_p)(u64 Xi[2], const u128 Htable[16],
1131 const u8 *inp, size_t len) = ctx->ghash;
1132# endif
1133#endif
1134
1135 mlen += len;
1136 if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
1137 return -1;
1138 ctx->len.u[1] = mlen;
1139
1140 if (ctx->ares) {
1141 /* First call to encrypt finalizes GHASH(AAD) */
1142 GCM_MUL(ctx, Xi);
1143 ctx->ares = 0;
1144 }
1145
1146 ctr = be32toh(ctx->Yi.d[3]);
1147
1148 n = ctx->mres;
1149 if (n) {
1150 while (n && len) {
1151 ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
1152 --len;
1153 n = (n + 1) % 16;
1154 }
1155 if (n == 0)
1156 GCM_MUL(ctx, Xi);
1157 else {
1158 ctx->mres = n;
1159 return 0;
1160 }
1161 }
1162#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1163 while (len >= GHASH_CHUNK) {
1164 (*stream)(in, out, GHASH_CHUNK/16, key, ctx->Yi.c);
1165 ctr += GHASH_CHUNK/16;
1166 ctx->Yi.d[3] = htobe32(ctr);
1167 GHASH(ctx, out, GHASH_CHUNK);
1168 out += GHASH_CHUNK;
1169 in += GHASH_CHUNK;
1170 len -= GHASH_CHUNK;
1171 }
1172#endif
1173 if ((i = (len & (size_t)-16))) {
1174 size_t j = i/16;
1175
1176 (*stream)(in, out, j, key, ctx->Yi.c);
1177 ctr += (unsigned int)j;
1178 ctx->Yi.d[3] = htobe32(ctr);
1179 in += i;
1180 len -= i;
1181#if defined(GHASH)
1182 GHASH(ctx, out, i);
1183 out += i;
1184#else
1185 while (j--) {
1186 for (i = 0; i < 16; ++i)
1187 ctx->Xi.c[i] ^= out[i];
1188 GCM_MUL(ctx, Xi);
1189 out += 16;
1190 }
1191#endif
1192 }
1193 if (len) {
1194 (*ctx->block)(ctx->Yi.c, ctx->EKi.c, key);
1195 ++ctr;
1196 ctx->Yi.d[3] = htobe32(ctr);
1197 while (len--) {
1198 ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n];
1199 ++n;
1200 }
1201 }
1202
1203 ctx->mres = n;
1204 return 0;
1205}
1206LCRYPTO_ALIAS(CRYPTO_gcm128_encrypt_ctr32);
1207
1208int
1209CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
1210 const unsigned char *in, unsigned char *out,
1211 size_t len, ctr128_f stream)
1212{
1213 unsigned int n, ctr;
1214 size_t i;
1215 u64 mlen = ctx->len.u[1];
1216 void *key = ctx->key;
1217#ifdef GCM_FUNCREF_4BIT
1218 void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
1219# ifdef GHASH
1220 void (*gcm_ghash_p)(u64 Xi[2], const u128 Htable[16],
1221 const u8 *inp, size_t len) = ctx->ghash;
1222# endif
1223#endif
1224
1225 mlen += len;
1226 if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
1227 return -1;
1228 ctx->len.u[1] = mlen;
1229
1230 if (ctx->ares) {
1231 /* First call to decrypt finalizes GHASH(AAD) */
1232 GCM_MUL(ctx, Xi);
1233 ctx->ares = 0;
1234 }
1235
1236 ctr = be32toh(ctx->Yi.d[3]);
1237
1238 n = ctx->mres;
1239 if (n) {
1240 while (n && len) {
1241 u8 c = *(in++);
1242 *(out++) = c ^ ctx->EKi.c[n];
1243 ctx->Xi.c[n] ^= c;
1244 --len;
1245 n = (n + 1) % 16;
1246 }
1247 if (n == 0)
1248 GCM_MUL(ctx, Xi);
1249 else {
1250 ctx->mres = n;
1251 return 0;
1252 }
1253 }
1254#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1255 while (len >= GHASH_CHUNK) {
1256 GHASH(ctx, in, GHASH_CHUNK);
1257 (*stream)(in, out, GHASH_CHUNK/16, key, ctx->Yi.c);
1258 ctr += GHASH_CHUNK/16;
1259 ctx->Yi.d[3] = htobe32(ctr);
1260 out += GHASH_CHUNK;
1261 in += GHASH_CHUNK;
1262 len -= GHASH_CHUNK;
1263 }
1264#endif
1265 if ((i = (len & (size_t)-16))) {
1266 size_t j = i/16;
1267
1268#if defined(GHASH)
1269 GHASH(ctx, in, i);
1270#else
1271 while (j--) {
1272 size_t k;
1273 for (k = 0; k < 16; ++k)
1274 ctx->Xi.c[k] ^= in[k];
1275 GCM_MUL(ctx, Xi);
1276 in += 16;
1277 }
1278 j = i/16;
1279 in -= i;
1280#endif
1281 (*stream)(in, out, j, key, ctx->Yi.c);
1282 ctr += (unsigned int)j;
1283 ctx->Yi.d[3] = htobe32(ctr);
1284 out += i;
1285 in += i;
1286 len -= i;
1287 }
1288 if (len) {
1289 (*ctx->block)(ctx->Yi.c, ctx->EKi.c, key);
1290 ++ctr;
1291 ctx->Yi.d[3] = htobe32(ctr);
1292 while (len--) {
1293 u8 c = in[n];
1294 ctx->Xi.c[n] ^= c;
1295 out[n] = c ^ ctx->EKi.c[n];
1296 ++n;
1297 }
1298 }
1299
1300 ctx->mres = n;
1301 return 0;
1302}
1303LCRYPTO_ALIAS(CRYPTO_gcm128_decrypt_ctr32);
1304
1305int
1306CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const unsigned char *tag,
1307 size_t len)
1308{
1309 u64 alen = ctx->len.u[0] << 3;
1310 u64 clen = ctx->len.u[1] << 3;
1311#ifdef GCM_FUNCREF_4BIT
1312 void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
1313#endif
1314
1315 if (ctx->mres || ctx->ares)
1316 GCM_MUL(ctx, Xi);
1317
1318 ctx->Xi.u[0] ^= htobe64(alen);
1319 ctx->Xi.u[1] ^= htobe64(clen);
1320 GCM_MUL(ctx, Xi);
1321
1322 ctx->Xi.u[0] ^= ctx->EK0.u[0];
1323 ctx->Xi.u[1] ^= ctx->EK0.u[1];
1324
1325 if (tag && len <= sizeof(ctx->Xi))
1326 return memcmp(ctx->Xi.c, tag, len);
1327 else
1328 return -1;
1329}
1330LCRYPTO_ALIAS(CRYPTO_gcm128_finish);
1331
1332void
1333CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1334{
1335 CRYPTO_gcm128_finish(ctx, NULL, 0);
1336 memcpy(tag, ctx->Xi.c,
1337 len <= sizeof(ctx->Xi.c) ? len : sizeof(ctx->Xi.c));
1338}
1339LCRYPTO_ALIAS(CRYPTO_gcm128_tag);
1340
1341GCM128_CONTEXT *
1342CRYPTO_gcm128_new(void *key, block128_f block)
1343{
1344 GCM128_CONTEXT *ret;
1345
1346 if ((ret = malloc(sizeof(GCM128_CONTEXT))))
1347 CRYPTO_gcm128_init(ret, key, block);
1348
1349 return ret;
1350}
1351LCRYPTO_ALIAS(CRYPTO_gcm128_new);
1352
1353void
1354CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1355{
1356 freezero(ctx, sizeof(*ctx));
1357}
1358LCRYPTO_ALIAS(CRYPTO_gcm128_release);
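For orientation, here is a hedged usage sketch of the one-shot API removed above, using only the functions declared in modes.h (shown below). The AES key schedule aes_ks and its block128_f wrapper aes_encrypt_block are assumed to come from the caller, and gcm_seal_example is an illustrative name, not part of the library.

#include <stddef.h>

#include <openssl/modes.h>

static int
gcm_seal_example(void *aes_ks, block128_f aes_encrypt_block,
    const unsigned char iv[12],
    const unsigned char *aad, size_t aadlen,
    const unsigned char *pt, unsigned char *ct, size_t len,
    unsigned char tag[16])
{
	GCM128_CONTEXT *gcm;

	if ((gcm = CRYPTO_gcm128_new(aes_ks, aes_encrypt_block)) == NULL)
		return -1;
	CRYPTO_gcm128_setiv(gcm, iv, 12);
	if (aadlen > 0 && CRYPTO_gcm128_aad(gcm, aad, aadlen) != 0)
		goto err;
	if (CRYPTO_gcm128_encrypt(gcm, pt, ct, len) != 0)
		goto err;
	CRYPTO_gcm128_tag(gcm, tag, 16);	/* 16-byte authentication tag */
	CRYPTO_gcm128_release(gcm);
	return 0;

 err:
	CRYPTO_gcm128_release(gcm);
	return -1;
}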
diff --git a/src/lib/libcrypto/modes/modes.h b/src/lib/libcrypto/modes/modes.h
deleted file mode 100644
index 53fa9afb0d..0000000000
--- a/src/lib/libcrypto/modes/modes.h
+++ /dev/null
@@ -1,118 +0,0 @@
1/* $OpenBSD: modes.h,v 1.6 2023/07/08 14:55:36 beck Exp $ */
2/* ====================================================================
3 * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
4 *
5 * Rights for redistribution and usage in source and binary
6 * forms are granted according to the OpenSSL license.
7 */
8
9#include <stddef.h>
10
11#ifdef __cplusplus
12extern "C" {
13#endif
14
15typedef void (*block128_f)(const unsigned char in[16],
16 unsigned char out[16],
17 const void *key);
18
19typedef void (*cbc128_f)(const unsigned char *in, unsigned char *out,
20 size_t len, const void *key,
21 unsigned char ivec[16], int enc);
22
23typedef void (*ctr128_f)(const unsigned char *in, unsigned char *out,
24 size_t blocks, const void *key,
25 const unsigned char ivec[16]);
26
27typedef void (*ccm128_f)(const unsigned char *in, unsigned char *out,
28 size_t blocks, const void *key,
29 const unsigned char ivec[16], unsigned char cmac[16]);
30
31void CRYPTO_cbc128_encrypt(const unsigned char *in, unsigned char *out,
32 size_t len, const void *key,
33 unsigned char ivec[16], block128_f block);
34void CRYPTO_cbc128_decrypt(const unsigned char *in, unsigned char *out,
35 size_t len, const void *key,
36 unsigned char ivec[16], block128_f block);
37
38void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out,
39 size_t len, const void *key,
40 unsigned char ivec[16], unsigned char ecount_buf[16],
41 unsigned int *num, block128_f block);
42
43void CRYPTO_ctr128_encrypt_ctr32(const unsigned char *in, unsigned char *out,
44 size_t len, const void *key,
45 unsigned char ivec[16], unsigned char ecount_buf[16],
46 unsigned int *num, ctr128_f ctr);
47
48void CRYPTO_ofb128_encrypt(const unsigned char *in, unsigned char *out,
49 size_t len, const void *key,
50 unsigned char ivec[16], int *num,
51 block128_f block);
52
53void CRYPTO_cfb128_encrypt(const unsigned char *in, unsigned char *out,
54 size_t len, const void *key,
55 unsigned char ivec[16], int *num,
56 int enc, block128_f block);
57void CRYPTO_cfb128_8_encrypt(const unsigned char *in, unsigned char *out,
58 size_t length, const void *key,
59 unsigned char ivec[16], int *num,
60 int enc, block128_f block);
61void CRYPTO_cfb128_1_encrypt(const unsigned char *in, unsigned char *out,
62 size_t bits, const void *key,
63 unsigned char ivec[16], int *num,
64 int enc, block128_f block);
65
66typedef struct gcm128_context GCM128_CONTEXT;
67
68GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block);
69void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, void *key, block128_f block);
70void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const unsigned char *iv,
71 size_t len);
72int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const unsigned char *aad,
73 size_t len);
74int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
75 const unsigned char *in, unsigned char *out,
76 size_t len);
77int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
78 const unsigned char *in, unsigned char *out,
79 size_t len);
80int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
81 const unsigned char *in, unsigned char *out,
82 size_t len, ctr128_f stream);
83int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
84 const unsigned char *in, unsigned char *out,
85 size_t len, ctr128_f stream);
86int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const unsigned char *tag,
87 size_t len);
88void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len);
89void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx);
90
91typedef struct ccm128_context CCM128_CONTEXT;
92
93void CRYPTO_ccm128_init(CCM128_CONTEXT *ctx,
94 unsigned int M, unsigned int L, void *key, block128_f block);
95int CRYPTO_ccm128_setiv(CCM128_CONTEXT *ctx,
96 const unsigned char *nonce, size_t nlen, size_t mlen);
97void CRYPTO_ccm128_aad(CCM128_CONTEXT *ctx,
98 const unsigned char *aad, size_t alen);
99int CRYPTO_ccm128_encrypt(CCM128_CONTEXT *ctx,
100 const unsigned char *inp, unsigned char *out, size_t len);
101int CRYPTO_ccm128_decrypt(CCM128_CONTEXT *ctx,
102 const unsigned char *inp, unsigned char *out, size_t len);
103int CRYPTO_ccm128_encrypt_ccm64(CCM128_CONTEXT *ctx,
104 const unsigned char *inp, unsigned char *out, size_t len,
105 ccm128_f stream);
106int CRYPTO_ccm128_decrypt_ccm64(CCM128_CONTEXT *ctx,
107 const unsigned char *inp, unsigned char *out, size_t len,
108 ccm128_f stream);
109size_t CRYPTO_ccm128_tag(CCM128_CONTEXT *ctx, unsigned char *tag, size_t len);
110
111typedef struct xts128_context XTS128_CONTEXT;
112
113int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx, const unsigned char iv[16],
114 const unsigned char *inp, unsigned char *out, size_t len, int enc);
115
116#ifdef __cplusplus
117}
118#endif
diff --git a/src/lib/libcrypto/modes/modes_local.h b/src/lib/libcrypto/modes/modes_local.h
deleted file mode 100644
index 511855f2e0..0000000000
--- a/src/lib/libcrypto/modes/modes_local.h
+++ /dev/null
@@ -1,121 +0,0 @@
1/* $OpenBSD: modes_local.h,v 1.2 2023/07/08 14:55:36 beck Exp $ */
2/* ====================================================================
3 * Copyright (c) 2010 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use is governed by OpenSSL license.
6 * ====================================================================
7 */
8
9#include <endian.h>
10
11#include <openssl/opensslconf.h>
12
13#include <openssl/modes.h>
14
15__BEGIN_HIDDEN_DECLS
16
17#if defined(_LP64)
18typedef long i64;
19typedef unsigned long u64;
20#define U64(C) C##UL
21#else
22typedef long long i64;
23typedef unsigned long long u64;
24#define U64(C) C##ULL
25#endif
26
27typedef unsigned int u32;
28typedef unsigned char u8;
29
30#if !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM)
31#if defined(__GNUC__) && __GNUC__>=2
32# if defined(__x86_64) || defined(__x86_64__)
33# define BSWAP8(x) ({ u64 ret=(x); \
34 asm ("bswapq %0" \
35 : "+r"(ret)); ret; })
36# define BSWAP4(x) ({ u32 ret=(x); \
37 asm ("bswapl %0" \
38 : "+r"(ret)); ret; })
39# elif (defined(__i386) || defined(__i386__))
40# define BSWAP8(x) ({ u32 lo=(u64)(x)>>32,hi=(x); \
41 asm ("bswapl %0; bswapl %1" \
42 : "+r"(hi),"+r"(lo)); \
43 (u64)hi<<32|lo; })
44# define BSWAP4(x) ({ u32 ret=(x); \
45 asm ("bswapl %0" \
46 : "+r"(ret)); ret; })
47# elif (defined(__arm__) || defined(__arm)) && !defined(__STRICT_ALIGNMENT)
48# define BSWAP8(x) ({ u32 lo=(u64)(x)>>32,hi=(x); \
49 asm ("rev %0,%0; rev %1,%1" \
50 : "+r"(hi),"+r"(lo)); \
51 (u64)hi<<32|lo; })
52# define BSWAP4(x) ({ u32 ret; \
53 asm ("rev %0,%1" \
54 : "=r"(ret) : "r"((u32)(x))); \
55 ret; })
56# endif
57#endif
58#endif
59
60#if defined(BSWAP4) && !defined(__STRICT_ALIGNMENT)
61#define GETU32(p) BSWAP4(*(const u32 *)(p))
62#define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v)
63#else
64#define GETU32(p) ((u32)(p)[0]<<24|(u32)(p)[1]<<16|(u32)(p)[2]<<8|(u32)(p)[3])
65#define PUTU32(p,v) ((p)[0]=(u8)((v)>>24),(p)[1]=(u8)((v)>>16),(p)[2]=(u8)((v)>>8),(p)[3]=(u8)(v))
66#endif
67
68/* GCM definitions */
69
70typedef struct {
71 u64 hi, lo;
72} u128;
73
74#ifdef TABLE_BITS
75#undef TABLE_BITS
76#endif
77/*
78 * Even though the permitted values for TABLE_BITS are 8, 4 and 1, it
79 * should never be set to 8 or 1 here. For further information see gcm128.c.
80 */
81#define TABLE_BITS 4
82
83struct gcm128_context {
84	/* The following 6 names follow the GCM specification */
85 union {
86 u64 u[2];
87 u32 d[4];
88 u8 c[16];
89 size_t t[16/sizeof(size_t)];
90 } Yi, EKi, EK0, len, Xi, H;
91 /* Relative position of Xi, H and pre-computed Htable is used
92 * in some assembler modules, i.e. don't change the order! */
93#if TABLE_BITS==8
94 u128 Htable[256];
95#else
96 u128 Htable[16];
97 void (*gmult)(u64 Xi[2], const u128 Htable[16]);
98 void (*ghash)(u64 Xi[2], const u128 Htable[16], const u8 *inp,
99 size_t len);
100#endif
101 unsigned int mres, ares;
102 block128_f block;
103 void *key;
104};
105
106struct xts128_context {
107 void *key1, *key2;
108 block128_f block1, block2;
109};
110
111struct ccm128_context {
112 union {
113 u64 u[2];
114 u8 c[16];
115 } nonce, cmac;
116 u64 blocks;
117 block128_f block;
118 void *key;
119};
120
121__END_HIDDEN_DECLS
diff --git a/src/lib/libcrypto/modes/ofb128.c b/src/lib/libcrypto/modes/ofb128.c
deleted file mode 100644
index 42afd29d58..0000000000
--- a/src/lib/libcrypto/modes/ofb128.c
+++ /dev/null
@@ -1,124 +0,0 @@
1/* $OpenBSD: ofb128.c,v 1.7 2023/07/08 14:56:54 beck Exp $ */
2/* ====================================================================
3 * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 *
50 */
51
52#include <openssl/crypto.h>
53#include "modes_local.h"
54#include <string.h>
55
56#ifndef MODES_DEBUG
57# ifndef NDEBUG
58# define NDEBUG
59# endif
60#endif
61
62/* The input and output are encrypted as though 128-bit OFB mode is
63 * being used. The extra state information recording how much of the
64 * 128-bit block has been used is contained in *num.
65 */
66void
67CRYPTO_ofb128_encrypt(const unsigned char *in, unsigned char *out,
68 size_t len, const void *key,
69 unsigned char ivec[16], int *num,
70 block128_f block)
71{
72 unsigned int n;
73 size_t l = 0;
74
75 n = *num;
76
77#if !defined(OPENSSL_SMALL_FOOTPRINT)
78 if (16 % sizeof(size_t) == 0)
79 do { /* always true actually */
80 while (n && len) {
81 *(out++) = *(in++) ^ ivec[n];
82 --len;
83 n = (n + 1) % 16;
84 }
85#ifdef __STRICT_ALIGNMENT
86 if (((size_t)in|(size_t)out|(size_t)ivec) %
87 sizeof(size_t) != 0)
88 break;
89#endif
90 while (len >= 16) {
91 (*block)(ivec, ivec, key);
92 for (; n < 16; n += sizeof(size_t))
93 *(size_t *)(out + n) =
94 *(size_t *)(in + n) ^ *(size_t *)(ivec +
95 n);
96 len -= 16;
97 out += 16;
98 in += 16;
99 n = 0;
100 }
101 if (len) {
102 (*block)(ivec, ivec, key);
103 while (len--) {
104 out[n] = in[n] ^ ivec[n];
105 ++n;
106 }
107 }
108 *num = n;
109 return;
110 } while (0);
111	/* the rest is typically eliminated by x86* compilers */
112#endif
113 while (l < len) {
114 if (n == 0) {
115 (*block)(ivec, ivec, key);
116 }
117 out[l] = in[l] ^ ivec[n];
118 ++l;
119 n = (n + 1) % 16;
120 }
121
122 *num = n;
123}
124LCRYPTO_ALIAS(CRYPTO_ofb128_encrypt);
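A hedged usage sketch of the function above: OFB decryption is the very same call as encryption, and *num carries the keystream offset across calls, so a message may be processed in arbitrary pieces. The key schedule aes_ks and the block128_f wrapper aes_encrypt_block are assumed caller-provided; ofb_example is an illustrative name only.

#include <stddef.h>
#include <string.h>

#include <openssl/modes.h>

static void
ofb_example(const void *aes_ks, block128_f aes_encrypt_block,
    const unsigned char iv[16],
    const unsigned char *in, unsigned char *out, size_t len)
{
	unsigned char ivec[16];
	int num = 0;
	size_t half = len / 2;

	memcpy(ivec, iv, 16);		/* the working IV is updated in place */

	/* two calls over one message; num keeps the partial-block offset */
	CRYPTO_ofb128_encrypt(in, out, half, aes_ks, ivec, &num,
	    aes_encrypt_block);
	CRYPTO_ofb128_encrypt(in + half, out + half, len - half, aes_ks,
	    ivec, &num, aes_encrypt_block);

	/* decryption uses exactly the same calls with in and out swapped */
}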
diff --git a/src/lib/libcrypto/modes/xts128.c b/src/lib/libcrypto/modes/xts128.c
deleted file mode 100644
index 7516acf850..0000000000
--- a/src/lib/libcrypto/modes/xts128.c
+++ /dev/null
@@ -1,197 +0,0 @@
1/* $OpenBSD: xts128.c,v 1.12 2023/07/08 14:56:54 beck Exp $ */
2/* ====================================================================
3 * Copyright (c) 2011 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 */
50
51#include <openssl/crypto.h>
52#include "modes_local.h"
53
54#include <endian.h>
55#include <string.h>
56
57#ifndef MODES_DEBUG
58# ifndef NDEBUG
59# define NDEBUG
60# endif
61#endif
62
63int
64CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx, const unsigned char iv[16],
65 const unsigned char *inp, unsigned char *out,
66 size_t len, int enc)
67{
68 union {
69 u64 u[2];
70 u32 d[4];
71 u8 c[16];
72 } tweak, scratch;
73 unsigned int i;
74
75 if (len < 16)
76 return -1;
77
78 memcpy(tweak.c, iv, 16);
79
80 (*ctx->block2)(tweak.c, tweak.c, ctx->key2);
81
82 if (!enc && (len % 16))
83 len -= 16;
84
85 while (len >= 16) {
86#ifdef __STRICT_ALIGNMENT
87 memcpy(scratch.c, inp, 16);
88 scratch.u[0] ^= tweak.u[0];
89 scratch.u[1] ^= tweak.u[1];
90#else
91 scratch.u[0] = ((u64 *)inp)[0] ^ tweak.u[0];
92 scratch.u[1] = ((u64 *)inp)[1] ^ tweak.u[1];
93#endif
94 (*ctx->block1)(scratch.c, scratch.c, ctx->key1);
95#ifdef __STRICT_ALIGNMENT
96 scratch.u[0] ^= tweak.u[0];
97 scratch.u[1] ^= tweak.u[1];
98 memcpy(out, scratch.c, 16);
99#else
100 ((u64 *)out)[0] = scratch.u[0] ^= tweak.u[0];
101 ((u64 *)out)[1] = scratch.u[1] ^= tweak.u[1];
102#endif
103 inp += 16;
104 out += 16;
105 len -= 16;
106
107 if (len == 0)
108 return 0;
109
110#if BYTE_ORDER == LITTLE_ENDIAN
111 unsigned int carry, res;
112
113 res = 0x87 & (((int)tweak.d[3]) >> 31);
114 carry = (unsigned int)(tweak.u[0] >> 63);
115 tweak.u[0] = (tweak.u[0] << 1) ^ res;
116 tweak.u[1] = (tweak.u[1] << 1)|carry;
117#else /* BIG_ENDIAN */
118 size_t c;
119
120 for (c = 0, i = 0; i < 16; ++i) {
121 /*+ substitutes for |, because c is 1 bit */
122 c += ((size_t)tweak.c[i]) << 1;
123 tweak.c[i] = (u8)c;
124 c = c >> 8;
125 }
126 tweak.c[0] ^= (u8)(0x87 & (0 - c));
127#endif
128 }
129 if (enc) {
130 for (i = 0; i < len; ++i) {
131 u8 ch = inp[i];
132 out[i] = scratch.c[i];
133 scratch.c[i] = ch;
134 }
135 scratch.u[0] ^= tweak.u[0];
136 scratch.u[1] ^= tweak.u[1];
137 (*ctx->block1)(scratch.c, scratch.c, ctx->key1);
138 scratch.u[0] ^= tweak.u[0];
139 scratch.u[1] ^= tweak.u[1];
140 memcpy(out - 16, scratch.c, 16);
141 } else {
142 union {
143 u64 u[2];
144 u8 c[16];
145 } tweak1;
146
147#if BYTE_ORDER == LITTLE_ENDIAN
148 unsigned int carry, res;
149
150 res = 0x87 & (((int)tweak.d[3]) >> 31);
151 carry = (unsigned int)(tweak.u[0] >> 63);
152 tweak1.u[0] = (tweak.u[0] << 1) ^ res;
153 tweak1.u[1] = (tweak.u[1] << 1)|carry;
154#else
155 size_t c;
156
157 for (c = 0, i = 0; i < 16; ++i) {
158 /*+ substitutes for |, because c is 1 bit */
159 c += ((size_t)tweak.c[i]) << 1;
160 tweak1.c[i] = (u8)c;
161 c = c >> 8;
162 }
163 tweak1.c[0] ^= (u8)(0x87 & (0 - c));
164#endif
165#ifdef __STRICT_ALIGNMENT
166 memcpy(scratch.c, inp, 16);
167 scratch.u[0] ^= tweak1.u[0];
168 scratch.u[1] ^= tweak1.u[1];
169#else
170 scratch.u[0] = ((u64 *)inp)[0] ^ tweak1.u[0];
171 scratch.u[1] = ((u64 *)inp)[1] ^ tweak1.u[1];
172#endif
173 (*ctx->block1)(scratch.c, scratch.c, ctx->key1);
174 scratch.u[0] ^= tweak1.u[0];
175 scratch.u[1] ^= tweak1.u[1];
176
177 for (i = 0; i < len; ++i) {
178 u8 ch = inp[16 + i];
179 out[16 + i] = scratch.c[i];
180 scratch.c[i] = ch;
181 }
182 scratch.u[0] ^= tweak.u[0];
183 scratch.u[1] ^= tweak.u[1];
184 (*ctx->block1)(scratch.c, scratch.c, ctx->key1);
185#ifdef __STRICT_ALIGNMENT
186 scratch.u[0] ^= tweak.u[0];
187 scratch.u[1] ^= tweak.u[1];
188 memcpy(out, scratch.c, 16);
189#else
190 ((u64 *)out)[0] = scratch.u[0] ^ tweak.u[0];
191 ((u64 *)out)[1] = scratch.u[1] ^ tweak.u[1];
192#endif
193 }
194
195 return 0;
196}
197LCRYPTO_ALIAS(CRYPTO_xts128_encrypt);
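The tweak update inside the main loop above multiplies the 128-bit tweak by x (often written alpha) in GF(2^128): shift left by one bit and, if a bit falls off the top, fold it back in with the reduction polynomial x^128 + x^7 + x^2 + x + 1, whose low byte is 0x87. A minimal, self-contained sketch of just that step, mirroring the byte-wise (big-endian) branch of the code, where byte 0 is the least significant byte:

#include <stdint.h>

/* multiply the XTS tweak by x in GF(2^128); byte 0 is least significant */
static void
xts_double_tweak(uint8_t tweak[16])
{
	unsigned int c = 0, t, i;

	for (i = 0; i < 16; i++) {
		t = ((unsigned int)tweak[i] << 1) | c;
		tweak[i] = (uint8_t)t;
		c = t >> 8;
	}
	/* if the shift carried out of bit 127, reduce with 0x87 */
	tweak[0] ^= (uint8_t)(0x87 & (0 - c));
}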