path: root/src/lib/libcrypto/modes/asm
author    cvs2svn <admin@example.com>  2016-07-23 19:31:36 +0000
committer cvs2svn <admin@example.com>  2016-07-23 19:31:36 +0000
commit    86c49b31af735796dfde37aa29473a30d36367db (patch)
tree      e9a354a92a348338fe2b361e2eda703cae23cfab /src/lib/libcrypto/modes/asm
parent    19d5fe348e8926bac4521c5807aa64c45b8f7a41 (diff)
download  openbsd-OPENBSD_6_0_BASE.tar.gz
          openbsd-OPENBSD_6_0_BASE.tar.bz2
          openbsd-OPENBSD_6_0_BASE.zip
This commit was manufactured by cvs2git to create tag 'OPENBSD_6_0_BASE'. (tag: OPENBSD_6_0_BASE)
Diffstat (limited to 'src/lib/libcrypto/modes/asm')
-rw-r--r--  src/lib/libcrypto/modes/asm/ghash-alpha.pl     455
-rw-r--r--  src/lib/libcrypto/modes/asm/ghash-armv4.pl     429
-rwxr-xr-x  src/lib/libcrypto/modes/asm/ghash-ia64.pl      463
-rw-r--r--  src/lib/libcrypto/modes/asm/ghash-parisc.pl    741
-rw-r--r--  src/lib/libcrypto/modes/asm/ghash-s390x.pl     262
-rw-r--r--  src/lib/libcrypto/modes/asm/ghash-sparcv9.pl   330
-rw-r--r--  src/lib/libcrypto/modes/asm/ghash-x86.pl      1342
-rw-r--r--  src/lib/libcrypto/modes/asm/ghash-x86_64.pl    806
8 files changed, 0 insertions, 4828 deletions
diff --git a/src/lib/libcrypto/modes/asm/ghash-alpha.pl b/src/lib/libcrypto/modes/asm/ghash-alpha.pl
deleted file mode 100644
index b6d6ea5a62..0000000000
--- a/src/lib/libcrypto/modes/asm/ghash-alpha.pl
+++ /dev/null
@@ -1,455 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# March 2010
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that it
14# uses 256 bytes per-key table [+128 bytes shared table]. Even though
15# loops are aggressively modulo-scheduled with respect to references to
16# Htbl and Z.hi updates for 8 cycles per byte, measured performance is
17# ~12 cycles per processed byte on 21264 CPU. It seems to be a dynamic
18# scheduling "glitch," because uprofile(1) indicates uniform sample
19# distribution, as if all instruction bundles execute in 1.5 cycles.
20# Meaning that it could have been even faster, yet 12 cycles is ~60%
21# better than gcc-generated code and ~80% better than code generated
22# by the vendor compiler.
23
24$cnt="v0"; # $0
25$t0="t0";
26$t1="t1";
27$t2="t2";
28$Thi0="t3"; # $4
29$Tlo0="t4";
30$Thi1="t5";
31$Tlo1="t6";
32$rem="t7"; # $8
33#################
34$Xi="a0"; # $16, input argument block
35$Htbl="a1";
36$inp="a2";
37$len="a3";
38$nlo="a4"; # $20
39$nhi="a5";
40$Zhi="t8";
41$Zlo="t9";
42$Xhi="t10"; # $24
43$Xlo="t11";
44$remp="t12";
45$rem_4bit="AT"; # $28
46
47{ my $N;
48 sub loop() {
49
50 $N++;
51$code.=<<___;
52.align 4
53 extbl $Xlo,7,$nlo
54 and $nlo,0xf0,$nhi
55 sll $nlo,4,$nlo
56 and $nlo,0xf0,$nlo
57
58 addq $nlo,$Htbl,$nlo
59 ldq $Zlo,8($nlo)
60 addq $nhi,$Htbl,$nhi
61 ldq $Zhi,0($nlo)
62
63 and $Zlo,0x0f,$remp
64 sll $Zhi,60,$t0
65 lda $cnt,6(zero)
66 extbl $Xlo,6,$nlo
67
68 ldq $Tlo1,8($nhi)
69 s8addq $remp,$rem_4bit,$remp
70 ldq $Thi1,0($nhi)
71 srl $Zlo,4,$Zlo
72
73 ldq $rem,0($remp)
74 srl $Zhi,4,$Zhi
75 xor $t0,$Zlo,$Zlo
76 and $nlo,0xf0,$nhi
77
78 xor $Tlo1,$Zlo,$Zlo
79 sll $nlo,4,$nlo
80 xor $Thi1,$Zhi,$Zhi
81 and $nlo,0xf0,$nlo
82
83 addq $nlo,$Htbl,$nlo
84 ldq $Tlo0,8($nlo)
85 addq $nhi,$Htbl,$nhi
86 ldq $Thi0,0($nlo)
87
88.Looplo$N:
89 and $Zlo,0x0f,$remp
90 sll $Zhi,60,$t0
91 subq $cnt,1,$cnt
92 srl $Zlo,4,$Zlo
93
94 ldq $Tlo1,8($nhi)
95 xor $rem,$Zhi,$Zhi
96 ldq $Thi1,0($nhi)
97 s8addq $remp,$rem_4bit,$remp
98
99 ldq $rem,0($remp)
100 srl $Zhi,4,$Zhi
101 xor $t0,$Zlo,$Zlo
102 extbl $Xlo,$cnt,$nlo
103
104 and $nlo,0xf0,$nhi
105 xor $Thi0,$Zhi,$Zhi
106 xor $Tlo0,$Zlo,$Zlo
107 sll $nlo,4,$nlo
108
109
110 and $Zlo,0x0f,$remp
111 sll $Zhi,60,$t0
112 and $nlo,0xf0,$nlo
113 srl $Zlo,4,$Zlo
114
115 s8addq $remp,$rem_4bit,$remp
116 xor $rem,$Zhi,$Zhi
117 addq $nlo,$Htbl,$nlo
118 addq $nhi,$Htbl,$nhi
119
120 ldq $rem,0($remp)
121 srl $Zhi,4,$Zhi
122 ldq $Tlo0,8($nlo)
123 xor $t0,$Zlo,$Zlo
124
125 xor $Tlo1,$Zlo,$Zlo
126 xor $Thi1,$Zhi,$Zhi
127 ldq $Thi0,0($nlo)
128 bne $cnt,.Looplo$N
129
130
131 and $Zlo,0x0f,$remp
132 sll $Zhi,60,$t0
133 lda $cnt,7(zero)
134 srl $Zlo,4,$Zlo
135
136 ldq $Tlo1,8($nhi)
137 xor $rem,$Zhi,$Zhi
138 ldq $Thi1,0($nhi)
139 s8addq $remp,$rem_4bit,$remp
140
141 ldq $rem,0($remp)
142 srl $Zhi,4,$Zhi
143 xor $t0,$Zlo,$Zlo
144 extbl $Xhi,$cnt,$nlo
145
146 and $nlo,0xf0,$nhi
147 xor $Thi0,$Zhi,$Zhi
148 xor $Tlo0,$Zlo,$Zlo
149 sll $nlo,4,$nlo
150
151 and $Zlo,0x0f,$remp
152 sll $Zhi,60,$t0
153 and $nlo,0xf0,$nlo
154 srl $Zlo,4,$Zlo
155
156 s8addq $remp,$rem_4bit,$remp
157 xor $rem,$Zhi,$Zhi
158 addq $nlo,$Htbl,$nlo
159 addq $nhi,$Htbl,$nhi
160
161 ldq $rem,0($remp)
162 srl $Zhi,4,$Zhi
163 ldq $Tlo0,8($nlo)
164 xor $t0,$Zlo,$Zlo
165
166 xor $Tlo1,$Zlo,$Zlo
167 xor $Thi1,$Zhi,$Zhi
168 ldq $Thi0,0($nlo)
169 unop
170
171
172.Loophi$N:
173 and $Zlo,0x0f,$remp
174 sll $Zhi,60,$t0
175 subq $cnt,1,$cnt
176 srl $Zlo,4,$Zlo
177
178 ldq $Tlo1,8($nhi)
179 xor $rem,$Zhi,$Zhi
180 ldq $Thi1,0($nhi)
181 s8addq $remp,$rem_4bit,$remp
182
183 ldq $rem,0($remp)
184 srl $Zhi,4,$Zhi
185 xor $t0,$Zlo,$Zlo
186 extbl $Xhi,$cnt,$nlo
187
188 and $nlo,0xf0,$nhi
189 xor $Thi0,$Zhi,$Zhi
190 xor $Tlo0,$Zlo,$Zlo
191 sll $nlo,4,$nlo
192
193
194 and $Zlo,0x0f,$remp
195 sll $Zhi,60,$t0
196 and $nlo,0xf0,$nlo
197 srl $Zlo,4,$Zlo
198
199 s8addq $remp,$rem_4bit,$remp
200 xor $rem,$Zhi,$Zhi
201 addq $nlo,$Htbl,$nlo
202 addq $nhi,$Htbl,$nhi
203
204 ldq $rem,0($remp)
205 srl $Zhi,4,$Zhi
206 ldq $Tlo0,8($nlo)
207 xor $t0,$Zlo,$Zlo
208
209 xor $Tlo1,$Zlo,$Zlo
210 xor $Thi1,$Zhi,$Zhi
211 ldq $Thi0,0($nlo)
212 bne $cnt,.Loophi$N
213
214
215 and $Zlo,0x0f,$remp
216 sll $Zhi,60,$t0
217 srl $Zlo,4,$Zlo
218
219 ldq $Tlo1,8($nhi)
220 xor $rem,$Zhi,$Zhi
221 ldq $Thi1,0($nhi)
222 s8addq $remp,$rem_4bit,$remp
223
224 ldq $rem,0($remp)
225 srl $Zhi,4,$Zhi
226 xor $t0,$Zlo,$Zlo
227
228 xor $Tlo0,$Zlo,$Zlo
229 xor $Thi0,$Zhi,$Zhi
230
231 and $Zlo,0x0f,$remp
232 sll $Zhi,60,$t0
233 srl $Zlo,4,$Zlo
234
235 s8addq $remp,$rem_4bit,$remp
236 xor $rem,$Zhi,$Zhi
237
238 ldq $rem,0($remp)
239 srl $Zhi,4,$Zhi
240 xor $Tlo1,$Zlo,$Zlo
241 xor $Thi1,$Zhi,$Zhi
242 xor $t0,$Zlo,$Zlo
243 xor $rem,$Zhi,$Zhi
244___
245}}
246
247$code=<<___;
248#include <machine/asm.h>
249
250.text
251
252.set noat
253.set noreorder
254.globl gcm_gmult_4bit
255.align 4
256.ent gcm_gmult_4bit
257gcm_gmult_4bit:
258 .frame sp,0,ra
259 .prologue 0
260
261 ldq $Xlo,8($Xi)
262 ldq $Xhi,0($Xi)
263
264 bsr $t0,picmeup
265 nop
266___
267
268 &loop();
269
270$code.=<<___;
271 srl $Zlo,24,$t0 # byte swap
272 srl $Zlo,8,$t1
273
274 sll $Zlo,8,$t2
275 sll $Zlo,24,$Zlo
276 zapnot $t0,0x11,$t0
277 zapnot $t1,0x22,$t1
278
279 zapnot $Zlo,0x88,$Zlo
280 or $t0,$t1,$t0
281 zapnot $t2,0x44,$t2
282
283 or $Zlo,$t0,$Zlo
284 srl $Zhi,24,$t0
285 srl $Zhi,8,$t1
286
287 or $Zlo,$t2,$Zlo
288 sll $Zhi,8,$t2
289 sll $Zhi,24,$Zhi
290
291 srl $Zlo,32,$Xlo
292 sll $Zlo,32,$Zlo
293
294 zapnot $t0,0x11,$t0
295 zapnot $t1,0x22,$t1
296 or $Zlo,$Xlo,$Xlo
297
298 zapnot $Zhi,0x88,$Zhi
299 or $t0,$t1,$t0
300 zapnot $t2,0x44,$t2
301
302 or $Zhi,$t0,$Zhi
303 or $Zhi,$t2,$Zhi
304
305 srl $Zhi,32,$Xhi
306 sll $Zhi,32,$Zhi
307
308 or $Zhi,$Xhi,$Xhi
309 stq $Xlo,8($Xi)
310 stq $Xhi,0($Xi)
311
312 ret (ra)
313.end gcm_gmult_4bit
314___
315
316$inhi="s0";
317$inlo="s1";
318
319$code.=<<___;
320.globl gcm_ghash_4bit
321.align 4
322.ent gcm_ghash_4bit
323gcm_ghash_4bit:
324 lda sp,-32(sp)
325 stq ra,0(sp)
326 stq s0,8(sp)
327 stq s1,16(sp)
328 .mask 0x04000600,-32
329 .frame sp,32,ra
330 .prologue 0
331
332 ldq_u $inhi,0($inp)
333 ldq_u $Thi0,7($inp)
334 ldq_u $inlo,8($inp)
335 ldq_u $Tlo0,15($inp)
336 ldq $Xhi,0($Xi)
337 ldq $Xlo,8($Xi)
338
339 bsr $t0,picmeup
340 nop
341
342.Louter:
343 extql $inhi,$inp,$inhi
344 extqh $Thi0,$inp,$Thi0
345 or $inhi,$Thi0,$inhi
346 lda $inp,16($inp)
347
348 extql $inlo,$inp,$inlo
349 extqh $Tlo0,$inp,$Tlo0
350 or $inlo,$Tlo0,$inlo
351 subq $len,16,$len
352
353 xor $Xlo,$inlo,$Xlo
354 xor $Xhi,$inhi,$Xhi
355___
356
357 &loop();
358
359$code.=<<___;
360 srl $Zlo,24,$t0 # byte swap
361 srl $Zlo,8,$t1
362
363 sll $Zlo,8,$t2
364 sll $Zlo,24,$Zlo
365 zapnot $t0,0x11,$t0
366 zapnot $t1,0x22,$t1
367
368 zapnot $Zlo,0x88,$Zlo
369 or $t0,$t1,$t0
370 zapnot $t2,0x44,$t2
371
372 or $Zlo,$t0,$Zlo
373 srl $Zhi,24,$t0
374 srl $Zhi,8,$t1
375
376 or $Zlo,$t2,$Zlo
377 sll $Zhi,8,$t2
378 sll $Zhi,24,$Zhi
379
380 srl $Zlo,32,$Xlo
381 sll $Zlo,32,$Zlo
382 beq $len,.Ldone
383
384 zapnot $t0,0x11,$t0
385 zapnot $t1,0x22,$t1
386 or $Zlo,$Xlo,$Xlo
387 ldq_u $inhi,0($inp)
388
389 zapnot $Zhi,0x88,$Zhi
390 or $t0,$t1,$t0
391 zapnot $t2,0x44,$t2
392 ldq_u $Thi0,7($inp)
393
394 or $Zhi,$t0,$Zhi
395 or $Zhi,$t2,$Zhi
396 ldq_u $inlo,8($inp)
397 ldq_u $Tlo0,15($inp)
398
399 srl $Zhi,32,$Xhi
400 sll $Zhi,32,$Zhi
401
402 or $Zhi,$Xhi,$Xhi
403 br zero,.Louter
404
405.Ldone:
406 zapnot $t0,0x11,$t0
407 zapnot $t1,0x22,$t1
408 or $Zlo,$Xlo,$Xlo
409
410 zapnot $Zhi,0x88,$Zhi
411 or $t0,$t1,$t0
412 zapnot $t2,0x44,$t2
413
414 or $Zhi,$t0,$Zhi
415 or $Zhi,$t2,$Zhi
416
417 srl $Zhi,32,$Xhi
418 sll $Zhi,32,$Zhi
419
420 or $Zhi,$Xhi,$Xhi
421
422 stq $Xlo,8($Xi)
423 stq $Xhi,0($Xi)
424
425 .set noreorder
426 /*ldq ra,0(sp)*/
427 ldq s0,8(sp)
428 ldq s1,16(sp)
429 lda sp,32(sp)
430 ret (ra)
431.end gcm_ghash_4bit
432
433.align 4
434.ent picmeup
435picmeup:
436 .frame sp,0,$t0
437 .prologue 0
438 br $rem_4bit,.Lpic
439.Lpic: lda $rem_4bit,12($rem_4bit)
440 ret ($t0)
441.end picmeup
442 nop
443rem_4bit:
444 .long 0,0x0000<<16, 0,0x1C20<<16, 0,0x3840<<16, 0,0x2460<<16
445 .long 0,0x7080<<16, 0,0x6CA0<<16, 0,0x48C0<<16, 0,0x54E0<<16
446 .long 0,0xE100<<16, 0,0xFD20<<16, 0,0xD940<<16, 0,0xC560<<16
447 .long 0,0x9180<<16, 0,0x8DA0<<16, 0,0xA9C0<<16, 0,0xB5E0<<16
448.ascii "GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
449.align 4
450
451___
452$output=shift and open STDOUT,">$output";
453print $code;
454close STDOUT;
455
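
For reference: all of these modules schedule the same portable "4-bit" GHASH multiplication across registers. A minimal C sketch of that per-nibble loop, modelled on the generic gcm_gmult_4bit() path in gcm128.c (the function name, the uint8_t Xi[16] interface and the plain {hi,lo} Htable layout are illustrative assumptions, not the exact memory layout any particular module expects):

#include <stdint.h>
#include <stddef.h>

typedef struct { uint64_t hi, lo; } u128;

/* Reductions for the nibble shifted off the low end of Z, to be XORed
 * into the top 16 bits of Z.hi (same constants as the rem_4bit tables
 * in the listings). */
static const uint16_t rem_4bit[16] = {
    0x0000, 0x1C20, 0x3840, 0x2460, 0x7080, 0x6CA0, 0x48C0, 0x54E0,
    0xE100, 0xFD20, 0xD940, 0xC560, 0x9180, 0x8DA0, 0xA9C0, 0xB5E0
};

/* Multiply Xi (16 bytes, big-endian as in the GCM spec) by H via the
 * 16-entry Htable, one 4-bit nibble at a time. */
static void gcm_gmult_4bit_ref(uint8_t Xi[16], const u128 Htable[16])
{
    u128 Z;
    size_t rem, nlo, nhi;
    int cnt = 15;

    nlo  = Xi[15];
    nhi  = nlo >> 4;
    nlo &= 0xf;
    Z    = Htable[nlo];

    for (;;) {
        /* Z >>= 4, folding the dropped nibble back in via rem_4bit */
        rem  = (size_t)Z.lo & 0xf;
        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
        Z.hi = (Z.hi >> 4) ^ ((uint64_t)rem_4bit[rem] << 48);
        Z.hi ^= Htable[nhi].hi;
        Z.lo ^= Htable[nhi].lo;

        if (--cnt < 0)
            break;

        nlo  = Xi[cnt];
        nhi  = nlo >> 4;
        nlo &= 0xf;

        rem  = (size_t)Z.lo & 0xf;
        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
        Z.hi = (Z.hi >> 4) ^ ((uint64_t)rem_4bit[rem] << 48);
        Z.hi ^= Htable[nlo].hi;
        Z.lo ^= Htable[nlo].lo;
    }

    for (int i = 0; i < 8; i++) {       /* store Z back big-endian */
        Xi[i]     = (uint8_t)(Z.hi >> (56 - 8 * i));
        Xi[8 + i] = (uint8_t)(Z.lo >> (56 - 8 * i));
    }
}

The byte-swap sequence at the end of the Alpha routine above serves the same purpose as this final big-endian store, only with register shuffles instead of byte stores.
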
diff --git a/src/lib/libcrypto/modes/asm/ghash-armv4.pl b/src/lib/libcrypto/modes/asm/ghash-armv4.pl
deleted file mode 100644
index d91586ee29..0000000000
--- a/src/lib/libcrypto/modes/asm/ghash-armv4.pl
+++ /dev/null
@@ -1,429 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# April 2010
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that it
14# uses 256 bytes per-key table [+32 bytes shared table]. There is no
15# experimental performance data available yet. The only approximation
16# that can be made at this point is based on code size. Inner loop is
17# 32 instructions long and on single-issue core should execute in <40
18# cycles. Having verified that gcc 3.4 didn't unroll corresponding
19# loop, this assembler loop body was found to be ~3x smaller than
20# compiler-generated one...
21#
22# July 2010
23#
24# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on
25# Cortex A8 core and ~25 cycles per processed byte (which was observed
26# to be ~3 times faster than gcc-generated code:-)
27#
28# February 2011
29#
30# Profiler-assisted and platform-specific optimization resulted in 7%
31# improvement on Cortex A8 core and ~23.5 cycles per byte.
32#
33# March 2011
34#
35# Add NEON implementation featuring polynomial multiplication, i.e. no
36# lookup tables involved. On Cortex A8 it was measured to process one
37# byte in 15 cycles or 55% faster than integer-only code.
38
39# ====================================================================
40# Note about the "528B" variant. In the ARM case it makes less sense
41# to implement it, for the following reasons:
42#
43# - performance improvement won't be anywhere near 50%, because 128-
44# bit shift operation is neatly fused with 128-bit xor here, and
45# "538B" variant would eliminate only 4-5 instructions out of 32
46# in the inner loop (meaning that estimated improvement is ~15%);
47# - ARM-based systems are often embedded ones and extra memory
48# consumption might be unappreciated (for so little improvement);
49#
50# Byte order [in]dependence. =========================================
51#
52# Caller is expected to maintain specific *dword* order in Htable,
53# namely with *least* significant dword of 128-bit value at *lower*
54# address. This differs completely from C code and has everything to
55# do with ldm instruction and order in which dwords are "consumed" by
56# algorithm. *Byte* order within these dwords in turn is whatever
57# *native* byte order on current platform. See gcm128.c for working
58# example...
59
60while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
61open STDOUT,">$output";
62
63$Xi="r0"; # argument block
64$Htbl="r1";
65$inp="r2";
66$len="r3";
67
68$Zll="r4"; # variables
69$Zlh="r5";
70$Zhl="r6";
71$Zhh="r7";
72$Tll="r8";
73$Tlh="r9";
74$Thl="r10";
75$Thh="r11";
76$nlo="r12";
77################# r13 is stack pointer
78$nhi="r14";
79################# r15 is program counter
80
81$rem_4bit=$inp; # used in gcm_gmult_4bit
82$cnt=$len;
83
84sub Zsmash() {
85 my $i=12;
86 my @args=@_;
87 for ($Zll,$Zlh,$Zhl,$Zhh) {
88 $code.=<<___;
89#if __ARM_ARCH__>=7 && defined(__ARMEL__)
90 rev $_,$_
91 str $_,[$Xi,#$i]
92#elif defined(__ARMEB__)
93 str $_,[$Xi,#$i]
94#else
95 mov $Tlh,$_,lsr#8
96 strb $_,[$Xi,#$i+3]
97 mov $Thl,$_,lsr#16
98 strb $Tlh,[$Xi,#$i+2]
99 mov $Thh,$_,lsr#24
100 strb $Thl,[$Xi,#$i+1]
101 strb $Thh,[$Xi,#$i]
102#endif
103___
104 $code.="\t".shift(@args)."\n";
105 $i-=4;
106 }
107}
108
109$code=<<___;
110#include "arm_arch.h"
111
112.text
113.code 32
114
115.type rem_4bit,%object
116.align 5
117rem_4bit:
118.short 0x0000,0x1C20,0x3840,0x2460
119.short 0x7080,0x6CA0,0x48C0,0x54E0
120.short 0xE100,0xFD20,0xD940,0xC560
121.short 0x9180,0x8DA0,0xA9C0,0xB5E0
122.size rem_4bit,.-rem_4bit
123
124.type rem_4bit_get,%function
125rem_4bit_get:
126 sub $rem_4bit,pc,#8
127 sub $rem_4bit,$rem_4bit,#32 @ &rem_4bit
128 b .Lrem_4bit_got
129 nop
130.size rem_4bit_get,.-rem_4bit_get
131
132.global gcm_ghash_4bit
133.type gcm_ghash_4bit,%function
134gcm_ghash_4bit:
135 sub r12,pc,#8
136 add $len,$inp,$len @ $len to point at the end
137 stmdb sp!,{r3-r11,lr} @ save $len/end too
138 sub r12,r12,#48 @ &rem_4bit
139
140 ldmia r12,{r4-r11} @ copy rem_4bit ...
141 stmdb sp!,{r4-r11} @ ... to stack
142
143 ldrb $nlo,[$inp,#15]
144 ldrb $nhi,[$Xi,#15]
145.Louter:
146 eor $nlo,$nlo,$nhi
147 and $nhi,$nlo,#0xf0
148 and $nlo,$nlo,#0x0f
149 mov $cnt,#14
150
151 add $Zhh,$Htbl,$nlo,lsl#4
152 ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo]
153 add $Thh,$Htbl,$nhi
154 ldrb $nlo,[$inp,#14]
155
156 and $nhi,$Zll,#0xf @ rem
157 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
158 add $nhi,$nhi,$nhi
159 eor $Zll,$Tll,$Zll,lsr#4
160 ldrh $Tll,[sp,$nhi] @ rem_4bit[rem]
161 eor $Zll,$Zll,$Zlh,lsl#28
162 ldrb $nhi,[$Xi,#14]
163 eor $Zlh,$Tlh,$Zlh,lsr#4
164 eor $Zlh,$Zlh,$Zhl,lsl#28
165 eor $Zhl,$Thl,$Zhl,lsr#4
166 eor $Zhl,$Zhl,$Zhh,lsl#28
167 eor $Zhh,$Thh,$Zhh,lsr#4
168 eor $nlo,$nlo,$nhi
169 and $nhi,$nlo,#0xf0
170 and $nlo,$nlo,#0x0f
171 eor $Zhh,$Zhh,$Tll,lsl#16
172
173.Linner:
174 add $Thh,$Htbl,$nlo,lsl#4
175 and $nlo,$Zll,#0xf @ rem
176 subs $cnt,$cnt,#1
177 add $nlo,$nlo,$nlo
178 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
179 eor $Zll,$Tll,$Zll,lsr#4
180 eor $Zll,$Zll,$Zlh,lsl#28
181 eor $Zlh,$Tlh,$Zlh,lsr#4
182 eor $Zlh,$Zlh,$Zhl,lsl#28
183 ldrh $Tll,[sp,$nlo] @ rem_4bit[rem]
184 eor $Zhl,$Thl,$Zhl,lsr#4
185 ldrplb $nlo,[$inp,$cnt]
186 eor $Zhl,$Zhl,$Zhh,lsl#28
187 eor $Zhh,$Thh,$Zhh,lsr#4
188
189 add $Thh,$Htbl,$nhi
190 and $nhi,$Zll,#0xf @ rem
191 eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
192 add $nhi,$nhi,$nhi
193 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
194 eor $Zll,$Tll,$Zll,lsr#4
195 ldrplb $Tll,[$Xi,$cnt]
196 eor $Zll,$Zll,$Zlh,lsl#28
197 eor $Zlh,$Tlh,$Zlh,lsr#4
198 ldrh $Tlh,[sp,$nhi]
199 eor $Zlh,$Zlh,$Zhl,lsl#28
200 eor $Zhl,$Thl,$Zhl,lsr#4
201 eor $Zhl,$Zhl,$Zhh,lsl#28
202 eorpl $nlo,$nlo,$Tll
203 eor $Zhh,$Thh,$Zhh,lsr#4
204 andpl $nhi,$nlo,#0xf0
205 andpl $nlo,$nlo,#0x0f
206 eor $Zhh,$Zhh,$Tlh,lsl#16 @ ^= rem_4bit[rem]
207 bpl .Linner
208
209 ldr $len,[sp,#32] @ re-load $len/end
210 add $inp,$inp,#16
211 mov $nhi,$Zll
212___
213 &Zsmash("cmp\t$inp,$len","ldrneb\t$nlo,[$inp,#15]");
214$code.=<<___;
215 bne .Louter
216
217 add sp,sp,#36
218#if __ARM_ARCH__>=5
219 ldmia sp!,{r4-r11,pc}
220#else
221 ldmia sp!,{r4-r11,lr}
222 tst lr,#1
223 moveq pc,lr @ be binary compatible with V4, yet
224 bx lr @ interoperable with Thumb ISA:-)
225#endif
226.size gcm_ghash_4bit,.-gcm_ghash_4bit
227
228.global gcm_gmult_4bit
229.type gcm_gmult_4bit,%function
230gcm_gmult_4bit:
231 stmdb sp!,{r4-r11,lr}
232 ldrb $nlo,[$Xi,#15]
233 b rem_4bit_get
234.Lrem_4bit_got:
235 and $nhi,$nlo,#0xf0
236 and $nlo,$nlo,#0x0f
237 mov $cnt,#14
238
239 add $Zhh,$Htbl,$nlo,lsl#4
240 ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo]
241 ldrb $nlo,[$Xi,#14]
242
243 add $Thh,$Htbl,$nhi
244 and $nhi,$Zll,#0xf @ rem
245 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
246 add $nhi,$nhi,$nhi
247 eor $Zll,$Tll,$Zll,lsr#4
248 ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
249 eor $Zll,$Zll,$Zlh,lsl#28
250 eor $Zlh,$Tlh,$Zlh,lsr#4
251 eor $Zlh,$Zlh,$Zhl,lsl#28
252 eor $Zhl,$Thl,$Zhl,lsr#4
253 eor $Zhl,$Zhl,$Zhh,lsl#28
254 eor $Zhh,$Thh,$Zhh,lsr#4
255 and $nhi,$nlo,#0xf0
256 eor $Zhh,$Zhh,$Tll,lsl#16
257 and $nlo,$nlo,#0x0f
258
259.Loop:
260 add $Thh,$Htbl,$nlo,lsl#4
261 and $nlo,$Zll,#0xf @ rem
262 subs $cnt,$cnt,#1
263 add $nlo,$nlo,$nlo
264 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
265 eor $Zll,$Tll,$Zll,lsr#4
266 eor $Zll,$Zll,$Zlh,lsl#28
267 eor $Zlh,$Tlh,$Zlh,lsr#4
268 eor $Zlh,$Zlh,$Zhl,lsl#28
269 ldrh $Tll,[$rem_4bit,$nlo] @ rem_4bit[rem]
270 eor $Zhl,$Thl,$Zhl,lsr#4
271 ldrplb $nlo,[$Xi,$cnt]
272 eor $Zhl,$Zhl,$Zhh,lsl#28
273 eor $Zhh,$Thh,$Zhh,lsr#4
274
275 add $Thh,$Htbl,$nhi
276 and $nhi,$Zll,#0xf @ rem
277 eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
278 add $nhi,$nhi,$nhi
279 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
280 eor $Zll,$Tll,$Zll,lsr#4
281 eor $Zll,$Zll,$Zlh,lsl#28
282 eor $Zlh,$Tlh,$Zlh,lsr#4
283 ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
284 eor $Zlh,$Zlh,$Zhl,lsl#28
285 eor $Zhl,$Thl,$Zhl,lsr#4
286 eor $Zhl,$Zhl,$Zhh,lsl#28
287 eor $Zhh,$Thh,$Zhh,lsr#4
288 andpl $nhi,$nlo,#0xf0
289 andpl $nlo,$nlo,#0x0f
290 eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
291 bpl .Loop
292___
293 &Zsmash();
294$code.=<<___;
295#if __ARM_ARCH__>=5
296 ldmia sp!,{r4-r11,pc}
297#else
298 ldmia sp!,{r4-r11,lr}
299 tst lr,#1
300 moveq pc,lr @ be binary compatible with V4, yet
301 bx lr @ interoperable with Thumb ISA:-)
302#endif
303.size gcm_gmult_4bit,.-gcm_gmult_4bit
304___
305{
306my $cnt=$Htbl; # $Htbl is used once in the very beginning
307
308my ($Hhi, $Hlo, $Zo, $T, $xi, $mod) = map("d$_",(0..7));
309my ($Qhi, $Qlo, $Z, $R, $zero, $Qpost, $IN) = map("q$_",(8..15));
310
311# Z:Zo keeps 128-bit result shifted by 1 to the right, with bottom bit
312# in Zo. Or should I say "top bit", because GHASH is specified in
313# reverse bit order? Otherwise straightforward 128-bit H by one input
314# byte multiplication and modulo-reduction, times 16.
315
316sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
317sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
318sub Q() { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; }
319
320$code.=<<___;
321#if __ARM_ARCH__>=7
322.fpu neon
323
324.global gcm_gmult_neon
325.type gcm_gmult_neon,%function
326.align 4
327gcm_gmult_neon:
328 sub $Htbl,#16 @ point at H in GCM128_CTX
329 vld1.64 `&Dhi("$IN")`,[$Xi,:64]!@ load Xi
330 vmov.i32 $mod,#0xe1 @ our irreducible polynomial
331 vld1.64 `&Dlo("$IN")`,[$Xi,:64]!
332 vshr.u64 $mod,#32
333 vldmia $Htbl,{$Hhi-$Hlo} @ load H
334 veor $zero,$zero
335#ifdef __ARMEL__
336 vrev64.8 $IN,$IN
337#endif
338 veor $Qpost,$Qpost
339 veor $R,$R
340 mov $cnt,#16
341 veor $Z,$Z
342 mov $len,#16
343 veor $Zo,$Zo
344 vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
345 b .Linner_neon
346.size gcm_gmult_neon,.-gcm_gmult_neon
347
348.global gcm_ghash_neon
349.type gcm_ghash_neon,%function
350.align 4
351gcm_ghash_neon:
352 vld1.64 `&Dhi("$Z")`,[$Xi,:64]! @ load Xi
353 vmov.i32 $mod,#0xe1 @ our irreducible polynomial
354 vld1.64 `&Dlo("$Z")`,[$Xi,:64]!
355 vshr.u64 $mod,#32
356 vldmia $Xi,{$Hhi-$Hlo} @ load H
357 veor $zero,$zero
358 nop
359#ifdef __ARMEL__
360 vrev64.8 $Z,$Z
361#endif
362.Louter_neon:
363 vld1.64 `&Dhi($IN)`,[$inp]! @ load inp
364 veor $Qpost,$Qpost
365 vld1.64 `&Dlo($IN)`,[$inp]!
366 veor $R,$R
367 mov $cnt,#16
368#ifdef __ARMEL__
369 vrev64.8 $IN,$IN
370#endif
371 veor $Zo,$Zo
372 veor $IN,$Z @ inp^=Xi
373 veor $Z,$Z
374 vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
375.Linner_neon:
376 subs $cnt,$cnt,#1
377 vmull.p8 $Qlo,$Hlo,$xi @ H.lo·Xi[i]
378 vmull.p8 $Qhi,$Hhi,$xi @ H.hi·Xi[i]
379 vext.8 $IN,$zero,#1 @ IN>>=8
380
381 veor $Z,$Qpost @ modulo-scheduled part
382 vshl.i64 `&Dlo("$R")`,#48
383 vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
384 veor $T,`&Dlo("$Qlo")`,`&Dlo("$Z")`
385
386 veor `&Dhi("$Z")`,`&Dlo("$R")`
387 vuzp.8 $Qlo,$Qhi
388 vsli.8 $Zo,$T,#1 @ compose the "carry" byte
389 vext.8 $Z,$zero,#1 @ Z>>=8
390
391 vmull.p8 $R,$Zo,$mod @ "carry"·0xe1
392 vshr.u8 $Zo,$T,#7 @ save Z's bottom bit
393 vext.8 $Qpost,$Qlo,$zero,#1 @ Qlo>>=8
394 veor $Z,$Qhi
395 bne .Linner_neon
396
397 veor $Z,$Qpost @ modulo-scheduled artefact
398 vshl.i64 `&Dlo("$R")`,#48
399 veor `&Dhi("$Z")`,`&Dlo("$R")`
400
401 @ finalization, normalize Z:Zo
402 vand $Zo,$mod @ suffices to mask the bit
403 vshr.u64 `&Dhi(&Q("$Zo"))`,`&Dlo("$Z")`,#63
404 vshl.i64 $Z,#1
405 subs $len,#16
406 vorr $Z,`&Q("$Zo")` @ Z=Z:Zo<<1
407 bne .Louter_neon
408
409#ifdef __ARMEL__
410 vrev64.8 $Z,$Z
411#endif
412 sub $Xi,#16
413 vst1.64 `&Dhi("$Z")`,[$Xi,:64]! @ write out Xi
414 vst1.64 `&Dlo("$Z")`,[$Xi,:64]
415
416 bx lr
417.size gcm_ghash_neon,.-gcm_ghash_neon
418#endif
419___
420}
421$code.=<<___;
422.asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
423.align 2
424___
425
426$code =~ s/\`([^\`]*)\`/eval $1/gem;
427$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
428print $code;
429close STDOUT; # enforce flush
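
The "256 bytes per-key table" that both file headers above refer to is a 16-entry table of 128-bit values, Htable[i] = i·H in GF(2^128). As a hedged illustration of how such a table can be derived (this mirrors the generic gcm_init_4bit() logic in gcm128.c; the names and the plain {hi,lo} layout are assumptions here, and the ARM code above additionally expects its own dword order in memory, as its byte-order note explains):

#include <stdint.h>

typedef struct { uint64_t hi, lo; } u128;

/* One right shift in GCM's reflected bit order: if a bit drops off the
 * bottom, fold in the 0xE1 reduction pattern at the top. */
static void reduce1bit(u128 *V)
{
    uint64_t mask = 0 - (V->lo & 1);    /* all-ones iff the low bit was set */
    V->lo = (V->hi << 63) | (V->lo >> 1);
    V->hi = (V->hi >> 1) ^ (mask & 0xe100000000000000ULL);
}

/* Fill the 16-entry (16 x 16 bytes = 256 byte) per-key table. Htable[8]
 * holds H itself, each halving of the index applies one more reduce1bit
 * step, and the remaining entries follow by GF(2)-linearity. */
static void gcm_init_4bit_ref(u128 Htable[16], u128 H)
{
    Htable[0].hi = Htable[0].lo = 0;
    Htable[8] = H;
    reduce1bit(&H); Htable[4] = H;
    reduce1bit(&H); Htable[2] = H;
    reduce1bit(&H); Htable[1] = H;

    for (int i = 2; i < 16; i <<= 1)
        for (int j = 1; j < i; j++) {
            Htable[i + j].hi = Htable[i].hi ^ Htable[j].hi;
            Htable[i + j].lo = Htable[i].lo ^ Htable[j].lo;
        }
}

The "528B" variant mentioned in the ARM note above, and implemented in the IA-64 file below, essentially adds a pre-shifted copy of this table (Hshr4) plus a larger shared rem_8bit table so that eight bits of Xi can be consumed per reduction step.
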
diff --git a/src/lib/libcrypto/modes/asm/ghash-ia64.pl b/src/lib/libcrypto/modes/asm/ghash-ia64.pl
deleted file mode 100755
index 0354c95444..0000000000
--- a/src/lib/libcrypto/modes/asm/ghash-ia64.pl
+++ /dev/null
@@ -1,463 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# March 2010
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that it
14# uses 256 bytes per-key table [+128 bytes shared table]. Streamed
15# GHASH performance was measured to be 6.67 cycles per processed byte
16# on Itanium 2, which is >90% better than Microsoft compiler generated
17# code. To anchor to something else, the sha1-ia64.pl module processes one
18# byte in 5.7 cycles. On Itanium GHASH should run at ~8.5 cycles per
19# byte.
20
21# September 2010
22#
23# It was originally thought that it made less sense to implement the
24# "528B" variant on Itanium 2 for the following reason. Because the number of
25# functional units is naturally limited, it appeared impossible to
26# implement "528B" loop in 4 cycles, only in 5. This would mean that
27# theoretically performance improvement couldn't be more than 20%.
28# But occasionally you prove yourself wrong:-) I figured out a way to
29# fold a couple of instructions and, having freed yet another instruction
30# slot by unrolling the loop... The resulting performance is 4.45 cycles
31# per processed byte and 50% better than "256B" version. On original
32# Itanium performance should remain the same as the "256B" version,
33# i.e. ~8.5 cycles.
34
35$output=shift and (open STDOUT,">$output" or die "can't open $output: $!");
36
37if ($^O eq "hpux") {
38 $ADDP="addp4";
39 for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
40} else { $ADDP="add"; }
41for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/);
42 $big_endian=0 if (/\-DL_ENDIAN/); }
43if (!defined($big_endian))
44 { $big_endian=(unpack('L',pack('N',1))==1); }
45
46sub loop() {
47my $label=shift;
48my ($p16,$p17)=(shift)?("p63","p63"):("p16","p17"); # mask references to inp
49
50# Loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium, i.e.
51# in scalable manner;-) Naturally assuming data in L1 cache...
52# Special note about 'dep' instruction, which is used to construct
53# &rem_4bit[Zlo&0xf]. It works, because rem_4bit is aligned at 128
54# bytes boundary and lower 7 bits of its address are guaranteed to
55# be zero.
56$code.=<<___;
57$label:
58{ .mfi; (p18) ld8 Hlo=[Hi[1]],-8
59 (p19) dep rem=Zlo,rem_4bitp,3,4 }
60{ .mfi; (p19) xor Zhi=Zhi,Hhi
61 ($p17) xor xi[1]=xi[1],in[1] };;
62{ .mfi; (p18) ld8 Hhi=[Hi[1]]
63 (p19) shrp Zlo=Zhi,Zlo,4 }
64{ .mfi; (p19) ld8 rem=[rem]
65 (p18) and Hi[1]=mask0xf0,xi[2] };;
66{ .mmi; ($p16) ld1 in[0]=[inp],-1
67 (p18) xor Zlo=Zlo,Hlo
68 (p19) shr.u Zhi=Zhi,4 }
69{ .mib; (p19) xor Hhi=Hhi,rem
70 (p18) add Hi[1]=Htbl,Hi[1] };;
71
72{ .mfi; (p18) ld8 Hlo=[Hi[1]],-8
73 (p18) dep rem=Zlo,rem_4bitp,3,4 }
74{ .mfi; (p17) shladd Hi[0]=xi[1],4,r0
75 (p18) xor Zhi=Zhi,Hhi };;
76{ .mfi; (p18) ld8 Hhi=[Hi[1]]
77 (p18) shrp Zlo=Zhi,Zlo,4 }
78{ .mfi; (p18) ld8 rem=[rem]
79 (p17) and Hi[0]=mask0xf0,Hi[0] };;
80{ .mmi; (p16) ld1 xi[0]=[Xi],-1
81 (p18) xor Zlo=Zlo,Hlo
82 (p18) shr.u Zhi=Zhi,4 }
83{ .mib; (p18) xor Hhi=Hhi,rem
84 (p17) add Hi[0]=Htbl,Hi[0]
85 br.ctop.sptk $label };;
86___
87}
88
89$code=<<___;
90.explicit
91.text
92
93prevfs=r2; prevlc=r3; prevpr=r8;
94mask0xf0=r21;
95rem=r22; rem_4bitp=r23;
96Xi=r24; Htbl=r25;
97inp=r26; end=r27;
98Hhi=r28; Hlo=r29;
99Zhi=r30; Zlo=r31;
100
101.align 128
102.skip 16 // aligns loop body
103.global gcm_gmult_4bit#
104.proc gcm_gmult_4bit#
105gcm_gmult_4bit:
106 .prologue
107{ .mmi; .save ar.pfs,prevfs
108 alloc prevfs=ar.pfs,2,6,0,8
109 $ADDP Xi=15,in0 // &Xi[15]
110 mov rem_4bitp=ip }
111{ .mii; $ADDP Htbl=8,in1 // &Htbl[0].lo
112 .save ar.lc,prevlc
113 mov prevlc=ar.lc
114 .save pr,prevpr
115 mov prevpr=pr };;
116
117 .body
118 .rotr in[3],xi[3],Hi[2]
119
120{ .mib; ld1 xi[2]=[Xi],-1 // Xi[15]
121 mov mask0xf0=0xf0
122 brp.loop.imp .Loop1,.Lend1-16};;
123{ .mmi; ld1 xi[1]=[Xi],-1 // Xi[14]
124 };;
125{ .mii; shladd Hi[1]=xi[2],4,r0
126 mov pr.rot=0x7<<16
127 mov ar.lc=13 };;
128{ .mii; and Hi[1]=mask0xf0,Hi[1]
129 mov ar.ec=3
130 xor Zlo=Zlo,Zlo };;
131{ .mii; add Hi[1]=Htbl,Hi[1] // &Htbl[nlo].lo
132 add rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp
133 xor Zhi=Zhi,Zhi };;
134___
135 &loop (".Loop1",1);
136$code.=<<___;
137.Lend1:
138{ .mib; xor Zhi=Zhi,Hhi };; // modulo-scheduling artefact
139{ .mib; mux1 Zlo=Zlo,\@rev };;
140{ .mib; mux1 Zhi=Zhi,\@rev };;
141{ .mmi; add Hlo=9,Xi;; // ;; is here to prevent
142 add Hhi=1,Xi };; // pipeline flush on Itanium
143{ .mib; st8 [Hlo]=Zlo
144 mov pr=prevpr,0x1ffff };;
145{ .mib; st8 [Hhi]=Zhi
146 mov ar.lc=prevlc
147 br.ret.sptk.many b0 };;
148.endp gcm_gmult_4bit#
149___
150
151######################################################################
152# "528B" (well, "512B" actualy) streamed GHASH
153#
154$Xip="in0";
155$Htbl="in1";
156$inp="in2";
157$len="in3";
158$rem_8bit="loc0";
159$mask0xff="loc1";
160($sum,$rum) = $big_endian ? ("nop.m","nop.m") : ("sum","rum");
161
162sub load_htable() {
163 for (my $i=0;$i<8;$i++) {
164 $code.=<<___;
165{ .mmi; ld8 r`16+2*$i+1`=[r8],16 // Htable[$i].hi
166 ld8 r`16+2*$i`=[r9],16 } // Htable[$i].lo
167{ .mmi; ldf8 f`32+2*$i+1`=[r10],16 // Htable[`8+$i`].hi
168 ldf8 f`32+2*$i`=[r11],16 // Htable[`8+$i`].lo
169___
170 $code.=shift if (($i+$#_)==7);
171 $code.="\t};;\n"
172 }
173}
174
175$code.=<<___;
176prevsp=r3;
177
178.align 32
179.skip 16 // aligns loop body
180.global gcm_ghash_4bit#
181.proc gcm_ghash_4bit#
182gcm_ghash_4bit:
183 .prologue
184{ .mmi; .save ar.pfs,prevfs
185 alloc prevfs=ar.pfs,4,2,0,0
186 .vframe prevsp
187 mov prevsp=sp
188 mov $rem_8bit=ip };;
189 .body
190{ .mfi; $ADDP r8=0+0,$Htbl
191 $ADDP r9=0+8,$Htbl }
192{ .mfi; $ADDP r10=128+0,$Htbl
193 $ADDP r11=128+8,$Htbl };;
194___
195 &load_htable(
196 " $ADDP $Xip=15,$Xip", # &Xi[15]
197 " $ADDP $len=$len,$inp", # &inp[len]
198 " $ADDP $inp=15,$inp", # &inp[15]
199 " mov $mask0xff=0xff",
200 " add sp=-512,sp",
201 " andcm sp=sp,$mask0xff", # align stack frame
202 " add r14=0,sp",
203 " add r15=8,sp");
204$code.=<<___;
205{ .mmi; $sum 1<<1 // go big-endian
206 add r8=256+0,sp
207 add r9=256+8,sp }
208{ .mmi; add r10=256+128+0,sp
209 add r11=256+128+8,sp
210 add $len=-17,$len };;
211___
212for($i=0;$i<8;$i++) { # generate first half of Hshr4[]
213my ($rlo,$rhi)=("r".eval(16+2*$i),"r".eval(16+2*$i+1));
214$code.=<<___;
215{ .mmi; st8 [r8]=$rlo,16 // Htable[$i].lo
216 st8 [r9]=$rhi,16 // Htable[$i].hi
217 shrp $rlo=$rhi,$rlo,4 }//;;
218{ .mmi; stf8 [r10]=f`32+2*$i`,16 // Htable[`8+$i`].lo
219 stf8 [r11]=f`32+2*$i+1`,16 // Htable[`8+$i`].hi
220 shr.u $rhi=$rhi,4 };;
221{ .mmi; st8 [r14]=$rlo,16 // Htable[$i].lo>>4
222 st8 [r15]=$rhi,16 }//;; // Htable[$i].hi>>4
223___
224}
225$code.=<<___;
226{ .mmi; ld8 r16=[r8],16 // Htable[8].lo
227 ld8 r17=[r9],16 };; // Htable[8].hi
228{ .mmi; ld8 r18=[r8],16 // Htable[9].lo
229 ld8 r19=[r9],16 } // Htable[9].hi
230{ .mmi; rum 1<<5 // clear um.mfh
231 shrp r16=r17,r16,4 };;
232___
233for($i=0;$i<6;$i++) { # generate second half of Hshr4[]
234$code.=<<___;
235{ .mmi; ld8 r`20+2*$i`=[r8],16 // Htable[`10+$i`].lo
236 ld8 r`20+2*$i+1`=[r9],16 // Htable[`10+$i`].hi
237 shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };;
238{ .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4
239 st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4
240 shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 }
241___
242}
243$code.=<<___;
244{ .mmi; shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };;
245{ .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4
246 st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4
247 shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 }
248{ .mmi; add $Htbl=256,sp // &Htable[0]
249 add $rem_8bit=rem_8bit#-gcm_ghash_4bit#,$rem_8bit
250 shr.u r`18+2*$i+1`=r`18+2*$i+1`,4 };;
251{ .mmi; st8 [r14]=r`18+2*$i` // Htable[`8+$i`].lo>>4
252 st8 [r15]=r`18+2*$i+1` } // Htable[`8+$i`].hi>>4
253___
254
255$in="r15";
256@xi=("r16","r17");
257@rem=("r18","r19");
258($Alo,$Ahi,$Blo,$Bhi,$Zlo,$Zhi)=("r20","r21","r22","r23","r24","r25");
259($Atbl,$Btbl)=("r26","r27");
260
261$code.=<<___; # (p16)
262{ .mmi; ld1 $in=[$inp],-1 //(p16) *inp--
263 ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
264 cmp.eq p0,p6=r0,r0 };; // clear p6
265___
266push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
267
268$code.=<<___; # (p16),(p17)
269{ .mmi; ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
270 xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
271{ .mii; ld1 $in=[$inp],-1 //(p16) *inp--
272 dep $Atbl=$xi[1],$Htbl,4,4 //(p17) &Htable[nlo].lo
273 and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
274.align 32
275.LOOP:
276{ .mmi;
277(p6) st8 [$Xip]=$Zhi,13
278 xor $Zlo=$Zlo,$Zlo
279 add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi].lo
280___
281push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
282
283$code.=<<___; # (p16),(p17),(p18)
284{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
285 ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
286 xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
287{ .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
288 dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo
289{ .mfi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
290 xor $Zlo=$Zlo,$Alo };; //(p18) Z.lo^=Htable[nlo].lo
291{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
292 ld1 $in=[$inp],-1 } //(p16) *inp--
293{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
294 mov $Zhi=$Ahi //(p18) Z.hi^=Htable[nlo].hi
295 and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
296{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
297 ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
298 shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
299{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
300 add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
301___
302push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
303
304for ($i=1;$i<14;$i++) {
305# Above and below fragments are derived from this one by removing
306# unsuitable (p??) instructions.
307$code.=<<___; # (p16),(p17),(p18),(p19)
308{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
309 ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
310 shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
311{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
312 xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
313 xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
314{ .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
315 ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
316 dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo
317{ .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
318 xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo
319 xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
320{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
321 ld1 $in=[$inp],-1 //(p16) *inp--
322 shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
323{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
324 xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi
325 and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
326{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
327 ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
328 shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
329{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
330 xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
331 add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
332___
333push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
334}
335
336$code.=<<___; # (p17),(p18),(p19)
337{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
338 ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
339 shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
340{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
341 xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
342 xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
343{ .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
344 ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
345 dep $Atbl=$xi[1],$Htbl,4,4 };; //(p17) &Htable[nlo].lo
346{ .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
347 xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo
348 xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
349{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
350 shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
351{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
352 xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi
353 and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
354{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
355 shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
356{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
357 xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
358 add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
359___
360push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
361
362$code.=<<___; # (p18),(p19)
363{ .mfi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
364 shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
365{ .mfi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
366 xor $Zlo=$Zlo,$Blo };; //(p19) Z.lo^=Hshr4[nhi].lo
367{ .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
368 xor $Zlo=$Zlo,$Alo } //(p18) Z.lo^=Htable[nlo].lo
369{ .mfi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
370 xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
371{ .mfi; ld8 $Blo=[$Btbl],8 //(p18) Htable[nhi].lo,&Htable[nhi].hi
372 shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
373{ .mfi; shladd $rem[0]=$Zlo,4,r0 //(p18) Z.lo<<4
374 xor $Zhi=$Zhi,$Ahi };; //(p18) Z.hi^=Htable[nlo].hi
375{ .mfi; ld8 $Bhi=[$Btbl] //(p18) Htable[nhi].hi
376 shrp $Zlo=$Zhi,$Zlo,4 } //(p18) Z.lo=(Z.hi<<60)|(Z.lo>>4)
377{ .mfi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
378 xor $Zhi=$Zhi,$rem[1] };; //(p19) Z.hi^=rem_8bit[rem]<<48
379___
380push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
381
382$code.=<<___; # (p19)
383{ .mmi; cmp.ltu p6,p0=$inp,$len
384 add $inp=32,$inp
385 shr.u $Zhi=$Zhi,4 } //(p19) Z.hi>>=4
386{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
387 xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
388 add $Xip=9,$Xip };; // &Xi.lo
389{ .mmi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
390(p6) ld1 $in=[$inp],-1 //[p16] *inp--
391(p6) extr.u $xi[1]=$Zlo,8,8 } //[p17] Xi[14]
392{ .mmi; xor $Zhi=$Zhi,$Bhi //(p19) Z.hi^=Hshr4[nhi].hi
393(p6) and $xi[0]=$Zlo,$mask0xff };; //[p16] Xi[15]
394{ .mmi; st8 [$Xip]=$Zlo,-8
395(p6) xor $xi[0]=$xi[0],$in //[p17] xi=$xi[i]^inp[i]
396 shl $rem[1]=$rem[1],48 };; //(p19) rem_8bit[rem]<<48
397{ .mmi;
398(p6) ld1 $in=[$inp],-1 //[p16] *inp--
399 xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
400(p6) dep $Atbl=$xi[0],$Htbl,4,4 } //[p17] &Htable[nlo].lo
401{ .mib;
402(p6) and $xi[0]=-16,$xi[0] //[p17] nhi=xi&0xf0
403(p6) br.cond.dptk.many .LOOP };;
404
405{ .mib; st8 [$Xip]=$Zhi };;
406{ .mib; $rum 1<<1 // return to little-endian
407 .restore sp
408 mov sp=prevsp
409 br.ret.sptk.many b0 };;
410.endp gcm_ghash_4bit#
411___
412$code.=<<___;
413.align 128
414.type rem_4bit#,\@object
415rem_4bit:
416 data8 0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
417 data8 0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
418 data8 0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
419 data8 0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
420.size rem_4bit#,128
421.type rem_8bit#,\@object
422rem_8bit:
423 data1 0x00,0x00, 0x01,0xC2, 0x03,0x84, 0x02,0x46, 0x07,0x08, 0x06,0xCA, 0x04,0x8C, 0x05,0x4E
424 data1 0x0E,0x10, 0x0F,0xD2, 0x0D,0x94, 0x0C,0x56, 0x09,0x18, 0x08,0xDA, 0x0A,0x9C, 0x0B,0x5E
425 data1 0x1C,0x20, 0x1D,0xE2, 0x1F,0xA4, 0x1E,0x66, 0x1B,0x28, 0x1A,0xEA, 0x18,0xAC, 0x19,0x6E
426 data1 0x12,0x30, 0x13,0xF2, 0x11,0xB4, 0x10,0x76, 0x15,0x38, 0x14,0xFA, 0x16,0xBC, 0x17,0x7E
427 data1 0x38,0x40, 0x39,0x82, 0x3B,0xC4, 0x3A,0x06, 0x3F,0x48, 0x3E,0x8A, 0x3C,0xCC, 0x3D,0x0E
428 data1 0x36,0x50, 0x37,0x92, 0x35,0xD4, 0x34,0x16, 0x31,0x58, 0x30,0x9A, 0x32,0xDC, 0x33,0x1E
429 data1 0x24,0x60, 0x25,0xA2, 0x27,0xE4, 0x26,0x26, 0x23,0x68, 0x22,0xAA, 0x20,0xEC, 0x21,0x2E
430 data1 0x2A,0x70, 0x2B,0xB2, 0x29,0xF4, 0x28,0x36, 0x2D,0x78, 0x2C,0xBA, 0x2E,0xFC, 0x2F,0x3E
431 data1 0x70,0x80, 0x71,0x42, 0x73,0x04, 0x72,0xC6, 0x77,0x88, 0x76,0x4A, 0x74,0x0C, 0x75,0xCE
432 data1 0x7E,0x90, 0x7F,0x52, 0x7D,0x14, 0x7C,0xD6, 0x79,0x98, 0x78,0x5A, 0x7A,0x1C, 0x7B,0xDE
433 data1 0x6C,0xA0, 0x6D,0x62, 0x6F,0x24, 0x6E,0xE6, 0x6B,0xA8, 0x6A,0x6A, 0x68,0x2C, 0x69,0xEE
434 data1 0x62,0xB0, 0x63,0x72, 0x61,0x34, 0x60,0xF6, 0x65,0xB8, 0x64,0x7A, 0x66,0x3C, 0x67,0xFE
435 data1 0x48,0xC0, 0x49,0x02, 0x4B,0x44, 0x4A,0x86, 0x4F,0xC8, 0x4E,0x0A, 0x4C,0x4C, 0x4D,0x8E
436 data1 0x46,0xD0, 0x47,0x12, 0x45,0x54, 0x44,0x96, 0x41,0xD8, 0x40,0x1A, 0x42,0x5C, 0x43,0x9E
437 data1 0x54,0xE0, 0x55,0x22, 0x57,0x64, 0x56,0xA6, 0x53,0xE8, 0x52,0x2A, 0x50,0x6C, 0x51,0xAE
438 data1 0x5A,0xF0, 0x5B,0x32, 0x59,0x74, 0x58,0xB6, 0x5D,0xF8, 0x5C,0x3A, 0x5E,0x7C, 0x5F,0xBE
439 data1 0xE1,0x00, 0xE0,0xC2, 0xE2,0x84, 0xE3,0x46, 0xE6,0x08, 0xE7,0xCA, 0xE5,0x8C, 0xE4,0x4E
440 data1 0xEF,0x10, 0xEE,0xD2, 0xEC,0x94, 0xED,0x56, 0xE8,0x18, 0xE9,0xDA, 0xEB,0x9C, 0xEA,0x5E
441 data1 0xFD,0x20, 0xFC,0xE2, 0xFE,0xA4, 0xFF,0x66, 0xFA,0x28, 0xFB,0xEA, 0xF9,0xAC, 0xF8,0x6E
442 data1 0xF3,0x30, 0xF2,0xF2, 0xF0,0xB4, 0xF1,0x76, 0xF4,0x38, 0xF5,0xFA, 0xF7,0xBC, 0xF6,0x7E
443 data1 0xD9,0x40, 0xD8,0x82, 0xDA,0xC4, 0xDB,0x06, 0xDE,0x48, 0xDF,0x8A, 0xDD,0xCC, 0xDC,0x0E
444 data1 0xD7,0x50, 0xD6,0x92, 0xD4,0xD4, 0xD5,0x16, 0xD0,0x58, 0xD1,0x9A, 0xD3,0xDC, 0xD2,0x1E
445 data1 0xC5,0x60, 0xC4,0xA2, 0xC6,0xE4, 0xC7,0x26, 0xC2,0x68, 0xC3,0xAA, 0xC1,0xEC, 0xC0,0x2E
446 data1 0xCB,0x70, 0xCA,0xB2, 0xC8,0xF4, 0xC9,0x36, 0xCC,0x78, 0xCD,0xBA, 0xCF,0xFC, 0xCE,0x3E
447 data1 0x91,0x80, 0x90,0x42, 0x92,0x04, 0x93,0xC6, 0x96,0x88, 0x97,0x4A, 0x95,0x0C, 0x94,0xCE
448 data1 0x9F,0x90, 0x9E,0x52, 0x9C,0x14, 0x9D,0xD6, 0x98,0x98, 0x99,0x5A, 0x9B,0x1C, 0x9A,0xDE
449 data1 0x8D,0xA0, 0x8C,0x62, 0x8E,0x24, 0x8F,0xE6, 0x8A,0xA8, 0x8B,0x6A, 0x89,0x2C, 0x88,0xEE
450 data1 0x83,0xB0, 0x82,0x72, 0x80,0x34, 0x81,0xF6, 0x84,0xB8, 0x85,0x7A, 0x87,0x3C, 0x86,0xFE
451 data1 0xA9,0xC0, 0xA8,0x02, 0xAA,0x44, 0xAB,0x86, 0xAE,0xC8, 0xAF,0x0A, 0xAD,0x4C, 0xAC,0x8E
452 data1 0xA7,0xD0, 0xA6,0x12, 0xA4,0x54, 0xA5,0x96, 0xA0,0xD8, 0xA1,0x1A, 0xA3,0x5C, 0xA2,0x9E
453 data1 0xB5,0xE0, 0xB4,0x22, 0xB6,0x64, 0xB7,0xA6, 0xB2,0xE8, 0xB3,0x2A, 0xB1,0x6C, 0xB0,0xAE
454 data1 0xBB,0xF0, 0xBA,0x32, 0xB8,0x74, 0xB9,0xB6, 0xBC,0xF8, 0xBD,0x3A, 0xBF,0x7C, 0xBE,0xBE
455.size rem_8bit#,512
456stringz "GHASH for IA64, CRYPTOGAMS by <appro\@openssl.org>"
457___
458
459$code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm if ($big_endian);
460$code =~ s/\`([^\`]*)\`/eval $1/gem;
461
462print $code;
463close STDOUT;
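
A note on the rem_4bit constants (0x1C20, 0x3840, ...) that appear in every one of these files: they are the GF(2^128) reductions of the four bits shifted off the low end of Z, pre-positioned so they can be XORed into the top 16 bits of Z's high word (the 64-bit listings store them already shifted, e.g. data8 0x1C20<<48 on IA-64, while the 32-bit ARM code shifts by 16 at the point of use). Because reduction is linear over GF(2), the whole table can be regenerated from the single-bit pattern 0xE1; a small standalone C check, offered as an illustrative sketch rather than code from this tree:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* Expected values, copied from the rem_4bit tables in the listings. */
    static const uint16_t expected[16] = {
        0x0000, 0x1C20, 0x3840, 0x2460, 0x7080, 0x6CA0, 0x48C0, 0x54E0,
        0xE100, 0xFD20, 0xD940, 0xC560, 0x9180, 0x8DA0, 0xA9C0, 0xB5E0
    };
    uint16_t rem_4bit[16] = { 0 };

    /* A single dropped bit reduces to the 0xE1 pattern; the nibble's top
     * bit lands at 0xE1 << 8 and each lower bit shifts it right by one. */
    for (int b = 0; b < 4; b++)
        rem_4bit[1 << b] = (uint16_t)(0xE100 >> (3 - b));

    /* Everything else is the XOR of its single-bit components. */
    for (int i = 3; i < 16; i++)
        if (i & (i - 1))
            rem_4bit[i] = rem_4bit[i & (i - 1)] ^ rem_4bit[i & -i];

    for (int i = 0; i < 16; i++) {
        assert(rem_4bit[i] == expected[i]);
        printf("0x%04X%c", rem_4bit[i], (i % 4 == 3) ? '\n' : ' ');
    }
    return 0;
}

The per-key Htable entries, by contrast, depend on H and cannot be shared, which is why only the rem_4bit/rem_8bit tables are emitted as read-only data in these modules (the "shared table" the headers mention).
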
diff --git a/src/lib/libcrypto/modes/asm/ghash-parisc.pl b/src/lib/libcrypto/modes/asm/ghash-parisc.pl
deleted file mode 100644
index 965802d3fa..0000000000
--- a/src/lib/libcrypto/modes/asm/ghash-parisc.pl
+++ /dev/null
@@ -1,741 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# April 2010
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that it
14# uses 256 bytes per-key table [+128 bytes shared table]. On PA-7100LC
15# it processes one byte in 19.6 cycles, which is more than twice as
16# fast as code generated by gcc 3.2. PA-RISC 2.0 loop is scheduled for
17# 8 cycles, but measured performance on PA-8600 system is ~9 cycles per
18# processed byte. This is ~2.2x faster than 64-bit code generated by
19# vendor compiler (which used to be very hard to beat:-).
20#
21# Special thanks to polarhome.com for providing HP-UX account.
22
23$flavour = shift;
24$output = shift;
25open STDOUT,">$output";
26
27if ($flavour =~ /64/) {
28 $LEVEL ="2.0W";
29 $SIZE_T =8;
30 $FRAME_MARKER =80;
31 $SAVED_RP =16;
32 $PUSH ="std";
33 $PUSHMA ="std,ma";
34 $POP ="ldd";
35 $POPMB ="ldd,mb";
36 $NREGS =6;
37} else {
38 $LEVEL ="1.0"; #"\n\t.ALLOW\t2.0";
39 $SIZE_T =4;
40 $FRAME_MARKER =48;
41 $SAVED_RP =20;
42 $PUSH ="stw";
43 $PUSHMA ="stwm";
44 $POP ="ldw";
45 $POPMB ="ldwm";
46 $NREGS =11;
47}
48
49$FRAME=10*$SIZE_T+$FRAME_MARKER;# NREGS saved regs + frame marker
50 # [+ argument transfer]
51
52################# volatile registers
53$Xi="%r26"; # argument block
54$Htbl="%r25";
55$inp="%r24";
56$len="%r23";
57$Hhh=$Htbl; # variables
58$Hll="%r22";
59$Zhh="%r21";
60$Zll="%r20";
61$cnt="%r19";
62$rem_4bit="%r28";
63$rem="%r29";
64$mask0xf0="%r31";
65
66################# preserved registers
67$Thh="%r1";
68$Tll="%r2";
69$nlo="%r3";
70$nhi="%r4";
71$byte="%r5";
72if ($SIZE_T==4) {
73 $Zhl="%r6";
74 $Zlh="%r7";
75 $Hhl="%r8";
76 $Hlh="%r9";
77 $Thl="%r10";
78 $Tlh="%r11";
79}
80$rem2="%r6"; # used in PA-RISC 2.0 code
81
82$code.=<<___;
83 .LEVEL $LEVEL
84#if 0
85 .SPACE \$TEXT\$
86 .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
87#else
88 .text
89#endif
90
91 .EXPORT gcm_gmult_4bit,ENTRY,ARGW0=GR,ARGW1=GR
92 .ALIGN 64
93gcm_gmult_4bit
94 .PROC
95 .CALLINFO FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS
96 .ENTRY
97 $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
98 $PUSHMA %r3,$FRAME(%sp)
99 $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
100 $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
101 $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
102___
103$code.=<<___ if ($SIZE_T==4);
104 $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
105 $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
106 $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
107 $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
108 $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
109___
110$code.=<<___;
111 blr %r0,$rem_4bit
112 ldi 3,$rem
113L\$pic_gmult
114 andcm $rem_4bit,$rem,$rem_4bit
115 addl $inp,$len,$len
116 ldo L\$rem_4bit-L\$pic_gmult($rem_4bit),$rem_4bit
117 ldi 0xf0,$mask0xf0
118___
119$code.=<<___ if ($SIZE_T==4);
120#ifndef __OpenBSD__
121 ldi 31,$rem
122 mtctl $rem,%cr11
123 extrd,u,*= $rem,%sar,1,$rem ; executes on PA-RISC 1.0
124 b L\$parisc1_gmult
125 nop
126___
127
128$code.=<<___;
129 ldb 15($Xi),$nlo
130 ldo 8($Htbl),$Hll
131
132 and $mask0xf0,$nlo,$nhi
133 depd,z $nlo,59,4,$nlo
134
135 ldd $nlo($Hll),$Zll
136 ldd $nlo($Hhh),$Zhh
137
138 depd,z $Zll,60,4,$rem
139 shrpd $Zhh,$Zll,4,$Zll
140 extrd,u $Zhh,59,60,$Zhh
141 ldb 14($Xi),$nlo
142
143 ldd $nhi($Hll),$Tll
144 ldd $nhi($Hhh),$Thh
145 and $mask0xf0,$nlo,$nhi
146 depd,z $nlo,59,4,$nlo
147
148 xor $Tll,$Zll,$Zll
149 xor $Thh,$Zhh,$Zhh
150 ldd $rem($rem_4bit),$rem
151 b L\$oop_gmult_pa2
152 ldi 13,$cnt
153
154 .ALIGN 8
155L\$oop_gmult_pa2
156 xor $rem,$Zhh,$Zhh ; moved here to work around gas bug
157 depd,z $Zll,60,4,$rem
158
159 shrpd $Zhh,$Zll,4,$Zll
160 extrd,u $Zhh,59,60,$Zhh
161 ldd $nlo($Hll),$Tll
162 ldd $nlo($Hhh),$Thh
163
164 xor $Tll,$Zll,$Zll
165 xor $Thh,$Zhh,$Zhh
166 ldd $rem($rem_4bit),$rem
167
168 xor $rem,$Zhh,$Zhh
169 depd,z $Zll,60,4,$rem
170 ldbx $cnt($Xi),$nlo
171
172 shrpd $Zhh,$Zll,4,$Zll
173 extrd,u $Zhh,59,60,$Zhh
174 ldd $nhi($Hll),$Tll
175 ldd $nhi($Hhh),$Thh
176
177 and $mask0xf0,$nlo,$nhi
178 depd,z $nlo,59,4,$nlo
179 ldd $rem($rem_4bit),$rem
180
181 xor $Tll,$Zll,$Zll
182 addib,uv -1,$cnt,L\$oop_gmult_pa2
183 xor $Thh,$Zhh,$Zhh
184
185 xor $rem,$Zhh,$Zhh
186 depd,z $Zll,60,4,$rem
187
188 shrpd $Zhh,$Zll,4,$Zll
189 extrd,u $Zhh,59,60,$Zhh
190 ldd $nlo($Hll),$Tll
191 ldd $nlo($Hhh),$Thh
192
193 xor $Tll,$Zll,$Zll
194 xor $Thh,$Zhh,$Zhh
195 ldd $rem($rem_4bit),$rem
196
197 xor $rem,$Zhh,$Zhh
198 depd,z $Zll,60,4,$rem
199
200 shrpd $Zhh,$Zll,4,$Zll
201 extrd,u $Zhh,59,60,$Zhh
202 ldd $nhi($Hll),$Tll
203 ldd $nhi($Hhh),$Thh
204
205 xor $Tll,$Zll,$Zll
206 xor $Thh,$Zhh,$Zhh
207 ldd $rem($rem_4bit),$rem
208
209 xor $rem,$Zhh,$Zhh
210 std $Zll,8($Xi)
211 std $Zhh,0($Xi)
212___
213
214$code.=<<___ if ($SIZE_T==4);
215 b L\$done_gmult
216 nop
217
218L\$parisc1_gmult
219#endif
220 ldb 15($Xi),$nlo
221 ldo 12($Htbl),$Hll
222 ldo 8($Htbl),$Hlh
223 ldo 4($Htbl),$Hhl
224
225 and $mask0xf0,$nlo,$nhi
226 zdep $nlo,27,4,$nlo
227
228 ldwx $nlo($Hll),$Zll
229 ldwx $nlo($Hlh),$Zlh
230 ldwx $nlo($Hhl),$Zhl
231 ldwx $nlo($Hhh),$Zhh
232 zdep $Zll,28,4,$rem
233 ldb 14($Xi),$nlo
234 ldwx $rem($rem_4bit),$rem
235 shrpw $Zlh,$Zll,4,$Zll
236 ldwx $nhi($Hll),$Tll
237 shrpw $Zhl,$Zlh,4,$Zlh
238 ldwx $nhi($Hlh),$Tlh
239 shrpw $Zhh,$Zhl,4,$Zhl
240 ldwx $nhi($Hhl),$Thl
241 extru $Zhh,27,28,$Zhh
242 ldwx $nhi($Hhh),$Thh
243 xor $rem,$Zhh,$Zhh
244 and $mask0xf0,$nlo,$nhi
245 zdep $nlo,27,4,$nlo
246
247 xor $Tll,$Zll,$Zll
248 ldwx $nlo($Hll),$Tll
249 xor $Tlh,$Zlh,$Zlh
250 ldwx $nlo($Hlh),$Tlh
251 xor $Thl,$Zhl,$Zhl
252 b L\$oop_gmult_pa1
253 ldi 13,$cnt
254
255 .ALIGN 8
256L\$oop_gmult_pa1
257 zdep $Zll,28,4,$rem
258 ldwx $nlo($Hhl),$Thl
259 xor $Thh,$Zhh,$Zhh
260 ldwx $rem($rem_4bit),$rem
261 shrpw $Zlh,$Zll,4,$Zll
262 ldwx $nlo($Hhh),$Thh
263 shrpw $Zhl,$Zlh,4,$Zlh
264 ldbx $cnt($Xi),$nlo
265 xor $Tll,$Zll,$Zll
266 ldwx $nhi($Hll),$Tll
267 shrpw $Zhh,$Zhl,4,$Zhl
268 xor $Tlh,$Zlh,$Zlh
269 ldwx $nhi($Hlh),$Tlh
270 extru $Zhh,27,28,$Zhh
271 xor $Thl,$Zhl,$Zhl
272 ldwx $nhi($Hhl),$Thl
273 xor $rem,$Zhh,$Zhh
274 zdep $Zll,28,4,$rem
275 xor $Thh,$Zhh,$Zhh
276 ldwx $nhi($Hhh),$Thh
277 shrpw $Zlh,$Zll,4,$Zll
278 ldwx $rem($rem_4bit),$rem
279 shrpw $Zhl,$Zlh,4,$Zlh
280 shrpw $Zhh,$Zhl,4,$Zhl
281 and $mask0xf0,$nlo,$nhi
282 extru $Zhh,27,28,$Zhh
283 zdep $nlo,27,4,$nlo
284 xor $Tll,$Zll,$Zll
285 ldwx $nlo($Hll),$Tll
286 xor $Tlh,$Zlh,$Zlh
287 ldwx $nlo($Hlh),$Tlh
288 xor $rem,$Zhh,$Zhh
289 addib,uv -1,$cnt,L\$oop_gmult_pa1
290 xor $Thl,$Zhl,$Zhl
291
292 zdep $Zll,28,4,$rem
293 ldwx $nlo($Hhl),$Thl
294 xor $Thh,$Zhh,$Zhh
295 ldwx $rem($rem_4bit),$rem
296 shrpw $Zlh,$Zll,4,$Zll
297 ldwx $nlo($Hhh),$Thh
298 shrpw $Zhl,$Zlh,4,$Zlh
299 xor $Tll,$Zll,$Zll
300 ldwx $nhi($Hll),$Tll
301 shrpw $Zhh,$Zhl,4,$Zhl
302 xor $Tlh,$Zlh,$Zlh
303 ldwx $nhi($Hlh),$Tlh
304 extru $Zhh,27,28,$Zhh
305 xor $rem,$Zhh,$Zhh
306 xor $Thl,$Zhl,$Zhl
307 ldwx $nhi($Hhl),$Thl
308 xor $Thh,$Zhh,$Zhh
309 ldwx $nhi($Hhh),$Thh
310 zdep $Zll,28,4,$rem
311 ldwx $rem($rem_4bit),$rem
312 shrpw $Zlh,$Zll,4,$Zll
313 shrpw $Zhl,$Zlh,4,$Zlh
314 shrpw $Zhh,$Zhl,4,$Zhl
315 extru $Zhh,27,28,$Zhh
316 xor $Tll,$Zll,$Zll
317 xor $Tlh,$Zlh,$Zlh
318 xor $rem,$Zhh,$Zhh
319 stw $Zll,12($Xi)
320 xor $Thl,$Zhl,$Zhl
321 stw $Zlh,8($Xi)
322 xor $Thh,$Zhh,$Zhh
323 stw $Zhl,4($Xi)
324 stw $Zhh,0($Xi)
325___
326$code.=<<___;
327L\$done_gmult
328 $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
329 $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
330 $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
331 $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
332___
333$code.=<<___ if ($SIZE_T==4);
334 $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
335 $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
336 $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
337 $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
338 $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
339___
340$code.=<<___;
341 bv (%r2)
342 .EXIT
343 $POPMB -$FRAME(%sp),%r3
344 .PROCEND
345
346 .EXPORT gcm_ghash_4bit,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
347 .ALIGN 64
348gcm_ghash_4bit
349 .PROC
350 .CALLINFO FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=11
351 .ENTRY
352 $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
353 $PUSHMA %r3,$FRAME(%sp)
354 $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
355 $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
356 $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
357___
358$code.=<<___ if ($SIZE_T==4);
359 $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
360 $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
361 $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
362 $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
363 $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
364___
365$code.=<<___;
366 blr %r0,$rem_4bit
367 ldi 3,$rem
368L\$pic_ghash
369 andcm $rem_4bit,$rem,$rem_4bit
370 addl $inp,$len,$len
371 ldo L\$rem_4bit-L\$pic_ghash($rem_4bit),$rem_4bit
372 ldi 0xf0,$mask0xf0
373___
374$code.=<<___ if ($SIZE_T==4);
375#ifndef __OpenBSD__
376 ldi 31,$rem
377 mtctl $rem,%cr11
378 extrd,u,*= $rem,%sar,1,$rem ; executes on PA-RISC 1.0
379 b L\$parisc1_ghash
380 nop
381___
382
383$code.=<<___;
384 ldb 15($Xi),$nlo
385 ldo 8($Htbl),$Hll
386
387L\$outer_ghash_pa2
388 ldb 15($inp),$nhi
389 xor $nhi,$nlo,$nlo
390 and $mask0xf0,$nlo,$nhi
391 depd,z $nlo,59,4,$nlo
392
393 ldd $nlo($Hll),$Zll
394 ldd $nlo($Hhh),$Zhh
395
396 depd,z $Zll,60,4,$rem
397 shrpd $Zhh,$Zll,4,$Zll
398 extrd,u $Zhh,59,60,$Zhh
399 ldb 14($Xi),$nlo
400 ldb 14($inp),$byte
401
402 ldd $nhi($Hll),$Tll
403 ldd $nhi($Hhh),$Thh
404 xor $byte,$nlo,$nlo
405 and $mask0xf0,$nlo,$nhi
406 depd,z $nlo,59,4,$nlo
407
408 xor $Tll,$Zll,$Zll
409 xor $Thh,$Zhh,$Zhh
410 ldd $rem($rem_4bit),$rem
411 b L\$oop_ghash_pa2
412 ldi 13,$cnt
413
414 .ALIGN 8
415L\$oop_ghash_pa2
416 xor $rem,$Zhh,$Zhh ; moved here to work around gas bug
417 depd,z $Zll,60,4,$rem2
418
419 shrpd $Zhh,$Zll,4,$Zll
420 extrd,u $Zhh,59,60,$Zhh
421 ldd $nlo($Hll),$Tll
422 ldd $nlo($Hhh),$Thh
423
424 xor $Tll,$Zll,$Zll
425 xor $Thh,$Zhh,$Zhh
426 ldbx $cnt($Xi),$nlo
427 ldbx $cnt($inp),$byte
428
429 depd,z $Zll,60,4,$rem
430 shrpd $Zhh,$Zll,4,$Zll
431 ldd $rem2($rem_4bit),$rem2
432
433 xor $rem2,$Zhh,$Zhh
434 xor $byte,$nlo,$nlo
435 ldd $nhi($Hll),$Tll
436 ldd $nhi($Hhh),$Thh
437
438 and $mask0xf0,$nlo,$nhi
439 depd,z $nlo,59,4,$nlo
440
441 extrd,u $Zhh,59,60,$Zhh
442 xor $Tll,$Zll,$Zll
443
444 ldd $rem($rem_4bit),$rem
445 addib,uv -1,$cnt,L\$oop_ghash_pa2
446 xor $Thh,$Zhh,$Zhh
447
448 xor $rem,$Zhh,$Zhh
449 depd,z $Zll,60,4,$rem2
450
451 shrpd $Zhh,$Zll,4,$Zll
452 extrd,u $Zhh,59,60,$Zhh
453 ldd $nlo($Hll),$Tll
454 ldd $nlo($Hhh),$Thh
455
456 xor $Tll,$Zll,$Zll
457 xor $Thh,$Zhh,$Zhh
458
459 depd,z $Zll,60,4,$rem
460 shrpd $Zhh,$Zll,4,$Zll
461 ldd $rem2($rem_4bit),$rem2
462
463 xor $rem2,$Zhh,$Zhh
464 ldd $nhi($Hll),$Tll
465 ldd $nhi($Hhh),$Thh
466
467 extrd,u $Zhh,59,60,$Zhh
468 xor $Tll,$Zll,$Zll
469 xor $Thh,$Zhh,$Zhh
470 ldd $rem($rem_4bit),$rem
471
472 xor $rem,$Zhh,$Zhh
473 std $Zll,8($Xi)
474 ldo 16($inp),$inp
475 std $Zhh,0($Xi)
476 cmpb,*<> $inp,$len,L\$outer_ghash_pa2
477 copy $Zll,$nlo
478___
479
480$code.=<<___ if ($SIZE_T==4);
481 b L\$done_ghash
482 nop
483
484L\$parisc1_ghash
485#endif
486 ldb 15($Xi),$nlo
487 ldo 12($Htbl),$Hll
488 ldo 8($Htbl),$Hlh
489 ldo 4($Htbl),$Hhl
490
491L\$outer_ghash_pa1
492 ldb 15($inp),$byte
493 xor $byte,$nlo,$nlo
494 and $mask0xf0,$nlo,$nhi
495 zdep $nlo,27,4,$nlo
496
497 ldwx $nlo($Hll),$Zll
498 ldwx $nlo($Hlh),$Zlh
499 ldwx $nlo($Hhl),$Zhl
500 ldwx $nlo($Hhh),$Zhh
501 zdep $Zll,28,4,$rem
502 ldb 14($Xi),$nlo
503 ldb 14($inp),$byte
504 ldwx $rem($rem_4bit),$rem
505 shrpw $Zlh,$Zll,4,$Zll
506 ldwx $nhi($Hll),$Tll
507 shrpw $Zhl,$Zlh,4,$Zlh
508 ldwx $nhi($Hlh),$Tlh
509 shrpw $Zhh,$Zhl,4,$Zhl
510 ldwx $nhi($Hhl),$Thl
511 extru $Zhh,27,28,$Zhh
512 ldwx $nhi($Hhh),$Thh
513 xor $byte,$nlo,$nlo
514 xor $rem,$Zhh,$Zhh
515 and $mask0xf0,$nlo,$nhi
516 zdep $nlo,27,4,$nlo
517
518 xor $Tll,$Zll,$Zll
519 ldwx $nlo($Hll),$Tll
520 xor $Tlh,$Zlh,$Zlh
521 ldwx $nlo($Hlh),$Tlh
522 xor $Thl,$Zhl,$Zhl
523 b L\$oop_ghash_pa1
524 ldi 13,$cnt
525
526 .ALIGN 8
527L\$oop_ghash_pa1
528 zdep $Zll,28,4,$rem
529 ldwx $nlo($Hhl),$Thl
530 xor $Thh,$Zhh,$Zhh
531 ldwx $rem($rem_4bit),$rem
532 shrpw $Zlh,$Zll,4,$Zll
533 ldwx $nlo($Hhh),$Thh
534 shrpw $Zhl,$Zlh,4,$Zlh
535 ldbx $cnt($Xi),$nlo
536 xor $Tll,$Zll,$Zll
537 ldwx $nhi($Hll),$Tll
538 shrpw $Zhh,$Zhl,4,$Zhl
539 ldbx $cnt($inp),$byte
540 xor $Tlh,$Zlh,$Zlh
541 ldwx $nhi($Hlh),$Tlh
542 extru $Zhh,27,28,$Zhh
543 xor $Thl,$Zhl,$Zhl
544 ldwx $nhi($Hhl),$Thl
545 xor $rem,$Zhh,$Zhh
546 zdep $Zll,28,4,$rem
547 xor $Thh,$Zhh,$Zhh
548 ldwx $nhi($Hhh),$Thh
549 shrpw $Zlh,$Zll,4,$Zll
550 ldwx $rem($rem_4bit),$rem
551 shrpw $Zhl,$Zlh,4,$Zlh
552 xor $byte,$nlo,$nlo
553 shrpw $Zhh,$Zhl,4,$Zhl
554 and $mask0xf0,$nlo,$nhi
555 extru $Zhh,27,28,$Zhh
556 zdep $nlo,27,4,$nlo
557 xor $Tll,$Zll,$Zll
558 ldwx $nlo($Hll),$Tll
559 xor $Tlh,$Zlh,$Zlh
560 ldwx $nlo($Hlh),$Tlh
561 xor $rem,$Zhh,$Zhh
562 addib,uv -1,$cnt,L\$oop_ghash_pa1
563 xor $Thl,$Zhl,$Zhl
564
565 zdep $Zll,28,4,$rem
566 ldwx $nlo($Hhl),$Thl
567 xor $Thh,$Zhh,$Zhh
568 ldwx $rem($rem_4bit),$rem
569 shrpw $Zlh,$Zll,4,$Zll
570 ldwx $nlo($Hhh),$Thh
571 shrpw $Zhl,$Zlh,4,$Zlh
572 xor $Tll,$Zll,$Zll
573 ldwx $nhi($Hll),$Tll
574 shrpw $Zhh,$Zhl,4,$Zhl
575 xor $Tlh,$Zlh,$Zlh
576 ldwx $nhi($Hlh),$Tlh
577 extru $Zhh,27,28,$Zhh
578 xor $rem,$Zhh,$Zhh
579 xor $Thl,$Zhl,$Zhl
580 ldwx $nhi($Hhl),$Thl
581 xor $Thh,$Zhh,$Zhh
582 ldwx $nhi($Hhh),$Thh
583 zdep $Zll,28,4,$rem
584 ldwx $rem($rem_4bit),$rem
585 shrpw $Zlh,$Zll,4,$Zll
586 shrpw $Zhl,$Zlh,4,$Zlh
587 shrpw $Zhh,$Zhl,4,$Zhl
588 extru $Zhh,27,28,$Zhh
589 xor $Tll,$Zll,$Zll
590 xor $Tlh,$Zlh,$Zlh
591 xor $rem,$Zhh,$Zhh
592 stw $Zll,12($Xi)
593 xor $Thl,$Zhl,$Zhl
594 stw $Zlh,8($Xi)
595 xor $Thh,$Zhh,$Zhh
596 stw $Zhl,4($Xi)
597 ldo 16($inp),$inp
598 stw $Zhh,0($Xi)
599 comb,<> $inp,$len,L\$outer_ghash_pa1
600 copy $Zll,$nlo
601___
602$code.=<<___;
603L\$done_ghash
604 $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
605 $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
606 $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
607 $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
608___
609$code.=<<___ if ($SIZE_T==4);
610 $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
611 $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
612 $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
613 $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
614 $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
615___
616$code.=<<___;
617 bv (%r2)
618 .EXIT
619 $POPMB -$FRAME(%sp),%r3
620 .PROCEND
621
622 .ALIGN 64
623L\$rem_4bit
624 .WORD `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
625 .WORD `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
626 .WORD `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
627 .WORD `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
628
629 .data
630	.STRINGZ "GHASH for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
631 .ALIGN 64
632___
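# [Editorial aside, not part of the original module.] The rem_4bit words a few
# lines above (0x0000, 0x1C20, 0x3840, ...) match the carry-less product of the
# dropped nibble with 0x1C2 (0xE1<<1, 0xE1 being the GHASH reduction byte),
# shifted left by 4; the <<16 in the .WORD lines merely moves that 16-bit value
# into the position the code expects. A minimal, hedged generator sketch that
# reproduces the sixteen constants:

for my $i (0 .. 15) {
    my $r = 0;
    $r ^= 0x1C2 << $_ for grep { ($i >> $_) & 1 } 0 .. 3;  # carry-less $i * 0x1C2
    printf "0x%04X\n", $r << 4;     # 0x0000, 0x1C20, 0x3840, 0x2460, ...
}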
633
634# Explicitly encode PA-RISC 2.0 instructions used in this module, so
635# that it can be compiled with .LEVEL 1.0. It should be noted that I
636# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
637# directive...
638
639my $ldd = sub {
640 my ($mod,$args) = @_;
641 my $orig = "ldd$mod\t$args";
642
643 if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 4
644 { my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
645 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
646 }
647 elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 5
648 { my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3;
649 $opcode|=(($1&0xF)<<17)|(($1&0x10)<<12); # encode offset
650 $opcode|=(1<<5) if ($mod =~ /^,m/);
651 $opcode|=(1<<13) if ($mod =~ /^,mb/);
652 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
653 }
654 else { "\t".$orig; }
655};
656
657my $std = sub {
658 my ($mod,$args) = @_;
659 my $orig = "std$mod\t$args";
660
661 if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices
662 { my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1);
663 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
664 }
665 else { "\t".$orig; }
666};
667
668my $extrd = sub {
669 my ($mod,$args) = @_;
670 my $orig = "extrd$mod\t$args";
671
672 # I only have ",u" completer, it's implicitly encoded...
673 if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15
674 { my $opcode=(0x36<<26)|($1<<21)|($4<<16);
675 my $len=32-$3;
676 $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos
677 $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
678 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
679 }
680 elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12
681 { my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
682 my $len=32-$2;
683 $opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len
684 $opcode |= (1<<13) if ($mod =~ /,\**=/);
685 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
686 }
687 else { "\t".$orig; }
688};
689
690my $shrpd = sub {
691 my ($mod,$args) = @_;
692 my $orig = "shrpd$mod\t$args";
693
694 if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14
695 { my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
696 my $cpos=63-$3;
697 $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa
698 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
699 }
700 elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) # format 11
701 { sprintf "\t.WORD\t0x%08x\t; %s",
702 (0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig;
703 }
704 else { "\t".$orig; }
705};
706
707my $depd = sub {
708 my ($mod,$args) = @_;
709 my $orig = "depd$mod\t$args";
710
711 # I only have ",z" completer, it's implicitly encoded...
712 if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 16
713 { my $opcode=(0x3c<<26)|($4<<21)|($1<<16);
714 my $cpos=63-$2;
715 my $len=32-$3;
716 $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode pos
717 $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
718 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
719 }
720 else { "\t".$orig; }
721};
722
723sub assemble {
724 my ($mnemonic,$mod,$args)=@_;
725 my $opcode = eval("\$$mnemonic");
726
727 ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
728}
729
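# [Editorial aside; the two calls below are hypothetical illustrations, not
# lines from the generated code.] In the 32-bit build ($SIZE_T==4) the foreach
# loop below routes every emitted instruction through assemble(): mnemonics
# that have an encoder closure above come back as raw .WORD directives, and
# anything without one falls through unchanged, e.g.
#
#	assemble("std", "", "%r1,-8(%r2)");	# -> "\t.WORD\t0x........\t; std\t%r1,-8(%r2)"
#	assemble("ldw", "", "0(%r1),%r2");	# -> "\tldw\t0(%r1),%r2" (no $ldw encoder defined)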
730foreach (split("\n",$code)) {
731 s/\`([^\`]*)\`/eval $1/ge;
732 if ($SIZE_T==4) {
733 s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e;
734 s/cmpb,\*/comb,/;
735 s/,\*/,/;
736 }
737 s/\bbv\b/bve/ if ($SIZE_T==8);
738 print $_,"\n";
739}
740
741close STDOUT;
diff --git a/src/lib/libcrypto/modes/asm/ghash-s390x.pl b/src/lib/libcrypto/modes/asm/ghash-s390x.pl
deleted file mode 100644
index 6a40d5d89c..0000000000
--- a/src/lib/libcrypto/modes/asm/ghash-s390x.pl
+++ /dev/null
@@ -1,262 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# September 2010.
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that it
14# uses 256 bytes per-key table [+128 bytes shared table]. Performance
15# was measured to be ~18 cycles per processed byte on z10, which is
16# almost 40% better than gcc-generated code. It should be noted that
17# 18 cycles is worse result than expected: loop is scheduled for 12
18# and the result should be close to 12. In the lack of instruction-
19# level profiling data it's impossible to tell why...
20
21# November 2010.
22#
23# Adapt for -m31 build. If kernel supports what's called "highgprs"
24# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
25# instructions and achieve "64-bit" performance even in 31-bit legacy
26# application context. The feature is not specific to any particular
27# processor, as long as it's "z-CPU". Latter implies that the code
28# remains z/Architecture specific. On z990 it was measured to perform
29# 2.8x better than 32-bit code generated by gcc 4.3.
30
31# March 2011.
32#
33# Support for hardware KIMD-GHASH is verified to produce correct
34# result and therefore is engaged. On z196 it was measured to process
35# 8KB buffer ~7x faster than software implementation. It's not as
36# impressive for smaller buffer sizes, and for the smallest 16-byte buffer
37# it's actually almost 2 times slower. Which is the reason why
38# KIMD-GHASH is not used in gcm_gmult_4bit.
39
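# [Editorial aside, not part of the original module.] For readers new to the
# "4-bit" method: the 256-byte per-key table caches sixteen precomputed
# multiples of H, one per 4-bit index, so one 128-bit multiplication becomes
# 32 table lookups plus 4-bit shifts and reductions. The value being computed
# is ordinary GHASH multiplication in GF(2^128); below is a minimal (and slow)
# bit-by-bit reference model, a sketch assuming blocks are mapped big-endian
# to 128-bit integers (the sub name is made up):

use Math::BigInt;

sub gf128_mul_ref {
    my ($x, $y) = @_;          # Math::BigInt, 0 <= value < 2^128
    my $R = Math::BigInt->new("0xe1000000000000000000000000000000");
    my $z = Math::BigInt->bzero();
    my $v = $y->copy();
    for my $i (0 .. 127) {
        $z->bxor($v) if $x->copy()->brsft(127 - $i)->band(1)->is_one();
        my $lsb = $v->copy()->band(1)->is_one();
        $v->brsft(1);          # shift right, folding in R when a bit drops off
        $v->bxor($R) if $lsb;
    }
    return $z;
}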
40$flavour = shift;
41
42if ($flavour =~ /3[12]/) {
43 $SIZE_T=4;
44 $g="";
45} else {
46 $SIZE_T=8;
47 $g="g";
48}
49
50while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
51open STDOUT,">$output";
52
53$softonly=0;
54
55$Zhi="%r0";
56$Zlo="%r1";
57
58$Xi="%r2"; # argument block
59$Htbl="%r3";
60$inp="%r4";
61$len="%r5";
62
63$rem0="%r6"; # variables
64$rem1="%r7";
65$nlo="%r8";
66$nhi="%r9";
67$xi="%r10";
68$cnt="%r11";
69$tmp="%r12";
70$x78="%r13";
71$rem_4bit="%r14";
72
73$sp="%r15";
74
75$code.=<<___;
76.text
77
78.globl gcm_gmult_4bit
79.align 32
80gcm_gmult_4bit:
81___
82$code.=<<___ if(!$softonly && 0); # hardware is slow for single block...
83 larl %r1,OPENSSL_s390xcap_P
84 lg %r0,0(%r1)
85 tmhl %r0,0x4000 # check for message-security-assist
86 jz .Lsoft_gmult
87 lghi %r0,0
88 la %r1,16($sp)
89 .long 0xb93e0004 # kimd %r0,%r4
90 lg %r1,24($sp)
91 tmhh %r1,0x4000 # check for function 65
92 jz .Lsoft_gmult
93 stg %r0,16($sp) # arrange 16 bytes of zero input
94 stg %r0,24($sp)
95 lghi %r0,65 # function 65
96 la %r1,0($Xi) # H lies right after Xi in gcm128_context
97 la $inp,16($sp)
98 lghi $len,16
99 .long 0xb93e0004 # kimd %r0,$inp
100 brc 1,.-4 # pay attention to "partial completion"
101 br %r14
102.align 32
103.Lsoft_gmult:
104___
105$code.=<<___;
106 stm${g} %r6,%r14,6*$SIZE_T($sp)
107
108 aghi $Xi,-1
109 lghi $len,1
110 lghi $x78,`0xf<<3`
111 larl $rem_4bit,rem_4bit
112
113 lg $Zlo,8+1($Xi) # Xi
114 j .Lgmult_shortcut
115.type gcm_gmult_4bit,\@function
116.size gcm_gmult_4bit,(.-gcm_gmult_4bit)
117
118.globl gcm_ghash_4bit
119.align 32
120gcm_ghash_4bit:
121___
122$code.=<<___ if(!$softonly);
123 larl %r1,OPENSSL_s390xcap_P
124 lg %r0,0(%r1)
125 tmhl %r0,0x4000 # check for message-security-assist
126 jz .Lsoft_ghash
127 lghi %r0,0
128 la %r1,16($sp)
129 .long 0xb93e0004 # kimd %r0,%r4
130 lg %r1,24($sp)
131 tmhh %r1,0x4000 # check for function 65
132 jz .Lsoft_ghash
133 lghi %r0,65 # function 65
134 la %r1,0($Xi) # H lies right after Xi in gcm128_context
135 .long 0xb93e0004 # kimd %r0,$inp
136 brc 1,.-4 # pay attention to "partial completion"
137 br %r14
138.align 32
139.Lsoft_ghash:
140___
141$code.=<<___ if ($flavour =~ /3[12]/);
142 llgfr $len,$len
143___
144$code.=<<___;
145 stm${g} %r6,%r14,6*$SIZE_T($sp)
146
147 aghi $Xi,-1
148 srlg $len,$len,4
149 lghi $x78,`0xf<<3`
150 larl $rem_4bit,rem_4bit
151
152 lg $Zlo,8+1($Xi) # Xi
153 lg $Zhi,0+1($Xi)
154 lghi $tmp,0
155.Louter:
156 xg $Zhi,0($inp) # Xi ^= inp
157 xg $Zlo,8($inp)
158 xgr $Zhi,$tmp
159 stg $Zlo,8+1($Xi)
160 stg $Zhi,0+1($Xi)
161
162.Lgmult_shortcut:
163 lghi $tmp,0xf0
164 sllg $nlo,$Zlo,4
165 srlg $xi,$Zlo,8 # extract second byte
166 ngr $nlo,$tmp
167 lgr $nhi,$Zlo
168 lghi $cnt,14
169 ngr $nhi,$tmp
170
171 lg $Zlo,8($nlo,$Htbl)
172 lg $Zhi,0($nlo,$Htbl)
173
174 sllg $nlo,$xi,4
175 sllg $rem0,$Zlo,3
176 ngr $nlo,$tmp
177 ngr $rem0,$x78
178 ngr $xi,$tmp
179
180 sllg $tmp,$Zhi,60
181 srlg $Zlo,$Zlo,4
182 srlg $Zhi,$Zhi,4
183 xg $Zlo,8($nhi,$Htbl)
184 xg $Zhi,0($nhi,$Htbl)
185 lgr $nhi,$xi
186 sllg $rem1,$Zlo,3
187 xgr $Zlo,$tmp
188 ngr $rem1,$x78
189 j .Lghash_inner
190.align 16
191.Lghash_inner:
192 srlg $Zlo,$Zlo,4
193 sllg $tmp,$Zhi,60
194 xg $Zlo,8($nlo,$Htbl)
195 srlg $Zhi,$Zhi,4
196 llgc $xi,0($cnt,$Xi)
197 xg $Zhi,0($nlo,$Htbl)
198 sllg $nlo,$xi,4
199 xg $Zhi,0($rem0,$rem_4bit)
200 nill $nlo,0xf0
201 sllg $rem0,$Zlo,3
202 xgr $Zlo,$tmp
203 ngr $rem0,$x78
204 nill $xi,0xf0
205
206 sllg $tmp,$Zhi,60
207 srlg $Zlo,$Zlo,4
208 srlg $Zhi,$Zhi,4
209 xg $Zlo,8($nhi,$Htbl)
210 xg $Zhi,0($nhi,$Htbl)
211 lgr $nhi,$xi
212 xg $Zhi,0($rem1,$rem_4bit)
213 sllg $rem1,$Zlo,3
214 xgr $Zlo,$tmp
215 ngr $rem1,$x78
216 brct $cnt,.Lghash_inner
217
218 sllg $tmp,$Zhi,60
219 srlg $Zlo,$Zlo,4
220 srlg $Zhi,$Zhi,4
221 xg $Zlo,8($nlo,$Htbl)
222 xg $Zhi,0($nlo,$Htbl)
223 sllg $xi,$Zlo,3
224 xg $Zhi,0($rem0,$rem_4bit)
225 xgr $Zlo,$tmp
226 ngr $xi,$x78
227
228 sllg $tmp,$Zhi,60
229 srlg $Zlo,$Zlo,4
230 srlg $Zhi,$Zhi,4
231 xg $Zlo,8($nhi,$Htbl)
232 xg $Zhi,0($nhi,$Htbl)
233 xgr $Zlo,$tmp
234 xg $Zhi,0($rem1,$rem_4bit)
235
236 lg $tmp,0($xi,$rem_4bit)
237 la $inp,16($inp)
238 sllg $tmp,$tmp,4 # correct last rem_4bit[rem]
239 brctg $len,.Louter
240
241 xgr $Zhi,$tmp
242 stg $Zlo,8+1($Xi)
243 stg $Zhi,0+1($Xi)
244 lm${g} %r6,%r14,6*$SIZE_T($sp)
245 br %r14
246.type gcm_ghash_4bit,\@function
247.size gcm_ghash_4bit,(.-gcm_ghash_4bit)
248
249.align 64
250rem_4bit:
251 .long `0x0000<<12`,0,`0x1C20<<12`,0,`0x3840<<12`,0,`0x2460<<12`,0
252 .long `0x7080<<12`,0,`0x6CA0<<12`,0,`0x48C0<<12`,0,`0x54E0<<12`,0
253 .long `0xE100<<12`,0,`0xFD20<<12`,0,`0xD940<<12`,0,`0xC560<<12`,0
254 .long `0x9180<<12`,0,`0x8DA0<<12`,0,`0xA9C0<<12`,0,`0xB5E0<<12`,0
255.type rem_4bit,\@object
256.size rem_4bit,(.-rem_4bit)
257.string "GHASH for s390x, CRYPTOGAMS by <appro\@openssl.org>"
258___
259
260$code =~ s/\`([^\`]*)\`/eval $1/gem;
261print $code;
262close STDOUT;
diff --git a/src/lib/libcrypto/modes/asm/ghash-sparcv9.pl b/src/lib/libcrypto/modes/asm/ghash-sparcv9.pl
deleted file mode 100644
index 70e7b044a3..0000000000
--- a/src/lib/libcrypto/modes/asm/ghash-sparcv9.pl
+++ /dev/null
@@ -1,330 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# March 2010
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that it
14# uses 256 bytes per-key table [+128 bytes shared table]. Performance
15# results are for streamed GHASH subroutine on UltraSPARC pre-Tx CPU
16# and are expressed in cycles per processed byte, less is better:
17#
18# gcc 3.3.x cc 5.2 this assembler
19#
20# 32-bit build 81.4 43.3 12.6 (+546%/+244%)
21# 64-bit build 20.2 21.2 12.6 (+60%/+68%)
22#
23# Here is data collected on UltraSPARC T1 system running Linux:
24#
25# gcc 4.4.1 this assembler
26#
27# 32-bit build 566 50 (+1000%)
28# 64-bit build 56 50 (+12%)
29#
30# I don't quite understand why difference between 32-bit and 64-bit
31# compiler-generated code is so big. Compilers *were* instructed to
32# generate code for UltraSPARC and should have used 64-bit registers
33# for Z vector (see C code) even in 32-bit build... Oh well, it only
34# means more impressive improvement coefficients for this assembler
35# module;-) Loops are aggressively modulo-scheduled in respect to
36# references to input data and Z.hi updates to achieve 12 cycles
37# timing. To anchor to something else, sha1-sparcv9.pl spends 11.6
38# cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1.
39
40$bits=32;
41for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
42if ($bits==64) { $bias=2047; $frame=192; }
43else { $bias=0; $frame=112; }
44
45$output=shift;
46open STDOUT,">$output";
47
48$Zhi="%o0"; # 64-bit values
49$Zlo="%o1";
50$Thi="%o2";
51$Tlo="%o3";
52$rem="%o4";
53$tmp="%o5";
54
55$nhi="%l0"; # small values and pointers
56$nlo="%l1";
57$xi0="%l2";
58$xi1="%l3";
59$rem_4bit="%l4";
60$remi="%l5";
61$Htblo="%l6";
62$cnt="%l7";
63
64$Xi="%i0"; # input argument block
65$Htbl="%i1";
66$inp="%i2";
67$len="%i3";
68
69$code.=<<___;
70.section ".text",#alloc,#execinstr
71
72.align 64
73rem_4bit:
74 .long `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
75 .long `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
76 .long `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
77 .long `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
78.type rem_4bit,#object
79.size rem_4bit,(.-rem_4bit)
80
81.globl gcm_ghash_4bit
82.align 32
83gcm_ghash_4bit:
84 save %sp,-$frame,%sp
85 ldub [$inp+15],$nlo
86 ldub [$Xi+15],$xi0
87 ldub [$Xi+14],$xi1
88 add $len,$inp,$len
89 add $Htbl,8,$Htblo
90
911: call .+8
92 add %o7,rem_4bit-1b,$rem_4bit
93
94.Louter:
95 xor $xi0,$nlo,$nlo
96 and $nlo,0xf0,$nhi
97 and $nlo,0x0f,$nlo
98 sll $nlo,4,$nlo
99 ldx [$Htblo+$nlo],$Zlo
100 ldx [$Htbl+$nlo],$Zhi
101
102 ldub [$inp+14],$nlo
103
104 ldx [$Htblo+$nhi],$Tlo
105 and $Zlo,0xf,$remi
106 ldx [$Htbl+$nhi],$Thi
107 sll $remi,3,$remi
108 ldx [$rem_4bit+$remi],$rem
109 srlx $Zlo,4,$Zlo
110 mov 13,$cnt
111 sllx $Zhi,60,$tmp
112 xor $Tlo,$Zlo,$Zlo
113 srlx $Zhi,4,$Zhi
114 xor $Zlo,$tmp,$Zlo
115
116 xor $xi1,$nlo,$nlo
117 and $Zlo,0xf,$remi
118 and $nlo,0xf0,$nhi
119 and $nlo,0x0f,$nlo
120 ba .Lghash_inner
121 sll $nlo,4,$nlo
122.align 32
123.Lghash_inner:
124 ldx [$Htblo+$nlo],$Tlo
125 sll $remi,3,$remi
126 xor $Thi,$Zhi,$Zhi
127 ldx [$Htbl+$nlo],$Thi
128 srlx $Zlo,4,$Zlo
129 xor $rem,$Zhi,$Zhi
130 ldx [$rem_4bit+$remi],$rem
131 sllx $Zhi,60,$tmp
132 xor $Tlo,$Zlo,$Zlo
133 ldub [$inp+$cnt],$nlo
134 srlx $Zhi,4,$Zhi
135 xor $Zlo,$tmp,$Zlo
136 ldub [$Xi+$cnt],$xi1
137 xor $Thi,$Zhi,$Zhi
138 and $Zlo,0xf,$remi
139
140 ldx [$Htblo+$nhi],$Tlo
141 sll $remi,3,$remi
142 xor $rem,$Zhi,$Zhi
143 ldx [$Htbl+$nhi],$Thi
144 srlx $Zlo,4,$Zlo
145 ldx [$rem_4bit+$remi],$rem
146 sllx $Zhi,60,$tmp
147 xor $xi1,$nlo,$nlo
148 srlx $Zhi,4,$Zhi
149 and $nlo,0xf0,$nhi
150 addcc $cnt,-1,$cnt
151 xor $Zlo,$tmp,$Zlo
152 and $nlo,0x0f,$nlo
153 xor $Tlo,$Zlo,$Zlo
154 sll $nlo,4,$nlo
155 blu .Lghash_inner
156 and $Zlo,0xf,$remi
157
158 ldx [$Htblo+$nlo],$Tlo
159 sll $remi,3,$remi
160 xor $Thi,$Zhi,$Zhi
161 ldx [$Htbl+$nlo],$Thi
162 srlx $Zlo,4,$Zlo
163 xor $rem,$Zhi,$Zhi
164 ldx [$rem_4bit+$remi],$rem
165 sllx $Zhi,60,$tmp
166 xor $Tlo,$Zlo,$Zlo
167 srlx $Zhi,4,$Zhi
168 xor $Zlo,$tmp,$Zlo
169 xor $Thi,$Zhi,$Zhi
170
171 add $inp,16,$inp
172 cmp $inp,$len
173 be,pn `$bits==64?"%xcc":"%icc"`,.Ldone
174 and $Zlo,0xf,$remi
175
176 ldx [$Htblo+$nhi],$Tlo
177 sll $remi,3,$remi
178 xor $rem,$Zhi,$Zhi
179 ldx [$Htbl+$nhi],$Thi
180 srlx $Zlo,4,$Zlo
181 ldx [$rem_4bit+$remi],$rem
182 sllx $Zhi,60,$tmp
183 xor $Tlo,$Zlo,$Zlo
184 ldub [$inp+15],$nlo
185 srlx $Zhi,4,$Zhi
186 xor $Zlo,$tmp,$Zlo
187 xor $Thi,$Zhi,$Zhi
188 stx $Zlo,[$Xi+8]
189 xor $rem,$Zhi,$Zhi
190 stx $Zhi,[$Xi]
191 srl $Zlo,8,$xi1
192 and $Zlo,0xff,$xi0
193 ba .Louter
194 and $xi1,0xff,$xi1
195.align 32
196.Ldone:
197 ldx [$Htblo+$nhi],$Tlo
198 sll $remi,3,$remi
199 xor $rem,$Zhi,$Zhi
200 ldx [$Htbl+$nhi],$Thi
201 srlx $Zlo,4,$Zlo
202 ldx [$rem_4bit+$remi],$rem
203 sllx $Zhi,60,$tmp
204 xor $Tlo,$Zlo,$Zlo
205 srlx $Zhi,4,$Zhi
206 xor $Zlo,$tmp,$Zlo
207 xor $Thi,$Zhi,$Zhi
208 stx $Zlo,[$Xi+8]
209 xor $rem,$Zhi,$Zhi
210 stx $Zhi,[$Xi]
211
212 ret
213 restore
214.type gcm_ghash_4bit,#function
215.size gcm_ghash_4bit,(.-gcm_ghash_4bit)
216___
217
218undef $inp;
219undef $len;
220
221$code.=<<___;
222.globl gcm_gmult_4bit
223.align 32
224gcm_gmult_4bit:
225 save %sp,-$frame,%sp
226 ldub [$Xi+15],$nlo
227 add $Htbl,8,$Htblo
228
2291: call .+8
230 add %o7,rem_4bit-1b,$rem_4bit
231
232 and $nlo,0xf0,$nhi
233 and $nlo,0x0f,$nlo
234 sll $nlo,4,$nlo
235 ldx [$Htblo+$nlo],$Zlo
236 ldx [$Htbl+$nlo],$Zhi
237
238 ldub [$Xi+14],$nlo
239
240 ldx [$Htblo+$nhi],$Tlo
241 and $Zlo,0xf,$remi
242 ldx [$Htbl+$nhi],$Thi
243 sll $remi,3,$remi
244 ldx [$rem_4bit+$remi],$rem
245 srlx $Zlo,4,$Zlo
246 mov 13,$cnt
247 sllx $Zhi,60,$tmp
248 xor $Tlo,$Zlo,$Zlo
249 srlx $Zhi,4,$Zhi
250 xor $Zlo,$tmp,$Zlo
251
252 and $Zlo,0xf,$remi
253 and $nlo,0xf0,$nhi
254 and $nlo,0x0f,$nlo
255 ba .Lgmult_inner
256 sll $nlo,4,$nlo
257.align 32
258.Lgmult_inner:
259 ldx [$Htblo+$nlo],$Tlo
260 sll $remi,3,$remi
261 xor $Thi,$Zhi,$Zhi
262 ldx [$Htbl+$nlo],$Thi
263 srlx $Zlo,4,$Zlo
264 xor $rem,$Zhi,$Zhi
265 ldx [$rem_4bit+$remi],$rem
266 sllx $Zhi,60,$tmp
267 xor $Tlo,$Zlo,$Zlo
268 ldub [$Xi+$cnt],$nlo
269 srlx $Zhi,4,$Zhi
270 xor $Zlo,$tmp,$Zlo
271 xor $Thi,$Zhi,$Zhi
272 and $Zlo,0xf,$remi
273
274 ldx [$Htblo+$nhi],$Tlo
275 sll $remi,3,$remi
276 xor $rem,$Zhi,$Zhi
277 ldx [$Htbl+$nhi],$Thi
278 srlx $Zlo,4,$Zlo
279 ldx [$rem_4bit+$remi],$rem
280 sllx $Zhi,60,$tmp
281 srlx $Zhi,4,$Zhi
282 and $nlo,0xf0,$nhi
283 addcc $cnt,-1,$cnt
284 xor $Zlo,$tmp,$Zlo
285 and $nlo,0x0f,$nlo
286 xor $Tlo,$Zlo,$Zlo
287 sll $nlo,4,$nlo
288 blu .Lgmult_inner
289 and $Zlo,0xf,$remi
290
291 ldx [$Htblo+$nlo],$Tlo
292 sll $remi,3,$remi
293 xor $Thi,$Zhi,$Zhi
294 ldx [$Htbl+$nlo],$Thi
295 srlx $Zlo,4,$Zlo
296 xor $rem,$Zhi,$Zhi
297 ldx [$rem_4bit+$remi],$rem
298 sllx $Zhi,60,$tmp
299 xor $Tlo,$Zlo,$Zlo
300 srlx $Zhi,4,$Zhi
301 xor $Zlo,$tmp,$Zlo
302 xor $Thi,$Zhi,$Zhi
303 and $Zlo,0xf,$remi
304
305 ldx [$Htblo+$nhi],$Tlo
306 sll $remi,3,$remi
307 xor $rem,$Zhi,$Zhi
308 ldx [$Htbl+$nhi],$Thi
309 srlx $Zlo,4,$Zlo
310 ldx [$rem_4bit+$remi],$rem
311 sllx $Zhi,60,$tmp
312 xor $Tlo,$Zlo,$Zlo
313 srlx $Zhi,4,$Zhi
314 xor $Zlo,$tmp,$Zlo
315 xor $Thi,$Zhi,$Zhi
316 stx $Zlo,[$Xi+8]
317 xor $rem,$Zhi,$Zhi
318 stx $Zhi,[$Xi]
319
320 ret
321 restore
322.type gcm_gmult_4bit,#function
323.size gcm_gmult_4bit,(.-gcm_gmult_4bit)
324.asciz "GHASH for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
325.align 4
326___
327
328$code =~ s/\`([^\`]*)\`/eval $1/gem;
329print $code;
330close STDOUT;
diff --git a/src/lib/libcrypto/modes/asm/ghash-x86.pl b/src/lib/libcrypto/modes/asm/ghash-x86.pl
deleted file mode 100644
index 83c727e07f..0000000000
--- a/src/lib/libcrypto/modes/asm/ghash-x86.pl
+++ /dev/null
@@ -1,1342 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# March, May, June 2010
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that it
14# uses 256 bytes per-key table [+64/128 bytes fixed table]. It has two
15# code paths: vanilla x86 and vanilla MMX. Former will be executed on
16# 486 and Pentium, latter on all others. MMX GHASH features so called
17# "528B" variant of "4-bit" method utilizing additional 256+16 bytes
18# of per-key storage [+512 bytes shared table]. Performance results
19# are for streamed GHASH subroutine and are expressed in cycles per
20# processed byte, less is better:
21#
22# gcc 2.95.3(*) MMX assembler x86 assembler
23#
24# Pentium 105/111(**) - 50
25# PIII 68 /75 12.2 24
26# P4 125/125 17.8 84(***)
27# Opteron 66 /70 10.1 30
28# Core2 54 /67 8.4 18
29#
30# (*)	gcc 3.4.x was observed to generate a few percent slower code,
31# which is one of reasons why 2.95.3 results were chosen,
32# another reason is lack of 3.4.x results for older CPUs;
33# comparison with MMX results is not completely fair, because C
34# results are for vanilla "256B" implementation, while
35# assembler results are for "528B";-)
36# (**) second number is result for code compiled with -fPIC flag,
37# which is actually more relevant, because assembler code is
38# position-independent;
39# (***) see comment in non-MMX routine for further details;
40#
41# To summarize, it's >2-5 times faster than gcc-generated code. To
42# anchor it to something else SHA1 assembler processes one byte in
43# 11-13 cycles on contemporary x86 cores. As for choice of MMX in
44# particular, see comment at the end of the file...
45
46# May 2010
47#
48# Add PCLMULQDQ version performing at 2.10 cycles per processed byte.
49# The question is how close is it to theoretical limit? The pclmulqdq
50# instruction latency appears to be 14 cycles and there can't be more
51# than 2 of them executing at any given time. This means that single
52# Karatsuba multiplication would take 28 cycles *plus* few cycles for
53# pre- and post-processing. Then multiplication has to be followed by
54# modulo-reduction. Given that aggregated reduction method [see
55# "Carry-less Multiplication and Its Usage for Computing the GCM Mode"
56# white paper by Intel] allows you to perform reduction only once in
57# a while we can assume that asymptotic performance can be estimated
58# as (28+Tmod/Naggr)/16, where Tmod is time to perform reduction
59# and Naggr is the aggregation factor.
60#
61# Before we proceed to this implementation let's have closer look at
62# the best-performing code suggested by Intel in their white paper.
63# By tracing inter-register dependencies Tmod is estimated as ~19
64# cycles and Naggr chosen by Intel is 4, resulting in 2.05 cycles per
65# processed byte. As implied, this is quite optimistic estimate,
66# because it does not account for Karatsuba pre- and post-processing,
67# which for a single multiplication is ~5 cycles. Unfortunately Intel
68# does not provide performance data for GHASH alone. But benchmarking
69# AES_GCM_encrypt ripped out of Fig. 15 of the white paper with aadt
70# alone resulted in 2.46 cycles per byte out of 16KB buffer. Note that
71# the result accounts even for pre-computing of degrees of the hash
72# key H, but its portion is negligible at 16KB buffer size.
73#
74# Moving on to the implementation in question. Tmod is estimated as
75# ~13 cycles and Naggr is 2, giving asymptotic performance of ...
76# 2.16. How is it possible that measured performance is better than
77# optimistic theoretical estimate? There is one thing Intel failed
78# to recognize. By serializing GHASH with CTR in same subroutine
79# former's performance is really limited to above (Tmul + Tmod/Naggr)
80# equation. But if GHASH procedure is detached, the modulo-reduction
81# can be interleaved with Naggr-1 multiplications at instruction level
82# and under ideal conditions even disappear from the equation. So that
83# optimistic theoretical estimate for this implementation is ...
84# 28/16=1.75, and not 2.16. Well, it's probably way too optimistic,
85# at least for such small Naggr. I'd argue that (28+Tproc/Naggr),
86# where Tproc is time required for Karatsuba pre- and post-processing,
87# is more realistic estimate. In this case it gives ... 1.91 cycles.
88# Or in other words, depending on how well we can interleave reduction
89# and one of the two multiplications, the performance should be between
90# 1.91 and 2.16. As already mentioned, this implementation processes
91# one byte out of 8KB buffer in 2.10 cycles, while x86_64 counterpart
92# - in 2.02. x86_64 performance is better, because larger register
93# bank allows to interleave reduction and multiplication better.
94#
95# Does it make sense to increase Naggr? To start with it's virtually
96# impossible in 32-bit mode, because of limited register bank
97# capacity. Otherwise improvement has to be weighed against slower
98# setup, as well as code size and complexity increase. As even
99# optimistic estimate doesn't promise 30% performance improvement,
100# there are currently no plans to increase Naggr.
101#
102# Special thanks to David Woodhouse <dwmw2@infradead.org> for
103# providing access to a Westmere-based system on behalf of Intel
104# Open Source Technology Centre.
105
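# [Editorial aside.] Plugging numbers into the (Tmul + Tmod/Naggr)/16
# accounting used above, with Tmul=28, reproduces the estimates quoted in the
# note; a sketch (the cycle figures are the note's own estimates, not new
# measurements):

printf "Intel,     Tmod~19, Naggr=4: %.2f\n", (28 + 19/4) / 16;	# ~2.05
printf "this code, Tmod~13, Naggr=2: %.2f\n", (28 + 13/2) / 16;	# ~2.16
printf "ideal interleaving:          %.2f\n",  28         / 16;	# 1.75
printf "with Tproc~5, Naggr=2:       %.2f\n", (28 +  5/2) / 16;	# ~1.91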
106# January 2010
107#
108# Tweaked to optimize transitions between integer and FP operations
109# on same XMM register, PCLMULQDQ subroutine was measured to process
110# one byte in 2.07 cycles on Sandy Bridge, and in 2.12 - on Westmere.
111# The minor regression on Westmere is outweighed by ~15% improvement
112# on Sandy Bridge. Strangely enough attempt to modify 64-bit code in
113# similar manner resulted in almost 20% degradation on Sandy Bridge,
114# where original 64-bit code processes one byte in 1.95 cycles.
115
116$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
117push(@INC,"${dir}","${dir}../../perlasm");
118require "x86asm.pl";
119
120&asm_init($ARGV[0],"ghash-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");
121
122$sse2=0;
123for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
124
125($Zhh,$Zhl,$Zlh,$Zll) = ("ebp","edx","ecx","ebx");
126$inp = "edi";
127$Htbl = "esi";
128
129$unroll = 0; # Affects x86 loop. Folded loop performs ~7% worse
130 # than unrolled, which has to be weighted against
131 # 2.5x x86-specific code size reduction.
132
133sub x86_loop {
134 my $off = shift;
135 my $rem = "eax";
136
137 &mov ($Zhh,&DWP(4,$Htbl,$Zll));
138 &mov ($Zhl,&DWP(0,$Htbl,$Zll));
139 &mov ($Zlh,&DWP(12,$Htbl,$Zll));
140 &mov ($Zll,&DWP(8,$Htbl,$Zll));
141 &xor ($rem,$rem); # avoid partial register stalls on PIII
142
143 # shrd practically kills P4, 2.5x deterioration, but P4 has
144 # MMX code-path to execute. shrd runs tad faster [than twice
145 # the shifts, move's and or's] on pre-MMX Pentium (as well as
146 # PIII and Core2), *but* minimizes code size, spares register
147 # and thus allows to fold the loop...
148 if (!$unroll) {
149 my $cnt = $inp;
150 &mov ($cnt,15);
151 &jmp (&label("x86_loop"));
152 &set_label("x86_loop",16);
153 for($i=1;$i<=2;$i++) {
154 &mov (&LB($rem),&LB($Zll));
155 &shrd ($Zll,$Zlh,4);
156 &and (&LB($rem),0xf);
157 &shrd ($Zlh,$Zhl,4);
158 &shrd ($Zhl,$Zhh,4);
159 &shr ($Zhh,4);
160 &xor ($Zhh,&DWP($off+16,"esp",$rem,4));
161
162 &mov (&LB($rem),&BP($off,"esp",$cnt));
163 if ($i&1) {
164 &and (&LB($rem),0xf0);
165 } else {
166 &shl (&LB($rem),4);
167 }
168
169 &xor ($Zll,&DWP(8,$Htbl,$rem));
170 &xor ($Zlh,&DWP(12,$Htbl,$rem));
171 &xor ($Zhl,&DWP(0,$Htbl,$rem));
172 &xor ($Zhh,&DWP(4,$Htbl,$rem));
173
174 if ($i&1) {
175 &dec ($cnt);
176 &js (&label("x86_break"));
177 } else {
178 &jmp (&label("x86_loop"));
179 }
180 }
181 &set_label("x86_break",16);
182 } else {
183 for($i=1;$i<32;$i++) {
184 &comment($i);
185 &mov (&LB($rem),&LB($Zll));
186 &shrd ($Zll,$Zlh,4);
187 &and (&LB($rem),0xf);
188 &shrd ($Zlh,$Zhl,4);
189 &shrd ($Zhl,$Zhh,4);
190 &shr ($Zhh,4);
191 &xor ($Zhh,&DWP($off+16,"esp",$rem,4));
192
193 if ($i&1) {
194 &mov (&LB($rem),&BP($off+15-($i>>1),"esp"));
195 &and (&LB($rem),0xf0);
196 } else {
197 &mov (&LB($rem),&BP($off+15-($i>>1),"esp"));
198 &shl (&LB($rem),4);
199 }
200
201 &xor ($Zll,&DWP(8,$Htbl,$rem));
202 &xor ($Zlh,&DWP(12,$Htbl,$rem));
203 &xor ($Zhl,&DWP(0,$Htbl,$rem));
204 &xor ($Zhh,&DWP(4,$Htbl,$rem));
205 }
206 }
207 &bswap ($Zll);
208 &bswap ($Zlh);
209 &bswap ($Zhl);
210 if (!$x86only) {
211 &bswap ($Zhh);
212 } else {
213 &mov ("eax",$Zhh);
214 &bswap ("eax");
215 &mov ($Zhh,"eax");
216 }
217}
218
219if ($unroll) {
220 &function_begin_B("_x86_gmult_4bit_inner");
221 &x86_loop(4);
222 &ret ();
223 &function_end_B("_x86_gmult_4bit_inner");
224}
225
226sub deposit_rem_4bit {
227 my $bias = shift;
228
229 &mov (&DWP($bias+0, "esp"),0x0000<<16);
230 &mov (&DWP($bias+4, "esp"),0x1C20<<16);
231 &mov (&DWP($bias+8, "esp"),0x3840<<16);
232 &mov (&DWP($bias+12,"esp"),0x2460<<16);
233 &mov (&DWP($bias+16,"esp"),0x7080<<16);
234 &mov (&DWP($bias+20,"esp"),0x6CA0<<16);
235 &mov (&DWP($bias+24,"esp"),0x48C0<<16);
236 &mov (&DWP($bias+28,"esp"),0x54E0<<16);
237 &mov (&DWP($bias+32,"esp"),0xE100<<16);
238 &mov (&DWP($bias+36,"esp"),0xFD20<<16);
239 &mov (&DWP($bias+40,"esp"),0xD940<<16);
240 &mov (&DWP($bias+44,"esp"),0xC560<<16);
241 &mov (&DWP($bias+48,"esp"),0x9180<<16);
242 &mov (&DWP($bias+52,"esp"),0x8DA0<<16);
243 &mov (&DWP($bias+56,"esp"),0xA9C0<<16);
244 &mov (&DWP($bias+60,"esp"),0xB5E0<<16);
245}
246
247$suffix = $x86only ? "" : "_x86";
248
249&function_begin("gcm_gmult_4bit".$suffix);
250 &stack_push(16+4+1); # +1 for stack alignment
251 &mov ($inp,&wparam(0)); # load Xi
252 &mov ($Htbl,&wparam(1)); # load Htable
253
254 &mov ($Zhh,&DWP(0,$inp)); # load Xi[16]
255 &mov ($Zhl,&DWP(4,$inp));
256 &mov ($Zlh,&DWP(8,$inp));
257 &mov ($Zll,&DWP(12,$inp));
258
259 &deposit_rem_4bit(16);
260
261 &mov (&DWP(0,"esp"),$Zhh); # copy Xi[16] on stack
262 &mov (&DWP(4,"esp"),$Zhl);
263 &mov (&DWP(8,"esp"),$Zlh);
264 &mov (&DWP(12,"esp"),$Zll);
265 &shr ($Zll,20);
266 &and ($Zll,0xf0);
267
268 if ($unroll) {
269 &call ("_x86_gmult_4bit_inner");
270 } else {
271 &x86_loop(0);
272 &mov ($inp,&wparam(0));
273 }
274
275 &mov (&DWP(12,$inp),$Zll);
276 &mov (&DWP(8,$inp),$Zlh);
277 &mov (&DWP(4,$inp),$Zhl);
278 &mov (&DWP(0,$inp),$Zhh);
279 &stack_pop(16+4+1);
280&function_end("gcm_gmult_4bit".$suffix);
281
282&function_begin("gcm_ghash_4bit".$suffix);
283 &stack_push(16+4+1); # +1 for 64-bit alignment
284 &mov ($Zll,&wparam(0)); # load Xi
285 &mov ($Htbl,&wparam(1)); # load Htable
286 &mov ($inp,&wparam(2)); # load in
287 &mov ("ecx",&wparam(3)); # load len
288 &add ("ecx",$inp);
289 &mov (&wparam(3),"ecx");
290
291 &mov ($Zhh,&DWP(0,$Zll)); # load Xi[16]
292 &mov ($Zhl,&DWP(4,$Zll));
293 &mov ($Zlh,&DWP(8,$Zll));
294 &mov ($Zll,&DWP(12,$Zll));
295
296 &deposit_rem_4bit(16);
297
298 &set_label("x86_outer_loop",16);
299 &xor ($Zll,&DWP(12,$inp)); # xor with input
300 &xor ($Zlh,&DWP(8,$inp));
301 &xor ($Zhl,&DWP(4,$inp));
302 &xor ($Zhh,&DWP(0,$inp));
303 &mov (&DWP(12,"esp"),$Zll); # dump it on stack
304 &mov (&DWP(8,"esp"),$Zlh);
305 &mov (&DWP(4,"esp"),$Zhl);
306 &mov (&DWP(0,"esp"),$Zhh);
307
308 &shr ($Zll,20);
309 &and ($Zll,0xf0);
310
311 if ($unroll) {
312 &call ("_x86_gmult_4bit_inner");
313 } else {
314 &x86_loop(0);
315 &mov ($inp,&wparam(2));
316 }
317 &lea ($inp,&DWP(16,$inp));
318 &cmp ($inp,&wparam(3));
319 &mov (&wparam(2),$inp) if (!$unroll);
320 &jb (&label("x86_outer_loop"));
321
322 &mov ($inp,&wparam(0)); # load Xi
323 &mov (&DWP(12,$inp),$Zll);
324 &mov (&DWP(8,$inp),$Zlh);
325 &mov (&DWP(4,$inp),$Zhl);
326 &mov (&DWP(0,$inp),$Zhh);
327 &stack_pop(16+4+1);
328&function_end("gcm_ghash_4bit".$suffix);
329
330if (!$x86only) {{{
331
332&static_label("rem_4bit");
333
334if (!$sse2) {{ # pure-MMX "May" version...
335
336$S=12; # shift factor for rem_4bit
337
338&function_begin_B("_mmx_gmult_4bit_inner");
339# MMX version performs 3.5 times better on P4 (see comment in non-MMX
340# routine for further details), 100% better on Opteron, ~70% better
341# on Core2 and PIII... In other words effort is considered to be well
342# spent... Since initial release the loop was unrolled in order to
343# "liberate" register previously used as loop counter. Instead it's
344# used to optimize critical path in 'Z.hi ^= rem_4bit[Z.lo&0xf]'.
345# The path involves move of Z.lo from MMX to integer register,
346# effective address calculation and finally merge of value to Z.hi.
347# Reference to rem_4bit is scheduled so late that I had to >>4
348# rem_4bit elements. This resulted in 20-45% improvement
349# on contemporary µ-archs.
350{
351 my $cnt;
352 my $rem_4bit = "eax";
353 my @rem = ($Zhh,$Zll);
354 my $nhi = $Zhl;
355 my $nlo = $Zlh;
356
357 my ($Zlo,$Zhi) = ("mm0","mm1");
358 my $tmp = "mm2";
359
360 &xor ($nlo,$nlo); # avoid partial register stalls on PIII
361 &mov ($nhi,$Zll);
362 &mov (&LB($nlo),&LB($nhi));
363 &shl (&LB($nlo),4);
364 &and ($nhi,0xf0);
365 &movq ($Zlo,&QWP(8,$Htbl,$nlo));
366 &movq ($Zhi,&QWP(0,$Htbl,$nlo));
367 &movd ($rem[0],$Zlo);
368
369 for ($cnt=28;$cnt>=-2;$cnt--) {
370 my $odd = $cnt&1;
371 my $nix = $odd ? $nlo : $nhi;
372
373 &shl (&LB($nlo),4) if ($odd);
374 &psrlq ($Zlo,4);
375 &movq ($tmp,$Zhi);
376 &psrlq ($Zhi,4);
377 &pxor ($Zlo,&QWP(8,$Htbl,$nix));
378 &mov (&LB($nlo),&BP($cnt/2,$inp)) if (!$odd && $cnt>=0);
379 &psllq ($tmp,60);
380 &and ($nhi,0xf0) if ($odd);
381 &pxor ($Zhi,&QWP(0,$rem_4bit,$rem[1],8)) if ($cnt<28);
382 &and ($rem[0],0xf);
383 &pxor ($Zhi,&QWP(0,$Htbl,$nix));
384 &mov ($nhi,$nlo) if (!$odd && $cnt>=0);
385 &movd ($rem[1],$Zlo);
386 &pxor ($Zlo,$tmp);
387
388 push (@rem,shift(@rem)); # "rotate" registers
389 }
390
391 &mov ($inp,&DWP(4,$rem_4bit,$rem[1],8)); # last rem_4bit[rem]
392
393 &psrlq ($Zlo,32); # lower part of Zlo is already there
394 &movd ($Zhl,$Zhi);
395 &psrlq ($Zhi,32);
396 &movd ($Zlh,$Zlo);
397 &movd ($Zhh,$Zhi);
398 &shl ($inp,4); # compensate for rem_4bit[i] being >>4
399
400 &bswap ($Zll);
401 &bswap ($Zhl);
402 &bswap ($Zlh);
403 &xor ($Zhh,$inp);
404 &bswap ($Zhh);
405
406 &ret ();
407}
408&function_end_B("_mmx_gmult_4bit_inner");
409
410&function_begin("gcm_gmult_4bit_mmx");
411 &mov ($inp,&wparam(0)); # load Xi
412 &mov ($Htbl,&wparam(1)); # load Htable
413
414 &call (&label("pic_point"));
415 &set_label("pic_point");
416 &blindpop("eax");
417 &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
418
419 &movz ($Zll,&BP(15,$inp));
420
421 &call ("_mmx_gmult_4bit_inner");
422
423 &mov ($inp,&wparam(0)); # load Xi
424 &emms ();
425 &mov (&DWP(12,$inp),$Zll);
426 &mov (&DWP(4,$inp),$Zhl);
427 &mov (&DWP(8,$inp),$Zlh);
428 &mov (&DWP(0,$inp),$Zhh);
429&function_end("gcm_gmult_4bit_mmx");
430
431# Streamed version performs 20% better on P4, 7% on Opteron,
432# 10% on Core2 and PIII...
433&function_begin("gcm_ghash_4bit_mmx");
434 &mov ($Zhh,&wparam(0)); # load Xi
435 &mov ($Htbl,&wparam(1)); # load Htable
436 &mov ($inp,&wparam(2)); # load in
437 &mov ($Zlh,&wparam(3)); # load len
438
439 &call (&label("pic_point"));
440 &set_label("pic_point");
441 &blindpop("eax");
442 &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
443
444 &add ($Zlh,$inp);
445 &mov (&wparam(3),$Zlh); # len to point at the end of input
446 &stack_push(4+1); # +1 for stack alignment
447
448 &mov ($Zll,&DWP(12,$Zhh)); # load Xi[16]
449 &mov ($Zhl,&DWP(4,$Zhh));
450 &mov ($Zlh,&DWP(8,$Zhh));
451 &mov ($Zhh,&DWP(0,$Zhh));
452 &jmp (&label("mmx_outer_loop"));
453
454 &set_label("mmx_outer_loop",16);
455 &xor ($Zll,&DWP(12,$inp));
456 &xor ($Zhl,&DWP(4,$inp));
457 &xor ($Zlh,&DWP(8,$inp));
458 &xor ($Zhh,&DWP(0,$inp));
459 &mov (&wparam(2),$inp);
460 &mov (&DWP(12,"esp"),$Zll);
461 &mov (&DWP(4,"esp"),$Zhl);
462 &mov (&DWP(8,"esp"),$Zlh);
463 &mov (&DWP(0,"esp"),$Zhh);
464
465 &mov ($inp,"esp");
466 &shr ($Zll,24);
467
468 &call ("_mmx_gmult_4bit_inner");
469
470 &mov ($inp,&wparam(2));
471 &lea ($inp,&DWP(16,$inp));
472 &cmp ($inp,&wparam(3));
473 &jb (&label("mmx_outer_loop"));
474
475 &mov ($inp,&wparam(0)); # load Xi
476 &emms ();
477 &mov (&DWP(12,$inp),$Zll);
478 &mov (&DWP(4,$inp),$Zhl);
479 &mov (&DWP(8,$inp),$Zlh);
480 &mov (&DWP(0,$inp),$Zhh);
481
482 &stack_pop(4+1);
483&function_end("gcm_ghash_4bit_mmx");
484
485}} else {{ # "June" MMX version...
486 # ... has slower "April" gcm_gmult_4bit_mmx with folded
487 # loop. This is done to conserve code size...
488$S=16; # shift factor for rem_4bit
489
490sub mmx_loop() {
491# MMX version performs 2.8 times better on P4 (see comment in non-MMX
492# routine for further details), 40% better on Opteron and Core2, 50%
493# better on PIII... In other words effort is considered to be well
494# spent...
495 my $inp = shift;
496 my $rem_4bit = shift;
497 my $cnt = $Zhh;
498 my $nhi = $Zhl;
499 my $nlo = $Zlh;
500 my $rem = $Zll;
501
502 my ($Zlo,$Zhi) = ("mm0","mm1");
503 my $tmp = "mm2";
504
505 &xor ($nlo,$nlo); # avoid partial register stalls on PIII
506 &mov ($nhi,$Zll);
507 &mov (&LB($nlo),&LB($nhi));
508 &mov ($cnt,14);
509 &shl (&LB($nlo),4);
510 &and ($nhi,0xf0);
511 &movq ($Zlo,&QWP(8,$Htbl,$nlo));
512 &movq ($Zhi,&QWP(0,$Htbl,$nlo));
513 &movd ($rem,$Zlo);
514 &jmp (&label("mmx_loop"));
515
516 &set_label("mmx_loop",16);
517 &psrlq ($Zlo,4);
518 &and ($rem,0xf);
519 &movq ($tmp,$Zhi);
520 &psrlq ($Zhi,4);
521 &pxor ($Zlo,&QWP(8,$Htbl,$nhi));
522 &mov (&LB($nlo),&BP(0,$inp,$cnt));
523 &psllq ($tmp,60);
524 &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
525 &dec ($cnt);
526 &movd ($rem,$Zlo);
527 &pxor ($Zhi,&QWP(0,$Htbl,$nhi));
528 &mov ($nhi,$nlo);
529 &pxor ($Zlo,$tmp);
530 &js (&label("mmx_break"));
531
532 &shl (&LB($nlo),4);
533 &and ($rem,0xf);
534 &psrlq ($Zlo,4);
535 &and ($nhi,0xf0);
536 &movq ($tmp,$Zhi);
537 &psrlq ($Zhi,4);
538 &pxor ($Zlo,&QWP(8,$Htbl,$nlo));
539 &psllq ($tmp,60);
540 &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
541 &movd ($rem,$Zlo);
542 &pxor ($Zhi,&QWP(0,$Htbl,$nlo));
543 &pxor ($Zlo,$tmp);
544 &jmp (&label("mmx_loop"));
545
546 &set_label("mmx_break",16);
547 &shl (&LB($nlo),4);
548 &and ($rem,0xf);
549 &psrlq ($Zlo,4);
550 &and ($nhi,0xf0);
551 &movq ($tmp,$Zhi);
552 &psrlq ($Zhi,4);
553 &pxor ($Zlo,&QWP(8,$Htbl,$nlo));
554 &psllq ($tmp,60);
555 &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
556 &movd ($rem,$Zlo);
557 &pxor ($Zhi,&QWP(0,$Htbl,$nlo));
558 &pxor ($Zlo,$tmp);
559
560 &psrlq ($Zlo,4);
561 &and ($rem,0xf);
562 &movq ($tmp,$Zhi);
563 &psrlq ($Zhi,4);
564 &pxor ($Zlo,&QWP(8,$Htbl,$nhi));
565 &psllq ($tmp,60);
566 &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
567 &movd ($rem,$Zlo);
568 &pxor ($Zhi,&QWP(0,$Htbl,$nhi));
569 &pxor ($Zlo,$tmp);
570
571 &psrlq ($Zlo,32); # lower part of Zlo is already there
572 &movd ($Zhl,$Zhi);
573 &psrlq ($Zhi,32);
574 &movd ($Zlh,$Zlo);
575 &movd ($Zhh,$Zhi);
576
577 &bswap ($Zll);
578 &bswap ($Zhl);
579 &bswap ($Zlh);
580 &bswap ($Zhh);
581}
582
583&function_begin("gcm_gmult_4bit_mmx");
584 &mov ($inp,&wparam(0)); # load Xi
585 &mov ($Htbl,&wparam(1)); # load Htable
586
587 &call (&label("pic_point"));
588 &set_label("pic_point");
589 &blindpop("eax");
590 &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
591
592 &movz ($Zll,&BP(15,$inp));
593
594 &mmx_loop($inp,"eax");
595
596 &emms ();
597 &mov (&DWP(12,$inp),$Zll);
598 &mov (&DWP(4,$inp),$Zhl);
599 &mov (&DWP(8,$inp),$Zlh);
600 &mov (&DWP(0,$inp),$Zhh);
601&function_end("gcm_gmult_4bit_mmx");
602
603######################################################################
604# Below subroutine is "528B" variant of "4-bit" GCM GHASH function
605# (see gcm128.c for details). It provides further 20-40% performance
606# improvement over above mentioned "May" version.
607
608&static_label("rem_8bit");
609
610&function_begin("gcm_ghash_4bit_mmx");
611{ my ($Zlo,$Zhi) = ("mm7","mm6");
612 my $rem_8bit = "esi";
613 my $Htbl = "ebx";
614
615 # parameter block
616 &mov ("eax",&wparam(0)); # Xi
617 &mov ("ebx",&wparam(1)); # Htable
618 &mov ("ecx",&wparam(2)); # inp
619 &mov ("edx",&wparam(3)); # len
620 &mov ("ebp","esp"); # original %esp
621 &call (&label("pic_point"));
622 &set_label ("pic_point");
623 &blindpop ($rem_8bit);
624 &lea ($rem_8bit,&DWP(&label("rem_8bit")."-".&label("pic_point"),$rem_8bit));
625
626 &sub ("esp",512+16+16); # allocate stack frame...
627 &and ("esp",-64); # ...and align it
628 &sub ("esp",16); # place for (u8)(H[]<<4)
629
630 &add ("edx","ecx"); # pointer to the end of input
631 &mov (&DWP(528+16+0,"esp"),"eax"); # save Xi
632 &mov (&DWP(528+16+8,"esp"),"edx"); # save inp+len
633 &mov (&DWP(528+16+12,"esp"),"ebp"); # save original %esp
634
635 { my @lo = ("mm0","mm1","mm2");
636 my @hi = ("mm3","mm4","mm5");
637 my @tmp = ("mm6","mm7");
638 my ($off1,$off2,$i) = (0,0,);
639
640 &add ($Htbl,128); # optimize for size
641 &lea ("edi",&DWP(16+128,"esp"));
642 &lea ("ebp",&DWP(16+256+128,"esp"));
643
644 # decompose Htable (low and high parts are kept separately),
645 # generate Htable[]>>4, (u8)(Htable[]<<4), save to stack...
646 for ($i=0;$i<18;$i++) {
647
648 &mov ("edx",&DWP(16*$i+8-128,$Htbl)) if ($i<16);
649 &movq ($lo[0],&QWP(16*$i+8-128,$Htbl)) if ($i<16);
650 &psllq ($tmp[1],60) if ($i>1);
651 &movq ($hi[0],&QWP(16*$i+0-128,$Htbl)) if ($i<16);
652 &por ($lo[2],$tmp[1]) if ($i>1);
653 &movq (&QWP($off1-128,"edi"),$lo[1]) if ($i>0 && $i<17);
654 &psrlq ($lo[1],4) if ($i>0 && $i<17);
655 &movq (&QWP($off1,"edi"),$hi[1]) if ($i>0 && $i<17);
656 &movq ($tmp[0],$hi[1]) if ($i>0 && $i<17);
657 &movq (&QWP($off2-128,"ebp"),$lo[2]) if ($i>1);
658 &psrlq ($hi[1],4) if ($i>0 && $i<17);
659 &movq (&QWP($off2,"ebp"),$hi[2]) if ($i>1);
660 &shl ("edx",4) if ($i<16);
661 &mov (&BP($i,"esp"),&LB("edx")) if ($i<16);
662
663 unshift (@lo,pop(@lo)); # "rotate" registers
664 unshift (@hi,pop(@hi));
665 unshift (@tmp,pop(@tmp));
666 $off1 += 8 if ($i>0);
667 $off2 += 8 if ($i>1);
668 }
669 }
670
671 &movq ($Zhi,&QWP(0,"eax"));
672 &mov ("ebx",&DWP(8,"eax"));
673 &mov ("edx",&DWP(12,"eax")); # load Xi
674
675&set_label("outer",16);
676 { my $nlo = "eax";
677 my $dat = "edx";
678 my @nhi = ("edi","ebp");
679 my @rem = ("ebx","ecx");
680 my @red = ("mm0","mm1","mm2");
681 my $tmp = "mm3";
682
683 &xor ($dat,&DWP(12,"ecx")); # merge input data
684 &xor ("ebx",&DWP(8,"ecx"));
685 &pxor ($Zhi,&QWP(0,"ecx"));
686 &lea ("ecx",&DWP(16,"ecx")); # inp+=16
687 #&mov (&DWP(528+12,"esp"),$dat); # save inp^Xi
688 &mov (&DWP(528+8,"esp"),"ebx");
689 &movq (&QWP(528+0,"esp"),$Zhi);
690 &mov (&DWP(528+16+4,"esp"),"ecx"); # save inp
691
692 &xor ($nlo,$nlo);
693 &rol ($dat,8);
694 &mov (&LB($nlo),&LB($dat));
695 &mov ($nhi[1],$nlo);
696 &and (&LB($nlo),0x0f);
697 &shr ($nhi[1],4);
698 &pxor ($red[0],$red[0]);
699 &rol ($dat,8); # next byte
700 &pxor ($red[1],$red[1]);
701 &pxor ($red[2],$red[2]);
702
703#	Just like in "May" version, modulo-schedule for critical path in
704 # 'Z.hi ^= rem_8bit[Z.lo&0xff^((u8)H[nhi]<<4)]<<48'. Final 'pxor'
705 # is scheduled so late that rem_8bit[] has to be shifted *right*
706 # by 16, which is why last argument to pinsrw is 2, which
707 # corresponds to <<32=<<48>>16...
708 for ($j=11,$i=0;$i<15;$i++) {
709
710 if ($i>0) {
711 &pxor ($Zlo,&QWP(16,"esp",$nlo,8)); # Z^=H[nlo]
712 &rol ($dat,8); # next byte
713 &pxor ($Zhi,&QWP(16+128,"esp",$nlo,8));
714
715 &pxor ($Zlo,$tmp);
716 &pxor ($Zhi,&QWP(16+256+128,"esp",$nhi[0],8));
717 &xor (&LB($rem[1]),&BP(0,"esp",$nhi[0])); # rem^(H[nhi]<<4)
718 } else {
719 &movq ($Zlo,&QWP(16,"esp",$nlo,8));
720 &movq ($Zhi,&QWP(16+128,"esp",$nlo,8));
721 }
722
723 &mov (&LB($nlo),&LB($dat));
724 &mov ($dat,&DWP(528+$j,"esp")) if (--$j%4==0);
725
726 &movd ($rem[0],$Zlo);
727 &movz ($rem[1],&LB($rem[1])) if ($i>0);
728 &psrlq ($Zlo,8); # Z>>=8
729
730 &movq ($tmp,$Zhi);
731 &mov ($nhi[0],$nlo);
732 &psrlq ($Zhi,8);
733
734 &pxor ($Zlo,&QWP(16+256+0,"esp",$nhi[1],8)); # Z^=H[nhi]>>4
735 &and (&LB($nlo),0x0f);
736 &psllq ($tmp,56);
737
738 &pxor ($Zhi,$red[1]) if ($i>1);
739 &shr ($nhi[0],4);
740 &pinsrw ($red[0],&WP(0,$rem_8bit,$rem[1],2),2) if ($i>0);
741
742 unshift (@red,pop(@red)); # "rotate" registers
743 unshift (@rem,pop(@rem));
744 unshift (@nhi,pop(@nhi));
745 }
746
747 &pxor ($Zlo,&QWP(16,"esp",$nlo,8)); # Z^=H[nlo]
748 &pxor ($Zhi,&QWP(16+128,"esp",$nlo,8));
749 &xor (&LB($rem[1]),&BP(0,"esp",$nhi[0])); # rem^(H[nhi]<<4)
750
751 &pxor ($Zlo,$tmp);
752 &pxor ($Zhi,&QWP(16+256+128,"esp",$nhi[0],8));
753 &movz ($rem[1],&LB($rem[1]));
754
755 &pxor ($red[2],$red[2]); # clear 2nd word
756 &psllq ($red[1],4);
757
758 &movd ($rem[0],$Zlo);
759 &psrlq ($Zlo,4); # Z>>=4
760
761 &movq ($tmp,$Zhi);
762 &psrlq ($Zhi,4);
763 &shl ($rem[0],4); # rem<<4
764
765 &pxor ($Zlo,&QWP(16,"esp",$nhi[1],8)); # Z^=H[nhi]
766 &psllq ($tmp,60);
767 &movz ($rem[0],&LB($rem[0]));
768
769 &pxor ($Zlo,$tmp);
770 &pxor ($Zhi,&QWP(16+128,"esp",$nhi[1],8));
771
772 &pinsrw ($red[0],&WP(0,$rem_8bit,$rem[1],2),2);
773 &pxor ($Zhi,$red[1]);
774
775 &movd ($dat,$Zlo);
776 &pinsrw ($red[2],&WP(0,$rem_8bit,$rem[0],2),3); # last is <<48
777
778 &psllq ($red[0],12); # correct by <<16>>4
779 &pxor ($Zhi,$red[0]);
780 &psrlq ($Zlo,32);
781 &pxor ($Zhi,$red[2]);
782
783 &mov ("ecx",&DWP(528+16+4,"esp")); # restore inp
784 &movd ("ebx",$Zlo);
785 &movq ($tmp,$Zhi); # 01234567
786 &psllw ($Zhi,8); # 1.3.5.7.
787 &psrlw ($tmp,8); # .0.2.4.6
788 &por ($Zhi,$tmp); # 10325476
789 &bswap ($dat);
790 &pshufw ($Zhi,$Zhi,0b00011011); # 76543210
791 &bswap ("ebx");
792
793 &cmp ("ecx",&DWP(528+16+8,"esp")); # are we done?
794 &jne (&label("outer"));
795 }
796
797 &mov ("eax",&DWP(528+16+0,"esp")); # restore Xi
798 &mov (&DWP(12,"eax"),"edx");
799 &mov (&DWP(8,"eax"),"ebx");
800 &movq (&QWP(0,"eax"),$Zhi);
801
802 &mov ("esp",&DWP(528+16+12,"esp")); # restore original %esp
803 &emms ();
804}
805&function_end("gcm_ghash_4bit_mmx");
806}}
807
808if ($sse2) {{
809######################################################################
810# PCLMULQDQ version.
811
812$Xip="eax";
813$Htbl="edx";
814$const="ecx";
815$inp="esi";
816$len="ebx";
817
818($Xi,$Xhi)=("xmm0","xmm1"); $Hkey="xmm2";
819($T1,$T2,$T3)=("xmm3","xmm4","xmm5");
820($Xn,$Xhn)=("xmm6","xmm7");
821
822&static_label("bswap");
823
824sub clmul64x64_T2 { # minimal "register" pressure
825my ($Xhi,$Xi,$Hkey)=@_;
826
827 &movdqa ($Xhi,$Xi); #
828 &pshufd ($T1,$Xi,0b01001110);
829 &pshufd ($T2,$Hkey,0b01001110);
830 &pxor ($T1,$Xi); #
831 &pxor ($T2,$Hkey);
832
833 &pclmulqdq ($Xi,$Hkey,0x00); #######
834 &pclmulqdq ($Xhi,$Hkey,0x11); #######
835 &pclmulqdq ($T1,$T2,0x00); #######
836 &xorps ($T1,$Xi); #
837 &xorps ($T1,$Xhi); #
838
839 &movdqa ($T2,$T1); #
840 &psrldq ($T1,8);
841 &pslldq ($T2,8); #
842 &pxor ($Xhi,$T1);
843 &pxor ($Xi,$T2); #
844}
845
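# [Editorial aside, not part of the original module.] The three pclmulqdq
# calls above are one-level Karatsuba over GF(2): for A = a1:a0 and B = b1:b0
# the middle 128-bit partial is (a1^a0)*(b1^b0) ^ a1*b1 ^ a0*b0, which equals
# the two cross products a1*b0 ^ a0*b1 ("*" meaning carry-less multiplication).
# A toy check on 8-bit halves (the helper name and the values are made up):

sub _clmul8 {
    my ($a, $b) = @_;
    my $r = 0;
    $r ^= $b << $_ for grep { ($a >> $_) & 1 } 0 .. 7;	# carry-less multiply
    return $r;
}
my ($a1, $a0, $b1, $b0) = (0xAB, 0xCD, 0x12, 0x34);
my $mid   = _clmul8($a1 ^ $a0, $b1 ^ $b0) ^ _clmul8($a1, $b1) ^ _clmul8($a0, $b0);
my $cross = _clmul8($a1, $b0) ^ _clmul8($a0, $b1);
die "Karatsuba identity broken" unless $mid == $cross;	# holds for any halves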
846sub clmul64x64_T3 {
847# Even though this subroutine offers visually better ILP, it
848# was empirically found to be a tad slower than above version.
849# At least in gcm_ghash_clmul context. But it's just as well,
850# because loop modulo-scheduling is possible only thanks to
851# minimized "register" pressure...
852my ($Xhi,$Xi,$Hkey)=@_;
853
854 &movdqa ($T1,$Xi); #
855 &movdqa ($Xhi,$Xi);
856 &pclmulqdq ($Xi,$Hkey,0x00); #######
857 &pclmulqdq ($Xhi,$Hkey,0x11); #######
858 &pshufd ($T2,$T1,0b01001110); #
859 &pshufd ($T3,$Hkey,0b01001110);
860 &pxor ($T2,$T1); #
861 &pxor ($T3,$Hkey);
862 &pclmulqdq ($T2,$T3,0x00); #######
863 &pxor ($T2,$Xi); #
864 &pxor ($T2,$Xhi); #
865
866 &movdqa ($T3,$T2); #
867 &psrldq ($T2,8);
868 &pslldq ($T3,8); #
869 &pxor ($Xhi,$T2);
870 &pxor ($Xi,$T3); #
871}
872
873if (1) { # Algorithm 9 with <<1 twist.
874 # Reduction is shorter and uses only two
875 # temporary registers, which makes it better
876 # candidate for interleaving with 64x64
877 # multiplication. Pre-modulo-scheduled loop
878 # was found to be ~20% faster than Algorithm 5
879 # below. Algorithm 9 was therefore chosen for
880 # further optimization...
881
882sub reduction_alg9 { # 17/13 times faster than Intel version
883my ($Xhi,$Xi) = @_;
884
885 # 1st phase
886 &movdqa ($T1,$Xi); #
887 &psllq ($Xi,1);
888 &pxor ($Xi,$T1); #
889 &psllq ($Xi,5); #
890 &pxor ($Xi,$T1); #
891 &psllq ($Xi,57); #
892 &movdqa ($T2,$Xi); #
893 &pslldq ($Xi,8);
894 &psrldq ($T2,8); #
895 &pxor ($Xi,$T1);
896 &pxor ($Xhi,$T2); #
897
898 # 2nd phase
899 &movdqa ($T2,$Xi);
900 &psrlq ($Xi,5);
901 &pxor ($Xi,$T2); #
902 &psrlq ($Xi,1); #
903 &pxor ($Xi,$T2); #
904 &pxor ($T2,$Xhi);
905 &psrlq ($Xi,1); #
906 &pxor ($Xi,$T2); #
907}
908
909&function_begin_B("gcm_init_clmul");
910 &mov ($Htbl,&wparam(0));
911 &mov ($Xip,&wparam(1));
912
913 &call (&label("pic"));
914&set_label("pic");
915 &blindpop ($const);
916 &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
917
918 &movdqu ($Hkey,&QWP(0,$Xip));
919 &pshufd ($Hkey,$Hkey,0b01001110);# dword swap
920
921 # <<1 twist
922 &pshufd ($T2,$Hkey,0b11111111); # broadcast uppermost dword
923 &movdqa ($T1,$Hkey);
924 &psllq ($Hkey,1);
925 &pxor ($T3,$T3); #
926 &psrlq ($T1,63);
927 &pcmpgtd ($T3,$T2); # broadcast carry bit
928 &pslldq ($T1,8);
929 &por ($Hkey,$T1); # H<<=1
930
931 # magic reduction
932 &pand ($T3,&QWP(16,$const)); # 0x1c2_polynomial
933 &pxor ($Hkey,$T3); # if(carry) H^=0x1c2_polynomial
934
935 # calculate H^2
936 &movdqa ($Xi,$Hkey);
937 &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
938 &reduction_alg9 ($Xhi,$Xi);
939
940 &movdqu (&QWP(0,$Htbl),$Hkey); # save H
941 &movdqu (&QWP(16,$Htbl),$Xi); # save H^2
942
943 &ret ();
944&function_end_B("gcm_init_clmul");
945
946&function_begin_B("gcm_gmult_clmul");
947 &mov ($Xip,&wparam(0));
948 &mov ($Htbl,&wparam(1));
949
950 &call (&label("pic"));
951&set_label("pic");
952 &blindpop ($const);
953 &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
954
955 &movdqu ($Xi,&QWP(0,$Xip));
956 &movdqa ($T3,&QWP(0,$const));
957 &movups ($Hkey,&QWP(0,$Htbl));
958 &pshufb ($Xi,$T3);
959
960 &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
961 &reduction_alg9 ($Xhi,$Xi);
962
963 &pshufb ($Xi,$T3);
964 &movdqu (&QWP(0,$Xip),$Xi);
965
966 &ret ();
967&function_end_B("gcm_gmult_clmul");
968
969&function_begin("gcm_ghash_clmul");
970 &mov ($Xip,&wparam(0));
971 &mov ($Htbl,&wparam(1));
972 &mov ($inp,&wparam(2));
973 &mov ($len,&wparam(3));
974
975 &call (&label("pic"));
976&set_label("pic");
977 &blindpop ($const);
978 &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
979
980 &movdqu ($Xi,&QWP(0,$Xip));
981 &movdqa ($T3,&QWP(0,$const));
982 &movdqu ($Hkey,&QWP(0,$Htbl));
983 &pshufb ($Xi,$T3);
984
985 &sub ($len,0x10);
986 &jz (&label("odd_tail"));
987
988 #######
989 # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
990 # [(H*Ii+1) + (H*Xi+1)] mod P =
991 # [(H*Ii+1) + H^2*(Ii+Xi)] mod P
992 #
993 &movdqu ($T1,&QWP(0,$inp)); # Ii
994 &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
995 &pshufb ($T1,$T3);
996 &pshufb ($Xn,$T3);
997 &pxor ($Xi,$T1); # Ii+Xi
998
999 &clmul64x64_T2 ($Xhn,$Xn,$Hkey); # H*Ii+1
1000 &movups ($Hkey,&QWP(16,$Htbl)); # load H^2
1001
1002 &lea ($inp,&DWP(32,$inp)); # i+=2
1003 &sub ($len,0x20);
1004 &jbe (&label("even_tail"));
1005
1006&set_label("mod_loop");
1007 &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
1008 &movdqu ($T1,&QWP(0,$inp)); # Ii
1009 &movups ($Hkey,&QWP(0,$Htbl)); # load H
1010
1011 &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
1012 &pxor ($Xhi,$Xhn);
1013
1014 &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
1015 &pshufb ($T1,$T3);
1016 &pshufb ($Xn,$T3);
1017
1018 &movdqa ($T3,$Xn); #&clmul64x64_TX ($Xhn,$Xn,$Hkey); H*Ii+1
1019 &movdqa ($Xhn,$Xn);
1020 &pxor ($Xhi,$T1); # "Ii+Xi", consume early
1021
1022 &movdqa ($T1,$Xi); #&reduction_alg9($Xhi,$Xi); 1st phase
1023 &psllq ($Xi,1);
1024 &pxor ($Xi,$T1); #
1025 &psllq ($Xi,5); #
1026 &pxor ($Xi,$T1); #
1027 &pclmulqdq ($Xn,$Hkey,0x00); #######
1028 &psllq ($Xi,57); #
1029 &movdqa ($T2,$Xi); #
1030 &pslldq ($Xi,8);
1031 &psrldq ($T2,8); #
1032 &pxor ($Xi,$T1);
1033 &pshufd ($T1,$T3,0b01001110);
1034 &pxor ($Xhi,$T2); #
1035 &pxor ($T1,$T3);
1036 &pshufd ($T3,$Hkey,0b01001110);
1037 &pxor ($T3,$Hkey); #
1038
1039 &pclmulqdq ($Xhn,$Hkey,0x11); #######
1040 &movdqa ($T2,$Xi); # 2nd phase
1041 &psrlq ($Xi,5);
1042 &pxor ($Xi,$T2); #
1043 &psrlq ($Xi,1); #
1044 &pxor ($Xi,$T2); #
1045 &pxor ($T2,$Xhi);
1046 &psrlq ($Xi,1); #
1047 &pxor ($Xi,$T2); #
1048
1049 &pclmulqdq ($T1,$T3,0x00); #######
1050 &movups ($Hkey,&QWP(16,$Htbl)); # load H^2
1051 &xorps ($T1,$Xn); #
1052 &xorps ($T1,$Xhn); #
1053
1054 &movdqa ($T3,$T1); #
1055 &psrldq ($T1,8);
1056 &pslldq ($T3,8); #
1057 &pxor ($Xhn,$T1);
1058 &pxor ($Xn,$T3); #
1059 &movdqa ($T3,&QWP(0,$const));
1060
1061 &lea ($inp,&DWP(32,$inp));
1062 &sub ($len,0x20);
1063 &ja (&label("mod_loop"));
1064
1065&set_label("even_tail");
1066 &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
1067
1068 &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
1069 &pxor ($Xhi,$Xhn);
1070
1071 &reduction_alg9 ($Xhi,$Xi);
1072
1073 &test ($len,$len);
1074 &jnz (&label("done"));
1075
1076 &movups ($Hkey,&QWP(0,$Htbl)); # load H
1077&set_label("odd_tail");
1078 &movdqu ($T1,&QWP(0,$inp)); # Ii
1079 &pshufb ($T1,$T3);
1080 &pxor ($Xi,$T1); # Ii+Xi
1081
1082 &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
1083 &reduction_alg9 ($Xhi,$Xi);
1084
1085&set_label("done");
1086 &pshufb ($Xi,$T3);
1087 &movdqu (&QWP(0,$Xip),$Xi);
1088&function_end("gcm_ghash_clmul");
1089
1090} else {		# Algorithm 5. Kept for reference purposes.
1091
1092sub reduction_alg5 { # 19/16 times faster than Intel version
1093my ($Xhi,$Xi)=@_;
1094
1095 # <<1
1096 &movdqa ($T1,$Xi); #
1097 &movdqa ($T2,$Xhi);
1098 &pslld ($Xi,1);
1099 &pslld ($Xhi,1); #
1100 &psrld ($T1,31);
1101 &psrld ($T2,31); #
1102 &movdqa ($T3,$T1);
1103 &pslldq ($T1,4);
1104 &psrldq ($T3,12); #
1105 &pslldq ($T2,4);
1106 &por ($Xhi,$T3); #
1107 &por ($Xi,$T1);
1108 &por ($Xhi,$T2); #
1109
1110 # 1st phase
1111 &movdqa ($T1,$Xi);
1112 &movdqa ($T2,$Xi);
1113 &movdqa ($T3,$Xi); #
1114 &pslld ($T1,31);
1115 &pslld ($T2,30);
1116 &pslld ($Xi,25); #
1117 &pxor ($T1,$T2);
1118 &pxor ($T1,$Xi); #
1119 &movdqa ($T2,$T1); #
1120 &pslldq ($T1,12);
1121 &psrldq ($T2,4); #
1122 &pxor ($T3,$T1);
1123
1124 # 2nd phase
1125 &pxor ($Xhi,$T3); #
1126 &movdqa ($Xi,$T3);
1127 &movdqa ($T1,$T3);
1128 &psrld ($Xi,1); #
1129 &psrld ($T1,2);
1130 &psrld ($T3,7); #
1131 &pxor ($Xi,$T1);
1132 &pxor ($Xhi,$T2);
1133 &pxor ($Xi,$T3); #
1134 &pxor ($Xi,$Xhi); #
1135}
1136
1137&function_begin_B("gcm_init_clmul");
1138 &mov ($Htbl,&wparam(0));
1139 &mov ($Xip,&wparam(1));
1140
1141 &call (&label("pic"));
1142&set_label("pic");
1143 &blindpop ($const);
1144 &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
1145
1146 &movdqu ($Hkey,&QWP(0,$Xip));
1147 &pshufd ($Hkey,$Hkey,0b01001110);# dword swap
1148
1149 # calculate H^2
1150 &movdqa ($Xi,$Hkey);
1151 &clmul64x64_T3 ($Xhi,$Xi,$Hkey);
1152 &reduction_alg5 ($Xhi,$Xi);
1153
1154 &movdqu (&QWP(0,$Htbl),$Hkey); # save H
1155 &movdqu (&QWP(16,$Htbl),$Xi); # save H^2
1156
1157 &ret ();
1158&function_end_B("gcm_init_clmul");
1159
1160&function_begin_B("gcm_gmult_clmul");
1161 &mov ($Xip,&wparam(0));
1162 &mov ($Htbl,&wparam(1));
1163
1164 &call (&label("pic"));
1165&set_label("pic");
1166 &blindpop ($const);
1167 &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
1168
1169 &movdqu ($Xi,&QWP(0,$Xip));
1170 &movdqa ($Xn,&QWP(0,$const));
1171 &movdqu ($Hkey,&QWP(0,$Htbl));
1172 &pshufb ($Xi,$Xn);
1173
1174 &clmul64x64_T3 ($Xhi,$Xi,$Hkey);
1175 &reduction_alg5 ($Xhi,$Xi);
1176
1177 &pshufb ($Xi,$Xn);
1178 &movdqu (&QWP(0,$Xip),$Xi);
1179
1180 &ret ();
1181&function_end_B("gcm_gmult_clmul");
1182
1183&function_begin("gcm_ghash_clmul");
1184 &mov ($Xip,&wparam(0));
1185 &mov ($Htbl,&wparam(1));
1186 &mov ($inp,&wparam(2));
1187 &mov ($len,&wparam(3));
1188
1189 &call (&label("pic"));
1190&set_label("pic");
1191 &blindpop ($const);
1192 &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
1193
1194 &movdqu ($Xi,&QWP(0,$Xip));
1195 &movdqa ($T3,&QWP(0,$const));
1196 &movdqu ($Hkey,&QWP(0,$Htbl));
1197 &pshufb ($Xi,$T3);
1198
1199 &sub ($len,0x10);
1200 &jz (&label("odd_tail"));
1201
1202 #######
1203 # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
1204 # [(H*Ii+1) + (H*Xi+1)] mod P =
1205 # [(H*Ii+1) + H^2*(Ii+Xi)] mod P
1206 #
1207 &movdqu ($T1,&QWP(0,$inp)); # Ii
1208 &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
1209 &pshufb ($T1,$T3);
1210 &pshufb ($Xn,$T3);
1211 &pxor ($Xi,$T1); # Ii+Xi
1212
1213 &clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1
1214 &movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2
1215
1216 &sub ($len,0x20);
1217 &lea ($inp,&DWP(32,$inp)); # i+=2
1218 &jbe (&label("even_tail"));
1219
1220&set_label("mod_loop");
1221 &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
1222 &movdqu ($Hkey,&QWP(0,$Htbl)); # load H
1223
1224 &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
1225 &pxor ($Xhi,$Xhn);
1226
1227 &reduction_alg5 ($Xhi,$Xi);
1228
1229 #######
1230 &movdqa ($T3,&QWP(0,$const));
1231 &movdqu ($T1,&QWP(0,$inp)); # Ii
1232 &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
1233 &pshufb ($T1,$T3);
1234 &pshufb ($Xn,$T3);
1235 &pxor ($Xi,$T1); # Ii+Xi
1236
1237 &clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1
1238 &movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2
1239
1240 &sub ($len,0x20);
1241 &lea ($inp,&DWP(32,$inp));
1242 &ja (&label("mod_loop"));
1243
1244&set_label("even_tail");
1245 &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
1246
1247 &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
1248 &pxor ($Xhi,$Xhn);
1249
1250 &reduction_alg5 ($Xhi,$Xi);
1251
1252 &movdqa ($T3,&QWP(0,$const));
1253 &test ($len,$len);
1254 &jnz (&label("done"));
1255
1256 &movdqu ($Hkey,&QWP(0,$Htbl)); # load H
1257&set_label("odd_tail");
1258 &movdqu ($T1,&QWP(0,$inp)); # Ii
1259 &pshufb ($T1,$T3);
1260 &pxor ($Xi,$T1); # Ii+Xi
1261
1262 &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
1263 &reduction_alg5 ($Xhi,$Xi);
1264
1265 &movdqa ($T3,&QWP(0,$const));
1266&set_label("done");
1267 &pshufb ($Xi,$T3);
1268 &movdqu (&QWP(0,$Xip),$Xi);
1269&function_end("gcm_ghash_clmul");
1270
1271}
1272
1273&set_label("bswap",64);
1274 &data_byte(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
1275 &data_byte(1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2); # 0x1c2_polynomial
1276}} # $sse2
1277
1278&set_label("rem_4bit",64);
1279 &data_word(0,0x0000<<$S,0,0x1C20<<$S,0,0x3840<<$S,0,0x2460<<$S);
1280 &data_word(0,0x7080<<$S,0,0x6CA0<<$S,0,0x48C0<<$S,0,0x54E0<<$S);
1281 &data_word(0,0xE100<<$S,0,0xFD20<<$S,0,0xD940<<$S,0,0xC560<<$S);
1282 &data_word(0,0x9180<<$S,0,0x8DA0<<$S,0,0xA9C0<<$S,0,0xB5E0<<$S);
1283&set_label("rem_8bit",64);
1284 &data_short(0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E);
1285 &data_short(0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E);
1286 &data_short(0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E);
1287 &data_short(0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E);
1288 &data_short(0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E);
1289 &data_short(0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E);
1290 &data_short(0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E);
1291 &data_short(0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E);
1292 &data_short(0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE);
1293 &data_short(0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE);
1294 &data_short(0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE);
1295 &data_short(0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE);
1296 &data_short(0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E);
1297 &data_short(0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E);
1298 &data_short(0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE);
1299 &data_short(0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE);
1300 &data_short(0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E);
1301 &data_short(0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E);
1302 &data_short(0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E);
1303 &data_short(0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E);
1304 &data_short(0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E);
1305 &data_short(0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E);
1306 &data_short(0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E);
1307 &data_short(0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E);
1308 &data_short(0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE);
1309 &data_short(0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE);
1310 &data_short(0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE);
1311 &data_short(0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE);
1312 &data_short(0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E);
1313 &data_short(0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E);
1314 &data_short(0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE);
1315 &data_short(0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE);
1316}}} # !$x86only
1317
1318&asciz("GHASH for x86, CRYPTOGAMS by <appro\@openssl.org>");
1319&asm_finish();
1320
1321# A question was raised about the choice of vanilla MMX. Or rather, why wasn't
1322# SSE2 chosen instead? In addition to the fact that MMX runs on legacy
1323# CPUs such as PIII, the "4-bit" MMX version was observed to provide better
1324# performance than the *corresponding* SSE2 one even on contemporary CPUs.
1325# SSE2 results were provided by Peter-Michael Hager. He maintains SSE2
1326# implementation featuring full range of lookup-table sizes, but with
1327# per-invocation lookup-table setup. The latter means that the table size is
1328# chosen depending on how much data is to be hashed in every given call,
1329# more data - larger table. Best reported result for Core2 is ~4 cycles
1330# per processed byte out of 64KB block. This number accounts even for
1331# 64KB table setup overhead. As discussed in gcm128.c we choose to be
1332# more conservative with respect to lookup-table sizes, but how do the
1333# results compare? Minimalistic "256B" MMX version delivers ~11 cycles
1334# on same platform. As also discussed in gcm128.c, next in line "8-bit
1335# Shoup's" or "4KB" method should deliver twice the performance of
1336# "256B" one, in other words not worse than ~6 cycles per byte. It
1337# should also be noted that in the SSE2 case the improvement can be "super-
1338# linear," i.e. more than twice, mostly because >>8 maps to single
1339# instruction on an SSE2 register. This is unlike the "4-bit" case, where >>4
1340# maps to same amount of instructions in both MMX and SSE2 cases.
1341# The bottom line is that a switch to SSE2 is considered justifiable
1342# only if we choose to implement the "8-bit" method...
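# [Editor's note: illustrative sketch, not part of the original module.]
# The table-size names above follow from the window width: an n-bit window
# keeps 2^n precomputed multiples of H at 16 bytes each, hence "256B" for
# the 4-bit method and "4KB" for the 8-bit one. A minimal Perl check:
for my $bits (4, 8) {
	my $entries = 1 << $bits;
	printf("%d-bit window: %3d entries, %4d bytes per key\n",
	       $bits, $entries, $entries*16);
}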
diff --git a/src/lib/libcrypto/modes/asm/ghash-x86_64.pl b/src/lib/libcrypto/modes/asm/ghash-x86_64.pl
deleted file mode 100644
index 38d779edbc..0000000000
--- a/src/lib/libcrypto/modes/asm/ghash-x86_64.pl
+++ /dev/null
@@ -1,806 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# March, June 2010
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that
14# it uses 256 bytes per-key table [+128 bytes shared table]. GHASH
15# function features a so-called "528B" variant utilizing an additional
16# 256+16 bytes of per-key storage [+512 bytes shared table].
17# Performance results are for this streamed GHASH subroutine and are
18# expressed in cycles per processed byte, less is better:
19#
20# gcc 3.4.x(*) assembler
21#
22# P4 28.6 14.0 +100%
23# Opteron 19.3 7.7 +150%
24# Core2 17.8 8.1(**) +120%
25#
26# (*) comparison is not completely fair, because C results are
27# for vanilla "256B" implementation, while assembler results
28# are for "528B";-)
29# (**) it's a mystery [to me] why the Core2 result is not the same as for
30# Opteron;
31
32# May 2010
33#
34# Add PCLMULQDQ version performing at 2.02 cycles per processed byte.
35# See ghash-x86.pl for background information and details about coding
36# techniques.
37#
38# Special thanks to David Woodhouse <dwmw2@infradead.org> for
39# providing access to a Westmere-based system on behalf of Intel
40# Open Source Technology Centre.
41
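# [Editor's note: illustrative sketch, not part of the original module.]
# A minimal bit-by-bit model of the GF(2^128) multiplication and GHASH
# update that the table-driven and PCLMULQDQ paths below compute, following
# the algorithm in NIST SP 800-38D. Plain Perl with Math::BigInt; the helper
# names (gf128_mul, ghash_ref) are illustrative, and no attempt is made at
# constant time or performance.

use Math::BigInt;

my $Rpoly = Math::BigInt->new(0xe1)->blsft(120);	# 0xe1 || 0^120

sub gf128_mul {				# Z = X*Y in GF(2^128), GCM bit order
	my ($x,$y) = @_;		# 128-bit Math::BigInt values
	my $z = Math::BigInt->bzero();
	my $v = $y->copy();
	for my $i (0..127) {
		# test bit i of X, counting from the most significant end
		$z->bxor($v) if $x->copy()->brsft(127-$i)->band(1)->is_one();
		my $lsb = $v->copy()->band(1)->is_one();
		$v->brsft(1);
		$v->bxor($Rpoly) if $lsb;	# fold the dropped bit back in
	}
	return $z;
}

sub ghash_ref {				# Xi = (Xi ^ block)*H over all blocks
	my ($h,@blocks) = @_;		# Math::BigInt values, 16 bytes each
	my $xi = Math::BigInt->bzero();
	for my $b (@blocks) {
		$xi->bxor($b);
		$xi = gf128_mul($xi,$h);
	}
	return $xi;
}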
42$flavour = shift;
43$output = shift;
44if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
45
46$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
47
48$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
49( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
50( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
51die "can't locate x86_64-xlate.pl";
52
53open OUT,"| \"$^X\" $xlate $flavour $output";
54*STDOUT=*OUT;
55
56# common register layout
57$nlo="%rax";
58$nhi="%rbx";
59$Zlo="%r8";
60$Zhi="%r9";
61$tmp="%r10";
62$rem_4bit = "%r11";
63
64$Xi="%rdi";
65$Htbl="%rsi";
66
67# per-function register layout
68$cnt="%rcx";
69$rem="%rdx";
70
71sub LB() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/ or
72 $r =~ s/%[er]([sd]i)/%\1l/ or
73 $r =~ s/%[er](bp)/%\1l/ or
74 $r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; }
75
76sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
77{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
78 my $arg = pop;
79 $arg = "\$$arg" if ($arg*1 eq $arg);
80 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
81}
82
83{ my $N;
84 sub loop() {
85 my $inp = shift;
86
87 $N++;
88$code.=<<___;
89 xor $nlo,$nlo
90 xor $nhi,$nhi
91 mov `&LB("$Zlo")`,`&LB("$nlo")`
92 mov `&LB("$Zlo")`,`&LB("$nhi")`
93 shl \$4,`&LB("$nlo")`
94 mov \$14,$cnt
95 mov 8($Htbl,$nlo),$Zlo
96 mov ($Htbl,$nlo),$Zhi
97 and \$0xf0,`&LB("$nhi")`
98 mov $Zlo,$rem
99 jmp .Loop$N
100
101.align 16
102.Loop$N:
103 shr \$4,$Zlo
104 and \$0xf,$rem
105 mov $Zhi,$tmp
106 mov ($inp,$cnt),`&LB("$nlo")`
107 shr \$4,$Zhi
108 xor 8($Htbl,$nhi),$Zlo
109 shl \$60,$tmp
110 xor ($Htbl,$nhi),$Zhi
111 mov `&LB("$nlo")`,`&LB("$nhi")`
112 xor ($rem_4bit,$rem,8),$Zhi
113 mov $Zlo,$rem
114 shl \$4,`&LB("$nlo")`
115 xor $tmp,$Zlo
116 dec $cnt
117 js .Lbreak$N
118
119 shr \$4,$Zlo
120 and \$0xf,$rem
121 mov $Zhi,$tmp
122 shr \$4,$Zhi
123 xor 8($Htbl,$nlo),$Zlo
124 shl \$60,$tmp
125 xor ($Htbl,$nlo),$Zhi
126 and \$0xf0,`&LB("$nhi")`
127 xor ($rem_4bit,$rem,8),$Zhi
128 mov $Zlo,$rem
129 xor $tmp,$Zlo
130 jmp .Loop$N
131
132.align 16
133.Lbreak$N:
134 shr \$4,$Zlo
135 and \$0xf,$rem
136 mov $Zhi,$tmp
137 shr \$4,$Zhi
138 xor 8($Htbl,$nlo),$Zlo
139 shl \$60,$tmp
140 xor ($Htbl,$nlo),$Zhi
141 and \$0xf0,`&LB("$nhi")`
142 xor ($rem_4bit,$rem,8),$Zhi
143 mov $Zlo,$rem
144 xor $tmp,$Zlo
145
146 shr \$4,$Zlo
147 and \$0xf,$rem
148 mov $Zhi,$tmp
149 shr \$4,$Zhi
150 xor 8($Htbl,$nhi),$Zlo
151 shl \$60,$tmp
152 xor ($Htbl,$nhi),$Zhi
153 xor $tmp,$Zlo
154 xor ($rem_4bit,$rem,8),$Zhi
155
156 bswap $Zlo
157 bswap $Zhi
158___
159}}
160
161$code=<<___;
162.text
163
164.globl gcm_gmult_4bit
165.type gcm_gmult_4bit,\@function,2
166.align 16
167gcm_gmult_4bit:
168 push %rbx
169 push %rbp # %rbp and %r12 are pushed exclusively in
170 push %r12 # order to reuse Win64 exception handler...
171.Lgmult_prologue:
172
173 movzb 15($Xi),$Zlo
174 lea .Lrem_4bit(%rip),$rem_4bit
175___
176 &loop ($Xi);
177$code.=<<___;
178 mov $Zlo,8($Xi)
179 mov $Zhi,($Xi)
180
181 mov 16(%rsp),%rbx
182 lea 24(%rsp),%rsp
183.Lgmult_epilogue:
184 ret
185.size gcm_gmult_4bit,.-gcm_gmult_4bit
186___
187
188# per-function register layout
189$inp="%rdx";
190$len="%rcx";
191$rem_8bit=$rem_4bit;
192
193$code.=<<___;
194.globl gcm_ghash_4bit
195.type gcm_ghash_4bit,\@function,4
196.align 16
197gcm_ghash_4bit:
198 push %rbx
199 push %rbp
200 push %r12
201 push %r13
202 push %r14
203 push %r15
204 sub \$280,%rsp
205.Lghash_prologue:
206 mov $inp,%r14 # reassign couple of args
207 mov $len,%r15
208___
209{ my $inp="%r14";
210 my $dat="%edx";
211 my $len="%r15";
212 my @nhi=("%ebx","%ecx");
213 my @rem=("%r12","%r13");
214 my $Hshr4="%rbp";
215
216 &sub ($Htbl,-128); # size optimization
217 &lea ($Hshr4,"16+128(%rsp)");
218 { my @lo =($nlo,$nhi);
219 my @hi =($Zlo,$Zhi);
220
221 &xor ($dat,$dat);
222 for ($i=0,$j=-2;$i<18;$i++,$j++) {
223 &mov ("$j(%rsp)",&LB($dat)) if ($i>1);
224 &or ($lo[0],$tmp) if ($i>1);
225 &mov (&LB($dat),&LB($lo[1])) if ($i>0 && $i<17);
226 &shr ($lo[1],4) if ($i>0 && $i<17);
227 &mov ($tmp,$hi[1]) if ($i>0 && $i<17);
228 &shr ($hi[1],4) if ($i>0 && $i<17);
229 &mov ("8*$j($Hshr4)",$hi[0]) if ($i>1);
230 &mov ($hi[0],"16*$i+0-128($Htbl)") if ($i<16);
231 &shl (&LB($dat),4) if ($i>0 && $i<17);
232 &mov ("8*$j-128($Hshr4)",$lo[0]) if ($i>1);
233 &mov ($lo[0],"16*$i+8-128($Htbl)") if ($i<16);
234 &shl ($tmp,60) if ($i>0 && $i<17);
235
236 push (@lo,shift(@lo));
237 push (@hi,shift(@hi));
238 }
239 }
240 &add ($Htbl,-128);
241 &mov ($Zlo,"8($Xi)");
242 &mov ($Zhi,"0($Xi)");
243 &add ($len,$inp); # pointer to the end of data
244 &lea ($rem_8bit,".Lrem_8bit(%rip)");
245 &jmp (".Louter_loop");
246
247$code.=".align 16\n.Louter_loop:\n";
248 &xor ($Zhi,"($inp)");
249 &mov ("%rdx","8($inp)");
250 &lea ($inp,"16($inp)");
251 &xor ("%rdx",$Zlo);
252 &mov ("($Xi)",$Zhi);
253 &mov ("8($Xi)","%rdx");
254 &shr ("%rdx",32);
255
256 &xor ($nlo,$nlo);
257 &rol ($dat,8);
258 &mov (&LB($nlo),&LB($dat));
259 &movz ($nhi[0],&LB($dat));
260 &shl (&LB($nlo),4);
261 &shr ($nhi[0],4);
262
263 for ($j=11,$i=0;$i<15;$i++) {
264 &rol ($dat,8);
265 &xor ($Zlo,"8($Htbl,$nlo)") if ($i>0);
266 &xor ($Zhi,"($Htbl,$nlo)") if ($i>0);
267 &mov ($Zlo,"8($Htbl,$nlo)") if ($i==0);
268 &mov ($Zhi,"($Htbl,$nlo)") if ($i==0);
269
270 &mov (&LB($nlo),&LB($dat));
271 &xor ($Zlo,$tmp) if ($i>0);
272 &movzw ($rem[1],"($rem_8bit,$rem[1],2)") if ($i>0);
273
274 &movz ($nhi[1],&LB($dat));
275 &shl (&LB($nlo),4);
276 &movzb ($rem[0],"(%rsp,$nhi[0])");
277
278 &shr ($nhi[1],4) if ($i<14);
279 &and ($nhi[1],0xf0) if ($i==14);
280 &shl ($rem[1],48) if ($i>0);
281 &xor ($rem[0],$Zlo);
282
283 &mov ($tmp,$Zhi);
284 &xor ($Zhi,$rem[1]) if ($i>0);
285 &shr ($Zlo,8);
286
287 &movz ($rem[0],&LB($rem[0]));
288 &mov ($dat,"$j($Xi)") if (--$j%4==0);
289 &shr ($Zhi,8);
290
291 &xor ($Zlo,"-128($Hshr4,$nhi[0],8)");
292 &shl ($tmp,56);
293 &xor ($Zhi,"($Hshr4,$nhi[0],8)");
294
295 unshift (@nhi,pop(@nhi)); # "rotate" registers
296 unshift (@rem,pop(@rem));
297 }
298 &movzw ($rem[1],"($rem_8bit,$rem[1],2)");
299 &xor ($Zlo,"8($Htbl,$nlo)");
300 &xor ($Zhi,"($Htbl,$nlo)");
301
302 &shl ($rem[1],48);
303 &xor ($Zlo,$tmp);
304
305 &xor ($Zhi,$rem[1]);
306 &movz ($rem[0],&LB($Zlo));
307 &shr ($Zlo,4);
308
309 &mov ($tmp,$Zhi);
310 &shl (&LB($rem[0]),4);
311 &shr ($Zhi,4);
312
313 &xor ($Zlo,"8($Htbl,$nhi[0])");
314 &movzw ($rem[0],"($rem_8bit,$rem[0],2)");
315 &shl ($tmp,60);
316
317 &xor ($Zhi,"($Htbl,$nhi[0])");
318 &xor ($Zlo,$tmp);
319 &shl ($rem[0],48);
320
321 &bswap ($Zlo);
322 &xor ($Zhi,$rem[0]);
323
324 &bswap ($Zhi);
325 &cmp ($inp,$len);
326 &jb (".Louter_loop");
327}
328$code.=<<___;
329 mov $Zlo,8($Xi)
330 mov $Zhi,($Xi)
331
332 lea 280(%rsp),%rsi
333 mov 0(%rsi),%r15
334 mov 8(%rsi),%r14
335 mov 16(%rsi),%r13
336 mov 24(%rsi),%r12
337 mov 32(%rsi),%rbp
338 mov 40(%rsi),%rbx
339 lea 48(%rsi),%rsp
340.Lghash_epilogue:
341 ret
342.size gcm_ghash_4bit,.-gcm_ghash_4bit
343___
344
345######################################################################
346# PCLMULQDQ version.
347
348@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
349 ("%rdi","%rsi","%rdx","%rcx"); # Unix order
350
351($Xi,$Xhi)=("%xmm0","%xmm1"); $Hkey="%xmm2";
352($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");
353
354sub clmul64x64_T2 { # minimal register pressure
355my ($Xhi,$Xi,$Hkey,$modulo)=@_;
356
357$code.=<<___ if (!defined($modulo));
358 movdqa $Xi,$Xhi #
359 pshufd \$0b01001110,$Xi,$T1
360 pshufd \$0b01001110,$Hkey,$T2
361 pxor $Xi,$T1 #
362 pxor $Hkey,$T2
363___
364$code.=<<___;
365 pclmulqdq \$0x00,$Hkey,$Xi #######
366 pclmulqdq \$0x11,$Hkey,$Xhi #######
367 pclmulqdq \$0x00,$T2,$T1 #######
368 pxor $Xi,$T1 #
369 pxor $Xhi,$T1 #
370
371 movdqa $T1,$T2 #
372 psrldq \$8,$T1
373 pslldq \$8,$T2 #
374 pxor $T1,$Xhi
375 pxor $T2,$Xi #
376___
377}
378
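# [Editor's note: illustrative sketch, not part of the original module.]
# What clmul64x64_T2 above computes: a 128x128-bit carry-less product built
# from three 64x64-bit carry-less multiplications (Karatsuba), the three
# pclmulqdq instructions corresponding to Xlo*Hlo, Xhi*Hhi and
# (Xlo^Xhi)*(Hlo^Hhi). Plain Perl with Math::BigInt; names are illustrative.

use Math::BigInt;

sub clmul64 {				# carry-less 64x64 -> 127-bit product
	my ($a,$b) = @_;		# Math::BigInt, at most 64 bits each
	my $r = Math::BigInt->bzero();
	for my $i (0..63) {
		$r->bxor($b->copy()->blsft($i))
			if $a->copy()->brsft($i)->band(1)->is_one();
	}
	return $r;
}

sub clmul128 {				# carry-less 128x128 via Karatsuba
	my ($x,$y) = @_;		# Math::BigInt, at most 128 bits each
	my $mask = Math::BigInt->from_hex("ffffffffffffffff");
	my ($xl,$xh) = ($x->copy()->band($mask), $x->copy()->brsft(64));
	my ($yl,$yh) = ($y->copy()->band($mask), $y->copy()->brsft(64));
	my $lo  = clmul64($xl,$yl);
	my $hi  = clmul64($xh,$yh);
	my $mid = clmul64($xl->copy()->bxor($xh), $yl->copy()->bxor($yh));
	$mid->bxor($lo)->bxor($hi);	# middle term, as in the pxor pair above
	return $lo->copy()->bxor($mid->blsft(64))->bxor($hi->copy()->blsft(128));
}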
379sub reduction_alg9 { # 17/13 times faster than Intel version
380my ($Xhi,$Xi) = @_;
381
382$code.=<<___;
383 # 1st phase
384 movdqa $Xi,$T1 #
385 psllq \$1,$Xi
386 pxor $T1,$Xi #
387 psllq \$5,$Xi #
388 pxor $T1,$Xi #
389 psllq \$57,$Xi #
390 movdqa $Xi,$T2 #
391 pslldq \$8,$Xi
392 psrldq \$8,$T2 #
393 pxor $T1,$Xi
394 pxor $T2,$Xhi #
395
396 # 2nd phase
397 movdqa $Xi,$T2
398 psrlq \$5,$Xi
399 pxor $T2,$Xi #
400 psrlq \$1,$Xi #
401 pxor $T2,$Xi #
402 pxor $Xhi,$T2
403 psrlq \$1,$Xi #
404 pxor $T2,$Xi #
405___
406}
407
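# [Editor's note: illustrative sketch, not part of the original module.]
# The underlying algebra of reduction_alg9: the 256-bit carry-less product
# Xhi:Xi has to be reduced modulo g(x) = x^128 + x^7 + x^2 + x + 1. The
# assembler does this with a fixed sequence of shifts and xors on a
# byte-swapped representation; the sketch below shows the same reduction as
# straightforward polynomial long division in the natural bit order. Plain
# Perl with Math::BigInt; the name reduce128 is illustrative only.

use Math::BigInt;

sub reduce128 {
	my ($p) = @_;			# carry-less product, < 2^256
	my $g = Math::BigInt->bone()->blsft(128)->bxor(0x87); # x^128+x^7+x^2+x+1
	for (my $i=255; $i>=128; $i--) {
		$p->bxor($g->copy()->blsft($i-128))
			if $p->copy()->brsft($i)->band(1)->is_one();
	}
	return $p;			# 128-bit remainder
}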
408{ my ($Htbl,$Xip)=@_4args;
409
410$code.=<<___;
411.globl gcm_init_clmul
412.type gcm_init_clmul,\@abi-omnipotent
413.align 16
414gcm_init_clmul:
415 movdqu ($Xip),$Hkey
416 pshufd \$0b01001110,$Hkey,$Hkey # dword swap
417
418 # <<1 twist
419 pshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword
420 movdqa $Hkey,$T1
421 psllq \$1,$Hkey
422 pxor $T3,$T3 #
423 psrlq \$63,$T1
424 pcmpgtd $T2,$T3 # broadcast carry bit
425 pslldq \$8,$T1
426 por $T1,$Hkey # H<<=1
427
428 # magic reduction
429 pand .L0x1c2_polynomial(%rip),$T3
430 pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial
431
432 # calculate H^2
433 movdqa $Hkey,$Xi
434___
435 &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
436 &reduction_alg9 ($Xhi,$Xi);
437$code.=<<___;
438 movdqu $Hkey,($Htbl) # save H
439 movdqu $Xi,16($Htbl) # save H^2
440 ret
441.size gcm_init_clmul,.-gcm_init_clmul
442___
443}
444
445{ my ($Xip,$Htbl)=@_4args;
446
447$code.=<<___;
448.globl gcm_gmult_clmul
449.type gcm_gmult_clmul,\@abi-omnipotent
450.align 16
451gcm_gmult_clmul:
452 movdqu ($Xip),$Xi
453 movdqa .Lbswap_mask(%rip),$T3
454 movdqu ($Htbl),$Hkey
455 pshufb $T3,$Xi
456___
457 &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
458 &reduction_alg9 ($Xhi,$Xi);
459$code.=<<___;
460 pshufb $T3,$Xi
461 movdqu $Xi,($Xip)
462 ret
463.size gcm_gmult_clmul,.-gcm_gmult_clmul
464___
465}
466
467{ my ($Xip,$Htbl,$inp,$len)=@_4args;
468 my $Xn="%xmm6";
469 my $Xhn="%xmm7";
470 my $Hkey2="%xmm8";
471 my $T1n="%xmm9";
472 my $T2n="%xmm10";
473
474$code.=<<___;
475.globl gcm_ghash_clmul
476.type gcm_ghash_clmul,\@abi-omnipotent
477.align 16
478gcm_ghash_clmul:
479___
480$code.=<<___ if ($win64);
481.LSEH_begin_gcm_ghash_clmul:
482 # I can't trust assembler to use specific encoding:-(
483 .byte 0x48,0x83,0xec,0x58 #sub \$0x58,%rsp
484 .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
485 .byte 0x0f,0x29,0x7c,0x24,0x10 #movdqa %xmm7,0x10(%rsp)
486 .byte 0x44,0x0f,0x29,0x44,0x24,0x20 #movaps %xmm8,0x20(%rsp)
487 .byte 0x44,0x0f,0x29,0x4c,0x24,0x30 #movaps %xmm9,0x30(%rsp)
488 .byte 0x44,0x0f,0x29,0x54,0x24,0x40 #movaps %xmm10,0x40(%rsp)
489___
490$code.=<<___;
491 movdqa .Lbswap_mask(%rip),$T3
492
493 movdqu ($Xip),$Xi
494 movdqu ($Htbl),$Hkey
495 pshufb $T3,$Xi
496
497 sub \$0x10,$len
498 jz .Lodd_tail
499
500 movdqu 16($Htbl),$Hkey2
501 #######
502 # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
503 # [(H*Ii+1) + (H*Xi+1)] mod P =
504 # [(H*Ii+1) + H^2*(Ii+Xi)] mod P
505 #
506 movdqu ($inp),$T1 # Ii
507 movdqu 16($inp),$Xn # Ii+1
508 pshufb $T3,$T1
509 pshufb $T3,$Xn
510 pxor $T1,$Xi # Ii+Xi
511___
512 &clmul64x64_T2 ($Xhn,$Xn,$Hkey); # H*Ii+1
513$code.=<<___;
514 movdqa $Xi,$Xhi #
515 pshufd \$0b01001110,$Xi,$T1
516 pshufd \$0b01001110,$Hkey2,$T2
517 pxor $Xi,$T1 #
518 pxor $Hkey2,$T2
519
520 lea 32($inp),$inp # i+=2
521 sub \$0x20,$len
522 jbe .Leven_tail
523
524.Lmod_loop:
525___
526 &clmul64x64_T2 ($Xhi,$Xi,$Hkey2,1); # H^2*(Ii+Xi)
527$code.=<<___;
528 movdqu ($inp),$T1 # Ii
529 pxor $Xn,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
530 pxor $Xhn,$Xhi
531
532 movdqu 16($inp),$Xn # Ii+1
533 pshufb $T3,$T1
534 pshufb $T3,$Xn
535
536 movdqa $Xn,$Xhn #
537 pshufd \$0b01001110,$Xn,$T1n
538 pshufd \$0b01001110,$Hkey,$T2n
539 pxor $Xn,$T1n #
540 pxor $Hkey,$T2n
541 pxor $T1,$Xhi # "Ii+Xi", consume early
542
543 movdqa $Xi,$T1 # 1st phase
544 psllq \$1,$Xi
545 pxor $T1,$Xi #
546 psllq \$5,$Xi #
547 pxor $T1,$Xi #
548 pclmulqdq \$0x00,$Hkey,$Xn #######
549 psllq \$57,$Xi #
550 movdqa $Xi,$T2 #
551 pslldq \$8,$Xi
552 psrldq \$8,$T2 #
553 pxor $T1,$Xi
554 pxor $T2,$Xhi #
555
556 pclmulqdq \$0x11,$Hkey,$Xhn #######
557 movdqa $Xi,$T2 # 2nd phase
558 psrlq \$5,$Xi
559 pxor $T2,$Xi #
560 psrlq \$1,$Xi #
561 pxor $T2,$Xi #
562 pxor $Xhi,$T2
563 psrlq \$1,$Xi #
564 pxor $T2,$Xi #
565
566 pclmulqdq \$0x00,$T2n,$T1n #######
567 movdqa $Xi,$Xhi #
568 pshufd \$0b01001110,$Xi,$T1
569 pshufd \$0b01001110,$Hkey2,$T2
570 pxor $Xi,$T1 #
571 pxor $Hkey2,$T2
572
573 pxor $Xn,$T1n #
574 pxor $Xhn,$T1n #
575 movdqa $T1n,$T2n #
576 psrldq \$8,$T1n
577 pslldq \$8,$T2n #
578 pxor $T1n,$Xhn
579 pxor $T2n,$Xn #
580
581 lea 32($inp),$inp
582 sub \$0x20,$len
583 ja .Lmod_loop
584
585.Leven_tail:
586___
587 &clmul64x64_T2 ($Xhi,$Xi,$Hkey2,1); # H^2*(Ii+Xi)
588$code.=<<___;
589 pxor $Xn,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
590 pxor $Xhn,$Xhi
591___
592 &reduction_alg9 ($Xhi,$Xi);
593$code.=<<___;
594 test $len,$len
595 jnz .Ldone
596
597.Lodd_tail:
598 movdqu ($inp),$T1 # Ii
599 pshufb $T3,$T1
600 pxor $T1,$Xi # Ii+Xi
601___
602 &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
603 &reduction_alg9 ($Xhi,$Xi);
604$code.=<<___;
605.Ldone:
606 pshufb $T3,$Xi
607 movdqu $Xi,($Xip)
608___
609$code.=<<___ if ($win64);
610 movaps (%rsp),%xmm6
611 movaps 0x10(%rsp),%xmm7
612 movaps 0x20(%rsp),%xmm8
613 movaps 0x30(%rsp),%xmm9
614 movaps 0x40(%rsp),%xmm10
615 add \$0x58,%rsp
616___
617$code.=<<___;
618 ret
619.LSEH_end_gcm_ghash_clmul:
620.size gcm_ghash_clmul,.-gcm_ghash_clmul
621___
622}
623
624$code.=<<___;
625.align 64
626.Lbswap_mask:
627 .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
628.L0x1c2_polynomial:
629 .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
630.align 64
631.type .Lrem_4bit,\@object
632.Lrem_4bit:
633 .long 0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`
634 .long 0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`
635 .long 0,`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`
636 .long 0,`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`
637.type .Lrem_8bit,\@object
638.Lrem_8bit:
639 .value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
640 .value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
641 .value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
642 .value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
643 .value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
644 .value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
645 .value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
646 .value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
647 .value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
648 .value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
649 .value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
650 .value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
651 .value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
652 .value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
653 .value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
654 .value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
655 .value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
656 .value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
657 .value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
658 .value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
659 .value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
660 .value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
661 .value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
662 .value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
663 .value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
664 .value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
665 .value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
666 .value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
667 .value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
668 .value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
669 .value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
670 .value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE
671
672.asciz "GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
673.align 64
674___
675
676# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
677# CONTEXT *context,DISPATCHER_CONTEXT *disp)
678if ($win64) {
679$rec="%rcx";
680$frame="%rdx";
681$context="%r8";
682$disp="%r9";
683
684$code.=<<___;
685.extern __imp_RtlVirtualUnwind
686.type se_handler,\@abi-omnipotent
687.align 16
688se_handler:
689 push %rsi
690 push %rdi
691 push %rbx
692 push %rbp
693 push %r12
694 push %r13
695 push %r14
696 push %r15
697 pushfq
698 sub \$64,%rsp
699
700 mov 120($context),%rax # pull context->Rax
701 mov 248($context),%rbx # pull context->Rip
702
703 mov 8($disp),%rsi # disp->ImageBase
704 mov 56($disp),%r11 # disp->HandlerData
705
706 mov 0(%r11),%r10d # HandlerData[0]
707 lea (%rsi,%r10),%r10 # prologue label
708 cmp %r10,%rbx # context->Rip<prologue label
709 jb .Lin_prologue
710
711 mov 152($context),%rax # pull context->Rsp
712
713 mov 4(%r11),%r10d # HandlerData[1]
714 lea (%rsi,%r10),%r10 # epilogue label
715 cmp %r10,%rbx # context->Rip>=epilogue label
716 jae .Lin_prologue
717
718 lea 24(%rax),%rax # adjust "rsp"
719
720 mov -8(%rax),%rbx
721 mov -16(%rax),%rbp
722 mov -24(%rax),%r12
723 mov %rbx,144($context) # restore context->Rbx
724 mov %rbp,160($context) # restore context->Rbp
725 mov %r12,216($context) # restore context->R12
726
727.Lin_prologue:
728 mov 8(%rax),%rdi
729 mov 16(%rax),%rsi
730 mov %rax,152($context) # restore context->Rsp
731 mov %rsi,168($context) # restore context->Rsi
732 mov %rdi,176($context) # restore context->Rdi
733
734 mov 40($disp),%rdi # disp->ContextRecord
735 mov $context,%rsi # context
736 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
737 .long 0xa548f3fc # cld; rep movsq
738
739 mov $disp,%rsi
740 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
741 mov 8(%rsi),%rdx # arg2, disp->ImageBase
742 mov 0(%rsi),%r8 # arg3, disp->ControlPc
743 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
744 mov 40(%rsi),%r10 # disp->ContextRecord
745 lea 56(%rsi),%r11 # &disp->HandlerData
746 lea 24(%rsi),%r12 # &disp->EstablisherFrame
747 mov %r10,32(%rsp) # arg5
748 mov %r11,40(%rsp) # arg6
749 mov %r12,48(%rsp) # arg7
750 mov %rcx,56(%rsp) # arg8, (NULL)
751 call *__imp_RtlVirtualUnwind(%rip)
752
753 mov \$1,%eax # ExceptionContinueSearch
754 add \$64,%rsp
755 popfq
756 pop %r15
757 pop %r14
758 pop %r13
759 pop %r12
760 pop %rbp
761 pop %rbx
762 pop %rdi
763 pop %rsi
764 ret
765.size se_handler,.-se_handler
766
767.section .pdata
768.align 4
769 .rva .LSEH_begin_gcm_gmult_4bit
770 .rva .LSEH_end_gcm_gmult_4bit
771 .rva .LSEH_info_gcm_gmult_4bit
772
773 .rva .LSEH_begin_gcm_ghash_4bit
774 .rva .LSEH_end_gcm_ghash_4bit
775 .rva .LSEH_info_gcm_ghash_4bit
776
777 .rva .LSEH_begin_gcm_ghash_clmul
778 .rva .LSEH_end_gcm_ghash_clmul
779 .rva .LSEH_info_gcm_ghash_clmul
780
781.section .xdata
782.align 8
783.LSEH_info_gcm_gmult_4bit:
784 .byte 9,0,0,0
785 .rva se_handler
786 .rva .Lgmult_prologue,.Lgmult_epilogue # HandlerData
787.LSEH_info_gcm_ghash_4bit:
788 .byte 9,0,0,0
789 .rva se_handler
790 .rva .Lghash_prologue,.Lghash_epilogue # HandlerData
791.LSEH_info_gcm_ghash_clmul:
792 .byte 0x01,0x1f,0x0b,0x00
793 .byte 0x1f,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10
794 .byte 0x19,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9
795 .byte 0x13,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8
796 .byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
797 .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6
798 .byte 0x04,0xa2,0x00,0x00 #sub rsp,0x58
799___
800}
801
802$code =~ s/\`([^\`]*)\`/eval($1)/gem;
803
804print $code;
805
806close STDOUT;