Diffstat (limited to '')
-rw-r--r--  src/lib/libcrypto/modes/asm/ghash-alpha.pl  444
1 file changed, 0 insertions, 444 deletions
diff --git a/src/lib/libcrypto/modes/asm/ghash-alpha.pl b/src/lib/libcrypto/modes/asm/ghash-alpha.pl
deleted file mode 100644
index 9d847006c4..0000000000
--- a/src/lib/libcrypto/modes/asm/ghash-alpha.pl
+++ /dev/null
@@ -1,444 +0,0 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March 2010
#
# The module implements the "4-bit" GCM GHASH function and the
# underlying single multiplication operation in GF(2^128). "4-bit"
# means that it uses a 256-byte per-key table [+128 bytes shared
# table]. Even though the loops are aggressively modulo-scheduled with
# respect to references to Htbl and Z.hi updates, for 8 cycles per
# byte, measured performance is ~12 cycles per processed byte on a
# 21264 CPU. This appears to be a dynamic scheduling "glitch", because
# uprofile(1) indicates a uniform sample distribution, as if all
# instruction bundles execute in 1.5 cycles, meaning that it could
# have been even faster. Still, 12 cycles is ~60% better than
# gcc-generated code and ~80% better than code generated by the vendor
# compiler.

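# ---------------------------------------------------------------------
# Reference model (illustrative only, not part of the generated code):
# a pure-Perl rendition of the "4-bit" algorithm described above,
# mirroring the portable C gcm_gmult_4bit(). It assumes a 64-bit perl;
# the sub and variable names below are ad hoc.
# ---------------------------------------------------------------------
my $MASK64 = 0xFFFFFFFFFFFFFFFF;

sub ghash_init_ref {		# build the 256-byte table of nibble*H
	my ($Vhi,$Vlo) = @_;	# H as two big-endian 64-bit halves
	my @T; $T[0] = [0,0]; $T[8] = [$Vhi,$Vlo];
	for (my $i=4; $i>0; $i>>=1) {	# T[4],T[2],T[1]: shift right,
		my $carry = $Vlo & 1;	# folding bit 0 back with 0xE1
		$Vlo = (($Vhi<<63) | ($Vlo>>1)) & $MASK64;
		$Vhi = ($Vhi>>1) ^ ($carry ? 0xE1<<56 : 0);
		$T[$i] = [$Vhi,$Vlo];
	}
	for (my $i=2; $i<16; $i<<=1) {	# remaining entries by linearity
		$T[$i+$_] = [ $T[$i][0]^$T[$_][0], $T[$i][1]^$T[$_][1] ]
		    for (1..$i-1);
	}
	return \@T;
}

sub ghash_gmult_ref {		# Xi = Xi*H, Xi as a 16-byte string
	my ($Xi,$T) = @_;
	my @rem_4bit = map { ($_<<48) & $MASK64 } (
	    0x0000,0x1C20,0x3840,0x2460,0x7080,0x6CA0,0x48C0,0x54E0,
	    0xE100,0xFD20,0xD940,0xC560,0x9180,0x8DA0,0xA9C0,0xB5E0);
	my @x = unpack("C16",$Xi);
	my $nlo = $x[15];
	my $nhi = $nlo>>4;  $nlo &= 0xf;
	my ($Zhi,$Zlo) = @{$T->[$nlo]};
	my $cnt = 15;
	while (1) {
		my $rem = $Zlo & 0xf;	# nibble shifted off the low end
		$Zlo = (($Zhi<<60) & $MASK64) | ($Zlo>>4);
		$Zhi = ($Zhi>>4) ^ $rem_4bit[$rem];	# fold it back in
		$Zhi ^= $T->[$nhi][0];  $Zlo ^= $T->[$nhi][1];
		last if --$cnt < 0;
		$nlo = $x[$cnt];  $nhi = $nlo>>4;  $nlo &= 0xf;
		$rem = $Zlo & 0xf;
		$Zlo = (($Zhi<<60) & $MASK64) | ($Zlo>>4);
		$Zhi = ($Zhi>>4) ^ $rem_4bit[$rem];
		$Zhi ^= $T->[$nlo][0];  $Zlo ^= $T->[$nlo][1];
	}
	return pack("Q>2",$Zhi,$Zlo);	# back to big-endian bytes
}
# usage:  my $T = ghash_init_ref(unpack("Q>2",$H));
#         $Xi = ghash_gmult_ref($Xi,$T);
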
$cnt="v0";	# $0
$t0="t0";
$t1="t1";
$t2="t2";
$Thi0="t3";	# $4
$Tlo0="t4";
$Thi1="t5";
$Tlo1="t6";
$rem="t7";	# $8
#################
$Xi="a0";	# $16, input argument block
$Htbl="a1";
$inp="a2";
$len="a3";
$nlo="a4";	# $20
$nhi="a5";
$Zhi="t8";
$Zlo="t9";
$Xhi="t10";	# $24
$Xlo="t11";
$remp="t12";
$rem_4bit="AT";	# $28

{ my $N;
  sub loop() {

  $N++;
$code.=<<___;
.align	4
	extbl	$Xlo,7,$nlo
	and	$nlo,0xf0,$nhi
	sll	$nlo,4,$nlo
	and	$nlo,0xf0,$nlo

	addq	$nlo,$Htbl,$nlo
	ldq	$Zlo,8($nlo)
	addq	$nhi,$Htbl,$nhi
	ldq	$Zhi,0($nlo)

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	lda	$cnt,6(zero)
	extbl	$Xlo,6,$nlo

	ldq	$Tlo1,8($nhi)
	s8addq	$remp,$rem_4bit,$remp
	ldq	$Thi1,0($nhi)
	srl	$Zlo,4,$Zlo

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	and	$nlo,0xf0,$nhi

	xor	$Tlo1,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	xor	$Thi1,$Zhi,$Zhi
	and	$nlo,0xf0,$nlo

	addq	$nlo,$Htbl,$nlo
	ldq	$Tlo0,8($nlo)
	addq	$nhi,$Htbl,$nhi
	ldq	$Thi0,0($nlo)

.Looplo$N:
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	subq	$cnt,1,$cnt
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xlo,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	bne	$cnt,.Looplo$N


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	lda	$cnt,7(zero)
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xhi,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	unop


.Loophi$N:
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	subq	$cnt,1,$cnt
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xhi,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	bne	$cnt,.Loophi$N


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo0,$Zlo,$Zlo
	xor	$Thi0,$Zhi,$Zhi

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	xor	$t0,$Zlo,$Zlo
	xor	$rem,$Zhi,$Zhi
___
}}

$code=<<___;
#include <machine/asm.h>

.text

.set	noat
.set	noreorder
.globl	gcm_gmult_4bit
.align	4
.ent	gcm_gmult_4bit
gcm_gmult_4bit:
	.frame	sp,0,ra
	.prologue 0

	ldq	$Xlo,8($Xi)
	ldq	$Xhi,0($Xi)

	lda	$rem_4bit,rem_4bit
___

	&loop();

$code.=<<___;
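	# Result is stored back in big-endian order. Alpha has no byte-swap
	# instruction, so each 64-bit half is reversed with shift+zapnot
	# byte-mask pairs, then its two 32-bit halves are exchanged.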
	srl	$Zlo,24,$t0	# byte swap
	srl	$Zlo,8,$t1

	sll	$Zlo,8,$t2
	sll	$Zlo,24,$Zlo
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1

	zapnot	$Zlo,0x88,$Zlo
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zlo,$t0,$Zlo
	srl	$Zhi,24,$t0
	srl	$Zhi,8,$t1

	or	$Zlo,$t2,$Zlo
	sll	$Zhi,8,$t2
	sll	$Zhi,24,$Zhi

	srl	$Zlo,32,$Xlo
	sll	$Zlo,32,$Zlo

	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi
	stq	$Xlo,8($Xi)
	stq	$Xhi,0($Xi)

	ret	(ra)
.end	gcm_gmult_4bit
___

$inhi="s0";
$inlo="s1";

$code.=<<___;
.globl	gcm_ghash_4bit
.align	4
.ent	gcm_ghash_4bit
gcm_ghash_4bit:
	lda	sp,-32(sp)
	stq	ra,0(sp)
	stq	s0,8(sp)
	stq	s1,16(sp)
	.mask	0x04000600,-32
	.frame	sp,32,ra
	.prologue 0

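	# The ldq_u/extql/extqh pairs below assemble each input quadword
	# from a possibly unaligned input pointer, the standard Alpha
	# unaligned-load idiom.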
	ldq_u	$inhi,0($inp)
	ldq_u	$Thi0,7($inp)
	ldq_u	$inlo,8($inp)
	ldq_u	$Tlo0,15($inp)
	ldq	$Xhi,0($Xi)
	ldq	$Xlo,8($Xi)

	lda	$rem_4bit,rem_4bit

.Louter:
	extql	$inhi,$inp,$inhi
	extqh	$Thi0,$inp,$Thi0
	or	$inhi,$Thi0,$inhi
	lda	$inp,16($inp)

	extql	$inlo,$inp,$inlo
	extqh	$Tlo0,$inp,$Tlo0
	or	$inlo,$Tlo0,$inlo
	subq	$len,16,$len

	xor	$Xlo,$inlo,$Xlo
	xor	$Xhi,$inhi,$Xhi
___

	&loop();

$code.=<<___;
	srl	$Zlo,24,$t0	# byte swap
	srl	$Zlo,8,$t1

	sll	$Zlo,8,$t2
	sll	$Zlo,24,$Zlo
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1

	zapnot	$Zlo,0x88,$Zlo
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zlo,$t0,$Zlo
	srl	$Zhi,24,$t0
	srl	$Zhi,8,$t1

	or	$Zlo,$t2,$Zlo
	sll	$Zhi,8,$t2
	sll	$Zhi,24,$Zhi

	srl	$Zlo,32,$Xlo
	sll	$Zlo,32,$Zlo
	beq	$len,.Ldone

	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo
	ldq_u	$inhi,0($inp)

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2
	ldq_u	$Thi0,7($inp)

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi
	ldq_u	$inlo,8($inp)
	ldq_u	$Tlo0,15($inp)

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi
	br	zero,.Louter

.Ldone:
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi

	stq	$Xlo,8($Xi)
	stq	$Xhi,0($Xi)

	.set	noreorder
	/*ldq	ra,0(sp)*/
	ldq	s0,8(sp)
	ldq	s1,16(sp)
	lda	sp,32(sp)
	ret	(ra)
.end	gcm_ghash_4bit

	.section	.rodata
	.align	4
rem_4bit:
	.long	0,0x0000<<16, 0,0x1C20<<16, 0,0x3840<<16, 0,0x2460<<16
	.long	0,0x7080<<16, 0,0x6CA0<<16, 0,0x48C0<<16, 0,0x54E0<<16
	.long	0,0xE100<<16, 0,0xFD20<<16, 0,0xD940<<16, 0,0xC560<<16
	.long	0,0x9180<<16, 0,0x8DA0<<16, 0,0xA9C0<<16, 0,0xB5E0<<16
	.previous

___
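
# Sanity check for the rem_4bit table above (illustrative, not used by
# the build): each entry is the 4-bit reduction constant, i.e. the XOR
# of 0xE100>>(3-j) over the set bits j of its index, 0xE1 being the
# bit-reflected GHASH reduction polynomial. Set GHASH_SELFTEST=1 in the
# environment to dump the table.
if ($ENV{GHASH_SELFTEST}) {
	for my $i (0..15) {
		my $r = 0;
		for my $j (0..3) {
			$r ^= 0xE100 >> (3-$j) if $i & (1<<$j);
		}
		printf("rem_4bit[%2d] = 0x%04X\n", $i, $r);
	}
}
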
$output=shift and open STDOUT,">$output";
print $code;
close STDOUT;