diff options
Diffstat (limited to 'src/lib/libcrypto/modes/asm/ghash-sparcv9.pl')
-rw-r--r-- | src/lib/libcrypto/modes/asm/ghash-sparcv9.pl | 330 |
1 files changed, 0 insertions, 330 deletions
diff --git a/src/lib/libcrypto/modes/asm/ghash-sparcv9.pl b/src/lib/libcrypto/modes/asm/ghash-sparcv9.pl deleted file mode 100644 index 70e7b044a3..0000000000 --- a/src/lib/libcrypto/modes/asm/ghash-sparcv9.pl +++ /dev/null | |||
@@ -1,330 +0,0 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | |||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | |||
10 | # March 2010 | ||
11 | # | ||
12 | # The module implements "4-bit" GCM GHASH function and underlying | ||
13 | # single multiplication operation in GF(2^128). "4-bit" means that it | ||
14 | # uses 256 bytes per-key table [+128 bytes shared table]. Performance | ||
15 | # results are for streamed GHASH subroutine on UltraSPARC pre-Tx CPU | ||
16 | # and are expressed in cycles per processed byte, less is better: | ||
17 | # | ||
18 | #                gcc 3.3.x    cc 5.2    this assembler | ||
19 | # | ||
20 | # 32-bit build   81.4         43.3      12.6 (+546%/+244%) | ||
21 | # 64-bit build   20.2         21.2      12.6 (+60%/+68%) | ||
22 | # | ||
23 | # Here is data collected on UltraSPARC T1 system running Linux: | ||
24 | # | ||
25 | #                gcc 4.4.1    this assembler | ||
26 | # | ||
27 | # 32-bit build   566          50 (+1000%) | ||
28 | # 64-bit build   56           50 (+12%) | ||
29 | # | ||
30 | # I don't quite understand why the difference between 32-bit and 64-bit | ||
31 | # compiler-generated code is so big. Compilers *were* instructed to | ||
32 | # generate code for UltraSPARC and should have used 64-bit registers | ||
33 | # for Z vector (see C code) even in 32-bit build... Oh well, it only | ||
34 | # means more impressive improvement coefficients for this assembler | ||
35 | # module;-) Loops are aggressively modulo-scheduled with respect to | ||
36 | # references to input data and Z.hi updates to achieve 12-cycle | ||
37 | # timing. To anchor to something else, sha1-sparcv9.pl spends 11.6 | ||
38 | # cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1. | ||
39 | |||
# Pick the target ABI from the compiler flags passed on the command line:
# -m64 or -xarch=v9 selects the 64-bit build, anything else leaves 32-bit.
40 | $bits=32; | ||
41 | for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } | ||
# $frame is the size handed to `save` when each generated routine sets up
# its stack frame.  $bias is assigned here but is not referenced by any code
# visible in this file.
42 | if ($bits==64) { $bias=2047; $frame=192; } | ||
43 | else { $bias=0; $frame=112; } | ||
44 | |||
# The first command-line argument names the file that receives the generated
# assembly.  The three-argument form of open keeps the file name from ever
# being parsed as part of the mode, and the script now dies if the file
# cannot be opened instead of carrying on after a failed open.
45 | $output=shift; | ||
46 | open STDOUT,'>',$output or die "can't open $output: $!"; | ||
47 | |||
# Symbolic names for the SPARC registers used by the assembly templates
# below; they are interpolated into the heredocs as plain text.
48 | $Zhi="%o0"; # 64-bit values | ||
49 | $Zlo="%o1"; | ||
50 | $Thi="%o2"; | ||
51 | $Tlo="%o3"; | ||
52 | $rem="%o4"; | ||
53 | $tmp="%o5"; | ||
54 | |||
55 | $nhi="%l0"; # small values and pointers | ||
56 | $nlo="%l1"; | ||
57 | $xi0="%l2"; | ||
58 | $xi1="%l3"; | ||
59 | $rem_4bit="%l4"; | ||
60 | $remi="%l5"; | ||
61 | $Htblo="%l6"; | ||
62 | $cnt="%l7"; | ||
63 | |||
# %i0..%i3 are the incoming argument registers as seen after `save`.
64 | $Xi="%i0"; # input argument block | ||
65 | $Htbl="%i1"; | ||
66 | $inp="%i2"; | ||
67 | $len="%i3"; | ||
68 | |||
# Assembly template, part 1: the shared rem_4bit table and gcm_ghash_4bit.
# Everything up to the closing ___ is string data appended to $code; the
# $name tokens are replaced by the register aliases assigned above.
#
# rem_4bit: 16 entries of 8 bytes each (128 bytes).  Every entry is emitted
# as two .long words, a 16-bit constant shifted left by 16 followed by zero.
# The back-quoted shift expressions are evaluated by the s///e pass at the
# end of the script.
#
# gcm_ghash_4bit: register use follows the aliases above (Xi in %i0, Htbl in
# %i1, inp in %i2, len in %i3).
#   * `add $len,$inp,$len` turns len into an end pointer.  Each outer pass
#     advances inp by 16 and the routine finishes when inp equals that end
#     pointer.  The exit test is an equality compare, so presumably callers
#     pass a non-zero multiple of 16 -- confirm against callers.
#   * `1: call .+8` with `add %o7,rem_4bit-1b,...` in its delay slot obtains
#     the run-time address of rem_4bit relative to the call instruction.
#   * Each input byte is XORed with the matching Xi byte and split into two
#     nibbles.  A nibble scaled by 16 indexes the per-key table: the upper
#     8 bytes of an entry are read via Htbl, the lower 8 via Htblo (Htbl+8).
#   * Between table look-ups the 128-bit accumulator Zhi:Zlo is shifted
#     right by 4 bits.  The 4 bits shifted out of Zlo, scaled by 8, select
#     the rem_4bit entry that is XORed into Zhi.
#   * Bytes 15 and 14 are started ahead of the loop.  .Lghash_inner then
#     handles byte offsets 13 down to 0, using cnt as the offset; it is left
#     after the pass that used offset 0.
#   * The result is stored with Zhi at [Xi] and Zlo at [Xi+8].  When more
#     input remains, the two lowest bytes of Zlo are kept in xi0/xi1 and the
#     next block's byte 15 is already loaded before branching to .Louter.
#   * Instructions placed directly after ba/blu/be/call/ret sit in the
#     branch delay slot; do not reorder them.
69 | $code.=<<___; | ||
70 | .section ".text",#alloc,#execinstr | ||
71 | |||
72 | .align 64 | ||
73 | rem_4bit: | ||
74 | .long `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0 | ||
75 | .long `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0 | ||
76 | .long `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0 | ||
77 | .long `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0 | ||
78 | .type rem_4bit,#object | ||
79 | .size rem_4bit,(.-rem_4bit) | ||
80 | |||
81 | .globl gcm_ghash_4bit | ||
82 | .align 32 | ||
83 | gcm_ghash_4bit: | ||
84 | save %sp,-$frame,%sp | ||
85 | ldub [$inp+15],$nlo | ||
86 | ldub [$Xi+15],$xi0 | ||
87 | ldub [$Xi+14],$xi1 | ||
88 | add $len,$inp,$len | ||
89 | add $Htbl,8,$Htblo | ||
90 | |||
91 | 1: call .+8 | ||
92 | add %o7,rem_4bit-1b,$rem_4bit | ||
93 | |||
94 | .Louter: | ||
95 | xor $xi0,$nlo,$nlo | ||
96 | and $nlo,0xf0,$nhi | ||
97 | and $nlo,0x0f,$nlo | ||
98 | sll $nlo,4,$nlo | ||
99 | ldx [$Htblo+$nlo],$Zlo | ||
100 | ldx [$Htbl+$nlo],$Zhi | ||
101 | |||
102 | ldub [$inp+14],$nlo | ||
103 | |||
104 | ldx [$Htblo+$nhi],$Tlo | ||
105 | and $Zlo,0xf,$remi | ||
106 | ldx [$Htbl+$nhi],$Thi | ||
107 | sll $remi,3,$remi | ||
108 | ldx [$rem_4bit+$remi],$rem | ||
109 | srlx $Zlo,4,$Zlo | ||
110 | mov 13,$cnt | ||
111 | sllx $Zhi,60,$tmp | ||
112 | xor $Tlo,$Zlo,$Zlo | ||
113 | srlx $Zhi,4,$Zhi | ||
114 | xor $Zlo,$tmp,$Zlo | ||
115 | |||
116 | xor $xi1,$nlo,$nlo | ||
117 | and $Zlo,0xf,$remi | ||
118 | and $nlo,0xf0,$nhi | ||
119 | and $nlo,0x0f,$nlo | ||
120 | ba .Lghash_inner | ||
121 | sll $nlo,4,$nlo | ||
122 | .align 32 | ||
123 | .Lghash_inner: | ||
124 | ldx [$Htblo+$nlo],$Tlo | ||
125 | sll $remi,3,$remi | ||
126 | xor $Thi,$Zhi,$Zhi | ||
127 | ldx [$Htbl+$nlo],$Thi | ||
128 | srlx $Zlo,4,$Zlo | ||
129 | xor $rem,$Zhi,$Zhi | ||
130 | ldx [$rem_4bit+$remi],$rem | ||
131 | sllx $Zhi,60,$tmp | ||
132 | xor $Tlo,$Zlo,$Zlo | ||
133 | ldub [$inp+$cnt],$nlo | ||
134 | srlx $Zhi,4,$Zhi | ||
135 | xor $Zlo,$tmp,$Zlo | ||
136 | ldub [$Xi+$cnt],$xi1 | ||
137 | xor $Thi,$Zhi,$Zhi | ||
138 | and $Zlo,0xf,$remi | ||
139 | |||
140 | ldx [$Htblo+$nhi],$Tlo | ||
141 | sll $remi,3,$remi | ||
142 | xor $rem,$Zhi,$Zhi | ||
143 | ldx [$Htbl+$nhi],$Thi | ||
144 | srlx $Zlo,4,$Zlo | ||
145 | ldx [$rem_4bit+$remi],$rem | ||
146 | sllx $Zhi,60,$tmp | ||
147 | xor $xi1,$nlo,$nlo | ||
148 | srlx $Zhi,4,$Zhi | ||
149 | and $nlo,0xf0,$nhi | ||
150 | addcc $cnt,-1,$cnt | ||
151 | xor $Zlo,$tmp,$Zlo | ||
152 | and $nlo,0x0f,$nlo | ||
153 | xor $Tlo,$Zlo,$Zlo | ||
154 | sll $nlo,4,$nlo | ||
155 | blu .Lghash_inner | ||
156 | and $Zlo,0xf,$remi | ||
157 | |||
158 | ldx [$Htblo+$nlo],$Tlo | ||
159 | sll $remi,3,$remi | ||
160 | xor $Thi,$Zhi,$Zhi | ||
161 | ldx [$Htbl+$nlo],$Thi | ||
162 | srlx $Zlo,4,$Zlo | ||
163 | xor $rem,$Zhi,$Zhi | ||
164 | ldx [$rem_4bit+$remi],$rem | ||
165 | sllx $Zhi,60,$tmp | ||
166 | xor $Tlo,$Zlo,$Zlo | ||
167 | srlx $Zhi,4,$Zhi | ||
168 | xor $Zlo,$tmp,$Zlo | ||
169 | xor $Thi,$Zhi,$Zhi | ||
170 | |||
171 | add $inp,16,$inp | ||
172 | cmp $inp,$len | ||
173 | be,pn `$bits==64?"%xcc":"%icc"`,.Ldone | ||
174 | and $Zlo,0xf,$remi | ||
175 | |||
176 | ldx [$Htblo+$nhi],$Tlo | ||
177 | sll $remi,3,$remi | ||
178 | xor $rem,$Zhi,$Zhi | ||
179 | ldx [$Htbl+$nhi],$Thi | ||
180 | srlx $Zlo,4,$Zlo | ||
181 | ldx [$rem_4bit+$remi],$rem | ||
182 | sllx $Zhi,60,$tmp | ||
183 | xor $Tlo,$Zlo,$Zlo | ||
184 | ldub [$inp+15],$nlo | ||
185 | srlx $Zhi,4,$Zhi | ||
186 | xor $Zlo,$tmp,$Zlo | ||
187 | xor $Thi,$Zhi,$Zhi | ||
188 | stx $Zlo,[$Xi+8] | ||
189 | xor $rem,$Zhi,$Zhi | ||
190 | stx $Zhi,[$Xi] | ||
191 | srl $Zlo,8,$xi1 | ||
192 | and $Zlo,0xff,$xi0 | ||
193 | ba .Louter | ||
194 | and $xi1,0xff,$xi1 | ||
195 | .align 32 | ||
196 | .Ldone: | ||
197 | ldx [$Htblo+$nhi],$Tlo | ||
198 | sll $remi,3,$remi | ||
199 | xor $rem,$Zhi,$Zhi | ||
200 | ldx [$Htbl+$nhi],$Thi | ||
201 | srlx $Zlo,4,$Zlo | ||
202 | ldx [$rem_4bit+$remi],$rem | ||
203 | sllx $Zhi,60,$tmp | ||
204 | xor $Tlo,$Zlo,$Zlo | ||
205 | srlx $Zhi,4,$Zhi | ||
206 | xor $Zlo,$tmp,$Zlo | ||
207 | xor $Thi,$Zhi,$Zhi | ||
208 | stx $Zlo,[$Xi+8] | ||
209 | xor $rem,$Zhi,$Zhi | ||
210 | stx $Zhi,[$Xi] | ||
211 | |||
212 | ret | ||
213 | restore | ||
214 | .type gcm_ghash_4bit,#function | ||
215 | .size gcm_ghash_4bit,(.-gcm_ghash_4bit) | ||
216 | ___ | ||
217 | |||
# Assembly template, part 2: gcm_gmult_4bit.
#
# The template below only refers to Xi and Htbl, so the inp/len aliases are
# cleared first; any stray use of them would then interpolate as an empty
# string instead of a register name.
#
# gcm_gmult_4bit performs the same nibble-by-nibble table walk as
# gcm_ghash_4bit, but reads its bytes from Xi alone (no input block is XORed
# in) and runs exactly once:
#   * Htblo is Htbl+8, and `1: call .+8` with the following `add` obtains
#     the run-time address of the rem_4bit table emitted earlier.
#   * Bytes 15 and 14 of Xi are started ahead of the loop.  .Lgmult_inner
#     handles byte offsets 13 down to 0 with cnt as the offset.
#   * After every look-up Zhi:Zlo is shifted right by 4 bits and the rem_4bit
#     entry selected by the 4 bits shifted out is XORed into Zhi.
#   * The result overwrites Xi: Zhi is stored at [Xi], Zlo at [Xi+8].
#   * Instructions placed directly after ba/blu/call/ret sit in the branch
#     delay slot; do not reorder them.
# The trailing .asciz line embeds an identification string in the output.
218 | undef $inp; | ||
219 | undef $len; | ||
220 | |||
221 | $code.=<<___; | ||
222 | .globl gcm_gmult_4bit | ||
223 | .align 32 | ||
224 | gcm_gmult_4bit: | ||
225 | save %sp,-$frame,%sp | ||
226 | ldub [$Xi+15],$nlo | ||
227 | add $Htbl,8,$Htblo | ||
228 | |||
229 | 1: call .+8 | ||
230 | add %o7,rem_4bit-1b,$rem_4bit | ||
231 | |||
232 | and $nlo,0xf0,$nhi | ||
233 | and $nlo,0x0f,$nlo | ||
234 | sll $nlo,4,$nlo | ||
235 | ldx [$Htblo+$nlo],$Zlo | ||
236 | ldx [$Htbl+$nlo],$Zhi | ||
237 | |||
238 | ldub [$Xi+14],$nlo | ||
239 | |||
240 | ldx [$Htblo+$nhi],$Tlo | ||
241 | and $Zlo,0xf,$remi | ||
242 | ldx [$Htbl+$nhi],$Thi | ||
243 | sll $remi,3,$remi | ||
244 | ldx [$rem_4bit+$remi],$rem | ||
245 | srlx $Zlo,4,$Zlo | ||
246 | mov 13,$cnt | ||
247 | sllx $Zhi,60,$tmp | ||
248 | xor $Tlo,$Zlo,$Zlo | ||
249 | srlx $Zhi,4,$Zhi | ||
250 | xor $Zlo,$tmp,$Zlo | ||
251 | |||
252 | and $Zlo,0xf,$remi | ||
253 | and $nlo,0xf0,$nhi | ||
254 | and $nlo,0x0f,$nlo | ||
255 | ba .Lgmult_inner | ||
256 | sll $nlo,4,$nlo | ||
257 | .align 32 | ||
258 | .Lgmult_inner: | ||
259 | ldx [$Htblo+$nlo],$Tlo | ||
260 | sll $remi,3,$remi | ||
261 | xor $Thi,$Zhi,$Zhi | ||
262 | ldx [$Htbl+$nlo],$Thi | ||
263 | srlx $Zlo,4,$Zlo | ||
264 | xor $rem,$Zhi,$Zhi | ||
265 | ldx [$rem_4bit+$remi],$rem | ||
266 | sllx $Zhi,60,$tmp | ||
267 | xor $Tlo,$Zlo,$Zlo | ||
268 | ldub [$Xi+$cnt],$nlo | ||
269 | srlx $Zhi,4,$Zhi | ||
270 | xor $Zlo,$tmp,$Zlo | ||
271 | xor $Thi,$Zhi,$Zhi | ||
272 | and $Zlo,0xf,$remi | ||
273 | |||
274 | ldx [$Htblo+$nhi],$Tlo | ||
275 | sll $remi,3,$remi | ||
276 | xor $rem,$Zhi,$Zhi | ||
277 | ldx [$Htbl+$nhi],$Thi | ||
278 | srlx $Zlo,4,$Zlo | ||
279 | ldx [$rem_4bit+$remi],$rem | ||
280 | sllx $Zhi,60,$tmp | ||
281 | srlx $Zhi,4,$Zhi | ||
282 | and $nlo,0xf0,$nhi | ||
283 | addcc $cnt,-1,$cnt | ||
284 | xor $Zlo,$tmp,$Zlo | ||
285 | and $nlo,0x0f,$nlo | ||
286 | xor $Tlo,$Zlo,$Zlo | ||
287 | sll $nlo,4,$nlo | ||
288 | blu .Lgmult_inner | ||
289 | and $Zlo,0xf,$remi | ||
290 | |||
291 | ldx [$Htblo+$nlo],$Tlo | ||
292 | sll $remi,3,$remi | ||
293 | xor $Thi,$Zhi,$Zhi | ||
294 | ldx [$Htbl+$nlo],$Thi | ||
295 | srlx $Zlo,4,$Zlo | ||
296 | xor $rem,$Zhi,$Zhi | ||
297 | ldx [$rem_4bit+$remi],$rem | ||
298 | sllx $Zhi,60,$tmp | ||
299 | xor $Tlo,$Zlo,$Zlo | ||
300 | srlx $Zhi,4,$Zhi | ||
301 | xor $Zlo,$tmp,$Zlo | ||
302 | xor $Thi,$Zhi,$Zhi | ||
303 | and $Zlo,0xf,$remi | ||
304 | |||
305 | ldx [$Htblo+$nhi],$Tlo | ||
306 | sll $remi,3,$remi | ||
307 | xor $rem,$Zhi,$Zhi | ||
308 | ldx [$Htbl+$nhi],$Thi | ||
309 | srlx $Zlo,4,$Zlo | ||
310 | ldx [$rem_4bit+$remi],$rem | ||
311 | sllx $Zhi,60,$tmp | ||
312 | xor $Tlo,$Zlo,$Zlo | ||
313 | srlx $Zhi,4,$Zhi | ||
314 | xor $Zlo,$tmp,$Zlo | ||
315 | xor $Thi,$Zhi,$Zhi | ||
316 | stx $Zlo,[$Xi+8] | ||
317 | xor $rem,$Zhi,$Zhi | ||
318 | stx $Zhi,[$Xi] | ||
319 | |||
320 | ret | ||
321 | restore | ||
322 | .type gcm_gmult_4bit,#function | ||
323 | .size gcm_gmult_4bit,(.-gcm_gmult_4bit) | ||
324 | .asciz "GHASH for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>" | ||
325 | .align 4 | ||
326 | ___ | ||
327 | |||
# Replace every back-quoted expression in the accumulated template with the
# result of evaluating it as Perl (the constant shifts in rem_4bit and the
# condition-code register choice in gcm_ghash_4bit), then write the finished
# assembly to STDOUT, which was redirected to the output file earlier.
328 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | ||
329 | print $code; | ||
# Buffered write errors only surface when the handle is closed, so check the
# close and fail loudly rather than leave a truncated file behind silently.
330 | close STDOUT or die "error closing STDOUT: $!"; | ||