Diffstat (limited to 'src/lib/libcrypto/modes/asm/ghash-x86.pl')
-rw-r--r-- | src/lib/libcrypto/modes/asm/ghash-x86.pl | 1326
1 file changed, 0 insertions, 1326 deletions
diff --git a/src/lib/libcrypto/modes/asm/ghash-x86.pl b/src/lib/libcrypto/modes/asm/ghash-x86.pl
deleted file mode 100644
index 47833582b6..0000000000
--- a/src/lib/libcrypto/modes/asm/ghash-x86.pl
+++ /dev/null
@@ -1,1326 +0,0 @@
1 | #!/usr/bin/env perl | ||
2 | # | ||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | # | ||
10 | # March, May, June 2010 | ||
11 | # | ||
12 | # The module implements the "4-bit" GCM GHASH function and the underlying | ||
13 | # single multiplication operation in GF(2^128). "4-bit" means that it | ||
14 | # uses a 256-byte per-key table [+64/128 bytes of fixed table]. It has two | ||
15 | # code paths: vanilla x86 and vanilla MMX. The former is executed on | ||
16 | # 486 and Pentium, the latter on all others. MMX GHASH features the | ||
17 | # so-called "528B" variant of the "4-bit" method, utilizing an additional | ||
18 | # 256+16 bytes of per-key storage [+512 bytes of shared table]. Performance | ||
19 | # results are for the streamed GHASH subroutine and are expressed in cycles | ||
20 | # per processed byte; less is better: | ||
21 | # | ||
22 | # gcc 2.95.3(*) MMX assembler x86 assembler | ||
23 | # | ||
24 | # Pentium 105/111(**) - 50 | ||
25 | # PIII 68 /75 12.2 24 | ||
26 | # P4 125/125 17.8 84(***) | ||
27 | # Opteron 66 /70 10.1 30 | ||
28 | # Core2 54 /67 8.4 18 | ||
29 | # | ||
30 | # (*) gcc 3.4.x was observed to generate code a few percent slower, | ||
31 | # which is one of the reasons why the 2.95.3 results were chosen; | ||
32 | # the other reason is the lack of 3.4.x results for older CPUs. | ||
33 | # Comparison with the MMX results is not completely fair, because | ||
34 | # the C results are for the vanilla "256B" implementation, while | ||
35 | # the assembler results are for "528B";-) | ||
36 | # (**) the second number is the result for code compiled with the -fPIC | ||
37 | # flag, which is actually more relevant, because the assembler code | ||
38 | # is position-independent; | ||
39 | # (***) see the comment in the non-MMX routine for further details; | ||
40 | # | ||
41 | # To summarize, it's >2-5 times faster than gcc-generated code. To | ||
42 | # anchor it to something else: SHA1 assembler processes one byte in | ||
43 | # 11-13 cycles on contemporary x86 cores. As for the choice of MMX in | ||
44 | # particular, see the comment at the end of the file... | ||
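
# Illustrative sketch, not part of the original module and never invoked by
# it: the per-nibble step that the x86 and MMX "4-bit" code paths below
# implement, written in plain Perl (cf. the C reference in gcm128.c). It
# assumes a 64-bit perl; $Htable is a reference to an array of [hi,lo] pairs
# for H*0..H*15 and $rem_4bit a reference to the 16-bit constants from the
# data section at the end of the file, shifted left by 48 as in gcm128.c.
# The helper name is hypothetical.
sub ghash_4bit_step_ref {
	my ($Zhi,$Zlo,$nibble,$Htable,$rem_4bit) = @_;
	my $rem = $Zlo & 0xf;			# bits shifted out at the bottom...
	$Zlo = (($Zhi&0xf)<<60) | ($Zlo>>4);	# Z >>= 4
	$Zhi = ($Zhi>>4) ^ $rem_4bit->[$rem];	# ...are folded back in at the top
	$Zhi ^= $Htable->[$nibble][0];		# Z ^= Htable[nibble]
	$Zlo ^= $Htable->[$nibble][1];
	return ($Zhi,$Zlo);
}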
45 | |||
46 | # May 2010 | ||
47 | # | ||
48 | # Add a PCLMULQDQ version performing at 2.10 cycles per processed byte. | ||
49 | # The question is how close it is to the theoretical limit. The pclmulqdq | ||
50 | # instruction latency appears to be 14 cycles and there can't be more | ||
51 | # than 2 of them executing at any given time. This means that a single | ||
52 | # Karatsuba multiplication would take 28 cycles *plus* a few cycles for | ||
53 | # pre- and post-processing. The multiplication then has to be followed by | ||
54 | # modulo-reduction. Given that the aggregated reduction method [see the | ||
55 | # "Carry-less Multiplication and Its Usage for Computing the GCM Mode" | ||
56 | # white paper by Intel] allows reduction to be performed only once in | ||
57 | # a while, the asymptotic performance can be estimated as | ||
58 | # (28+Tmod/Naggr)/16, where Tmod is the time to perform the reduction | ||
59 | # and Naggr is the aggregation factor. | ||
60 | # | ||
61 | # Before we proceed to this implementation, let's have a closer look at | ||
62 | # the best-performing code suggested by Intel in their white paper. | ||
63 | # By tracing inter-register dependencies, Tmod is estimated as ~19 | ||
64 | # cycles and the Naggr chosen by Intel is 4, resulting in 2.05 cycles per | ||
65 | # processed byte. As implied, this is quite an optimistic estimate, | ||
66 | # because it does not account for Karatsuba pre- and post-processing, | ||
67 | # which for a single multiplication is ~5 cycles. Unfortunately Intel | ||
68 | # does not provide performance data for GHASH alone. But benchmarking | ||
69 | # AES_GCM_encrypt ripped out of Fig. 15 of the white paper with aadt | ||
70 | # alone resulted in 2.46 cycles per byte out of a 16KB buffer. Note that | ||
71 | # the result even accounts for pre-computing the degrees of the hash | ||
72 | # key H, but that portion is negligible at 16KB buffer size. | ||
73 | # | ||
74 | # Moving on to the implementation in question. Tmod is estimated as | ||
75 | # ~13 cycles and Naggr is 2, giving an asymptotic performance of ... | ||
76 | # 2.16. How is it possible that the measured performance is better than | ||
77 | # the optimistic theoretical estimate? There is one thing Intel failed | ||
78 | # to recognize. By serializing GHASH with CTR in the same subroutine, | ||
79 | # the former's performance really is limited by the (Tmul + Tmod/Naggr) | ||
80 | # equation above. But if the GHASH procedure is detached, the | ||
81 | # modulo-reduction can be interleaved with Naggr-1 multiplications at the | ||
82 | # instruction level and under ideal conditions even disappear from the | ||
83 | # equation. So the optimistic theoretical estimate for this implementation | ||
84 | # is ... 28/16=1.75, and not 2.16. Well, it's probably way too optimistic, | ||
85 | # at least for such a small Naggr. I'd argue that (28+Tproc/Naggr)/16, | ||
86 | # where Tproc is the time required for Karatsuba pre- and post-processing, | ||
87 | # is a more realistic estimate. In this case it gives ... 1.91 cycles. | ||
88 | # In other words, depending on how well we can interleave the reduction | ||
89 | # with one of the two multiplications, the performance should be between | ||
90 | # 1.91 and 2.16. As already mentioned, this implementation processes one | ||
91 | # byte out of an 8KB buffer in 2.10 cycles, while the x86_64 counterpart | ||
92 | # does it in 2.02. x86_64 performance is better, because the larger | ||
93 | # register bank allows better interleaving of reduction and multiplication. | ||
94 | # | ||
95 | # Does it make sense to increase Naggr? To start with, it's virtually | ||
96 | # impossible in 32-bit mode, because of the limited register bank | ||
97 | # capacity. Otherwise the improvement has to be weighed against slower | ||
98 | # setup, as well as the increase in code size and complexity. As even | ||
99 | # an optimistic estimate doesn't promise a 30% performance improvement, | ||
100 | # there are currently no plans to increase Naggr. | ||
101 | # | ||
102 | # Special thanks to David Woodhouse <dwmw2@infradead.org> for | ||
103 | # providing access to a Westmere-based system on behalf of Intel | ||
104 | # Open Source Technology Centre. | ||
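
# Illustrative sketch, not part of the original module: plugging the figures
# quoted above into the (Tmul + Tmod/Naggr)/16 estimate. The helper name
# estimate_cpb() is hypothetical and is never called by the generator.
sub estimate_cpb { my ($Tmul,$Tmod,$Naggr) = @_; return ($Tmul+$Tmod/$Naggr)/16; }
# estimate_cpb(28,19,4) == ~2.05	# Intel's code:  Tmod~19, Naggr=4
# estimate_cpb(28,13,2) == ~2.16	# this module:   Tmod~13, Naggr=2
# estimate_cpb(28, 5,2) == ~1.91	# with Tproc~5 substituted for Tmod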
105 | |||
106 | # January 2010 | ||
107 | # | ||
108 | # After tweaking to optimize transitions between integer and FP operations | ||
109 | # on the same XMM register, the PCLMULQDQ subroutine was measured to process | ||
110 | # one byte in 2.07 cycles on Sandy Bridge, and in 2.12 on Westmere. | ||
111 | # The minor regression on Westmere is outweighed by the ~15% improvement | ||
112 | # on Sandy Bridge. Strangely enough, an attempt to modify the 64-bit code | ||
113 | # in a similar manner resulted in an almost 20% degradation on Sandy | ||
114 | # Bridge, where the original 64-bit code processes one byte in 1.95 cycles. | ||
115 | |||
116 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
117 | push(@INC,"${dir}","${dir}../../perlasm"); | ||
118 | require "x86asm.pl"; | ||
119 | |||
120 | &asm_init($ARGV[0],"ghash-x86.pl",$x86only = $ARGV[$#ARGV] eq "386"); | ||
121 | |||
122 | $sse2=0; | ||
123 | for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } | ||
124 | |||
125 | ($Zhh,$Zhl,$Zlh,$Zll) = ("ebp","edx","ecx","ebx"); | ||
126 | $inp = "edi"; | ||
127 | $Htbl = "esi"; | ||
128 | |||
129 | $unroll = 0; # Affects x86 loop. Folded loop performs ~7% worse | ||
130 | # than unrolled, which has to be weighed against | ||
131 | # 2.5x x86-specific code size reduction. | ||
132 | |||
133 | sub x86_loop { | ||
134 | my $off = shift; | ||
135 | my $rem = "eax"; | ||
136 | |||
137 | &mov ($Zhh,&DWP(4,$Htbl,$Zll)); | ||
138 | &mov ($Zhl,&DWP(0,$Htbl,$Zll)); | ||
139 | &mov ($Zlh,&DWP(12,$Htbl,$Zll)); | ||
140 | &mov ($Zll,&DWP(8,$Htbl,$Zll)); | ||
141 | &xor ($rem,$rem); # avoid partial register stalls on PIII | ||
142 | |||
143 | # shrd practically kills P4, a 2.5x deterioration, but P4 has an | ||
144 | # MMX code-path to execute. shrd runs a tad faster [than twice | ||
145 | # the shifts, moves and ors] on pre-MMX Pentium (as well as on | ||
146 | # PIII and Core2), *but* it minimizes code size, spares a register | ||
147 | # and thus allows the loop to be folded... | ||
148 | if (!$unroll) { | ||
149 | my $cnt = $inp; | ||
150 | &mov ($cnt,15); | ||
151 | &jmp (&label("x86_loop")); | ||
152 | &set_label("x86_loop",16); | ||
153 | for($i=1;$i<=2;$i++) { | ||
154 | &mov (&LB($rem),&LB($Zll)); | ||
155 | &shrd ($Zll,$Zlh,4); | ||
156 | &and (&LB($rem),0xf); | ||
157 | &shrd ($Zlh,$Zhl,4); | ||
158 | &shrd ($Zhl,$Zhh,4); | ||
159 | &shr ($Zhh,4); | ||
160 | &xor ($Zhh,&DWP($off+16,"esp",$rem,4)); | ||
161 | |||
162 | &mov (&LB($rem),&BP($off,"esp",$cnt)); | ||
163 | if ($i&1) { | ||
164 | &and (&LB($rem),0xf0); | ||
165 | } else { | ||
166 | &shl (&LB($rem),4); | ||
167 | } | ||
168 | |||
169 | &xor ($Zll,&DWP(8,$Htbl,$rem)); | ||
170 | &xor ($Zlh,&DWP(12,$Htbl,$rem)); | ||
171 | &xor ($Zhl,&DWP(0,$Htbl,$rem)); | ||
172 | &xor ($Zhh,&DWP(4,$Htbl,$rem)); | ||
173 | |||
174 | if ($i&1) { | ||
175 | &dec ($cnt); | ||
176 | &js (&label("x86_break")); | ||
177 | } else { | ||
178 | &jmp (&label("x86_loop")); | ||
179 | } | ||
180 | } | ||
181 | &set_label("x86_break",16); | ||
182 | } else { | ||
183 | for($i=1;$i<32;$i++) { | ||
184 | &comment($i); | ||
185 | &mov (&LB($rem),&LB($Zll)); | ||
186 | &shrd ($Zll,$Zlh,4); | ||
187 | &and (&LB($rem),0xf); | ||
188 | &shrd ($Zlh,$Zhl,4); | ||
189 | &shrd ($Zhl,$Zhh,4); | ||
190 | &shr ($Zhh,4); | ||
191 | &xor ($Zhh,&DWP($off+16,"esp",$rem,4)); | ||
192 | |||
193 | if ($i&1) { | ||
194 | &mov (&LB($rem),&BP($off+15-($i>>1),"esp")); | ||
195 | &and (&LB($rem),0xf0); | ||
196 | } else { | ||
197 | &mov (&LB($rem),&BP($off+15-($i>>1),"esp")); | ||
198 | &shl (&LB($rem),4); | ||
199 | } | ||
200 | |||
201 | &xor ($Zll,&DWP(8,$Htbl,$rem)); | ||
202 | &xor ($Zlh,&DWP(12,$Htbl,$rem)); | ||
203 | &xor ($Zhl,&DWP(0,$Htbl,$rem)); | ||
204 | &xor ($Zhh,&DWP(4,$Htbl,$rem)); | ||
205 | } | ||
206 | } | ||
207 | &bswap ($Zll); | ||
208 | &bswap ($Zlh); | ||
209 | &bswap ($Zhl); | ||
210 | if (!$x86only) { | ||
211 | &bswap ($Zhh); | ||
212 | } else { | ||
213 | &mov ("eax",$Zhh); | ||
214 | &bswap ("eax"); | ||
215 | &mov ($Zhh,"eax"); | ||
216 | } | ||
217 | } | ||
218 | |||
219 | if ($unroll) { | ||
220 | &function_begin_B("_x86_gmult_4bit_inner"); | ||
221 | &x86_loop(4); | ||
222 | &ret (); | ||
223 | &function_end_B("_x86_gmult_4bit_inner"); | ||
224 | } | ||
225 | |||
226 | sub deposit_rem_4bit { | ||
227 | my $bias = shift; | ||
228 | |||
229 | &mov (&DWP($bias+0, "esp"),0x0000<<16); | ||
230 | &mov (&DWP($bias+4, "esp"),0x1C20<<16); | ||
231 | &mov (&DWP($bias+8, "esp"),0x3840<<16); | ||
232 | &mov (&DWP($bias+12,"esp"),0x2460<<16); | ||
233 | &mov (&DWP($bias+16,"esp"),0x7080<<16); | ||
234 | &mov (&DWP($bias+20,"esp"),0x6CA0<<16); | ||
235 | &mov (&DWP($bias+24,"esp"),0x48C0<<16); | ||
236 | &mov (&DWP($bias+28,"esp"),0x54E0<<16); | ||
237 | &mov (&DWP($bias+32,"esp"),0xE100<<16); | ||
238 | &mov (&DWP($bias+36,"esp"),0xFD20<<16); | ||
239 | &mov (&DWP($bias+40,"esp"),0xD940<<16); | ||
240 | &mov (&DWP($bias+44,"esp"),0xC560<<16); | ||
241 | &mov (&DWP($bias+48,"esp"),0x9180<<16); | ||
242 | &mov (&DWP($bias+52,"esp"),0x8DA0<<16); | ||
243 | &mov (&DWP($bias+56,"esp"),0xA9C0<<16); | ||
244 | &mov (&DWP($bias+60,"esp"),0xB5E0<<16); | ||
245 | } | ||
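
# Illustrative sketch, not part of the original module: the constants
# deposited above -- and the rem_4bit/rem_8bit tables in the data section at
# the end of this file -- can be reproduced from 0x1C2, the bit-reflected
# form of the GCM reduction polynomial 0xE1. deposit_rem_4bit additionally
# shifts the values left by 16, and the rem_4bit data by $S. The helper
# names are hypothetical and never called by the generator.
sub calc_rem_8bit {
	my ($i,$r) = (shift,0);
	for my $b (0..7) { $r ^= 0x1C2<<$b if (($i>>$b)&1); }
	return $r;				# e.g. calc_rem_8bit(0x03) == 0x0246
}
sub calc_rem_4bit { return calc_rem_8bit($_[0])<<4; }	# e.g. calc_rem_4bit(1) == 0x1C20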
246 | |||
247 | $suffix = $x86only ? "" : "_x86"; | ||
248 | |||
249 | &function_begin("gcm_gmult_4bit".$suffix); | ||
250 | &stack_push(16+4+1); # +1 for stack alignment | ||
251 | &mov ($inp,&wparam(0)); # load Xi | ||
252 | &mov ($Htbl,&wparam(1)); # load Htable | ||
253 | |||
254 | &mov ($Zhh,&DWP(0,$inp)); # load Xi[16] | ||
255 | &mov ($Zhl,&DWP(4,$inp)); | ||
256 | &mov ($Zlh,&DWP(8,$inp)); | ||
257 | &mov ($Zll,&DWP(12,$inp)); | ||
258 | |||
259 | &deposit_rem_4bit(16); | ||
260 | |||
261 | &mov (&DWP(0,"esp"),$Zhh); # copy Xi[16] on stack | ||
262 | &mov (&DWP(4,"esp"),$Zhl); | ||
263 | &mov (&DWP(8,"esp"),$Zlh); | ||
264 | &mov (&DWP(12,"esp"),$Zll); | ||
265 | &shr ($Zll,20); | ||
266 | &and ($Zll,0xf0); | ||
267 | |||
268 | if ($unroll) { | ||
269 | &call ("_x86_gmult_4bit_inner"); | ||
270 | } else { | ||
271 | &x86_loop(0); | ||
272 | &mov ($inp,&wparam(0)); | ||
273 | } | ||
274 | |||
275 | &mov (&DWP(12,$inp),$Zll); | ||
276 | &mov (&DWP(8,$inp),$Zlh); | ||
277 | &mov (&DWP(4,$inp),$Zhl); | ||
278 | &mov (&DWP(0,$inp),$Zhh); | ||
279 | &stack_pop(16+4+1); | ||
280 | &function_end("gcm_gmult_4bit".$suffix); | ||
281 | |||
282 | &function_begin("gcm_ghash_4bit".$suffix); | ||
283 | &stack_push(16+4+1); # +1 for 64-bit alignment | ||
284 | &mov ($Zll,&wparam(0)); # load Xi | ||
285 | &mov ($Htbl,&wparam(1)); # load Htable | ||
286 | &mov ($inp,&wparam(2)); # load in | ||
287 | &mov ("ecx",&wparam(3)); # load len | ||
288 | &add ("ecx",$inp); | ||
289 | &mov (&wparam(3),"ecx"); | ||
290 | |||
291 | &mov ($Zhh,&DWP(0,$Zll)); # load Xi[16] | ||
292 | &mov ($Zhl,&DWP(4,$Zll)); | ||
293 | &mov ($Zlh,&DWP(8,$Zll)); | ||
294 | &mov ($Zll,&DWP(12,$Zll)); | ||
295 | |||
296 | &deposit_rem_4bit(16); | ||
297 | |||
298 | &set_label("x86_outer_loop",16); | ||
299 | &xor ($Zll,&DWP(12,$inp)); # xor with input | ||
300 | &xor ($Zlh,&DWP(8,$inp)); | ||
301 | &xor ($Zhl,&DWP(4,$inp)); | ||
302 | &xor ($Zhh,&DWP(0,$inp)); | ||
303 | &mov (&DWP(12,"esp"),$Zll); # dump it on stack | ||
304 | &mov (&DWP(8,"esp"),$Zlh); | ||
305 | &mov (&DWP(4,"esp"),$Zhl); | ||
306 | &mov (&DWP(0,"esp"),$Zhh); | ||
307 | |||
308 | &shr ($Zll,20); | ||
309 | &and ($Zll,0xf0); | ||
310 | |||
311 | if ($unroll) { | ||
312 | &call ("_x86_gmult_4bit_inner"); | ||
313 | } else { | ||
314 | &x86_loop(0); | ||
315 | &mov ($inp,&wparam(2)); | ||
316 | } | ||
317 | &lea ($inp,&DWP(16,$inp)); | ||
318 | &cmp ($inp,&wparam(3)); | ||
319 | &mov (&wparam(2),$inp) if (!$unroll); | ||
320 | &jb (&label("x86_outer_loop")); | ||
321 | |||
322 | &mov ($inp,&wparam(0)); # load Xi | ||
323 | &mov (&DWP(12,$inp),$Zll); | ||
324 | &mov (&DWP(8,$inp),$Zlh); | ||
325 | &mov (&DWP(4,$inp),$Zhl); | ||
326 | &mov (&DWP(0,$inp),$Zhh); | ||
327 | &stack_pop(16+4+1); | ||
328 | &function_end("gcm_ghash_4bit".$suffix); | ||
329 | |||
330 | if (!$x86only) {{{ | ||
331 | |||
332 | &static_label("rem_4bit"); | ||
333 | |||
334 | if (!$sse2) {{ # pure-MMX "May" version... | ||
335 | |||
336 | $S=12; # shift factor for rem_4bit | ||
337 | |||
338 | &function_begin_B("_mmx_gmult_4bit_inner"); | ||
339 | # The MMX version performs 3.5 times better on P4 (see the comment in the | ||
340 | # non-MMX routine for further details), 100% better on Opteron, ~70% better | ||
341 | # on Core2 and PIII... In other words the effort is considered to be well | ||
342 | # spent... Since the initial release the loop was unrolled in order to | ||
343 | # "liberate" the register previously used as the loop counter. Instead it's | ||
344 | # used to optimize the critical path in 'Z.hi ^= rem_4bit[Z.lo&0xf]'. | ||
345 | # The path involves a move of Z.lo from an MMX to an integer register, | ||
346 | # effective address calculation and finally a merge of the value into Z.hi. | ||
347 | # The reference to rem_4bit is scheduled so late that I had to >>4 the | ||
348 | # rem_4bit elements. This resulted in a 20-45% improvement on | ||
349 | # contemporary µ-archs. | ||
350 | { | ||
351 | my $cnt; | ||
352 | my $rem_4bit = "eax"; | ||
353 | my @rem = ($Zhh,$Zll); | ||
354 | my $nhi = $Zhl; | ||
355 | my $nlo = $Zlh; | ||
356 | |||
357 | my ($Zlo,$Zhi) = ("mm0","mm1"); | ||
358 | my $tmp = "mm2"; | ||
359 | |||
360 | &xor ($nlo,$nlo); # avoid partial register stalls on PIII | ||
361 | &mov ($nhi,$Zll); | ||
362 | &mov (&LB($nlo),&LB($nhi)); | ||
363 | &shl (&LB($nlo),4); | ||
364 | &and ($nhi,0xf0); | ||
365 | &movq ($Zlo,&QWP(8,$Htbl,$nlo)); | ||
366 | &movq ($Zhi,&QWP(0,$Htbl,$nlo)); | ||
367 | &movd ($rem[0],$Zlo); | ||
368 | |||
369 | for ($cnt=28;$cnt>=-2;$cnt--) { | ||
370 | my $odd = $cnt&1; | ||
371 | my $nix = $odd ? $nlo : $nhi; | ||
372 | |||
373 | &shl (&LB($nlo),4) if ($odd); | ||
374 | &psrlq ($Zlo,4); | ||
375 | &movq ($tmp,$Zhi); | ||
376 | &psrlq ($Zhi,4); | ||
377 | &pxor ($Zlo,&QWP(8,$Htbl,$nix)); | ||
378 | &mov (&LB($nlo),&BP($cnt/2,$inp)) if (!$odd && $cnt>=0); | ||
379 | &psllq ($tmp,60); | ||
380 | &and ($nhi,0xf0) if ($odd); | ||
381 | &pxor ($Zhi,&QWP(0,$rem_4bit,$rem[1],8)) if ($cnt<28); | ||
382 | &and ($rem[0],0xf); | ||
383 | &pxor ($Zhi,&QWP(0,$Htbl,$nix)); | ||
384 | &mov ($nhi,$nlo) if (!$odd && $cnt>=0); | ||
385 | &movd ($rem[1],$Zlo); | ||
386 | &pxor ($Zlo,$tmp); | ||
387 | |||
388 | push (@rem,shift(@rem)); # "rotate" registers | ||
389 | } | ||
390 | |||
391 | &mov ($inp,&DWP(4,$rem_4bit,$rem[1],8)); # last rem_4bit[rem] | ||
392 | |||
393 | &psrlq ($Zlo,32); # lower part of Zlo is already there | ||
394 | &movd ($Zhl,$Zhi); | ||
395 | &psrlq ($Zhi,32); | ||
396 | &movd ($Zlh,$Zlo); | ||
397 | &movd ($Zhh,$Zhi); | ||
398 | &shl ($inp,4); # compensate for rem_4bit[i] being >>4 | ||
399 | |||
400 | &bswap ($Zll); | ||
401 | &bswap ($Zhl); | ||
402 | &bswap ($Zlh); | ||
403 | &xor ($Zhh,$inp); | ||
404 | &bswap ($Zhh); | ||
405 | |||
406 | &ret (); | ||
407 | } | ||
408 | &function_end_B("_mmx_gmult_4bit_inner"); | ||
409 | |||
410 | &function_begin("gcm_gmult_4bit_mmx"); | ||
411 | &mov ($inp,&wparam(0)); # load Xi | ||
412 | &mov ($Htbl,&wparam(1)); # load Htable | ||
413 | |||
414 | &picsetup("eax"); | ||
415 | &picsymbol("eax", &label("rem_4bit"), "eax"); | ||
416 | |||
417 | &movz ($Zll,&BP(15,$inp)); | ||
418 | |||
419 | &call ("_mmx_gmult_4bit_inner"); | ||
420 | |||
421 | &mov ($inp,&wparam(0)); # load Xi | ||
422 | &emms (); | ||
423 | &mov (&DWP(12,$inp),$Zll); | ||
424 | &mov (&DWP(4,$inp),$Zhl); | ||
425 | &mov (&DWP(8,$inp),$Zlh); | ||
426 | &mov (&DWP(0,$inp),$Zhh); | ||
427 | &function_end("gcm_gmult_4bit_mmx"); | ||
428 | |||
429 | # Streamed version performs 20% better on P4, 7% on Opteron, | ||
430 | # 10% on Core2 and PIII... | ||
431 | &function_begin("gcm_ghash_4bit_mmx"); | ||
432 | &mov ($Zhh,&wparam(0)); # load Xi | ||
433 | &mov ($Htbl,&wparam(1)); # load Htable | ||
434 | &mov ($inp,&wparam(2)); # load in | ||
435 | &mov ($Zlh,&wparam(3)); # load len | ||
436 | |||
437 | &picsetup("eax"); | ||
438 | &picsymbol("eax", &label("rem_4bit"), "eax"); | ||
439 | |||
440 | &add ($Zlh,$inp); | ||
441 | &mov (&wparam(3),$Zlh); # len to point at the end of input | ||
442 | &stack_push(4+1); # +1 for stack alignment | ||
443 | |||
444 | &mov ($Zll,&DWP(12,$Zhh)); # load Xi[16] | ||
445 | &mov ($Zhl,&DWP(4,$Zhh)); | ||
446 | &mov ($Zlh,&DWP(8,$Zhh)); | ||
447 | &mov ($Zhh,&DWP(0,$Zhh)); | ||
448 | &jmp (&label("mmx_outer_loop")); | ||
449 | |||
450 | &set_label("mmx_outer_loop",16); | ||
451 | &xor ($Zll,&DWP(12,$inp)); | ||
452 | &xor ($Zhl,&DWP(4,$inp)); | ||
453 | &xor ($Zlh,&DWP(8,$inp)); | ||
454 | &xor ($Zhh,&DWP(0,$inp)); | ||
455 | &mov (&wparam(2),$inp); | ||
456 | &mov (&DWP(12,"esp"),$Zll); | ||
457 | &mov (&DWP(4,"esp"),$Zhl); | ||
458 | &mov (&DWP(8,"esp"),$Zlh); | ||
459 | &mov (&DWP(0,"esp"),$Zhh); | ||
460 | |||
461 | &mov ($inp,"esp"); | ||
462 | &shr ($Zll,24); | ||
463 | |||
464 | &call ("_mmx_gmult_4bit_inner"); | ||
465 | |||
466 | &mov ($inp,&wparam(2)); | ||
467 | &lea ($inp,&DWP(16,$inp)); | ||
468 | &cmp ($inp,&wparam(3)); | ||
469 | &jb (&label("mmx_outer_loop")); | ||
470 | |||
471 | &mov ($inp,&wparam(0)); # load Xi | ||
472 | &emms (); | ||
473 | &mov (&DWP(12,$inp),$Zll); | ||
474 | &mov (&DWP(4,$inp),$Zhl); | ||
475 | &mov (&DWP(8,$inp),$Zlh); | ||
476 | &mov (&DWP(0,$inp),$Zhh); | ||
477 | |||
478 | &stack_pop(4+1); | ||
479 | &function_end("gcm_ghash_4bit_mmx"); | ||
480 | |||
481 | }} else {{ # "June" MMX version... | ||
482 | # ... has slower "April" gcm_gmult_4bit_mmx with folded | ||
483 | # loop. This is done to conserve code size... | ||
484 | $S=16; # shift factor for rem_4bit | ||
485 | |||
486 | sub mmx_loop() { | ||
487 | # The MMX version performs 2.8 times better on P4 (see the comment in the | ||
488 | # non-MMX routine for further details), 40% better on Opteron and Core2, | ||
489 | # 50% better on PIII... In other words the effort is considered to be | ||
490 | # well spent... | ||
491 | my $inp = shift; | ||
492 | my $rem_4bit = shift; | ||
493 | my $cnt = $Zhh; | ||
494 | my $nhi = $Zhl; | ||
495 | my $nlo = $Zlh; | ||
496 | my $rem = $Zll; | ||
497 | |||
498 | my ($Zlo,$Zhi) = ("mm0","mm1"); | ||
499 | my $tmp = "mm2"; | ||
500 | |||
501 | &xor ($nlo,$nlo); # avoid partial register stalls on PIII | ||
502 | &mov ($nhi,$Zll); | ||
503 | &mov (&LB($nlo),&LB($nhi)); | ||
504 | &mov ($cnt,14); | ||
505 | &shl (&LB($nlo),4); | ||
506 | &and ($nhi,0xf0); | ||
507 | &movq ($Zlo,&QWP(8,$Htbl,$nlo)); | ||
508 | &movq ($Zhi,&QWP(0,$Htbl,$nlo)); | ||
509 | &movd ($rem,$Zlo); | ||
510 | &jmp (&label("mmx_loop")); | ||
511 | |||
512 | &set_label("mmx_loop",16); | ||
513 | &psrlq ($Zlo,4); | ||
514 | &and ($rem,0xf); | ||
515 | &movq ($tmp,$Zhi); | ||
516 | &psrlq ($Zhi,4); | ||
517 | &pxor ($Zlo,&QWP(8,$Htbl,$nhi)); | ||
518 | &mov (&LB($nlo),&BP(0,$inp,$cnt)); | ||
519 | &psllq ($tmp,60); | ||
520 | &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8)); | ||
521 | &dec ($cnt); | ||
522 | &movd ($rem,$Zlo); | ||
523 | &pxor ($Zhi,&QWP(0,$Htbl,$nhi)); | ||
524 | &mov ($nhi,$nlo); | ||
525 | &pxor ($Zlo,$tmp); | ||
526 | &js (&label("mmx_break")); | ||
527 | |||
528 | &shl (&LB($nlo),4); | ||
529 | &and ($rem,0xf); | ||
530 | &psrlq ($Zlo,4); | ||
531 | &and ($nhi,0xf0); | ||
532 | &movq ($tmp,$Zhi); | ||
533 | &psrlq ($Zhi,4); | ||
534 | &pxor ($Zlo,&QWP(8,$Htbl,$nlo)); | ||
535 | &psllq ($tmp,60); | ||
536 | &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8)); | ||
537 | &movd ($rem,$Zlo); | ||
538 | &pxor ($Zhi,&QWP(0,$Htbl,$nlo)); | ||
539 | &pxor ($Zlo,$tmp); | ||
540 | &jmp (&label("mmx_loop")); | ||
541 | |||
542 | &set_label("mmx_break",16); | ||
543 | &shl (&LB($nlo),4); | ||
544 | &and ($rem,0xf); | ||
545 | &psrlq ($Zlo,4); | ||
546 | &and ($nhi,0xf0); | ||
547 | &movq ($tmp,$Zhi); | ||
548 | &psrlq ($Zhi,4); | ||
549 | &pxor ($Zlo,&QWP(8,$Htbl,$nlo)); | ||
550 | &psllq ($tmp,60); | ||
551 | &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8)); | ||
552 | &movd ($rem,$Zlo); | ||
553 | &pxor ($Zhi,&QWP(0,$Htbl,$nlo)); | ||
554 | &pxor ($Zlo,$tmp); | ||
555 | |||
556 | &psrlq ($Zlo,4); | ||
557 | &and ($rem,0xf); | ||
558 | &movq ($tmp,$Zhi); | ||
559 | &psrlq ($Zhi,4); | ||
560 | &pxor ($Zlo,&QWP(8,$Htbl,$nhi)); | ||
561 | &psllq ($tmp,60); | ||
562 | &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8)); | ||
563 | &movd ($rem,$Zlo); | ||
564 | &pxor ($Zhi,&QWP(0,$Htbl,$nhi)); | ||
565 | &pxor ($Zlo,$tmp); | ||
566 | |||
567 | &psrlq ($Zlo,32); # lower part of Zlo is already there | ||
568 | &movd ($Zhl,$Zhi); | ||
569 | &psrlq ($Zhi,32); | ||
570 | &movd ($Zlh,$Zlo); | ||
571 | &movd ($Zhh,$Zhi); | ||
572 | |||
573 | &bswap ($Zll); | ||
574 | &bswap ($Zhl); | ||
575 | &bswap ($Zlh); | ||
576 | &bswap ($Zhh); | ||
577 | } | ||
578 | |||
579 | &function_begin("gcm_gmult_4bit_mmx"); | ||
580 | &mov ($inp,&wparam(0)); # load Xi | ||
581 | &mov ($Htbl,&wparam(1)); # load Htable | ||
582 | |||
583 | &picsetup("eax"); | ||
584 | &picsymbol("eax", &label("rem_4bit"), "eax"); | ||
585 | |||
586 | &movz ($Zll,&BP(15,$inp)); | ||
587 | |||
588 | &mmx_loop($inp,"eax"); | ||
589 | |||
590 | &emms (); | ||
591 | &mov (&DWP(12,$inp),$Zll); | ||
592 | &mov (&DWP(4,$inp),$Zhl); | ||
593 | &mov (&DWP(8,$inp),$Zlh); | ||
594 | &mov (&DWP(0,$inp),$Zhh); | ||
595 | &function_end("gcm_gmult_4bit_mmx"); | ||
596 | |||
597 | ###################################################################### | ||
598 | # The subroutine below is the "528B" variant of the "4-bit" GCM GHASH | ||
599 | # function (see gcm128.c for details). It provides a further 20-40% | ||
600 | # performance improvement over the above-mentioned "May" version. | ||
601 | |||
602 | &static_label("rem_8bit"); | ||
603 | |||
604 | &function_begin("gcm_ghash_4bit_mmx"); | ||
605 | { my ($Zlo,$Zhi) = ("mm7","mm6"); | ||
606 | my $rem_8bit = "esi"; | ||
607 | my $Htbl = "ebx"; | ||
608 | |||
609 | # parameter block | ||
610 | &mov ("eax",&wparam(0)); # Xi | ||
611 | &mov ("ebx",&wparam(1)); # Htable | ||
612 | &mov ("ecx",&wparam(2)); # inp | ||
613 | &mov ("edx",&wparam(3)); # len | ||
614 | &mov ("ebp","esp"); # original %esp | ||
615 | |||
616 | &picsetup($rem_8bit); | ||
617 | &picsymbol($rem_8bit, &label("rem_8bit"), $rem_8bit); | ||
618 | |||
619 | &sub ("esp",512+16+16); # allocate stack frame... | ||
620 | &and ("esp",-64); # ...and align it | ||
621 | &sub ("esp",16); # place for (u8)(H[]<<4) | ||
622 | |||
623 | &add ("edx","ecx"); # pointer to the end of input | ||
624 | &mov (&DWP(528+16+0,"esp"),"eax"); # save Xi | ||
625 | &mov (&DWP(528+16+8,"esp"),"edx"); # save inp+len | ||
626 | &mov (&DWP(528+16+12,"esp"),"ebp"); # save original %esp | ||
627 | |||
628 | { my @lo = ("mm0","mm1","mm2"); | ||
629 | my @hi = ("mm3","mm4","mm5"); | ||
630 | my @tmp = ("mm6","mm7"); | ||
631 | my ($off1,$off2,$i) = (0,0,); | ||
632 | |||
633 | &add ($Htbl,128); # optimize for size | ||
634 | &lea ("edi",&DWP(16+128,"esp")); | ||
635 | &lea ("ebp",&DWP(16+256+128,"esp")); | ||
636 | |||
637 | # decompose Htable (low and high parts are kept separately), | ||
638 | # generate Htable[]>>4, (u8)(Htable[]<<4), save to stack... | ||
639 | for ($i=0;$i<18;$i++) { | ||
640 | |||
641 | &mov ("edx",&DWP(16*$i+8-128,$Htbl)) if ($i<16); | ||
642 | &movq ($lo[0],&QWP(16*$i+8-128,$Htbl)) if ($i<16); | ||
643 | &psllq ($tmp[1],60) if ($i>1); | ||
644 | &movq ($hi[0],&QWP(16*$i+0-128,$Htbl)) if ($i<16); | ||
645 | &por ($lo[2],$tmp[1]) if ($i>1); | ||
646 | &movq (&QWP($off1-128,"edi"),$lo[1]) if ($i>0 && $i<17); | ||
647 | &psrlq ($lo[1],4) if ($i>0 && $i<17); | ||
648 | &movq (&QWP($off1,"edi"),$hi[1]) if ($i>0 && $i<17); | ||
649 | &movq ($tmp[0],$hi[1]) if ($i>0 && $i<17); | ||
650 | &movq (&QWP($off2-128,"ebp"),$lo[2]) if ($i>1); | ||
651 | &psrlq ($hi[1],4) if ($i>0 && $i<17); | ||
652 | &movq (&QWP($off2,"ebp"),$hi[2]) if ($i>1); | ||
653 | &shl ("edx",4) if ($i<16); | ||
654 | &mov (&BP($i,"esp"),&LB("edx")) if ($i<16); | ||
655 | |||
656 | unshift (@lo,pop(@lo)); # "rotate" registers | ||
657 | unshift (@hi,pop(@hi)); | ||
658 | unshift (@tmp,pop(@tmp)); | ||
659 | $off1 += 8 if ($i>0); | ||
660 | $off2 += 8 if ($i>1); | ||
661 | } | ||
662 | } | ||
663 | |||
664 | &movq ($Zhi,&QWP(0,"eax")); | ||
665 | &mov ("ebx",&DWP(8,"eax")); | ||
666 | &mov ("edx",&DWP(12,"eax")); # load Xi | ||
667 | |||
668 | &set_label("outer",16); | ||
669 | { my $nlo = "eax"; | ||
670 | my $dat = "edx"; | ||
671 | my @nhi = ("edi","ebp"); | ||
672 | my @rem = ("ebx","ecx"); | ||
673 | my @red = ("mm0","mm1","mm2"); | ||
674 | my $tmp = "mm3"; | ||
675 | |||
676 | &xor ($dat,&DWP(12,"ecx")); # merge input data | ||
677 | &xor ("ebx",&DWP(8,"ecx")); | ||
678 | &pxor ($Zhi,&QWP(0,"ecx")); | ||
679 | &lea ("ecx",&DWP(16,"ecx")); # inp+=16 | ||
680 | #&mov (&DWP(528+12,"esp"),$dat); # save inp^Xi | ||
681 | &mov (&DWP(528+8,"esp"),"ebx"); | ||
682 | &movq (&QWP(528+0,"esp"),$Zhi); | ||
683 | &mov (&DWP(528+16+4,"esp"),"ecx"); # save inp | ||
684 | |||
685 | &xor ($nlo,$nlo); | ||
686 | &rol ($dat,8); | ||
687 | &mov (&LB($nlo),&LB($dat)); | ||
688 | &mov ($nhi[1],$nlo); | ||
689 | &and (&LB($nlo),0x0f); | ||
690 | &shr ($nhi[1],4); | ||
691 | &pxor ($red[0],$red[0]); | ||
692 | &rol ($dat,8); # next byte | ||
693 | &pxor ($red[1],$red[1]); | ||
694 | &pxor ($red[2],$red[2]); | ||
695 | |||
696 | # Just like in the "May" version, modulo-schedule the critical path in | ||
697 | # 'Z.hi ^= rem_8bit[Z.lo&0xff^((u8)H[nhi]<<4)]<<48'. The final 'pxor' | ||
698 | # is scheduled so late that rem_8bit[] has to be shifted *right* | ||
699 | # by 16, which is why the last argument to pinsrw is 2, which | ||
700 | # corresponds to <<32=<<48>>16... | ||
701 | for ($j=11,$i=0;$i<15;$i++) { | ||
702 | |||
703 | if ($i>0) { | ||
704 | &pxor ($Zlo,&QWP(16,"esp",$nlo,8)); # Z^=H[nlo] | ||
705 | &rol ($dat,8); # next byte | ||
706 | &pxor ($Zhi,&QWP(16+128,"esp",$nlo,8)); | ||
707 | |||
708 | &pxor ($Zlo,$tmp); | ||
709 | &pxor ($Zhi,&QWP(16+256+128,"esp",$nhi[0],8)); | ||
710 | &xor (&LB($rem[1]),&BP(0,"esp",$nhi[0])); # rem^(H[nhi]<<4) | ||
711 | } else { | ||
712 | &movq ($Zlo,&QWP(16,"esp",$nlo,8)); | ||
713 | &movq ($Zhi,&QWP(16+128,"esp",$nlo,8)); | ||
714 | } | ||
715 | |||
716 | &mov (&LB($nlo),&LB($dat)); | ||
717 | &mov ($dat,&DWP(528+$j,"esp")) if (--$j%4==0 && $j>=0); | ||
718 | |||
719 | &movd ($rem[0],$Zlo); | ||
720 | &movz ($rem[1],&LB($rem[1])) if ($i>0); | ||
721 | &psrlq ($Zlo,8); # Z>>=8 | ||
722 | |||
723 | &movq ($tmp,$Zhi); | ||
724 | &mov ($nhi[0],$nlo); | ||
725 | &psrlq ($Zhi,8); | ||
726 | |||
727 | &pxor ($Zlo,&QWP(16+256+0,"esp",$nhi[1],8)); # Z^=H[nhi]>>4 | ||
728 | &and (&LB($nlo),0x0f); | ||
729 | &psllq ($tmp,56); | ||
730 | |||
731 | &pxor ($Zhi,$red[1]) if ($i>1); | ||
732 | &shr ($nhi[0],4); | ||
733 | &pinsrw ($red[0],&WP(0,$rem_8bit,$rem[1],2),2) if ($i>0); | ||
734 | |||
735 | unshift (@red,pop(@red)); # "rotate" registers | ||
736 | unshift (@rem,pop(@rem)); | ||
737 | unshift (@nhi,pop(@nhi)); | ||
738 | } | ||
739 | |||
740 | &pxor ($Zlo,&QWP(16,"esp",$nlo,8)); # Z^=H[nlo] | ||
741 | &pxor ($Zhi,&QWP(16+128,"esp",$nlo,8)); | ||
742 | &xor (&LB($rem[1]),&BP(0,"esp",$nhi[0])); # rem^(H[nhi]<<4) | ||
743 | |||
744 | &pxor ($Zlo,$tmp); | ||
745 | &pxor ($Zhi,&QWP(16+256+128,"esp",$nhi[0],8)); | ||
746 | &movz ($rem[1],&LB($rem[1])); | ||
747 | |||
748 | &pxor ($red[2],$red[2]); # clear 2nd word | ||
749 | &psllq ($red[1],4); | ||
750 | |||
751 | &movd ($rem[0],$Zlo); | ||
752 | &psrlq ($Zlo,4); # Z>>=4 | ||
753 | |||
754 | &movq ($tmp,$Zhi); | ||
755 | &psrlq ($Zhi,4); | ||
756 | &shl ($rem[0],4); # rem<<4 | ||
757 | |||
758 | &pxor ($Zlo,&QWP(16,"esp",$nhi[1],8)); # Z^=H[nhi] | ||
759 | &psllq ($tmp,60); | ||
760 | &movz ($rem[0],&LB($rem[0])); | ||
761 | |||
762 | &pxor ($Zlo,$tmp); | ||
763 | &pxor ($Zhi,&QWP(16+128,"esp",$nhi[1],8)); | ||
764 | |||
765 | &pinsrw ($red[0],&WP(0,$rem_8bit,$rem[1],2),2); | ||
766 | &pxor ($Zhi,$red[1]); | ||
767 | |||
768 | &movd ($dat,$Zlo); | ||
769 | &pinsrw ($red[2],&WP(0,$rem_8bit,$rem[0],2),3); # last is <<48 | ||
770 | |||
771 | &psllq ($red[0],12); # correct by <<16>>4 | ||
772 | &pxor ($Zhi,$red[0]); | ||
773 | &psrlq ($Zlo,32); | ||
774 | &pxor ($Zhi,$red[2]); | ||
775 | |||
776 | &mov ("ecx",&DWP(528+16+4,"esp")); # restore inp | ||
777 | &movd ("ebx",$Zlo); | ||
778 | &movq ($tmp,$Zhi); # 01234567 | ||
779 | &psllw ($Zhi,8); # 1.3.5.7. | ||
780 | &psrlw ($tmp,8); # .0.2.4.6 | ||
781 | &por ($Zhi,$tmp); # 10325476 | ||
782 | &bswap ($dat); | ||
783 | &pshufw ($Zhi,$Zhi,0b00011011); # 76543210 | ||
784 | &bswap ("ebx"); | ||
785 | |||
786 | &cmp ("ecx",&DWP(528+16+8,"esp")); # are we done? | ||
787 | &jne (&label("outer")); | ||
788 | } | ||
789 | |||
790 | &mov ("eax",&DWP(528+16+0,"esp")); # restore Xi | ||
791 | &mov (&DWP(12,"eax"),"edx"); | ||
792 | &mov (&DWP(8,"eax"),"ebx"); | ||
793 | &movq (&QWP(0,"eax"),$Zhi); | ||
794 | |||
795 | &mov ("esp",&DWP(528+16+12,"esp")); # restore original %esp | ||
796 | &emms (); | ||
797 | } | ||
798 | &function_end("gcm_ghash_4bit_mmx"); | ||
799 | }} | ||
800 | |||
801 | if ($sse2) {{ | ||
802 | ###################################################################### | ||
803 | # PCLMULQDQ version. | ||
804 | |||
805 | $Xip="eax"; | ||
806 | $Htbl="edx"; | ||
807 | $const="ecx"; | ||
808 | $inp="esi"; | ||
809 | $len="ebx"; | ||
810 | |||
811 | ($Xi,$Xhi)=("xmm0","xmm1"); $Hkey="xmm2"; | ||
812 | ($T1,$T2,$T3)=("xmm3","xmm4","xmm5"); | ||
813 | ($Xn,$Xhn)=("xmm6","xmm7"); | ||
814 | |||
815 | &static_label("bswap"); | ||
816 | |||
817 | sub clmul64x64_T2 { # minimal "register" pressure | ||
818 | my ($Xhi,$Xi,$Hkey)=@_; | ||
819 | |||
820 | &movdqa ($Xhi,$Xi); # | ||
821 | &pshufd ($T1,$Xi,0b01001110); | ||
822 | &pshufd ($T2,$Hkey,0b01001110); | ||
823 | &pxor ($T1,$Xi); # | ||
824 | &pxor ($T2,$Hkey); | ||
825 | |||
826 | &pclmulqdq ($Xi,$Hkey,0x00); ####### | ||
827 | &pclmulqdq ($Xhi,$Hkey,0x11); ####### | ||
828 | &pclmulqdq ($T1,$T2,0x00); ####### | ||
829 | &xorps ($T1,$Xi); # | ||
830 | &xorps ($T1,$Xhi); # | ||
831 | |||
832 | &movdqa ($T2,$T1); # | ||
833 | &psrldq ($T1,8); | ||
834 | &pslldq ($T2,8); # | ||
835 | &pxor ($Xhi,$T1); | ||
836 | &pxor ($Xi,$T2); # | ||
837 | } | ||
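
# Illustrative sketch, not part of the original module: the three pclmulqdq
# above implement Karatsuba, i.e. the middle term of the 128x128-bit
# carry-less product is derived as (Xi.lo^Xi.hi)*(H.lo^H.hi) ^ lo ^ hi.
# Below is the same identity in plain Perl on 8-bit limbs; clmul8() and
# karatsuba16() are hypothetical helpers, never called by the generator.
sub clmul8 {					# carry-less 8x8-bit multiply
	my ($a,$b) = @_;
	my $r = 0;
	for my $i (0..7) { $r ^= $b<<$i if (($a>>$i)&1); }
	return $r;
}
sub karatsuba16 {				# carry-less 16x16-bit multiply
	my ($a,$b) = @_;
	my ($a0,$a1,$b0,$b1) = ($a&0xff,$a>>8,$b&0xff,$b>>8);
	my $lo  = clmul8($a0,$b0);
	my $hi  = clmul8($a1,$b1);
	my $mid = clmul8($a0^$a1,$b0^$b1) ^ $lo ^ $hi;
	return ($hi<<16) ^ ($mid<<8) ^ $lo;
}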
838 | |||
839 | sub clmul64x64_T3 { | ||
840 | # Even though this subroutine offers visually better ILP, it | ||
841 | # was empirically found to be a tad slower than the version above, | ||
842 | # at least in the gcm_ghash_clmul context. But that's just as well, | ||
843 | # because loop modulo-scheduling is possible only thanks to the | ||
844 | # minimized "register" pressure... | ||
845 | my ($Xhi,$Xi,$Hkey)=@_; | ||
846 | |||
847 | &movdqa ($T1,$Xi); # | ||
848 | &movdqa ($Xhi,$Xi); | ||
849 | &pclmulqdq ($Xi,$Hkey,0x00); ####### | ||
850 | &pclmulqdq ($Xhi,$Hkey,0x11); ####### | ||
851 | &pshufd ($T2,$T1,0b01001110); # | ||
852 | &pshufd ($T3,$Hkey,0b01001110); | ||
853 | &pxor ($T2,$T1); # | ||
854 | &pxor ($T3,$Hkey); | ||
855 | &pclmulqdq ($T2,$T3,0x00); ####### | ||
856 | &pxor ($T2,$Xi); # | ||
857 | &pxor ($T2,$Xhi); # | ||
858 | |||
859 | &movdqa ($T3,$T2); # | ||
860 | &psrldq ($T2,8); | ||
861 | &pslldq ($T3,8); # | ||
862 | &pxor ($Xhi,$T2); | ||
863 | &pxor ($Xi,$T3); # | ||
864 | } | ||
865 | |||
866 | if (1) { # Algorithm 9 with <<1 twist. | ||
867 | # Reduction is shorter and uses only two | ||
868 | # temporary registers, which makes it a better | ||
869 | # candidate for interleaving with the 64x64 | ||
870 | # multiplication. The pre-modulo-scheduled loop | ||
871 | # was found to be ~20% faster than Algorithm 5 | ||
872 | # below. Algorithm 9 was therefore chosen for | ||
873 | # further optimization... | ||
874 | |||
875 | sub reduction_alg9 { # 17/13 times faster than Intel version | ||
876 | my ($Xhi,$Xi) = @_; | ||
877 | |||
878 | # 1st phase | ||
879 | &movdqa ($T1,$Xi); # | ||
880 | &psllq ($Xi,1); | ||
881 | &pxor ($Xi,$T1); # | ||
882 | &psllq ($Xi,5); # | ||
883 | &pxor ($Xi,$T1); # | ||
884 | &psllq ($Xi,57); # | ||
885 | &movdqa ($T2,$Xi); # | ||
886 | &pslldq ($Xi,8); | ||
887 | &psrldq ($T2,8); # | ||
888 | &pxor ($Xi,$T1); | ||
889 | &pxor ($Xhi,$T2); # | ||
890 | |||
891 | # 2nd phase | ||
892 | &movdqa ($T2,$Xi); | ||
893 | &psrlq ($Xi,5); | ||
894 | &pxor ($Xi,$T2); # | ||
895 | &psrlq ($Xi,1); # | ||
896 | &pxor ($Xi,$T2); # | ||
897 | &pxor ($T2,$Xhi); | ||
898 | &psrlq ($Xi,1); # | ||
899 | &pxor ($Xi,$T2); # | ||
900 | } | ||
901 | |||
902 | &function_begin_B("gcm_init_clmul"); | ||
903 | &mov ($Htbl,&wparam(0)); | ||
904 | &mov ($Xip,&wparam(1)); | ||
905 | |||
906 | &picsetup($const); | ||
907 | &picsymbol($const, &label("bswap"), $const); | ||
908 | |||
909 | &movdqu ($Hkey,&QWP(0,$Xip)); | ||
910 | &pshufd ($Hkey,$Hkey,0b01001110);# dword swap | ||
911 | |||
912 | # <<1 twist | ||
913 | &pshufd ($T2,$Hkey,0b11111111); # broadcast uppermost dword | ||
914 | &movdqa ($T1,$Hkey); | ||
915 | &psllq ($Hkey,1); | ||
916 | &pxor ($T3,$T3); # | ||
917 | &psrlq ($T1,63); | ||
918 | &pcmpgtd ($T3,$T2); # broadcast carry bit | ||
919 | &pslldq ($T1,8); | ||
920 | &por ($Hkey,$T1); # H<<=1 | ||
921 | |||
922 | # magic reduction | ||
923 | &pand ($T3,&QWP(16,$const)); # 0x1c2_polynomial | ||
924 | &pxor ($Hkey,$T3); # if(carry) H^=0x1c2_polynomial | ||
925 | |||
926 | # calculate H^2 | ||
927 | &movdqa ($Xi,$Hkey); | ||
928 | &clmul64x64_T2 ($Xhi,$Xi,$Hkey); | ||
929 | &reduction_alg9 ($Xhi,$Xi); | ||
930 | |||
931 | &movdqu (&QWP(0,$Htbl),$Hkey); # save H | ||
932 | &movdqu (&QWP(16,$Htbl),$Xi); # save H^2 | ||
933 | |||
934 | &ret (); | ||
935 | &function_end_B("gcm_init_clmul"); | ||
936 | |||
937 | &function_begin_B("gcm_gmult_clmul"); | ||
938 | &mov ($Xip,&wparam(0)); | ||
939 | &mov ($Htbl,&wparam(1)); | ||
940 | |||
941 | &picsetup($const); | ||
942 | &picsymbol($const, &label("bswap"), $const); | ||
943 | |||
944 | &movdqu ($Xi,&QWP(0,$Xip)); | ||
945 | &movdqa ($T3,&QWP(0,$const)); | ||
946 | &movups ($Hkey,&QWP(0,$Htbl)); | ||
947 | &pshufb ($Xi,$T3); | ||
948 | |||
949 | &clmul64x64_T2 ($Xhi,$Xi,$Hkey); | ||
950 | &reduction_alg9 ($Xhi,$Xi); | ||
951 | |||
952 | &pshufb ($Xi,$T3); | ||
953 | &movdqu (&QWP(0,$Xip),$Xi); | ||
954 | |||
955 | &ret (); | ||
956 | &function_end_B("gcm_gmult_clmul"); | ||
957 | |||
958 | &function_begin("gcm_ghash_clmul"); | ||
959 | &mov ($Xip,&wparam(0)); | ||
960 | &mov ($Htbl,&wparam(1)); | ||
961 | &mov ($inp,&wparam(2)); | ||
962 | &mov ($len,&wparam(3)); | ||
963 | |||
964 | &picsetup($const); | ||
965 | &picsymbol($const, &label("bswap"), $const); | ||
966 | |||
967 | &movdqu ($Xi,&QWP(0,$Xip)); | ||
968 | &movdqa ($T3,&QWP(0,$const)); | ||
969 | &movdqu ($Hkey,&QWP(0,$Htbl)); | ||
970 | &pshufb ($Xi,$T3); | ||
971 | |||
972 | &sub ($len,0x10); | ||
973 | &jz (&label("odd_tail")); | ||
974 | |||
975 | ####### | ||
976 | # Xi+2 =[H*(Ii+1 + Xi+1)] mod P = | ||
977 | # [(H*Ii+1) + (H*Xi+1)] mod P = | ||
978 | # [(H*Ii+1) + H^2*(Ii+Xi)] mod P | ||
979 | # | ||
980 | &movdqu ($T1,&QWP(0,$inp)); # Ii | ||
981 | &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 | ||
982 | &pshufb ($T1,$T3); | ||
983 | &pshufb ($Xn,$T3); | ||
984 | &pxor ($Xi,$T1); # Ii+Xi | ||
985 | |||
986 | &clmul64x64_T2 ($Xhn,$Xn,$Hkey); # H*Ii+1 | ||
987 | &movups ($Hkey,&QWP(16,$Htbl)); # load H^2 | ||
988 | |||
989 | &lea ($inp,&DWP(32,$inp)); # i+=2 | ||
990 | &sub ($len,0x20); | ||
991 | &jbe (&label("even_tail")); | ||
992 | |||
993 | &set_label("mod_loop"); | ||
994 | &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi) | ||
995 | &movdqu ($T1,&QWP(0,$inp)); # Ii | ||
996 | &movups ($Hkey,&QWP(0,$Htbl)); # load H | ||
997 | |||
998 | &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) | ||
999 | &pxor ($Xhi,$Xhn); | ||
1000 | |||
1001 | &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 | ||
1002 | &pshufb ($T1,$T3); | ||
1003 | &pshufb ($Xn,$T3); | ||
1004 | |||
1005 | &movdqa ($T3,$Xn); #&clmul64x64_TX ($Xhn,$Xn,$Hkey); H*Ii+1 | ||
1006 | &movdqa ($Xhn,$Xn); | ||
1007 | &pxor ($Xhi,$T1); # "Ii+Xi", consume early | ||
1008 | |||
1009 | &movdqa ($T1,$Xi); #&reduction_alg9($Xhi,$Xi); 1st phase | ||
1010 | &psllq ($Xi,1); | ||
1011 | &pxor ($Xi,$T1); # | ||
1012 | &psllq ($Xi,5); # | ||
1013 | &pxor ($Xi,$T1); # | ||
1014 | &pclmulqdq ($Xn,$Hkey,0x00); ####### | ||
1015 | &psllq ($Xi,57); # | ||
1016 | &movdqa ($T2,$Xi); # | ||
1017 | &pslldq ($Xi,8); | ||
1018 | &psrldq ($T2,8); # | ||
1019 | &pxor ($Xi,$T1); | ||
1020 | &pshufd ($T1,$T3,0b01001110); | ||
1021 | &pxor ($Xhi,$T2); # | ||
1022 | &pxor ($T1,$T3); | ||
1023 | &pshufd ($T3,$Hkey,0b01001110); | ||
1024 | &pxor ($T3,$Hkey); # | ||
1025 | |||
1026 | &pclmulqdq ($Xhn,$Hkey,0x11); ####### | ||
1027 | &movdqa ($T2,$Xi); # 2nd phase | ||
1028 | &psrlq ($Xi,5); | ||
1029 | &pxor ($Xi,$T2); # | ||
1030 | &psrlq ($Xi,1); # | ||
1031 | &pxor ($Xi,$T2); # | ||
1032 | &pxor ($T2,$Xhi); | ||
1033 | &psrlq ($Xi,1); # | ||
1034 | &pxor ($Xi,$T2); # | ||
1035 | |||
1036 | &pclmulqdq ($T1,$T3,0x00); ####### | ||
1037 | &movups ($Hkey,&QWP(16,$Htbl)); # load H^2 | ||
1038 | &xorps ($T1,$Xn); # | ||
1039 | &xorps ($T1,$Xhn); # | ||
1040 | |||
1041 | &movdqa ($T3,$T1); # | ||
1042 | &psrldq ($T1,8); | ||
1043 | &pslldq ($T3,8); # | ||
1044 | &pxor ($Xhn,$T1); | ||
1045 | &pxor ($Xn,$T3); # | ||
1046 | &movdqa ($T3,&QWP(0,$const)); | ||
1047 | |||
1048 | &lea ($inp,&DWP(32,$inp)); | ||
1049 | &sub ($len,0x20); | ||
1050 | &ja (&label("mod_loop")); | ||
1051 | |||
1052 | &set_label("even_tail"); | ||
1053 | &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi) | ||
1054 | |||
1055 | &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) | ||
1056 | &pxor ($Xhi,$Xhn); | ||
1057 | |||
1058 | &reduction_alg9 ($Xhi,$Xi); | ||
1059 | |||
1060 | &test ($len,$len); | ||
1061 | &jnz (&label("done")); | ||
1062 | |||
1063 | &movups ($Hkey,&QWP(0,$Htbl)); # load H | ||
1064 | &set_label("odd_tail"); | ||
1065 | &movdqu ($T1,&QWP(0,$inp)); # Ii | ||
1066 | &pshufb ($T1,$T3); | ||
1067 | &pxor ($Xi,$T1); # Ii+Xi | ||
1068 | |||
1069 | &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi) | ||
1070 | &reduction_alg9 ($Xhi,$Xi); | ||
1071 | |||
1072 | &set_label("done"); | ||
1073 | &pshufb ($Xi,$T3); | ||
1074 | &movdqu (&QWP(0,$Xip),$Xi); | ||
1075 | &function_end("gcm_ghash_clmul"); | ||
1076 | |||
1077 | } else { # Algorithm 5. Kept for reference purposes. | ||
1078 | |||
1079 | sub reduction_alg5 { # 19/16 times faster than Intel version | ||
1080 | my ($Xhi,$Xi)=@_; | ||
1081 | |||
1082 | # <<1 | ||
1083 | &movdqa ($T1,$Xi); # | ||
1084 | &movdqa ($T2,$Xhi); | ||
1085 | &pslld ($Xi,1); | ||
1086 | &pslld ($Xhi,1); # | ||
1087 | &psrld ($T1,31); | ||
1088 | &psrld ($T2,31); # | ||
1089 | &movdqa ($T3,$T1); | ||
1090 | &pslldq ($T1,4); | ||
1091 | &psrldq ($T3,12); # | ||
1092 | &pslldq ($T2,4); | ||
1093 | &por ($Xhi,$T3); # | ||
1094 | &por ($Xi,$T1); | ||
1095 | &por ($Xhi,$T2); # | ||
1096 | |||
1097 | # 1st phase | ||
1098 | &movdqa ($T1,$Xi); | ||
1099 | &movdqa ($T2,$Xi); | ||
1100 | &movdqa ($T3,$Xi); # | ||
1101 | &pslld ($T1,31); | ||
1102 | &pslld ($T2,30); | ||
1103 | &pslld ($Xi,25); # | ||
1104 | &pxor ($T1,$T2); | ||
1105 | &pxor ($T1,$Xi); # | ||
1106 | &movdqa ($T2,$T1); # | ||
1107 | &pslldq ($T1,12); | ||
1108 | &psrldq ($T2,4); # | ||
1109 | &pxor ($T3,$T1); | ||
1110 | |||
1111 | # 2nd phase | ||
1112 | &pxor ($Xhi,$T3); # | ||
1113 | &movdqa ($Xi,$T3); | ||
1114 | &movdqa ($T1,$T3); | ||
1115 | &psrld ($Xi,1); # | ||
1116 | &psrld ($T1,2); | ||
1117 | &psrld ($T3,7); # | ||
1118 | &pxor ($Xi,$T1); | ||
1119 | &pxor ($Xhi,$T2); | ||
1120 | &pxor ($Xi,$T3); # | ||
1121 | &pxor ($Xi,$Xhi); # | ||
1122 | } | ||
1123 | |||
1124 | &function_begin_B("gcm_init_clmul"); | ||
1125 | &mov ($Htbl,&wparam(0)); | ||
1126 | &mov ($Xip,&wparam(1)); | ||
1127 | |||
1128 | &picsetup($const); | ||
1129 | &picsymbol($const, &label("bswap"), $const); | ||
1130 | |||
1131 | &movdqu ($Hkey,&QWP(0,$Xip)); | ||
1132 | &pshufd ($Hkey,$Hkey,0b01001110);# dword swap | ||
1133 | |||
1134 | # calculate H^2 | ||
1135 | &movdqa ($Xi,$Hkey); | ||
1136 | &clmul64x64_T3 ($Xhi,$Xi,$Hkey); | ||
1137 | &reduction_alg5 ($Xhi,$Xi); | ||
1138 | |||
1139 | &movdqu (&QWP(0,$Htbl),$Hkey); # save H | ||
1140 | &movdqu (&QWP(16,$Htbl),$Xi); # save H^2 | ||
1141 | |||
1142 | &ret (); | ||
1143 | &function_end_B("gcm_init_clmul"); | ||
1144 | |||
1145 | &function_begin_B("gcm_gmult_clmul"); | ||
1146 | &mov ($Xip,&wparam(0)); | ||
1147 | &mov ($Htbl,&wparam(1)); | ||
1148 | |||
1149 | &picsetup($const); | ||
1150 | &picsymbol($const, &label("bswap"), $const); | ||
1151 | |||
1152 | &movdqu ($Xi,&QWP(0,$Xip)); | ||
1153 | &movdqa ($Xn,&QWP(0,$const)); | ||
1154 | &movdqu ($Hkey,&QWP(0,$Htbl)); | ||
1155 | &pshufb ($Xi,$Xn); | ||
1156 | |||
1157 | &clmul64x64_T3 ($Xhi,$Xi,$Hkey); | ||
1158 | &reduction_alg5 ($Xhi,$Xi); | ||
1159 | |||
1160 | &pshufb ($Xi,$Xn); | ||
1161 | &movdqu (&QWP(0,$Xip),$Xi); | ||
1162 | |||
1163 | &ret (); | ||
1164 | &function_end_B("gcm_gmult_clmul"); | ||
1165 | |||
1166 | &function_begin("gcm_ghash_clmul"); | ||
1167 | &mov ($Xip,&wparam(0)); | ||
1168 | &mov ($Htbl,&wparam(1)); | ||
1169 | &mov ($inp,&wparam(2)); | ||
1170 | &mov ($len,&wparam(3)); | ||
1171 | |||
1172 | &picsetup($const); | ||
1173 | &picsymbol($const, &label("bswap"), $const); | ||
1174 | |||
1175 | &movdqu ($Xi,&QWP(0,$Xip)); | ||
1176 | &movdqa ($T3,&QWP(0,$const)); | ||
1177 | &movdqu ($Hkey,&QWP(0,$Htbl)); | ||
1178 | &pshufb ($Xi,$T3); | ||
1179 | |||
1180 | &sub ($len,0x10); | ||
1181 | &jz (&label("odd_tail")); | ||
1182 | |||
1183 | ####### | ||
1184 | # Xi+2 =[H*(Ii+1 + Xi+1)] mod P = | ||
1185 | # [(H*Ii+1) + (H*Xi+1)] mod P = | ||
1186 | # [(H*Ii+1) + H^2*(Ii+Xi)] mod P | ||
1187 | # | ||
1188 | &movdqu ($T1,&QWP(0,$inp)); # Ii | ||
1189 | &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 | ||
1190 | &pshufb ($T1,$T3); | ||
1191 | &pshufb ($Xn,$T3); | ||
1192 | &pxor ($Xi,$T1); # Ii+Xi | ||
1193 | |||
1194 | &clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1 | ||
1195 | &movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2 | ||
1196 | |||
1197 | &sub ($len,0x20); | ||
1198 | &lea ($inp,&DWP(32,$inp)); # i+=2 | ||
1199 | &jbe (&label("even_tail")); | ||
1200 | |||
1201 | &set_label("mod_loop"); | ||
1202 | &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi) | ||
1203 | &movdqu ($Hkey,&QWP(0,$Htbl)); # load H | ||
1204 | |||
1205 | &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) | ||
1206 | &pxor ($Xhi,$Xhn); | ||
1207 | |||
1208 | &reduction_alg5 ($Xhi,$Xi); | ||
1209 | |||
1210 | ####### | ||
1211 | &movdqa ($T3,&QWP(0,$const)); | ||
1212 | &movdqu ($T1,&QWP(0,$inp)); # Ii | ||
1213 | &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 | ||
1214 | &pshufb ($T1,$T3); | ||
1215 | &pshufb ($Xn,$T3); | ||
1216 | &pxor ($Xi,$T1); # Ii+Xi | ||
1217 | |||
1218 | &clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1 | ||
1219 | &movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2 | ||
1220 | |||
1221 | &sub ($len,0x20); | ||
1222 | &lea ($inp,&DWP(32,$inp)); | ||
1223 | &ja (&label("mod_loop")); | ||
1224 | |||
1225 | &set_label("even_tail"); | ||
1226 | &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi) | ||
1227 | |||
1228 | &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) | ||
1229 | &pxor ($Xhi,$Xhn); | ||
1230 | |||
1231 | &reduction_alg5 ($Xhi,$Xi); | ||
1232 | |||
1233 | &movdqa ($T3,&QWP(0,$const)); | ||
1234 | &test ($len,$len); | ||
1235 | &jnz (&label("done")); | ||
1236 | |||
1237 | &movdqu ($Hkey,&QWP(0,$Htbl)); # load H | ||
1238 | &set_label("odd_tail"); | ||
1239 | &movdqu ($T1,&QWP(0,$inp)); # Ii | ||
1240 | &pshufb ($T1,$T3); | ||
1241 | &pxor ($Xi,$T1); # Ii+Xi | ||
1242 | |||
1243 | &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi) | ||
1244 | &reduction_alg5 ($Xhi,$Xi); | ||
1245 | |||
1246 | &movdqa ($T3,&QWP(0,$const)); | ||
1247 | &set_label("done"); | ||
1248 | &pshufb ($Xi,$T3); | ||
1249 | &movdqu (&QWP(0,$Xip),$Xi); | ||
1250 | &function_end("gcm_ghash_clmul"); | ||
1251 | |||
1252 | } | ||
1253 | |||
1254 | &rodataseg(); | ||
1255 | &set_label("bswap",64); | ||
1256 | &data_byte(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); | ||
1257 | &data_byte(1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2); # 0x1c2_polynomial | ||
1258 | &previous(); | ||
1259 | }} # $sse2 | ||
1260 | |||
1261 | &rodataseg(); | ||
1262 | &set_label("rem_4bit",64); | ||
1263 | &data_word(0,0x0000<<$S,0,0x1C20<<$S,0,0x3840<<$S,0,0x2460<<$S); | ||
1264 | &data_word(0,0x7080<<$S,0,0x6CA0<<$S,0,0x48C0<<$S,0,0x54E0<<$S); | ||
1265 | &data_word(0,0xE100<<$S,0,0xFD20<<$S,0,0xD940<<$S,0,0xC560<<$S); | ||
1266 | &data_word(0,0x9180<<$S,0,0x8DA0<<$S,0,0xA9C0<<$S,0,0xB5E0<<$S); | ||
1267 | &set_label("rem_8bit",64); | ||
1268 | &data_short(0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E); | ||
1269 | &data_short(0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E); | ||
1270 | &data_short(0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E); | ||
1271 | &data_short(0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E); | ||
1272 | &data_short(0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E); | ||
1273 | &data_short(0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E); | ||
1274 | &data_short(0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E); | ||
1275 | &data_short(0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E); | ||
1276 | &data_short(0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE); | ||
1277 | &data_short(0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE); | ||
1278 | &data_short(0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE); | ||
1279 | &data_short(0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE); | ||
1280 | &data_short(0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E); | ||
1281 | &data_short(0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E); | ||
1282 | &data_short(0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE); | ||
1283 | &data_short(0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE); | ||
1284 | &data_short(0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E); | ||
1285 | &data_short(0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E); | ||
1286 | &data_short(0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E); | ||
1287 | &data_short(0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E); | ||
1288 | &data_short(0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E); | ||
1289 | &data_short(0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E); | ||
1290 | &data_short(0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E); | ||
1291 | &data_short(0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E); | ||
1292 | &data_short(0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE); | ||
1293 | &data_short(0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE); | ||
1294 | &data_short(0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE); | ||
1295 | &data_short(0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE); | ||
1296 | &data_short(0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E); | ||
1297 | &data_short(0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E); | ||
1298 | &data_short(0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE); | ||
1299 | &data_short(0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE); | ||
1300 | &previous(); | ||
1301 | }}} # !$x86only | ||
1302 | |||
1303 | &asm_finish(); | ||
1304 | |||
1305 | # A question was raised about the choice of vanilla MMX. Or rather, why | ||
1306 | # wasn't SSE2 chosen instead? In addition to the fact that MMX runs on | ||
1307 | # legacy CPUs such as PIII, the "4-bit" MMX version was observed to provide | ||
1308 | # better performance than the *corresponding* SSE2 one even on contemporary | ||
1309 | # CPUs. SSE2 results were provided by Peter-Michael Hager. He maintains an | ||
1310 | # SSE2 implementation featuring the full range of lookup-table sizes, but | ||
1311 | # with per-invocation lookup table setup. The latter means that the table | ||
1312 | # size is chosen depending on how much data is to be hashed in every given | ||
1313 | # call: more data, larger table. The best reported result for Core2 is ~4 | ||
1314 | # cycles per processed byte out of a 64KB block. This number even accounts | ||
1315 | # for the 64KB table setup overhead. As discussed in gcm128.c, we choose to | ||
1316 | # be more conservative with respect to lookup table sizes, but how do the | ||
1317 | # results compare? The minimalistic "256B" MMX version delivers ~11 cycles | ||
1318 | # on the same platform. As also discussed in gcm128.c, the next-in-line | ||
1319 | # "8-bit Shoup's" or "4KB" method should deliver twice the performance of | ||
1320 | # the "256B" one, in other words no worse than ~6 cycles per byte. It | ||
1321 | # should also be noted that in the SSE2 case the improvement can be "super- | ||
1322 | # linear," i.e. more than twofold, mostly because >>8 maps to a single | ||
1323 | # instruction on an SSE2 register. This is unlike the "4-bit" case, where | ||
1324 | # >>4 maps to the same number of instructions in both the MMX and SSE2 | ||
1325 | # cases. The bottom line is that a switch to SSE2 is considered justifiable | ||
1326 | # only if we choose to implement the "8-bit" method... | ||