summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorjsing <>2024-12-04 13:14:45 +0000
committerjsing <>2024-12-04 13:14:45 +0000
commita4d744c17d44ef3b5e9e9dc14503acbd25bd3c54 (patch)
tree903deb798dec7bf63dac60782d87a8daa0d0126e /src
parent1c3ce6cc8e538cecc33ed58f89d969af28952dea (diff)
downloadopenbsd-a4d744c17d44ef3b5e9e9dc14503acbd25bd3c54.tar.gz
openbsd-a4d744c17d44ef3b5e9e9dc14503acbd25bd3c54.tar.bz2
openbsd-a4d744c17d44ef3b5e9e9dc14503acbd25bd3c54.zip
Another now unused perlasm script can bite the dust.
Diffstat (limited to 'src')
-rwxr-xr-xsrc/lib/libcrypto/sha/asm/sha1-x86_64.pl1267
1 file changed, 0 insertions, 1267 deletions
diff --git a/src/lib/libcrypto/sha/asm/sha1-x86_64.pl b/src/lib/libcrypto/sha/asm/sha1-x86_64.pl
deleted file mode 100755
index e080251df4..0000000000
--- a/src/lib/libcrypto/sha/asm/sha1-x86_64.pl
+++ /dev/null
@@ -1,1267 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# sha1_block procedure for x86_64.
11#
12# It was brought to my attention that on EM64T compiler-generated code
13# was far behind 32-bit assembler implementation. This is unlike on
14# Opteron where compiler-generated code was only 15% behind 32-bit
15# assembler, which originally made it hard to motivate the effort.
16# There was suggestion to mechanically translate 32-bit code, but I
17# dismissed it, reasoning that x86_64 offers enough register bank
18# capacity to fully utilize SHA-1 parallelism. Therefore this fresh
19# implementation:-) However! While 64-bit code does perform better
20# on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
21# x86_64 does offer larger *addressable* bank, but out-of-order core
22# reaches for even more registers through dynamic aliasing, and EM64T
23# core must have managed to run-time optimize even 32-bit code just as
24# well as 64-bit one. Performance improvement is summarized in the
25# following table:
26#
27# gcc 3.4 32-bit asm cycles/byte
28# Opteron +45% +20% 6.8
29# Xeon P4 +65% +0% 9.9
30# Core2 +60% +10% 7.0
31
32# August 2009.
33#
34# The code was revised to minimize code size and to maximize
35# "distance" between instructions producing input to 'lea'
36# instruction and the 'lea' instruction itself, which is essential
37# for Intel Atom core.
38
39# October 2010.
40#
41# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
42# is to offload message schedule denoted by Wt in NIST specification,
43# or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module
44# for background and implementation details. The only difference from
45# 32-bit code is that 64-bit code doesn't have to spill @X[] elements
46# to free temporary registers.
47
48# April 2011.
49#
50# Add AVX code path. See sha1-586.pl for further information.
51
52######################################################################
53# Current performance is summarized in following table. Numbers are
54# CPU clock cycles spent to process single byte (less is better).
55#
56# x86_64 SSSE3 AVX
57# P4 9.8 -
58# Opteron 6.6 -
59# Core2 6.7 6.1/+10% -
60# Atom 11.0 9.7/+13% -
61# Westmere 7.1 5.6/+27% -
62# Sandy Bridge 7.9 6.3/+25% 5.2/+51%
63
64$flavour = shift;
65$output = shift;
66if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
67
68$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
69
70$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
71( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
72( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
73die "can't locate x86_64-xlate.pl";
74
75$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
76 =~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
77 $1>=2.19);
78$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
79 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
80 $1>=2.09);
81$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
82 `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
83 $1>=10);
84
85open OUT,"| \"$^X\" $xlate $flavour $output";
86*STDOUT=*OUT;
87
88$ctx="%rdi"; # 1st arg
89$inp="%rsi"; # 2nd arg
90$num="%rdx"; # 3rd arg
91
92# reassign arguments in order to produce more compact code
93$ctx="%r8";
94$inp="%r9";
95$num="%r10";
96
97$t0="%eax";
98$t1="%ebx";
99$t2="%ecx";
100@xi=("%edx","%ebp");
101$A="%esi";
102$B="%edi";
103$C="%r11d";
104$D="%r12d";
105$E="%r13d";
106
107@V=($A,$B,$C,$D,$E);
108
109sub BODY_00_19 {
110my ($i,$a,$b,$c,$d,$e)=@_;
111my $j=$i+1;
112$code.=<<___ if ($i==0);
113 mov `4*$i`($inp),$xi[0]
114 bswap $xi[0]
115 mov $xi[0],`4*$i`(%rsp)
116___
117$code.=<<___ if ($i<15);
118 mov $c,$t0
119 mov `4*$j`($inp),$xi[1]
120 mov $a,$t2
121 xor $d,$t0
122 bswap $xi[1]
123 rol \$5,$t2
124 lea 0x5a827999($xi[0],$e),$e
125 and $b,$t0
126 mov $xi[1],`4*$j`(%rsp)
127 add $t2,$e
128 xor $d,$t0
129 rol \$30,$b
130 add $t0,$e
131___
132$code.=<<___ if ($i>=15);
133 mov `4*($j%16)`(%rsp),$xi[1]
134 mov $c,$t0
135 mov $a,$t2
136 xor `4*(($j+2)%16)`(%rsp),$xi[1]
137 xor $d,$t0
138 rol \$5,$t2
139 xor `4*(($j+8)%16)`(%rsp),$xi[1]
140 and $b,$t0
141 lea 0x5a827999($xi[0],$e),$e
142 xor `4*(($j+13)%16)`(%rsp),$xi[1]
143 xor $d,$t0
144 rol \$1,$xi[1]
145 add $t2,$e
146 rol \$30,$b
147 mov $xi[1],`4*($j%16)`(%rsp)
148 add $t0,$e
149___
150unshift(@xi,pop(@xi));
151}
152
153sub BODY_20_39 {
154my ($i,$a,$b,$c,$d,$e)=@_;
155my $j=$i+1;
156my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
157$code.=<<___ if ($i<79);
158 mov `4*($j%16)`(%rsp),$xi[1]
159 mov $c,$t0
160 mov $a,$t2
161 xor `4*(($j+2)%16)`(%rsp),$xi[1]
162 xor $b,$t0
163 rol \$5,$t2
164 lea $K($xi[0],$e),$e
165 xor `4*(($j+8)%16)`(%rsp),$xi[1]
166 xor $d,$t0
167 add $t2,$e
168 xor `4*(($j+13)%16)`(%rsp),$xi[1]
169 rol \$30,$b
170 add $t0,$e
171 rol \$1,$xi[1]
172___
173$code.=<<___ if ($i<76);
174 mov $xi[1],`4*($j%16)`(%rsp)
175___
176$code.=<<___ if ($i==79);
177 mov $c,$t0
178 mov $a,$t2
179 xor $b,$t0
180 lea $K($xi[0],$e),$e
181 rol \$5,$t2
182 xor $d,$t0
183 add $t2,$e
184 rol \$30,$b
185 add $t0,$e
186___
187unshift(@xi,pop(@xi));
188}
189
190sub BODY_40_59 {
191my ($i,$a,$b,$c,$d,$e)=@_;
192my $j=$i+1;
193$code.=<<___;
194 mov `4*($j%16)`(%rsp),$xi[1]
195 mov $c,$t0
196 mov $c,$t1
197 xor `4*(($j+2)%16)`(%rsp),$xi[1]
198 and $d,$t0
199 mov $a,$t2
200 xor `4*(($j+8)%16)`(%rsp),$xi[1]
201 xor $d,$t1
202 lea 0x8f1bbcdc($xi[0],$e),$e
203 rol \$5,$t2
204 xor `4*(($j+13)%16)`(%rsp),$xi[1]
205 add $t0,$e
206 and $b,$t1
207 rol \$1,$xi[1]
208 add $t1,$e
209 rol \$30,$b
210 mov $xi[1],`4*($j%16)`(%rsp)
211 add $t2,$e
212___
213unshift(@xi,pop(@xi));
214}
215
216$code.=<<___;
217.text
218.extern OPENSSL_ia32cap_P
219.hidden OPENSSL_ia32cap_P
220
221.globl sha1_block_data_order
222.type sha1_block_data_order,\@function,3
223.align 16
224sha1_block_data_order:
225 _CET_ENDBR
226 mov OPENSSL_ia32cap_P+0(%rip),%r9d
227 mov OPENSSL_ia32cap_P+4(%rip),%r8d
228 test \$IA32CAP_MASK1_SSSE3,%r8d # check SSSE3 bit
229 jz .Lialu
230___
231$code.=<<___ if ($avx);
232 and \$IA32CAP_MASK1_AVX,%r8d # mask AVX bit
233 and \$IA32CAP_MASK0_INTEL,%r9d # mask "Intel CPU" bit
234 or %r9d,%r8d
235 cmp \$(IA32CAP_MASK0_INTEL | IA32CAP_MASK1_AVX),%r8d
236 je _avx_shortcut
237___
238$code.=<<___;
239 jmp _ssse3_shortcut
240
241.align 16
242.Lialu:
243 push %rbx
244 push %rbp
245 push %r12
246 push %r13
247 mov %rsp,%r11
248 mov %rdi,$ctx # reassigned argument
249 sub \$`8+16*4`,%rsp
250 mov %rsi,$inp # reassigned argument
251 and \$-64,%rsp
252 mov %rdx,$num # reassigned argument
253 mov %r11,`16*4`(%rsp)
254.Lprologue:
255
256 mov 0($ctx),$A
257 mov 4($ctx),$B
258 mov 8($ctx),$C
259 mov 12($ctx),$D
260 mov 16($ctx),$E
261 jmp .Lloop
262
263.align 16
264.Lloop:
265___
266for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
267for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
268for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
269for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
270$code.=<<___;
271 add 0($ctx),$A
272 add 4($ctx),$B
273 add 8($ctx),$C
274 add 12($ctx),$D
275 add 16($ctx),$E
276 mov $A,0($ctx)
277 mov $B,4($ctx)
278 mov $C,8($ctx)
279 mov $D,12($ctx)
280 mov $E,16($ctx)
281
282 sub \$1,$num
283 lea `16*4`($inp),$inp
284 jnz .Lloop
285
286 mov `16*4`(%rsp),%rsi
287 mov (%rsi),%r13
288 mov 8(%rsi),%r12
289 mov 16(%rsi),%rbp
290 mov 24(%rsi),%rbx
291 lea 32(%rsi),%rsp
292.Lepilogue:
293 ret
294.size sha1_block_data_order,.-sha1_block_data_order
295___
296{{{
297my $Xi=4;
298my @X=map("%xmm$_",(4..7,0..3));
299my @Tx=map("%xmm$_",(8..10));
300my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
301my @T=("%esi","%edi");
302my $j=0;
303my $K_XX_XX="%r11";
304
305my $_rol=sub { &rol(@_) };
306my $_ror=sub { &ror(@_) };
307
308$code.=<<___;
309.type sha1_block_data_order_ssse3,\@function,3
310.align 16
311sha1_block_data_order_ssse3:
312_ssse3_shortcut:
313 _CET_ENDBR
314 push %rbx
315 push %rbp
316 push %r12
317 lea `-64-($win64?5*16:0)`(%rsp),%rsp
318___
319$code.=<<___ if ($win64);
320 movaps %xmm6,64+0(%rsp)
321 movaps %xmm7,64+16(%rsp)
322 movaps %xmm8,64+32(%rsp)
323 movaps %xmm9,64+48(%rsp)
324 movaps %xmm10,64+64(%rsp)
325.Lprologue_ssse3:
326___
327$code.=<<___;
328 mov %rdi,$ctx # reassigned argument
329 mov %rsi,$inp # reassigned argument
330 mov %rdx,$num # reassigned argument
331
332 shl \$6,$num
333 add $inp,$num
334 lea K_XX_XX(%rip),$K_XX_XX
335
336 mov 0($ctx),$A # load context
337 mov 4($ctx),$B
338 mov 8($ctx),$C
339 mov 12($ctx),$D
340 mov $B,@T[0] # magic seed
341 mov 16($ctx),$E
342
343 movdqa 64($K_XX_XX),@X[2] # pbswap mask
344 movdqa 0($K_XX_XX),@Tx[1] # K_00_19
345 movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
346 movdqu 16($inp),@X[-3&7]
347 movdqu 32($inp),@X[-2&7]
348 movdqu 48($inp),@X[-1&7]
349 pshufb @X[2],@X[-4&7] # byte swap
350 add \$64,$inp
351 pshufb @X[2],@X[-3&7]
352 pshufb @X[2],@X[-2&7]
353 pshufb @X[2],@X[-1&7]
354 paddd @Tx[1],@X[-4&7] # add K_00_19
355 paddd @Tx[1],@X[-3&7]
356 paddd @Tx[1],@X[-2&7]
357 movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU
358 psubd @Tx[1],@X[-4&7] # restore X[]
359 movdqa @X[-3&7],16(%rsp)
360 psubd @Tx[1],@X[-3&7]
361 movdqa @X[-2&7],32(%rsp)
362 psubd @Tx[1],@X[-2&7]
363 jmp .Loop_ssse3
364___
365
366sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
367{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
368 my $arg = pop;
369 $arg = "\$$arg" if ($arg*1 eq $arg);
370 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
371}
372
373sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4
374{ use integer;
375 my $body = shift;
376 my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
377 my ($a,$b,$c,$d,$e);
378
379 &movdqa (@X[0],@X[-3&7]);
380 eval(shift(@insns));
381 eval(shift(@insns));
382 &movdqa (@Tx[0],@X[-1&7]);
383 &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]"
384 eval(shift(@insns));
385 eval(shift(@insns));
386
387 &paddd (@Tx[1],@X[-1&7]);
388 eval(shift(@insns));
389 eval(shift(@insns));
390 &psrldq (@Tx[0],4); # "X[-3]", 3 dwords
391 eval(shift(@insns));
392 eval(shift(@insns));
393 &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
394 eval(shift(@insns));
395 eval(shift(@insns));
396
397 &pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
398 eval(shift(@insns));
399 eval(shift(@insns));
400 eval(shift(@insns));
401 eval(shift(@insns));
402
403 &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
404 eval(shift(@insns));
405 eval(shift(@insns));
406 &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
407 eval(shift(@insns));
408 eval(shift(@insns));
409
410 &movdqa (@Tx[2],@X[0]);
411 &movdqa (@Tx[0],@X[0]);
412 eval(shift(@insns));
413 eval(shift(@insns));
414 eval(shift(@insns));
415 eval(shift(@insns));
416
417 &pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword
418 &paddd (@X[0],@X[0]);
419 eval(shift(@insns));
420 eval(shift(@insns));
421 eval(shift(@insns));
422 eval(shift(@insns));
423
424 &psrld (@Tx[0],31);
425 eval(shift(@insns));
426 eval(shift(@insns));
427 &movdqa (@Tx[1],@Tx[2]);
428 eval(shift(@insns));
429 eval(shift(@insns));
430
431 &psrld (@Tx[2],30);
432 &por (@X[0],@Tx[0]); # "X[0]"<<<=1
433 eval(shift(@insns));
434 eval(shift(@insns));
435 eval(shift(@insns));
436 eval(shift(@insns));
437
438 &pslld (@Tx[1],2);
439 &pxor (@X[0],@Tx[2]);
440 eval(shift(@insns));
441 eval(shift(@insns));
442 &movdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX
443 eval(shift(@insns));
444 eval(shift(@insns));
445
446 &pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2
447
448 foreach (@insns) { eval; } # remaining instructions [if any]
449
450 $Xi++; push(@X,shift(@X)); # "rotate" X[]
451 push(@Tx,shift(@Tx));
452}
453
454sub Xupdate_ssse3_32_79()
455{ use integer;
456 my $body = shift;
457 my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
458 my ($a,$b,$c,$d,$e);
459
460 &movdqa (@Tx[0],@X[-1&7]) if ($Xi==8);
461 eval(shift(@insns)); # body_20_39
462 &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
463 &palignr(@Tx[0],@X[-2&7],8); # compose "X[-6]"
464 eval(shift(@insns));
465 eval(shift(@insns));
466 eval(shift(@insns)); # rol
467
468 &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
469 eval(shift(@insns));
470 eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
471 if ($Xi%5) {
472 &movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
473 } else { # ... or load next one
474 &movdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
475 }
476 &paddd (@Tx[1],@X[-1&7]);
477 eval(shift(@insns)); # ror
478 eval(shift(@insns));
479
480 &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]"
481 eval(shift(@insns)); # body_20_39
482 eval(shift(@insns));
483 eval(shift(@insns));
484 eval(shift(@insns)); # rol
485
486 &movdqa (@Tx[0],@X[0]);
487 &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
488 eval(shift(@insns));
489 eval(shift(@insns));
490 eval(shift(@insns)); # ror
491 eval(shift(@insns));
492
493 &pslld (@X[0],2);
494 eval(shift(@insns)); # body_20_39
495 eval(shift(@insns));
496 &psrld (@Tx[0],30);
497 eval(shift(@insns));
498 eval(shift(@insns)); # rol
499 eval(shift(@insns));
500 eval(shift(@insns));
501 eval(shift(@insns)); # ror
502 eval(shift(@insns));
503
504 &por (@X[0],@Tx[0]); # "X[0]"<<<=2
505 eval(shift(@insns)); # body_20_39
506 eval(shift(@insns));
507 &movdqa (@Tx[1],@X[0]) if ($Xi<19);
508 eval(shift(@insns));
509 eval(shift(@insns)); # rol
510 eval(shift(@insns));
511 eval(shift(@insns));
512 eval(shift(@insns)); # rol
513 eval(shift(@insns));
514
515 foreach (@insns) { eval; } # remaining instructions
516
517 $Xi++; push(@X,shift(@X)); # "rotate" X[]
518 push(@Tx,shift(@Tx));
519}
520
521sub Xuplast_ssse3_80()
522{ use integer;
523 my $body = shift;
524 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
525 my ($a,$b,$c,$d,$e);
526
527 eval(shift(@insns));
528 &paddd (@Tx[1],@X[-1&7]);
529 eval(shift(@insns));
530 eval(shift(@insns));
531 eval(shift(@insns));
532 eval(shift(@insns));
533
534 &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
535
536 foreach (@insns) { eval; } # remaining instructions
537
538 &cmp ($inp,$num);
539 &je (".Ldone_ssse3");
540
541 unshift(@Tx,pop(@Tx));
542
543 &movdqa (@X[2],"64($K_XX_XX)"); # pbswap mask
544 &movdqa (@Tx[1],"0($K_XX_XX)"); # K_00_19
545 &movdqu (@X[-4&7],"0($inp)"); # load input
546 &movdqu (@X[-3&7],"16($inp)");
547 &movdqu (@X[-2&7],"32($inp)");
548 &movdqu (@X[-1&7],"48($inp)");
549 &pshufb (@X[-4&7],@X[2]); # byte swap
550 &add ($inp,64);
551
552 $Xi=0;
553}
554
555sub Xloop_ssse3()
556{ use integer;
557 my $body = shift;
558 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
559 my ($a,$b,$c,$d,$e);
560
561 eval(shift(@insns));
562 eval(shift(@insns));
563 &pshufb (@X[($Xi-3)&7],@X[2]);
564 eval(shift(@insns));
565 eval(shift(@insns));
566 &paddd (@X[($Xi-4)&7],@Tx[1]);
567 eval(shift(@insns));
568 eval(shift(@insns));
569 eval(shift(@insns));
570 eval(shift(@insns));
571 &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU
572 eval(shift(@insns));
573 eval(shift(@insns));
574 &psubd (@X[($Xi-4)&7],@Tx[1]);
575
576 foreach (@insns) { eval; }
577 $Xi++;
578}
579
580sub Xtail_ssse3()
581{ use integer;
582 my $body = shift;
583 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
584 my ($a,$b,$c,$d,$e);
585
586 foreach (@insns) { eval; }
587}
588
589sub body_00_19 () {
590 (
591 '($a,$b,$c,$d,$e)=@V;'.
592 '&add ($e,eval(4*($j&15))."(%rsp)");', # X[]+K xfer
593 '&xor ($c,$d);',
594 '&mov (@T[1],$a);', # $b in next round
595 '&$_rol ($a,5);',
596 '&and (@T[0],$c);', # ($b&($c^$d))
597 '&xor ($c,$d);', # restore $c
598 '&xor (@T[0],$d);',
599 '&add ($e,$a);',
600 '&$_ror ($b,$j?7:2);', # $b>>>2
601 '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
602 );
603}
604
605sub body_20_39 () {
606 (
607 '($a,$b,$c,$d,$e)=@V;'.
608 '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer
609 '&xor (@T[0],$d);', # ($b^$d)
610 '&mov (@T[1],$a);', # $b in next round
611 '&$_rol ($a,5);',
612 '&xor (@T[0],$c);', # ($b^$d^$c)
613 '&add ($e,$a);',
614 '&$_ror ($b,7);', # $b>>>2
615 '&add ($e,@T[0]);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
616 );
617}
618
619sub body_40_59 () {
620 (
621 '($a,$b,$c,$d,$e)=@V;'.
622 '&mov (@T[1],$c);',
623 '&xor ($c,$d);',
624 '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer
625 '&and (@T[1],$d);',
626 '&and (@T[0],$c);', # ($b&($c^$d))
627 '&$_ror ($b,7);', # $b>>>2
628 '&add ($e,@T[1]);',
629 '&mov (@T[1],$a);', # $b in next round
630 '&$_rol ($a,5);',
631 '&add ($e,@T[0]);',
632 '&xor ($c,$d);', # restore $c
633 '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
634 );
635}
636$code.=<<___;
637.align 16
638.Loop_ssse3:
639___
640 &Xupdate_ssse3_16_31(\&body_00_19);
641 &Xupdate_ssse3_16_31(\&body_00_19);
642 &Xupdate_ssse3_16_31(\&body_00_19);
643 &Xupdate_ssse3_16_31(\&body_00_19);
644 &Xupdate_ssse3_32_79(\&body_00_19);
645 &Xupdate_ssse3_32_79(\&body_20_39);
646 &Xupdate_ssse3_32_79(\&body_20_39);
647 &Xupdate_ssse3_32_79(\&body_20_39);
648 &Xupdate_ssse3_32_79(\&body_20_39);
649 &Xupdate_ssse3_32_79(\&body_20_39);
650 &Xupdate_ssse3_32_79(\&body_40_59);
651 &Xupdate_ssse3_32_79(\&body_40_59);
652 &Xupdate_ssse3_32_79(\&body_40_59);
653 &Xupdate_ssse3_32_79(\&body_40_59);
654 &Xupdate_ssse3_32_79(\&body_40_59);
655 &Xupdate_ssse3_32_79(\&body_20_39);
656 &Xuplast_ssse3_80(\&body_20_39); # can jump to "done"
657
658 $saved_j=$j; @saved_V=@V;
659
660 &Xloop_ssse3(\&body_20_39);
661 &Xloop_ssse3(\&body_20_39);
662 &Xloop_ssse3(\&body_20_39);
663
664$code.=<<___;
665 add 0($ctx),$A # update context
666 add 4($ctx),@T[0]
667 add 8($ctx),$C
668 add 12($ctx),$D
669 mov $A,0($ctx)
670 add 16($ctx),$E
671 mov @T[0],4($ctx)
672 mov @T[0],$B # magic seed
673 mov $C,8($ctx)
674 mov $D,12($ctx)
675 mov $E,16($ctx)
676 jmp .Loop_ssse3
677
678.align 16
679.Ldone_ssse3:
680___
681 $j=$saved_j; @V=@saved_V;
682
683 &Xtail_ssse3(\&body_20_39);
684 &Xtail_ssse3(\&body_20_39);
685 &Xtail_ssse3(\&body_20_39);
686
687$code.=<<___;
688 add 0($ctx),$A # update context
689 add 4($ctx),@T[0]
690 add 8($ctx),$C
691 mov $A,0($ctx)
692 add 12($ctx),$D
693 mov @T[0],4($ctx)
694 add 16($ctx),$E
695 mov $C,8($ctx)
696 mov $D,12($ctx)
697 mov $E,16($ctx)
698___
699$code.=<<___ if ($win64);
700 movaps 64+0(%rsp),%xmm6
701 movaps 64+16(%rsp),%xmm7
702 movaps 64+32(%rsp),%xmm8
703 movaps 64+48(%rsp),%xmm9
704 movaps 64+64(%rsp),%xmm10
705___
706$code.=<<___;
707 lea `64+($win64?5*16:0)`(%rsp),%rsi
708 mov 0(%rsi),%r12
709 mov 8(%rsi),%rbp
710 mov 16(%rsi),%rbx
711 lea 24(%rsi),%rsp
712.Lepilogue_ssse3:
713 ret
714.size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
715___
716
717if ($avx) {
718my $Xi=4;
719my @X=map("%xmm$_",(4..7,0..3));
720my @Tx=map("%xmm$_",(8..10));
721my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
722my @T=("%esi","%edi");
723my $j=0;
724my $K_XX_XX="%r11";
725
726my $_rol=sub { &shld(@_[0],@_) };
727my $_ror=sub { &shrd(@_[0],@_) };
728
729$code.=<<___;
730.type sha1_block_data_order_avx,\@function,3
731.align 16
732sha1_block_data_order_avx:
733_avx_shortcut:
734 _CET_ENDBR
735 push %rbx
736 push %rbp
737 push %r12
738 lea `-64-($win64?5*16:0)`(%rsp),%rsp
739___
740$code.=<<___ if ($win64);
741 movaps %xmm6,64+0(%rsp)
742 movaps %xmm7,64+16(%rsp)
743 movaps %xmm8,64+32(%rsp)
744 movaps %xmm9,64+48(%rsp)
745 movaps %xmm10,64+64(%rsp)
746.Lprologue_avx:
747___
748$code.=<<___;
749 mov %rdi,$ctx # reassigned argument
750 mov %rsi,$inp # reassigned argument
751 mov %rdx,$num # reassigned argument
752 vzeroupper
753
754 shl \$6,$num
755 add $inp,$num
756 lea K_XX_XX(%rip),$K_XX_XX
757
758 mov 0($ctx),$A # load context
759 mov 4($ctx),$B
760 mov 8($ctx),$C
761 mov 12($ctx),$D
762 mov $B,@T[0] # magic seed
763 mov 16($ctx),$E
764
765 vmovdqa 64($K_XX_XX),@X[2] # pbswap mask
766 vmovdqa 0($K_XX_XX),@Tx[1] # K_00_19
767 vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
768 vmovdqu 16($inp),@X[-3&7]
769 vmovdqu 32($inp),@X[-2&7]
770 vmovdqu 48($inp),@X[-1&7]
771 vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap
772 add \$64,$inp
773 vpshufb @X[2],@X[-3&7],@X[-3&7]
774 vpshufb @X[2],@X[-2&7],@X[-2&7]
775 vpshufb @X[2],@X[-1&7],@X[-1&7]
776 vpaddd @Tx[1],@X[-4&7],@X[0] # add K_00_19
777 vpaddd @Tx[1],@X[-3&7],@X[1]
778 vpaddd @Tx[1],@X[-2&7],@X[2]
779 vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU
780 vmovdqa @X[1],16(%rsp)
781 vmovdqa @X[2],32(%rsp)
782 jmp .Loop_avx
783___
784
785sub Xupdate_avx_16_31() # recall that $Xi starts with 4
786{ use integer;
787 my $body = shift;
788 my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
789 my ($a,$b,$c,$d,$e);
790
791 eval(shift(@insns));
792 eval(shift(@insns));
793 &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]"
794 eval(shift(@insns));
795 eval(shift(@insns));
796
797 &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
798 eval(shift(@insns));
799 eval(shift(@insns));
800 &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords
801 eval(shift(@insns));
802 eval(shift(@insns));
803 &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
804 eval(shift(@insns));
805 eval(shift(@insns));
806
807 &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
808 eval(shift(@insns));
809 eval(shift(@insns));
810 eval(shift(@insns));
811 eval(shift(@insns));
812
813 &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
814 eval(shift(@insns));
815 eval(shift(@insns));
816 &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
817 eval(shift(@insns));
818 eval(shift(@insns));
819
820 &vpsrld (@Tx[0],@X[0],31);
821 eval(shift(@insns));
822 eval(shift(@insns));
823 eval(shift(@insns));
824 eval(shift(@insns));
825
826 &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword
827 &vpaddd (@X[0],@X[0],@X[0]);
828 eval(shift(@insns));
829 eval(shift(@insns));
830 eval(shift(@insns));
831 eval(shift(@insns));
832
833 &vpsrld (@Tx[1],@Tx[2],30);
834 &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1
835 eval(shift(@insns));
836 eval(shift(@insns));
837 eval(shift(@insns));
838 eval(shift(@insns));
839
840 &vpslld (@Tx[2],@Tx[2],2);
841 &vpxor (@X[0],@X[0],@Tx[1]);
842 eval(shift(@insns));
843 eval(shift(@insns));
844 eval(shift(@insns));
845 eval(shift(@insns));
846
847 &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2
848 eval(shift(@insns));
849 eval(shift(@insns));
850 &vmovdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX
851 eval(shift(@insns));
852 eval(shift(@insns));
853
854
855 foreach (@insns) { eval; } # remaining instructions [if any]
856
857 $Xi++; push(@X,shift(@X)); # "rotate" X[]
858 push(@Tx,shift(@Tx));
859}
860
861sub Xupdate_avx_32_79()
862{ use integer;
863 my $body = shift;
864 my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
865 my ($a,$b,$c,$d,$e);
866
867 &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]"
868 &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
869 eval(shift(@insns)); # body_20_39
870 eval(shift(@insns));
871 eval(shift(@insns));
872 eval(shift(@insns)); # rol
873
874 &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
875 eval(shift(@insns));
876 eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
877 if ($Xi%5) {
878 &vmovdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
879 } else { # ... or load next one
880 &vmovdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
881 }
882 &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
883 eval(shift(@insns)); # ror
884 eval(shift(@insns));
885
886 &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-6]"
887 eval(shift(@insns)); # body_20_39
888 eval(shift(@insns));
889 eval(shift(@insns));
890 eval(shift(@insns)); # rol
891
892 &vpsrld (@Tx[0],@X[0],30);
893 &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
894 eval(shift(@insns));
895 eval(shift(@insns));
896 eval(shift(@insns)); # ror
897 eval(shift(@insns));
898
899 &vpslld (@X[0],@X[0],2);
900 eval(shift(@insns)); # body_20_39
901 eval(shift(@insns));
902 eval(shift(@insns));
903 eval(shift(@insns)); # rol
904 eval(shift(@insns));
905 eval(shift(@insns));
906 eval(shift(@insns)); # ror
907 eval(shift(@insns));
908
909 &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2
910 eval(shift(@insns)); # body_20_39
911 eval(shift(@insns));
912 &vmovdqa (@Tx[1],@X[0]) if ($Xi<19);
913 eval(shift(@insns));
914 eval(shift(@insns)); # rol
915 eval(shift(@insns));
916 eval(shift(@insns));
917 eval(shift(@insns)); # rol
918 eval(shift(@insns));
919
920 foreach (@insns) { eval; } # remaining instructions
921
922 $Xi++; push(@X,shift(@X)); # "rotate" X[]
923 push(@Tx,shift(@Tx));
924}
925
926sub Xuplast_avx_80()
927{ use integer;
928 my $body = shift;
929 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
930 my ($a,$b,$c,$d,$e);
931
932 eval(shift(@insns));
933 &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
934 eval(shift(@insns));
935 eval(shift(@insns));
936 eval(shift(@insns));
937 eval(shift(@insns));
938
939 &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
940
941 foreach (@insns) { eval; } # remaining instructions
942
943 &cmp ($inp,$num);
944 &je (".Ldone_avx");
945
946 unshift(@Tx,pop(@Tx));
947
948 &vmovdqa(@X[2],"64($K_XX_XX)"); # pbswap mask
949 &vmovdqa(@Tx[1],"0($K_XX_XX)"); # K_00_19
950 &vmovdqu(@X[-4&7],"0($inp)"); # load input
951 &vmovdqu(@X[-3&7],"16($inp)");
952 &vmovdqu(@X[-2&7],"32($inp)");
953 &vmovdqu(@X[-1&7],"48($inp)");
954 &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap
955 &add ($inp,64);
956
957 $Xi=0;
958}
959
960sub Xloop_avx()
961{ use integer;
962 my $body = shift;
963 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
964 my ($a,$b,$c,$d,$e);
965
966 eval(shift(@insns));
967 eval(shift(@insns));
968 &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
969 eval(shift(@insns));
970 eval(shift(@insns));
971 &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]);
972 eval(shift(@insns));
973 eval(shift(@insns));
974 eval(shift(@insns));
975 eval(shift(@insns));
976 &vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]); # X[]+K xfer to IALU
977 eval(shift(@insns));
978 eval(shift(@insns));
979
980 foreach (@insns) { eval; }
981 $Xi++;
982}
983
984sub Xtail_avx()
985{ use integer;
986 my $body = shift;
987 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
988 my ($a,$b,$c,$d,$e);
989
990 foreach (@insns) { eval; }
991}
992
993$code.=<<___;
994.align 16
995.Loop_avx:
996___
997 &Xupdate_avx_16_31(\&body_00_19);
998 &Xupdate_avx_16_31(\&body_00_19);
999 &Xupdate_avx_16_31(\&body_00_19);
1000 &Xupdate_avx_16_31(\&body_00_19);
1001 &Xupdate_avx_32_79(\&body_00_19);
1002 &Xupdate_avx_32_79(\&body_20_39);
1003 &Xupdate_avx_32_79(\&body_20_39);
1004 &Xupdate_avx_32_79(\&body_20_39);
1005 &Xupdate_avx_32_79(\&body_20_39);
1006 &Xupdate_avx_32_79(\&body_20_39);
1007 &Xupdate_avx_32_79(\&body_40_59);
1008 &Xupdate_avx_32_79(\&body_40_59);
1009 &Xupdate_avx_32_79(\&body_40_59);
1010 &Xupdate_avx_32_79(\&body_40_59);
1011 &Xupdate_avx_32_79(\&body_40_59);
1012 &Xupdate_avx_32_79(\&body_20_39);
1013 &Xuplast_avx_80(\&body_20_39); # can jump to "done"
1014
1015 $saved_j=$j; @saved_V=@V;
1016
1017 &Xloop_avx(\&body_20_39);
1018 &Xloop_avx(\&body_20_39);
1019 &Xloop_avx(\&body_20_39);
1020
1021$code.=<<___;
1022 add 0($ctx),$A # update context
1023 add 4($ctx),@T[0]
1024 add 8($ctx),$C
1025 add 12($ctx),$D
1026 mov $A,0($ctx)
1027 add 16($ctx),$E
1028 mov @T[0],4($ctx)
1029 mov @T[0],$B # magic seed
1030 mov $C,8($ctx)
1031 mov $D,12($ctx)
1032 mov $E,16($ctx)
1033 jmp .Loop_avx
1034
1035.align 16
1036.Ldone_avx:
1037___
1038 $j=$saved_j; @V=@saved_V;
1039
1040 &Xtail_avx(\&body_20_39);
1041 &Xtail_avx(\&body_20_39);
1042 &Xtail_avx(\&body_20_39);
1043
1044$code.=<<___;
1045 vzeroupper
1046
1047 add 0($ctx),$A # update context
1048 add 4($ctx),@T[0]
1049 add 8($ctx),$C
1050 mov $A,0($ctx)
1051 add 12($ctx),$D
1052 mov @T[0],4($ctx)
1053 add 16($ctx),$E
1054 mov $C,8($ctx)
1055 mov $D,12($ctx)
1056 mov $E,16($ctx)
1057___
1058$code.=<<___ if ($win64);
1059 movaps 64+0(%rsp),%xmm6
1060 movaps 64+16(%rsp),%xmm7
1061 movaps 64+32(%rsp),%xmm8
1062 movaps 64+48(%rsp),%xmm9
1063 movaps 64+64(%rsp),%xmm10
1064___
1065$code.=<<___;
1066 lea `64+($win64?5*16:0)`(%rsp),%rsi
1067 mov 0(%rsi),%r12
1068 mov 8(%rsi),%rbp
1069 mov 16(%rsi),%rbx
1070 lea 24(%rsi),%rsp
1071.Lepilogue_avx:
1072 ret
1073.size sha1_block_data_order_avx,.-sha1_block_data_order_avx
1074___
1075}
1076$code.=<<___;
1077.section .rodata
1078.align 64
1079K_XX_XX:
1080.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19
1081.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39
1082.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59
1083.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
1084.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask
1085.text
1086___
1087}}}
1088$code.=<<___;
1089.align 64
1090___
1091
1092# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1093# CONTEXT *context,DISPATCHER_CONTEXT *disp)
1094if ($win64) {
1095$rec="%rcx";
1096$frame="%rdx";
1097$context="%r8";
1098$disp="%r9";
1099
1100$code.=<<___;
1101.extern __imp_RtlVirtualUnwind
1102.type se_handler,\@abi-omnipotent
1103.align 16
1104se_handler:
1105 _CET_ENDBR
1106 push %rsi
1107 push %rdi
1108 push %rbx
1109 push %rbp
1110 push %r12
1111 push %r13
1112 push %r14
1113 push %r15
1114 pushfq
1115 sub \$64,%rsp
1116
1117 mov 120($context),%rax # pull context->Rax
1118 mov 248($context),%rbx # pull context->Rip
1119
1120 lea .Lprologue(%rip),%r10
1121 cmp %r10,%rbx # context->Rip<.Lprologue
1122 jb .Lcommon_seh_tail
1123
1124 mov 152($context),%rax # pull context->Rsp
1125
1126 lea .Lepilogue(%rip),%r10
1127 cmp %r10,%rbx # context->Rip>=.Lepilogue
1128 jae .Lcommon_seh_tail
1129
1130 mov `16*4`(%rax),%rax # pull saved stack pointer
1131 lea 32(%rax),%rax
1132
1133 mov -8(%rax),%rbx
1134 mov -16(%rax),%rbp
1135 mov -24(%rax),%r12
1136 mov -32(%rax),%r13
1137 mov %rbx,144($context) # restore context->Rbx
1138 mov %rbp,160($context) # restore context->Rbp
1139 mov %r12,216($context) # restore context->R12
1140 mov %r13,224($context) # restore context->R13
1141
1142 jmp .Lcommon_seh_tail
1143.size se_handler,.-se_handler
1144
1145.type ssse3_handler,\@abi-omnipotent
1146.align 16
1147ssse3_handler:
1148 push %rsi
1149 push %rdi
1150 push %rbx
1151 push %rbp
1152 push %r12
1153 push %r13
1154 push %r14
1155 push %r15
1156 pushfq
1157 sub \$64,%rsp
1158
1159 mov 120($context),%rax # pull context->Rax
1160 mov 248($context),%rbx # pull context->Rip
1161
1162 mov 8($disp),%rsi # disp->ImageBase
1163 mov 56($disp),%r11 # disp->HandlerData
1164
1165 mov 0(%r11),%r10d # HandlerData[0]
1166 lea (%rsi,%r10),%r10 # prologue label
1167 cmp %r10,%rbx # context->Rip<prologue label
1168 jb .Lcommon_seh_tail
1169
1170 mov 152($context),%rax # pull context->Rsp
1171
1172 mov 4(%r11),%r10d # HandlerData[1]
1173 lea (%rsi,%r10),%r10 # epilogue label
1174 cmp %r10,%rbx # context->Rip>=epilogue label
1175 jae .Lcommon_seh_tail
1176
1177 lea 64(%rax),%rsi
1178 lea 512($context),%rdi # &context.Xmm6
1179 mov \$10,%ecx
1180 .long 0xa548f3fc # cld; rep movsq
1181 lea `24+64+5*16`(%rax),%rax # adjust stack pointer
1182
1183 mov -8(%rax),%rbx
1184 mov -16(%rax),%rbp
1185 mov -24(%rax),%r12
1186 mov %rbx,144($context) # restore context->Rbx
1187 mov %rbp,160($context) # restore context->Rbp
1188 mov %r12,216($context) # restore context->R12
1189
1190.Lcommon_seh_tail:
1191 mov 8(%rax),%rdi
1192 mov 16(%rax),%rsi
1193 mov %rax,152($context) # restore context->Rsp
1194 mov %rsi,168($context) # restore context->Rsi
1195 mov %rdi,176($context) # restore context->Rdi
1196
1197 mov 40($disp),%rdi # disp->ContextRecord
1198 mov $context,%rsi # context
1199 mov \$154,%ecx # sizeof(CONTEXT)
1200 .long 0xa548f3fc # cld; rep movsq
1201
1202 mov $disp,%rsi
1203 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1204 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1205 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1206 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1207 mov 40(%rsi),%r10 # disp->ContextRecord
1208 lea 56(%rsi),%r11 # &disp->HandlerData
1209 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1210 mov %r10,32(%rsp) # arg5
1211 mov %r11,40(%rsp) # arg6
1212 mov %r12,48(%rsp) # arg7
1213 mov %rcx,56(%rsp) # arg8, (NULL)
1214 call *__imp_RtlVirtualUnwind(%rip)
1215
1216 mov \$1,%eax # ExceptionContinueSearch
1217 add \$64,%rsp
1218 popfq
1219 pop %r15
1220 pop %r14
1221 pop %r13
1222 pop %r12
1223 pop %rbp
1224 pop %rbx
1225 pop %rdi
1226 pop %rsi
1227 ret
1228.size ssse3_handler,.-ssse3_handler
1229
1230.section .pdata
1231.align 4
1232 .rva .LSEH_begin_sha1_block_data_order
1233 .rva .LSEH_end_sha1_block_data_order
1234 .rva .LSEH_info_sha1_block_data_order
1235 .rva .LSEH_begin_sha1_block_data_order_ssse3
1236 .rva .LSEH_end_sha1_block_data_order_ssse3
1237 .rva .LSEH_info_sha1_block_data_order_ssse3
1238___
1239$code.=<<___ if ($avx);
1240 .rva .LSEH_begin_sha1_block_data_order_avx
1241 .rva .LSEH_end_sha1_block_data_order_avx
1242 .rva .LSEH_info_sha1_block_data_order_avx
1243___
1244$code.=<<___;
1245.section .xdata
1246.align 8
1247.LSEH_info_sha1_block_data_order:
1248 .byte 9,0,0,0
1249 .rva se_handler
1250.LSEH_info_sha1_block_data_order_ssse3:
1251 .byte 9,0,0,0
1252 .rva ssse3_handler
1253 .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[]
1254___
1255$code.=<<___ if ($avx);
1256.LSEH_info_sha1_block_data_order_avx:
1257 .byte 9,0,0,0
1258 .rva ssse3_handler
1259 .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
1260___
1261}
1262
1263####################################################################
1264
1265$code =~ s/\`([^\`]*)\`/eval $1/gem;
1266print $code;
1267close STDOUT;