diff options
author | tb <> | 2023-04-15 18:37:36 +0000 |
---|---|---|
committer | tb <> | 2023-04-15 18:37:36 +0000 |
commit | 33ffd98a48be03b84c6c382e055bb6e8a007a930 (patch) | |
tree | ae349dcf0eb4d420f5ebaeda02dc9b992199eb41 /src | |
parent | 9148c84b488f1ebcd570038939692a638d9bff32 (diff) | |
download | openbsd-33ffd98a48be03b84c6c382e055bb6e8a007a930.tar.gz openbsd-33ffd98a48be03b84c6c382e055bb6e8a007a930.tar.bz2 openbsd-33ffd98a48be03b84c6c382e055bb6e8a007a930.zip |
Remove now unused GF2m perlasm generators
Diffstat (limited to 'src')
-rw-r--r-- | src/lib/libcrypto/bn/asm/armv4-gf2m.pl | 278 | ||||
-rw-r--r-- | src/lib/libcrypto/bn/asm/x86-gf2m.pl | 312 | ||||
-rw-r--r-- | src/lib/libcrypto/bn/asm/x86_64-gf2m.pl | 390 |
3 files changed, 0 insertions, 980 deletions
diff --git a/src/lib/libcrypto/bn/asm/armv4-gf2m.pl b/src/lib/libcrypto/bn/asm/armv4-gf2m.pl deleted file mode 100644 index 8915924641..0000000000 --- a/src/lib/libcrypto/bn/asm/armv4-gf2m.pl +++ /dev/null | |||
@@ -1,278 +0,0 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | # | ||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | # | ||
10 | # May 2011 | ||
11 | # | ||
12 | # The module implements bn_GF2m_mul_2x2 polynomial multiplication | ||
13 | # used in bn_gf2m.c. It's kind of low-hanging mechanical port from | ||
14 | # C for the time being... Except that it has two code paths: pure | ||
15 | # integer code suitable for any ARMv4 and later CPU and NEON code | ||
16 | # suitable for ARMv7. Pure integer 1x1 multiplication subroutine runs | ||
17 | # in ~45 cycles on dual-issue core such as Cortex A8, which is ~50% | ||
18 | # faster than compiler-generated code. For ECDH and ECDSA verify (but | ||
19 | # not for ECDSA sign) it means 25%-45% improvement depending on key | ||
20 | # length, more for longer keys. Even though NEON 1x1 multiplication | ||
21 | # runs in even fewer cycles, ~30, the improvement is measurable only on | ||
22 | # longer keys. One has to optimize code elsewhere to get NEON glow... | ||
23 | |||
24 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} # take the first argument that looks like a plain "name.ext" file name | ||
25 | !$output or open(STDOUT,">",$output) or die "can't open $output: $!"; # no file name given: keep writing to the inherited STDOUT; a named file that cannot be opened is now fatal | ||
26 | |||
27 | sub Dlo() { my ($reg)=@_; $reg=~m|q([1]?[0-9])| ? "d".($1*2) : ""; } # qN -> name of its low half d(2N); "" when no q register is found | ||
28 | sub Dhi() { my ($reg)=@_; $reg=~m|q([1]?[0-9])| ? "d".($1*2+1) : ""; } # qN -> name of its high half d(2N+1); "" when no q register is found | ||
29 | sub Q() { my ($reg)=@_; $reg=~m|d([1-3]?[02468])| ? "q".($1/2) : ""; } # even dN -> name of the enclosing q(N/2); "" when nothing matches | ||
30 | |||
31 | $code=<<___; # mul_1x1_neon helper, emitted only inside the __ARM_ARCH__>=7 guard | ||
32 | #include "arm_arch.h" | ||
33 | |||
34 | .text | ||
35 | .code 32 | ||
36 | |||
37 | #if __ARM_ARCH__>=7 | ||
38 | .fpu neon | ||
39 | @ mul_1x1_neon: in d16 = a, d17 = bb; the result is left in d0; d1-d7 are used as scratch | ||
40 | .type mul_1x1_neon,%function | ||
41 | .align 5 | ||
42 | mul_1x1_neon: | ||
43 | vshl.u64 `&Dlo("q1")`,d16,#8 @ q1-q3 are shifted copies of a | ||
44 | vmull.p8 `&Q("d0")`,d16,d17 @ a·bb | ||
45 | vshl.u64 `&Dlo("q2")`,d16,#16 | ||
46 | vmull.p8 q1,`&Dlo("q1")`,d17 @ a<<8·bb | ||
47 | vshl.u64 `&Dlo("q3")`,d16,#24 | ||
48 | vmull.p8 q2,`&Dlo("q2")`,d17 @ a<<16·bb | ||
49 | vshr.u64 `&Dlo("q1")`,#8 | ||
50 | vmull.p8 q3,`&Dlo("q3")`,d17 @ a<<24·bb | ||
51 | vshl.u64 `&Dhi("q1")`,#24 | ||
52 | veor d0,`&Dlo("q1")` @ from here on the realigned halves of q1-q3 are xor-ed into d0 | ||
53 | vshr.u64 `&Dlo("q2")`,#16 | ||
54 | veor d0,`&Dhi("q1")` | ||
55 | vshl.u64 `&Dhi("q2")`,#16 | ||
56 | veor d0,`&Dlo("q2")` | ||
57 | vshr.u64 `&Dlo("q3")`,#24 | ||
58 | veor d0,`&Dhi("q2")` | ||
59 | vshl.u64 `&Dhi("q3")`,#8 | ||
60 | veor d0,`&Dlo("q3")` | ||
61 | veor d0,`&Dhi("q3")` | ||
62 | bx lr | ||
63 | .size mul_1x1_neon,.-mul_1x1_neon | ||
64 | #endif | ||
65 | ___ | ||
66 | ################ | ||
67 | # private interface to mul_1x1_ialu | ||
68 | # | ||
69 | $a="r1"; # first multiplicand; its low 30 bits seed tab[] | ||
70 | $b="r0"; # second multiplicand; consumed three bits at a time as the index into tab[] | ||
71 | # table-building names ($a0..$a14) and lookup-phase names ($hi..$i1) alias the same registers r4-r9 | ||
72 | ($a0,$a1,$a2,$a12,$a4,$a14)= | ||
73 | ($hi,$lo,$t0,$t1, $i0,$i1 )=map("r$_",(4..9),12); | ||
74 | # the caller preloads 7<<2 here, so a masked index is already a byte offset into tab[] | ||
75 | $mask="r12"; | ||
76 | |||
77 | $code.=<<___; | ||
78 | .type mul_1x1_ialu,%function | ||
79 | .align 5 | ||
80 | mul_1x1_ialu: | ||
81 | mov $a0,#0 | ||
82 | bic $a1,$a,#3<<30 @ a1=a&0x3fffffff | ||
83 | str $a0,[sp,#0] @ tab[0]=0 | ||
84 | add $a2,$a1,$a1 @ a2=a1<<1 | ||
85 | str $a1,[sp,#4] @ tab[1]=a1 | ||
86 | eor $a12,$a1,$a2 @ a1^a2 | ||
87 | str $a2,[sp,#8] @ tab[2]=a2 | ||
88 | mov $a4,$a1,lsl#2 @ a4=a1<<2 | ||
89 | str $a12,[sp,#12] @ tab[3]=a1^a2 | ||
90 | eor $a14,$a1,$a4 @ a1^a4 | ||
91 | str $a4,[sp,#16] @ tab[4]=a4 | ||
92 | eor $a0,$a2,$a4 @ a2^a4 | ||
93 | str $a14,[sp,#20] @ tab[5]=a1^a4 | ||
94 | eor $a12,$a12,$a4 @ a1^a2^a4 | ||
95 | str $a0,[sp,#24] @ tab[6]=a2^a4 | ||
96 | and $i0,$mask,$b,lsl#2 | ||
97 | str $a12,[sp,#28] @ tab[7]=a1^a2^a4 | ||
98 | @ tab[] is complete: look it up with successive 3-bit digits of b and xor the shifted entries into hi:lo | ||
99 | and $i1,$mask,$b,lsr#1 | ||
100 | ldr $lo,[sp,$i0] @ tab[b & 0x7] | ||
101 | and $i0,$mask,$b,lsr#4 | ||
102 | ldr $t1,[sp,$i1] @ tab[b >> 3 & 0x7] | ||
103 | and $i1,$mask,$b,lsr#7 | ||
104 | ldr $t0,[sp,$i0] @ tab[b >> 6 & 0x7] | ||
105 | eor $lo,$lo,$t1,lsl#3 @ stall | ||
106 | mov $hi,$t1,lsr#29 | ||
107 | ldr $t1,[sp,$i1] @ tab[b >> 9 & 0x7] | ||
108 | |||
109 | and $i0,$mask,$b,lsr#10 | ||
110 | eor $lo,$lo,$t0,lsl#6 | ||
111 | eor $hi,$hi,$t0,lsr#26 | ||
112 | ldr $t0,[sp,$i0] @ tab[b >> 12 & 0x7] | ||
113 | |||
114 | and $i1,$mask,$b,lsr#13 | ||
115 | eor $lo,$lo,$t1,lsl#9 | ||
116 | eor $hi,$hi,$t1,lsr#23 | ||
117 | ldr $t1,[sp,$i1] @ tab[b >> 15 & 0x7] | ||
118 | |||
119 | and $i0,$mask,$b,lsr#16 | ||
120 | eor $lo,$lo,$t0,lsl#12 | ||
121 | eor $hi,$hi,$t0,lsr#20 | ||
122 | ldr $t0,[sp,$i0] @ tab[b >> 18 & 0x7] | ||
123 | |||
124 | and $i1,$mask,$b,lsr#19 | ||
125 | eor $lo,$lo,$t1,lsl#15 | ||
126 | eor $hi,$hi,$t1,lsr#17 | ||
127 | ldr $t1,[sp,$i1] @ tab[b >> 21 & 0x7] | ||
128 | |||
129 | and $i0,$mask,$b,lsr#22 | ||
130 | eor $lo,$lo,$t0,lsl#18 | ||
131 | eor $hi,$hi,$t0,lsr#14 | ||
132 | ldr $t0,[sp,$i0] @ tab[b >> 24 & 0x7] | ||
133 | |||
134 | and $i1,$mask,$b,lsr#25 | ||
135 | eor $lo,$lo,$t1,lsl#21 | ||
136 | eor $hi,$hi,$t1,lsr#11 | ||
137 | ldr $t1,[sp,$i1] @ tab[b >> 27 & 0x7] | ||
138 | |||
139 | tst $a,#1<<30 @ bits 30 and 31 of a were masked out of tab[], so b<<30 and b<<31 are folded in by hand | ||
140 | and $i0,$mask,$b,lsr#28 | ||
141 | eor $lo,$lo,$t0,lsl#24 | ||
142 | eor $hi,$hi,$t0,lsr#8 | ||
143 | ldr $t0,[sp,$i0] @ tab[b >> 30 ] | ||
144 | |||
145 | eorne $lo,$lo,$b,lsl#30 | ||
146 | eorne $hi,$hi,$b,lsr#2 | ||
147 | tst $a,#1<<31 | ||
148 | eor $lo,$lo,$t1,lsl#27 | ||
149 | eor $hi,$hi,$t1,lsr#5 | ||
150 | eorne $lo,$lo,$b,lsl#31 | ||
151 | eorne $hi,$hi,$b,lsr#1 | ||
152 | eor $lo,$lo,$t0,lsl#30 | ||
153 | eor $hi,$hi,$t0,lsr#2 | ||
154 | |||
155 | mov pc,lr | ||
156 | .size mul_1x1_ialu,.-mul_1x1_ialu | ||
157 | ___ | ||
158 | ################ | ||
159 | # void bn_GF2m_mul_2x2(BN_ULONG *r, | ||
160 | # BN_ULONG a1,BN_ULONG a0, | ||
161 | # BN_ULONG b1,BN_ULONG b0); # r[3..0]=a1a0·b1b0 | ||
162 | |||
163 | ($A1,$B1,$A0,$B0,$A1B1,$A0B0)=map("d$_",(18..23)); # NEON path keeps its operands and the two partial products in d18-d23 | ||
164 | |||
165 | $code.=<<___; | ||
166 | .global bn_GF2m_mul_2x2 | ||
167 | .type bn_GF2m_mul_2x2,%function | ||
168 | .align 5 | ||
169 | bn_GF2m_mul_2x2: | ||
170 | #if __ARM_ARCH__>=7 | ||
171 | ldr r12,.LOPENSSL_armcap @ pc-relative offset of OPENSSL_armcap_P | ||
172 | .Lpic: ldr r12,[pc,r12] @ r12 = OPENSSL_armcap_P | ||
173 | tst r12,#1 @ bit 0 clear: take the integer-only path (presumably the NEON capability bit, see arm_arch.h) | ||
174 | beq .Lialu | ||
175 | |||
176 | veor $A1,$A1 | ||
177 | vmov $B1,r3,r3 @ two copies of b1 | ||
178 | vmov.32 ${A1}[0],r1 @ a1 | ||
179 | |||
180 | veor $A0,$A0 | ||
181 | vld1.32 ${B0}[],[sp,:32] @ two copies of b0 | ||
182 | vmov.32 ${A0}[0],r2 @ a0 | ||
183 | mov r12,lr @ bl overwrites lr, so the return address is kept in r12 | ||
184 | |||
185 | vmov d16,$A1 | ||
186 | vmov d17,$B1 | ||
187 | bl mul_1x1_neon @ a1·b1 | ||
188 | vmov $A1B1,d0 | ||
189 | |||
190 | vmov d16,$A0 | ||
191 | vmov d17,$B0 | ||
192 | bl mul_1x1_neon @ a0·b0 | ||
193 | vmov $A0B0,d0 | ||
194 | |||
195 | veor d16,$A0,$A1 | ||
196 | veor d17,$B0,$B1 | ||
197 | veor $A0,$A0B0,$A1B1 | ||
198 | bl mul_1x1_neon @ (a0+a1)·(b0+b1) | ||
199 | |||
200 | veor d0,$A0 @ (a0+a1)·(b0+b1)-a0·b0-a1·b1 | ||
201 | vshl.u64 d1,d0,#32 | ||
202 | vshr.u64 d0,d0,#32 | ||
203 | veor $A0B0,d1 | ||
204 | veor $A1B1,d0 | ||
205 | vst1.32 {${A0B0}[0]},[r0,:32]! @ store r[0..3], lowest word first | ||
206 | vst1.32 {${A0B0}[1]},[r0,:32]! | ||
207 | vst1.32 {${A1B1}[0]},[r0,:32]! | ||
208 | vst1.32 {${A1B1}[1]},[r0,:32] | ||
209 | bx r12 | ||
210 | .align 4 | ||
211 | .Lialu: | ||
212 | #endif | ||
213 | ___ | ||
214 | $ret="r10"; # reassigned 1st argument | ||
215 | $code.=<<___; | ||
216 | stmdb sp!,{r4-r10,lr} | ||
217 | mov $ret,r0 @ reassign 1st argument | ||
218 | mov $b,r3 @ $b=b1 | ||
219 | ldr r3,[sp,#32] @ load b0 | ||
220 | mov $mask,#7<<2 | ||
221 | sub sp,sp,#32 @ allocate tab[8] | ||
222 | |||
223 | bl mul_1x1_ialu @ a1·b1 | ||
224 | str $lo,[$ret,#8] | ||
225 | str $hi,[$ret,#12] | ||
226 | |||
227 | eor $b,$b,r3 @ flip b0 and b1 | ||
228 | eor $a,$a,r2 @ flip a0 and a1 | ||
229 | eor r3,r3,$b | ||
230 | eor r2,r2,$a | ||
231 | eor $b,$b,r3 | ||
232 | eor $a,$a,r2 | ||
233 | bl mul_1x1_ialu @ a0·b0 | ||
234 | str $lo,[$ret] | ||
235 | str $hi,[$ret,#4] | ||
236 | |||
237 | eor $a,$a,r2 | ||
238 | eor $b,$b,r3 | ||
239 | bl mul_1x1_ialu @ (a1+a0)·(b1+b0) | ||
240 | ___ | ||
241 | @r=map("r$_",(6..9)); # r6-r9 receive the four result words stored so far | ||
242 | $code.=<<___; | ||
243 | ldmia $ret,{@r[0]-@r[3]} | ||
244 | eor $lo,$lo,$hi | ||
245 | eor $hi,$hi,@r[1] | ||
246 | eor $lo,$lo,@r[0] | ||
247 | eor $hi,$hi,@r[2] | ||
248 | eor $lo,$lo,@r[3] | ||
249 | eor $hi,$hi,@r[3] | ||
250 | str $hi,[$ret,#8] | ||
251 | eor $lo,$lo,$hi | ||
252 | add sp,sp,#32 @ destroy tab[8] | ||
253 | str $lo,[$ret,#4] | ||
254 | |||
255 | #if __ARM_ARCH__>=5 | ||
256 | ldmia sp!,{r4-r10,pc} | ||
257 | #else | ||
258 | ldmia sp!,{r4-r10,lr} | ||
259 | tst lr,#1 | ||
260 | moveq pc,lr @ be binary compatible with V4, yet | ||
261 | bx lr @ interoperable with Thumb ISA:-) | ||
262 | #endif | ||
263 | .size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2 | ||
264 | #if __ARM_ARCH__>=7 | ||
265 | .align 5 | ||
266 | .LOPENSSL_armcap: | ||
267 | .word OPENSSL_armcap_P-(.Lpic+8) @ +8: in ARM state pc reads as the address of the ldr at .Lpic plus 8 | ||
268 | #endif | ||
269 | .asciz "GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>" | ||
270 | .align 5 | ||
271 | |||
272 | .comm OPENSSL_armcap_P,4,4 | ||
273 | ___ | ||
274 | |||
275 | $code =~ s/\`([^\`]*)\`/eval $1/gem; # expand the `...` Perl snippets embedded in the templates | ||
276 | $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 | ||
277 | print $code; | ||
278 | close STDOUT or die "error closing STDOUT: $!"; # enforce flush, and fail instead of leaving a truncated file behind | ||
diff --git a/src/lib/libcrypto/bn/asm/x86-gf2m.pl b/src/lib/libcrypto/bn/asm/x86-gf2m.pl deleted file mode 100644 index cb2f2a5c30..0000000000 --- a/src/lib/libcrypto/bn/asm/x86-gf2m.pl +++ /dev/null | |||
@@ -1,312 +0,0 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | # | ||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | # | ||
10 | # May 2011 | ||
11 | # | ||
12 | # The module implements bn_GF2m_mul_2x2 polynomial multiplication used | ||
13 | # in bn_gf2m.c. It's kind of low-hanging mechanical port from C for | ||
14 | # the time being... Except that it has three code paths: pure integer | ||
15 | # code suitable for any x86 CPU, MMX code suitable for PIII and later | ||
16 | # and PCLMULQDQ suitable for Westmere and later. Improvement varies | ||
17 | # from one benchmark and µ-arch to another. Below are interval values | ||
18 | # for 163- and 571-bit ECDH benchmarks relative to compiler-generated | ||
19 | # code: | ||
20 | # | ||
21 | # PIII 16%-30% | ||
22 | # P4 12%-12% | ||
23 | # Opteron 18%-40% | ||
24 | # Core2 19%-44% | ||
25 | # Atom 38%-64% | ||
26 | # Westmere 53%-121%(PCLMULQDQ)/20%-32%(MMX) | ||
27 | # Sandy Bridge 72%-127%(PCLMULQDQ)/27%-23%(MMX) | ||
28 | # | ||
29 | # Note that above improvement coefficients are not coefficients for | ||
30 | # bn_GF2m_mul_2x2 itself. For example 120% ECDH improvement is result | ||
31 | # of bn_GF2m_mul_2x2 being >4x faster. As it gets faster, benchmark | ||
32 | # is more and more dominated by other subroutines, most notably by | ||
33 | # BN_GF2m_mod[_mul]_arr... | ||
34 | |||
35 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; # directory part of this script's path | ||
36 | push(@INC,"${dir}","${dir}../../perlasm"); | ||
37 | require "x86asm.pl"; | ||
38 | |||
39 | &asm_init($ARGV[0],$0,$x86only = $ARGV[$#ARGV] eq "386"); # a trailing "386" argument selects the integer-only build | ||
40 | |||
41 | $sse2=0; | ||
42 | for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } # this flag enables the PCLMULQDQ path in bn_GF2m_mul_2x2 | ||
43 | |||
44 | &external_label("OPENSSL_ia32cap_P") if ($sse2); | ||
45 | |||
46 | $a="eax"; | ||
47 | $b="ebx"; | ||
48 | ($a1,$a2,$a4)=("ecx","edx","ebp"); | ||
49 | |||
50 | $R="mm0"; | ||
51 | @T=("mm1","mm2"); | ||
52 | ($A,$B,$B30,$B31)=("mm2","mm3","mm4","mm5"); # note that mm2 is shared by @T[1] and $A | ||
53 | @i=("esi","edi"); | ||
54 | |||
55 | if (!$x86only) { # MMX flavour; left out of the integer-only ("386") build | ||
56 | &function_begin_B("_mul_1x1_mmx"); # in: $a (eax), $b (ebx); out: $R (mm0) | ||
57 | &sub ("esp",32+4); # the first 32 bytes hold tab[8] | ||
58 | &mov ($a1,$a); | ||
59 | &lea ($a2,&DWP(0,$a,$a)); | ||
60 | &and ($a1,0x3fffffff); | ||
61 | &lea ($a4,&DWP(0,$a2,$a2)); | ||
62 | &mov (&DWP(0*4,"esp"),0); # tab[0]=0 | ||
63 | &and ($a2,0x7fffffff); | ||
64 | &movd ($A,$a); | ||
65 | &movd ($B,$b); | ||
66 | &mov (&DWP(1*4,"esp"),$a1); # a1 | ||
67 | &xor ($a1,$a2); # a1^a2 | ||
68 | &pxor ($B31,$B31); | ||
69 | &pxor ($B30,$B30); | ||
70 | &mov (&DWP(2*4,"esp"),$a2); # a2 | ||
71 | &xor ($a2,$a4); # a2^a4 | ||
72 | &mov (&DWP(3*4,"esp"),$a1); # a1^a2 | ||
73 | &pcmpgtd($B31,$A); # broadcast 31st bit | ||
74 | &paddd ($A,$A); # $A<<=1 | ||
75 | &xor ($a1,$a2); # a1^a4=a1^a2^a2^a4 | ||
76 | &mov (&DWP(4*4,"esp"),$a4); # a4 | ||
77 | &xor ($a4,$a2); # a2=a4^a2^a4 | ||
78 | &pand ($B31,$B); | ||
79 | &pcmpgtd($B30,$A); # broadcast 30th bit | ||
80 | &mov (&DWP(5*4,"esp"),$a1); # a1^a4 | ||
81 | &xor ($a4,$a1); # a1^a2^a4 | ||
82 | &psllq ($B31,31); | ||
83 | &pand ($B30,$B); | ||
84 | &mov (&DWP(6*4,"esp"),$a2); # a2^a4 | ||
85 | &mov (@i[0],0x7); | ||
86 | &mov (&DWP(7*4,"esp"),$a4); # a1^a2^a4 | ||
87 | &mov ($a4,@i[0]); # $a4 is free now and keeps the 0x7 digit mask | ||
88 | &and (@i[0],$b); | ||
89 | &shr ($b,3); | ||
90 | &mov (@i[1],$a4); | ||
91 | &psllq ($B30,30); | ||
92 | &and (@i[1],$b); | ||
93 | &shr ($b,3); | ||
94 | &movd ($R,&DWP(0,"esp",@i[0],4)); | ||
95 | &mov (@i[0],$a4); | ||
96 | &and (@i[0],$b); | ||
97 | &shr ($b,3); | ||
98 | for($n=1;$n<9;$n++) { # table entries for the next eight 3-bit digits of b, shifted by 3*$n | ||
99 | &movd (@T[1],&DWP(0,"esp",@i[1],4)); | ||
100 | &mov (@i[1],$a4); | ||
101 | &psllq (@T[1],3*$n); | ||
102 | &and (@i[1],$b); | ||
103 | &shr ($b,3); | ||
104 | &pxor ($R,@T[1]); | ||
105 | |||
106 | push(@i,shift(@i)); push(@T,shift(@T)); # swap the two index and the two temporary registers for the next pass | ||
107 | } | ||
108 | &movd (@T[1],&DWP(0,"esp",@i[1],4)); # last two digits: $n is 9 and then 10, i.e. shifts of 27 and 30 | ||
109 | &pxor ($R,$B30); | ||
110 | &psllq (@T[1],3*$n++); | ||
111 | &pxor ($R,@T[1]); | ||
112 | |||
113 | &movd (@T[0],&DWP(0,"esp",@i[0],4)); | ||
114 | &pxor ($R,$B31); | ||
115 | &psllq (@T[0],3*$n); | ||
116 | &add ("esp",32+4); | ||
117 | &pxor ($R,@T[0]); | ||
118 | &ret (); | ||
119 | &function_end_B("_mul_1x1_mmx"); | ||
120 | } | ||
121 | |||
122 | ($lo,$hi)=("eax","edx"); # result pair; $lo is the same register as $a | ||
123 | @T=("ecx","ebp"); | ||
124 | |||
125 | &function_begin_B("_mul_1x1_ialu"); # in: $a (eax), $b (ebx); out: $hi:$lo (edx:eax) | ||
126 | &sub ("esp",32+4); # the first 32 bytes hold tab[8] | ||
127 | &mov ($a1,$a); | ||
128 | &lea ($a2,&DWP(0,$a,$a)); | ||
129 | &lea ($a4,&DWP(0,"",$a,4)); | ||
130 | &and ($a1,0x3fffffff); | ||
131 | &lea (@i[1],&DWP(0,$lo,$lo)); | ||
132 | &sar ($lo,31); # broadcast 31st bit | ||
133 | &mov (&DWP(0*4,"esp"),0); # tab[0]=0 | ||
134 | &and ($a2,0x7fffffff); | ||
135 | &mov (&DWP(1*4,"esp"),$a1); # a1 | ||
136 | &xor ($a1,$a2); # a1^a2 | ||
137 | &mov (&DWP(2*4,"esp"),$a2); # a2 | ||
138 | &xor ($a2,$a4); # a2^a4 | ||
139 | &mov (&DWP(3*4,"esp"),$a1); # a1^a2 | ||
140 | &xor ($a1,$a2); # a1^a4=a1^a2^a2^a4 | ||
141 | &mov (&DWP(4*4,"esp"),$a4); # a4 | ||
142 | &xor ($a4,$a2); # a2=a4^a2^a4 | ||
143 | &mov (&DWP(5*4,"esp"),$a1); # a1^a4 | ||
144 | &xor ($a4,$a1); # a1^a2^a4 | ||
145 | &sar (@i[1],31); # broadcast 30th bit | ||
146 | &and ($lo,$b); | ||
147 | &mov (&DWP(6*4,"esp"),$a2); # a2^a4 | ||
148 | &and (@i[1],$b); | ||
149 | &mov (&DWP(7*4,"esp"),$a4); # a1^a2^a4 | ||
150 | &mov ($hi,$lo); | ||
151 | &shl ($lo,31); | ||
152 | &mov (@T[0],@i[1]); | ||
153 | &shr ($hi,1); | ||
154 | |||
155 | &mov (@i[0],0x7); | ||
156 | &shl (@i[1],30); | ||
157 | &and (@i[0],$b); | ||
158 | &shr (@T[0],2); | ||
159 | &xor ($lo,@i[1]); | ||
160 | |||
161 | &shr ($b,3); | ||
162 | &mov (@i[1],0x7); # 5-byte instruction!? | ||
163 | &and (@i[1],$b); | ||
164 | &shr ($b,3); | ||
165 | &xor ($hi,@T[0]); | ||
166 | &xor ($lo,&DWP(0,"esp",@i[0],4)); | ||
167 | &mov (@i[0],0x7); | ||
168 | &and (@i[0],$b); | ||
169 | &shr ($b,3); | ||
170 | for($n=1;$n<9;$n++) { # next eight 3-bit digits of b: entry<<3*$n goes to $lo, the bits shifted out go to $hi | ||
171 | &mov (@T[1],&DWP(0,"esp",@i[1],4)); | ||
172 | &mov (@i[1],0x7); | ||
173 | &mov (@T[0],@T[1]); | ||
174 | &shl (@T[1],3*$n); | ||
175 | &and (@i[1],$b); | ||
176 | &shr (@T[0],32-3*$n); | ||
177 | &xor ($lo,@T[1]); | ||
178 | &shr ($b,3); | ||
179 | &xor ($hi,@T[0]); | ||
180 | |||
181 | push(@i,shift(@i)); push(@T,shift(@T)); # swap the two index and the two temporary registers for the next pass | ||
182 | } | ||
183 | &mov (@T[1],&DWP(0,"esp",@i[1],4)); # last two digits: $n is 9 and then 10, i.e. shifts of 27 and 30 | ||
184 | &mov (@T[0],@T[1]); | ||
185 | &shl (@T[1],3*$n); | ||
186 | &mov (@i[1],&DWP(0,"esp",@i[0],4)); | ||
187 | &shr (@T[0],32-3*$n); $n++; | ||
188 | &mov (@i[0],@i[1]); | ||
189 | &xor ($lo,@T[1]); | ||
190 | &shl (@i[1],3*$n); | ||
191 | &xor ($hi,@T[0]); | ||
192 | &shr (@i[0],32-3*$n); | ||
193 | &xor ($lo,@i[1]); | ||
194 | &xor ($hi,@i[0]); | ||
195 | |||
196 | &add ("esp",32+4); | ||
197 | &ret (); | ||
198 | &function_end_B("_mul_1x1_ialu"); | ||
199 | |||
200 | # void bn_GF2m_mul_2x2(BN_ULONG *r, BN_ULONG a1, BN_ULONG a0, BN_ULONG b1, BN_ULONG b0); | ||
201 | &function_begin_B("bn_GF2m_mul_2x2"); | ||
202 | if (!$x86only) { # run-time dispatch on OPENSSL_ia32cap_P: PCLMULQDQ, then MMX, then plain integer code | ||
203 | &picsetup("edx"); | ||
204 | &picsymbol("edx", "OPENSSL_ia32cap_P", "edx"); | ||
205 | &mov ("eax",&DWP(0,"edx")); | ||
206 | &mov ("edx",&DWP(4,"edx")); | ||
207 | &test ("eax","\$IA32CAP_MASK0_MMX"); # check MMX bit | ||
208 | &jz (&label("ialu")); | ||
209 | if ($sse2) { | ||
210 | &test ("eax","\$IA32CAP_MASK0_FXSR"); # check FXSR bit | ||
211 | &jz (&label("mmx")); | ||
212 | &test ("edx","\$IA32CAP_MASK1_PCLMUL"); # check PCLMULQDQ bit | ||
213 | &jz (&label("mmx")); | ||
214 | |||
215 | &movups ("xmm0",&QWP(8,"esp")); # nothing pushed yet: 8(esp) is a1, so xmm0 = a1,a0,b1,b0 | ||
216 | &shufps ("xmm0","xmm0",0b10110001); # swap the dwords within each half: a0,a1,b0,b1 | ||
217 | &pclmulqdq ("xmm0","xmm0",1); # carry-less multiply of the two 64-bit halves | ||
218 | &mov ("eax",&DWP(4,"esp")); # r | ||
219 | &movups (&QWP(0,"eax"),"xmm0"); | ||
220 | &ret (); | ||
221 | |||
222 | &set_label("mmx",16); | ||
223 | } | ||
224 | &push ("ebp"); | ||
225 | &push ("ebx"); | ||
226 | &push ("esi"); | ||
227 | &push ("edi"); | ||
228 | &mov ($a,&wparam(1)); | ||
229 | &mov ($b,&wparam(3)); | ||
230 | &call ("_mul_1x1_mmx"); # a1·b1 | ||
231 | &movq ("mm7",$R); | ||
232 | |||
233 | &mov ($a,&wparam(2)); | ||
234 | &mov ($b,&wparam(4)); | ||
235 | &call ("_mul_1x1_mmx"); # a0·b0 | ||
236 | &movq ("mm6",$R); | ||
237 | |||
238 | &mov ($a,&wparam(1)); | ||
239 | &mov ($b,&wparam(3)); | ||
240 | &xor ($a,&wparam(2)); | ||
241 | &xor ($b,&wparam(4)); | ||
242 | &call ("_mul_1x1_mmx"); # (a0+a1)·(b0+b1) | ||
243 | &pxor ($R,"mm7"); | ||
244 | &mov ($a,&wparam(0)); # r | ||
245 | &pxor ($R,"mm6"); # (a0+a1)·(b0+b1)-a1·b1-a0·b0 | ||
246 | |||
247 | &movq ($A,$R); | ||
248 | &psllq ($R,32); | ||
249 | &pop ("edi"); | ||
250 | &psrlq ($A,32); | ||
251 | &pop ("esi"); | ||
252 | &pxor ($R,"mm6"); | ||
253 | &pop ("ebx"); | ||
254 | &pxor ($A,"mm7"); | ||
255 | &movq (&QWP(0,$a),$R); | ||
256 | &pop ("ebp"); | ||
257 | &movq (&QWP(8,$a),$A); | ||
258 | &emms (); | ||
259 | &ret (); | ||
260 | &set_label("ialu",16); | ||
261 | } | ||
262 | &push ("ebp"); | ||
263 | &push ("ebx"); | ||
264 | &push ("esi"); | ||
265 | &push ("edi"); | ||
266 | &stack_push(4+1); # scratch area that holds the a1·b1 and a0·b0 products stored below | ||
267 | |||
268 | &mov ($a,&wparam(1)); | ||
269 | &mov ($b,&wparam(3)); | ||
270 | &call ("_mul_1x1_ialu"); # a1·b1 | ||
271 | &mov (&DWP(8,"esp"),$lo); | ||
272 | &mov (&DWP(12,"esp"),$hi); | ||
273 | |||
274 | &mov ($a,&wparam(2)); | ||
275 | &mov ($b,&wparam(4)); | ||
276 | &call ("_mul_1x1_ialu"); # a0·b0 | ||
277 | &mov (&DWP(0,"esp"),$lo); | ||
278 | &mov (&DWP(4,"esp"),$hi); | ||
279 | |||
280 | &mov ($a,&wparam(1)); | ||
281 | &mov ($b,&wparam(3)); | ||
282 | &xor ($a,&wparam(2)); | ||
283 | &xor ($b,&wparam(4)); | ||
284 | &call ("_mul_1x1_ialu"); # (a0+a1)·(b0+b1) | ||
285 | |||
286 | &mov ("ebp",&wparam(0)); # r | ||
287 | @r=("ebx","ecx","edi","esi"); # the four words of a0·b0 and a1·b1 saved above | ||
288 | &mov (@r[0],&DWP(0,"esp")); | ||
289 | &mov (@r[1],&DWP(4,"esp")); | ||
290 | &mov (@r[2],&DWP(8,"esp")); | ||
291 | &mov (@r[3],&DWP(12,"esp")); | ||
292 | |||
293 | &xor ($lo,$hi); | ||
294 | &xor ($hi,@r[1]); | ||
295 | &xor ($lo,@r[0]); | ||
296 | &mov (&DWP(0,"ebp"),@r[0]); # r[0] | ||
297 | &xor ($hi,@r[2]); | ||
298 | &mov (&DWP(12,"ebp"),@r[3]); # r[3] | ||
299 | &xor ($lo,@r[3]); | ||
300 | &stack_pop(4+1); | ||
301 | &xor ($hi,@r[3]); | ||
302 | &pop ("edi"); | ||
303 | &xor ($lo,$hi); | ||
304 | &pop ("esi"); | ||
305 | &mov (&DWP(8,"ebp"),$hi); # r[2] | ||
306 | &pop ("ebx"); | ||
307 | &mov (&DWP(4,"ebp"),$lo); # r[1] | ||
308 | &pop ("ebp"); | ||
309 | &ret (); | ||
310 | &function_end_B("bn_GF2m_mul_2x2"); | ||
311 | |||
312 | &asm_finish(); | ||
diff --git a/src/lib/libcrypto/bn/asm/x86_64-gf2m.pl b/src/lib/libcrypto/bn/asm/x86_64-gf2m.pl deleted file mode 100644 index 6985725b20..0000000000 --- a/src/lib/libcrypto/bn/asm/x86_64-gf2m.pl +++ /dev/null | |||
@@ -1,390 +0,0 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | # | ||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | # | ||
10 | # May 2011 | ||
11 | # | ||
12 | # The module implements bn_GF2m_mul_2x2 polynomial multiplication used | ||
13 | # in bn_gf2m.c. It's kind of low-hanging mechanical port from C for | ||
14 | # the time being... Except that it has two code paths: code suitable | ||
15 | # for any x86_64 CPU and PCLMULQDQ one suitable for Westmere and | ||
16 | # later. Improvement varies from one benchmark and µ-arch to another. | ||
17 | # Vanilla code path is at most 20% faster than compiler-generated code | ||
18 | # [not very impressive], while PCLMULQDQ - whole 85%-160% better on | ||
19 | # 163- and 571-bit ECDH benchmarks on Intel CPUs. Keep in mind that | ||
20 | # these coefficients are not ones for bn_GF2m_mul_2x2 itself, as not | ||
21 | # all CPU time is burnt in it... | ||
22 | |||
23 | $flavour = shift; | ||
24 | $output = shift; | ||
25 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } # a first argument containing a dot is the output file, not a flavour | ||
26 | |||
27 | $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); | ||
28 | |||
29 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
30 | ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | ||
31 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | ||
32 | die "can't locate x86_64-xlate.pl"; | ||
33 | |||
34 | open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!"; # everything printed below is piped through the translator | ||
35 | *STDOUT=*OUT; | ||
36 | |||
37 | ($lo,$hi)=("%rax","%rdx"); $a=$lo; | ||
38 | ($i0,$i1)=("%rsi","%rdi"); | ||
39 | ($t0,$t1)=("%rbx","%rcx"); | ||
40 | ($b,$mask)=("%rbp","%r8"); | ||
41 | ($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(9..14)); # six names, six registers (the former seventh value, %r15, was never used) | ||
42 | ($R,$Tx)=("%xmm0","%xmm1"); | ||
43 | |||
44 | $code.=<<___; # _mul_1x1: in $a (%rax), $b (%rbp), $mask (%r8, the caller loads 0xf); out $hi:$lo (%rdx:%rax) | ||
45 | .text | ||
46 | |||
47 | .type _mul_1x1,\@abi-omnipotent | ||
48 | .align 16 | ||
49 | _mul_1x1: | ||
50 | sub \$128+8,%rsp # the first 128 bytes hold tab[16] | ||
51 | mov \$-1,$a1 | ||
52 | lea ($a,$a),$i0 | ||
53 | shr \$3,$a1 | ||
54 | lea (,$a,4),$i1 | ||
55 | and $a,$a1 # a1=a&0x1fffffffffffffff | ||
56 | lea (,$a,8),$a8 | ||
57 | sar \$63,$a # broadcast 63rd bit | ||
58 | lea ($a1,$a1),$a2 | ||
59 | sar \$63,$i0 # broadcast 62nd bit | ||
60 | lea (,$a1,4),$a4 | ||
61 | and $b,$a | ||
62 | sar \$63,$i1 # broadcast 61st bit | ||
63 | mov $a,$hi # $a is $lo | ||
64 | shl \$63,$lo | ||
65 | and $b,$i0 | ||
66 | shr \$1,$hi | ||
67 | mov $i0,$t1 | ||
68 | shl \$62,$i0 | ||
69 | and $b,$i1 | ||
70 | shr \$2,$t1 | ||
71 | xor $i0,$lo | ||
72 | mov $i1,$t0 | ||
73 | shl \$61,$i1 | ||
74 | xor $t1,$hi | ||
75 | shr \$3,$t0 | ||
76 | xor $i1,$lo | ||
77 | xor $t0,$hi | ||
78 | |||
79 | mov $a1,$a12 | ||
80 | movq \$0,0(%rsp) # tab[0]=0 | ||
81 | xor $a2,$a12 # a1^a2 | ||
82 | mov $a1,8(%rsp) # tab[1]=a1 | ||
83 | mov $a4,$a48 | ||
84 | mov $a2,16(%rsp) # tab[2]=a2 | ||
85 | xor $a8,$a48 # a4^a8 | ||
86 | mov $a12,24(%rsp) # tab[3]=a1^a2 | ||
87 | |||
88 | xor $a4,$a1 | ||
89 | mov $a4,32(%rsp) # tab[4]=a4 | ||
90 | xor $a4,$a2 | ||
91 | mov $a1,40(%rsp) # tab[5]=a1^a4 | ||
92 | xor $a4,$a12 | ||
93 | mov $a2,48(%rsp) # tab[6]=a2^a4 | ||
94 | xor $a48,$a1 # a1^a4^a4^a8=a1^a8 | ||
95 | mov $a12,56(%rsp) # tab[7]=a1^a2^a4 | ||
96 | xor $a48,$a2 # a2^a4^a4^a8=a2^a8 | ||
97 | |||
98 | mov $a8,64(%rsp) # tab[8]=a8 | ||
99 | xor $a48,$a12 # a1^a2^a4^a4^a8=a1^a2^a8 | ||
100 | mov $a1,72(%rsp) # tab[9]=a1^a8 | ||
101 | xor $a4,$a1 # a1^a8^a4 | ||
102 | mov $a2,80(%rsp) # tab[10]=a2^a8 | ||
103 | xor $a4,$a2 # a2^a8^a4 | ||
104 | mov $a12,88(%rsp) # tab[11]=a1^a2^a8 | ||
105 | |||
106 | xor $a4,$a12 # a1^a2^a8^a4 | ||
107 | mov $a48,96(%rsp) # tab[12]=a4^a8 | ||
108 | mov $mask,$i0 | ||
109 | mov $a1,104(%rsp) # tab[13]=a1^a4^a8 | ||
110 | and $b,$i0 | ||
111 | mov $a2,112(%rsp) # tab[14]=a2^a4^a8 | ||
112 | shr \$4,$b | ||
113 | mov $a12,120(%rsp) # tab[15]=a1^a2^a4^a8 | ||
114 | mov $mask,$i1 | ||
115 | and $b,$i1 | ||
116 | shr \$4,$b | ||
117 | |||
118 | movq (%rsp,$i0,8),$R # half of calculations is done in SSE2 | ||
119 | mov $mask,$i0 | ||
120 | and $b,$i0 | ||
121 | shr \$4,$b | ||
122 | ___ | ||
123 | for ($n=1;$n<8;$n++) { # each pass consumes two 4-bit digits of b: one through the integer registers, one through SSE2 | ||
124 | $code.=<<___; | ||
125 | mov (%rsp,$i1,8),$t1 | ||
126 | mov $mask,$i1 | ||
127 | mov $t1,$t0 | ||
128 | shl \$`8*$n-4`,$t1 | ||
129 | and $b,$i1 | ||
130 | movq (%rsp,$i0,8),$Tx | ||
131 | shr \$`64-(8*$n-4)`,$t0 | ||
132 | xor $t1,$lo | ||
133 | pslldq \$$n,$Tx | ||
134 | mov $mask,$i0 | ||
135 | shr \$4,$b | ||
136 | xor $t0,$hi | ||
137 | and $b,$i0 | ||
138 | shr \$4,$b | ||
139 | pxor $Tx,$R | ||
140 | ___ | ||
141 | } | ||
142 | $code.=<<___; # final integer digit ($n is 8 here), then fold the SSE2 accumulator into $hi:$lo | ||
143 | mov (%rsp,$i1,8),$t1 | ||
144 | mov $t1,$t0 | ||
145 | shl \$`8*$n-4`,$t1 | ||
146 | movd $R,$i0 | ||
147 | shr \$`64-(8*$n-4)`,$t0 | ||
148 | xor $t1,$lo | ||
149 | psrldq \$8,$R | ||
150 | xor $t0,$hi | ||
151 | movd $R,$i1 | ||
152 | xor $i0,$lo | ||
153 | xor $i1,$hi | ||
154 | |||
155 | add \$128+8,%rsp | ||
156 | ret | ||
157 | .Lend_mul_1x1: | ||
158 | .size _mul_1x1,.-_mul_1x1 | ||
159 | ___ | ||
160 | |||
161 | ($rp,$a1,$a0,$b1,$b0) = $win64? ("%rcx","%rdx","%r8", "%r9","%r10") : # Win64 order | ||
162 | ("%rdi","%rsi","%rdx","%rcx","%r8"); # Unix order | ||
163 | # under Win64 $b0 is not an argument register: the code below loads the fifth argument into it from the stack | ||
164 | $code.=<<___; | ||
165 | .extern OPENSSL_ia32cap_P | ||
166 | .hidden OPENSSL_ia32cap_P | ||
167 | .globl bn_GF2m_mul_2x2 | ||
168 | .type bn_GF2m_mul_2x2,\@abi-omnipotent | ||
169 | .align 16 | ||
170 | bn_GF2m_mul_2x2: | ||
171 | mov OPENSSL_ia32cap_P+4(%rip),%eax | ||
172 | bt \$IA32CAP_BIT1_PCLMUL,%eax | ||
173 | jnc .Lvanilla_mul_2x2 | ||
174 | |||
175 | movd $a1,%xmm0 | ||
176 | movd $b1,%xmm1 | ||
177 | movd $a0,%xmm2 | ||
178 | ___ | ||
179 | $code.=<<___ if ($win64); | ||
180 | movq 40(%rsp),%xmm3 # b0, the fifth argument, is on the stack | ||
181 | ___ | ||
182 | $code.=<<___ if (!$win64); | ||
183 | movd $b0,%xmm3 | ||
184 | ___ | ||
185 | $code.=<<___; | ||
186 | movdqa %xmm0,%xmm4 | ||
187 | movdqa %xmm1,%xmm5 | ||
188 | pclmulqdq \$0,%xmm1,%xmm0 # a1·b1 | ||
189 | pxor %xmm2,%xmm4 | ||
190 | pxor %xmm3,%xmm5 | ||
191 | pclmulqdq \$0,%xmm3,%xmm2 # a0·b0 | ||
192 | pclmulqdq \$0,%xmm5,%xmm4 # (a0+a1)·(b0+b1) | ||
193 | xorps %xmm0,%xmm4 | ||
194 | xorps %xmm2,%xmm4 # (a0+a1)·(b0+b1)-a0·b0-a1·b1 | ||
195 | movdqa %xmm4,%xmm5 | ||
196 | pslldq \$8,%xmm4 | ||
197 | psrldq \$8,%xmm5 | ||
198 | pxor %xmm4,%xmm2 | ||
199 | pxor %xmm5,%xmm0 | ||
200 | movdqu %xmm2,0($rp) | ||
201 | movdqu %xmm0,16($rp) | ||
202 | ret | ||
203 | |||
204 | .align 16 | ||
205 | .Lvanilla_mul_2x2: | ||
206 | lea -8*17(%rsp),%rsp # 17-quadword frame: results at 0-24, saved arguments at 32-64, saved registers from 8*10 up | ||
207 | ___ | ||
208 | $code.=<<___ if ($win64); | ||
209 | mov `8*17+40`(%rsp),$b0 | ||
210 | mov %rdi,8*15(%rsp) | ||
211 | mov %rsi,8*16(%rsp) | ||
212 | ___ | ||
213 | $code.=<<___; | ||
214 | mov %r14,8*10(%rsp) | ||
215 | mov %r13,8*11(%rsp) | ||
216 | mov %r12,8*12(%rsp) | ||
217 | mov %rbp,8*13(%rsp) | ||
218 | mov %rbx,8*14(%rsp) | ||
219 | .Lbody_mul_2x2: | ||
220 | mov $rp,32(%rsp) # save the arguments | ||
221 | mov $a1,40(%rsp) | ||
222 | mov $a0,48(%rsp) | ||
223 | mov $b1,56(%rsp) | ||
224 | mov $b0,64(%rsp) | ||
225 | |||
226 | mov \$0xf,$mask | ||
227 | mov $a1,$a | ||
228 | mov $b1,$b | ||
229 | call _mul_1x1 # a1·b1 | ||
230 | mov $lo,16(%rsp) | ||
231 | mov $hi,24(%rsp) | ||
232 | |||
233 | mov 48(%rsp),$a | ||
234 | mov 64(%rsp),$b | ||
235 | call _mul_1x1 # a0·b0 | ||
236 | mov $lo,0(%rsp) | ||
237 | mov $hi,8(%rsp) | ||
238 | |||
239 | mov 40(%rsp),$a | ||
240 | mov 56(%rsp),$b | ||
241 | xor 48(%rsp),$a | ||
242 | xor 64(%rsp),$b | ||
243 | call _mul_1x1 # (a0+a1)·(b0+b1) | ||
244 | ___ | ||
245 | @r=("%rbx","%rcx","%rdi","%rsi"); # receive the four words of a0·b0 and a1·b1 saved above | ||
246 | $code.=<<___; | ||
247 | mov 0(%rsp),@r[0] | ||
248 | mov 8(%rsp),@r[1] | ||
249 | mov 16(%rsp),@r[2] | ||
250 | mov 24(%rsp),@r[3] | ||
251 | mov 32(%rsp),%rbp | ||
252 | |||
253 | xor $hi,$lo | ||
254 | xor @r[1],$hi | ||
255 | xor @r[0],$lo | ||
256 | mov @r[0],0(%rbp) | ||
257 | xor @r[2],$hi | ||
258 | mov @r[3],24(%rbp) | ||
259 | xor @r[3],$lo | ||
260 | xor @r[3],$hi | ||
261 | xor $hi,$lo | ||
262 | mov $hi,16(%rbp) | ||
263 | mov $lo,8(%rbp) | ||
264 | |||
265 | mov 8*10(%rsp),%r14 | ||
266 | mov 8*11(%rsp),%r13 | ||
267 | mov 8*12(%rsp),%r12 | ||
268 | mov 8*13(%rsp),%rbp | ||
269 | mov 8*14(%rsp),%rbx | ||
270 | ___ | ||
271 | $code.=<<___ if ($win64); | ||
272 | mov 8*15(%rsp),%rdi | ||
273 | mov 8*16(%rsp),%rsi | ||
274 | ___ | ||
275 | $code.=<<___; | ||
276 | lea 8*17(%rsp),%rsp | ||
277 | ret | ||
278 | .Lend_mul_2x2: | ||
279 | .size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2 | ||
280 | .align 16 | ||
281 | ___ | ||
282 | |||
283 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, | ||
284 | # CONTEXT *context,DISPATCHER_CONTEXT *disp) | ||
285 | if ($win64) { # Win64 only: unwind handler and .pdata/.xdata tables for the two routines above | ||
286 | $rec="%rcx"; | ||
287 | $frame="%rdx"; | ||
288 | $context="%r8"; | ||
289 | $disp="%r9"; | ||
290 | |||
291 | $code.=<<___; | ||
292 | .extern __imp_RtlVirtualUnwind | ||
293 | |||
294 | .type se_handler,\@abi-omnipotent | ||
295 | .align 16 | ||
296 | se_handler: | ||
297 | push %rsi | ||
298 | push %rdi | ||
299 | push %rbx | ||
300 | push %rbp | ||
301 | push %r12 | ||
302 | push %r13 | ||
303 | push %r14 | ||
304 | push %r15 | ||
305 | pushfq | ||
306 | sub \$64,%rsp | ||
307 | |||
308 | mov 152($context),%rax # pull context->Rsp | ||
309 | mov 248($context),%rbx # pull context->Rip | ||
310 | |||
311 | lea .Lbody_mul_2x2(%rip),%r10 | ||
312 | cmp %r10,%rbx # context->Rip<"prologue" label | ||
313 | jb .Lin_prologue | ||
314 | |||
315 | mov 8*10(%rax),%r14 # mimic epilogue | ||
316 | mov 8*11(%rax),%r13 | ||
317 | mov 8*12(%rax),%r12 | ||
318 | mov 8*13(%rax),%rbp | ||
319 | mov 8*14(%rax),%rbx | ||
320 | mov 8*15(%rax),%rdi | ||
321 | mov 8*16(%rax),%rsi | ||
322 | |||
323 | mov %rbx,144($context) # restore context->Rbx | ||
324 | mov %rbp,160($context) # restore context->Rbp | ||
325 | mov %rsi,168($context) # restore context->Rsi | ||
326 | mov %rdi,176($context) # restore context->Rdi | ||
327 | mov %r12,216($context) # restore context->R12 | ||
328 | mov %r13,224($context) # restore context->R13 | ||
329 | mov %r14,232($context) # restore context->R14 | ||
330 | |||
331 | .Lin_prologue: | ||
332 | lea 8*17(%rax),%rax | ||
333 | mov %rax,152($context) # restore context->Rsp | ||
334 | |||
335 | mov 40($disp),%rdi # disp->ContextRecord | ||
336 | mov $context,%rsi # context | ||
337 | mov \$154,%ecx # sizeof(CONTEXT) in quadwords, the unit rep movsq counts in | ||
338 | .long 0xa548f3fc # cld; rep movsq | ||
339 | |||
340 | mov $disp,%rsi | ||
341 | xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | ||
342 | mov 8(%rsi),%rdx # arg2, disp->ImageBase | ||
343 | mov 0(%rsi),%r8 # arg3, disp->ControlPc | ||
344 | mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | ||
345 | mov 40(%rsi),%r10 # disp->ContextRecord | ||
346 | lea 56(%rsi),%r11 # &disp->HandlerData | ||
347 | lea 24(%rsi),%r12 # &disp->EstablisherFrame | ||
348 | mov %r10,32(%rsp) # arg5 | ||
349 | mov %r11,40(%rsp) # arg6 | ||
350 | mov %r12,48(%rsp) # arg7 | ||
351 | mov %rcx,56(%rsp) # arg8, (NULL) | ||
352 | call *__imp_RtlVirtualUnwind(%rip) | ||
353 | |||
354 | mov \$1,%eax # ExceptionContinueSearch | ||
355 | add \$64,%rsp | ||
356 | popfq | ||
357 | pop %r15 | ||
358 | pop %r14 | ||
359 | pop %r13 | ||
360 | pop %r12 | ||
361 | pop %rbp | ||
362 | pop %rbx | ||
363 | pop %rdi | ||
364 | pop %rsi | ||
365 | ret | ||
366 | .size se_handler,.-se_handler | ||
367 | |||
368 | .section .pdata | ||
369 | .align 4 | ||
370 | .rva _mul_1x1 | ||
371 | .rva .Lend_mul_1x1 | ||
372 | .rva .LSEH_info_1x1 | ||
373 | |||
374 | .rva .Lvanilla_mul_2x2 | ||
375 | .rva .Lend_mul_2x2 | ||
376 | .rva .LSEH_info_2x2 | ||
377 | .section .xdata | ||
378 | .align 8 | ||
379 | .LSEH_info_1x1: | ||
380 | .byte 0x01,0x07,0x02,0x00 | ||
381 | .byte 0x07,0x01,0x11,0x00 # sub rsp,128+8 | ||
382 | .LSEH_info_2x2: | ||
383 | .byte 9,0,0,0 # presumably version 1 with UNW_FLAG_EHANDLER set, so the handler RVA follows | ||
384 | .rva se_handler | ||
385 | ___ | ||
386 | } | ||
387 | |||
388 | $code =~ s/\`([^\`]*)\`/eval($1)/gem; # expand the `...` arithmetic snippets embedded in the templates | ||
389 | print $code; | ||
390 | close STDOUT or die "error closing STDOUT: $!"; # closing the pipe waits for x86_64-xlate.pl; fail if it failed or the write did not complete | ||