diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/lib/libcrypto/bn/asm/armv4-gf2m.pl | 278 | ||||
| -rw-r--r-- | src/lib/libcrypto/bn/asm/x86-gf2m.pl | 312 | ||||
| -rw-r--r-- | src/lib/libcrypto/bn/asm/x86_64-gf2m.pl | 390 |
3 files changed, 0 insertions, 980 deletions
diff --git a/src/lib/libcrypto/bn/asm/armv4-gf2m.pl b/src/lib/libcrypto/bn/asm/armv4-gf2m.pl deleted file mode 100644 index 8915924641..0000000000 --- a/src/lib/libcrypto/bn/asm/armv4-gf2m.pl +++ /dev/null | |||
| @@ -1,278 +0,0 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | # | ||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | # | ||
| 10 | # May 2011 | ||
| 11 | # | ||
| 12 | # The module implements bn_GF2m_mul_2x2 polynomial multiplication | ||
| 13 | # used in bn_gf2m.c. It's a kind of low-hanging mechanical port from | ||
| 14 | # C for the time being... Except that it has two code paths: pure | ||
| 15 | # integer code suitable for any ARMv4 and later CPU and NEON code | ||
| 16 | # suitable for ARMv7. Pure integer 1x1 multiplication subroutine runs | ||
| 17 | # in ~45 cycles on a dual-issue core such as Cortex A8, which is ~50% | ||
| 18 | # faster than compiler-generated code. For ECDH and ECDSA verify (but | ||
| 19 | # not for ECDSA sign) it means 25%-45% improvement depending on key | ||
| 20 | # length, more for longer keys. Even though NEON 1x1 multiplication | ||
| 21 | # runs in even fewer cycles, ~30, improvement is measurable only on | ||
| 22 | # longer keys. One has to optimize code elsewhere to get NEON glow... | ||
| 23 | |||
| 24 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | ||
| 25 | open STDOUT,">$output"; | ||
| 26 | |||
| 27 | sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; } | ||
| 28 | sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; } | ||
| 29 | sub Q() { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; } | ||
| 30 | |||
| 31 | $code=<<___; | ||
| 32 | #include "arm_arch.h" | ||
| 33 | |||
| 34 | .text | ||
| 35 | .code 32 | ||
| 36 | |||
| 37 | #if __ARM_ARCH__>=7 | ||
| 38 | .fpu neon | ||
| 39 | |||
| 40 | .type mul_1x1_neon,%function | ||
| 41 | .align 5 | ||
| 42 | mul_1x1_neon: | ||
| 43 | vshl.u64 `&Dlo("q1")`,d16,#8 @ q1-q3 are slided $a | ||
| 44 | vmull.p8 `&Q("d0")`,d16,d17 @ a·bb | ||
| 45 | vshl.u64 `&Dlo("q2")`,d16,#16 | ||
| 46 | vmull.p8 q1,`&Dlo("q1")`,d17 @ a<<8·bb | ||
| 47 | vshl.u64 `&Dlo("q3")`,d16,#24 | ||
| 48 | vmull.p8 q2,`&Dlo("q2")`,d17 @ a<<16·bb | ||
| 49 | vshr.u64 `&Dlo("q1")`,#8 | ||
| 50 | vmull.p8 q3,`&Dlo("q3")`,d17 @ a<<24·bb | ||
| 51 | vshl.u64 `&Dhi("q1")`,#24 | ||
| 52 | veor d0,`&Dlo("q1")` | ||
| 53 | vshr.u64 `&Dlo("q2")`,#16 | ||
| 54 | veor d0,`&Dhi("q1")` | ||
| 55 | vshl.u64 `&Dhi("q2")`,#16 | ||
| 56 | veor d0,`&Dlo("q2")` | ||
| 57 | vshr.u64 `&Dlo("q3")`,#24 | ||
| 58 | veor d0,`&Dhi("q2")` | ||
| 59 | vshl.u64 `&Dhi("q3")`,#8 | ||
| 60 | veor d0,`&Dlo("q3")` | ||
| 61 | veor d0,`&Dhi("q3")` | ||
| 62 | bx lr | ||
| 63 | .size mul_1x1_neon,.-mul_1x1_neon | ||
| 64 | #endif | ||
| 65 | ___ | ||
| 66 | ################ | ||
| 67 | # private interface to mul_1x1_ialu | ||
| 68 | # | ||
| 69 | $a="r1"; | ||
| 70 | $b="r0"; | ||
| 71 | |||
| 72 | ($a0,$a1,$a2,$a12,$a4,$a14)= | ||
| 73 | ($hi,$lo,$t0,$t1, $i0,$i1 )=map("r$_",(4..9),12); | ||
| 74 | |||
| 75 | $mask="r12"; | ||
| 76 | |||
| 77 | $code.=<<___; | ||
| 78 | .type mul_1x1_ialu,%function | ||
| 79 | .align 5 | ||
| 80 | mul_1x1_ialu: | ||
| 81 | mov $a0,#0 | ||
| 82 | bic $a1,$a,#3<<30 @ a1=a&0x3fffffff | ||
| 83 | str $a0,[sp,#0] @ tab[0]=0 | ||
| 84 | add $a2,$a1,$a1 @ a2=a1<<1 | ||
| 85 | str $a1,[sp,#4] @ tab[1]=a1 | ||
| 86 | eor $a12,$a1,$a2 @ a1^a2 | ||
| 87 | str $a2,[sp,#8] @ tab[2]=a2 | ||
| 88 | mov $a4,$a1,lsl#2 @ a4=a1<<2 | ||
| 89 | str $a12,[sp,#12] @ tab[3]=a1^a2 | ||
| 90 | eor $a14,$a1,$a4 @ a1^a4 | ||
| 91 | str $a4,[sp,#16] @ tab[4]=a4 | ||
| 92 | eor $a0,$a2,$a4 @ a2^a4 | ||
| 93 | str $a14,[sp,#20] @ tab[5]=a1^a4 | ||
| 94 | eor $a12,$a12,$a4 @ a1^a2^a4 | ||
| 95 | str $a0,[sp,#24] @ tab[6]=a2^a4 | ||
| 96 | and $i0,$mask,$b,lsl#2 | ||
| 97 | str $a12,[sp,#28] @ tab[7]=a1^a2^a4 | ||
| 98 | |||
| 99 | and $i1,$mask,$b,lsr#1 | ||
| 100 | ldr $lo,[sp,$i0] @ tab[b & 0x7] | ||
| 101 | and $i0,$mask,$b,lsr#4 | ||
| 102 | ldr $t1,[sp,$i1] @ tab[b >> 3 & 0x7] | ||
| 103 | and $i1,$mask,$b,lsr#7 | ||
| 104 | ldr $t0,[sp,$i0] @ tab[b >> 6 & 0x7] | ||
| 105 | eor $lo,$lo,$t1,lsl#3 @ stall | ||
| 106 | mov $hi,$t1,lsr#29 | ||
| 107 | ldr $t1,[sp,$i1] @ tab[b >> 9 & 0x7] | ||
| 108 | |||
| 109 | and $i0,$mask,$b,lsr#10 | ||
| 110 | eor $lo,$lo,$t0,lsl#6 | ||
| 111 | eor $hi,$hi,$t0,lsr#26 | ||
| 112 | ldr $t0,[sp,$i0] @ tab[b >> 12 & 0x7] | ||
| 113 | |||
| 114 | and $i1,$mask,$b,lsr#13 | ||
| 115 | eor $lo,$lo,$t1,lsl#9 | ||
| 116 | eor $hi,$hi,$t1,lsr#23 | ||
| 117 | ldr $t1,[sp,$i1] @ tab[b >> 15 & 0x7] | ||
| 118 | |||
| 119 | and $i0,$mask,$b,lsr#16 | ||
| 120 | eor $lo,$lo,$t0,lsl#12 | ||
| 121 | eor $hi,$hi,$t0,lsr#20 | ||
| 122 | ldr $t0,[sp,$i0] @ tab[b >> 18 & 0x7] | ||
| 123 | |||
| 124 | and $i1,$mask,$b,lsr#19 | ||
| 125 | eor $lo,$lo,$t1,lsl#15 | ||
| 126 | eor $hi,$hi,$t1,lsr#17 | ||
| 127 | ldr $t1,[sp,$i1] @ tab[b >> 21 & 0x7] | ||
| 128 | |||
| 129 | and $i0,$mask,$b,lsr#22 | ||
| 130 | eor $lo,$lo,$t0,lsl#18 | ||
| 131 | eor $hi,$hi,$t0,lsr#14 | ||
| 132 | ldr $t0,[sp,$i0] @ tab[b >> 24 & 0x7] | ||
| 133 | |||
| 134 | and $i1,$mask,$b,lsr#25 | ||
| 135 | eor $lo,$lo,$t1,lsl#21 | ||
| 136 | eor $hi,$hi,$t1,lsr#11 | ||
| 137 | ldr $t1,[sp,$i1] @ tab[b >> 27 & 0x7] | ||
| 138 | |||
| 139 | tst $a,#1<<30 | ||
| 140 | and $i0,$mask,$b,lsr#28 | ||
| 141 | eor $lo,$lo,$t0,lsl#24 | ||
| 142 | eor $hi,$hi,$t0,lsr#8 | ||
| 143 | ldr $t0,[sp,$i0] @ tab[b >> 30 ] | ||
| 144 | |||
| 145 | eorne $lo,$lo,$b,lsl#30 | ||
| 146 | eorne $hi,$hi,$b,lsr#2 | ||
| 147 | tst $a,#1<<31 | ||
| 148 | eor $lo,$lo,$t1,lsl#27 | ||
| 149 | eor $hi,$hi,$t1,lsr#5 | ||
| 150 | eorne $lo,$lo,$b,lsl#31 | ||
| 151 | eorne $hi,$hi,$b,lsr#1 | ||
| 152 | eor $lo,$lo,$t0,lsl#30 | ||
| 153 | eor $hi,$hi,$t0,lsr#2 | ||
| 154 | |||
| 155 | mov pc,lr | ||
| 156 | .size mul_1x1_ialu,.-mul_1x1_ialu | ||
| 157 | ___ | ||
| 158 | ################ | ||
| 159 | # void bn_GF2m_mul_2x2(BN_ULONG *r, | ||
| 160 | # BN_ULONG a1,BN_ULONG a0, | ||
| 161 | # BN_ULONG b1,BN_ULONG b0); # r[3..0]=a1a0·b1b0 | ||
| 162 | |||
| 163 | ($A1,$B1,$A0,$B0,$A1B1,$A0B0)=map("d$_",(18..23)); | ||
| 164 | |||
| 165 | $code.=<<___; | ||
| 166 | .global bn_GF2m_mul_2x2 | ||
| 167 | .type bn_GF2m_mul_2x2,%function | ||
| 168 | .align 5 | ||
| 169 | bn_GF2m_mul_2x2: | ||
| 170 | #if __ARM_ARCH__>=7 | ||
| 171 | ldr r12,.LOPENSSL_armcap | ||
| 172 | .Lpic: ldr r12,[pc,r12] | ||
| 173 | tst r12,#1 | ||
| 174 | beq .Lialu | ||
| 175 | |||
| 176 | veor $A1,$A1 | ||
| 177 | vmov $B1,r3,r3 @ two copies of b1 | ||
| 178 | vmov.32 ${A1}[0],r1 @ a1 | ||
| 179 | |||
| 180 | veor $A0,$A0 | ||
| 181 | vld1.32 ${B0}[],[sp,:32] @ two copies of b0 | ||
| 182 | vmov.32 ${A0}[0],r2 @ a0 | ||
| 183 | mov r12,lr | ||
| 184 | |||
| 185 | vmov d16,$A1 | ||
| 186 | vmov d17,$B1 | ||
| 187 | bl mul_1x1_neon @ a1·b1 | ||
| 188 | vmov $A1B1,d0 | ||
| 189 | |||
| 190 | vmov d16,$A0 | ||
| 191 | vmov d17,$B0 | ||
| 192 | bl mul_1x1_neon @ a0·b0 | ||
| 193 | vmov $A0B0,d0 | ||
| 194 | |||
| 195 | veor d16,$A0,$A1 | ||
| 196 | veor d17,$B0,$B1 | ||
| 197 | veor $A0,$A0B0,$A1B1 | ||
| 198 | bl mul_1x1_neon @ (a0+a1)·(b0+b1) | ||
| 199 | |||
| 200 | veor d0,$A0 @ (a0+a1)·(b0+b1)-a0·b0-a1·b1 | ||
| 201 | vshl.u64 d1,d0,#32 | ||
| 202 | vshr.u64 d0,d0,#32 | ||
| 203 | veor $A0B0,d1 | ||
| 204 | veor $A1B1,d0 | ||
| 205 | vst1.32 {${A0B0}[0]},[r0,:32]! | ||
| 206 | vst1.32 {${A0B0}[1]},[r0,:32]! | ||
| 207 | vst1.32 {${A1B1}[0]},[r0,:32]! | ||
| 208 | vst1.32 {${A1B1}[1]},[r0,:32] | ||
| 209 | bx r12 | ||
| 210 | .align 4 | ||
| 211 | .Lialu: | ||
| 212 | #endif | ||
| 213 | ___ | ||
| 214 | $ret="r10"; # reassigned 1st argument | ||
| 215 | $code.=<<___; | ||
| 216 | stmdb sp!,{r4-r10,lr} | ||
| 217 | mov $ret,r0 @ reassign 1st argument | ||
| 218 | mov $b,r3 @ $b=b1 | ||
| 219 | ldr r3,[sp,#32] @ load b0 | ||
| 220 | mov $mask,#7<<2 | ||
| 221 | sub sp,sp,#32 @ allocate tab[8] | ||
| 222 | |||
| 223 | bl mul_1x1_ialu @ a1·b1 | ||
| 224 | str $lo,[$ret,#8] | ||
| 225 | str $hi,[$ret,#12] | ||
| 226 | |||
| 227 | eor $b,$b,r3 @ flip b0 and b1 | ||
| 228 | eor $a,$a,r2 @ flip a0 and a1 | ||
| 229 | eor r3,r3,$b | ||
| 230 | eor r2,r2,$a | ||
| 231 | eor $b,$b,r3 | ||
| 232 | eor $a,$a,r2 | ||
| 233 | bl mul_1x1_ialu @ a0·b0 | ||
| 234 | str $lo,[$ret] | ||
| 235 | str $hi,[$ret,#4] | ||
| 236 | |||
| 237 | eor $a,$a,r2 | ||
| 238 | eor $b,$b,r3 | ||
| 239 | bl mul_1x1_ialu @ (a1+a0)·(b1+b0) | ||
| 240 | ___ | ||
| 241 | @r=map("r$_",(6..9)); | ||
| 242 | $code.=<<___; | ||
| 243 | ldmia $ret,{@r[0]-@r[3]} | ||
| 244 | eor $lo,$lo,$hi | ||
| 245 | eor $hi,$hi,@r[1] | ||
| 246 | eor $lo,$lo,@r[0] | ||
| 247 | eor $hi,$hi,@r[2] | ||
| 248 | eor $lo,$lo,@r[3] | ||
| 249 | eor $hi,$hi,@r[3] | ||
| 250 | str $hi,[$ret,#8] | ||
| 251 | eor $lo,$lo,$hi | ||
| 252 | add sp,sp,#32 @ destroy tab[8] | ||
| 253 | str $lo,[$ret,#4] | ||
| 254 | |||
| 255 | #if __ARM_ARCH__>=5 | ||
| 256 | ldmia sp!,{r4-r10,pc} | ||
| 257 | #else | ||
| 258 | ldmia sp!,{r4-r10,lr} | ||
| 259 | tst lr,#1 | ||
| 260 | moveq pc,lr @ be binary compatible with V4, yet | ||
| 261 | bx lr @ interoperable with Thumb ISA:-) | ||
| 262 | #endif | ||
| 263 | .size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2 | ||
| 264 | #if __ARM_ARCH__>=7 | ||
| 265 | .align 5 | ||
| 266 | .LOPENSSL_armcap: | ||
| 267 | .word OPENSSL_armcap_P-(.Lpic+8) | ||
| 268 | #endif | ||
| 269 | .asciz "GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 270 | .align 5 | ||
| 271 | |||
| 272 | .comm OPENSSL_armcap_P,4,4 | ||
| 273 | ___ | ||
| 274 | |||
| 275 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | ||
| 276 | $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 | ||
| 277 | print $code; | ||
| 278 | close STDOUT; # enforce flush | ||
diff --git a/src/lib/libcrypto/bn/asm/x86-gf2m.pl b/src/lib/libcrypto/bn/asm/x86-gf2m.pl deleted file mode 100644 index cb2f2a5c30..0000000000 --- a/src/lib/libcrypto/bn/asm/x86-gf2m.pl +++ /dev/null | |||
| @@ -1,312 +0,0 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | # | ||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | # | ||
| 10 | # May 2011 | ||
| 11 | # | ||
| 12 | # The module implements bn_GF2m_mul_2x2 polynomial multiplication used | ||
| 13 | # in bn_gf2m.c. It's kind of low-hanging mechanical port from C for | ||
| 14 | # the time being... Except that it has three code paths: pure integer | ||
| 15 | # code suitable for any x86 CPU, MMX code suitable for PIII and later | ||
| 16 | # and PCLMULQDQ suitable for Westmere and later. Improvement varies | ||
| 17 | # from one benchmark and µ-arch to another. Below are interval values | ||
| 18 | # for 163- and 571-bit ECDH benchmarks relative to compiler-generated | ||
| 19 | # code: | ||
| 20 | # | ||
| 21 | # PIII 16%-30% | ||
| 22 | # P4 12%-12% | ||
| 23 | # Opteron 18%-40% | ||
| 24 | # Core2 19%-44% | ||
| 25 | # Atom 38%-64% | ||
| 26 | # Westmere 53%-121%(PCLMULQDQ)/20%-32%(MMX) | ||
| 27 | # Sandy Bridge 72%-127%(PCLMULQDQ)/27%-23%(MMX) | ||
| 28 | # | ||
| 29 | # Note that the above improvement coefficients are not coefficients | ||
| 30 | # for bn_GF2m_mul_2x2 itself. For example, a 120% ECDH improvement is | ||
| 31 | # the result of bn_GF2m_mul_2x2 being >4x faster. As it gets faster, the | ||
| 32 | # benchmark is more and more dominated by other subroutines, most | ||
| 33 | # notably by BN_GF2m_mod[_mul]_arr... | ||
| 34 | |||
| 35 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
| 36 | push(@INC,"${dir}","${dir}../../perlasm"); | ||
| 37 | require "x86asm.pl"; | ||
| 38 | |||
| 39 | &asm_init($ARGV[0],$0,$x86only = $ARGV[$#ARGV] eq "386"); | ||
| 40 | |||
| 41 | $sse2=0; | ||
| 42 | for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } | ||
| 43 | |||
| 44 | &external_label("OPENSSL_ia32cap_P") if ($sse2); | ||
| 45 | |||
| 46 | $a="eax"; | ||
| 47 | $b="ebx"; | ||
| 48 | ($a1,$a2,$a4)=("ecx","edx","ebp"); | ||
| 49 | |||
| 50 | $R="mm0"; | ||
| 51 | @T=("mm1","mm2"); | ||
| 52 | ($A,$B,$B30,$B31)=("mm2","mm3","mm4","mm5"); | ||
| 53 | @i=("esi","edi"); | ||
| 54 | |||
| 55 | if (!$x86only) { | ||
| 56 | &function_begin_B("_mul_1x1_mmx"); | ||
| 57 | &sub ("esp",32+4); | ||
| 58 | &mov ($a1,$a); | ||
| 59 | &lea ($a2,&DWP(0,$a,$a)); | ||
| 60 | &and ($a1,0x3fffffff); | ||
| 61 | &lea ($a4,&DWP(0,$a2,$a2)); | ||
| 62 | &mov (&DWP(0*4,"esp"),0); | ||
| 63 | &and ($a2,0x7fffffff); | ||
| 64 | &movd ($A,$a); | ||
| 65 | &movd ($B,$b); | ||
| 66 | &mov (&DWP(1*4,"esp"),$a1); # a1 | ||
| 67 | &xor ($a1,$a2); # a1^a2 | ||
| 68 | &pxor ($B31,$B31); | ||
| 69 | &pxor ($B30,$B30); | ||
| 70 | &mov (&DWP(2*4,"esp"),$a2); # a2 | ||
| 71 | &xor ($a2,$a4); # a2^a4 | ||
| 72 | &mov (&DWP(3*4,"esp"),$a1); # a1^a2 | ||
| 73 | &pcmpgtd($B31,$A); # broadcast 31st bit | ||
| 74 | &paddd ($A,$A); # $A<<=1 | ||
| 75 | &xor ($a1,$a2); # a1^a4=a1^a2^a2^a4 | ||
| 76 | &mov (&DWP(4*4,"esp"),$a4); # a4 | ||
| 77 | &xor ($a4,$a2); # a2=a4^a2^a4 | ||
| 78 | &pand ($B31,$B); | ||
| 79 | &pcmpgtd($B30,$A); # broadcast 30th bit | ||
| 80 | &mov (&DWP(5*4,"esp"),$a1); # a1^a4 | ||
| 81 | &xor ($a4,$a1); # a1^a2^a4 | ||
| 82 | &psllq ($B31,31); | ||
| 83 | &pand ($B30,$B); | ||
| 84 | &mov (&DWP(6*4,"esp"),$a2); # a2^a4 | ||
| 85 | &mov (@i[0],0x7); | ||
| 86 | &mov (&DWP(7*4,"esp"),$a4); # a1^a2^a4 | ||
| 87 | &mov ($a4,@i[0]); | ||
| 88 | &and (@i[0],$b); | ||
| 89 | &shr ($b,3); | ||
| 90 | &mov (@i[1],$a4); | ||
| 91 | &psllq ($B30,30); | ||
| 92 | &and (@i[1],$b); | ||
| 93 | &shr ($b,3); | ||
| 94 | &movd ($R,&DWP(0,"esp",@i[0],4)); | ||
| 95 | &mov (@i[0],$a4); | ||
| 96 | &and (@i[0],$b); | ||
| 97 | &shr ($b,3); | ||
| 98 | for($n=1;$n<9;$n++) { | ||
| 99 | &movd (@T[1],&DWP(0,"esp",@i[1],4)); | ||
| 100 | &mov (@i[1],$a4); | ||
| 101 | &psllq (@T[1],3*$n); | ||
| 102 | &and (@i[1],$b); | ||
| 103 | &shr ($b,3); | ||
| 104 | &pxor ($R,@T[1]); | ||
| 105 | |||
| 106 | push(@i,shift(@i)); push(@T,shift(@T)); | ||
| 107 | } | ||
| 108 | &movd (@T[1],&DWP(0,"esp",@i[1],4)); | ||
| 109 | &pxor ($R,$B30); | ||
| 110 | &psllq (@T[1],3*$n++); | ||
| 111 | &pxor ($R,@T[1]); | ||
| 112 | |||
| 113 | &movd (@T[0],&DWP(0,"esp",@i[0],4)); | ||
| 114 | &pxor ($R,$B31); | ||
| 115 | &psllq (@T[0],3*$n); | ||
| 116 | &add ("esp",32+4); | ||
| 117 | &pxor ($R,@T[0]); | ||
| 118 | &ret (); | ||
| 119 | &function_end_B("_mul_1x1_mmx"); | ||
| 120 | } | ||
| 121 | |||
| 122 | ($lo,$hi)=("eax","edx"); | ||
| 123 | @T=("ecx","ebp"); | ||
| 124 | |||
| 125 | &function_begin_B("_mul_1x1_ialu"); | ||
| 126 | &sub ("esp",32+4); | ||
| 127 | &mov ($a1,$a); | ||
| 128 | &lea ($a2,&DWP(0,$a,$a)); | ||
| 129 | &lea ($a4,&DWP(0,"",$a,4)); | ||
| 130 | &and ($a1,0x3fffffff); | ||
| 131 | &lea (@i[1],&DWP(0,$lo,$lo)); | ||
| 132 | &sar ($lo,31); # broadcast 31st bit | ||
| 133 | &mov (&DWP(0*4,"esp"),0); | ||
| 134 | &and ($a2,0x7fffffff); | ||
| 135 | &mov (&DWP(1*4,"esp"),$a1); # a1 | ||
| 136 | &xor ($a1,$a2); # a1^a2 | ||
| 137 | &mov (&DWP(2*4,"esp"),$a2); # a2 | ||
| 138 | &xor ($a2,$a4); # a2^a4 | ||
| 139 | &mov (&DWP(3*4,"esp"),$a1); # a1^a2 | ||
| 140 | &xor ($a1,$a2); # a1^a4=a1^a2^a2^a4 | ||
| 141 | &mov (&DWP(4*4,"esp"),$a4); # a4 | ||
| 142 | &xor ($a4,$a2); # a2=a4^a2^a4 | ||
| 143 | &mov (&DWP(5*4,"esp"),$a1); # a1^a4 | ||
| 144 | &xor ($a4,$a1); # a1^a2^a4 | ||
| 145 | &sar (@i[1],31); # broadcast 30th bit | ||
| 146 | &and ($lo,$b); | ||
| 147 | &mov (&DWP(6*4,"esp"),$a2); # a2^a4 | ||
| 148 | &and (@i[1],$b); | ||
| 149 | &mov (&DWP(7*4,"esp"),$a4); # a1^a2^a4 | ||
| 150 | &mov ($hi,$lo); | ||
| 151 | &shl ($lo,31); | ||
| 152 | &mov (@T[0],@i[1]); | ||
| 153 | &shr ($hi,1); | ||
| 154 | |||
| 155 | &mov (@i[0],0x7); | ||
| 156 | &shl (@i[1],30); | ||
| 157 | &and (@i[0],$b); | ||
| 158 | &shr (@T[0],2); | ||
| 159 | &xor ($lo,@i[1]); | ||
| 160 | |||
| 161 | &shr ($b,3); | ||
| 162 | &mov (@i[1],0x7); # 5-byte instruction!? | ||
| 163 | &and (@i[1],$b); | ||
| 164 | &shr ($b,3); | ||
| 165 | &xor ($hi,@T[0]); | ||
| 166 | &xor ($lo,&DWP(0,"esp",@i[0],4)); | ||
| 167 | &mov (@i[0],0x7); | ||
| 168 | &and (@i[0],$b); | ||
| 169 | &shr ($b,3); | ||
| 170 | for($n=1;$n<9;$n++) { | ||
| 171 | &mov (@T[1],&DWP(0,"esp",@i[1],4)); | ||
| 172 | &mov (@i[1],0x7); | ||
| 173 | &mov (@T[0],@T[1]); | ||
| 174 | &shl (@T[1],3*$n); | ||
| 175 | &and (@i[1],$b); | ||
| 176 | &shr (@T[0],32-3*$n); | ||
| 177 | &xor ($lo,@T[1]); | ||
| 178 | &shr ($b,3); | ||
| 179 | &xor ($hi,@T[0]); | ||
| 180 | |||
| 181 | push(@i,shift(@i)); push(@T,shift(@T)); | ||
| 182 | } | ||
| 183 | &mov (@T[1],&DWP(0,"esp",@i[1],4)); | ||
| 184 | &mov (@T[0],@T[1]); | ||
| 185 | &shl (@T[1],3*$n); | ||
| 186 | &mov (@i[1],&DWP(0,"esp",@i[0],4)); | ||
| 187 | &shr (@T[0],32-3*$n); $n++; | ||
| 188 | &mov (@i[0],@i[1]); | ||
| 189 | &xor ($lo,@T[1]); | ||
| 190 | &shl (@i[1],3*$n); | ||
| 191 | &xor ($hi,@T[0]); | ||
| 192 | &shr (@i[0],32-3*$n); | ||
| 193 | &xor ($lo,@i[1]); | ||
| 194 | &xor ($hi,@i[0]); | ||
| 195 | |||
| 196 | &add ("esp",32+4); | ||
| 197 | &ret (); | ||
| 198 | &function_end_B("_mul_1x1_ialu"); | ||
| 199 | |||
| 200 | # void bn_GF2m_mul_2x2(BN_ULONG *r, BN_ULONG a1, BN_ULONG a0, BN_ULONG b1, BN_ULONG b0); | ||
| 201 | &function_begin_B("bn_GF2m_mul_2x2"); | ||
| 202 | if (!$x86only) { | ||
| 203 | &picsetup("edx"); | ||
| 204 | &picsymbol("edx", "OPENSSL_ia32cap_P", "edx"); | ||
| 205 | &mov ("eax",&DWP(0,"edx")); | ||
| 206 | &mov ("edx",&DWP(4,"edx")); | ||
| 207 | &test ("eax","\$IA32CAP_MASK0_MMX"); # check MMX bit | ||
| 208 | &jz (&label("ialu")); | ||
| 209 | if ($sse2) { | ||
| 210 | &test ("eax","\$IA32CAP_MASK0_FXSR"); # check FXSR bit | ||
| 211 | &jz (&label("mmx")); | ||
| 212 | &test ("edx","\$IA32CAP_MASK1_PCLMUL"); # check PCLMULQDQ bit | ||
| 213 | &jz (&label("mmx")); | ||
| 214 | |||
| 215 | &movups ("xmm0",&QWP(8,"esp")); | ||
| 216 | &shufps ("xmm0","xmm0",0b10110001); | ||
| 217 | &pclmulqdq ("xmm0","xmm0",1); | ||
| 218 | &mov ("eax",&DWP(4,"esp")); | ||
| 219 | &movups (&QWP(0,"eax"),"xmm0"); | ||
| 220 | &ret (); | ||
| 221 | |||
| 222 | &set_label("mmx",16); | ||
| 223 | } | ||
| 224 | &push ("ebp"); | ||
| 225 | &push ("ebx"); | ||
| 226 | &push ("esi"); | ||
| 227 | &push ("edi"); | ||
| 228 | &mov ($a,&wparam(1)); | ||
| 229 | &mov ($b,&wparam(3)); | ||
| 230 | &call ("_mul_1x1_mmx"); # a1·b1 | ||
| 231 | &movq ("mm7",$R); | ||
| 232 | |||
| 233 | &mov ($a,&wparam(2)); | ||
| 234 | &mov ($b,&wparam(4)); | ||
| 235 | &call ("_mul_1x1_mmx"); # a0·b0 | ||
| 236 | &movq ("mm6",$R); | ||
| 237 | |||
| 238 | &mov ($a,&wparam(1)); | ||
| 239 | &mov ($b,&wparam(3)); | ||
| 240 | &xor ($a,&wparam(2)); | ||
| 241 | &xor ($b,&wparam(4)); | ||
| 242 | &call ("_mul_1x1_mmx"); # (a0+a1)·(b0+b1) | ||
| 243 | &pxor ($R,"mm7"); | ||
| 244 | &mov ($a,&wparam(0)); | ||
| 245 | &pxor ($R,"mm6"); # (a0+a1)·(b0+b1)-a1·b1-a0·b0 | ||
| 246 | |||
| 247 | &movq ($A,$R); | ||
| 248 | &psllq ($R,32); | ||
| 249 | &pop ("edi"); | ||
| 250 | &psrlq ($A,32); | ||
| 251 | &pop ("esi"); | ||
| 252 | &pxor ($R,"mm6"); | ||
| 253 | &pop ("ebx"); | ||
| 254 | &pxor ($A,"mm7"); | ||
| 255 | &movq (&QWP(0,$a),$R); | ||
| 256 | &pop ("ebp"); | ||
| 257 | &movq (&QWP(8,$a),$A); | ||
| 258 | &emms (); | ||
| 259 | &ret (); | ||
| 260 | &set_label("ialu",16); | ||
| 261 | } | ||
| 262 | &push ("ebp"); | ||
| 263 | &push ("ebx"); | ||
| 264 | &push ("esi"); | ||
| 265 | &push ("edi"); | ||
| 266 | &stack_push(4+1); | ||
| 267 | |||
| 268 | &mov ($a,&wparam(1)); | ||
| 269 | &mov ($b,&wparam(3)); | ||
| 270 | &call ("_mul_1x1_ialu"); # a1·b1 | ||
| 271 | &mov (&DWP(8,"esp"),$lo); | ||
| 272 | &mov (&DWP(12,"esp"),$hi); | ||
| 273 | |||
| 274 | &mov ($a,&wparam(2)); | ||
| 275 | &mov ($b,&wparam(4)); | ||
| 276 | &call ("_mul_1x1_ialu"); # a0·b0 | ||
| 277 | &mov (&DWP(0,"esp"),$lo); | ||
| 278 | &mov (&DWP(4,"esp"),$hi); | ||
| 279 | |||
| 280 | &mov ($a,&wparam(1)); | ||
| 281 | &mov ($b,&wparam(3)); | ||
| 282 | &xor ($a,&wparam(2)); | ||
| 283 | &xor ($b,&wparam(4)); | ||
| 284 | &call ("_mul_1x1_ialu"); # (a0+a1)·(b0+b1) | ||
| 285 | |||
| 286 | &mov ("ebp",&wparam(0)); | ||
| 287 | @r=("ebx","ecx","edi","esi"); | ||
| 288 | &mov (@r[0],&DWP(0,"esp")); | ||
| 289 | &mov (@r[1],&DWP(4,"esp")); | ||
| 290 | &mov (@r[2],&DWP(8,"esp")); | ||
| 291 | &mov (@r[3],&DWP(12,"esp")); | ||
| 292 | |||
| 293 | &xor ($lo,$hi); | ||
| 294 | &xor ($hi,@r[1]); | ||
| 295 | &xor ($lo,@r[0]); | ||
| 296 | &mov (&DWP(0,"ebp"),@r[0]); | ||
| 297 | &xor ($hi,@r[2]); | ||
| 298 | &mov (&DWP(12,"ebp"),@r[3]); | ||
| 299 | &xor ($lo,@r[3]); | ||
| 300 | &stack_pop(4+1); | ||
| 301 | &xor ($hi,@r[3]); | ||
| 302 | &pop ("edi"); | ||
| 303 | &xor ($lo,$hi); | ||
| 304 | &pop ("esi"); | ||
| 305 | &mov (&DWP(8,"ebp"),$hi); | ||
| 306 | &pop ("ebx"); | ||
| 307 | &mov (&DWP(4,"ebp"),$lo); | ||
| 308 | &pop ("ebp"); | ||
| 309 | &ret (); | ||
| 310 | &function_end_B("bn_GF2m_mul_2x2"); | ||
| 311 | |||
| 312 | &asm_finish(); | ||
diff --git a/src/lib/libcrypto/bn/asm/x86_64-gf2m.pl b/src/lib/libcrypto/bn/asm/x86_64-gf2m.pl deleted file mode 100644 index 6985725b20..0000000000 --- a/src/lib/libcrypto/bn/asm/x86_64-gf2m.pl +++ /dev/null | |||
| @@ -1,390 +0,0 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | # | ||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | # | ||
| 10 | # May 2011 | ||
| 11 | # | ||
| 12 | # The module implements bn_GF2m_mul_2x2 polynomial multiplication used | ||
| 13 | # in bn_gf2m.c. It's kind of low-hanging mechanical port from C for | ||
| 14 | # the time being... Except that it has two code paths: code suitable | ||
| 15 | # for any x86_64 CPU and PCLMULQDQ one suitable for Westmere and | ||
| 16 | # later. Improvement varies from one benchmark and µ-arch to another. | ||
| 17 | # The vanilla code path is at most 20% faster than compiler-generated | ||
| 18 | # code [not very impressive], while the PCLMULQDQ path is a whole | ||
| 19 | # 85%-160% better on 163- and 571-bit ECDH benchmarks on Intel CPUs. | ||
| 20 | # Keep in mind that these coefficients are not the ones for | ||
| 21 | # bn_GF2m_mul_2x2 itself, as not all CPU time is burnt in it... | ||
| 22 | |||
| 23 | $flavour = shift; | ||
| 24 | $output = shift; | ||
| 25 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } | ||
| 26 | |||
| 27 | $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); | ||
| 28 | |||
| 29 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
| 30 | ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | ||
| 31 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | ||
| 32 | die "can't locate x86_64-xlate.pl"; | ||
| 33 | |||
| 34 | open OUT,"| \"$^X\" $xlate $flavour $output"; | ||
| 35 | *STDOUT=*OUT; | ||
| 36 | |||
| 37 | ($lo,$hi)=("%rax","%rdx"); $a=$lo; | ||
| 38 | ($i0,$i1)=("%rsi","%rdi"); | ||
| 39 | ($t0,$t1)=("%rbx","%rcx"); | ||
| 40 | ($b,$mask)=("%rbp","%r8"); | ||
| 41 | ($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(9..15)); | ||
| 42 | ($R,$Tx)=("%xmm0","%xmm1"); | ||
| 43 | |||
| 44 | $code.=<<___; | ||
| 45 | .text | ||
| 46 | |||
| 47 | .type _mul_1x1,\@abi-omnipotent | ||
| 48 | .align 16 | ||
| 49 | _mul_1x1: | ||
| 50 | sub \$128+8,%rsp | ||
| 51 | mov \$-1,$a1 | ||
| 52 | lea ($a,$a),$i0 | ||
| 53 | shr \$3,$a1 | ||
| 54 | lea (,$a,4),$i1 | ||
| 55 | and $a,$a1 # a1=a&0x1fffffffffffffff | ||
| 56 | lea (,$a,8),$a8 | ||
| 57 | sar \$63,$a # broadcast 63rd bit | ||
| 58 | lea ($a1,$a1),$a2 | ||
| 59 | sar \$63,$i0 # broadcast 62nd bit | ||
| 60 | lea (,$a1,4),$a4 | ||
| 61 | and $b,$a | ||
| 62 | sar \$63,$i1 # broadcast 61st bit | ||
| 63 | mov $a,$hi # $a is $lo | ||
| 64 | shl \$63,$lo | ||
| 65 | and $b,$i0 | ||
| 66 | shr \$1,$hi | ||
| 67 | mov $i0,$t1 | ||
| 68 | shl \$62,$i0 | ||
| 69 | and $b,$i1 | ||
| 70 | shr \$2,$t1 | ||
| 71 | xor $i0,$lo | ||
| 72 | mov $i1,$t0 | ||
| 73 | shl \$61,$i1 | ||
| 74 | xor $t1,$hi | ||
| 75 | shr \$3,$t0 | ||
| 76 | xor $i1,$lo | ||
| 77 | xor $t0,$hi | ||
| 78 | |||
| 79 | mov $a1,$a12 | ||
| 80 | movq \$0,0(%rsp) # tab[0]=0 | ||
| 81 | xor $a2,$a12 # a1^a2 | ||
| 82 | mov $a1,8(%rsp) # tab[1]=a1 | ||
| 83 | mov $a4,$a48 | ||
| 84 | mov $a2,16(%rsp) # tab[2]=a2 | ||
| 85 | xor $a8,$a48 # a4^a8 | ||
| 86 | mov $a12,24(%rsp) # tab[3]=a1^a2 | ||
| 87 | |||
| 88 | xor $a4,$a1 | ||
| 89 | mov $a4,32(%rsp) # tab[4]=a4 | ||
| 90 | xor $a4,$a2 | ||
| 91 | mov $a1,40(%rsp) # tab[5]=a1^a4 | ||
| 92 | xor $a4,$a12 | ||
| 93 | mov $a2,48(%rsp) # tab[6]=a2^a4 | ||
| 94 | xor $a48,$a1 # a1^a4^a4^a8=a1^a8 | ||
| 95 | mov $a12,56(%rsp) # tab[7]=a1^a2^a4 | ||
| 96 | xor $a48,$a2 # a2^a4^a4^a8=a2^a8 | ||
| 97 | |||
| 98 | mov $a8,64(%rsp) # tab[8]=a8 | ||
| 99 | xor $a48,$a12 # a1^a2^a4^a4^a8=a1^a2^a8 | ||
| 100 | mov $a1,72(%rsp) # tab[9]=a1^a8 | ||
| 101 | xor $a4,$a1 # a1^a8^a4 | ||
| 102 | mov $a2,80(%rsp) # tab[10]=a2^a8 | ||
| 103 | xor $a4,$a2 # a2^a8^a4 | ||
| 104 | mov $a12,88(%rsp) # tab[11]=a1^a2^a8 | ||
| 105 | |||
| 106 | xor $a4,$a12 # a1^a2^a8^a4 | ||
| 107 | mov $a48,96(%rsp) # tab[12]=a4^a8 | ||
| 108 | mov $mask,$i0 | ||
| 109 | mov $a1,104(%rsp) # tab[13]=a1^a4^a8 | ||
| 110 | and $b,$i0 | ||
| 111 | mov $a2,112(%rsp) # tab[14]=a2^a4^a8 | ||
| 112 | shr \$4,$b | ||
| 113 | mov $a12,120(%rsp) # tab[15]=a1^a2^a4^a8 | ||
| 114 | mov $mask,$i1 | ||
| 115 | and $b,$i1 | ||
| 116 | shr \$4,$b | ||
| 117 | |||
| 118 | movq (%rsp,$i0,8),$R # half of calculations is done in SSE2 | ||
| 119 | mov $mask,$i0 | ||
| 120 | and $b,$i0 | ||
| 121 | shr \$4,$b | ||
| 122 | ___ | ||
| 123 | for ($n=1;$n<8;$n++) { | ||
| 124 | $code.=<<___; | ||
| 125 | mov (%rsp,$i1,8),$t1 | ||
| 126 | mov $mask,$i1 | ||
| 127 | mov $t1,$t0 | ||
| 128 | shl \$`8*$n-4`,$t1 | ||
| 129 | and $b,$i1 | ||
| 130 | movq (%rsp,$i0,8),$Tx | ||
| 131 | shr \$`64-(8*$n-4)`,$t0 | ||
| 132 | xor $t1,$lo | ||
| 133 | pslldq \$$n,$Tx | ||
| 134 | mov $mask,$i0 | ||
| 135 | shr \$4,$b | ||
| 136 | xor $t0,$hi | ||
| 137 | and $b,$i0 | ||
| 138 | shr \$4,$b | ||
| 139 | pxor $Tx,$R | ||
| 140 | ___ | ||
| 141 | } | ||
| 142 | $code.=<<___; | ||
| 143 | mov (%rsp,$i1,8),$t1 | ||
| 144 | mov $t1,$t0 | ||
| 145 | shl \$`8*$n-4`,$t1 | ||
| 146 | movd $R,$i0 | ||
| 147 | shr \$`64-(8*$n-4)`,$t0 | ||
| 148 | xor $t1,$lo | ||
| 149 | psrldq \$8,$R | ||
| 150 | xor $t0,$hi | ||
| 151 | movd $R,$i1 | ||
| 152 | xor $i0,$lo | ||
| 153 | xor $i1,$hi | ||
| 154 | |||
| 155 | add \$128+8,%rsp | ||
| 156 | ret | ||
| 157 | .Lend_mul_1x1: | ||
| 158 | .size _mul_1x1,.-_mul_1x1 | ||
| 159 | ___ | ||
| 160 | |||
| 161 | ($rp,$a1,$a0,$b1,$b0) = $win64? ("%rcx","%rdx","%r8", "%r9","%r10") : # Win64 order | ||
| 162 | ("%rdi","%rsi","%rdx","%rcx","%r8"); # Unix order | ||
| 163 | |||
| 164 | $code.=<<___; | ||
| 165 | .extern OPENSSL_ia32cap_P | ||
| 166 | .hidden OPENSSL_ia32cap_P | ||
| 167 | .globl bn_GF2m_mul_2x2 | ||
| 168 | .type bn_GF2m_mul_2x2,\@abi-omnipotent | ||
| 169 | .align 16 | ||
| 170 | bn_GF2m_mul_2x2: | ||
| 171 | mov OPENSSL_ia32cap_P+4(%rip),%eax | ||
| 172 | bt \$IA32CAP_BIT1_PCLMUL,%eax | ||
| 173 | jnc .Lvanilla_mul_2x2 | ||
| 174 | |||
| 175 | movd $a1,%xmm0 | ||
| 176 | movd $b1,%xmm1 | ||
| 177 | movd $a0,%xmm2 | ||
| 178 | ___ | ||
| 179 | $code.=<<___ if ($win64); | ||
| 180 | movq 40(%rsp),%xmm3 | ||
| 181 | ___ | ||
| 182 | $code.=<<___ if (!$win64); | ||
| 183 | movd $b0,%xmm3 | ||
| 184 | ___ | ||
| 185 | $code.=<<___; | ||
| 186 | movdqa %xmm0,%xmm4 | ||
| 187 | movdqa %xmm1,%xmm5 | ||
| 188 | pclmulqdq \$0,%xmm1,%xmm0 # a1·b1 | ||
| 189 | pxor %xmm2,%xmm4 | ||
| 190 | pxor %xmm3,%xmm5 | ||
| 191 | pclmulqdq \$0,%xmm3,%xmm2 # a0·b0 | ||
| 192 | pclmulqdq \$0,%xmm5,%xmm4 # (a0+a1)·(b0+b1) | ||
| 193 | xorps %xmm0,%xmm4 | ||
| 194 | xorps %xmm2,%xmm4 # (a0+a1)·(b0+b1)-a0·b0-a1·b1 | ||
| 195 | movdqa %xmm4,%xmm5 | ||
| 196 | pslldq \$8,%xmm4 | ||
| 197 | psrldq \$8,%xmm5 | ||
| 198 | pxor %xmm4,%xmm2 | ||
| 199 | pxor %xmm5,%xmm0 | ||
| 200 | movdqu %xmm2,0($rp) | ||
| 201 | movdqu %xmm0,16($rp) | ||
| 202 | ret | ||
| 203 | |||
| 204 | .align 16 | ||
| 205 | .Lvanilla_mul_2x2: | ||
| 206 | lea -8*17(%rsp),%rsp | ||
| 207 | ___ | ||
| 208 | $code.=<<___ if ($win64); | ||
| 209 | mov `8*17+40`(%rsp),$b0 | ||
| 210 | mov %rdi,8*15(%rsp) | ||
| 211 | mov %rsi,8*16(%rsp) | ||
| 212 | ___ | ||
| 213 | $code.=<<___; | ||
| 214 | mov %r14,8*10(%rsp) | ||
| 215 | mov %r13,8*11(%rsp) | ||
| 216 | mov %r12,8*12(%rsp) | ||
| 217 | mov %rbp,8*13(%rsp) | ||
| 218 | mov %rbx,8*14(%rsp) | ||
| 219 | .Lbody_mul_2x2: | ||
| 220 | mov $rp,32(%rsp) # save the arguments | ||
| 221 | mov $a1,40(%rsp) | ||
| 222 | mov $a0,48(%rsp) | ||
| 223 | mov $b1,56(%rsp) | ||
| 224 | mov $b0,64(%rsp) | ||
| 225 | |||
| 226 | mov \$0xf,$mask | ||
| 227 | mov $a1,$a | ||
| 228 | mov $b1,$b | ||
| 229 | call _mul_1x1 # a1·b1 | ||
| 230 | mov $lo,16(%rsp) | ||
| 231 | mov $hi,24(%rsp) | ||
| 232 | |||
| 233 | mov 48(%rsp),$a | ||
| 234 | mov 64(%rsp),$b | ||
| 235 | call _mul_1x1 # a0·b0 | ||
| 236 | mov $lo,0(%rsp) | ||
| 237 | mov $hi,8(%rsp) | ||
| 238 | |||
| 239 | mov 40(%rsp),$a | ||
| 240 | mov 56(%rsp),$b | ||
| 241 | xor 48(%rsp),$a | ||
| 242 | xor 64(%rsp),$b | ||
| 243 | call _mul_1x1 # (a0+a1)·(b0+b1) | ||
| 244 | ___ | ||
| 245 | @r=("%rbx","%rcx","%rdi","%rsi"); | ||
| 246 | $code.=<<___; | ||
| 247 | mov 0(%rsp),@r[0] | ||
| 248 | mov 8(%rsp),@r[1] | ||
| 249 | mov 16(%rsp),@r[2] | ||
| 250 | mov 24(%rsp),@r[3] | ||
| 251 | mov 32(%rsp),%rbp | ||
| 252 | |||
| 253 | xor $hi,$lo | ||
| 254 | xor @r[1],$hi | ||
| 255 | xor @r[0],$lo | ||
| 256 | mov @r[0],0(%rbp) | ||
| 257 | xor @r[2],$hi | ||
| 258 | mov @r[3],24(%rbp) | ||
| 259 | xor @r[3],$lo | ||
| 260 | xor @r[3],$hi | ||
| 261 | xor $hi,$lo | ||
| 262 | mov $hi,16(%rbp) | ||
| 263 | mov $lo,8(%rbp) | ||
| 264 | |||
| 265 | mov 8*10(%rsp),%r14 | ||
| 266 | mov 8*11(%rsp),%r13 | ||
| 267 | mov 8*12(%rsp),%r12 | ||
| 268 | mov 8*13(%rsp),%rbp | ||
| 269 | mov 8*14(%rsp),%rbx | ||
| 270 | ___ | ||
| 271 | $code.=<<___ if ($win64); | ||
| 272 | mov 8*15(%rsp),%rdi | ||
| 273 | mov 8*16(%rsp),%rsi | ||
| 274 | ___ | ||
| 275 | $code.=<<___; | ||
| 276 | lea 8*17(%rsp),%rsp | ||
| 277 | ret | ||
| 278 | .Lend_mul_2x2: | ||
| 279 | .size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2 | ||
| 280 | .align 16 | ||
| 281 | ___ | ||
| 282 | |||
| 283 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, | ||
| 284 | # CONTEXT *context,DISPATCHER_CONTEXT *disp) | ||
| 285 | if ($win64) { | ||
| 286 | $rec="%rcx"; | ||
| 287 | $frame="%rdx"; | ||
| 288 | $context="%r8"; | ||
| 289 | $disp="%r9"; | ||
| 290 | |||
| 291 | $code.=<<___; | ||
| 292 | .extern __imp_RtlVirtualUnwind | ||
| 293 | |||
| 294 | .type se_handler,\@abi-omnipotent | ||
| 295 | .align 16 | ||
| 296 | se_handler: | ||
| 297 | push %rsi | ||
| 298 | push %rdi | ||
| 299 | push %rbx | ||
| 300 | push %rbp | ||
| 301 | push %r12 | ||
| 302 | push %r13 | ||
| 303 | push %r14 | ||
| 304 | push %r15 | ||
| 305 | pushfq | ||
| 306 | sub \$64,%rsp | ||
| 307 | |||
| 308 | mov 152($context),%rax # pull context->Rsp | ||
| 309 | mov 248($context),%rbx # pull context->Rip | ||
| 310 | |||
| 311 | lea .Lbody_mul_2x2(%rip),%r10 | ||
| 312 | cmp %r10,%rbx # context->Rip<"prologue" label | ||
| 313 | jb .Lin_prologue | ||
| 314 | |||
| 315 | mov 8*10(%rax),%r14 # mimic epilogue | ||
| 316 | mov 8*11(%rax),%r13 | ||
| 317 | mov 8*12(%rax),%r12 | ||
| 318 | mov 8*13(%rax),%rbp | ||
| 319 | mov 8*14(%rax),%rbx | ||
| 320 | mov 8*15(%rax),%rdi | ||
| 321 | mov 8*16(%rax),%rsi | ||
| 322 | |||
| 323 | mov %rbx,144($context) # restore context->Rbx | ||
| 324 | mov %rbp,160($context) # restore context->Rbp | ||
| 325 | mov %rsi,168($context) # restore context->Rsi | ||
| 326 | mov %rdi,176($context) # restore context->Rdi | ||
| 327 | mov %r12,216($context) # restore context->R12 | ||
| 328 | mov %r13,224($context) # restore context->R13 | ||
| 329 | mov %r14,232($context) # restore context->R14 | ||
| 330 | |||
| 331 | .Lin_prologue: | ||
| 332 | lea 8*17(%rax),%rax | ||
| 333 | mov %rax,152($context) # restore context->Rsp | ||
| 334 | |||
| 335 | mov 40($disp),%rdi # disp->ContextRecord | ||
| 336 | mov $context,%rsi # context | ||
| 337 | mov \$154,%ecx # sizeof(CONTEXT) | ||
| 338 | .long 0xa548f3fc # cld; rep movsq | ||
| 339 | |||
| 340 | mov $disp,%rsi | ||
| 341 | xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | ||
| 342 | mov 8(%rsi),%rdx # arg2, disp->ImageBase | ||
| 343 | mov 0(%rsi),%r8 # arg3, disp->ControlPc | ||
| 344 | mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | ||
| 345 | mov 40(%rsi),%r10 # disp->ContextRecord | ||
| 346 | lea 56(%rsi),%r11 # &disp->HandlerData | ||
| 347 | lea 24(%rsi),%r12 # &disp->EstablisherFrame | ||
| 348 | mov %r10,32(%rsp) # arg5 | ||
| 349 | mov %r11,40(%rsp) # arg6 | ||
| 350 | mov %r12,48(%rsp) # arg7 | ||
| 351 | mov %rcx,56(%rsp) # arg8, (NULL) | ||
| 352 | call *__imp_RtlVirtualUnwind(%rip) | ||
| 353 | |||
| 354 | mov \$1,%eax # ExceptionContinueSearch | ||
| 355 | add \$64,%rsp | ||
| 356 | popfq | ||
| 357 | pop %r15 | ||
| 358 | pop %r14 | ||
| 359 | pop %r13 | ||
| 360 | pop %r12 | ||
| 361 | pop %rbp | ||
| 362 | pop %rbx | ||
| 363 | pop %rdi | ||
| 364 | pop %rsi | ||
| 365 | ret | ||
| 366 | .size se_handler,.-se_handler | ||
| 367 | |||
| 368 | .section .pdata | ||
| 369 | .align 4 | ||
| 370 | .rva _mul_1x1 | ||
| 371 | .rva .Lend_mul_1x1 | ||
| 372 | .rva .LSEH_info_1x1 | ||
| 373 | |||
| 374 | .rva .Lvanilla_mul_2x2 | ||
| 375 | .rva .Lend_mul_2x2 | ||
| 376 | .rva .LSEH_info_2x2 | ||
| 377 | .section .xdata | ||
| 378 | .align 8 | ||
| 379 | .LSEH_info_1x1: | ||
| 380 | .byte 0x01,0x07,0x02,0x00 | ||
| 381 | .byte 0x07,0x01,0x11,0x00 # sub rsp,128+8 | ||
| 382 | .LSEH_info_2x2: | ||
| 383 | .byte 9,0,0,0 | ||
| 384 | .rva se_handler | ||
| 385 | ___ | ||
| 386 | } | ||
| 387 | |||
| 388 | $code =~ s/\`([^\`]*)\`/eval($1)/gem; | ||
| 389 | print $code; | ||
| 390 | close STDOUT; | ||
