diff options
| author | cvs2svn <admin@example.com> | 2015-03-08 16:48:48 +0000 |
|---|---|---|
| committer | cvs2svn <admin@example.com> | 2015-03-08 16:48:48 +0000 |
| commit | da1a9ad3a4a867ba6569c05e6fca66d7f296c553 (patch) | |
| tree | 44872802e872bdfd60730fa9cf01d9d5751251c1 /src/lib/libcrypto/bn/asm/x86-gf2m.pl | |
| parent | 973703db67a8e73d70e63afa8f2cde19da09144d (diff) | |
| download | openbsd-OPENBSD_5_7_BASE.tar.gz openbsd-OPENBSD_5_7_BASE.tar.bz2 openbsd-OPENBSD_5_7_BASE.zip | |
This commit was manufactured by cvs2git to create tag 'OPENBSD_5_7_BASE'. (tag: OPENBSD_5_7_BASE)
Diffstat (limited to 'src/lib/libcrypto/bn/asm/x86-gf2m.pl')
| -rw-r--r-- | src/lib/libcrypto/bn/asm/x86-gf2m.pl | 313 |
1 files changed, 0 insertions, 313 deletions
diff --git a/src/lib/libcrypto/bn/asm/x86-gf2m.pl b/src/lib/libcrypto/bn/asm/x86-gf2m.pl deleted file mode 100644 index 808a1e5969..0000000000 --- a/src/lib/libcrypto/bn/asm/x86-gf2m.pl +++ /dev/null | |||
| @@ -1,313 +0,0 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | # | ||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | # | ||
| 10 | # May 2011 | ||
| 11 | # | ||
| 12 | # The module implements bn_GF2m_mul_2x2 polynomial multiplication used | ||
| 13 | # in bn_gf2m.c. It's kind of a low-hanging mechanical port from C for | ||
| 14 | # the time being... Except that it has three code paths: pure integer | ||
| 15 | # code suitable for any x86 CPU, MMX code suitable for PIII and later | ||
| 16 | # and PCLMULQDQ suitable for Westmere and later. Improvement varies | ||
| 17 | # from one benchmark and µ-arch to another. Below are interval values | ||
| 18 | # for 163- and 571-bit ECDH benchmarks relative to compiler-generated | ||
| 19 | # code: | ||
| 20 | # | ||
| 21 | # PIII 16%-30% | ||
| 22 | # P4 12%-12% | ||
| 23 | # Opteron 18%-40% | ||
| 24 | # Core2 19%-44% | ||
| 25 | # Atom 38%-64% | ||
| 26 | # Westmere 53%-121%(PCLMULQDQ)/20%-32%(MMX) | ||
| 27 | # Sandy Bridge 72%-127%(PCLMULQDQ)/27%-23%(MMX) | ||
| 28 | # | ||
| 29 | # Note that the above improvement coefficients are not coefficients for | ||
| 30 | # bn_GF2m_mul_2x2 itself. For example, a 120% ECDH improvement is the result | ||
| 31 | # of bn_GF2m_mul_2x2 being >4x faster. As it gets faster, benchmark | ||
| 32 | # is more and more dominated by other subroutines, most notably by | ||
| 33 | # BN_GF2m_mod[_mul]_arr... | ||
| 34 | |||
| 35 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
| 36 | push(@INC,"${dir}","${dir}../../perlasm"); | ||
| 37 | require "x86asm.pl"; | ||
| 38 | |||
| 39 | &asm_init($ARGV[0],$0,$x86only = $ARGV[$#ARGV] eq "386"); | ||
| 40 | |||
| 41 | $sse2=0; | ||
| 42 | for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } | ||
| 43 | |||
| 44 | &external_label("OPENSSL_ia32cap_P") if ($sse2); | ||
| 45 | |||
| 46 | $a="eax"; | ||
| 47 | $b="ebx"; | ||
| 48 | ($a1,$a2,$a4)=("ecx","edx","ebp"); | ||
| 49 | |||
| 50 | $R="mm0"; | ||
| 51 | @T=("mm1","mm2"); | ||
| 52 | ($A,$B,$B30,$B31)=("mm2","mm3","mm4","mm5"); | ||
| 53 | @i=("esi","edi"); | ||
| 54 | |||
| 55 | if (!$x86only) { | ||
| 56 | &function_begin_B("_mul_1x1_mmx"); | ||
| 57 | &sub ("esp",32+4); | |
| 58 | &mov ($a1,$a); | |
| 59 | &lea ($a2,&DWP(0,$a,$a)); | |
| 60 | &and ($a1,0x3fffffff); | |
| 61 | &lea ($a4,&DWP(0,$a2,$a2)); | |
| 62 | &mov (&DWP(0*4,"esp"),0); | |
| 63 | &and ($a2,0x7fffffff); | |
| 64 | &movd ($A,$a); | |
| 65 | &movd ($B,$b); | |
| 66 | &mov (&DWP(1*4,"esp"),$a1); # tab[1]=a1 (tab[k] is the dword at k*4(%esp)), a1=a&0x3fffffff | |
| 67 | &xor ($a1,$a2); # a1^a2 | |
| 68 | &pxor ($B31,$B31); | |
| 69 | &pxor ($B30,$B30); | |
| 70 | &mov (&DWP(2*4,"esp"),$a2); # tab[2]=a2, a2=(a<<1)&0x7fffffff | |
| 71 | &xor ($a2,$a4); # a2^a4 | |
| 72 | &mov (&DWP(3*4,"esp"),$a1); # tab[3]=a1^a2 | |
| 73 | &pcmpgtd($B31,$A); # broadcast 31st bit of a (signed compare 0>a) | |
| 74 | &paddd ($A,$A); # $A<<=1 | |
| 75 | &xor ($a1,$a2); # a1^a4=a1^a2^a2^a4 | |
| 76 | &mov (&DWP(4*4,"esp"),$a4); # tab[4]=a4, a4=a<<2 | |
| 77 | &xor ($a4,$a2); # $a4 register now holds a2 (a4^a2^a4) | |
| 78 | &pand ($B31,$B); | |
| 79 | &pcmpgtd($B30,$A); # broadcast 30th bit of a (now bit 31 of $A) | |
| 80 | &mov (&DWP(5*4,"esp"),$a1); # tab[5]=a1^a4 | |
| 81 | &xor ($a4,$a1); # a1^a2^a4 | |
| 82 | &psllq ($B31,31); | |
| 83 | &pand ($B30,$B); | |
| 84 | &mov (&DWP(6*4,"esp"),$a2); # tab[6]=a2^a4 | |
| 85 | &mov (@i[0],0x7); | |
| 86 | &mov (&DWP(7*4,"esp"),$a4); # tab[7]=a1^a2^a4 | |
| 87 | &mov ($a4,@i[0]); | |
| 88 | &and (@i[0],$b); | |
| 89 | &shr ($b,3); | |
| 90 | &mov (@i[1],$a4); | |
| 91 | &psllq ($B30,30); | |
| 92 | &and (@i[1],$b); | |
| 93 | &shr ($b,3); | |
| 94 | &movd ($R,&DWP(0,"esp",@i[0],4)); | |
| 95 | &mov (@i[0],$a4); | |
| 96 | &and (@i[0],$b); | |
| 97 | &shr ($b,3); | |
| 98 | for($n=1;$n<9;$n++) { | |
| 99 | &movd (@T[1],&DWP(0,"esp",@i[1],4)); | |
| 100 | &mov (@i[1],$a4); | |
| 101 | &psllq (@T[1],3*$n); | |
| 102 | &and (@i[1],$b); | |
| 103 | &shr ($b,3); | |
| 104 | &pxor ($R,@T[1]); | |
| 105 | |||
| 106 | push(@i,shift(@i)); push(@T,shift(@T)); | |
| 107 | } | |
| 108 | &movd (@T[1],&DWP(0,"esp",@i[1],4)); | |
| 109 | &pxor ($R,$B30); | |
| 110 | &psllq (@T[1],3*$n++); | |
| 111 | &pxor ($R,@T[1]); | |
| 112 | |||
| 113 | &movd (@T[0],&DWP(0,"esp",@i[0],4)); | |
| 114 | &pxor ($R,$B31); | |
| 115 | &psllq (@T[0],3*$n); | |
| 116 | &add ("esp",32+4); | |
| 117 | &pxor ($R,@T[0]); | |
| 118 | &ret (); | |
| 119 | &function_end_B("_mul_1x1_mmx"); | ||
| 120 | } | ||
| 121 | |||
| 122 | ($lo,$hi)=("eax","edx"); | ||
| 123 | @T=("ecx","ebp"); | ||
| 124 | |||
| 125 | &function_begin_B("_mul_1x1_ialu"); | ||
| 126 | &sub ("esp",32+4); | |
| 127 | &mov ($a1,$a); | |
| 128 | &lea ($a2,&DWP(0,$a,$a)); | |
| 129 | &lea ($a4,&DWP(0,"",$a,4)); | |
| 130 | &and ($a1,0x3fffffff); | |
| 131 | &lea (@i[1],&DWP(0,$lo,$lo)); | |
| 132 | &sar ($lo,31); # broadcast 31st bit of a ($lo and $a are the same register) | |
| 133 | &mov (&DWP(0*4,"esp"),0); | |
| 134 | &and ($a2,0x7fffffff); | |
| 135 | &mov (&DWP(1*4,"esp"),$a1); # tab[1]=a1 (tab[k] is the dword at k*4(%esp)), a1=a&0x3fffffff | |
| 136 | &xor ($a1,$a2); # a1^a2 | |
| 137 | &mov (&DWP(2*4,"esp"),$a2); # tab[2]=a2, a2=(a<<1)&0x7fffffff | |
| 138 | &xor ($a2,$a4); # a2^a4 | |
| 139 | &mov (&DWP(3*4,"esp"),$a1); # tab[3]=a1^a2 | |
| 140 | &xor ($a1,$a2); # a1^a4=a1^a2^a2^a4 | |
| 141 | &mov (&DWP(4*4,"esp"),$a4); # tab[4]=a4, a4=a<<2 | |
| 142 | &xor ($a4,$a2); # $a4 register now holds a2 (a4^a2^a4) | |
| 143 | &mov (&DWP(5*4,"esp"),$a1); # tab[5]=a1^a4 | |
| 144 | &xor ($a4,$a1); # a1^a2^a4 | |
| 145 | &sar (@i[1],31); # broadcast 30th bit of a (bit 31 of 2*a) | |
| 146 | &and ($lo,$b); | |
| 147 | &mov (&DWP(6*4,"esp"),$a2); # tab[6]=a2^a4 | |
| 148 | &and (@i[1],$b); | |
| 149 | &mov (&DWP(7*4,"esp"),$a4); # tab[7]=a1^a2^a4 | |
| 150 | &mov ($hi,$lo); | |
| 151 | &shl ($lo,31); | |
| 152 | &mov (@T[0],@i[1]); | |
| 153 | &shr ($hi,1); | |
| 154 | |||
| 155 | &mov (@i[0],0x7); | |
| 156 | &shl (@i[1],30); | |
| 157 | &and (@i[0],$b); | |
| 158 | &shr (@T[0],2); | |
| 159 | &xor ($lo,@i[1]); | |
| 160 | |||
| 161 | &shr ($b,3); | |
| 162 | &mov (@i[1],0x7); # 5-byte instruction!? | |
| 163 | &and (@i[1],$b); | |
| 164 | &shr ($b,3); | |
| 165 | &xor ($hi,@T[0]); | |
| 166 | &xor ($lo,&DWP(0,"esp",@i[0],4)); | |
| 167 | &mov (@i[0],0x7); | |
| 168 | &and (@i[0],$b); | |
| 169 | &shr ($b,3); | |
| 170 | for($n=1;$n<9;$n++) { | |
| 171 | &mov (@T[1],&DWP(0,"esp",@i[1],4)); | |
| 172 | &mov (@i[1],0x7); | |
| 173 | &mov (@T[0],@T[1]); | |
| 174 | &shl (@T[1],3*$n); | |
| 175 | &and (@i[1],$b); | |
| 176 | &shr (@T[0],32-3*$n); | |
| 177 | &xor ($lo,@T[1]); | |
| 178 | &shr ($b,3); | |
| 179 | &xor ($hi,@T[0]); | |
| 180 | |||
| 181 | push(@i,shift(@i)); push(@T,shift(@T)); | |
| 182 | } | |
| 183 | &mov (@T[1],&DWP(0,"esp",@i[1],4)); | |
| 184 | &mov (@T[0],@T[1]); | |
| 185 | &shl (@T[1],3*$n); | |
| 186 | &mov (@i[1],&DWP(0,"esp",@i[0],4)); | |
| 187 | &shr (@T[0],32-3*$n); $n++; | |
| 188 | &mov (@i[0],@i[1]); | |
| 189 | &xor ($lo,@T[1]); | |
| 190 | &shl (@i[1],3*$n); | |
| 191 | &xor ($hi,@T[0]); | |
| 192 | &shr (@i[0],32-3*$n); | |
| 193 | &xor ($lo,@i[1]); | |
| 194 | &xor ($hi,@i[0]); | |
| 195 | |||
| 196 | &add ("esp",32+4); | |
| 197 | &ret (); | |
| 198 | &function_end_B("_mul_1x1_ialu"); | ||
| 199 | |||
| 200 | # void bn_GF2m_mul_2x2(BN_ULONG *r, BN_ULONG a1, BN_ULONG a0, BN_ULONG b1, BN_ULONG b0); writes the four-word carry-less product (a1:a0)·(b1:b0) to r[0..3], using PCLMULQDQ when available, else three 1x1 multiplications (Karatsuba) in MMX or integer code | ||
| 201 | &function_begin_B("bn_GF2m_mul_2x2"); | ||
| 202 | if (!$x86only) { | ||
| 203 | &picmeup("edx","OPENSSL_ia32cap_P"); | |
| 204 | &mov ("eax",&DWP(0,"edx")); | |
| 205 | &mov ("edx",&DWP(4,"edx")); | |
| 206 | &test ("eax",1<<23); # check MMX bit; without it take the integer-only path | |
| 207 | &jz (&label("ialu")); | |
| 208 | if ($sse2) { | ||
| 209 | &test ("eax",1<<24); # check FXSR bit | |
| 210 | &jz (&label("mmx")); | |
| 211 | &test ("edx",1<<1); # check PCLMULQDQ bit; without it fall back to the MMX path | |
| 212 | &jz (&label("mmx")); | |
| 213 | |||
| 214 | &movups ("xmm0",&QWP(8,"esp")); | |
| 215 | &shufps ("xmm0","xmm0",0b10110001); | |
| 216 | &pclmulqdq ("xmm0","xmm0",1); | |
| 217 | &mov ("eax",&DWP(4,"esp")); | |
| 218 | &movups (&QWP(0,"eax"),"xmm0"); | |
| 219 | &ret (); | |
| 220 | |||
| 221 | &set_label("mmx",16); | ||
| 222 | } | ||
| 223 | &push ("ebp"); | |
| 224 | &push ("ebx"); | |
| 225 | &push ("esi"); | |
| 226 | &push ("edi"); | |
| 227 | &mov ($a,&wparam(1)); | |
| 228 | &mov ($b,&wparam(3)); | |
| 229 | &call ("_mul_1x1_mmx"); # a1·b1 (saved in mm7 below) | |
| 230 | &movq ("mm7",$R); | |
| 231 | |||
| 232 | &mov ($a,&wparam(2)); | |
| 233 | &mov ($b,&wparam(4)); | |
| 234 | &call ("_mul_1x1_mmx"); # a0·b0 (saved in mm6 below) | |
| 235 | &movq ("mm6",$R); | |
| 236 | |||
| 237 | &mov ($a,&wparam(1)); | |
| 238 | &mov ($b,&wparam(3)); | |
| 239 | &xor ($a,&wparam(2)); | |
| 240 | &xor ($b,&wparam(4)); | |
| 241 | &call ("_mul_1x1_mmx"); # (a0+a1)·(b0+b1) | |
| 242 | &pxor ($R,"mm7"); | |
| 243 | &mov ($a,&wparam(0)); | |
| 244 | &pxor ($R,"mm6"); # (a0+a1)·(b0+b1)-a1·b1-a0·b0, the middle term; + and - are both XOR here | |
| 245 | |||
| 246 | &movq ($A,$R); | |
| 247 | &psllq ($R,32); | |
| 248 | &pop ("edi"); | |
| 249 | &psrlq ($A,32); | |
| 250 | &pop ("esi"); | |
| 251 | &pxor ($R,"mm6"); | |
| 252 | &pop ("ebx"); | |
| 253 | &pxor ($A,"mm7"); | |
| 254 | &movq (&QWP(0,$a),$R); | |
| 255 | &pop ("ebp"); | |
| 256 | &movq (&QWP(8,$a),$A); | |
| 257 | &emms (); | |
| 258 | &ret (); | |
| 259 | &set_label("ialu",16); | ||
| 260 | } | ||
| 261 | &push ("ebp"); | |
| 262 | &push ("ebx"); | |
| 263 | &push ("esi"); | |
| 264 | &push ("edi"); | |
| 265 | &stack_push(4+1); | |
| 266 | |||
| 267 | &mov ($a,&wparam(1)); | |
| 268 | &mov ($b,&wparam(3)); | |
| 269 | &call ("_mul_1x1_ialu"); # a1·b1 (saved at 8(%esp) and 12(%esp) below) | |
| 270 | &mov (&DWP(8,"esp"),$lo); | |
| 271 | &mov (&DWP(12,"esp"),$hi); | |
| 272 | |||
| 273 | &mov ($a,&wparam(2)); | |
| 274 | &mov ($b,&wparam(4)); | |
| 275 | &call ("_mul_1x1_ialu"); # a0·b0 (saved at 0(%esp) and 4(%esp) below) | |
| 276 | &mov (&DWP(0,"esp"),$lo); | |
| 277 | &mov (&DWP(4,"esp"),$hi); | |
| 278 | |||
| 279 | &mov ($a,&wparam(1)); | |
| 280 | &mov ($b,&wparam(3)); | |
| 281 | &xor ($a,&wparam(2)); | |
| 282 | &xor ($b,&wparam(4)); | |
| 283 | &call ("_mul_1x1_ialu"); # (a0+a1)·(b0+b1) | |
| 284 | |||
| 285 | &mov ("ebp",&wparam(0)); | |
| 286 | @r=("ebx","ecx","edi","esi"); | |
| 287 | &mov (@r[0],&DWP(0,"esp")); | |
| 288 | &mov (@r[1],&DWP(4,"esp")); | |
| 289 | &mov (@r[2],&DWP(8,"esp")); | |
| 290 | &mov (@r[3],&DWP(12,"esp")); | |
| 291 | |||
| 292 | &xor ($lo,$hi); | |
| 293 | &xor ($hi,@r[1]); | |
| 294 | &xor ($lo,@r[0]); | |
| 295 | &mov (&DWP(0,"ebp"),@r[0]); | |
| 296 | &xor ($hi,@r[2]); | |
| 297 | &mov (&DWP(12,"ebp"),@r[3]); | |
| 298 | &xor ($lo,@r[3]); | |
| 299 | &stack_pop(4+1); | |
| 300 | &xor ($hi,@r[3]); | |
| 301 | &pop ("edi"); | |
| 302 | &xor ($lo,$hi); | |
| 303 | &pop ("esi"); | |
| 304 | &mov (&DWP(8,"ebp"),$hi); | |
| 305 | &pop ("ebx"); | |
| 306 | &mov (&DWP(4,"ebp"),$lo); | |
| 307 | &pop ("ebp"); | |
| 308 | &ret (); | |
| 309 | &function_end_B("bn_GF2m_mul_2x2"); | ||
| 310 | |||
| 311 | &asciz ("GF(2^m) Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>"); | ||
| 312 | |||
| 313 | &asm_finish(); | ||
