diff options
| author | bcook <> | 2016-09-03 17:33:16 +0000 |
|---|---|---|
| committer | bcook <> | 2016-09-03 17:33:16 +0000 |
| commit | a368cc5f567b4a1e50fb42285f6e14eb04a5aa87 (patch) | |
| tree | 70c40c7f447bdec56bbdb8315f7182f6e016a20b | |
| parent | c0fcb806af94407aeda5f49a7df9ecf880b3ff57 (diff) | |
| download | openbsd-a368cc5f567b4a1e50fb42285f6e14eb04a5aa87.tar.gz openbsd-a368cc5f567b4a1e50fb42285f6e14eb04a5aa87.tar.bz2 openbsd-a368cc5f567b4a1e50fb42285f6e14eb04a5aa87.zip | |
Switch to a constant-time gather procedure for the amd64 mont5 asm.
Taken from OpenSSL commit 7f98aa7403a1244cf17d1aa489f5bb0f39bae431.
Addresses CVE-2016-0702.
ok beck@
Diffstat (limited to '')
| -rwxr-xr-x | src/lib/libcrypto/bn/asm/x86_64-mont5.pl | 513 |
1 file changed, 314 insertions, 199 deletions
diff --git a/src/lib/libcrypto/bn/asm/x86_64-mont5.pl b/src/lib/libcrypto/bn/asm/x86_64-mont5.pl index 81e5c53728..bb7ad4c4b7 100755 --- a/src/lib/libcrypto/bn/asm/x86_64-mont5.pl +++ b/src/lib/libcrypto/bn/asm/x86_64-mont5.pl | |||
| @@ -66,60 +66,113 @@ bn_mul_mont_gather5: | |||
| 66 | .align 16 | 66 | .align 16 |
| 67 | .Lmul_enter: | 67 | .Lmul_enter: |
| 68 | mov ${num}d,${num}d | 68 | mov ${num}d,${num}d |
| 69 | mov `($win64?56:8)`(%rsp),%r10d # load 7th argument | 69 | movd `($win64?56:8)`(%rsp),%xmm5 # load 7th argument |
| 70 | lea .Linc(%rip),%r10 | ||
| 70 | push %rbx | 71 | push %rbx |
| 71 | push %rbp | 72 | push %rbp |
| 72 | push %r12 | 73 | push %r12 |
| 73 | push %r13 | 74 | push %r13 |
| 74 | push %r14 | 75 | push %r14 |
| 75 | push %r15 | 76 | push %r15 |
| 76 | ___ | 77 | |
| 77 | $code.=<<___ if ($win64); | ||
| 78 | lea -0x28(%rsp),%rsp | ||
| 79 | movaps %xmm6,(%rsp) | ||
| 80 | movaps %xmm7,0x10(%rsp) | ||
| 81 | .Lmul_alloca: | 78 | .Lmul_alloca: |
| 82 | ___ | ||
| 83 | $code.=<<___; | ||
| 84 | mov %rsp,%rax | 79 | mov %rsp,%rax |
| 85 | lea 2($num),%r11 | 80 | lea 2($num),%r11 |
| 86 | neg %r11 | 81 | neg %r11 |
| 87 | lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+2)) | 82 | lea -264(%rsp,%r11,8),%rsp # tp=alloca(8*(num+2)+256+8) |
| 88 | and \$-1024,%rsp # minimize TLB usage | 83 | and \$-1024,%rsp # minimize TLB usage |
| 89 | 84 | ||
| 90 | mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp | 85 | mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp |
| 91 | .Lmul_body: | 86 | .Lmul_body: |
| 92 | mov $bp,%r12 # reassign $bp | 87 | lea 128($bp),%r12 # reassign $bp (+size optimization) |
| 93 | ___ | 88 | ___ |
| 94 | $bp="%r12"; | 89 | $bp="%r12"; |
| 95 | $STRIDE=2**5*8; # 5 is "window size" | 90 | $STRIDE=2**5*8; # 5 is "window size" |
| 96 | $N=$STRIDE/4; # should match cache line size | 91 | $N=$STRIDE/4; # should match cache line size |
| 97 | $code.=<<___; | 92 | $code.=<<___; |
| 98 | mov %r10,%r11 | 93 | movdqa 0(%r10),%xmm0 # 00000001000000010000000000000000 |
| 99 | shr \$`log($N/8)/log(2)`,%r10 | 94 | movdqa 16(%r10),%xmm1 # 00000002000000020000000200000002 |
| 100 | and \$`$N/8-1`,%r11 | 95 | lea 24-112(%rsp,$num,8),%r10# place the mask after tp[num+3] (+ICache optimization) |
| 101 | not %r10 | 96 | and \$-16,%r10 |
| 102 | lea .Lmagic_masks(%rip),%rax | 97 | |
| 103 | and \$`2**5/($N/8)-1`,%r10 # 5 is "window size" | 98 | pshufd \$0,%xmm5,%xmm5 # broadcast index |
| 104 | lea 96($bp,%r11,8),$bp # pointer within 1st cache line | 99 | movdqa %xmm1,%xmm4 |
| 105 | movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which | 100 | movdqa %xmm1,%xmm2 |
| 106 | movq 8(%rax,%r10,8),%xmm5 # cache line contains element | 101 | ___ |
| 107 | movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument | 102 | ######################################################################## |
| 108 | movq 24(%rax,%r10,8),%xmm7 | 103 | # calculate mask by comparing 0..31 to index and save result to stack |
| 109 | 104 | # | |
| 110 | movq `0*$STRIDE/4-96`($bp),%xmm0 | 105 | $code.=<<___; |
| 111 | movq `1*$STRIDE/4-96`($bp),%xmm1 | 106 | paddd %xmm0,%xmm1 |
| 112 | pand %xmm4,%xmm0 | 107 | pcmpeqd %xmm5,%xmm0 # compare to 1,0 |
| 113 | movq `2*$STRIDE/4-96`($bp),%xmm2 | 108 | .byte 0x67 |
| 114 | pand %xmm5,%xmm1 | 109 | movdqa %xmm4,%xmm3 |
| 115 | movq `3*$STRIDE/4-96`($bp),%xmm3 | 110 | ___ |
| 116 | pand %xmm6,%xmm2 | 111 | for($k=0;$k<$STRIDE/16-4;$k+=4) { |
| 117 | por %xmm1,%xmm0 | 112 | $code.=<<___; |
| 118 | pand %xmm7,%xmm3 | 113 | paddd %xmm1,%xmm2 |
| 114 | pcmpeqd %xmm5,%xmm1 # compare to 3,2 | ||
| 115 | movdqa %xmm0,`16*($k+0)+112`(%r10) | ||
| 116 | movdqa %xmm4,%xmm0 | ||
| 117 | |||
| 118 | paddd %xmm2,%xmm3 | ||
| 119 | pcmpeqd %xmm5,%xmm2 # compare to 5,4 | ||
| 120 | movdqa %xmm1,`16*($k+1)+112`(%r10) | ||
| 121 | movdqa %xmm4,%xmm1 | ||
| 122 | |||
| 123 | paddd %xmm3,%xmm0 | ||
| 124 | pcmpeqd %xmm5,%xmm3 # compare to 7,6 | ||
| 125 | movdqa %xmm2,`16*($k+2)+112`(%r10) | ||
| 126 | movdqa %xmm4,%xmm2 | ||
| 127 | |||
| 128 | paddd %xmm0,%xmm1 | ||
| 129 | pcmpeqd %xmm5,%xmm0 | ||
| 130 | movdqa %xmm3,`16*($k+3)+112`(%r10) | ||
| 131 | movdqa %xmm4,%xmm3 | ||
| 132 | ___ | ||
| 133 | } | ||
| 134 | $code.=<<___; # last iteration can be optimized | ||
| 135 | paddd %xmm1,%xmm2 | ||
| 136 | pcmpeqd %xmm5,%xmm1 | ||
| 137 | movdqa %xmm0,`16*($k+0)+112`(%r10) | ||
| 138 | |||
| 139 | paddd %xmm2,%xmm3 | ||
| 140 | .byte 0x67 | ||
| 141 | pcmpeqd %xmm5,%xmm2 | ||
| 142 | movdqa %xmm1,`16*($k+1)+112`(%r10) | ||
| 143 | |||
| 144 | pcmpeqd %xmm5,%xmm3 | ||
| 145 | movdqa %xmm2,`16*($k+2)+112`(%r10) | ||
| 146 | pand `16*($k+0)-128`($bp),%xmm0 # while it's still in register | ||
| 147 | |||
| 148 | pand `16*($k+1)-128`($bp),%xmm1 | ||
| 149 | pand `16*($k+2)-128`($bp),%xmm2 | ||
| 150 | movdqa %xmm3,`16*($k+3)+112`(%r10) | ||
| 151 | pand `16*($k+3)-128`($bp),%xmm3 | ||
| 152 | por %xmm2,%xmm0 | ||
| 153 | por %xmm3,%xmm1 | ||
| 154 | ___ | ||
| 155 | for($k=0;$k<$STRIDE/16-4;$k+=4) { | ||
| 156 | $code.=<<___; | ||
| 157 | movdqa `16*($k+0)-128`($bp),%xmm4 | ||
| 158 | movdqa `16*($k+1)-128`($bp),%xmm5 | ||
| 159 | movdqa `16*($k+2)-128`($bp),%xmm2 | ||
| 160 | pand `16*($k+0)+112`(%r10),%xmm4 | ||
| 161 | movdqa `16*($k+3)-128`($bp),%xmm3 | ||
| 162 | pand `16*($k+1)+112`(%r10),%xmm5 | ||
| 163 | por %xmm4,%xmm0 | ||
| 164 | pand `16*($k+2)+112`(%r10),%xmm2 | ||
| 165 | por %xmm5,%xmm1 | ||
| 166 | pand `16*($k+3)+112`(%r10),%xmm3 | ||
| 119 | por %xmm2,%xmm0 | 167 | por %xmm2,%xmm0 |
| 168 | por %xmm3,%xmm1 | ||
| 169 | ___ | ||
| 170 | } | ||
| 171 | $code.=<<___; | ||
| 172 | por %xmm1,%xmm0 | ||
| 173 | pshufd \$0x4e,%xmm0,%xmm1 | ||
| 174 | por %xmm1,%xmm0 | ||
| 120 | lea $STRIDE($bp),$bp | 175 | lea $STRIDE($bp),$bp |
| 121 | por %xmm3,%xmm0 | ||
| 122 | |||
| 123 | movd %xmm0,$m0 # m0=bp[0] | 176 | movd %xmm0,$m0 # m0=bp[0] |
| 124 | 177 | ||
| 125 | mov ($n0),$n0 # pull n0[0] value | 178 | mov ($n0),$n0 # pull n0[0] value |
| @@ -128,29 +181,14 @@ $code.=<<___; | |||
| 128 | xor $i,$i # i=0 | 181 | xor $i,$i # i=0 |
| 129 | xor $j,$j # j=0 | 182 | xor $j,$j # j=0 |
| 130 | 183 | ||
| 131 | movq `0*$STRIDE/4-96`($bp),%xmm0 | ||
| 132 | movq `1*$STRIDE/4-96`($bp),%xmm1 | ||
| 133 | pand %xmm4,%xmm0 | ||
| 134 | movq `2*$STRIDE/4-96`($bp),%xmm2 | ||
| 135 | pand %xmm5,%xmm1 | ||
| 136 | |||
| 137 | mov $n0,$m1 | 184 | mov $n0,$m1 |
| 138 | mulq $m0 # ap[0]*bp[0] | 185 | mulq $m0 # ap[0]*bp[0] |
| 139 | mov %rax,$lo0 | 186 | mov %rax,$lo0 |
| 140 | mov ($np),%rax | 187 | mov ($np),%rax |
| 141 | 188 | ||
| 142 | movq `3*$STRIDE/4-96`($bp),%xmm3 | ||
| 143 | pand %xmm6,%xmm2 | ||
| 144 | por %xmm1,%xmm0 | ||
| 145 | pand %xmm7,%xmm3 | ||
| 146 | |||
| 147 | imulq $lo0,$m1 # "tp[0]"*n0 | 189 | imulq $lo0,$m1 # "tp[0]"*n0 |
| 148 | mov %rdx,$hi0 | 190 | mov %rdx,$hi0 |
| 149 | 191 | ||
| 150 | por %xmm2,%xmm0 | ||
| 151 | lea $STRIDE($bp),$bp | ||
| 152 | por %xmm3,%xmm0 | ||
| 153 | |||
| 154 | mulq $m1 # np[0]*m1 | 192 | mulq $m1 # np[0]*m1 |
| 155 | add %rax,$lo0 # discarded | 193 | add %rax,$lo0 # discarded |
| 156 | mov 8($ap),%rax | 194 | mov 8($ap),%rax |
| @@ -183,8 +221,6 @@ $code.=<<___; | |||
| 183 | cmp $num,$j | 221 | cmp $num,$j |
| 184 | jl .L1st | 222 | jl .L1st |
| 185 | 223 | ||
| 186 | movd %xmm0,$m0 # bp[1] | ||
| 187 | |||
| 188 | add %rax,$hi1 | 224 | add %rax,$hi1 |
| 189 | mov ($ap),%rax # ap[0] | 225 | mov ($ap),%rax # ap[0] |
| 190 | adc \$0,%rdx | 226 | adc \$0,%rdx |
| @@ -204,33 +240,46 @@ $code.=<<___; | |||
| 204 | jmp .Louter | 240 | jmp .Louter |
| 205 | .align 16 | 241 | .align 16 |
| 206 | .Louter: | 242 | .Louter: |
| 243 | lea 24+128(%rsp,$num,8),%rdx # where 256-byte mask is (+size optimization) | ||
| 244 | and \$-16,%rdx | ||
| 245 | pxor %xmm4,%xmm4 | ||
| 246 | pxor %xmm5,%xmm5 | ||
| 247 | ___ | ||
| 248 | for($k=0;$k<$STRIDE/16;$k+=4) { | ||
| 249 | $code.=<<___; | ||
| 250 | movdqa `16*($k+0)-128`($bp),%xmm0 | ||
| 251 | movdqa `16*($k+1)-128`($bp),%xmm1 | ||
| 252 | movdqa `16*($k+2)-128`($bp),%xmm2 | ||
| 253 | movdqa `16*($k+3)-128`($bp),%xmm3 | ||
| 254 | pand `16*($k+0)-128`(%rdx),%xmm0 | ||
| 255 | pand `16*($k+1)-128`(%rdx),%xmm1 | ||
| 256 | por %xmm0,%xmm4 | ||
| 257 | pand `16*($k+2)-128`(%rdx),%xmm2 | ||
| 258 | por %xmm1,%xmm5 | ||
| 259 | pand `16*($k+3)-128`(%rdx),%xmm3 | ||
| 260 | por %xmm2,%xmm4 | ||
| 261 | por %xmm3,%xmm5 | ||
| 262 | ___ | ||
| 263 | } | ||
| 264 | $code.=<<___; | ||
| 265 | por %xmm5,%xmm4 | ||
| 266 | pshufd \$0x4e,%xmm4,%xmm0 | ||
| 267 | por %xmm4,%xmm0 | ||
| 268 | lea $STRIDE($bp),$bp | ||
| 269 | movd %xmm0,$m0 # m0=bp[i] | ||
| 270 | |||
| 207 | xor $j,$j # j=0 | 271 | xor $j,$j # j=0 |
| 208 | mov $n0,$m1 | 272 | mov $n0,$m1 |
| 209 | mov (%rsp),$lo0 | 273 | mov (%rsp),$lo0 |
| 210 | 274 | ||
| 211 | movq `0*$STRIDE/4-96`($bp),%xmm0 | ||
| 212 | movq `1*$STRIDE/4-96`($bp),%xmm1 | ||
| 213 | pand %xmm4,%xmm0 | ||
| 214 | movq `2*$STRIDE/4-96`($bp),%xmm2 | ||
| 215 | pand %xmm5,%xmm1 | ||
| 216 | |||
| 217 | mulq $m0 # ap[0]*bp[i] | 275 | mulq $m0 # ap[0]*bp[i] |
| 218 | add %rax,$lo0 # ap[0]*bp[i]+tp[0] | 276 | add %rax,$lo0 # ap[0]*bp[i]+tp[0] |
| 219 | mov ($np),%rax | 277 | mov ($np),%rax |
| 220 | adc \$0,%rdx | 278 | adc \$0,%rdx |
| 221 | 279 | ||
| 222 | movq `3*$STRIDE/4-96`($bp),%xmm3 | ||
| 223 | pand %xmm6,%xmm2 | ||
| 224 | por %xmm1,%xmm0 | ||
| 225 | pand %xmm7,%xmm3 | ||
| 226 | |||
| 227 | imulq $lo0,$m1 # tp[0]*n0 | 280 | imulq $lo0,$m1 # tp[0]*n0 |
| 228 | mov %rdx,$hi0 | 281 | mov %rdx,$hi0 |
| 229 | 282 | ||
| 230 | por %xmm2,%xmm0 | ||
| 231 | lea $STRIDE($bp),$bp | ||
| 232 | por %xmm3,%xmm0 | ||
| 233 | |||
| 234 | mulq $m1 # np[0]*m1 | 283 | mulq $m1 # np[0]*m1 |
| 235 | add %rax,$lo0 # discarded | 284 | add %rax,$lo0 # discarded |
| 236 | mov 8($ap),%rax | 285 | mov 8($ap),%rax |
| @@ -266,8 +315,6 @@ $code.=<<___; | |||
| 266 | cmp $num,$j | 315 | cmp $num,$j |
| 267 | jl .Linner | 316 | jl .Linner |
| 268 | 317 | ||
| 269 | movd %xmm0,$m0 # bp[i+1] | ||
| 270 | |||
| 271 | add %rax,$hi1 | 318 | add %rax,$hi1 |
| 272 | mov ($ap),%rax # ap[0] | 319 | mov ($ap),%rax # ap[0] |
| 273 | adc \$0,%rdx | 320 | adc \$0,%rdx |
| @@ -321,13 +368,7 @@ $code.=<<___; | |||
| 321 | 368 | ||
| 322 | mov 8(%rsp,$num,8),%rsi # restore %rsp | 369 | mov 8(%rsp,$num,8),%rsi # restore %rsp |
| 323 | mov \$1,%rax | 370 | mov \$1,%rax |
| 324 | ___ | 371 | |
| 325 | $code.=<<___ if ($win64); | ||
| 326 | movaps (%rsi),%xmm6 | ||
| 327 | movaps 0x10(%rsi),%xmm7 | ||
| 328 | lea 0x28(%rsi),%rsi | ||
| 329 | ___ | ||
| 330 | $code.=<<___; | ||
| 331 | mov (%rsi),%r15 | 372 | mov (%rsi),%r15 |
| 332 | mov 8(%rsi),%r14 | 373 | mov 8(%rsi),%r14 |
| 333 | mov 16(%rsi),%r13 | 374 | mov 16(%rsi),%r13 |
| @@ -348,91 +389,130 @@ $code.=<<___; | |||
| 348 | bn_mul4x_mont_gather5: | 389 | bn_mul4x_mont_gather5: |
| 349 | .Lmul4x_enter: | 390 | .Lmul4x_enter: |
| 350 | mov ${num}d,${num}d | 391 | mov ${num}d,${num}d |
| 351 | mov `($win64?56:8)`(%rsp),%r10d # load 7th argument | 392 | movd `($win64?56:8)`(%rsp),%xmm5 # load 7th argument |
| 393 | lea .Linc(%rip),%r10 | ||
| 352 | push %rbx | 394 | push %rbx |
| 353 | push %rbp | 395 | push %rbp |
| 354 | push %r12 | 396 | push %r12 |
| 355 | push %r13 | 397 | push %r13 |
| 356 | push %r14 | 398 | push %r14 |
| 357 | push %r15 | 399 | push %r15 |
| 358 | ___ | 400 | |
| 359 | $code.=<<___ if ($win64); | ||
| 360 | lea -0x28(%rsp),%rsp | ||
| 361 | movaps %xmm6,(%rsp) | ||
| 362 | movaps %xmm7,0x10(%rsp) | ||
| 363 | .Lmul4x_alloca: | 401 | .Lmul4x_alloca: |
| 364 | ___ | ||
| 365 | $code.=<<___; | ||
| 366 | mov %rsp,%rax | 402 | mov %rsp,%rax |
| 367 | lea 4($num),%r11 | 403 | lea 4($num),%r11 |
| 368 | neg %r11 | 404 | neg %r11 |
| 369 | lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+4)) | 405 | lea -256(%rsp,%r11,8),%rsp # tp=alloca(8*(num+4)+256) |
| 370 | and \$-1024,%rsp # minimize TLB usage | 406 | and \$-1024,%rsp # minimize TLB usage |
| 371 | 407 | ||
| 372 | mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp | 408 | mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp |
| 373 | .Lmul4x_body: | 409 | .Lmul4x_body: |
| 374 | mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp | 410 | mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp |
| 375 | mov %rdx,%r12 # reassign $bp | 411 | lea 128(%rdx),%r12 # reassign $bp (+size optimization) |
| 376 | ___ | 412 | ___ |
| 377 | $bp="%r12"; | 413 | $bp="%r12"; |
| 378 | $STRIDE=2**5*8; # 5 is "window size" | 414 | $STRIDE=2**5*8; # 5 is "window size" |
| 379 | $N=$STRIDE/4; # should match cache line size | 415 | $N=$STRIDE/4; # should match cache line size |
| 380 | $code.=<<___; | 416 | $code.=<<___; |
| 381 | mov %r10,%r11 | 417 | movdqa 0(%r10),%xmm0 # 00000001000000010000000000000000 |
| 382 | shr \$`log($N/8)/log(2)`,%r10 | 418 | movdqa 16(%r10),%xmm1 # 00000002000000020000000200000002 |
| 383 | and \$`$N/8-1`,%r11 | 419 | lea 32-112(%rsp,$num,8),%r10# place the mask after tp[num+4] (+ICache optimization) |
| 384 | not %r10 | 420 | |
| 385 | lea .Lmagic_masks(%rip),%rax | 421 | pshufd \$0,%xmm5,%xmm5 # broadcast index |
| 386 | and \$`2**5/($N/8)-1`,%r10 # 5 is "window size" | 422 | movdqa %xmm1,%xmm4 |
| 387 | lea 96($bp,%r11,8),$bp # pointer within 1st cache line | 423 | .byte 0x67,0x67 |
| 388 | movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which | 424 | movdqa %xmm1,%xmm2 |
| 389 | movq 8(%rax,%r10,8),%xmm5 # cache line contains element | 425 | ___ |
| 390 | movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument | 426 | ######################################################################## |
| 391 | movq 24(%rax,%r10,8),%xmm7 | 427 | # calculate mask by comparing 0..31 to index and save result to stack |
| 392 | 428 | # | |
| 393 | movq `0*$STRIDE/4-96`($bp),%xmm0 | 429 | $code.=<<___; |
| 394 | movq `1*$STRIDE/4-96`($bp),%xmm1 | 430 | paddd %xmm0,%xmm1 |
| 395 | pand %xmm4,%xmm0 | 431 | pcmpeqd %xmm5,%xmm0 # compare to 1,0 |
| 396 | movq `2*$STRIDE/4-96`($bp),%xmm2 | 432 | .byte 0x67 |
| 397 | pand %xmm5,%xmm1 | 433 | movdqa %xmm4,%xmm3 |
| 398 | movq `3*$STRIDE/4-96`($bp),%xmm3 | 434 | ___ |
| 399 | pand %xmm6,%xmm2 | 435 | for($k=0;$k<$STRIDE/16-4;$k+=4) { |
| 400 | por %xmm1,%xmm0 | 436 | $code.=<<___; |
| 401 | pand %xmm7,%xmm3 | 437 | paddd %xmm1,%xmm2 |
| 438 | pcmpeqd %xmm5,%xmm1 # compare to 3,2 | ||
| 439 | movdqa %xmm0,`16*($k+0)+112`(%r10) | ||
| 440 | movdqa %xmm4,%xmm0 | ||
| 441 | |||
| 442 | paddd %xmm2,%xmm3 | ||
| 443 | pcmpeqd %xmm5,%xmm2 # compare to 5,4 | ||
| 444 | movdqa %xmm1,`16*($k+1)+112`(%r10) | ||
| 445 | movdqa %xmm4,%xmm1 | ||
| 446 | |||
| 447 | paddd %xmm3,%xmm0 | ||
| 448 | pcmpeqd %xmm5,%xmm3 # compare to 7,6 | ||
| 449 | movdqa %xmm2,`16*($k+2)+112`(%r10) | ||
| 450 | movdqa %xmm4,%xmm2 | ||
| 451 | |||
| 452 | paddd %xmm0,%xmm1 | ||
| 453 | pcmpeqd %xmm5,%xmm0 | ||
| 454 | movdqa %xmm3,`16*($k+3)+112`(%r10) | ||
| 455 | movdqa %xmm4,%xmm3 | ||
| 456 | ___ | ||
| 457 | } | ||
| 458 | $code.=<<___; # last iteration can be optimized | ||
| 459 | paddd %xmm1,%xmm2 | ||
| 460 | pcmpeqd %xmm5,%xmm1 | ||
| 461 | movdqa %xmm0,`16*($k+0)+112`(%r10) | ||
| 462 | |||
| 463 | paddd %xmm2,%xmm3 | ||
| 464 | .byte 0x67 | ||
| 465 | pcmpeqd %xmm5,%xmm2 | ||
| 466 | movdqa %xmm1,`16*($k+1)+112`(%r10) | ||
| 467 | |||
| 468 | pcmpeqd %xmm5,%xmm3 | ||
| 469 | movdqa %xmm2,`16*($k+2)+112`(%r10) | ||
| 470 | pand `16*($k+0)-128`($bp),%xmm0 # while it's still in register | ||
| 471 | |||
| 472 | pand `16*($k+1)-128`($bp),%xmm1 | ||
| 473 | pand `16*($k+2)-128`($bp),%xmm2 | ||
| 474 | movdqa %xmm3,`16*($k+3)+112`(%r10) | ||
| 475 | pand `16*($k+3)-128`($bp),%xmm3 | ||
| 476 | por %xmm2,%xmm0 | ||
| 477 | por %xmm3,%xmm1 | ||
| 478 | ___ | ||
| 479 | for($k=0;$k<$STRIDE/16-4;$k+=4) { | ||
| 480 | $code.=<<___; | ||
| 481 | movdqa `16*($k+0)-128`($bp),%xmm4 | ||
| 482 | movdqa `16*($k+1)-128`($bp),%xmm5 | ||
| 483 | movdqa `16*($k+2)-128`($bp),%xmm2 | ||
| 484 | pand `16*($k+0)+112`(%r10),%xmm4 | ||
| 485 | movdqa `16*($k+3)-128`($bp),%xmm3 | ||
| 486 | pand `16*($k+1)+112`(%r10),%xmm5 | ||
| 487 | por %xmm4,%xmm0 | ||
| 488 | pand `16*($k+2)+112`(%r10),%xmm2 | ||
| 489 | por %xmm5,%xmm1 | ||
| 490 | pand `16*($k+3)+112`(%r10),%xmm3 | ||
| 402 | por %xmm2,%xmm0 | 491 | por %xmm2,%xmm0 |
| 492 | por %xmm3,%xmm1 | ||
| 493 | ___ | ||
| 494 | } | ||
| 495 | $code.=<<___; | ||
| 496 | por %xmm1,%xmm0 | ||
| 497 | pshufd \$0x4e,%xmm0,%xmm1 | ||
| 498 | por %xmm1,%xmm0 | ||
| 403 | lea $STRIDE($bp),$bp | 499 | lea $STRIDE($bp),$bp |
| 404 | por %xmm3,%xmm0 | ||
| 405 | |||
| 406 | movd %xmm0,$m0 # m0=bp[0] | 500 | movd %xmm0,$m0 # m0=bp[0] |
| 501 | |||
| 407 | mov ($n0),$n0 # pull n0[0] value | 502 | mov ($n0),$n0 # pull n0[0] value |
| 408 | mov ($ap),%rax | 503 | mov ($ap),%rax |
| 409 | 504 | ||
| 410 | xor $i,$i # i=0 | 505 | xor $i,$i # i=0 |
| 411 | xor $j,$j # j=0 | 506 | xor $j,$j # j=0 |
| 412 | 507 | ||
| 413 | movq `0*$STRIDE/4-96`($bp),%xmm0 | ||
| 414 | movq `1*$STRIDE/4-96`($bp),%xmm1 | ||
| 415 | pand %xmm4,%xmm0 | ||
| 416 | movq `2*$STRIDE/4-96`($bp),%xmm2 | ||
| 417 | pand %xmm5,%xmm1 | ||
| 418 | |||
| 419 | mov $n0,$m1 | 508 | mov $n0,$m1 |
| 420 | mulq $m0 # ap[0]*bp[0] | 509 | mulq $m0 # ap[0]*bp[0] |
| 421 | mov %rax,$A[0] | 510 | mov %rax,$A[0] |
| 422 | mov ($np),%rax | 511 | mov ($np),%rax |
| 423 | 512 | ||
| 424 | movq `3*$STRIDE/4-96`($bp),%xmm3 | ||
| 425 | pand %xmm6,%xmm2 | ||
| 426 | por %xmm1,%xmm0 | ||
| 427 | pand %xmm7,%xmm3 | ||
| 428 | |||
| 429 | imulq $A[0],$m1 # "tp[0]"*n0 | 513 | imulq $A[0],$m1 # "tp[0]"*n0 |
| 430 | mov %rdx,$A[1] | 514 | mov %rdx,$A[1] |
| 431 | 515 | ||
| 432 | por %xmm2,%xmm0 | ||
| 433 | lea $STRIDE($bp),$bp | ||
| 434 | por %xmm3,%xmm0 | ||
| 435 | |||
| 436 | mulq $m1 # np[0]*m1 | 516 | mulq $m1 # np[0]*m1 |
| 437 | add %rax,$A[0] # discarded | 517 | add %rax,$A[0] # discarded |
| 438 | mov 8($ap),%rax | 518 | mov 8($ap),%rax |
| @@ -550,8 +630,6 @@ $code.=<<___; | |||
| 550 | mov $N[1],-16(%rsp,$j,8) # tp[j-1] | 630 | mov $N[1],-16(%rsp,$j,8) # tp[j-1] |
| 551 | mov %rdx,$N[0] | 631 | mov %rdx,$N[0] |
| 552 | 632 | ||
| 553 | movd %xmm0,$m0 # bp[1] | ||
| 554 | |||
| 555 | xor $N[1],$N[1] | 633 | xor $N[1],$N[1] |
| 556 | add $A[0],$N[0] | 634 | add $A[0],$N[0] |
| 557 | adc \$0,$N[1] | 635 | adc \$0,$N[1] |
| @@ -561,12 +639,34 @@ $code.=<<___; | |||
| 561 | lea 1($i),$i # i++ | 639 | lea 1($i),$i # i++ |
| 562 | .align 4 | 640 | .align 4 |
| 563 | .Louter4x: | 641 | .Louter4x: |
| 642 | lea 32+128(%rsp,$num,8),%rdx # where 256-byte mask is (+size optimization) | ||
| 643 | pxor %xmm4,%xmm4 | ||
| 644 | pxor %xmm5,%xmm5 | ||
| 645 | ___ | ||
| 646 | for($k=0;$k<$STRIDE/16;$k+=4) { | ||
| 647 | $code.=<<___; | ||
| 648 | movdqa `16*($k+0)-128`($bp),%xmm0 | ||
| 649 | movdqa `16*($k+1)-128`($bp),%xmm1 | ||
| 650 | movdqa `16*($k+2)-128`($bp),%xmm2 | ||
| 651 | movdqa `16*($k+3)-128`($bp),%xmm3 | ||
| 652 | pand `16*($k+0)-128`(%rdx),%xmm0 | ||
| 653 | pand `16*($k+1)-128`(%rdx),%xmm1 | ||
| 654 | por %xmm0,%xmm4 | ||
| 655 | pand `16*($k+2)-128`(%rdx),%xmm2 | ||
| 656 | por %xmm1,%xmm5 | ||
| 657 | pand `16*($k+3)-128`(%rdx),%xmm3 | ||
| 658 | por %xmm2,%xmm4 | ||
| 659 | por %xmm3,%xmm5 | ||
| 660 | ___ | ||
| 661 | } | ||
| 662 | $code.=<<___; | ||
| 663 | por %xmm5,%xmm4 | ||
| 664 | pshufd \$0x4e,%xmm4,%xmm0 | ||
| 665 | por %xmm4,%xmm0 | ||
| 666 | lea $STRIDE($bp),$bp | ||
| 667 | movd %xmm0,$m0 # m0=bp[i] | ||
| 668 | |||
| 564 | xor $j,$j # j=0 | 669 | xor $j,$j # j=0 |
| 565 | movq `0*$STRIDE/4-96`($bp),%xmm0 | ||
| 566 | movq `1*$STRIDE/4-96`($bp),%xmm1 | ||
| 567 | pand %xmm4,%xmm0 | ||
| 568 | movq `2*$STRIDE/4-96`($bp),%xmm2 | ||
| 569 | pand %xmm5,%xmm1 | ||
| 570 | 670 | ||
| 571 | mov (%rsp),$A[0] | 671 | mov (%rsp),$A[0] |
| 572 | mov $n0,$m1 | 672 | mov $n0,$m1 |
| @@ -575,18 +675,9 @@ $code.=<<___; | |||
| 575 | mov ($np),%rax | 675 | mov ($np),%rax |
| 576 | adc \$0,%rdx | 676 | adc \$0,%rdx |
| 577 | 677 | ||
| 578 | movq `3*$STRIDE/4-96`($bp),%xmm3 | ||
| 579 | pand %xmm6,%xmm2 | ||
| 580 | por %xmm1,%xmm0 | ||
| 581 | pand %xmm7,%xmm3 | ||
| 582 | |||
| 583 | imulq $A[0],$m1 # tp[0]*n0 | 678 | imulq $A[0],$m1 # tp[0]*n0 |
| 584 | mov %rdx,$A[1] | 679 | mov %rdx,$A[1] |
| 585 | 680 | ||
| 586 | por %xmm2,%xmm0 | ||
| 587 | lea $STRIDE($bp),$bp | ||
| 588 | por %xmm3,%xmm0 | ||
| 589 | |||
| 590 | mulq $m1 # np[0]*m1 | 681 | mulq $m1 # np[0]*m1 |
| 591 | add %rax,$A[0] # "$N[0]", discarded | 682 | add %rax,$A[0] # "$N[0]", discarded |
| 592 | mov 8($ap),%rax | 683 | mov 8($ap),%rax |
| @@ -718,7 +809,6 @@ $code.=<<___; | |||
| 718 | mov $N[0],-24(%rsp,$j,8) # tp[j-1] | 809 | mov $N[0],-24(%rsp,$j,8) # tp[j-1] |
| 719 | mov %rdx,$N[0] | 810 | mov %rdx,$N[0] |
| 720 | 811 | ||
| 721 | movd %xmm0,$m0 # bp[i+1] | ||
| 722 | mov $N[1],-16(%rsp,$j,8) # tp[j-1] | 812 | mov $N[1],-16(%rsp,$j,8) # tp[j-1] |
| 723 | 813 | ||
| 724 | xor $N[1],$N[1] | 814 | xor $N[1],$N[1] |
| @@ -809,13 +899,7 @@ ___ | |||
| 809 | $code.=<<___; | 899 | $code.=<<___; |
| 810 | mov 8(%rsp,$num,8),%rsi # restore %rsp | 900 | mov 8(%rsp,$num,8),%rsi # restore %rsp |
| 811 | mov \$1,%rax | 901 | mov \$1,%rax |
| 812 | ___ | 902 | |
| 813 | $code.=<<___ if ($win64); | ||
| 814 | movaps (%rsi),%xmm6 | ||
| 815 | movaps 0x10(%rsi),%xmm7 | ||
| 816 | lea 0x28(%rsi),%rsi | ||
| 817 | ___ | ||
| 818 | $code.=<<___; | ||
| 819 | mov (%rsi),%r15 | 903 | mov (%rsi),%r15 |
| 820 | mov 8(%rsi),%r14 | 904 | mov 8(%rsi),%r14 |
| 821 | mov 16(%rsi),%r13 | 905 | mov 16(%rsi),%r13 |
| @@ -830,8 +914,8 @@ ___ | |||
| 830 | }}} | 914 | }}} |
| 831 | 915 | ||
| 832 | { | 916 | { |
| 833 | my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order | 917 | my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9d") : # Win64 order |
| 834 | ("%rdi","%rsi","%rdx","%rcx"); # Unix order | 918 | ("%rdi","%rsi","%rdx","%ecx"); # Unix order |
| 835 | my $out=$inp; | 919 | my $out=$inp; |
| 836 | my $STRIDE=2**5*8; | 920 | my $STRIDE=2**5*8; |
| 837 | my $N=$STRIDE/4; | 921 | my $N=$STRIDE/4; |
| @@ -859,53 +943,89 @@ bn_scatter5: | |||
| 859 | .type bn_gather5,\@abi-omnipotent | 943 | .type bn_gather5,\@abi-omnipotent |
| 860 | .align 16 | 944 | .align 16 |
| 861 | bn_gather5: | 945 | bn_gather5: |
| 862 | ___ | 946 | .LSEH_begin_bn_gather5: # Win64 thing, but harmless in other cases |
| 863 | $code.=<<___ if ($win64); | ||
| 864 | .LSEH_begin_bn_gather5: | ||
| 865 | # I can't trust assembler to use specific encoding:-( | 947 | # I can't trust assembler to use specific encoding:-( |
| 866 | .byte 0x48,0x83,0xec,0x28 #sub \$0x28,%rsp | 948 | .byte 0x4c,0x8d,0x14,0x24 # lea (%rsp),%r10 |
| 867 | .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp) | 949 | .byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 # sub $0x108,%rsp |
| 868 | .byte 0x0f,0x29,0x7c,0x24,0x10 #movdqa %xmm7,0x10(%rsp) | 950 | lea .Linc(%rip),%rax |
| 951 | and \$-16,%rsp # shouldn't be formally required | ||
| 952 | |||
| 953 | movd $idx,%xmm5 | ||
| 954 | movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 | ||
| 955 | movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 | ||
| 956 | lea 128($tbl),%r11 # size optimization | ||
| 957 | lea 128(%rsp),%rax # size optimization | ||
| 958 | |||
| 959 | pshufd \$0,%xmm5,%xmm5 # broadcast $idx | ||
| 960 | movdqa %xmm1,%xmm4 | ||
| 961 | movdqa %xmm1,%xmm2 | ||
| 869 | ___ | 962 | ___ |
| 963 | ######################################################################## | ||
| 964 | # calculate mask by comparing 0..31 to $idx and save result to stack | ||
| 965 | # | ||
| 966 | for($i=0;$i<$STRIDE/16;$i+=4) { | ||
| 967 | $code.=<<___; | ||
| 968 | paddd %xmm0,%xmm1 | ||
| 969 | pcmpeqd %xmm5,%xmm0 # compare to 1,0 | ||
| 970 | ___ | ||
| 971 | $code.=<<___ if ($i); | ||
| 972 | movdqa %xmm3,`16*($i-1)-128`(%rax) | ||
| 973 | ___ | ||
| 974 | $code.=<<___; | ||
| 975 | movdqa %xmm4,%xmm3 | ||
| 976 | |||
| 977 | paddd %xmm1,%xmm2 | ||
| 978 | pcmpeqd %xmm5,%xmm1 # compare to 3,2 | ||
| 979 | movdqa %xmm0,`16*($i+0)-128`(%rax) | ||
| 980 | movdqa %xmm4,%xmm0 | ||
| 981 | |||
| 982 | paddd %xmm2,%xmm3 | ||
| 983 | pcmpeqd %xmm5,%xmm2 # compare to 5,4 | ||
| 984 | movdqa %xmm1,`16*($i+1)-128`(%rax) | ||
| 985 | movdqa %xmm4,%xmm1 | ||
| 986 | |||
| 987 | paddd %xmm3,%xmm0 | ||
| 988 | pcmpeqd %xmm5,%xmm3 # compare to 7,6 | ||
| 989 | movdqa %xmm2,`16*($i+2)-128`(%rax) | ||
| 990 | movdqa %xmm4,%xmm2 | ||
| 991 | ___ | ||
| 992 | } | ||
| 870 | $code.=<<___; | 993 | $code.=<<___; |
| 871 | mov $idx,%r11 | 994 | movdqa %xmm3,`16*($i-1)-128`(%rax) |
| 872 | shr \$`log($N/8)/log(2)`,$idx | ||
| 873 | and \$`$N/8-1`,%r11 | ||
| 874 | not $idx | ||
| 875 | lea .Lmagic_masks(%rip),%rax | ||
| 876 | and \$`2**5/($N/8)-1`,$idx # 5 is "window size" | ||
| 877 | lea 96($tbl,%r11,8),$tbl # pointer within 1st cache line | ||
| 878 | movq 0(%rax,$idx,8),%xmm4 # set of masks denoting which | ||
| 879 | movq 8(%rax,$idx,8),%xmm5 # cache line contains element | ||
| 880 | movq 16(%rax,$idx,8),%xmm6 # denoted by 7th argument | ||
| 881 | movq 24(%rax,$idx,8),%xmm7 | ||
| 882 | jmp .Lgather | 995 | jmp .Lgather |
| 883 | .align 16 | ||
| 884 | .Lgather: | ||
| 885 | movq `0*$STRIDE/4-96`($tbl),%xmm0 | ||
| 886 | movq `1*$STRIDE/4-96`($tbl),%xmm1 | ||
| 887 | pand %xmm4,%xmm0 | ||
| 888 | movq `2*$STRIDE/4-96`($tbl),%xmm2 | ||
| 889 | pand %xmm5,%xmm1 | ||
| 890 | movq `3*$STRIDE/4-96`($tbl),%xmm3 | ||
| 891 | pand %xmm6,%xmm2 | ||
| 892 | por %xmm1,%xmm0 | ||
| 893 | pand %xmm7,%xmm3 | ||
| 894 | por %xmm2,%xmm0 | ||
| 895 | lea $STRIDE($tbl),$tbl | ||
| 896 | por %xmm3,%xmm0 | ||
| 897 | 996 | ||
| 997 | .align 32 | ||
| 998 | .Lgather: | ||
| 999 | pxor %xmm4,%xmm4 | ||
| 1000 | pxor %xmm5,%xmm5 | ||
| 1001 | ___ | ||
| 1002 | for($i=0;$i<$STRIDE/16;$i+=4) { | ||
| 1003 | $code.=<<___; | ||
| 1004 | movdqa `16*($i+0)-128`(%r11),%xmm0 | ||
| 1005 | movdqa `16*($i+1)-128`(%r11),%xmm1 | ||
| 1006 | movdqa `16*($i+2)-128`(%r11),%xmm2 | ||
| 1007 | pand `16*($i+0)-128`(%rax),%xmm0 | ||
| 1008 | movdqa `16*($i+3)-128`(%r11),%xmm3 | ||
| 1009 | pand `16*($i+1)-128`(%rax),%xmm1 | ||
| 1010 | por %xmm0,%xmm4 | ||
| 1011 | pand `16*($i+2)-128`(%rax),%xmm2 | ||
| 1012 | por %xmm1,%xmm5 | ||
| 1013 | pand `16*($i+3)-128`(%rax),%xmm3 | ||
| 1014 | por %xmm2,%xmm4 | ||
| 1015 | por %xmm3,%xmm5 | ||
| 1016 | ___ | ||
| 1017 | } | ||
| 1018 | $code.=<<___; | ||
| 1019 | por %xmm5,%xmm4 | ||
| 1020 | lea $STRIDE(%r11),%r11 | ||
| 1021 | pshufd \$0x4e,%xmm4,%xmm0 | ||
| 1022 | por %xmm4,%xmm0 | ||
| 898 | movq %xmm0,($out) # m0=bp[0] | 1023 | movq %xmm0,($out) # m0=bp[0] |
| 899 | lea 8($out),$out | 1024 | lea 8($out),$out |
| 900 | sub \$1,$num | 1025 | sub \$1,$num |
| 901 | jnz .Lgather | 1026 | jnz .Lgather |
| 902 | ___ | 1027 | |
| 903 | $code.=<<___ if ($win64); | 1028 | lea (%r10),%rsp |
| 904 | movaps (%rsp),%xmm6 | ||
| 905 | movaps 0x10(%rsp),%xmm7 | ||
| 906 | lea 0x28(%rsp),%rsp | ||
| 907 | ___ | ||
| 908 | $code.=<<___; | ||
| 909 | ret | 1029 | ret |
| 910 | .LSEH_end_bn_gather5: | 1030 | .LSEH_end_bn_gather5: |
| 911 | .size bn_gather5,.-bn_gather5 | 1031 | .size bn_gather5,.-bn_gather5 |
| @@ -913,9 +1033,9 @@ ___ | |||
| 913 | } | 1033 | } |
| 914 | $code.=<<___; | 1034 | $code.=<<___; |
| 915 | .align 64 | 1035 | .align 64 |
| 916 | .Lmagic_masks: | 1036 | .Linc: |
| 917 | .long 0,0, 0,0, 0,0, -1,-1 | 1037 | .long 0,0, 1,1 |
| 918 | .long 0,0, 0,0, 0,0, 0,0 | 1038 | .long 2,2, 2,2 |
| 919 | .asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>" | 1039 | .asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>" |
| 920 | ___ | 1040 | ___ |
| 921 | 1041 | ||
| @@ -954,7 +1074,7 @@ mul_handler: | |||
| 954 | cmp %r10,%rbx # context->Rip<end of prologue label | 1074 | cmp %r10,%rbx # context->Rip<end of prologue label |
| 955 | jb .Lcommon_seh_tail | 1075 | jb .Lcommon_seh_tail |
| 956 | 1076 | ||
| 957 | lea `40+48`(%rax),%rax | 1077 | lea 48(%rax),%rax |
| 958 | 1078 | ||
| 959 | mov 4(%r11),%r10d # HandlerData[1] | 1079 | mov 4(%r11),%r10d # HandlerData[1] |
| 960 | lea (%rsi,%r10),%r10 # end of alloca label | 1080 | lea (%rsi,%r10),%r10 # end of alloca label |
| @@ -971,9 +1091,7 @@ mul_handler: | |||
| 971 | mov 192($context),%r10 # pull $num | 1091 | mov 192($context),%r10 # pull $num |
| 972 | mov 8(%rax,%r10,8),%rax # pull saved stack pointer | 1092 | mov 8(%rax,%r10,8),%rax # pull saved stack pointer |
| 973 | 1093 | ||
| 974 | movaps (%rax),%xmm0 | 1094 | lea 48(%rax),%rax |
| 975 | movaps 16(%rax),%xmm1 | ||
| 976 | lea `40+48`(%rax),%rax | ||
| 977 | 1095 | ||
| 978 | mov -8(%rax),%rbx | 1096 | mov -8(%rax),%rbx |
| 979 | mov -16(%rax),%rbp | 1097 | mov -16(%rax),%rbp |
| @@ -987,8 +1105,6 @@ mul_handler: | |||
| 987 | mov %r13,224($context) # restore context->R13 | 1105 | mov %r13,224($context) # restore context->R13 |
| 988 | mov %r14,232($context) # restore context->R14 | 1106 | mov %r14,232($context) # restore context->R14 |
| 989 | mov %r15,240($context) # restore context->R15 | 1107 | mov %r15,240($context) # restore context->R15 |
| 990 | movups %xmm0,512($context) # restore context->Xmm6 | ||
| 991 | movups %xmm1,528($context) # restore context->Xmm7 | ||
| 992 | 1108 | ||
| 993 | .Lcommon_seh_tail: | 1109 | .Lcommon_seh_tail: |
| 994 | mov 8(%rax),%rdi | 1110 | mov 8(%rax),%rdi |
| @@ -1057,10 +1173,9 @@ mul_handler: | |||
| 1057 | .rva .Lmul4x_alloca,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[] | 1173 | .rva .Lmul4x_alloca,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[] |
| 1058 | .align 8 | 1174 | .align 8 |
| 1059 | .LSEH_info_bn_gather5: | 1175 | .LSEH_info_bn_gather5: |
| 1060 | .byte 0x01,0x0d,0x05,0x00 | 1176 | .byte 0x01,0x0b,0x03,0x0a |
| 1061 | .byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7 | 1177 | .byte 0x0b,0x01,0x21,0x00 # sub rsp,0x108 |
| 1062 | .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6 | 1178 | .byte 0x04,0xa3,0x00,0x00 # lea r10,(rsp), set_frame r10 |
| 1063 | .byte 0x04,0x42,0x00,0x00 #sub rsp,0x28 | ||
| 1064 | .align 8 | 1179 | .align 8 |
| 1065 | ___ | 1180 | ___ |
| 1066 | } | 1181 | } |
