Diffstat (limited to 'src/lib/libcrypto/ec/asm')

 src/lib/libcrypto/ec/asm/ecp_nistz256-armv4.pl   | 1733 ----------------
 src/lib/libcrypto/ec/asm/ecp_nistz256-sparcv9.pl | 2890 ----------------
 src/lib/libcrypto/ec/asm/ecp_nistz256-x86.pl     | 1740 ----------------
 src/lib/libcrypto/ec/asm/ecp_nistz256-x86_64.pl  | 1971 ----------------
 4 files changed, 0 insertions(+), 8334 deletions(-)

diff --git a/src/lib/libcrypto/ec/asm/ecp_nistz256-armv4.pl b/src/lib/libcrypto/ec/asm/ecp_nistz256-armv4.pl
deleted file mode 100644
index 9e6c65905f..0000000000
--- a/src/lib/libcrypto/ec/asm/ecp_nistz256-armv4.pl
+++ /dev/null
@@ -1,1733 +0,0 @@
#! /usr/bin/env perl
# $OpenBSD: ecp_nistz256-armv4.pl,v 1.2 2022/12/26 07:18:51 jmc Exp $
#
# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# ECP_NISTZ256 module for ARMv4.
#
# October 2014.
#
# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
# http://eprint.iacr.org/2013/816. In the process of adaptation
# original .c module was made 32-bit savvy in order to make this
# implementation possible.
#
#			with/without -DECP_NISTZ256_ASM
# Cortex-A8		+53-170%
# Cortex-A9		+76-205%
# Cortex-A15		+100-316%
# Snapdragon S4		+66-187%
#
# Ranges denote minimum and maximum improvement coefficients depending
# on benchmark. Lower coefficients are for ECDSA sign, server-side
# operation. Keep in mind that +200% means 3x improvement.

$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}

$code.=<<___;
#include "arm_arch.h"

.text
#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif
___

$code.=<<___;
.Lone:
.long	1,0,0,0,0,0,0,0
.align	6
___

########################################################################
# common register layout, note that $t2 is link register, so that if
# internal subroutine uses $t2, then it has to offload lr...

($r_ptr,$a_ptr,$b_ptr,$ff,$a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7,$t1,$t2)=
		map("r$_",(0..12,14));
($t0,$t3)=($ff,$a_ptr);

$code.=<<___;
@ void	ecp_nistz256_from_mont(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_from_mont
.type	ecp_nistz256_from_mont,%function
ecp_nistz256_from_mont:
	adr	$b_ptr,.Lone
	b	.Lecp_nistz256_mul_mont
.size	ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
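@ Illustrative note (not in the original source): .Lone above is the
@ plain, non-Montgomery constant 1.  Feeding it through mul_mont
@ computes a*1*R^-1 mod p with R = 2^256, which is exactly the
@ conversion out of Montgomery form.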

@ void	ecp_nistz256_mul_by_2(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_mul_by_2
.type	ecp_nistz256_mul_by_2,%function
.align	4
ecp_nistz256_mul_by_2:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_mul_by_2
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2

.type	__ecp_nistz256_mul_by_2,%function
.align	4
__ecp_nistz256_mul_by_2:
	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	adds	$a0,$a0,$a0		@ a[0:7]+=a[0:7], i.e. add with itself
	ldr	$a3,[$a_ptr,#12]
	adcs	$a1,$a1,$a1
	ldr	$a4,[$a_ptr,#16]
	adcs	$a2,$a2,$a2
	ldr	$a5,[$a_ptr,#20]
	adcs	$a3,$a3,$a3
	ldr	$a6,[$a_ptr,#24]
	adcs	$a4,$a4,$a4
	ldr	$a7,[$a_ptr,#28]
	adcs	$a5,$a5,$a5
	adcs	$a6,$a6,$a6
	mov	$ff,#0
	adcs	$a7,$a7,$a7
	adc	$ff,$ff,#0

	b	.Lreduce_by_sub
.size	__ecp_nistz256_mul_by_2,.-__ecp_nistz256_mul_by_2

@ void	ecp_nistz256_add(BN_ULONG r0[8],const BN_ULONG r1[8],
@					const BN_ULONG r2[8]);
.globl	ecp_nistz256_add
.type	ecp_nistz256_add,%function
.align	4
ecp_nistz256_add:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_add
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_add,.-ecp_nistz256_add

.type	__ecp_nistz256_add,%function
.align	4
__ecp_nistz256_add:
	str	lr,[sp,#-4]!		@ push lr

	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	ldr	$a3,[$a_ptr,#12]
	ldr	$a4,[$a_ptr,#16]
	ldr	$t0,[$b_ptr,#0]
	ldr	$a5,[$a_ptr,#20]
	ldr	$t1,[$b_ptr,#4]
	ldr	$a6,[$a_ptr,#24]
	ldr	$t2,[$b_ptr,#8]
	ldr	$a7,[$a_ptr,#28]
	ldr	$t3,[$b_ptr,#12]
	adds	$a0,$a0,$t0
	ldr	$t0,[$b_ptr,#16]
	adcs	$a1,$a1,$t1
	ldr	$t1,[$b_ptr,#20]
	adcs	$a2,$a2,$t2
	ldr	$t2,[$b_ptr,#24]
	adcs	$a3,$a3,$t3
	ldr	$t3,[$b_ptr,#28]
	adcs	$a4,$a4,$t0
	adcs	$a5,$a5,$t1
	adcs	$a6,$a6,$t2
	mov	$ff,#0
	adcs	$a7,$a7,$t3
	adc	$ff,$ff,#0
	ldr	lr,[sp],#4		@ pop lr

.Lreduce_by_sub:

	@ if a+b >= modulus, subtract modulus.
	@
	@ But since comparison implies subtraction, we subtract
	@ modulus and then add it back if subtraction borrowed.

	subs	$a0,$a0,#-1
	sbcs	$a1,$a1,#-1
	sbcs	$a2,$a2,#-1
	sbcs	$a3,$a3,#0
	sbcs	$a4,$a4,#0
	sbcs	$a5,$a5,#0
	sbcs	$a6,$a6,#1
	sbcs	$a7,$a7,#-1
	sbc	$ff,$ff,#0

	@ Note that because mod has special form, i.e. consists of
	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	@ using value of borrow as a whole or extracting single bit.
	@ Follow $ff register...

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	str	$a0,[$r_ptr,#0]
	adcs	$a2,$a2,$ff
	str	$a1,[$r_ptr,#4]
	adcs	$a3,$a3,#0
	str	$a2,[$r_ptr,#8]
	adcs	$a4,$a4,#0
	str	$a3,[$r_ptr,#12]
	adcs	$a5,$a5,#0
	str	$a4,[$r_ptr,#16]
	adcs	$a6,$a6,$ff,lsr#31
	str	$a5,[$r_ptr,#20]
	adcs	$a7,$a7,$ff
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_add,.-__ecp_nistz256_add
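@ Illustrative C sketch of .Lreduce_by_sub above (not in the original
@ source).  After the trial subtraction the $ff register holds 0 if no
@ borrow occurred and 0xffffffff if it did, and the P-256 limbs are
@ {~0,~0,~0,0,0,0,1,~0}, so the modulus to add back can be synthesized
@ from the mask itself:
@
@	uint32_t m = ff;		/* 0 or 0xffffffff	*/
@	/* add back p AND mask, limb by limb, carries via adcs	*/
@	r[0]+=m; r[1]+=m; r[2]+=m;	/* 0xffffffff limbs	*/
@	r[3]+=0; r[4]+=0; r[5]+=0;	/* zero limbs		*/
@	r[6]+=m>>31;			/* the lone 1 limb	*/
@	r[7]+=m;			/* top 0xffffffff limb	*/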

@ void	ecp_nistz256_mul_by_3(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_mul_by_3
.type	ecp_nistz256_mul_by_3,%function
.align	4
ecp_nistz256_mul_by_3:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_mul_by_3
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3

.type	__ecp_nistz256_mul_by_3,%function
.align	4
__ecp_nistz256_mul_by_3:
	str	lr,[sp,#-4]!		@ push lr

	@ As multiplication by 3 is performed as 2*n+n, below are inline
	@ copies of __ecp_nistz256_mul_by_2 and __ecp_nistz256_add, see
	@ corresponding subroutines for details.

	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	adds	$a0,$a0,$a0		@ a[0:7]+=a[0:7]
	ldr	$a3,[$a_ptr,#12]
	adcs	$a1,$a1,$a1
	ldr	$a4,[$a_ptr,#16]
	adcs	$a2,$a2,$a2
	ldr	$a5,[$a_ptr,#20]
	adcs	$a3,$a3,$a3
	ldr	$a6,[$a_ptr,#24]
	adcs	$a4,$a4,$a4
	ldr	$a7,[$a_ptr,#28]
	adcs	$a5,$a5,$a5
	adcs	$a6,$a6,$a6
	mov	$ff,#0
	adcs	$a7,$a7,$a7
	adc	$ff,$ff,#0

	subs	$a0,$a0,#-1		@ .Lreduce_by_sub but without stores
	sbcs	$a1,$a1,#-1
	sbcs	$a2,$a2,#-1
	sbcs	$a3,$a3,#0
	sbcs	$a4,$a4,#0
	sbcs	$a5,$a5,#0
	sbcs	$a6,$a6,#1
	sbcs	$a7,$a7,#-1
	sbc	$ff,$ff,#0

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	adcs	$a2,$a2,$ff
	adcs	$a3,$a3,#0
	adcs	$a4,$a4,#0
	ldr	$b_ptr,[$a_ptr,#0]
	adcs	$a5,$a5,#0
	ldr	$t1,[$a_ptr,#4]
	adcs	$a6,$a6,$ff,lsr#31
	ldr	$t2,[$a_ptr,#8]
	adc	$a7,$a7,$ff

	ldr	$t0,[$a_ptr,#12]
	adds	$a0,$a0,$b_ptr		@ 2*a[0:7]+=a[0:7]
	ldr	$b_ptr,[$a_ptr,#16]
	adcs	$a1,$a1,$t1
	ldr	$t1,[$a_ptr,#20]
	adcs	$a2,$a2,$t2
	ldr	$t2,[$a_ptr,#24]
	adcs	$a3,$a3,$t0
	ldr	$t3,[$a_ptr,#28]
	adcs	$a4,$a4,$b_ptr
	adcs	$a5,$a5,$t1
	adcs	$a6,$a6,$t2
	mov	$ff,#0
	adcs	$a7,$a7,$t3
	adc	$ff,$ff,#0
	ldr	lr,[sp],#4		@ pop lr

	b	.Lreduce_by_sub
.size	__ecp_nistz256_mul_by_3,.-__ecp_nistz256_mul_by_3

@ void	ecp_nistz256_div_by_2(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_div_by_2
.type	ecp_nistz256_div_by_2,%function
.align	4
ecp_nistz256_div_by_2:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_div_by_2
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2

.type	__ecp_nistz256_div_by_2,%function
.align	4
__ecp_nistz256_div_by_2:
	@ ret = (a is odd ? a+mod : a) >> 1

	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	mov	$ff,$a0,lsl#31		@ place least significant bit to most
					@ significant position, now arithmetic
					@ right shift by 31 will produce -1 or
					@ 0, while logical right shift 1 or 0,
					@ this is how modulus is conditionally
					@ synthesized in this case...
	ldr	$a3,[$a_ptr,#12]
	adds	$a0,$a0,$ff,asr#31
	ldr	$a4,[$a_ptr,#16]
	adcs	$a1,$a1,$ff,asr#31
	ldr	$a5,[$a_ptr,#20]
	adcs	$a2,$a2,$ff,asr#31
	ldr	$a6,[$a_ptr,#24]
	adcs	$a3,$a3,#0
	ldr	$a7,[$a_ptr,#28]
	adcs	$a4,$a4,#0
	mov	$a0,$a0,lsr#1		@ a[0:7]>>=1, we can start early
					@ because it doesn't affect flags
	adcs	$a5,$a5,#0
	orr	$a0,$a0,$a1,lsl#31
	adcs	$a6,$a6,$ff,lsr#31
	mov	$b_ptr,#0
	adcs	$a7,$a7,$ff,asr#31
	mov	$a1,$a1,lsr#1
	adc	$b_ptr,$b_ptr,#0	@ top-most carry bit from addition

	orr	$a1,$a1,$a2,lsl#31
	mov	$a2,$a2,lsr#1
	str	$a0,[$r_ptr,#0]
	orr	$a2,$a2,$a3,lsl#31
	mov	$a3,$a3,lsr#1
	str	$a1,[$r_ptr,#4]
	orr	$a3,$a3,$a4,lsl#31
	mov	$a4,$a4,lsr#1
	str	$a2,[$r_ptr,#8]
	orr	$a4,$a4,$a5,lsl#31
	mov	$a5,$a5,lsr#1
	str	$a3,[$r_ptr,#12]
	orr	$a5,$a5,$a6,lsl#31
	mov	$a6,$a6,lsr#1
	str	$a4,[$r_ptr,#16]
	orr	$a6,$a6,$a7,lsl#31
	mov	$a7,$a7,lsr#1
	str	$a5,[$r_ptr,#20]
	orr	$a7,$a7,$b_ptr,lsl#31	@ don't forget the top-most carry bit
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
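@ Illustrative sketch of the branchless halving above (not in the
@ original source), with 32-bit limbs and p = the P-256 modulus:
@
@	mask    = 0 - (a[0] & 1);	/* -1 if a is odd, else 0 */
@	carry:a = a + (p & mask);	/* a+p is even when a odd */
@	r       = (carry:a) >> 1;	/* 257-bit right shift	  */
@
@ The mask is materialized as $a0<<31 and consumed via asr#31 (the
@ all-ones limbs) and lsr#31 (the lone 1 limb), mirroring
@ .Lreduce_by_sub.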

@ void	ecp_nistz256_sub(BN_ULONG r0[8],const BN_ULONG r1[8],
@					const BN_ULONG r2[8]);
.globl	ecp_nistz256_sub
.type	ecp_nistz256_sub,%function
.align	4
ecp_nistz256_sub:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_sub
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_sub,.-ecp_nistz256_sub

.type	__ecp_nistz256_sub,%function
.align	4
__ecp_nistz256_sub:
	str	lr,[sp,#-4]!		@ push lr

	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	ldr	$a3,[$a_ptr,#12]
	ldr	$a4,[$a_ptr,#16]
	ldr	$t0,[$b_ptr,#0]
	ldr	$a5,[$a_ptr,#20]
	ldr	$t1,[$b_ptr,#4]
	ldr	$a6,[$a_ptr,#24]
	ldr	$t2,[$b_ptr,#8]
	ldr	$a7,[$a_ptr,#28]
	ldr	$t3,[$b_ptr,#12]
	subs	$a0,$a0,$t0
	ldr	$t0,[$b_ptr,#16]
	sbcs	$a1,$a1,$t1
	ldr	$t1,[$b_ptr,#20]
	sbcs	$a2,$a2,$t2
	ldr	$t2,[$b_ptr,#24]
	sbcs	$a3,$a3,$t3
	ldr	$t3,[$b_ptr,#28]
	sbcs	$a4,$a4,$t0
	sbcs	$a5,$a5,$t1
	sbcs	$a6,$a6,$t2
	sbcs	$a7,$a7,$t3
	sbc	$ff,$ff,$ff		@ broadcast borrow bit
	ldr	lr,[sp],#4		@ pop lr

.Lreduce_by_add:

	@ if a-b borrows, add modulus.
	@
	@ Note that because mod has special form, i.e. consists of
	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	@ broadcasting borrow bit to a register, $ff, and using it as
	@ a whole or extracting single bit.

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	str	$a0,[$r_ptr,#0]
	adcs	$a2,$a2,$ff
	str	$a1,[$r_ptr,#4]
	adcs	$a3,$a3,#0
	str	$a2,[$r_ptr,#8]
	adcs	$a4,$a4,#0
	str	$a3,[$r_ptr,#12]
	adcs	$a5,$a5,#0
	str	$a4,[$r_ptr,#16]
	adcs	$a6,$a6,$ff,lsr#31
	str	$a5,[$r_ptr,#20]
	adcs	$a7,$a7,$ff
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_sub,.-__ecp_nistz256_sub
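@ Illustrative note (not in the original source): the "sbc $ff,$ff,$ff"
@ above computes ff - ff - !carry, i.e. 0 when the subtraction did not
@ borrow and 0xffffffff when it did.  That single instruction turns the
@ borrow into the full-width mask that .Lreduce_by_add consumes.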

@ void	ecp_nistz256_neg(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_neg
.type	ecp_nistz256_neg,%function
.align	4
ecp_nistz256_neg:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_neg
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_neg,.-ecp_nistz256_neg

.type	__ecp_nistz256_neg,%function
.align	4
__ecp_nistz256_neg:
	ldr	$a0,[$a_ptr,#0]
	eor	$ff,$ff,$ff
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	subs	$a0,$ff,$a0
	ldr	$a3,[$a_ptr,#12]
	sbcs	$a1,$ff,$a1
	ldr	$a4,[$a_ptr,#16]
	sbcs	$a2,$ff,$a2
	ldr	$a5,[$a_ptr,#20]
	sbcs	$a3,$ff,$a3
	ldr	$a6,[$a_ptr,#24]
	sbcs	$a4,$ff,$a4
	ldr	$a7,[$a_ptr,#28]
	sbcs	$a5,$ff,$a5
	sbcs	$a6,$ff,$a6
	sbcs	$a7,$ff,$a7
	sbc	$ff,$ff,$ff

	b	.Lreduce_by_add
.size	__ecp_nistz256_neg,.-__ecp_nistz256_neg
___
{
my @acc=map("r$_",(3..11));
my ($t0,$t1,$bj,$t2,$t3)=map("r$_",(0,1,2,12,14));

$code.=<<___;
@ void	ecp_nistz256_sqr_mont(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_sqr_mont
.type	ecp_nistz256_sqr_mont,%function
.align	4
ecp_nistz256_sqr_mont:
	mov	$b_ptr,$a_ptr
	b	.Lecp_nistz256_mul_mont
.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont

@ void	ecp_nistz256_mul_mont(BN_ULONG r0[8],const BN_ULONG r1[8],
@					     const BN_ULONG r2[8]);
.globl	ecp_nistz256_mul_mont
.type	ecp_nistz256_mul_mont,%function
.align	4
ecp_nistz256_mul_mont:
.Lecp_nistz256_mul_mont:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_mul_mont
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont

.type	__ecp_nistz256_mul_mont,%function
.align	4
__ecp_nistz256_mul_mont:
	stmdb	sp!,{r0-r2,lr}		@ make a copy of arguments too

	ldr	$bj,[$b_ptr,#0]		@ b[0]
	ldmia	$a_ptr,{@acc[1]-@acc[8]}

	umull	@acc[0],$t3,@acc[1],$bj	@ r[0]=a[0]*b[0]
	stmdb	sp!,{$acc[1]-@acc[8]}	@ copy a[0-7] to stack, so
					@ that it can be addressed
					@ without spending register
					@ on address
	umull	@acc[1],$t0,@acc[2],$bj	@ r[1]=a[1]*b[0]
	umull	@acc[2],$t1,@acc[3],$bj
	adds	@acc[1],@acc[1],$t3	@ accumulate high part of mult
	umull	@acc[3],$t2,@acc[4],$bj
	adcs	@acc[2],@acc[2],$t0
	umull	@acc[4],$t3,@acc[5],$bj
	adcs	@acc[3],@acc[3],$t1
	umull	@acc[5],$t0,@acc[6],$bj
	adcs	@acc[4],@acc[4],$t2
	umull	@acc[6],$t1,@acc[7],$bj
	adcs	@acc[5],@acc[5],$t3
	umull	@acc[7],$t2,@acc[8],$bj
	adcs	@acc[6],@acc[6],$t0
	adcs	@acc[7],@acc[7],$t1
	eor	$t3,$t3,$t3		@ first overflow bit is zero
	adc	@acc[8],$t2,#0
___
for(my $i=1;$i<8;$i++) {
my $t4=@acc[0];

	# Reduction iteration is normally performed by accumulating
	# result of multiplication of modulus by "magic" digit [and
	# omitting least significant word, which is guaranteed to
	# be 0], but thanks to special form of modulus and "magic"
	# digit being equal to least significant word, it can be
	# performed with additions and subtractions alone. Indeed:
	#
	#          ffff.0001.0000.0000.0000.ffff.ffff.ffff
	# *                                           abcd
	# + xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
	#
	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
	# rewrite above as:
	#
	#   xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
	# + abcd.0000.abcd.0000.0000.abcd.0000.0000.0000
	# - abcd.0000.0000.0000.0000.0000.0000.abcd
	#
	# or marking redundant operations:
	#
	#   xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.----
	# + abcd.0000.abcd.0000.0000.abcd.----.----.----
	# - abcd.----.----.----.----.----.----.----
$code.=<<___;
	@ multiplication-less reduction $i
	adds	@acc[3],@acc[3],@acc[0]	@ r[3]+=r[0]
	ldr	$bj,[sp,#40]		@ restore b_ptr
	adcs	@acc[4],@acc[4],#0	@ r[4]+=0
	adcs	@acc[5],@acc[5],#0	@ r[5]+=0
	adcs	@acc[6],@acc[6],@acc[0]	@ r[6]+=r[0]
	ldr	$t1,[sp,#0]		@ load a[0]
	adcs	@acc[7],@acc[7],#0	@ r[7]+=0
	ldr	$bj,[$bj,#4*$i]		@ load b[i]
	adcs	@acc[8],@acc[8],@acc[0]	@ r[8]+=r[0]
	eor	$t0,$t0,$t0
	adc	$t3,$t3,#0		@ overflow bit
	subs	@acc[7],@acc[7],@acc[0]	@ r[7]-=r[0]
	ldr	$t2,[sp,#4]		@ a[1]
	sbcs	@acc[8],@acc[8],#0	@ r[8]-=0
	umlal	@acc[1],$t0,$t1,$bj	@ "r[0]"+=a[0]*b[i]
	eor	$t1,$t1,$t1
	sbc	@acc[0],$t3,#0		@ overflow bit, keep in mind
					@ that net result is
					@ addition of a value which
					@ makes underflow impossible

	ldr	$t3,[sp,#8]		@ a[2]
	umlal	@acc[2],$t1,$t2,$bj	@ "r[1]"+=a[1]*b[i]
	str	@acc[0],[sp,#36]	@ temporarily offload overflow
	eor	$t2,$t2,$t2
	ldr	$t4,[sp,#12]		@ a[3], $t4 is alias @acc[0]
	umlal	@acc[3],$t2,$t3,$bj	@ "r[2]"+=a[2]*b[i]
	eor	$t3,$t3,$t3
	adds	@acc[2],@acc[2],$t0	@ accumulate high part of mult
	ldr	$t0,[sp,#16]		@ a[4]
	umlal	@acc[4],$t3,$t4,$bj	@ "r[3]"+=a[3]*b[i]
	eor	$t4,$t4,$t4
	adcs	@acc[3],@acc[3],$t1
	ldr	$t1,[sp,#20]		@ a[5]
	umlal	@acc[5],$t4,$t0,$bj	@ "r[4]"+=a[4]*b[i]
	eor	$t0,$t0,$t0
	adcs	@acc[4],@acc[4],$t2
	ldr	$t2,[sp,#24]		@ a[6]
	umlal	@acc[6],$t0,$t1,$bj	@ "r[5]"+=a[5]*b[i]
	eor	$t1,$t1,$t1
	adcs	@acc[5],@acc[5],$t3
	ldr	$t3,[sp,#28]		@ a[7]
	umlal	@acc[7],$t1,$t2,$bj	@ "r[6]"+=a[6]*b[i]
	eor	$t2,$t2,$t2
	adcs	@acc[6],@acc[6],$t4
	ldr	@acc[0],[sp,#36]	@ restore overflow bit
	umlal	@acc[8],$t2,$t3,$bj	@ "r[7]"+=a[7]*b[i]
	eor	$t3,$t3,$t3
	adcs	@acc[7],@acc[7],$t0
	adcs	@acc[8],@acc[8],$t1
	adcs	@acc[0],$acc[0],$t2
	adc	$t3,$t3,#0		@ new overflow bit
___
	push(@acc,shift(@acc));		# rotate registers, so that
					# "r[i]" becomes r[i]
}
$code.=<<___;
	@ last multiplication-less reduction
	adds	@acc[3],@acc[3],@acc[0]
	ldr	$r_ptr,[sp,#32]		@ restore r_ptr
	adcs	@acc[4],@acc[4],#0
	adcs	@acc[5],@acc[5],#0
	adcs	@acc[6],@acc[6],@acc[0]
	adcs	@acc[7],@acc[7],#0
	adcs	@acc[8],@acc[8],@acc[0]
	adc	$t3,$t3,#0
	subs	@acc[7],@acc[7],@acc[0]
	sbcs	@acc[8],@acc[8],#0
	sbc	@acc[0],$t3,#0		@ overflow bit

	@ Final step is "if result > mod, subtract mod", but we do it
	@ "other way around", namely subtract modulus from result
	@ and if it borrowed, add modulus back.

	adds	@acc[1],@acc[1],#1	@ subs	@acc[1],@acc[1],#-1
	adcs	@acc[2],@acc[2],#0	@ sbcs	@acc[2],@acc[2],#-1
	adcs	@acc[3],@acc[3],#0	@ sbcs	@acc[3],@acc[3],#-1
	sbcs	@acc[4],@acc[4],#0
	sbcs	@acc[5],@acc[5],#0
	sbcs	@acc[6],@acc[6],#0
	sbcs	@acc[7],@acc[7],#1
	adcs	@acc[8],@acc[8],#0	@ sbcs	@acc[8],@acc[8],#-1
	ldr	lr,[sp,#44]		@ restore lr
	sbc	@acc[0],@acc[0],#0	@ broadcast borrow bit
	add	sp,sp,#48

	@ Note that because mod has special form, i.e. consists of
	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	@ broadcasting borrow bit to a register, @acc[0], and using it as
	@ a whole or extracting single bit.

	adds	@acc[1],@acc[1],@acc[0]	@ add modulus or zero
	adcs	@acc[2],@acc[2],@acc[0]
	str	@acc[1],[$r_ptr,#0]
	adcs	@acc[3],@acc[3],@acc[0]
	str	@acc[2],[$r_ptr,#4]
	adcs	@acc[4],@acc[4],#0
	str	@acc[3],[$r_ptr,#8]
	adcs	@acc[5],@acc[5],#0
	str	@acc[4],[$r_ptr,#12]
	adcs	@acc[6],@acc[6],#0
	str	@acc[5],[$r_ptr,#16]
	adcs	@acc[7],@acc[7],@acc[0],lsr#31
	str	@acc[6],[$r_ptr,#20]
	adc	@acc[8],@acc[8],@acc[0]
	str	@acc[7],[$r_ptr,#24]
	str	@acc[8],[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont
___
}

{
my ($out,$inp,$index,$mask)=map("r$_",(0..3));
$code.=<<___;
@ void	ecp_nistz256_select_w5(P256_POINT *r0,const void *r1,
@				      int r2);
.globl	ecp_nistz256_select_w5
.type	ecp_nistz256_select_w5,%function
.align	5
ecp_nistz256_select_w5:
	stmdb	sp!,{r4-r11}

	cmp	$index,#0
	mov	$mask,#0
#ifdef __thumb2__
	itt	ne
#endif
	subne	$index,$index,#1
	movne	$mask,#-1
	add	$inp,$inp,$index,lsl#2

	ldr	r4,[$inp,#64*0]
	ldr	r5,[$inp,#64*1]
	ldr	r6,[$inp,#64*2]
	and	r4,r4,$mask
	ldr	r7,[$inp,#64*3]
	and	r5,r5,$mask
	ldr	r8,[$inp,#64*4]
	and	r6,r6,$mask
	ldr	r9,[$inp,#64*5]
	and	r7,r7,$mask
	ldr	r10,[$inp,#64*6]
	and	r8,r8,$mask
	ldr	r11,[$inp,#64*7]
	add	$inp,$inp,#64*8
	and	r9,r9,$mask
	and	r10,r10,$mask
	and	r11,r11,$mask
	stmia	$out!,{r4-r11}	@ X

	ldr	r4,[$inp,#64*0]
	ldr	r5,[$inp,#64*1]
	ldr	r6,[$inp,#64*2]
	and	r4,r4,$mask
	ldr	r7,[$inp,#64*3]
	and	r5,r5,$mask
	ldr	r8,[$inp,#64*4]
	and	r6,r6,$mask
	ldr	r9,[$inp,#64*5]
	and	r7,r7,$mask
	ldr	r10,[$inp,#64*6]
	and	r8,r8,$mask
	ldr	r11,[$inp,#64*7]
	add	$inp,$inp,#64*8
	and	r9,r9,$mask
	and	r10,r10,$mask
	and	r11,r11,$mask
	stmia	$out!,{r4-r11}	@ Y

	ldr	r4,[$inp,#64*0]
	ldr	r5,[$inp,#64*1]
	ldr	r6,[$inp,#64*2]
	and	r4,r4,$mask
	ldr	r7,[$inp,#64*3]
	and	r5,r5,$mask
	ldr	r8,[$inp,#64*4]
	and	r6,r6,$mask
	ldr	r9,[$inp,#64*5]
	and	r7,r7,$mask
	ldr	r10,[$inp,#64*6]
	and	r8,r8,$mask
	ldr	r11,[$inp,#64*7]
	and	r9,r9,$mask
	and	r10,r10,$mask
	and	r11,r11,$mask
	stmia	$out,{r4-r11}	@ Z

	ldmia	sp!,{r4-r11}
#if __ARM_ARCH__>=5 || defined(__thumb__)
	bx	lr
#else
	mov	pc,lr
#endif
.size	ecp_nistz256_select_w5,.-ecp_nistz256_select_w5

@ void	ecp_nistz256_select_w7(P256_POINT_AFFINE *r0,const void *r1,
@					              int r2);
.globl	ecp_nistz256_select_w7
.type	ecp_nistz256_select_w7,%function
.align	5
ecp_nistz256_select_w7:
	stmdb	sp!,{r4-r7}

	cmp	$index,#0
	mov	$mask,#0
#ifdef __thumb2__
	itt	ne
#endif
	subne	$index,$index,#1
	movne	$mask,#-1
	add	$inp,$inp,$index
	mov	$index,#64/4
	nop
.Loop_select_w7:
	ldrb	r4,[$inp,#64*0]
	subs	$index,$index,#1
	ldrb	r5,[$inp,#64*1]
	ldrb	r6,[$inp,#64*2]
	ldrb	r7,[$inp,#64*3]
	add	$inp,$inp,#64*4
	orr	r4,r4,r5,lsl#8
	orr	r4,r4,r6,lsl#16
	orr	r4,r4,r7,lsl#24
	and	r4,r4,$mask
	str	r4,[$out],#4
	bne	.Loop_select_w7

	ldmia	sp!,{r4-r7}
#if __ARM_ARCH__>=5 || defined(__thumb__)
	bx	lr
#else
	mov	pc,lr
#endif
.size	ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
___
}
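# Layout note for the two gathers above (illustrative, not in the
# original source): the tables appear to be stored interleaved, one
# limb (or byte, for w7) of every entry per 64-byte row, so all
# candidates for a given limb share a single cache line on typical
# hardware.  The index-dependent base offset therefore stays within
# one line, and the $mask and-ing handles the index==0 (point at
# infinity) case, keeping the lookup cache-timing safe.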
if (0) {
# In comparison to integer-only equivalent of below subroutine:
#
# Cortex-A8	+10%
# Cortex-A9	-10%
# Snapdragon S4	+5%
#
# As not all time is spent in multiplication, overall impact is deemed
# too low to care about.

my ($A0,$A1,$A2,$A3,$Bi,$zero,$temp)=map("d$_",(0..7));
my $mask="q4";
my $mult="q5";
my @AxB=map("q$_",(8..15));

my ($rptr,$aptr,$bptr,$toutptr)=map("r$_",(0..3));

$code.=<<___;
#if __ARM_ARCH__>=7
.fpu	neon

.globl	ecp_nistz256_mul_mont_neon
.type	ecp_nistz256_mul_mont_neon,%function
.align	5
ecp_nistz256_mul_mont_neon:
	mov	ip,sp
	stmdb	sp!,{r4-r9}
	vstmdb	sp!,{q4-q5}		@ ABI specification says so

	sub	$toutptr,sp,#40
	vld1.32	{${Bi}[0]},[$bptr,:32]!
	veor	$zero,$zero,$zero
	vld1.32	{$A0-$A3}, [$aptr]	@ can't specify :32 :-(
	vzip.16	$Bi,$zero
	mov	sp,$toutptr		@ alloca
	vmov.i64 $mask,#0xffff

	vmull.u32 @AxB[0],$Bi,${A0}[0]
	vmull.u32 @AxB[1],$Bi,${A0}[1]
	vmull.u32 @AxB[2],$Bi,${A1}[0]
	vmull.u32 @AxB[3],$Bi,${A1}[1]
	vshr.u64 $temp,@AxB[0]#lo,#16
	vmull.u32 @AxB[4],$Bi,${A2}[0]
	vadd.u64 @AxB[0]#hi,@AxB[0]#hi,$temp
	vmull.u32 @AxB[5],$Bi,${A2}[1]
	vshr.u64 $temp,@AxB[0]#hi,#16	@ upper 32 bits of a[0]*b[0]
	vmull.u32 @AxB[6],$Bi,${A3}[0]
	vand.u64 @AxB[0],@AxB[0],$mask	@ lower 32 bits of a[0]*b[0]
	vmull.u32 @AxB[7],$Bi,${A3}[1]
___
for($i=1;$i<8;$i++) {
$code.=<<___;
	vld1.32	{${Bi}[0]},[$bptr,:32]!
	veor	$zero,$zero,$zero
	vadd.u64 @AxB[1]#lo,@AxB[1]#lo,$temp	@ reduction
	vshl.u64 $mult,@AxB[0],#32
	vadd.u64 @AxB[3],@AxB[3],@AxB[0]
	vsub.u64 $mult,$mult,@AxB[0]
	vzip.16	$Bi,$zero
	vadd.u64 @AxB[6],@AxB[6],@AxB[0]
	vadd.u64 @AxB[7],@AxB[7],$mult
___
	push(@AxB,shift(@AxB));
$code.=<<___;
	vmlal.u32 @AxB[0],$Bi,${A0}[0]
	vmlal.u32 @AxB[1],$Bi,${A0}[1]
	vmlal.u32 @AxB[2],$Bi,${A1}[0]
	vmlal.u32 @AxB[3],$Bi,${A1}[1]
	vshr.u64 $temp,@AxB[0]#lo,#16
	vmlal.u32 @AxB[4],$Bi,${A2}[0]
	vadd.u64 @AxB[0]#hi,@AxB[0]#hi,$temp
	vmlal.u32 @AxB[5],$Bi,${A2}[1]
	vshr.u64 $temp,@AxB[0]#hi,#16	@ upper 33 bits of a[0]*b[i]+t[0]
	vmlal.u32 @AxB[6],$Bi,${A3}[0]
	vand.u64 @AxB[0],@AxB[0],$mask	@ lower 32 bits of a[0]*b[0]
	vmull.u32 @AxB[7],$Bi,${A3}[1]
___
}
$code.=<<___;
	vadd.u64 @AxB[1]#lo,@AxB[1]#lo,$temp	@ last reduction
	vshl.u64 $mult,@AxB[0],#32
	vadd.u64 @AxB[3],@AxB[3],@AxB[0]
	vsub.u64 $mult,$mult,@AxB[0]
	vadd.u64 @AxB[6],@AxB[6],@AxB[0]
	vadd.u64 @AxB[7],@AxB[7],$mult

	vshr.u64 $temp,@AxB[1]#lo,#16	@ convert
	vadd.u64 @AxB[1]#hi,@AxB[1]#hi,$temp
	vshr.u64 $temp,@AxB[1]#hi,#16
	vzip.16	@AxB[1]#lo,@AxB[1]#hi
___
foreach (2..7) {
$code.=<<___;
	vadd.u64 @AxB[$_]#lo,@AxB[$_]#lo,$temp
	vst1.32	{@AxB[$_-1]#lo[0]},[$toutptr,:32]!
	vshr.u64 $temp,@AxB[$_]#lo,#16
	vadd.u64 @AxB[$_]#hi,@AxB[$_]#hi,$temp
	vshr.u64 $temp,@AxB[$_]#hi,#16
	vzip.16	@AxB[$_]#lo,@AxB[$_]#hi
___
}
$code.=<<___;
	vst1.32	{@AxB[7]#lo[0]},[$toutptr,:32]!
	vst1.32	{$temp},[$toutptr]	@ upper 33 bits

	ldr	r1,[sp,#0]
	ldr	r2,[sp,#4]
	ldr	r3,[sp,#8]
	subs	r1,r1,#-1
	ldr	r4,[sp,#12]
	sbcs	r2,r2,#-1
	ldr	r5,[sp,#16]
	sbcs	r3,r3,#-1
	ldr	r6,[sp,#20]
	sbcs	r4,r4,#0
	ldr	r7,[sp,#24]
	sbcs	r5,r5,#0
	ldr	r8,[sp,#28]
	sbcs	r6,r6,#0
	ldr	r9,[sp,#32]		@ top-most bit
	sbcs	r7,r7,#1
	sub	sp,ip,#40+16
	sbcs	r8,r8,#-1
	sbc	r9,r9,#0
	vldmia	sp!,{q4-q5}

	adds	r1,r1,r9
	adcs	r2,r2,r9
	str	r1,[$rptr,#0]
	adcs	r3,r3,r9
	str	r2,[$rptr,#4]
	adcs	r4,r4,#0
	str	r3,[$rptr,#8]
	adcs	r5,r5,#0
	str	r4,[$rptr,#12]
	adcs	r6,r6,#0
	str	r5,[$rptr,#16]
	adcs	r7,r7,r9,lsr#31
	str	r6,[$rptr,#20]
	adcs	r8,r8,r9
	str	r7,[$rptr,#24]
	str	r8,[$rptr,#28]

	ldmia	sp!,{r4-r9}
	bx	lr
.size	ecp_nistz256_mul_mont_neon,.-ecp_nistz256_mul_mont_neon
#endif
___
}

{{{
########################################################################
# Below $aN assignment matches order in which 256-bit result appears in
# register bank at return from __ecp_nistz256_mul_mont, so that we can
# skip over reloading it from memory. This means that below functions
# use custom calling sequence accepting 256-bit input in registers,
# output pointer in r0, $r_ptr, and optional pointer in r2, $b_ptr.
#
# See their "normal" counterparts for insights on calculations.

my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7,
    $t0,$t1,$t2,$t3)=map("r$_",(11,3..10,12,14,1));
my $ff=$b_ptr;

$code.=<<___;
.type	__ecp_nistz256_sub_from,%function
.align	5
__ecp_nistz256_sub_from:
	str	lr,[sp,#-4]!		@ push lr

	ldr	$t0,[$b_ptr,#0]
	ldr	$t1,[$b_ptr,#4]
	ldr	$t2,[$b_ptr,#8]
	ldr	$t3,[$b_ptr,#12]
	subs	$a0,$a0,$t0
	ldr	$t0,[$b_ptr,#16]
	sbcs	$a1,$a1,$t1
	ldr	$t1,[$b_ptr,#20]
	sbcs	$a2,$a2,$t2
	ldr	$t2,[$b_ptr,#24]
	sbcs	$a3,$a3,$t3
	ldr	$t3,[$b_ptr,#28]
	sbcs	$a4,$a4,$t0
	sbcs	$a5,$a5,$t1
	sbcs	$a6,$a6,$t2
	sbcs	$a7,$a7,$t3
	sbc	$ff,$ff,$ff		@ broadcast borrow bit
	ldr	lr,[sp],#4		@ pop lr

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	str	$a0,[$r_ptr,#0]
	adcs	$a2,$a2,$ff
	str	$a1,[$r_ptr,#4]
	adcs	$a3,$a3,#0
	str	$a2,[$r_ptr,#8]
	adcs	$a4,$a4,#0
	str	$a3,[$r_ptr,#12]
	adcs	$a5,$a5,#0
	str	$a4,[$r_ptr,#16]
	adcs	$a6,$a6,$ff,lsr#31
	str	$a5,[$r_ptr,#20]
	adcs	$a7,$a7,$ff
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from

.type	__ecp_nistz256_sub_morf,%function
.align	5
__ecp_nistz256_sub_morf:
	str	lr,[sp,#-4]!		@ push lr

	ldr	$t0,[$b_ptr,#0]
	ldr	$t1,[$b_ptr,#4]
	ldr	$t2,[$b_ptr,#8]
	ldr	$t3,[$b_ptr,#12]
	subs	$a0,$t0,$a0
	ldr	$t0,[$b_ptr,#16]
	sbcs	$a1,$t1,$a1
	ldr	$t1,[$b_ptr,#20]
	sbcs	$a2,$t2,$a2
	ldr	$t2,[$b_ptr,#24]
	sbcs	$a3,$t3,$a3
	ldr	$t3,[$b_ptr,#28]
	sbcs	$a4,$t0,$a4
	sbcs	$a5,$t1,$a5
	sbcs	$a6,$t2,$a6
	sbcs	$a7,$t3,$a7
	sbc	$ff,$ff,$ff		@ broadcast borrow bit
	ldr	lr,[sp],#4		@ pop lr

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	str	$a0,[$r_ptr,#0]
	adcs	$a2,$a2,$ff
	str	$a1,[$r_ptr,#4]
	adcs	$a3,$a3,#0
	str	$a2,[$r_ptr,#8]
	adcs	$a4,$a4,#0
	str	$a3,[$r_ptr,#12]
	adcs	$a5,$a5,#0
	str	$a4,[$r_ptr,#16]
	adcs	$a6,$a6,$ff,lsr#31
	str	$a5,[$r_ptr,#20]
	adcs	$a7,$a7,$ff
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf

.type	__ecp_nistz256_add_self,%function
.align	4
__ecp_nistz256_add_self:
	adds	$a0,$a0,$a0		@ a[0:7]+=a[0:7]
	adcs	$a1,$a1,$a1
	adcs	$a2,$a2,$a2
	adcs	$a3,$a3,$a3
	adcs	$a4,$a4,$a4
	adcs	$a5,$a5,$a5
	adcs	$a6,$a6,$a6
	mov	$ff,#0
	adcs	$a7,$a7,$a7
	adc	$ff,$ff,#0

	@ if a+b >= modulus, subtract modulus.
	@
	@ But since comparison implies subtraction, we subtract
	@ modulus and then add it back if subtraction borrowed.

	subs	$a0,$a0,#-1
	sbcs	$a1,$a1,#-1
	sbcs	$a2,$a2,#-1
	sbcs	$a3,$a3,#0
	sbcs	$a4,$a4,#0
	sbcs	$a5,$a5,#0
	sbcs	$a6,$a6,#1
	sbcs	$a7,$a7,#-1
	sbc	$ff,$ff,#0

	@ Note that because mod has special form, i.e. consists of
	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	@ using value of borrow as a whole or extracting single bit.
	@ Follow $ff register...

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	str	$a0,[$r_ptr,#0]
	adcs	$a2,$a2,$ff
	str	$a1,[$r_ptr,#4]
	adcs	$a3,$a3,#0
	str	$a2,[$r_ptr,#8]
	adcs	$a4,$a4,#0
	str	$a3,[$r_ptr,#12]
	adcs	$a5,$a5,#0
	str	$a4,[$r_ptr,#16]
	adcs	$a6,$a6,$ff,lsr#31
	str	$a5,[$r_ptr,#20]
	adcs	$a7,$a7,$ff
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_add_self,.-__ecp_nistz256_add_self

___

########################################################################
# following subroutines are "literal" implementation of those found in
# ecp_nistz256.c
#
########################################################################
# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
#
{
my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));
# above map() describes stack layout with 5 temporary
# 256-bit vectors on top. Then note that we push
# starting from r0, which means that we have copy of
# input arguments just below these temporary vectors.
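#
# For reference (illustrative note, not in the original source), the
# call sequence below computes the standard Jacobian doubling used by
# ecp_nistz256_point_double in ecp_nistz256.c:
#
#	S  = 4*X*Y^2,  M = 3*(X + Z^2)*(X - Z^2)
#	X' = M^2 - 2*S
#	Y' = M*(S - X') - 8*Y^4
#	Z' = 2*Y*Z
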
$code.=<<___;
.globl	ecp_nistz256_point_double
.type	ecp_nistz256_point_double,%function
.align	5
ecp_nistz256_point_double:
	stmdb	sp!,{r0-r12,lr}		@ push from r0, unusual, but intentional
	sub	sp,sp,#32*5

.Lpoint_double_shortcut:
	add	r3,sp,#$in_x
	ldmia	$a_ptr!,{r4-r11}	@ copy in_x
	stmia	r3,{r4-r11}

	add	$r_ptr,sp,#$S
	bl	__ecp_nistz256_mul_by_2	@ p256_mul_by_2(S, in_y);

	add	$b_ptr,$a_ptr,#32
	add	$a_ptr,$a_ptr,#32
	add	$r_ptr,sp,#$Zsqr
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Zsqr, in_z);

	add	$a_ptr,sp,#$S
	add	$b_ptr,sp,#$S
	add	$r_ptr,sp,#$S
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(S, S);

	ldr	$b_ptr,[sp,#32*5+4]
	add	$a_ptr,$b_ptr,#32
	add	$b_ptr,$b_ptr,#64
	add	$r_ptr,sp,#$tmp0
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(tmp0, in_z, in_y);

	ldr	$r_ptr,[sp,#32*5]
	add	$r_ptr,$r_ptr,#64
	bl	__ecp_nistz256_add_self	@ p256_mul_by_2(res_z, tmp0);

	add	$a_ptr,sp,#$in_x
	add	$b_ptr,sp,#$Zsqr
	add	$r_ptr,sp,#$M
	bl	__ecp_nistz256_add	@ p256_add(M, in_x, Zsqr);

	add	$a_ptr,sp,#$in_x
	add	$b_ptr,sp,#$Zsqr
	add	$r_ptr,sp,#$Zsqr
	bl	__ecp_nistz256_sub	@ p256_sub(Zsqr, in_x, Zsqr);

	add	$a_ptr,sp,#$S
	add	$b_ptr,sp,#$S
	add	$r_ptr,sp,#$tmp0
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(tmp0, S);

	add	$a_ptr,sp,#$Zsqr
	add	$b_ptr,sp,#$M
	add	$r_ptr,sp,#$M
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(M, M, Zsqr);

	ldr	$r_ptr,[sp,#32*5]
	add	$a_ptr,sp,#$tmp0
	add	$r_ptr,$r_ptr,#32
	bl	__ecp_nistz256_div_by_2	@ p256_div_by_2(res_y, tmp0);

	add	$a_ptr,sp,#$M
	add	$r_ptr,sp,#$M
	bl	__ecp_nistz256_mul_by_3	@ p256_mul_by_3(M, M);

	add	$a_ptr,sp,#$in_x
	add	$b_ptr,sp,#$S
	add	$r_ptr,sp,#$S
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S, S, in_x);

	add	$r_ptr,sp,#$tmp0
	bl	__ecp_nistz256_add_self	@ p256_mul_by_2(tmp0, S);

	ldr	$r_ptr,[sp,#32*5]
	add	$a_ptr,sp,#$M
	add	$b_ptr,sp,#$M
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(res_x, M);

	add	$b_ptr,sp,#$tmp0
	bl	__ecp_nistz256_sub_from	@ p256_sub(res_x, res_x, tmp0);

	add	$b_ptr,sp,#$S
	add	$r_ptr,sp,#$S
	bl	__ecp_nistz256_sub_morf	@ p256_sub(S, S, res_x);

	add	$a_ptr,sp,#$M
	add	$b_ptr,sp,#$S
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S, S, M);

	ldr	$r_ptr,[sp,#32*5]
	add	$b_ptr,$r_ptr,#32
	add	$r_ptr,$r_ptr,#32
	bl	__ecp_nistz256_sub_from	@ p256_sub(res_y, S, res_y);

	add	sp,sp,#32*5+16		@ +16 means "skip even over saved r0-r3"
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_point_double,.-ecp_nistz256_point_double
___
}

########################################################################
# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
#			      const P256_POINT *in2);
{
my ($res_x,$res_y,$res_z,
    $in1_x,$in1_y,$in1_z,
    $in2_x,$in2_y,$in2_z,
    $H,$Hsqr,$R,$Rsqr,$Hcub,
    $U1,$U2,$S1,$S2)=map(32*$_,(0..17));
my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
# above map() describes stack layout with 18 temporary
# 256-bit vectors on top. Then note that we push
# starting from r0, which means that we have copy of
# input arguments just below these temporary vectors.
# We use three of them for !in1infty, !in2infty and
# result of check for zero.
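#
# For reference (illustrative note, not in the original source), the
# call sequence below computes the standard Jacobian addition used by
# ecp_nistz256_point_add in ecp_nistz256.c:
#
#	U1 = X1*Z2^2, U2 = X2*Z1^2, S1 = Y1*Z2^3, S2 = Y2*Z1^3
#	H  = U2 - U1,  R = S2 - S1
#	X3 = R^2 - H^3 - 2*U1*H^2
#	Y3 = R*(U1*H^2 - X3) - S1*H^3
#	Z3 = H*Z1*Z2
#
# When H == 0 and both inputs are finite, R == 0 dispatches to the
# doubling code and R != 0 yields the point at infinity.
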
$code.=<<___;
.globl	ecp_nistz256_point_add
.type	ecp_nistz256_point_add,%function
.align	5
ecp_nistz256_point_add:
	stmdb	sp!,{r0-r12,lr}		@ push from r0, unusual, but intentional
	sub	sp,sp,#32*18+16

	ldmia	$b_ptr!,{r4-r11}	@ copy in2_x
	add	r3,sp,#$in2_x
	stmia	r3!,{r4-r11}
	ldmia	$b_ptr!,{r4-r11}	@ copy in2_y
	stmia	r3!,{r4-r11}
	ldmia	$b_ptr,{r4-r11}		@ copy in2_z
	orr	r12,r4,r5
	orr	r12,r12,r6
	orr	r12,r12,r7
	orr	r12,r12,r8
	orr	r12,r12,r9
	orr	r12,r12,r10
	orr	r12,r12,r11
	cmp	r12,#0
#ifdef __thumb2__
	it	ne
#endif
	movne	r12,#-1
	stmia	r3,{r4-r11}
	str	r12,[sp,#32*18+8]	@ !in2infty

	ldmia	$a_ptr!,{r4-r11}	@ copy in1_x
	add	r3,sp,#$in1_x
	stmia	r3!,{r4-r11}
	ldmia	$a_ptr!,{r4-r11}	@ copy in1_y
	stmia	r3!,{r4-r11}
	ldmia	$a_ptr,{r4-r11}		@ copy in1_z
	orr	r12,r4,r5
	orr	r12,r12,r6
	orr	r12,r12,r7
	orr	r12,r12,r8
	orr	r12,r12,r9
	orr	r12,r12,r10
	orr	r12,r12,r11
	cmp	r12,#0
#ifdef __thumb2__
	it	ne
#endif
	movne	r12,#-1
	stmia	r3,{r4-r11}
	str	r12,[sp,#32*18+4]	@ !in1infty

	add	$a_ptr,sp,#$in2_z
	add	$b_ptr,sp,#$in2_z
	add	$r_ptr,sp,#$Z2sqr
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Z2sqr, in2_z);

	add	$a_ptr,sp,#$in1_z
	add	$b_ptr,sp,#$in1_z
	add	$r_ptr,sp,#$Z1sqr
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Z1sqr, in1_z);

	add	$a_ptr,sp,#$in2_z
	add	$b_ptr,sp,#$Z2sqr
	add	$r_ptr,sp,#$S1
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S1, Z2sqr, in2_z);

	add	$a_ptr,sp,#$in1_z
	add	$b_ptr,sp,#$Z1sqr
	add	$r_ptr,sp,#$S2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, Z1sqr, in1_z);

	add	$a_ptr,sp,#$in1_y
	add	$b_ptr,sp,#$S1
	add	$r_ptr,sp,#$S1
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S1, S1, in1_y);

	add	$a_ptr,sp,#$in2_y
	add	$b_ptr,sp,#$S2
	add	$r_ptr,sp,#$S2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, S2, in2_y);

	add	$b_ptr,sp,#$S1
	add	$r_ptr,sp,#$R
	bl	__ecp_nistz256_sub_from	@ p256_sub(R, S2, S1);

	orr	$a0,$a0,$a1		@ see if result is zero
	orr	$a2,$a2,$a3
	orr	$a4,$a4,$a5
	orr	$a0,$a0,$a2
	orr	$a4,$a4,$a6
	orr	$a0,$a0,$a7
	add	$a_ptr,sp,#$in1_x
	orr	$a0,$a0,$a4
	add	$b_ptr,sp,#$Z2sqr
	str	$a0,[sp,#32*18+12]

	add	$r_ptr,sp,#$U1
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(U1, in1_x, Z2sqr);

	add	$a_ptr,sp,#$in2_x
	add	$b_ptr,sp,#$Z1sqr
	add	$r_ptr,sp,#$U2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(U2, in2_x, Z1sqr);

	add	$b_ptr,sp,#$U1
	add	$r_ptr,sp,#$H
	bl	__ecp_nistz256_sub_from	@ p256_sub(H, U2, U1);

	orr	$a0,$a0,$a1		@ see if result is zero
	orr	$a2,$a2,$a3
	orr	$a4,$a4,$a5
	orr	$a0,$a0,$a2
	orr	$a4,$a4,$a6
	orr	$a0,$a0,$a7
	orrs	$a0,$a0,$a4

	bne	.Ladd_proceed		@ is_equal(U1,U2)?

	ldr	$t0,[sp,#32*18+4]
	ldr	$t1,[sp,#32*18+8]
	ldr	$t2,[sp,#32*18+12]
	tst	$t0,$t1
	beq	.Ladd_proceed		@ (in1infty || in2infty)?
	tst	$t2,$t2
	beq	.Ladd_double		@ is_equal(S1,S2)?

	ldr	$r_ptr,[sp,#32*18+16]
	eor	r4,r4,r4
	eor	r5,r5,r5
	eor	r6,r6,r6
	eor	r7,r7,r7
	eor	r8,r8,r8
	eor	r9,r9,r9
	eor	r10,r10,r10
	eor	r11,r11,r11
	stmia	$r_ptr!,{r4-r11}
	stmia	$r_ptr!,{r4-r11}
	stmia	$r_ptr!,{r4-r11}
	b	.Ladd_done

.align	4
.Ladd_double:
	ldr	$a_ptr,[sp,#32*18+20]
	add	sp,sp,#32*(18-5)+16	@ difference in frame sizes
	b	.Lpoint_double_shortcut

.align	4
.Ladd_proceed:
	add	$a_ptr,sp,#$R
	add	$b_ptr,sp,#$R
	add	$r_ptr,sp,#$Rsqr
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Rsqr, R);

	add	$a_ptr,sp,#$H
	add	$b_ptr,sp,#$in1_z
	add	$r_ptr,sp,#$res_z
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(res_z, H, in1_z);

	add	$a_ptr,sp,#$H
	add	$b_ptr,sp,#$H
	add	$r_ptr,sp,#$Hsqr
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Hsqr, H);

	add	$a_ptr,sp,#$in2_z
	add	$b_ptr,sp,#$res_z
	add	$r_ptr,sp,#$res_z
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(res_z, res_z, in2_z);

	add	$a_ptr,sp,#$H
	add	$b_ptr,sp,#$Hsqr
	add	$r_ptr,sp,#$Hcub
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(Hcub, Hsqr, H);

	add	$a_ptr,sp,#$Hsqr
	add	$b_ptr,sp,#$U1
	add	$r_ptr,sp,#$U2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(U2, U1, Hsqr);

	add	$r_ptr,sp,#$Hsqr
	bl	__ecp_nistz256_add_self	@ p256_mul_by_2(Hsqr, U2);

	add	$b_ptr,sp,#$Rsqr
	add	$r_ptr,sp,#$res_x
	bl	__ecp_nistz256_sub_morf	@ p256_sub(res_x, Rsqr, Hsqr);

	add	$b_ptr,sp,#$Hcub
	bl	__ecp_nistz256_sub_from	@ p256_sub(res_x, res_x, Hcub);

	add	$b_ptr,sp,#$U2
	add	$r_ptr,sp,#$res_y
	bl	__ecp_nistz256_sub_morf	@ p256_sub(res_y, U2, res_x);

	add	$a_ptr,sp,#$Hcub
	add	$b_ptr,sp,#$S1
	add	$r_ptr,sp,#$S2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, S1, Hcub);

	add	$a_ptr,sp,#$R
	add	$b_ptr,sp,#$res_y
	add	$r_ptr,sp,#$res_y
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(res_y, res_y, R);

	add	$b_ptr,sp,#$S2
	bl	__ecp_nistz256_sub_from	@ p256_sub(res_y, res_y, S2);

	ldr	r11,[sp,#32*18+4]	@ !in1infty
	ldr	r12,[sp,#32*18+8]	@ !in2infty
	add	r1,sp,#$res_x
	add	r2,sp,#$in2_x
	and	r10,r11,r12
	mvn	r11,r11
	add	r3,sp,#$in1_x
	and	r11,r11,r12
	mvn	r12,r12
	ldr	$r_ptr,[sp,#32*18+16]
___
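# Constant-time selection of the result (illustrative note, not in the
# original source): r10, r11 and r12 are all-ones/all-zeros masks, so
# each emitted word is
#
#	out = (res & r10) | (in2 & r11) | (in1 & r12)
#
# i.e. the computed sum when both inputs are finite, in2 when only in1
# is at infinity, and in1 when in2 is at infinity.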
for($i=0;$i<96;$i+=8) {			# conditional moves
$code.=<<___;
	ldmia	r1!,{r4-r5}		@ res_x
	ldmia	r2!,{r6-r7}		@ in2_x
	ldmia	r3!,{r8-r9}		@ in1_x
	and	r4,r4,r10
	and	r5,r5,r10
	and	r6,r6,r11
	and	r7,r7,r11
	and	r8,r8,r12
	and	r9,r9,r12
	orr	r4,r4,r6
	orr	r5,r5,r7
	orr	r4,r4,r8
	orr	r5,r5,r9
	stmia	$r_ptr!,{r4-r5}
___
}
$code.=<<___;
.Ladd_done:
	add	sp,sp,#32*18+16+16	@ +16 means "skip even over saved r0-r3"
#if __ARM_ARCH__>=5 || defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_point_add,.-ecp_nistz256_point_add
___
}

########################################################################
# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
#				     const P256_POINT_AFFINE *in2);
{
my ($res_x,$res_y,$res_z,
    $in1_x,$in1_y,$in1_z,
    $in2_x,$in2_y,
    $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..14));
my $Z1sqr = $S2;
# above map() describes stack layout with 15 temporary
# 256-bit vectors on top. Then note that we push
# starting from r0, which means that we have copy of
# input arguments just below these temporary vectors.
# We use two of them for !in1infty, !in2infty.

my @ONE_mont=(1,0,0,-1,-1,-1,-2,0);
| 1530 | |||
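| # Editor's note: a hedged aside, not part of the original file. | |||
| # @ONE_mont above is 1 in Montgomery representation, i.e. 2^256 mod p, | |||
| # stored as eight little-endian 32-bit words (-1 == 0xffffffff, | |||
| # -2 == 0xfffffffe). A standalone check with core Math::BigInt: | |||
| use Math::BigInt; | |||
| my $p_chk = Math::BigInt->new( | |||
| "0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff"); | |||
| my $one = Math::BigInt->new(0); | |||
| $one += Math::BigInt->new($ONE_mont[$_] & 0xffffffff) << (32*$_) for (0..7); | |||
| die "ONE_mont mismatch" unless $one == Math::BigInt->new(2)**256 % $p_chk; | |||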
| 1531 | $code.=<<___; | ||
| 1532 | .globl ecp_nistz256_point_add_affine | ||
| 1533 | .type ecp_nistz256_point_add_affine,%function | ||
| 1534 | .align 5 | ||
| 1535 | ecp_nistz256_point_add_affine: | ||
| 1536 | stmdb sp!,{r0-r12,lr} @ push from r0, unusual, but intentional | ||
| 1537 | sub sp,sp,#32*15 | ||
| 1538 | |||
| 1539 | ldmia $a_ptr!,{r4-r11} @ copy in1_x | ||
| 1540 | add r3,sp,#$in1_x | ||
| 1541 | stmia r3!,{r4-r11} | ||
| 1542 | ldmia $a_ptr!,{r4-r11} @ copy in1_y | ||
| 1543 | stmia r3!,{r4-r11} | ||
| 1544 | ldmia $a_ptr,{r4-r11} @ copy in1_z | ||
| 1545 | orr r12,r4,r5 | ||
| 1546 | orr r12,r12,r6 | ||
| 1547 | orr r12,r12,r7 | ||
| 1548 | orr r12,r12,r8 | ||
| 1549 | orr r12,r12,r9 | ||
| 1550 | orr r12,r12,r10 | ||
| 1551 | orr r12,r12,r11 | ||
| 1552 | cmp r12,#0 | ||
| 1553 | #ifdef __thumb2__ | ||
| 1554 | it ne | ||
| 1555 | #endif | ||
| 1556 | movne r12,#-1 | ||
| 1557 | stmia r3,{r4-r11} | ||
| 1558 | str r12,[sp,#32*15+4] @ !in1infty | ||
| 1559 | |||
| 1560 | ldmia $b_ptr!,{r4-r11} @ copy in2_x | ||
| 1561 | add r3,sp,#$in2_x | ||
| 1562 | orr r12,r4,r5 | ||
| 1563 | orr r12,r12,r6 | ||
| 1564 | orr r12,r12,r7 | ||
| 1565 | orr r12,r12,r8 | ||
| 1566 | orr r12,r12,r9 | ||
| 1567 | orr r12,r12,r10 | ||
| 1568 | orr r12,r12,r11 | ||
| 1569 | stmia r3!,{r4-r11} | ||
| 1570 | ldmia $b_ptr!,{r4-r11} @ copy in2_y | ||
| 1571 | orr r12,r12,r4 | ||
| 1572 | orr r12,r12,r5 | ||
| 1573 | orr r12,r12,r6 | ||
| 1574 | orr r12,r12,r7 | ||
| 1575 | orr r12,r12,r8 | ||
| 1576 | orr r12,r12,r9 | ||
| 1577 | orr r12,r12,r10 | ||
| 1578 | orr r12,r12,r11 | ||
| 1579 | stmia r3!,{r4-r11} | ||
| 1580 | cmp r12,#0 | ||
| 1581 | #ifdef __thumb2__ | ||
| 1582 | it ne | ||
| 1583 | #endif | ||
| 1584 | movne r12,#-1 | ||
| 1585 | str r12,[sp,#32*15+8] @ !in2infty | ||
| 1586 | |||
| 1587 | add $a_ptr,sp,#$in1_z | ||
| 1588 | add $b_ptr,sp,#$in1_z | ||
| 1589 | add $r_ptr,sp,#$Z1sqr | ||
| 1590 | bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Z1sqr, in1_z); | ||
| 1591 | |||
| 1592 | add $a_ptr,sp,#$Z1sqr | ||
| 1593 | add $b_ptr,sp,#$in2_x | ||
| 1594 | add $r_ptr,sp,#$U2 | ||
| 1595 | bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, Z1sqr, in2_x); | ||
| 1596 | |||
| 1597 | add $b_ptr,sp,#$in1_x | ||
| 1598 | add $r_ptr,sp,#$H | ||
| 1599 | bl __ecp_nistz256_sub_from @ p256_sub(H, U2, in1_x); | ||
| 1600 | |||
| 1601 | add $a_ptr,sp,#$Z1sqr | ||
| 1602 | add $b_ptr,sp,#$in1_z | ||
| 1603 | add $r_ptr,sp,#$S2 | ||
| 1604 | bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, Z1sqr, in1_z); | ||
| 1605 | |||
| 1606 | add $a_ptr,sp,#$H | ||
| 1607 | add $b_ptr,sp,#$in1_z | ||
| 1608 | add $r_ptr,sp,#$res_z | ||
| 1609 | bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_z, H, in1_z); | ||
| 1610 | |||
| 1611 | add $a_ptr,sp,#$in2_y | ||
| 1612 | add $b_ptr,sp,#$S2 | ||
| 1613 | add $r_ptr,sp,#$S2 | ||
| 1614 | bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, S2, in2_y); | ||
| 1615 | |||
| 1616 | add $b_ptr,sp,#$in1_y | ||
| 1617 | add $r_ptr,sp,#$R | ||
| 1618 | bl __ecp_nistz256_sub_from @ p256_sub(R, S2, in1_y); | ||
| 1619 | |||
| 1620 | add $a_ptr,sp,#$H | ||
| 1621 | add $b_ptr,sp,#$H | ||
| 1622 | add $r_ptr,sp,#$Hsqr | ||
| 1623 | bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Hsqr, H); | ||
| 1624 | |||
| 1625 | add $a_ptr,sp,#$R | ||
| 1626 | add $b_ptr,sp,#$R | ||
| 1627 | add $r_ptr,sp,#$Rsqr | ||
| 1628 | bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Rsqr, R); | ||
| 1629 | |||
| 1630 | add $a_ptr,sp,#$H | ||
| 1631 | add $b_ptr,sp,#$Hsqr | ||
| 1632 | add $r_ptr,sp,#$Hcub | ||
| 1633 | bl __ecp_nistz256_mul_mont @ p256_mul_mont(Hcub, Hsqr, H); | ||
| 1634 | |||
| 1635 | add $a_ptr,sp,#$Hsqr | ||
| 1636 | add $b_ptr,sp,#$in1_x | ||
| 1637 | add $r_ptr,sp,#$U2 | ||
| 1638 | bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, in1_x, Hsqr); | ||
| 1639 | |||
| 1640 | add $r_ptr,sp,#$Hsqr | ||
| 1641 | bl __ecp_nistz256_add_self @ p256_mul_by_2(Hsqr, U2); | ||
| 1642 | |||
| 1643 | add $b_ptr,sp,#$Rsqr | ||
| 1644 | add $r_ptr,sp,#$res_x | ||
| 1645 | bl __ecp_nistz256_sub_morf @ p256_sub(res_x, Rsqr, Hsqr); | ||
| 1646 | |||
| 1647 | add $b_ptr,sp,#$Hcub | ||
| 1648 | bl __ecp_nistz256_sub_from @ p256_sub(res_x, res_x, Hcub); | ||
| 1649 | |||
| 1650 | add $b_ptr,sp,#$U2 | ||
| 1651 | add $r_ptr,sp,#$res_y | ||
| 1652 | bl __ecp_nistz256_sub_morf @ p256_sub(res_y, U2, res_x); | ||
| 1653 | |||
| 1654 | add $a_ptr,sp,#$Hcub | ||
| 1655 | add $b_ptr,sp,#$in1_y | ||
| 1656 | add $r_ptr,sp,#$S2 | ||
| 1657 | bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, in1_y, Hcub); | ||
| 1658 | |||
| 1659 | add $a_ptr,sp,#$R | ||
| 1660 | add $b_ptr,sp,#$res_y | ||
| 1661 | add $r_ptr,sp,#$res_y | ||
| 1662 | bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_y, res_y, R); | ||
| 1663 | |||
| 1664 | add $b_ptr,sp,#$S2 | ||
| 1665 | bl __ecp_nistz256_sub_from @ p256_sub(res_y, res_y, S2); | ||
| 1666 | |||
| 1667 | ldr r11,[sp,#32*15+4] @ !in1infty | ||
| 1668 | ldr r12,[sp,#32*15+8] @ !in2infty | ||
| 1669 | add r1,sp,#$res_x | ||
| 1670 | add r2,sp,#$in2_x | ||
| 1671 | and r10,r11,r12 | ||
| 1672 | mvn r11,r11 | ||
| 1673 | add r3,sp,#$in1_x | ||
| 1674 | and r11,r11,r12 | ||
| 1675 | mvn r12,r12 | ||
| 1676 | ldr $r_ptr,[sp,#32*15] | ||
| 1677 | ___ | ||
| 1678 | for($i=0;$i<64;$i+=8) { # conditional moves | ||
| 1679 | $code.=<<___; | ||
| 1680 | ldmia r1!,{r4-r5} @ res_x | ||
| 1681 | ldmia r2!,{r6-r7} @ in2_x | ||
| 1682 | ldmia r3!,{r8-r9} @ in1_x | ||
| 1683 | and r4,r4,r10 | ||
| 1684 | and r5,r5,r10 | ||
| 1685 | and r6,r6,r11 | ||
| 1686 | and r7,r7,r11 | ||
| 1687 | and r8,r8,r12 | ||
| 1688 | and r9,r9,r12 | ||
| 1689 | orr r4,r4,r6 | ||
| 1690 | orr r5,r5,r7 | ||
| 1691 | orr r4,r4,r8 | ||
| 1692 | orr r5,r5,r9 | ||
| 1693 | stmia $r_ptr!,{r4-r5} | ||
| 1694 | ___ | ||
| 1695 | } | ||
| 1696 | for(;$i<96;$i+=8) { | ||
| 1697 | my $j=($i-64)/4; | ||
| 1698 | $code.=<<___; | ||
| 1699 | ldmia r1!,{r4-r5} @ res_z | ||
| 1700 | ldmia r3!,{r8-r9} @ in1_z | ||
| 1701 | and r4,r4,r10 | ||
| 1702 | and r5,r5,r10 | ||
| 1703 | and r6,r11,#@ONE_mont[$j] | ||
| 1704 | and r7,r11,#@ONE_mont[$j+1] | ||
| 1705 | and r8,r8,r12 | ||
| 1706 | and r9,r9,r12 | ||
| 1707 | orr r4,r4,r6 | ||
| 1708 | orr r5,r5,r7 | ||
| 1709 | orr r4,r4,r8 | ||
| 1710 | orr r5,r5,r9 | ||
| 1711 | stmia $r_ptr!,{r4-r5} | ||
| 1712 | ___ | ||
| 1713 | } | ||
| 1714 | $code.=<<___; | ||
| 1715 | add sp,sp,#32*15+16 @ +16 means "skip even over saved r0-r3" | ||
| 1716 | #if __ARM_ARCH__>=5 || !defined(__thumb__) | ||
| 1717 | ldmia sp!,{r4-r12,pc} | ||
| 1718 | #else | ||
| 1719 | ldmia sp!,{r4-r12,lr} | ||
| 1720 | bx lr @ interoperable with Thumb ISA:-) | ||
| 1721 | #endif | ||
| 1722 | .size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine | ||
| 1723 | ___ | ||
| 1724 | } }}} | ||
| 1725 | |||
| 1726 | foreach (split("\n",$code)) { | ||
| 1727 | s/\`([^\`]*)\`/eval $1/geo; | ||
| 1728 | |||
| 1729 | s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo; | ||
| 1730 | |||
| 1731 | print $_,"\n"; | ||
| 1732 | } | ||
| 1733 | close STDOUT; # enforce flush | ||
diff --git a/src/lib/libcrypto/ec/asm/ecp_nistz256-sparcv9.pl b/src/lib/libcrypto/ec/asm/ecp_nistz256-sparcv9.pl deleted file mode 100644 index 49460fefdc..0000000000 --- a/src/lib/libcrypto/ec/asm/ecp_nistz256-sparcv9.pl +++ /dev/null | |||
| @@ -1,2890 +0,0 @@ | |||
| 1 | #! /usr/bin/env perl | ||
| 2 | # $OpenBSD: ecp_nistz256-sparcv9.pl,v 1.2 2022/12/26 07:18:51 jmc Exp $ | ||
| 3 | # | ||
| 4 | # Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved. | ||
| 5 | # | ||
| 6 | # Licensed under the OpenSSL license (the "License"). You may not use | ||
| 7 | # this file except in compliance with the License. You can obtain a copy | ||
| 8 | # in the file LICENSE in the source distribution or at | ||
| 9 | # https://www.openssl.org/source/license.html | ||
| 10 | |||
| 11 | |||
| 12 | # ==================================================================== | ||
| 13 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
| 14 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 15 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 16 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 17 | # ==================================================================== | ||
| 18 | # | ||
| 19 | # ECP_NISTZ256 module for SPARCv9. | ||
| 20 | # | ||
| 21 | # February 2015. | ||
| 22 | # | ||
| 23 | # Original ECP_NISTZ256 submission targeting x86_64 is detailed in | ||
| 24 | # http://eprint.iacr.org/2013/816. In the process of adaptation | ||
| 25 | # original .c module was made 32-bit savvy in order to make this | ||
| 26 | # implementation possible. | ||
| 27 | # | ||
| 28 | # with/without -DECP_NISTZ256_ASM | ||
| 29 | # UltraSPARC III +12-18% | ||
| 30 | # SPARC T4 +99-550% (+66-150% on 32-bit Solaris) | ||
| 31 | # | ||
| 32 | # Ranges denote minimum and maximum improvement coefficients depending | ||
| 33 | # on benchmark. Lower coefficients are for ECDSA sign, server-side | ||
| 34 | # operation. Keep in mind that +200% means 3x improvement. | ||
| 35 | |||
| 36 | # Uncomment when all sparcv9 assembly generators are updated to take the output | ||
| 37 | # file as last argument... | ||
| 38 | # $output = pop; | ||
| 39 | # open STDOUT,">$output"; | ||
| 40 | |||
| 41 | $code.=<<___; | ||
| 42 | #define STACK_FRAME 192 | ||
| 43 | #define STACK_BIAS 2047 | ||
| 44 | |||
| 45 | #define LOCALS (STACK_BIAS+STACK_FRAME) | ||
| 46 | .register %g2,#scratch | ||
| 47 | .register %g3,#scratch | ||
| 48 | # define STACK64_FRAME STACK_FRAME | ||
| 49 | # define LOCALS64 LOCALS | ||
| 50 | |||
| 51 | .section ".text",#alloc,#execinstr | ||
| 52 | ___ | ||
| 53 | |||
| 54 | {{{ | ||
| 55 | my ($rp,$ap,$bp)=map("%i$_",(0..2)); | ||
| 56 | my @acc=map("%l$_",(0..7)); | ||
| 57 | my ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7)=(map("%o$_",(0..5)),"%g4","%g5"); | ||
| 58 | my ($bi,$a0,$mask,$carry)=(map("%i$_",(3..5)),"%g1"); | ||
| 59 | my ($rp_real,$ap_real)=("%g2","%g3"); | ||
| 60 | |||
| 61 | $code.=<<___; | ||
| 62 | .align 64 | ||
| 63 | .Lone: | ||
| 64 | .long 1,0,0,0,0,0,0,0 | ||
| 65 | |||
| 66 | ! void ecp_nistz256_from_mont(BN_ULONG %i0[8],const BN_ULONG %i1[8]); | ||
| 67 | .globl ecp_nistz256_from_mont | ||
| 68 | .align 32 | ||
| 69 | ecp_nistz256_from_mont: | ||
| 70 | save %sp,-STACK_FRAME,%sp | ||
| 71 | nop | ||
| 72 | 1: call .+8 | ||
| 73 | add %o7,.Lone-1b,$bp | ||
| 74 | call __ecp_nistz256_mul_mont | ||
| 75 | nop | ||
| 76 | ret | ||
| 77 | restore | ||
| 78 | .type ecp_nistz256_from_mont,#function | ||
| 79 | .size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont | ||
| 80 | |||
| 81 | ! void ecp_nistz256_mul_mont(BN_ULONG %i0[8],const BN_ULONG %i1[8], | ||
| 82 | ! const BN_ULONG %i2[8]); | ||
| 83 | .globl ecp_nistz256_mul_mont | ||
| 84 | .align 32 | ||
| 85 | ecp_nistz256_mul_mont: | ||
| 86 | save %sp,-STACK_FRAME,%sp | ||
| 87 | nop | ||
| 88 | call __ecp_nistz256_mul_mont | ||
| 89 | nop | ||
| 90 | ret | ||
| 91 | restore | ||
| 92 | .type ecp_nistz256_mul_mont,#function | ||
| 93 | .size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont | ||
| 94 | |||
| 95 | ! void ecp_nistz256_sqr_mont(BN_ULONG %i0[8],const BN_ULONG %i1[8]); | ||
| 96 | .globl ecp_nistz256_sqr_mont | ||
| 97 | .align 32 | ||
| 98 | ecp_nistz256_sqr_mont: | ||
| 99 | save %sp,-STACK_FRAME,%sp | ||
| 100 | mov $ap,$bp | ||
| 101 | call __ecp_nistz256_mul_mont | ||
| 102 | nop | ||
| 103 | ret | ||
| 104 | restore | ||
| 105 | .type ecp_nistz256_sqr_mont,#function | ||
| 106 | .size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont | ||
| 107 | ___ | ||
| 108 | |||
| 109 | ######################################################################## | ||
| 110 | # A special thing to keep in mind is that $t0-$t7 hold 64-bit values, | ||
| 111 | # while all others are meant to hold 32-bit ones. "Meant to" means that | ||
| 112 | # additions to @acc[0-7] do "contaminate" the upper bits, but they are | ||
| 113 | # cleared before they can affect the outcome (follow 'and' with $mask). | ||
| 114 | # Also keep in mind that addition with carry is addition with a 32-bit | ||
| 115 | # carry, even though the CPU is 64-bit. [Addition with 64-bit carry was | ||
| 116 | # introduced in T3, see below for VIS3 code paths.] | ||
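| # Editor's sketch (not part of the original file): the convention the | |||
| # comment above describes, modelled with native Perl integers (values | |||
| # are kept small enough to stay exact on any perl): | |||
| { | |||
| my $mask32 = 0xffffffff; # cf. "srl $mask,0,$mask" below | |||
| my ($a32, $b32) = (0xffffffff, 0x00000002); | |||
| my $acc = $a32 + $b32; # 0x100000001: "contaminates" bit 32 | |||
| my $c32 = ($acc >> 32) & 1; # the 32-bit carry, extracted by hand | |||
| $acc &= $mask32; # upper bits cleared before they matter | |||
| } | |||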
| 117 | |||
| 118 | $code.=<<___; | ||
| 119 | .align 32 | ||
| 120 | __ecp_nistz256_mul_mont: | ||
| 121 | ld [$bp+0],$bi ! b[0] | ||
| 122 | mov -1,$mask | ||
| 123 | ld [$ap+0],$a0 | ||
| 124 | srl $mask,0,$mask ! 0xffffffff | ||
| 125 | ld [$ap+4],$t1 | ||
| 126 | ld [$ap+8],$t2 | ||
| 127 | ld [$ap+12],$t3 | ||
| 128 | ld [$ap+16],$t4 | ||
| 129 | ld [$ap+20],$t5 | ||
| 130 | ld [$ap+24],$t6 | ||
| 131 | ld [$ap+28],$t7 | ||
| 132 | mulx $a0,$bi,$t0 ! a[0-7]*b[0], 64-bit results | ||
| 133 | mulx $t1,$bi,$t1 | ||
| 134 | mulx $t2,$bi,$t2 | ||
| 135 | mulx $t3,$bi,$t3 | ||
| 136 | mulx $t4,$bi,$t4 | ||
| 137 | mulx $t5,$bi,$t5 | ||
| 138 | mulx $t6,$bi,$t6 | ||
| 139 | mulx $t7,$bi,$t7 | ||
| 140 | srlx $t0,32,@acc[1] ! extract high parts | ||
| 141 | srlx $t1,32,@acc[2] | ||
| 142 | srlx $t2,32,@acc[3] | ||
| 143 | srlx $t3,32,@acc[4] | ||
| 144 | srlx $t4,32,@acc[5] | ||
| 145 | srlx $t5,32,@acc[6] | ||
| 146 | srlx $t6,32,@acc[7] | ||
| 147 | srlx $t7,32,@acc[0] ! "@acc[8]" | ||
| 148 | mov 0,$carry | ||
| 149 | ___ | ||
| 150 | for($i=1;$i<8;$i++) { | ||
| 151 | $code.=<<___; | ||
| 152 | addcc @acc[1],$t1,@acc[1] ! accumulate high parts | ||
| 153 | ld [$bp+4*$i],$bi ! b[$i] | ||
| 154 | ld [$ap+4],$t1 ! re-load a[1-7] | ||
| 155 | addccc @acc[2],$t2,@acc[2] | ||
| 156 | addccc @acc[3],$t3,@acc[3] | ||
| 157 | ld [$ap+8],$t2 | ||
| 158 | ld [$ap+12],$t3 | ||
| 159 | addccc @acc[4],$t4,@acc[4] | ||
| 160 | addccc @acc[5],$t5,@acc[5] | ||
| 161 | ld [$ap+16],$t4 | ||
| 162 | ld [$ap+20],$t5 | ||
| 163 | addccc @acc[6],$t6,@acc[6] | ||
| 164 | addccc @acc[7],$t7,@acc[7] | ||
| 165 | ld [$ap+24],$t6 | ||
| 166 | ld [$ap+28],$t7 | ||
| 167 | addccc @acc[0],$carry,@acc[0] ! "@acc[8]" | ||
| 168 | addc %g0,%g0,$carry | ||
| 169 | ___ | ||
| 170 | # Reduction iteration is normally performed by accumulating | ||
| 171 | # result of multiplication of modulus by "magic" digit [and | ||
| 172 | # omitting least significant word, which is guaranteed to | ||
| 173 | # be 0], but thanks to special form of modulus and "magic" | ||
| 174 | # digit being equal to least significant word, it can be | ||
| 175 | # performed with additions and subtractions alone. Indeed: | ||
| 176 | # | ||
| 177 | # ffff.0001.0000.0000.0000.ffff.ffff.ffff | ||
| 178 | # * abcd | ||
| 179 | # + xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd | ||
| 180 | # | ||
| 181 | # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we | ||
| 182 | # rewrite above as: | ||
| 183 | # | ||
| 184 | # xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd | ||
| 185 | # + abcd.0000.abcd.0000.0000.abcd.0000.0000.0000 | ||
| 186 | # - abcd.0000.0000.0000.0000.0000.0000.abcd | ||
| 187 | # | ||
| 188 | # or marking redundant operations: | ||
| 189 | # | ||
| 190 | # xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.---- | ||
| 191 | # + abcd.0000.abcd.0000.0000.abcd.----.----.---- | ||
| 192 | # - abcd.----.----.----.----.----.----.---- | ||
| 193 | |||
| 194 | $code.=<<___; | ||
| 195 | ! multiplication-less reduction | ||
| 196 | addcc @acc[3],$t0,@acc[3] ! r[3]+=r[0] | ||
| 197 | addccc @acc[4],%g0,@acc[4] ! r[4]+=0 | ||
| 198 | and @acc[1],$mask,@acc[1] | ||
| 199 | and @acc[2],$mask,@acc[2] | ||
| 200 | addccc @acc[5],%g0,@acc[5] ! r[5]+=0 | ||
| 201 | addccc @acc[6],$t0,@acc[6] ! r[6]+=r[0] | ||
| 202 | and @acc[3],$mask,@acc[3] | ||
| 203 | and @acc[4],$mask,@acc[4] | ||
| 204 | addccc @acc[7],%g0,@acc[7] ! r[7]+=0 | ||
| 205 | addccc @acc[0],$t0,@acc[0] ! r[8]+=r[0] "@acc[8]" | ||
| 206 | and @acc[5],$mask,@acc[5] | ||
| 207 | and @acc[6],$mask,@acc[6] | ||
| 208 | addc $carry,%g0,$carry ! top-most carry | ||
| 209 | subcc @acc[7],$t0,@acc[7] ! r[7]-=r[0] | ||
| 210 | subccc @acc[0],%g0,@acc[0] ! r[8]-=0 "@acc[8]" | ||
| 211 | subc $carry,%g0,$carry ! top-most carry | ||
| 212 | and @acc[7],$mask,@acc[7] | ||
| 213 | and @acc[0],$mask,@acc[0] ! "@acc[8]" | ||
| 214 | ___ | ||
| 215 | push(@acc,shift(@acc)); # rotate registers to "omit" acc[0] | ||
| 216 | $code.=<<___; | ||
| 217 | mulx $a0,$bi,$t0 ! a[0-7]*b[$i], 64-bit results | ||
| 218 | mulx $t1,$bi,$t1 | ||
| 219 | mulx $t2,$bi,$t2 | ||
| 220 | mulx $t3,$bi,$t3 | ||
| 221 | mulx $t4,$bi,$t4 | ||
| 222 | mulx $t5,$bi,$t5 | ||
| 223 | mulx $t6,$bi,$t6 | ||
| 224 | mulx $t7,$bi,$t7 | ||
| 225 | add @acc[0],$t0,$t0 ! accumulate low parts, can't overflow | ||
| 226 | add @acc[1],$t1,$t1 | ||
| 227 | srlx $t0,32,@acc[1] ! extract high parts | ||
| 228 | add @acc[2],$t2,$t2 | ||
| 229 | srlx $t1,32,@acc[2] | ||
| 230 | add @acc[3],$t3,$t3 | ||
| 231 | srlx $t2,32,@acc[3] | ||
| 232 | add @acc[4],$t4,$t4 | ||
| 233 | srlx $t3,32,@acc[4] | ||
| 234 | add @acc[5],$t5,$t5 | ||
| 235 | srlx $t4,32,@acc[5] | ||
| 236 | add @acc[6],$t6,$t6 | ||
| 237 | srlx $t5,32,@acc[6] | ||
| 238 | add @acc[7],$t7,$t7 | ||
| 239 | srlx $t6,32,@acc[7] | ||
| 240 | srlx $t7,32,@acc[0] ! "@acc[8]" | ||
| 241 | ___ | ||
| 242 | } | ||
| 243 | $code.=<<___; | ||
| 244 | addcc @acc[1],$t1,@acc[1] ! accumulate high parts | ||
| 245 | addccc @acc[2],$t2,@acc[2] | ||
| 246 | addccc @acc[3],$t3,@acc[3] | ||
| 247 | addccc @acc[4],$t4,@acc[4] | ||
| 248 | addccc @acc[5],$t5,@acc[5] | ||
| 249 | addccc @acc[6],$t6,@acc[6] | ||
| 250 | addccc @acc[7],$t7,@acc[7] | ||
| 251 | addccc @acc[0],$carry,@acc[0] ! "@acc[8]" | ||
| 252 | addc %g0,%g0,$carry | ||
| 253 | |||
| 254 | addcc @acc[3],$t0,@acc[3] ! multiplication-less reduction | ||
| 255 | addccc @acc[4],%g0,@acc[4] | ||
| 256 | addccc @acc[5],%g0,@acc[5] | ||
| 257 | addccc @acc[6],$t0,@acc[6] | ||
| 258 | addccc @acc[7],%g0,@acc[7] | ||
| 259 | addccc @acc[0],$t0,@acc[0] ! "@acc[8]" | ||
| 260 | addc $carry,%g0,$carry | ||
| 261 | subcc @acc[7],$t0,@acc[7] | ||
| 262 | subccc @acc[0],%g0,@acc[0] ! "@acc[8]" | ||
| 263 | subc $carry,%g0,$carry ! top-most carry | ||
| 264 | ___ | ||
| 265 | push(@acc,shift(@acc)); # rotate registers to omit acc[0] | ||
| 266 | $code.=<<___; | ||
| 267 | ! Final step is "if result > mod, subtract mod", but we do it | ||
| 268 | ! "other way around", namely subtract modulus from result | ||
| 269 | ! and if it borrowed, add modulus back. | ||
| 270 | |||
| 271 | subcc @acc[0],-1,@acc[0] ! subtract modulus | ||
| 272 | subccc @acc[1],-1,@acc[1] | ||
| 273 | subccc @acc[2],-1,@acc[2] | ||
| 274 | subccc @acc[3],0,@acc[3] | ||
| 275 | subccc @acc[4],0,@acc[4] | ||
| 276 | subccc @acc[5],0,@acc[5] | ||
| 277 | subccc @acc[6],1,@acc[6] | ||
| 278 | subccc @acc[7],-1,@acc[7] | ||
| 279 | subc $carry,0,$carry ! broadcast borrow bit | ||
| 280 | |||
| 281 | ! Note that because mod has special form, i.e. consists of | ||
| 282 | ! 0xffffffff, 1 and 0s, we can conditionally synthesize it by | ||
| 283 | ! using value of broadcasted borrow and the borrow bit itself. | ||
| 284 | ! To minimize dependency chain we first broadcast and then | ||
| 285 | ! extract the bit by negating (follow $bi). | ||
| 286 | |||
| 287 | addcc @acc[0],$carry,@acc[0] ! add modulus or zero | ||
| 288 | addccc @acc[1],$carry,@acc[1] | ||
| 289 | neg $carry,$bi | ||
| 290 | st @acc[0],[$rp] | ||
| 291 | addccc @acc[2],$carry,@acc[2] | ||
| 292 | st @acc[1],[$rp+4] | ||
| 293 | addccc @acc[3],0,@acc[3] | ||
| 294 | st @acc[2],[$rp+8] | ||
| 295 | addccc @acc[4],0,@acc[4] | ||
| 296 | st @acc[3],[$rp+12] | ||
| 297 | addccc @acc[5],0,@acc[5] | ||
| 298 | st @acc[4],[$rp+16] | ||
| 299 | addccc @acc[6],$bi,@acc[6] | ||
| 300 | st @acc[5],[$rp+20] | ||
| 301 | addc @acc[7],$carry,@acc[7] | ||
| 302 | st @acc[6],[$rp+24] | ||
| 303 | retl | ||
| 304 | st @acc[7],[$rp+28] | ||
| 305 | .type __ecp_nistz256_mul_mont,#function | ||
| 306 | .size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont | ||
| 307 | |||
| 308 | ! void ecp_nistz256_add(BN_ULONG %i0[8],const BN_ULONG %i1[8], | ||
| 309 | ! const BN_ULONG %i2[8]); | ||
| 310 | .globl ecp_nistz256_add | ||
| 311 | .align 32 | ||
| 312 | ecp_nistz256_add: | ||
| 313 | save %sp,-STACK_FRAME,%sp | ||
| 314 | ld [$ap],@acc[0] | ||
| 315 | ld [$ap+4],@acc[1] | ||
| 316 | ld [$ap+8],@acc[2] | ||
| 317 | ld [$ap+12],@acc[3] | ||
| 318 | ld [$ap+16],@acc[4] | ||
| 319 | ld [$ap+20],@acc[5] | ||
| 320 | ld [$ap+24],@acc[6] | ||
| 321 | call __ecp_nistz256_add | ||
| 322 | ld [$ap+28],@acc[7] | ||
| 323 | ret | ||
| 324 | restore | ||
| 325 | .type ecp_nistz256_add,#function | ||
| 326 | .size ecp_nistz256_add,.-ecp_nistz256_add | ||
| 327 | |||
| 328 | .align 32 | ||
| 329 | __ecp_nistz256_add: | ||
| 330 | ld [$bp+0],$t0 ! b[0] | ||
| 331 | ld [$bp+4],$t1 | ||
| 332 | ld [$bp+8],$t2 | ||
| 333 | ld [$bp+12],$t3 | ||
| 334 | addcc @acc[0],$t0,@acc[0] | ||
| 335 | ld [$bp+16],$t4 | ||
| 336 | ld [$bp+20],$t5 | ||
| 337 | addccc @acc[1],$t1,@acc[1] | ||
| 338 | ld [$bp+24],$t6 | ||
| 339 | ld [$bp+28],$t7 | ||
| 340 | addccc @acc[2],$t2,@acc[2] | ||
| 341 | addccc @acc[3],$t3,@acc[3] | ||
| 342 | addccc @acc[4],$t4,@acc[4] | ||
| 343 | addccc @acc[5],$t5,@acc[5] | ||
| 344 | addccc @acc[6],$t6,@acc[6] | ||
| 345 | addccc @acc[7],$t7,@acc[7] | ||
| 346 | addc %g0,%g0,$carry | ||
| 347 | |||
| 348 | .Lreduce_by_sub: | ||
| 349 | |||
| 350 | ! if a+b >= modulus, subtract modulus. | ||
| 351 | ! | ||
| 352 | ! But since comparison implies subtraction, we subtract | ||
| 353 | ! modulus and then add it back if subtraction borrowed. | ||
| 354 | |||
| 355 | subcc @acc[0],-1,@acc[0] | ||
| 356 | subccc @acc[1],-1,@acc[1] | ||
| 357 | subccc @acc[2],-1,@acc[2] | ||
| 358 | subccc @acc[3], 0,@acc[3] | ||
| 359 | subccc @acc[4], 0,@acc[4] | ||
| 360 | subccc @acc[5], 0,@acc[5] | ||
| 361 | subccc @acc[6], 1,@acc[6] | ||
| 362 | subccc @acc[7],-1,@acc[7] | ||
| 363 | subc $carry,0,$carry | ||
| 364 | |||
| 365 | ! Note that because mod has special form, i.e. consists of | ||
| 366 | ! 0xffffffff, 1 and 0s, we can conditionally synthesize it by | ||
| 367 | ! using value of borrow and its negative. | ||
| 368 | |||
| 369 | addcc @acc[0],$carry,@acc[0] ! add synthesized modulus | ||
| 370 | addccc @acc[1],$carry,@acc[1] | ||
| 371 | neg $carry,$bi | ||
| 372 | st @acc[0],[$rp] | ||
| 373 | addccc @acc[2],$carry,@acc[2] | ||
| 374 | st @acc[1],[$rp+4] | ||
| 375 | addccc @acc[3],0,@acc[3] | ||
| 376 | st @acc[2],[$rp+8] | ||
| 377 | addccc @acc[4],0,@acc[4] | ||
| 378 | st @acc[3],[$rp+12] | ||
| 379 | addccc @acc[5],0,@acc[5] | ||
| 380 | st @acc[4],[$rp+16] | ||
| 381 | addccc @acc[6],$bi,@acc[6] | ||
| 382 | st @acc[5],[$rp+20] | ||
| 383 | addc @acc[7],$carry,@acc[7] | ||
| 384 | st @acc[6],[$rp+24] | ||
| 385 | retl | ||
| 386 | st @acc[7],[$rp+28] | ||
| 387 | .type __ecp_nistz256_add,#function | ||
| 388 | .size __ecp_nistz256_add,.-__ecp_nistz256_add | ||
| 389 | |||
| 390 | ! void ecp_nistz256_mul_by_2(BN_ULONG %i0[8],const BN_ULONG %i1[8]); | ||
| 391 | .globl ecp_nistz256_mul_by_2 | ||
| 392 | .align 32 | ||
| 393 | ecp_nistz256_mul_by_2: | ||
| 394 | save %sp,-STACK_FRAME,%sp | ||
| 395 | ld [$ap],@acc[0] | ||
| 396 | ld [$ap+4],@acc[1] | ||
| 397 | ld [$ap+8],@acc[2] | ||
| 398 | ld [$ap+12],@acc[3] | ||
| 399 | ld [$ap+16],@acc[4] | ||
| 400 | ld [$ap+20],@acc[5] | ||
| 401 | ld [$ap+24],@acc[6] | ||
| 402 | call __ecp_nistz256_mul_by_2 | ||
| 403 | ld [$ap+28],@acc[7] | ||
| 404 | ret | ||
| 405 | restore | ||
| 406 | .type ecp_nistz256_mul_by_2,#function | ||
| 407 | .size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2 | ||
| 408 | |||
| 409 | .align 32 | ||
| 410 | __ecp_nistz256_mul_by_2: | ||
| 411 | addcc @acc[0],@acc[0],@acc[0] ! a+a=2*a | ||
| 412 | addccc @acc[1],@acc[1],@acc[1] | ||
| 413 | addccc @acc[2],@acc[2],@acc[2] | ||
| 414 | addccc @acc[3],@acc[3],@acc[3] | ||
| 415 | addccc @acc[4],@acc[4],@acc[4] | ||
| 416 | addccc @acc[5],@acc[5],@acc[5] | ||
| 417 | addccc @acc[6],@acc[6],@acc[6] | ||
| 418 | addccc @acc[7],@acc[7],@acc[7] | ||
| 419 | b .Lreduce_by_sub | ||
| 420 | addc %g0,%g0,$carry | ||
| 421 | .type __ecp_nistz256_mul_by_2,#function | ||
| 422 | .size __ecp_nistz256_mul_by_2,.-__ecp_nistz256_mul_by_2 | ||
| 423 | |||
| 424 | ! void ecp_nistz256_mul_by_3(BN_ULONG %i0[8],const BN_ULONG %i1[8]); | ||
| 425 | .globl ecp_nistz256_mul_by_3 | ||
| 426 | .align 32 | ||
| 427 | ecp_nistz256_mul_by_3: | ||
| 428 | save %sp,-STACK_FRAME,%sp | ||
| 429 | ld [$ap],@acc[0] | ||
| 430 | ld [$ap+4],@acc[1] | ||
| 431 | ld [$ap+8],@acc[2] | ||
| 432 | ld [$ap+12],@acc[3] | ||
| 433 | ld [$ap+16],@acc[4] | ||
| 434 | ld [$ap+20],@acc[5] | ||
| 435 | ld [$ap+24],@acc[6] | ||
| 436 | call __ecp_nistz256_mul_by_3 | ||
| 437 | ld [$ap+28],@acc[7] | ||
| 438 | ret | ||
| 439 | restore | ||
| 440 | .type ecp_nistz256_mul_by_3,#function | ||
| 441 | .size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3 | ||
| 442 | |||
| 443 | .align 32 | ||
| 444 | __ecp_nistz256_mul_by_3: | ||
| 445 | addcc @acc[0],@acc[0],$t0 ! a+a=2*a | ||
| 446 | addccc @acc[1],@acc[1],$t1 | ||
| 447 | addccc @acc[2],@acc[2],$t2 | ||
| 448 | addccc @acc[3],@acc[3],$t3 | ||
| 449 | addccc @acc[4],@acc[4],$t4 | ||
| 450 | addccc @acc[5],@acc[5],$t5 | ||
| 451 | addccc @acc[6],@acc[6],$t6 | ||
| 452 | addccc @acc[7],@acc[7],$t7 | ||
| 453 | addc %g0,%g0,$carry | ||
| 454 | |||
| 455 | subcc $t0,-1,$t0 ! .Lreduce_by_sub but without stores | ||
| 456 | subccc $t1,-1,$t1 | ||
| 457 | subccc $t2,-1,$t2 | ||
| 458 | subccc $t3, 0,$t3 | ||
| 459 | subccc $t4, 0,$t4 | ||
| 460 | subccc $t5, 0,$t5 | ||
| 461 | subccc $t6, 1,$t6 | ||
| 462 | subccc $t7,-1,$t7 | ||
| 463 | subc $carry,0,$carry | ||
| 464 | |||
| 465 | addcc $t0,$carry,$t0 ! add synthesized modulus | ||
| 466 | addccc $t1,$carry,$t1 | ||
| 467 | neg $carry,$bi | ||
| 468 | addccc $t2,$carry,$t2 | ||
| 469 | addccc $t3,0,$t3 | ||
| 470 | addccc $t4,0,$t4 | ||
| 471 | addccc $t5,0,$t5 | ||
| 472 | addccc $t6,$bi,$t6 | ||
| 473 | addc $t7,$carry,$t7 | ||
| 474 | |||
| 475 | addcc $t0,@acc[0],@acc[0] ! 2*a+a=3*a | ||
| 476 | addccc $t1,@acc[1],@acc[1] | ||
| 477 | addccc $t2,@acc[2],@acc[2] | ||
| 478 | addccc $t3,@acc[3],@acc[3] | ||
| 479 | addccc $t4,@acc[4],@acc[4] | ||
| 480 | addccc $t5,@acc[5],@acc[5] | ||
| 481 | addccc $t6,@acc[6],@acc[6] | ||
| 482 | addccc $t7,@acc[7],@acc[7] | ||
| 483 | b .Lreduce_by_sub | ||
| 484 | addc %g0,%g0,$carry | ||
| 485 | .type __ecp_nistz256_mul_by_3,#function | ||
| 486 | .size __ecp_nistz256_mul_by_3,.-__ecp_nistz256_mul_by_3 | ||
| 487 | |||
| 488 | ! void ecp_nistz256_neg(BN_ULONG %i0[8],const BN_ULONG %i1[8]); | ||
| 489 | .globl ecp_nistz256_neg | ||
| 490 | .align 32 | ||
| 491 | ecp_nistz256_neg: | ||
| 492 | save %sp,-STACK_FRAME,%sp | ||
| 493 | mov $ap,$bp | ||
| 494 | mov 0,@acc[0] | ||
| 495 | mov 0,@acc[1] | ||
| 496 | mov 0,@acc[2] | ||
| 497 | mov 0,@acc[3] | ||
| 498 | mov 0,@acc[4] | ||
| 499 | mov 0,@acc[5] | ||
| 500 | mov 0,@acc[6] | ||
| 501 | call __ecp_nistz256_sub_from | ||
| 502 | mov 0,@acc[7] | ||
| 503 | ret | ||
| 504 | restore | ||
| 505 | .type ecp_nistz256_neg,#function | ||
| 506 | .size ecp_nistz256_neg,.-ecp_nistz256_neg | ||
| 507 | |||
| 508 | .align 32 | ||
| 509 | __ecp_nistz256_sub_from: | ||
| 510 | ld [$bp+0],$t0 ! b[0] | ||
| 511 | ld [$bp+4],$t1 | ||
| 512 | ld [$bp+8],$t2 | ||
| 513 | ld [$bp+12],$t3 | ||
| 514 | subcc @acc[0],$t0,@acc[0] | ||
| 515 | ld [$bp+16],$t4 | ||
| 516 | ld [$bp+20],$t5 | ||
| 517 | subccc @acc[1],$t1,@acc[1] | ||
| 518 | subccc @acc[2],$t2,@acc[2] | ||
| 519 | ld [$bp+24],$t6 | ||
| 520 | ld [$bp+28],$t7 | ||
| 521 | subccc @acc[3],$t3,@acc[3] | ||
| 522 | subccc @acc[4],$t4,@acc[4] | ||
| 523 | subccc @acc[5],$t5,@acc[5] | ||
| 524 | subccc @acc[6],$t6,@acc[6] | ||
| 525 | subccc @acc[7],$t7,@acc[7] | ||
| 526 | subc %g0,%g0,$carry ! broadcast borrow bit | ||
| 527 | |||
| 528 | .Lreduce_by_add: | ||
| 529 | |||
| 530 | ! if a-b borrows, add modulus. | ||
| 531 | ! | ||
| 532 | ! Note that because mod has special form, i.e. consists of | ||
| 533 | ! 0xffffffff, 1 and 0s, we can conditionally synthesize it by | ||
| 534 | ! using value of broadcasted borrow and the borrow bit itself. | ||
| 535 | ! To minimize dependency chain we first broadcast and then | ||
| 536 | ! extract the bit by negating (follow $bi). | ||
| 537 | |||
| 538 | addcc @acc[0],$carry,@acc[0] ! add synthesized modulus | ||
| 539 | addccc @acc[1],$carry,@acc[1] | ||
| 540 | neg $carry,$bi | ||
| 541 | st @acc[0],[$rp] | ||
| 542 | addccc @acc[2],$carry,@acc[2] | ||
| 543 | st @acc[1],[$rp+4] | ||
| 544 | addccc @acc[3],0,@acc[3] | ||
| 545 | st @acc[2],[$rp+8] | ||
| 546 | addccc @acc[4],0,@acc[4] | ||
| 547 | st @acc[3],[$rp+12] | ||
| 548 | addccc @acc[5],0,@acc[5] | ||
| 549 | st @acc[4],[$rp+16] | ||
| 550 | addccc @acc[6],$bi,@acc[6] | ||
| 551 | st @acc[5],[$rp+20] | ||
| 552 | addc @acc[7],$carry,@acc[7] | ||
| 553 | st @acc[6],[$rp+24] | ||
| 554 | retl | ||
| 555 | st @acc[7],[$rp+28] | ||
| 556 | .type __ecp_nistz256_sub_from,#function | ||
| 557 | .size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from | ||
| 558 | |||
| 559 | .align 32 | ||
| 560 | __ecp_nistz256_sub_morf: | ||
| 561 | ld [$bp+0],$t0 ! b[0] | ||
| 562 | ld [$bp+4],$t1 | ||
| 563 | ld [$bp+8],$t2 | ||
| 564 | ld [$bp+12],$t3 | ||
| 565 | subcc $t0,@acc[0],@acc[0] | ||
| 566 | ld [$bp+16],$t4 | ||
| 567 | ld [$bp+20],$t5 | ||
| 568 | subccc $t1,@acc[1],@acc[1] | ||
| 569 | subccc $t2,@acc[2],@acc[2] | ||
| 570 | ld [$bp+24],$t6 | ||
| 571 | ld [$bp+28],$t7 | ||
| 572 | subccc $t3,@acc[3],@acc[3] | ||
| 573 | subccc $t4,@acc[4],@acc[4] | ||
| 574 | subccc $t5,@acc[5],@acc[5] | ||
| 575 | subccc $t6,@acc[6],@acc[6] | ||
| 576 | subccc $t7,@acc[7],@acc[7] | ||
| 577 | b .Lreduce_by_add | ||
| 578 | subc %g0,%g0,$carry ! broadcast borrow bit | ||
| 579 | .type __ecp_nistz256_sub_morf,#function | ||
| 580 | .size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf | ||
| 581 | |||
| 582 | ! void ecp_nistz256_div_by_2(BN_ULONG %i0[8],const BN_ULONG %i1[8]); | ||
| 583 | .globl ecp_nistz256_div_by_2 | ||
| 584 | .align 32 | ||
| 585 | ecp_nistz256_div_by_2: | ||
| 586 | save %sp,-STACK_FRAME,%sp | ||
| 587 | ld [$ap],@acc[0] | ||
| 588 | ld [$ap+4],@acc[1] | ||
| 589 | ld [$ap+8],@acc[2] | ||
| 590 | ld [$ap+12],@acc[3] | ||
| 591 | ld [$ap+16],@acc[4] | ||
| 592 | ld [$ap+20],@acc[5] | ||
| 593 | ld [$ap+24],@acc[6] | ||
| 594 | call __ecp_nistz256_div_by_2 | ||
| 595 | ld [$ap+28],@acc[7] | ||
| 596 | ret | ||
| 597 | restore | ||
| 598 | .type ecp_nistz256_div_by_2,#function | ||
| 599 | .size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2 | ||
| 600 | |||
| 601 | .align 32 | ||
| 602 | __ecp_nistz256_div_by_2: | ||
| 603 | ! ret = (a is odd ? a+mod : a) >> 1 | ||
| 604 | |||
| 605 | and @acc[0],1,$bi | ||
| 606 | neg $bi,$carry | ||
| 607 | addcc @acc[0],$carry,@acc[0] | ||
| 608 | addccc @acc[1],$carry,@acc[1] | ||
| 609 | addccc @acc[2],$carry,@acc[2] | ||
| 610 | addccc @acc[3],0,@acc[3] | ||
| 611 | addccc @acc[4],0,@acc[4] | ||
| 612 | addccc @acc[5],0,@acc[5] | ||
| 613 | addccc @acc[6],$bi,@acc[6] | ||
| 614 | addccc @acc[7],$carry,@acc[7] | ||
| 615 | addc %g0,%g0,$carry | ||
| 616 | |||
| 617 | ! ret >>= 1 | ||
| 618 | |||
| 619 | srl @acc[0],1,@acc[0] | ||
| 620 | sll @acc[1],31,$t0 | ||
| 621 | srl @acc[1],1,@acc[1] | ||
| 622 | or @acc[0],$t0,@acc[0] | ||
| 623 | sll @acc[2],31,$t1 | ||
| 624 | srl @acc[2],1,@acc[2] | ||
| 625 | or @acc[1],$t1,@acc[1] | ||
| 626 | sll @acc[3],31,$t2 | ||
| 627 | st @acc[0],[$rp] | ||
| 628 | srl @acc[3],1,@acc[3] | ||
| 629 | or @acc[2],$t2,@acc[2] | ||
| 630 | sll @acc[4],31,$t3 | ||
| 631 | st @acc[1],[$rp+4] | ||
| 632 | srl @acc[4],1,@acc[4] | ||
| 633 | or @acc[3],$t3,@acc[3] | ||
| 634 | sll @acc[5],31,$t4 | ||
| 635 | st @acc[2],[$rp+8] | ||
| 636 | srl @acc[5],1,@acc[5] | ||
| 637 | or @acc[4],$t4,@acc[4] | ||
| 638 | sll @acc[6],31,$t5 | ||
| 639 | st @acc[3],[$rp+12] | ||
| 640 | srl @acc[6],1,@acc[6] | ||
| 641 | or @acc[5],$t5,@acc[5] | ||
| 642 | sll @acc[7],31,$t6 | ||
| 643 | st @acc[4],[$rp+16] | ||
| 644 | srl @acc[7],1,@acc[7] | ||
| 645 | or @acc[6],$t6,@acc[6] | ||
| 646 | sll $carry,31,$t7 | ||
| 647 | st @acc[5],[$rp+20] | ||
| 648 | or @acc[7],$t7,@acc[7] | ||
| 649 | st @acc[6],[$rp+24] | ||
| 650 | retl | ||
| 651 | st @acc[7],[$rp+28] | ||
| 652 | .type __ecp_nistz256_div_by_2,#function | ||
| 653 | .size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2 | ||
| 654 | ___ | ||
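| # Editor's sketch (not part of the original file): the reduce-by-sub / | |||
| # reduce-by-add pattern shared by the subroutines above. Instead of | |||
| # comparing against the modulus, the code subtracts p unconditionally | |||
| # and adds it back when the subtraction borrowed; the borrow is | |||
| # broadcast to a mask from which the special-form modulus is | |||
| # synthesized, so no branch is taken. The branchy equivalent: | |||
| use Math::BigInt; | |||
| sub reduce_once { # x in [0, 2*p), e.g. the result of an add | |||
| my $x = shift; | |||
| my $p = Math::BigInt->new( | |||
| "0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff"); | |||
| my $t = $x - $p; # subtract modulus... | |||
| return $t->is_neg ? $t + $p : $t; # ...add it back if it borrowed | |||
| } | |||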
| 655 | |||
| 656 | ######################################################################## | ||
| 657 | # The following subroutines are "literal" implementations of those | ||
| 658 | # found in ecp_nistz256.c. | ||
| 659 | # | ||
| 660 | ######################################################################## | ||
| 661 | # void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp); | ||
| 662 | # | ||
| 663 | { | ||
| 664 | my ($S,$M,$Zsqr,$tmp0)=map(32*$_,(0..3)); | ||
| 665 | # above map() describes stack layout with 4 temporary | ||
| 666 | # 256-bit vectors on top. | ||
| 667 | |||
| 668 | $code.=<<___; | ||
| 669 | #if 0 | ||
| 670 | #ifdef __PIC__ | ||
| 671 | SPARC_PIC_THUNK(%g1) | ||
| 672 | #endif | ||
| 673 | #endif | ||
| 674 | |||
| 675 | .globl ecp_nistz256_point_double | ||
| 676 | .align 32 | ||
| 677 | ecp_nistz256_point_double: | ||
| 678 | #if 0 | ||
| 679 | SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5) | ||
| 680 | ld [%g1],%g1 ! OPENSSL_sparcv9cap_P[0] | ||
| 681 | and %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK),%g1 | ||
| 682 | cmp %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK) | ||
| 683 | be ecp_nistz256_point_double_vis3 | ||
| 684 | nop | ||
| 685 | #endif | ||
| 686 | |||
| 687 | save %sp,-STACK_FRAME-32*4,%sp | ||
| 688 | |||
| 689 | mov $rp,$rp_real | ||
| 690 | mov $ap,$ap_real | ||
| 691 | |||
| 692 | .Lpoint_double_shortcut: | ||
| 693 | ld [$ap+32],@acc[0] | ||
| 694 | ld [$ap+32+4],@acc[1] | ||
| 695 | ld [$ap+32+8],@acc[2] | ||
| 696 | ld [$ap+32+12],@acc[3] | ||
| 697 | ld [$ap+32+16],@acc[4] | ||
| 698 | ld [$ap+32+20],@acc[5] | ||
| 699 | ld [$ap+32+24],@acc[6] | ||
| 700 | ld [$ap+32+28],@acc[7] | ||
| 701 | call __ecp_nistz256_mul_by_2 ! p256_mul_by_2(S, in_y); | ||
| 702 | add %sp,LOCALS+$S,$rp | ||
| 703 | |||
| 704 | add $ap_real,64,$bp | ||
| 705 | add $ap_real,64,$ap | ||
| 706 | call __ecp_nistz256_mul_mont ! p256_sqr_mont(Zsqr, in_z); | ||
| 707 | add %sp,LOCALS+$Zsqr,$rp | ||
| 708 | |||
| 709 | add $ap_real,0,$bp | ||
| 710 | call __ecp_nistz256_add ! p256_add(M, Zsqr, in_x); | ||
| 711 | add %sp,LOCALS+$M,$rp | ||
| 712 | |||
| 713 | add %sp,LOCALS+$S,$bp | ||
| 714 | add %sp,LOCALS+$S,$ap | ||
| 715 | call __ecp_nistz256_mul_mont ! p256_sqr_mont(S, S); | ||
| 716 | add %sp,LOCALS+$S,$rp | ||
| 717 | |||
| 718 | ld [$ap_real],@acc[0] | ||
| 719 | add %sp,LOCALS+$Zsqr,$bp | ||
| 720 | ld [$ap_real+4],@acc[1] | ||
| 721 | ld [$ap_real+8],@acc[2] | ||
| 722 | ld [$ap_real+12],@acc[3] | ||
| 723 | ld [$ap_real+16],@acc[4] | ||
| 724 | ld [$ap_real+20],@acc[5] | ||
| 725 | ld [$ap_real+24],@acc[6] | ||
| 726 | ld [$ap_real+28],@acc[7] | ||
| 727 | call __ecp_nistz256_sub_from ! p256_sub(Zsqr, in_x, Zsqr); | ||
| 728 | add %sp,LOCALS+$Zsqr,$rp | ||
| 729 | |||
| 730 | add $ap_real,32,$bp | ||
| 731 | add $ap_real,64,$ap | ||
| 732 | call __ecp_nistz256_mul_mont ! p256_mul_mont(tmp0, in_z, in_y); | ||
| 733 | add %sp,LOCALS+$tmp0,$rp | ||
| 734 | |||
| 735 | call __ecp_nistz256_mul_by_2 ! p256_mul_by_2(res_z, tmp0); | ||
| 736 | add $rp_real,64,$rp | ||
| 737 | |||
| 738 | add %sp,LOCALS+$Zsqr,$bp | ||
| 739 | add %sp,LOCALS+$M,$ap | ||
| 740 | call __ecp_nistz256_mul_mont ! p256_mul_mont(M, M, Zsqr); | ||
| 741 | add %sp,LOCALS+$M,$rp | ||
| 742 | |||
| 743 | call __ecp_nistz256_mul_by_3 ! p256_mul_by_3(M, M); | ||
| 744 | add %sp,LOCALS+$M,$rp | ||
| 745 | |||
| 746 | add %sp,LOCALS+$S,$bp | ||
| 747 | add %sp,LOCALS+$S,$ap | ||
| 748 | call __ecp_nistz256_mul_mont ! p256_sqr_mont(tmp0, S); | ||
| 749 | add %sp,LOCALS+$tmp0,$rp | ||
| 750 | |||
| 751 | call __ecp_nistz256_div_by_2 ! p256_div_by_2(res_y, tmp0); | ||
| 752 | add $rp_real,32,$rp | ||
| 753 | |||
| 754 | add $ap_real,0,$bp | ||
| 755 | add %sp,LOCALS+$S,$ap | ||
| 756 | call __ecp_nistz256_mul_mont ! p256_mul_mont(S, S, in_x); | ||
| 757 | add %sp,LOCALS+$S,$rp | ||
| 758 | |||
| 759 | call __ecp_nistz256_mul_by_2 ! p256_mul_by_2(tmp0, S); | ||
| 760 | add %sp,LOCALS+$tmp0,$rp | ||
| 761 | |||
| 762 | add %sp,LOCALS+$M,$bp | ||
| 763 | add %sp,LOCALS+$M,$ap | ||
| 764 | call __ecp_nistz256_mul_mont ! p256_sqr_mont(res_x, M); | ||
| 765 | add $rp_real,0,$rp | ||
| 766 | |||
| 767 | add %sp,LOCALS+$tmp0,$bp | ||
| 768 | call __ecp_nistz256_sub_from ! p256_sub(res_x, res_x, tmp0); | ||
| 769 | add $rp_real,0,$rp | ||
| 770 | |||
| 771 | add %sp,LOCALS+$S,$bp | ||
| 772 | call __ecp_nistz256_sub_morf ! p256_sub(S, S, res_x); | ||
| 773 | add %sp,LOCALS+$S,$rp | ||
| 774 | |||
| 775 | add %sp,LOCALS+$M,$bp | ||
| 776 | add %sp,LOCALS+$S,$ap | ||
| 777 | call __ecp_nistz256_mul_mont ! p256_mul_mont(S, S, M); | ||
| 778 | add %sp,LOCALS+$S,$rp | ||
| 779 | |||
| 780 | add $rp_real,32,$bp | ||
| 781 | call __ecp_nistz256_sub_from ! p256_sub(res_y, S, res_y); | ||
| 782 | add $rp_real,32,$rp | ||
| 783 | |||
| 784 | ret | ||
| 785 | restore | ||
| 786 | .type ecp_nistz256_point_double,#function | ||
| 787 | .size ecp_nistz256_point_double,.-ecp_nistz256_point_double | ||
| 788 | ___ | ||
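| # Editor's sketch (not part of the original file): the call sequence | |||
| # above is a transcription of Jacobian point doubling as in | |||
| # ecp_nistz256.c. The assembly keeps everything in Montgomery form, | |||
| # which leaves the formulas unchanged, so the same dataflow over plain | |||
| # residues with core Math::BigInt is (sub name illustrative only): | |||
| use Math::BigInt; | |||
| sub ref_point_double { # takes/returns Jacobian (X, Y, Z) mod p | |||
| my ($x, $y, $z) = @_; | |||
| my $p = Math::BigInt->new( | |||
| "0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff"); | |||
| my $S = (4 * $x * $y * $y) % $p; # S = 4*X*Y^2 | |||
| my $M = (3 * ($x + $z*$z) * ($x - $z*$z)) % $p; # M = 3*(X+Z^2)*(X-Z^2) | |||
| my $rz = (2 * $y * $z) % $p; # res_z = 2*Y*Z | |||
| my $rx = ($M*$M - 2*$S) % $p; # res_x = M^2 - 2*S | |||
| my $ry = ($M*($S - $rx) - 8*$y**4) % $p; # res_y = M*(S-res_x) - 8*Y^4 | |||
| return ($rx, $ry, $rz); | |||
| } | |||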
| 789 | } | ||
| 790 | |||
| 791 | ######################################################################## | ||
| 792 | # void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1, | ||
| 793 | # const P256_POINT *in2); | ||
| 794 | { | ||
| 795 | my ($res_x,$res_y,$res_z, | ||
| 796 | $H,$Hsqr,$R,$Rsqr,$Hcub, | ||
| 797 | $U1,$U2,$S1,$S2)=map(32*$_,(0..11)); | ||
| 798 | my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); | ||
| 799 | |||
| 800 | # above map() describes stack layout with 12 temporary | ||
| 801 | # 256-bit vectors on top. Then we reserve some space for | ||
| 802 | # !in1infty, !in2infty, result of check for zero and return pointer. | ||
| 803 | |||
| 804 | my $bp_real=$rp_real; | ||
| 805 | |||
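| # Editor's sketch (not part of the original file): the calls emitted | |||
| # below follow standard Jacobian point addition. As a plain-residue | |||
| # reference (names illustrative; the assembly additionally handles the | |||
| # infinity and doubling corner cases checked around .Ladd_proceed): | |||
| use Math::BigInt; | |||
| sub ref_point_add { # two finite Jacobian points, P1 != +-P2 | |||
| my ($x1,$y1,$z1,$x2,$y2,$z2) = @_; | |||
| my $p = Math::BigInt->new( | |||
| "0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff"); | |||
| my $u1 = ($x1*$z2*$z2) % $p; my $u2 = ($x2*$z1*$z1) % $p; # U1, U2 | |||
| my $s1 = ($y1*$z2**3) % $p; my $s2 = ($y2*$z1**3) % $p; # S1, S2 | |||
| my $h = ($u2 - $u1) % $p; my $r = ($s2 - $s1) % $p; # H, R | |||
| my $rx = ($r*$r - $h**3 - 2*$u1*$h*$h) % $p; # res_x | |||
| my $ry = ($r*($u1*$h*$h - $rx) - $s1*$h**3) % $p; # res_y | |||
| my $rz = ($h*$z1*$z2) % $p; # res_z | |||
| return ($rx, $ry, $rz); | |||
| } | |||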
| 806 | $code.=<<___; | ||
| 807 | .globl ecp_nistz256_point_add | ||
| 808 | .align 32 | ||
| 809 | ecp_nistz256_point_add: | ||
| 810 | #if 0 | ||
| 811 | SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5) | ||
| 812 | ld [%g1],%g1 ! OPENSSL_sparcv9cap_P[0] | ||
| 813 | and %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK),%g1 | ||
| 814 | cmp %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK) | ||
| 815 | be ecp_nistz256_point_add_vis3 | ||
| 816 | nop | ||
| 817 | #endif | ||
| 818 | |||
| 819 | save %sp,-STACK_FRAME-32*12-32,%sp | ||
| 820 | |||
| 821 | stx $rp,[%fp+STACK_BIAS-8] ! off-load $rp | ||
| 822 | mov $ap,$ap_real | ||
| 823 | mov $bp,$bp_real | ||
| 824 | |||
| 825 | ld [$bp+64],$t0 ! in2_z | ||
| 826 | ld [$bp+64+4],$t1 | ||
| 827 | ld [$bp+64+8],$t2 | ||
| 828 | ld [$bp+64+12],$t3 | ||
| 829 | ld [$bp+64+16],$t4 | ||
| 830 | ld [$bp+64+20],$t5 | ||
| 831 | ld [$bp+64+24],$t6 | ||
| 832 | ld [$bp+64+28],$t7 | ||
| 833 | or $t1,$t0,$t0 | ||
| 834 | or $t3,$t2,$t2 | ||
| 835 | or $t5,$t4,$t4 | ||
| 836 | or $t7,$t6,$t6 | ||
| 837 | or $t2,$t0,$t0 | ||
| 838 | or $t6,$t4,$t4 | ||
| 839 | or $t4,$t0,$t0 ! !in2infty | ||
| 840 | movrnz $t0,-1,$t0 | ||
| 841 | st $t0,[%fp+STACK_BIAS-12] | ||
| 842 | |||
| 843 | ld [$ap+64],$t0 ! in1_z | ||
| 844 | ld [$ap+64+4],$t1 | ||
| 845 | ld [$ap+64+8],$t2 | ||
| 846 | ld [$ap+64+12],$t3 | ||
| 847 | ld [$ap+64+16],$t4 | ||
| 848 | ld [$ap+64+20],$t5 | ||
| 849 | ld [$ap+64+24],$t6 | ||
| 850 | ld [$ap+64+28],$t7 | ||
| 851 | or $t1,$t0,$t0 | ||
| 852 | or $t3,$t2,$t2 | ||
| 853 | or $t5,$t4,$t4 | ||
| 854 | or $t7,$t6,$t6 | ||
| 855 | or $t2,$t0,$t0 | ||
| 856 | or $t6,$t4,$t4 | ||
| 857 | or $t4,$t0,$t0 ! !in1infty | ||
| 858 | movrnz $t0,-1,$t0 | ||
| 859 | st $t0,[%fp+STACK_BIAS-16] | ||
| 860 | |||
| 861 | add $bp_real,64,$bp | ||
| 862 | add $bp_real,64,$ap | ||
| 863 | call __ecp_nistz256_mul_mont ! p256_sqr_mont(Z2sqr, in2_z); | ||
| 864 | add %sp,LOCALS+$Z2sqr,$rp | ||
| 865 | |||
| 866 | add $ap_real,64,$bp | ||
| 867 | add $ap_real,64,$ap | ||
| 868 | call __ecp_nistz256_mul_mont ! p256_sqr_mont(Z1sqr, in1_z); | ||
| 869 | add %sp,LOCALS+$Z1sqr,$rp | ||
| 870 | |||
| 871 | add $bp_real,64,$bp | ||
| 872 | add %sp,LOCALS+$Z2sqr,$ap | ||
| 873 | call __ecp_nistz256_mul_mont ! p256_mul_mont(S1, Z2sqr, in2_z); | ||
| 874 | add %sp,LOCALS+$S1,$rp | ||
| 875 | |||
| 876 | add $ap_real,64,$bp | ||
| 877 | add %sp,LOCALS+$Z1sqr,$ap | ||
| 878 | call __ecp_nistz256_mul_mont ! p256_mul_mont(S2, Z1sqr, in1_z); | ||
| 879 | add %sp,LOCALS+$S2,$rp | ||
| 880 | |||
| 881 | add $ap_real,32,$bp | ||
| 882 | add %sp,LOCALS+$S1,$ap | ||
| 883 | call __ecp_nistz256_mul_mont ! p256_mul_mont(S1, S1, in1_y); | ||
| 884 | add %sp,LOCALS+$S1,$rp | ||
| 885 | |||
| 886 | add $bp_real,32,$bp | ||
| 887 | add %sp,LOCALS+$S2,$ap | ||
| 888 | call __ecp_nistz256_mul_mont ! p256_mul_mont(S2, S2, in2_y); | ||
| 889 | add %sp,LOCALS+$S2,$rp | ||
| 890 | |||
| 891 | add %sp,LOCALS+$S1,$bp | ||
| 892 | call __ecp_nistz256_sub_from ! p256_sub(R, S2, S1); | ||
| 893 | add %sp,LOCALS+$R,$rp | ||
| 894 | |||
| 895 | or @acc[1],@acc[0],@acc[0] ! see if result is zero | ||
| 896 | or @acc[3],@acc[2],@acc[2] | ||
| 897 | or @acc[5],@acc[4],@acc[4] | ||
| 898 | or @acc[7],@acc[6],@acc[6] | ||
| 899 | or @acc[2],@acc[0],@acc[0] | ||
| 900 | or @acc[6],@acc[4],@acc[4] | ||
| 901 | or @acc[4],@acc[0],@acc[0] | ||
| 902 | st @acc[0],[%fp+STACK_BIAS-20] | ||
| 903 | |||
| 904 | add $ap_real,0,$bp | ||
| 905 | add %sp,LOCALS+$Z2sqr,$ap | ||
| 906 | call __ecp_nistz256_mul_mont ! p256_mul_mont(U1, in1_x, Z2sqr); | ||
| 907 | add %sp,LOCALS+$U1,$rp | ||
| 908 | |||
| 909 | add $bp_real,0,$bp | ||
| 910 | add %sp,LOCALS+$Z1sqr,$ap | ||
| 911 | call __ecp_nistz256_mul_mont ! p256_mul_mont(U2, in2_x, Z1sqr); | ||
| 912 | add %sp,LOCALS+$U2,$rp | ||
| 913 | |||
| 914 | add %sp,LOCALS+$U1,$bp | ||
| 915 | call __ecp_nistz256_sub_from ! p256_sub(H, U2, U1); | ||
| 916 | add %sp,LOCALS+$H,$rp | ||
| 917 | |||
| 918 | or @acc[1],@acc[0],@acc[0] ! see if result is zero | ||
| 919 | or @acc[3],@acc[2],@acc[2] | ||
| 920 | or @acc[5],@acc[4],@acc[4] | ||
| 921 | or @acc[7],@acc[6],@acc[6] | ||
| 922 | or @acc[2],@acc[0],@acc[0] | ||
| 923 | or @acc[6],@acc[4],@acc[4] | ||
| 924 | orcc @acc[4],@acc[0],@acc[0] | ||
| 925 | |||
| 926 | bne,pt %icc,.Ladd_proceed ! is_equal(U1,U2)? | ||
| 927 | nop | ||
| 928 | |||
| 929 | ld [%fp+STACK_BIAS-12],$t0 | ||
| 930 | ld [%fp+STACK_BIAS-16],$t1 | ||
| 931 | ld [%fp+STACK_BIAS-20],$t2 | ||
| 932 | andcc $t0,$t1,%g0 | ||
| 933 | be,pt %icc,.Ladd_proceed ! (in1infty || in2infty)? | ||
| 934 | nop | ||
| 935 | andcc $t2,$t2,%g0 | ||
| 936 | be,pt %icc,.Ladd_double ! is_equal(S1,S2)? | ||
| 937 | nop | ||
| 938 | |||
| 939 | ldx [%fp+STACK_BIAS-8],$rp | ||
| 940 | st %g0,[$rp] | ||
| 941 | st %g0,[$rp+4] | ||
| 942 | st %g0,[$rp+8] | ||
| 943 | st %g0,[$rp+12] | ||
| 944 | st %g0,[$rp+16] | ||
| 945 | st %g0,[$rp+20] | ||
| 946 | st %g0,[$rp+24] | ||
| 947 | st %g0,[$rp+28] | ||
| 948 | st %g0,[$rp+32] | ||
| 949 | st %g0,[$rp+32+4] | ||
| 950 | st %g0,[$rp+32+8] | ||
| 951 | st %g0,[$rp+32+12] | ||
| 952 | st %g0,[$rp+32+16] | ||
| 953 | st %g0,[$rp+32+20] | ||
| 954 | st %g0,[$rp+32+24] | ||
| 955 | st %g0,[$rp+32+28] | ||
| 956 | st %g0,[$rp+64] | ||
| 957 | st %g0,[$rp+64+4] | ||
| 958 | st %g0,[$rp+64+8] | ||
| 959 | st %g0,[$rp+64+12] | ||
| 960 | st %g0,[$rp+64+16] | ||
| 961 | st %g0,[$rp+64+20] | ||
| 962 | st %g0,[$rp+64+24] | ||
| 963 | st %g0,[$rp+64+28] | ||
| 964 | b .Ladd_done | ||
| 965 | nop | ||
| 966 | |||
| 967 | .align 16 | ||
| 968 | .Ladd_double: | ||
| 969 | ldx [%fp+STACK_BIAS-8],$rp_real | ||
| 970 | mov $ap_real,$ap | ||
| 971 | b .Lpoint_double_shortcut | ||
| 972 | add %sp,32*(12-4)+32,%sp ! difference in frame sizes | ||
| 973 | |||
| 974 | .align 16 | ||
| 975 | .Ladd_proceed: | ||
| 976 | add %sp,LOCALS+$R,$bp | ||
| 977 | add %sp,LOCALS+$R,$ap | ||
| 978 | call __ecp_nistz256_mul_mont ! p256_sqr_mont(Rsqr, R); | ||
| 979 | add %sp,LOCALS+$Rsqr,$rp | ||
| 980 | |||
| 981 | add $ap_real,64,$bp | ||
| 982 | add %sp,LOCALS+$H,$ap | ||
| 983 | call __ecp_nistz256_mul_mont ! p256_mul_mont(res_z, H, in1_z); | ||
| 984 | add %sp,LOCALS+$res_z,$rp | ||
| 985 | |||
| 986 | add %sp,LOCALS+$H,$bp | ||
| 987 | add %sp,LOCALS+$H,$ap | ||
| 988 | call __ecp_nistz256_mul_mont ! p256_sqr_mont(Hsqr, H); | ||
| 989 | add %sp,LOCALS+$Hsqr,$rp | ||
| 990 | |||
| 991 | add $bp_real,64,$bp | ||
| 992 | add %sp,LOCALS+$res_z,$ap | ||
| 993 | call __ecp_nistz256_mul_mont ! p256_mul_mont(res_z, res_z, in2_z); | ||
| 994 | add %sp,LOCALS+$res_z,$rp | ||
| 995 | |||
| 996 | add %sp,LOCALS+$H,$bp | ||
| 997 | add %sp,LOCALS+$Hsqr,$ap | ||
| 998 | call __ecp_nistz256_mul_mont ! p256_mul_mont(Hcub, Hsqr, H); | ||
| 999 | add %sp,LOCALS+$Hcub,$rp | ||
| 1000 | |||
| 1001 | add %sp,LOCALS+$U1,$bp | ||
| 1002 | add %sp,LOCALS+$Hsqr,$ap | ||
| 1003 | call __ecp_nistz256_mul_mont ! p256_mul_mont(U2, U1, Hsqr); | ||
| 1004 | add %sp,LOCALS+$U2,$rp | ||
| 1005 | |||
| 1006 | call __ecp_nistz256_mul_by_2 ! p256_mul_by_2(Hsqr, U2); | ||
| 1007 | add %sp,LOCALS+$Hsqr,$rp | ||
| 1008 | |||
| 1009 | add %sp,LOCALS+$Rsqr,$bp | ||
| 1010 | call __ecp_nistz256_sub_morf ! p256_sub(res_x, Rsqr, Hsqr); | ||
| 1011 | add %sp,LOCALS+$res_x,$rp | ||
| 1012 | |||
| 1013 | add %sp,LOCALS+$Hcub,$bp | ||
| 1014 | call __ecp_nistz256_sub_from ! p256_sub(res_x, res_x, Hcub); | ||
| 1015 | add %sp,LOCALS+$res_x,$rp | ||
| 1016 | |||
| 1017 | add %sp,LOCALS+$U2,$bp | ||
| 1018 | call __ecp_nistz256_sub_morf ! p256_sub(res_y, U2, res_x); | ||
| 1019 | add %sp,LOCALS+$res_y,$rp | ||
| 1020 | |||
| 1021 | add %sp,LOCALS+$Hcub,$bp | ||
| 1022 | add %sp,LOCALS+$S1,$ap | ||
| 1023 | call __ecp_nistz256_mul_mont ! p256_mul_mont(S2, S1, Hcub); | ||
| 1024 | add %sp,LOCALS+$S2,$rp | ||
| 1025 | |||
| 1026 | add %sp,LOCALS+$R,$bp | ||
| 1027 | add %sp,LOCALS+$res_y,$ap | ||
| 1028 | call __ecp_nistz256_mul_mont ! p256_mul_mont(res_y, res_y, R); | ||
| 1029 | add %sp,LOCALS+$res_y,$rp | ||
| 1030 | |||
| 1031 | add %sp,LOCALS+$S2,$bp | ||
| 1032 | call __ecp_nistz256_sub_from ! p256_sub(res_y, res_y, S2); | ||
| 1033 | add %sp,LOCALS+$res_y,$rp | ||
| 1034 | |||
| 1035 | ld [%fp+STACK_BIAS-16],$t1 ! !in1infty | ||
| 1036 | ld [%fp+STACK_BIAS-12],$t2 ! !in2infty | ||
| 1037 | ldx [%fp+STACK_BIAS-8],$rp | ||
| 1038 | ___ | ||
| 1039 | for($i=0;$i<96;$i+=8) { # conditional moves | ||
| 1040 | $code.=<<___; | ||
| 1041 | ld [%sp+LOCALS+$i],@acc[0] ! res | ||
| 1042 | ld [%sp+LOCALS+$i+4],@acc[1] | ||
| 1043 | ld [$bp_real+$i],@acc[2] ! in2 | ||
| 1044 | ld [$bp_real+$i+4],@acc[3] | ||
| 1045 | ld [$ap_real+$i],@acc[4] ! in1 | ||
| 1046 | ld [$ap_real+$i+4],@acc[5] | ||
| 1047 | movrz $t1,@acc[2],@acc[0] | ||
| 1048 | movrz $t1,@acc[3],@acc[1] | ||
| 1049 | movrz $t2,@acc[4],@acc[0] | ||
| 1050 | movrz $t2,@acc[5],@acc[1] | ||
| 1051 | st @acc[0],[$rp+$i] | ||
| 1052 | st @acc[1],[$rp+$i+4] | ||
| 1053 | ___ | ||
| 1054 | } | ||
| 1055 | $code.=<<___; | ||
| 1056 | .Ladd_done: | ||
| 1057 | ret | ||
| 1058 | restore | ||
| 1059 | .type ecp_nistz256_point_add,#function | ||
| 1060 | .size ecp_nistz256_point_add,.-ecp_nistz256_point_add | ||
| 1061 | ___ | ||
| 1062 | } | ||
| 1063 | |||
| 1064 | ######################################################################## | ||
| 1065 | # void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1, | ||
| 1066 | # const P256_POINT_AFFINE *in2); | ||
| 1067 | { | ||
| 1068 | my ($res_x,$res_y,$res_z, | ||
| 1069 | $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..9)); | ||
| 1070 | my $Z1sqr = $S2; | ||
| 1071 | # above map() describes stack layout with 10 temporary | ||
| 1072 | # 256-bit vectors on top. Then we reserve some space for | ||
| 1073 | # !in1infty, !in2infty, result of check for zero and return pointer. | ||
| 1074 | |||
| 1075 | my @ONE_mont=(1,0,0,-1,-1,-1,-2,0); | ||
| 1076 | my $bp_real=$rp_real; | ||
| 1077 | |||
| 1078 | $code.=<<___; | ||
| 1079 | .globl ecp_nistz256_point_add_affine | ||
| 1080 | .align 32 | ||
| 1081 | ecp_nistz256_point_add_affine: | ||
| 1082 | #if 0 | ||
| 1083 | SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5) | ||
| 1084 | ld [%g1],%g1 ! OPENSSL_sparcv9cap_P[0] | ||
| 1085 | and %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK),%g1 | ||
| 1086 | cmp %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK) | ||
| 1087 | be ecp_nistz256_point_add_affine_vis3 | ||
| 1088 | nop | ||
| 1089 | #endif | ||
| 1090 | |||
| 1091 | save %sp,-STACK_FRAME-32*10-32,%sp | ||
| 1092 | |||
| 1093 | stx $rp,[%fp+STACK_BIAS-8] ! off-load $rp | ||
| 1094 | mov $ap,$ap_real | ||
| 1095 | mov $bp,$bp_real | ||
| 1096 | |||
| 1097 | ld [$ap+64],$t0 ! in1_z | ||
| 1098 | ld [$ap+64+4],$t1 | ||
| 1099 | ld [$ap+64+8],$t2 | ||
| 1100 | ld [$ap+64+12],$t3 | ||
| 1101 | ld [$ap+64+16],$t4 | ||
| 1102 | ld [$ap+64+20],$t5 | ||
| 1103 | ld [$ap+64+24],$t6 | ||
| 1104 | ld [$ap+64+28],$t7 | ||
| 1105 | or $t1,$t0,$t0 | ||
| 1106 | or $t3,$t2,$t2 | ||
| 1107 | or $t5,$t4,$t4 | ||
| 1108 | or $t7,$t6,$t6 | ||
| 1109 | or $t2,$t0,$t0 | ||
| 1110 | or $t6,$t4,$t4 | ||
| 1111 | or $t4,$t0,$t0 ! !in1infty | ||
| 1112 | movrnz $t0,-1,$t0 | ||
| 1113 | st $t0,[%fp+STACK_BIAS-16] | ||
| 1114 | |||
| 1115 | ld [$bp],@acc[0] ! in2_x | ||
| 1116 | ld [$bp+4],@acc[1] | ||
| 1117 | ld [$bp+8],@acc[2] | ||
| 1118 | ld [$bp+12],@acc[3] | ||
| 1119 | ld [$bp+16],@acc[4] | ||
| 1120 | ld [$bp+20],@acc[5] | ||
| 1121 | ld [$bp+24],@acc[6] | ||
| 1122 | ld [$bp+28],@acc[7] | ||
| 1123 | ld [$bp+32],$t0 ! in2_y | ||
| 1124 | ld [$bp+32+4],$t1 | ||
| 1125 | ld [$bp+32+8],$t2 | ||
| 1126 | ld [$bp+32+12],$t3 | ||
| 1127 | ld [$bp+32+16],$t4 | ||
| 1128 | ld [$bp+32+20],$t5 | ||
| 1129 | ld [$bp+32+24],$t6 | ||
| 1130 | ld [$bp+32+28],$t7 | ||
| 1131 | or @acc[1],@acc[0],@acc[0] | ||
| 1132 | or @acc[3],@acc[2],@acc[2] | ||
| 1133 | or @acc[5],@acc[4],@acc[4] | ||
| 1134 | or @acc[7],@acc[6],@acc[6] | ||
| 1135 | or @acc[2],@acc[0],@acc[0] | ||
| 1136 | or @acc[6],@acc[4],@acc[4] | ||
| 1137 | or @acc[4],@acc[0],@acc[0] | ||
| 1138 | or $t1,$t0,$t0 | ||
| 1139 | or $t3,$t2,$t2 | ||
| 1140 | or $t5,$t4,$t4 | ||
| 1141 | or $t7,$t6,$t6 | ||
| 1142 | or $t2,$t0,$t0 | ||
| 1143 | or $t6,$t4,$t4 | ||
| 1144 | or $t4,$t0,$t0 | ||
| 1145 | or @acc[0],$t0,$t0 ! !in2infty | ||
| 1146 | movrnz $t0,-1,$t0 | ||
| 1147 | st $t0,[%fp+STACK_BIAS-12] | ||
| 1148 | |||
| 1149 | add $ap_real,64,$bp | ||
| 1150 | add $ap_real,64,$ap | ||
| 1151 | call __ecp_nistz256_mul_mont ! p256_sqr_mont(Z1sqr, in1_z); | ||
| 1152 | add %sp,LOCALS+$Z1sqr,$rp | ||
| 1153 | |||
| 1154 | add $bp_real,0,$bp | ||
| 1155 | add %sp,LOCALS+$Z1sqr,$ap | ||
| 1156 | call __ecp_nistz256_mul_mont ! p256_mul_mont(U2, Z1sqr, in2_x); | ||
| 1157 | add %sp,LOCALS+$U2,$rp | ||
| 1158 | |||
| 1159 | add $ap_real,0,$bp | ||
| 1160 | call __ecp_nistz256_sub_from ! p256_sub(H, U2, in1_x); | ||
| 1161 | add %sp,LOCALS+$H,$rp | ||
| 1162 | |||
| 1163 | add $ap_real,64,$bp | ||
| 1164 | add %sp,LOCALS+$Z1sqr,$ap | ||
| 1165 | call __ecp_nistz256_mul_mont ! p256_mul_mont(S2, Z1sqr, in1_z); | ||
| 1166 | add %sp,LOCALS+$S2,$rp | ||
| 1167 | |||
| 1168 | add $ap_real,64,$bp | ||
| 1169 | add %sp,LOCALS+$H,$ap | ||
| 1170 | call __ecp_nistz256_mul_mont ! p256_mul_mont(res_z, H, in1_z); | ||
| 1171 | add %sp,LOCALS+$res_z,$rp | ||
| 1172 | |||
| 1173 | add $bp_real,32,$bp | ||
| 1174 | add %sp,LOCALS+$S2,$ap | ||
| 1175 | call __ecp_nistz256_mul_mont ! p256_mul_mont(S2, S2, in2_y); | ||
| 1176 | add %sp,LOCALS+$S2,$rp | ||
| 1177 | |||
| 1178 | add $ap_real,32,$bp | ||
| 1179 | call __ecp_nistz256_sub_from ! p256_sub(R, S2, in1_y); | ||
| 1180 | add %sp,LOCALS+$R,$rp | ||
| 1181 | |||
| 1182 | add %sp,LOCALS+$H,$bp | ||
| 1183 | add %sp,LOCALS+$H,$ap | ||
| 1184 | call __ecp_nistz256_mul_mont ! p256_sqr_mont(Hsqr, H); | ||
| 1185 | add %sp,LOCALS+$Hsqr,$rp | ||
| 1186 | |||
| 1187 | add %sp,LOCALS+$R,$bp | ||
| 1188 | add %sp,LOCALS+$R,$ap | ||
| 1189 | call __ecp_nistz256_mul_mont ! p256_sqr_mont(Rsqr, R); | ||
| 1190 | add %sp,LOCALS+$Rsqr,$rp | ||
| 1191 | |||
| 1192 | add %sp,LOCALS+$H,$bp | ||
| 1193 | add %sp,LOCALS+$Hsqr,$ap | ||
| 1194 | call __ecp_nistz256_mul_mont ! p256_mul_mont(Hcub, Hsqr, H); | ||
| 1195 | add %sp,LOCALS+$Hcub,$rp | ||
| 1196 | |||
| 1197 | add $ap_real,0,$bp | ||
| 1198 | add %sp,LOCALS+$Hsqr,$ap | ||
| 1199 | call __ecp_nistz256_mul_mont ! p256_mul_mont(U2, in1_x, Hsqr); | ||
| 1200 | add %sp,LOCALS+$U2,$rp | ||
| 1201 | |||
| 1202 | call __ecp_nistz256_mul_by_2 ! p256_mul_by_2(Hsqr, U2); | ||
| 1203 | add %sp,LOCALS+$Hsqr,$rp | ||
| 1204 | |||
| 1205 | add %sp,LOCALS+$Rsqr,$bp | ||
| 1206 | call __ecp_nistz256_sub_morf ! p256_sub(res_x, Rsqr, Hsqr); | ||
| 1207 | add %sp,LOCALS+$res_x,$rp | ||
| 1208 | |||
| 1209 | add %sp,LOCALS+$Hcub,$bp | ||
| 1210 | call __ecp_nistz256_sub_from ! p256_sub(res_x, res_x, Hcub); | ||
| 1211 | add %sp,LOCALS+$res_x,$rp | ||
| 1212 | |||
| 1213 | add %sp,LOCALS+$U2,$bp | ||
| 1214 | call __ecp_nistz256_sub_morf ! p256_sub(res_y, U2, res_x); | ||
| 1215 | add %sp,LOCALS+$res_y,$rp | ||
| 1216 | |||
| 1217 | add $ap_real,32,$bp | ||
| 1218 | add %sp,LOCALS+$Hcub,$ap | ||
| 1219 | call __ecp_nistz256_mul_mont ! p256_mul_mont(S2, in1_y, Hcub); | ||
| 1220 | add %sp,LOCALS+$S2,$rp | ||
| 1221 | |||
| 1222 | add %sp,LOCALS+$R,$bp | ||
| 1223 | add %sp,LOCALS+$res_y,$ap | ||
| 1224 | call __ecp_nistz256_mul_mont ! p256_mul_mont(res_y, res_y, R); | ||
| 1225 | add %sp,LOCALS+$res_y,$rp | ||
| 1226 | |||
| 1227 | add %sp,LOCALS+$S2,$bp | ||
| 1228 | call __ecp_nistz256_sub_from ! p256_sub(res_y, res_y, S2); | ||
| 1229 | add %sp,LOCALS+$res_y,$rp | ||
| 1230 | |||
| 1231 | ld [%fp+STACK_BIAS-16],$t1 ! !in1infty | ||
| 1232 | ld [%fp+STACK_BIAS-12],$t2 ! !in2infty | ||
| 1233 | ldx [%fp+STACK_BIAS-8],$rp | ||
| 1234 | ___ | ||
| 1235 | for($i=0;$i<64;$i+=8) { # conditional moves | ||
| 1236 | $code.=<<___; | ||
| 1237 | ld [%sp+LOCALS+$i],@acc[0] ! res | ||
| 1238 | ld [%sp+LOCALS+$i+4],@acc[1] | ||
| 1239 | ld [$bp_real+$i],@acc[2] ! in2 | ||
| 1240 | ld [$bp_real+$i+4],@acc[3] | ||
| 1241 | ld [$ap_real+$i],@acc[4] ! in1 | ||
| 1242 | ld [$ap_real+$i+4],@acc[5] | ||
| 1243 | movrz $t1,@acc[2],@acc[0] | ||
| 1244 | movrz $t1,@acc[3],@acc[1] | ||
| 1245 | movrz $t2,@acc[4],@acc[0] | ||
| 1246 | movrz $t2,@acc[5],@acc[1] | ||
| 1247 | st @acc[0],[$rp+$i] | ||
| 1248 | st @acc[1],[$rp+$i+4] | ||
| 1249 | ___ | ||
| 1250 | } | ||
| 1251 | for(;$i<96;$i+=8) { | ||
| 1252 | my $j=($i-64)/4; | ||
| 1253 | $code.=<<___; | ||
| 1254 | ld [%sp+LOCALS+$i],@acc[0] ! res | ||
| 1255 | ld [%sp+LOCALS+$i+4],@acc[1] | ||
| 1256 | ld [$ap_real+$i],@acc[4] ! in1 | ||
| 1257 | ld [$ap_real+$i+4],@acc[5] | ||
| 1258 | movrz $t1,@ONE_mont[$j],@acc[0] | ||
| 1259 | movrz $t1,@ONE_mont[$j+1],@acc[1] | ||
| 1260 | movrz $t2,@acc[4],@acc[0] | ||
| 1261 | movrz $t2,@acc[5],@acc[1] | ||
| 1262 | st @acc[0],[$rp+$i] | ||
| 1263 | st @acc[1],[$rp+$i+4] | ||
| 1264 | ___ | ||
| 1265 | } | ||
| 1266 | $code.=<<___; | ||
| 1267 | ret | ||
| 1268 | restore | ||
| 1269 | .type ecp_nistz256_point_add_affine,#function | ||
| 1270 | .size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine | ||
| 1271 | ___ | ||
| 1272 | } }}} | ||
| 1273 | {{{ | ||
| 1274 | my ($out,$inp,$index)=map("%i$_",(0..2)); | ||
| 1275 | my $mask="%o0"; | ||
| 1276 | |||
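| # Editor's sketch (not part of the original file): the "neg; srax 63" | |||
| # pair below builds an all-ones mask exactly when the index is | |||
| # non-zero; index 0 encodes the point at infinity, so entry 0 is still | |||
| # read but masked to zero. The same trick with signed 64-bit Perl | |||
| # integers (assumes a 64-bit perl; the table is a toy stand-in): | |||
| { | |||
| use integer; # arithmetic (sign-extending) shifts | |||
| my @tbl = (0x11111111, 0x22222222, 0x33333333); # 1-based toy table | |||
| my $idx = 2; # 0 would select the point at infinity | |||
| my $m = (-$idx) >> 63; # 0 -> 0, non-zero -> -1 | |||
| $idx += $m; # branch-free index-1, pinned at 0 | |||
| my $word = $tbl[$idx] & $m; # 0x22222222 here; 0 for $idx == 0 | |||
| } | |||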
| 1277 | $code.=<<___; | ||
| 1278 | ! void ecp_nistz256_select_w5(P256_POINT *%i0,const void *%i1, | ||
| 1279 | ! int %i2); | ||
| 1280 | .globl ecp_nistz256_select_w5 | ||
| 1281 | .align 32 | ||
| 1282 | ecp_nistz256_select_w5: | ||
| 1283 | save %sp,-STACK_FRAME,%sp | ||
| 1284 | |||
| 1285 | neg $index,$mask | ||
| 1286 | srax $mask,63,$mask | ||
| 1287 | |||
| 1288 | add $index,$mask,$index | ||
| 1289 | sll $index,2,$index | ||
| 1290 | add $inp,$index,$inp | ||
| 1291 | |||
| 1292 | ld [$inp+64*0],%l0 | ||
| 1293 | ld [$inp+64*1],%l1 | ||
| 1294 | ld [$inp+64*2],%l2 | ||
| 1295 | ld [$inp+64*3],%l3 | ||
| 1296 | ld [$inp+64*4],%l4 | ||
| 1297 | ld [$inp+64*5],%l5 | ||
| 1298 | ld [$inp+64*6],%l6 | ||
| 1299 | ld [$inp+64*7],%l7 | ||
| 1300 | add $inp,64*8,$inp | ||
| 1301 | and %l0,$mask,%l0 | ||
| 1302 | and %l1,$mask,%l1 | ||
| 1303 | st %l0,[$out] ! X | ||
| 1304 | and %l2,$mask,%l2 | ||
| 1305 | st %l1,[$out+4] | ||
| 1306 | and %l3,$mask,%l3 | ||
| 1307 | st %l2,[$out+8] | ||
| 1308 | and %l4,$mask,%l4 | ||
| 1309 | st %l3,[$out+12] | ||
| 1310 | and %l5,$mask,%l5 | ||
| 1311 | st %l4,[$out+16] | ||
| 1312 | and %l6,$mask,%l6 | ||
| 1313 | st %l5,[$out+20] | ||
| 1314 | and %l7,$mask,%l7 | ||
| 1315 | st %l6,[$out+24] | ||
| 1316 | st %l7,[$out+28] | ||
| 1317 | add $out,32,$out | ||
| 1318 | |||
| 1319 | ld [$inp+64*0],%l0 | ||
| 1320 | ld [$inp+64*1],%l1 | ||
| 1321 | ld [$inp+64*2],%l2 | ||
| 1322 | ld [$inp+64*3],%l3 | ||
| 1323 | ld [$inp+64*4],%l4 | ||
| 1324 | ld [$inp+64*5],%l5 | ||
| 1325 | ld [$inp+64*6],%l6 | ||
| 1326 | ld [$inp+64*7],%l7 | ||
| 1327 | add $inp,64*8,$inp | ||
| 1328 | and %l0,$mask,%l0 | ||
| 1329 | and %l1,$mask,%l1 | ||
| 1330 | st %l0,[$out] ! Y | ||
| 1331 | and %l2,$mask,%l2 | ||
| 1332 | st %l1,[$out+4] | ||
| 1333 | and %l3,$mask,%l3 | ||
| 1334 | st %l2,[$out+8] | ||
| 1335 | and %l4,$mask,%l4 | ||
| 1336 | st %l3,[$out+12] | ||
| 1337 | and %l5,$mask,%l5 | ||
| 1338 | st %l4,[$out+16] | ||
| 1339 | and %l6,$mask,%l6 | ||
| 1340 | st %l5,[$out+20] | ||
| 1341 | and %l7,$mask,%l7 | ||
| 1342 | st %l6,[$out+24] | ||
| 1343 | st %l7,[$out+28] | ||
| 1344 | add $out,32,$out | ||
| 1345 | |||
| 1346 | ld [$inp+64*0],%l0 | ||
| 1347 | ld [$inp+64*1],%l1 | ||
| 1348 | ld [$inp+64*2],%l2 | ||
| 1349 | ld [$inp+64*3],%l3 | ||
| 1350 | ld [$inp+64*4],%l4 | ||
| 1351 | ld [$inp+64*5],%l5 | ||
| 1352 | ld [$inp+64*6],%l6 | ||
| 1353 | ld [$inp+64*7],%l7 | ||
| 1354 | and %l0,$mask,%l0 | ||
| 1355 | and %l1,$mask,%l1 | ||
| 1356 | st %l0,[$out] ! Z | ||
| 1357 | and %l2,$mask,%l2 | ||
| 1358 | st %l1,[$out+4] | ||
| 1359 | and %l3,$mask,%l3 | ||
| 1360 | st %l2,[$out+8] | ||
| 1361 | and %l4,$mask,%l4 | ||
| 1362 | st %l3,[$out+12] | ||
| 1363 | and %l5,$mask,%l5 | ||
| 1364 | st %l4,[$out+16] | ||
| 1365 | and %l6,$mask,%l6 | ||
| 1366 | st %l5,[$out+20] | ||
| 1367 | and %l7,$mask,%l7 | ||
| 1368 | st %l6,[$out+24] | ||
| 1369 | st %l7,[$out+28] | ||
| 1370 | |||
| 1371 | ret | ||
| 1372 | restore | ||
| 1373 | .type ecp_nistz256_select_w5,#function | ||
| 1374 | .size ecp_nistz256_select_w5,.-ecp_nistz256_select_w5 | ||
| 1375 | |||
| 1376 | ! void ecp_nistz256_select_w7(P256_POINT_AFFINE *%i0,const void *%i1, | ||
| 1377 | ! int %i2); | ||
| 1378 | .globl ecp_nistz256_select_w7 | ||
| 1379 | .align 32 | ||
| 1380 | ecp_nistz256_select_w7: | ||
| 1381 | save %sp,-STACK_FRAME,%sp | ||
| 1382 | |||
| 1383 | neg $index,$mask | ||
| 1384 | srax $mask,63,$mask | ||
| 1385 | |||
| 1386 | add $index,$mask,$index | ||
| 1387 | add $inp,$index,$inp | ||
| 1388 | mov 64/4,$index | ||
| 1389 | |||
| 1390 | .Loop_select_w7: | ||
| 1391 | ldub [$inp+64*0],%l0 | ||
| 1392 | prefetch [$inp+3840+64*0],1 | ||
| 1393 | subcc $index,1,$index | ||
| 1394 | ldub [$inp+64*1],%l1 | ||
| 1395 | prefetch [$inp+3840+64*1],1 | ||
| 1396 | ldub [$inp+64*2],%l2 | ||
| 1397 | prefetch [$inp+3840+64*2],1 | ||
| 1398 | ldub [$inp+64*3],%l3 | ||
| 1399 | prefetch [$inp+3840+64*3],1 | ||
| 1400 | add $inp,64*4,$inp | ||
| 1401 | sll %l1,8,%l1 | ||
| 1402 | sll %l2,16,%l2 | ||
| 1403 | or %l0,%l1,%l0 | ||
| 1404 | sll %l3,24,%l3 | ||
| 1405 | or %l0,%l2,%l0 | ||
| 1406 | or %l0,%l3,%l0 | ||
| 1407 | and %l0,$mask,%l0 | ||
| 1408 | st %l0,[$out] | ||
| 1409 | bne .Loop_select_w7 | ||
| 1410 | add $out,4,$out | ||
| 1411 | |||
| 1412 | ret | ||
| 1413 | restore | ||
| 1414 | .type ecp_nistz256_select_w7,#function | ||
| 1415 | .size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7 | ||
| 1416 | ___ | ||
| 1417 | }}} | ||
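Both selectors above hinge on the same branch-free mask: (-index) shifted right arithmetically by 63 is all-ones exactly when index > 0, so index 0 still reads slot 0 of the table but masks the loaded data to zero. A scalar model of it (a standalone sketch with a hypothetical name; assumes a 64-bit perl):

use integer;                              # makes >> an arithmetic shift
sub select_ref {
    my ($index, @table) = @_;
    my $mask = (-$index) >> 63;           # -1 iff index > 0, else 0
    return $table[$index + $mask] & $mask;  # index==0 reads slot 0, then zeroes it
}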
| 1418 | {{{ | ||
| 1419 | ######################################################################## | ||
| 1420 | # The following subroutines are VIS3 counterparts of those above, | ||
| 1421 | # implementing the ones found in ecp_nistz256.c. The key difference | ||
| 1422 | # is that they use 128-bit multiplication and addition with 64-bit | ||
| 1423 | # carry, and in order to do that they convert from uint32_t[8] to | ||
| 1424 | # uint64_t[4] upon entry and vice versa on return. | ||
| 1425 | # | ||
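That entry/exit conversion is a pure limb repack; a minimal standalone sketch of it in plain Perl (hypothetical helper names, not part of the module; assumes a 64-bit perl):

# eight 32-bit words, least significant first -> four 64-bit limbs
sub words32_to_limbs64 {
    my @w = @_;
    return map { $w[2*$_] | ($w[2*$_+1] << 32) } (0..3);
}

# ...and the inverse repack performed on return
sub limbs64_to_words32 {
    return map { ($_ & 0xffffffff, $_ >> 32) } @_;
}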
| 1426 | my ($rp,$ap,$bp)=map("%i$_",(0..2)); | ||
| 1427 | my ($t0,$t1,$t2,$t3,$a0,$a1,$a2,$a3)=map("%l$_",(0..7)); | ||
| 1428 | my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5)=map("%o$_",(0..5)); | ||
| 1429 | my ($bi,$poly1,$poly3,$minus1)=(map("%i$_",(3..5)),"%g1"); | ||
| 1430 | my ($rp_real,$ap_real)=("%g2","%g3"); | ||
| 1431 | my ($acc6,$acc7)=($bp,$bi); # used in squaring | ||
| 1432 | |||
| 1433 | $code.=<<___; | ||
| 1434 | #if 0 | ||
| 1435 | .align 32 | ||
| 1436 | __ecp_nistz256_mul_by_2_vis3: | ||
| 1437 | addcc $acc0,$acc0,$acc0 | ||
| 1438 | addxccc $acc1,$acc1,$acc1 | ||
| 1439 | addxccc $acc2,$acc2,$acc2 | ||
| 1440 | addxccc $acc3,$acc3,$acc3 | ||
| 1441 | b .Lreduce_by_sub_vis3 | ||
| 1442 | addxc %g0,%g0,$acc4 ! did it carry? | ||
| 1443 | .type __ecp_nistz256_mul_by_2_vis3,#function | ||
| 1444 | .size __ecp_nistz256_mul_by_2_vis3,.-__ecp_nistz256_mul_by_2_vis3 | ||
| 1445 | |||
| 1446 | .align 32 | ||
| 1447 | __ecp_nistz256_add_vis3: | ||
| 1448 | ldx [$bp+0],$t0 | ||
| 1449 | ldx [$bp+8],$t1 | ||
| 1450 | ldx [$bp+16],$t2 | ||
| 1451 | ldx [$bp+24],$t3 | ||
| 1452 | |||
| 1453 | __ecp_nistz256_add_noload_vis3: | ||
| 1454 | |||
| 1455 | addcc $t0,$acc0,$acc0 | ||
| 1456 | addxccc $t1,$acc1,$acc1 | ||
| 1457 | addxccc $t2,$acc2,$acc2 | ||
| 1458 | addxccc $t3,$acc3,$acc3 | ||
| 1459 | addxc %g0,%g0,$acc4 ! did it carry? | ||
| 1460 | |||
| 1461 | .Lreduce_by_sub_vis3: | ||
| 1462 | |||
| 1463 | addcc $acc0,1,$t0 ! add -modulus, i.e. subtract | ||
| 1464 | addxccc $acc1,$poly1,$t1 | ||
| 1465 | addxccc $acc2,$minus1,$t2 | ||
| 1466 | addxccc $acc3,$poly3,$t3 | ||
| 1467 | addxc $acc4,$minus1,$acc4 | ||
| 1468 | |||
| 1469 | movrz $acc4,$t0,$acc0 ! ret = borrow ? ret : ret-modulus | ||
| 1470 | movrz $acc4,$t1,$acc1 | ||
| 1471 | stx $acc0,[$rp] | ||
| 1472 | movrz $acc4,$t2,$acc2 | ||
| 1473 | stx $acc1,[$rp+8] | ||
| 1474 | movrz $acc4,$t3,$acc3 | ||
| 1475 | stx $acc2,[$rp+16] | ||
| 1476 | retl | ||
| 1477 | stx $acc3,[$rp+24] | ||
| 1478 | .type __ecp_nistz256_add_vis3,#function | ||
| 1479 | .size __ecp_nistz256_add_vis3,.-__ecp_nistz256_add_vis3 | ||
| 1480 | |||
| 1481 | ! The trouble with subtraction is that there is no subtraction with | ||
| 1482 | ! 64-bit borrow, only with 32-bit one. For this reason we "decompose" | ||
| 1483 | ! the 64-bit $acc0-$acc3 into 32-bit values and pick b[4] up in 32-bit | ||
| 1484 | ! pieces. But recall that SPARC is big-endian, which is why you'll | ||
| 1485 | ! observe that b[4] is accessed as 4-0-12-8-20-16-28-24. And prior to | ||
| 1486 | ! reduction we "collect" the result back into the 64-bit $acc0-$acc3. | ||
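The 4-0-12-8-... pattern is easy to see with a standalone two-liner (a sketch; assumes perl >= 5.10 with 64-bit support): on a big-endian machine the less significant half of each 64-bit limb sits at the higher byte address.

my $limb = pack("Q>", 0x1111111122222222);   # one limb, stored big-endian
my ($hi, $lo) = unpack("N2", $limb);         # its 32-bit big-endian halves
printf "offset 0: %08x, offset 4: %08x\n", $hi, $lo;
# -> offset 0: 11111111, offset 4: 22222222, hence "ld [$bp+4]" comes first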
| 1487 | .align 32 | ||
| 1488 | __ecp_nistz256_sub_from_vis3: | ||
| 1489 | ld [$bp+4],$t0 | ||
| 1490 | ld [$bp+0],$t1 | ||
| 1491 | ld [$bp+12],$t2 | ||
| 1492 | ld [$bp+8],$t3 | ||
| 1493 | |||
| 1494 | srlx $acc0,32,$acc4 | ||
| 1495 | not $poly1,$poly1 | ||
| 1496 | srlx $acc1,32,$acc5 | ||
| 1497 | subcc $acc0,$t0,$acc0 | ||
| 1498 | ld [$bp+20],$t0 | ||
| 1499 | subccc $acc4,$t1,$acc4 | ||
| 1500 | ld [$bp+16],$t1 | ||
| 1501 | subccc $acc1,$t2,$acc1 | ||
| 1502 | ld [$bp+28],$t2 | ||
| 1503 | and $acc0,$poly1,$acc0 | ||
| 1504 | subccc $acc5,$t3,$acc5 | ||
| 1505 | ld [$bp+24],$t3 | ||
| 1506 | sllx $acc4,32,$acc4 | ||
| 1507 | and $acc1,$poly1,$acc1 | ||
| 1508 | sllx $acc5,32,$acc5 | ||
| 1509 | or $acc0,$acc4,$acc0 | ||
| 1510 | srlx $acc2,32,$acc4 | ||
| 1511 | or $acc1,$acc5,$acc1 | ||
| 1512 | srlx $acc3,32,$acc5 | ||
| 1513 | subccc $acc2,$t0,$acc2 | ||
| 1514 | subccc $acc4,$t1,$acc4 | ||
| 1515 | subccc $acc3,$t2,$acc3 | ||
| 1516 | and $acc2,$poly1,$acc2 | ||
| 1517 | subccc $acc5,$t3,$acc5 | ||
| 1518 | sllx $acc4,32,$acc4 | ||
| 1519 | and $acc3,$poly1,$acc3 | ||
| 1520 | sllx $acc5,32,$acc5 | ||
| 1521 | or $acc2,$acc4,$acc2 | ||
| 1522 | subc %g0,%g0,$acc4 ! did it borrow? | ||
| 1523 | b .Lreduce_by_add_vis3 | ||
| 1524 | or $acc3,$acc5,$acc3 | ||
| 1525 | .type __ecp_nistz256_sub_from_vis3,#function | ||
| 1526 | .size __ecp_nistz256_sub_from_vis3,.-__ecp_nistz256_sub_from_vis3 | ||
| 1527 | |||
| 1528 | .align 32 | ||
| 1529 | __ecp_nistz256_sub_morf_vis3: | ||
| 1530 | ld [$bp+4],$t0 | ||
| 1531 | ld [$bp+0],$t1 | ||
| 1532 | ld [$bp+12],$t2 | ||
| 1533 | ld [$bp+8],$t3 | ||
| 1534 | |||
| 1535 | srlx $acc0,32,$acc4 | ||
| 1536 | not $poly1,$poly1 | ||
| 1537 | srlx $acc1,32,$acc5 | ||
| 1538 | subcc $t0,$acc0,$acc0 | ||
| 1539 | ld [$bp+20],$t0 | ||
| 1540 | subccc $t1,$acc4,$acc4 | ||
| 1541 | ld [$bp+16],$t1 | ||
| 1542 | subccc $t2,$acc1,$acc1 | ||
| 1543 | ld [$bp+28],$t2 | ||
| 1544 | and $acc0,$poly1,$acc0 | ||
| 1545 | subccc $t3,$acc5,$acc5 | ||
| 1546 | ld [$bp+24],$t3 | ||
| 1547 | sllx $acc4,32,$acc4 | ||
| 1548 | and $acc1,$poly1,$acc1 | ||
| 1549 | sllx $acc5,32,$acc5 | ||
| 1550 | or $acc0,$acc4,$acc0 | ||
| 1551 | srlx $acc2,32,$acc4 | ||
| 1552 | or $acc1,$acc5,$acc1 | ||
| 1553 | srlx $acc3,32,$acc5 | ||
| 1554 | subccc $t0,$acc2,$acc2 | ||
| 1555 | subccc $t1,$acc4,$acc4 | ||
| 1556 | subccc $t2,$acc3,$acc3 | ||
| 1557 | and $acc2,$poly1,$acc2 | ||
| 1558 | subccc $t3,$acc5,$acc5 | ||
| 1559 | sllx $acc4,32,$acc4 | ||
| 1560 | and $acc3,$poly1,$acc3 | ||
| 1561 | sllx $acc5,32,$acc5 | ||
| 1562 | or $acc2,$acc4,$acc2 | ||
| 1563 | subc %g0,%g0,$acc4 ! did it borrow? | ||
| 1564 | or $acc3,$acc5,$acc3 | ||
| 1565 | |||
| 1566 | .Lreduce_by_add_vis3: | ||
| 1567 | |||
| 1568 | addcc $acc0,-1,$t0 ! add modulus | ||
| 1569 | not $poly3,$t3 | ||
| 1570 | addxccc $acc1,$poly1,$t1 | ||
| 1571 | not $poly1,$poly1 ! restore $poly1 | ||
| 1572 | addxccc $acc2,%g0,$t2 | ||
| 1573 | addxc $acc3,$t3,$t3 | ||
| 1574 | |||
| 1575 | movrnz $acc4,$t0,$acc0 ! if a-b borrowed, ret = ret+mod | ||
| 1576 | movrnz $acc4,$t1,$acc1 | ||
| 1577 | stx $acc0,[$rp] | ||
| 1578 | movrnz $acc4,$t2,$acc2 | ||
| 1579 | stx $acc1,[$rp+8] | ||
| 1580 | movrnz $acc4,$t3,$acc3 | ||
| 1581 | stx $acc2,[$rp+16] | ||
| 1582 | retl | ||
| 1583 | stx $acc3,[$rp+24] | ||
| 1584 | .type __ecp_nistz256_sub_morf_vis3,#function | ||
| 1585 | .size __ecp_nistz256_sub_morf_vis3,.-__ecp_nistz256_sub_morf_vis3 | ||
| 1586 | |||
| 1587 | .align 32 | ||
| 1588 | __ecp_nistz256_div_by_2_vis3: | ||
| 1589 | ! ret = (a is odd ? a+mod : a) >> 1 | ||
| 1590 | |||
| 1591 | not $poly1,$t1 | ||
| 1592 | not $poly3,$t3 | ||
| 1593 | and $acc0,1,$acc5 | ||
| 1594 | addcc $acc0,-1,$t0 ! add modulus | ||
| 1595 | addxccc $acc1,$t1,$t1 | ||
| 1596 | addxccc $acc2,%g0,$t2 | ||
| 1597 | addxccc $acc3,$t3,$t3 | ||
| 1598 | addxc %g0,%g0,$acc4 ! carry bit | ||
| 1599 | |||
| 1600 | movrnz $acc5,$t0,$acc0 | ||
| 1601 | movrnz $acc5,$t1,$acc1 | ||
| 1602 | movrnz $acc5,$t2,$acc2 | ||
| 1603 | movrnz $acc5,$t3,$acc3 | ||
| 1604 | movrz $acc5,%g0,$acc4 | ||
| 1605 | |||
| 1606 | ! ret >>= 1 | ||
| 1607 | |||
| 1608 | srlx $acc0,1,$acc0 | ||
| 1609 | sllx $acc1,63,$t0 | ||
| 1610 | srlx $acc1,1,$acc1 | ||
| 1611 | or $acc0,$t0,$acc0 | ||
| 1612 | sllx $acc2,63,$t1 | ||
| 1613 | srlx $acc2,1,$acc2 | ||
| 1614 | or $acc1,$t1,$acc1 | ||
| 1615 | sllx $acc3,63,$t2 | ||
| 1616 | stx $acc0,[$rp] | ||
| 1617 | srlx $acc3,1,$acc3 | ||
| 1618 | or $acc2,$t2,$acc2 | ||
| 1619 | sllx $acc4,63,$t3 ! don't forget carry bit | ||
| 1620 | stx $acc1,[$rp+8] | ||
| 1621 | or $acc3,$t3,$acc3 | ||
| 1622 | stx $acc2,[$rp+16] | ||
| 1623 | retl | ||
| 1624 | stx $acc3,[$rp+24] | ||
| 1625 | .type __ecp_nistz256_div_by_2_vis3,#function | ||
| 1626 | .size __ecp_nistz256_div_by_2_vis3,.-__ecp_nistz256_div_by_2_vis3 | ||
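The oddness trick in __ecp_nistz256_div_by_2_vis3 above relies on the modulus being odd: for odd a, a+mod is even, and (a+mod)>>1 equals a times the inverse of 2 mod p. A one-off standalone check (a sketch; needs a Math::BigInt with bmodinv):

use Math::BigInt;
my $p = Math::BigInt->new(
    "0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff");
my $a = Math::BigInt->new("0x123456789");    # any odd a < p
my $half = $a->is_odd ? ($a + $p) >> 1 : $a >> 1;
print $half == $a * Math::BigInt->new(2)->bmodinv($p) % $p ? "ok\n" : "bad\n";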
| 1627 | |||
| 1628 | ! compared to __ecp_nistz256_mul_mont it's almost 4x smaller and | ||
| 1629 | ! 4x faster [on T4]... | ||
| 1630 | .align 32 | ||
| 1631 | __ecp_nistz256_mul_mont_vis3: | ||
| 1632 | mulx $a0,$bi,$acc0 | ||
| 1633 | not $poly3,$poly3 ! 0xFFFFFFFF00000001 | ||
| 1634 | umulxhi $a0,$bi,$t0 | ||
| 1635 | mulx $a1,$bi,$acc1 | ||
| 1636 | umulxhi $a1,$bi,$t1 | ||
| 1637 | mulx $a2,$bi,$acc2 | ||
| 1638 | umulxhi $a2,$bi,$t2 | ||
| 1639 | mulx $a3,$bi,$acc3 | ||
| 1640 | umulxhi $a3,$bi,$t3 | ||
| 1641 | ldx [$bp+8],$bi ! b[1] | ||
| 1642 | |||
| 1643 | addcc $acc1,$t0,$acc1 ! accumulate high parts of multiplication | ||
| 1644 | sllx $acc0,32,$t0 | ||
| 1645 | addxccc $acc2,$t1,$acc2 | ||
| 1646 | srlx $acc0,32,$t1 | ||
| 1647 | addxccc $acc3,$t2,$acc3 | ||
| 1648 | addxc %g0,$t3,$acc4 | ||
| 1649 | mov 0,$acc5 | ||
| 1650 | ___ | ||
| 1651 | for($i=1;$i<4;$i++) { | ||
| 1652 | # A reduction iteration is normally performed by accumulating the | ||
| 1653 | # result of multiplying the modulus by the "magic" digit [omitting | ||
| 1654 | # the least significant word, which is guaranteed to be 0]; but | ||
| 1655 | # thanks to the special form of the modulus, and to the "magic" | ||
| 1656 | # digit being equal to the least significant word, it can be | ||
| 1657 | # performed with additions and subtractions alone. Indeed: | ||
| 1658 | # | ||
| 1659 | # ffff0001.00000000.0000ffff.ffffffff | ||
| 1660 | # * abcdefgh | ||
| 1661 | # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh | ||
| 1662 | # | ||
| 1663 | # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we | ||
| 1664 | # rewrite above as: | ||
| 1665 | # | ||
| 1666 | # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh | ||
| 1667 | # + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000 | ||
| 1668 | # - 0000abcd.efgh0000.00000000.00000000.abcdefgh | ||
| 1669 | # | ||
| 1670 | # or marking redundant operations: | ||
| 1671 | # | ||
| 1672 | # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.-------- | ||
| 1673 | # + abcdefgh.abcdefgh.0000abcd.efgh0000.-------- | ||
| 1674 | # - 0000abcd.efgh0000.--------.--------.-------- | ||
| 1675 | # ^^^^^^^^ but this word is calculated with umulxhi, because | ||
| 1676 | # there is no subtract with 64-bit borrow:-( | ||
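The identity behind this add/subtract decomposition can be confirmed with one line of bignum arithmetic; a standalone sketch (not part of the module):

use Math::BigInt;
my $m = Math::BigInt->new(                   # the P-256 modulus
    "0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff");
my $d = Math::BigInt->new(0xabcdef01);       # an arbitrary "magic" digit
my $lhs = $m * $d;
my $rhs = ($d << 256) - ($d << 224) + ($d << 192) + ($d << 96) - $d;
print $lhs == $rhs ? "identity holds\n" : "mismatch\n";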
| 1677 | |||
| 1678 | $code.=<<___; | ||
| 1679 | sub $acc0,$t0,$t2 ! acc0*0xFFFFFFFF00000001, low part | ||
| 1680 | umulxhi $acc0,$poly3,$t3 ! acc0*0xFFFFFFFF00000001, high part | ||
| 1681 | addcc $acc1,$t0,$acc0 ! +=acc[0]<<96 and omit acc[0] | ||
| 1682 | mulx $a0,$bi,$t0 | ||
| 1683 | addxccc $acc2,$t1,$acc1 | ||
| 1684 | mulx $a1,$bi,$t1 | ||
| 1685 | addxccc $acc3,$t2,$acc2 ! +=acc[0]*0xFFFFFFFF00000001 | ||
| 1686 | mulx $a2,$bi,$t2 | ||
| 1687 | addxccc $acc4,$t3,$acc3 | ||
| 1688 | mulx $a3,$bi,$t3 | ||
| 1689 | addxc $acc5,%g0,$acc4 | ||
| 1690 | |||
| 1691 | addcc $acc0,$t0,$acc0 ! accumulate low parts of multiplication | ||
| 1692 | umulxhi $a0,$bi,$t0 | ||
| 1693 | addxccc $acc1,$t1,$acc1 | ||
| 1694 | umulxhi $a1,$bi,$t1 | ||
| 1695 | addxccc $acc2,$t2,$acc2 | ||
| 1696 | umulxhi $a2,$bi,$t2 | ||
| 1697 | addxccc $acc3,$t3,$acc3 | ||
| 1698 | umulxhi $a3,$bi,$t3 | ||
| 1699 | addxc $acc4,%g0,$acc4 | ||
| 1700 | ___ | ||
| 1701 | $code.=<<___ if ($i<3); | ||
| 1702 | ldx [$bp+8*($i+1)],$bi ! bp[$i+1] | ||
| 1703 | ___ | ||
| 1704 | $code.=<<___; | ||
| 1705 | addcc $acc1,$t0,$acc1 ! accumulate high parts of multiplication | ||
| 1706 | sllx $acc0,32,$t0 | ||
| 1707 | addxccc $acc2,$t1,$acc2 | ||
| 1708 | srlx $acc0,32,$t1 | ||
| 1709 | addxccc $acc3,$t2,$acc3 | ||
| 1710 | addxccc $acc4,$t3,$acc4 | ||
| 1711 | addxc %g0,%g0,$acc5 | ||
| 1712 | ___ | ||
| 1713 | } | ||
| 1714 | $code.=<<___; | ||
| 1715 | sub $acc0,$t0,$t2 ! acc0*0xFFFFFFFF00000001, low part | ||
| 1716 | umulxhi $acc0,$poly3,$t3 ! acc0*0xFFFFFFFF00000001, high part | ||
| 1717 | addcc $acc1,$t0,$acc0 ! +=acc[0]<<96 and omit acc[0] | ||
| 1718 | addxccc $acc2,$t1,$acc1 | ||
| 1719 | addxccc $acc3,$t2,$acc2 ! +=acc[0]*0xFFFFFFFF00000001 | ||
| 1720 | addxccc $acc4,$t3,$acc3 | ||
| 1721 | b .Lmul_final_vis3 ! see below | ||
| 1722 | addxc $acc5,%g0,$acc4 | ||
| 1723 | .type __ecp_nistz256_mul_mont_vis3,#function | ||
| 1724 | .size __ecp_nistz256_mul_mont_vis3,.-__ecp_nistz256_mul_mont_vis3 | ||
| 1725 | |||
| 1726 | ! compared to __ecp_nistz256_mul_mont_vis3 above it's 21% fewer | ||
| 1727 | ! instructions, but only 14% faster [on T4]... | ||
| 1728 | .align 32 | ||
| 1729 | __ecp_nistz256_sqr_mont_vis3: | ||
| 1730 | ! | | | | | |a1*a0| | | ||
| 1731 | ! | | | | |a2*a0| | | | ||
| 1732 | ! | |a3*a2|a3*a0| | | | | ||
| 1733 | ! | | | |a2*a1| | | | | ||
| 1734 | ! | | |a3*a1| | | | | | ||
| 1735 | ! *| | | | | | | | 2| | ||
| 1736 | ! +|a3*a3|a2*a2|a1*a1|a0*a0| | ||
| 1737 | ! |--+--+--+--+--+--+--+--| | ||
| 1738 | ! |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx | ||
| 1739 | ! | ||
| 1740 | ! "can't overflow" below mark carrying into high part of | ||
| 1741 | ! multiplication result, which can't overflow, because it | ||
| 1742 | ! can never be all ones. | ||
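The diagram amounts to: square = the a[i]*a[i] diagonal plus every cross product doubled. A standalone Math::BigInt sanity check of that layout (a sketch, not part of the module):

use Math::BigInt;
my @a = map { Math::BigInt->new($_) }
        ("0x0123456789abcdef", "0x02468ace13579bdf", 3, 4);
my $A = Math::BigInt->bzero;
$A += $a[$_] << (64*$_) for 0..3;
my $sq = Math::BigInt->bzero;
$sq += $a[$_] * $a[$_] << (128*$_) for 0..3;             # squares diagonal
for my $i (0..2) {
    $sq += ($a[$i] * $a[$_]) << (64*($i+$_) + 1) for $i+1..3;  # doubled cross terms
}
print $sq == $A * $A ? "layout checks out\n" : "mismatch\n";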
| 1743 | |||
| 1744 | mulx $a1,$a0,$acc1 ! a[1]*a[0] | ||
| 1745 | umulxhi $a1,$a0,$t1 | ||
| 1746 | mulx $a2,$a0,$acc2 ! a[2]*a[0] | ||
| 1747 | umulxhi $a2,$a0,$t2 | ||
| 1748 | mulx $a3,$a0,$acc3 ! a[3]*a[0] | ||
| 1749 | umulxhi $a3,$a0,$acc4 | ||
| 1750 | |||
| 1751 | addcc $acc2,$t1,$acc2 ! accumulate high parts of multiplication | ||
| 1752 | mulx $a2,$a1,$t0 ! a[2]*a[1] | ||
| 1753 | umulxhi $a2,$a1,$t1 | ||
| 1754 | addxccc $acc3,$t2,$acc3 | ||
| 1755 | mulx $a3,$a1,$t2 ! a[3]*a[1] | ||
| 1756 | umulxhi $a3,$a1,$t3 | ||
| 1757 | addxc $acc4,%g0,$acc4 ! can't overflow | ||
| 1758 | |||
| 1759 | mulx $a3,$a2,$acc5 ! a[3]*a[2] | ||
| 1760 | not $poly3,$poly3 ! 0xFFFFFFFF00000001 | ||
| 1761 | umulxhi $a3,$a2,$acc6 | ||
| 1762 | |||
| 1763 | addcc $t2,$t1,$t1 ! accumulate high parts of multiplication | ||
| 1764 | mulx $a0,$a0,$acc0 ! a[0]*a[0] | ||
| 1765 | addxc $t3,%g0,$t2 ! can't overflow | ||
| 1766 | |||
| 1767 | addcc $acc3,$t0,$acc3 ! accumulate low parts of multiplication | ||
| 1768 | umulxhi $a0,$a0,$a0 | ||
| 1769 | addxccc $acc4,$t1,$acc4 | ||
| 1770 | mulx $a1,$a1,$t1 ! a[1]*a[1] | ||
| 1771 | addxccc $acc5,$t2,$acc5 | ||
| 1772 | umulxhi $a1,$a1,$a1 | ||
| 1773 | addxc $acc6,%g0,$acc6 ! can't overflow | ||
| 1774 | |||
| 1775 | addcc $acc1,$acc1,$acc1 ! acc[1-6]*=2 | ||
| 1776 | mulx $a2,$a2,$t2 ! a[2]*a[2] | ||
| 1777 | addxccc $acc2,$acc2,$acc2 | ||
| 1778 | umulxhi $a2,$a2,$a2 | ||
| 1779 | addxccc $acc3,$acc3,$acc3 | ||
| 1780 | mulx $a3,$a3,$t3 ! a[3]*a[3] | ||
| 1781 | addxccc $acc4,$acc4,$acc4 | ||
| 1782 | umulxhi $a3,$a3,$a3 | ||
| 1783 | addxccc $acc5,$acc5,$acc5 | ||
| 1784 | addxccc $acc6,$acc6,$acc6 | ||
| 1785 | addxc %g0,%g0,$acc7 | ||
| 1786 | |||
| 1787 | addcc $acc1,$a0,$acc1 ! +a[i]*a[i] | ||
| 1788 | addxccc $acc2,$t1,$acc2 | ||
| 1789 | addxccc $acc3,$a1,$acc3 | ||
| 1790 | addxccc $acc4,$t2,$acc4 | ||
| 1791 | sllx $acc0,32,$t0 | ||
| 1792 | addxccc $acc5,$a2,$acc5 | ||
| 1793 | srlx $acc0,32,$t1 | ||
| 1794 | addxccc $acc6,$t3,$acc6 | ||
| 1795 | sub $acc0,$t0,$t2 ! acc0*0xFFFFFFFF00000001, low part | ||
| 1796 | addxc $acc7,$a3,$acc7 | ||
| 1797 | ___ | ||
| 1798 | for($i=0;$i<3;$i++) { # reductions, see commentary | ||
| 1799 | # in multiplication for details | ||
| 1800 | $code.=<<___; | ||
| 1801 | umulxhi $acc0,$poly3,$t3 ! acc0*0xFFFFFFFF00000001, high part | ||
| 1802 | addcc $acc1,$t0,$acc0 ! +=acc[0]<<96 and omit acc[0] | ||
| 1803 | sllx $acc0,32,$t0 | ||
| 1804 | addxccc $acc2,$t1,$acc1 | ||
| 1805 | srlx $acc0,32,$t1 | ||
| 1806 | addxccc $acc3,$t2,$acc2 ! +=acc[0]*0xFFFFFFFF00000001 | ||
| 1807 | sub $acc0,$t0,$t2 ! acc0*0xFFFFFFFF00000001, low part | ||
| 1808 | addxc %g0,$t3,$acc3 ! can't overflow | ||
| 1809 | ___ | ||
| 1810 | } | ||
| 1811 | $code.=<<___; | ||
| 1812 | umulxhi $acc0,$poly3,$t3 ! acc0*0xFFFFFFFF00000001, high part | ||
| 1813 | addcc $acc1,$t0,$acc0 ! +=acc[0]<<96 and omit acc[0] | ||
| 1814 | addxccc $acc2,$t1,$acc1 | ||
| 1815 | addxccc $acc3,$t2,$acc2 ! +=acc[0]*0xFFFFFFFF00000001 | ||
| 1816 | addxc %g0,$t3,$acc3 ! can't overflow | ||
| 1817 | |||
| 1818 | addcc $acc0,$acc4,$acc0 ! accumulate upper half | ||
| 1819 | addxccc $acc1,$acc5,$acc1 | ||
| 1820 | addxccc $acc2,$acc6,$acc2 | ||
| 1821 | addxccc $acc3,$acc7,$acc3 | ||
| 1822 | addxc %g0,%g0,$acc4 | ||
| 1823 | |||
| 1824 | .Lmul_final_vis3: | ||
| 1825 | |||
| 1826 | ! The final step is "if result >= mod, subtract mod", but since | ||
| 1827 | ! comparison means subtraction, we do the subtraction and then copy | ||
| 1828 | ! the outcome if it didn't borrow. But note that since we [have to] | ||
| 1829 | ! replace subtraction with addition of the negated modulus, the | ||
| 1830 | ! carry/borrow logic is inverted. | ||
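Restated in standalone bignum form (a sketch, not part of the module): adding the two's-complement negation of the modulus and testing the carry out of bit 255 is exactly the compare-and-subtract.

use Math::BigInt;
my $p = Math::BigInt->new(
    "0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff");
my $R = Math::BigInt->bone << 256;
my $x = $p + 5;                              # any value in [0, 2*p)
my $t = $x + ($R - $p);                      # x + (-p), taken modulo 2^256
my $res = ($t >> 256)->is_zero ? $x : $t % $R;   # keep t only if it carried
print $res == $x % $p ? "reduced ok\n" : "bad\n";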
| 1831 | |||
| 1832 | addcc $acc0,1,$t0 ! add -modulus, i.e. subtract | ||
| 1833 | not $poly3,$poly3 ! restore 0x00000000FFFFFFFE | ||
| 1834 | addxccc $acc1,$poly1,$t1 | ||
| 1835 | addxccc $acc2,$minus1,$t2 | ||
| 1836 | addxccc $acc3,$poly3,$t3 | ||
| 1837 | addxccc $acc4,$minus1,%g0 ! did it carry? | ||
| 1838 | |||
| 1839 | movcs %xcc,$t0,$acc0 | ||
| 1840 | movcs %xcc,$t1,$acc1 | ||
| 1841 | stx $acc0,[$rp] | ||
| 1842 | movcs %xcc,$t2,$acc2 | ||
| 1843 | stx $acc1,[$rp+8] | ||
| 1844 | movcs %xcc,$t3,$acc3 | ||
| 1845 | stx $acc2,[$rp+16] | ||
| 1846 | retl | ||
| 1847 | stx $acc3,[$rp+24] | ||
| 1848 | .type __ecp_nistz256_sqr_mont_vis3,#function | ||
| 1849 | .size __ecp_nistz256_sqr_mont_vis3,.-__ecp_nistz256_sqr_mont_vis3 | ||
| 1850 | ___ | ||
| 1851 | |||
| 1852 | ######################################################################## | ||
| 1853 | # void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp); | ||
| 1854 | # | ||
| 1855 | { | ||
| 1856 | my ($res_x,$res_y,$res_z, | ||
| 1857 | $in_x,$in_y,$in_z, | ||
| 1858 | $S,$M,$Zsqr,$tmp0)=map(32*$_,(0..9)); | ||
| 1859 | # above map() describes stack layout with 10 temporary | ||
| 1860 | # 256-bit vectors on top. | ||
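The call sequence below is the usual Jacobian doubling. For orientation, a hypothetical plain-Perl rendition of the same formulas (working mod p directly, ignoring the Montgomery representation the assembly actually uses):

use Math::BigInt;
my $p = Math::BigInt->new(
    "0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff");
sub point_double_ref {                       # reference sketch, not in module
    my ($X, $Y, $Z) = @_;
    my $Zsqr  = $Z * $Z % $p;
    my $M     = 3 * ($X + $Zsqr) * ($X - $Zsqr) % $p;
    my $S     = 4 * $X * $Y * $Y % $p;
    my $res_x = ($M * $M - 2 * $S) % $p;
    my $y4    = 8 * $Y**4 % $p;              # the tmp0/div_by_2 path: 8*Y^4
    my $res_y = ($M * ($S - $res_x) - $y4) % $p;
    my $res_z = 2 * $Y * $Z % $p;
    return ($res_x, $res_y, $res_z);
}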
| 1861 | |||
| 1862 | $code.=<<___; | ||
| 1863 | .align 32 | ||
| 1864 | ecp_nistz256_point_double_vis3: | ||
| 1865 | save %sp,-STACK64_FRAME-32*10,%sp | ||
| 1866 | |||
| 1867 | mov $rp,$rp_real | ||
| 1868 | .Ldouble_shortcut_vis3: | ||
| 1869 | mov -1,$minus1 | ||
| 1870 | mov -2,$poly3 | ||
| 1871 | sllx $minus1,32,$poly1 ! 0xFFFFFFFF00000000 | ||
| 1872 | srl $poly3,0,$poly3 ! 0x00000000FFFFFFFE | ||
| 1873 | |||
| 1874 | ! convert input to uint64_t[4] | ||
| 1875 | ld [$ap],$a0 ! in_x | ||
| 1876 | ld [$ap+4],$t0 | ||
| 1877 | ld [$ap+8],$a1 | ||
| 1878 | ld [$ap+12],$t1 | ||
| 1879 | ld [$ap+16],$a2 | ||
| 1880 | ld [$ap+20],$t2 | ||
| 1881 | ld [$ap+24],$a3 | ||
| 1882 | ld [$ap+28],$t3 | ||
| 1883 | sllx $t0,32,$t0 | ||
| 1884 | sllx $t1,32,$t1 | ||
| 1885 | ld [$ap+32],$acc0 ! in_y | ||
| 1886 | or $a0,$t0,$a0 | ||
| 1887 | ld [$ap+32+4],$t0 | ||
| 1888 | sllx $t2,32,$t2 | ||
| 1889 | ld [$ap+32+8],$acc1 | ||
| 1890 | or $a1,$t1,$a1 | ||
| 1891 | ld [$ap+32+12],$t1 | ||
| 1892 | sllx $t3,32,$t3 | ||
| 1893 | ld [$ap+32+16],$acc2 | ||
| 1894 | or $a2,$t2,$a2 | ||
| 1895 | ld [$ap+32+20],$t2 | ||
| 1896 | or $a3,$t3,$a3 | ||
| 1897 | ld [$ap+32+24],$acc3 | ||
| 1898 | sllx $t0,32,$t0 | ||
| 1899 | ld [$ap+32+28],$t3 | ||
| 1900 | sllx $t1,32,$t1 | ||
| 1901 | stx $a0,[%sp+LOCALS64+$in_x] | ||
| 1902 | sllx $t2,32,$t2 | ||
| 1903 | stx $a1,[%sp+LOCALS64+$in_x+8] | ||
| 1904 | sllx $t3,32,$t3 | ||
| 1905 | stx $a2,[%sp+LOCALS64+$in_x+16] | ||
| 1906 | or $acc0,$t0,$acc0 | ||
| 1907 | stx $a3,[%sp+LOCALS64+$in_x+24] | ||
| 1908 | or $acc1,$t1,$acc1 | ||
| 1909 | stx $acc0,[%sp+LOCALS64+$in_y] | ||
| 1910 | or $acc2,$t2,$acc2 | ||
| 1911 | stx $acc1,[%sp+LOCALS64+$in_y+8] | ||
| 1912 | or $acc3,$t3,$acc3 | ||
| 1913 | stx $acc2,[%sp+LOCALS64+$in_y+16] | ||
| 1914 | stx $acc3,[%sp+LOCALS64+$in_y+24] | ||
| 1915 | |||
| 1916 | ld [$ap+64],$a0 ! in_z | ||
| 1917 | ld [$ap+64+4],$t0 | ||
| 1918 | ld [$ap+64+8],$a1 | ||
| 1919 | ld [$ap+64+12],$t1 | ||
| 1920 | ld [$ap+64+16],$a2 | ||
| 1921 | ld [$ap+64+20],$t2 | ||
| 1922 | ld [$ap+64+24],$a3 | ||
| 1923 | ld [$ap+64+28],$t3 | ||
| 1924 | sllx $t0,32,$t0 | ||
| 1925 | sllx $t1,32,$t1 | ||
| 1926 | or $a0,$t0,$a0 | ||
| 1927 | sllx $t2,32,$t2 | ||
| 1928 | or $a1,$t1,$a1 | ||
| 1929 | sllx $t3,32,$t3 | ||
| 1930 | or $a2,$t2,$a2 | ||
| 1931 | or $a3,$t3,$a3 | ||
| 1932 | sllx $t0,32,$t0 | ||
| 1933 | sllx $t1,32,$t1 | ||
| 1934 | stx $a0,[%sp+LOCALS64+$in_z] | ||
| 1935 | sllx $t2,32,$t2 | ||
| 1936 | stx $a1,[%sp+LOCALS64+$in_z+8] | ||
| 1937 | sllx $t3,32,$t3 | ||
| 1938 | stx $a2,[%sp+LOCALS64+$in_z+16] | ||
| 1939 | stx $a3,[%sp+LOCALS64+$in_z+24] | ||
| 1940 | |||
| 1941 | ! in_y is still in $acc0-$acc3 | ||
| 1942 | call __ecp_nistz256_mul_by_2_vis3 ! p256_mul_by_2(S, in_y); | ||
| 1943 | add %sp,LOCALS64+$S,$rp | ||
| 1944 | |||
| 1945 | ! in_z is still in $a0-$a3 | ||
| 1946 | call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Zsqr, in_z); | ||
| 1947 | add %sp,LOCALS64+$Zsqr,$rp | ||
| 1948 | |||
| 1949 | mov $acc0,$a0 ! put Zsqr aside | ||
| 1950 | mov $acc1,$a1 | ||
| 1951 | mov $acc2,$a2 | ||
| 1952 | mov $acc3,$a3 | ||
| 1953 | |||
| 1954 | add %sp,LOCALS64+$in_x,$bp | ||
| 1955 | call __ecp_nistz256_add_vis3 ! p256_add(M, Zsqr, in_x); | ||
| 1956 | add %sp,LOCALS64+$M,$rp | ||
| 1957 | |||
| 1958 | mov $a0,$acc0 ! restore Zsqr | ||
| 1959 | ldx [%sp+LOCALS64+$S],$a0 ! forward load | ||
| 1960 | mov $a1,$acc1 | ||
| 1961 | ldx [%sp+LOCALS64+$S+8],$a1 | ||
| 1962 | mov $a2,$acc2 | ||
| 1963 | ldx [%sp+LOCALS64+$S+16],$a2 | ||
| 1964 | mov $a3,$acc3 | ||
| 1965 | ldx [%sp+LOCALS64+$S+24],$a3 | ||
| 1966 | |||
| 1967 | add %sp,LOCALS64+$in_x,$bp | ||
| 1968 | call __ecp_nistz256_sub_morf_vis3 ! p256_sub(Zsqr, in_x, Zsqr); | ||
| 1969 | add %sp,LOCALS64+$Zsqr,$rp | ||
| 1970 | |||
| 1971 | call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(S, S); | ||
| 1972 | add %sp,LOCALS64+$S,$rp | ||
| 1973 | |||
| 1974 | ldx [%sp+LOCALS64+$in_z],$bi | ||
| 1975 | ldx [%sp+LOCALS64+$in_y],$a0 | ||
| 1976 | ldx [%sp+LOCALS64+$in_y+8],$a1 | ||
| 1977 | ldx [%sp+LOCALS64+$in_y+16],$a2 | ||
| 1978 | ldx [%sp+LOCALS64+$in_y+24],$a3 | ||
| 1979 | add %sp,LOCALS64+$in_z,$bp | ||
| 1980 | call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(tmp0, in_z, in_y); | ||
| 1981 | add %sp,LOCALS64+$tmp0,$rp | ||
| 1982 | |||
| 1983 | ldx [%sp+LOCALS64+$M],$bi ! forward load | ||
| 1984 | ldx [%sp+LOCALS64+$Zsqr],$a0 | ||
| 1985 | ldx [%sp+LOCALS64+$Zsqr+8],$a1 | ||
| 1986 | ldx [%sp+LOCALS64+$Zsqr+16],$a2 | ||
| 1987 | ldx [%sp+LOCALS64+$Zsqr+24],$a3 | ||
| 1988 | |||
| 1989 | call __ecp_nistz256_mul_by_2_vis3 ! p256_mul_by_2(res_z, tmp0); | ||
| 1990 | add %sp,LOCALS64+$res_z,$rp | ||
| 1991 | |||
| 1992 | add %sp,LOCALS64+$M,$bp | ||
| 1993 | call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(M, M, Zsqr); | ||
| 1994 | add %sp,LOCALS64+$M,$rp | ||
| 1995 | |||
| 1996 | mov $acc0,$a0 ! put aside M | ||
| 1997 | mov $acc1,$a1 | ||
| 1998 | mov $acc2,$a2 | ||
| 1999 | mov $acc3,$a3 | ||
| 2000 | call __ecp_nistz256_mul_by_2_vis3 | ||
| 2001 | add %sp,LOCALS64+$M,$rp | ||
| 2002 | mov $a0,$t0 ! copy M | ||
| 2003 | ldx [%sp+LOCALS64+$S],$a0 ! forward load | ||
| 2004 | mov $a1,$t1 | ||
| 2005 | ldx [%sp+LOCALS64+$S+8],$a1 | ||
| 2006 | mov $a2,$t2 | ||
| 2007 | ldx [%sp+LOCALS64+$S+16],$a2 | ||
| 2008 | mov $a3,$t3 | ||
| 2009 | ldx [%sp+LOCALS64+$S+24],$a3 | ||
| 2010 | call __ecp_nistz256_add_noload_vis3 ! p256_mul_by_3(M, M); | ||
| 2011 | add %sp,LOCALS64+$M,$rp | ||
| 2012 | |||
| 2013 | call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(tmp0, S); | ||
| 2014 | add %sp,LOCALS64+$tmp0,$rp | ||
| 2015 | |||
| 2016 | ldx [%sp+LOCALS64+$S],$bi ! forward load | ||
| 2017 | ldx [%sp+LOCALS64+$in_x],$a0 | ||
| 2018 | ldx [%sp+LOCALS64+$in_x+8],$a1 | ||
| 2019 | ldx [%sp+LOCALS64+$in_x+16],$a2 | ||
| 2020 | ldx [%sp+LOCALS64+$in_x+24],$a3 | ||
| 2021 | |||
| 2022 | call __ecp_nistz256_div_by_2_vis3 ! p256_div_by_2(res_y, tmp0); | ||
| 2023 | add %sp,LOCALS64+$res_y,$rp | ||
| 2024 | |||
| 2025 | add %sp,LOCALS64+$S,$bp | ||
| 2026 | call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S, S, in_x); | ||
| 2027 | add %sp,LOCALS64+$S,$rp | ||
| 2028 | |||
| 2029 | ldx [%sp+LOCALS64+$M],$a0 ! forward load | ||
| 2030 | ldx [%sp+LOCALS64+$M+8],$a1 | ||
| 2031 | ldx [%sp+LOCALS64+$M+16],$a2 | ||
| 2032 | ldx [%sp+LOCALS64+$M+24],$a3 | ||
| 2033 | |||
| 2034 | call __ecp_nistz256_mul_by_2_vis3 ! p256_mul_by_2(tmp0, S); | ||
| 2035 | add %sp,LOCALS64+$tmp0,$rp | ||
| 2036 | |||
| 2037 | call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(res_x, M); | ||
| 2038 | add %sp,LOCALS64+$res_x,$rp | ||
| 2039 | |||
| 2040 | add %sp,LOCALS64+$tmp0,$bp | ||
| 2041 | call __ecp_nistz256_sub_from_vis3 ! p256_sub(res_x, res_x, tmp0); | ||
| 2042 | add %sp,LOCALS64+$res_x,$rp | ||
| 2043 | |||
| 2044 | ldx [%sp+LOCALS64+$M],$a0 ! forward load | ||
| 2045 | ldx [%sp+LOCALS64+$M+8],$a1 | ||
| 2046 | ldx [%sp+LOCALS64+$M+16],$a2 | ||
| 2047 | ldx [%sp+LOCALS64+$M+24],$a3 | ||
| 2048 | |||
| 2049 | add %sp,LOCALS64+$S,$bp | ||
| 2050 | call __ecp_nistz256_sub_morf_vis3 ! p256_sub(S, S, res_x); | ||
| 2051 | add %sp,LOCALS64+$S,$rp | ||
| 2052 | |||
| 2053 | mov $acc0,$bi | ||
| 2054 | call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S, S, M); | ||
| 2055 | add %sp,LOCALS64+$S,$rp | ||
| 2056 | |||
| 2057 | ldx [%sp+LOCALS64+$res_x],$a0 ! forward load | ||
| 2058 | ldx [%sp+LOCALS64+$res_x+8],$a1 | ||
| 2059 | ldx [%sp+LOCALS64+$res_x+16],$a2 | ||
| 2060 | ldx [%sp+LOCALS64+$res_x+24],$a3 | ||
| 2061 | |||
| 2062 | add %sp,LOCALS64+$res_y,$bp | ||
| 2063 | call __ecp_nistz256_sub_from_vis3 ! p256_sub(res_y, S, res_y); | ||
| 2064 | add %sp,LOCALS64+$res_y,$rp | ||
| 2065 | |||
| 2066 | ! convert output to uint32_t[8] | ||
| 2067 | srlx $a0,32,$t0 | ||
| 2068 | srlx $a1,32,$t1 | ||
| 2069 | st $a0,[$rp_real] ! res_x | ||
| 2070 | srlx $a2,32,$t2 | ||
| 2071 | st $t0,[$rp_real+4] | ||
| 2072 | srlx $a3,32,$t3 | ||
| 2073 | st $a1,[$rp_real+8] | ||
| 2074 | st $t1,[$rp_real+12] | ||
| 2075 | st $a2,[$rp_real+16] | ||
| 2076 | st $t2,[$rp_real+20] | ||
| 2077 | st $a3,[$rp_real+24] | ||
| 2078 | st $t3,[$rp_real+28] | ||
| 2079 | |||
| 2080 | ldx [%sp+LOCALS64+$res_z],$a0 ! forward load | ||
| 2081 | srlx $acc0,32,$t0 | ||
| 2082 | ldx [%sp+LOCALS64+$res_z+8],$a1 | ||
| 2083 | srlx $acc1,32,$t1 | ||
| 2084 | ldx [%sp+LOCALS64+$res_z+16],$a2 | ||
| 2085 | srlx $acc2,32,$t2 | ||
| 2086 | ldx [%sp+LOCALS64+$res_z+24],$a3 | ||
| 2087 | srlx $acc3,32,$t3 | ||
| 2088 | st $acc0,[$rp_real+32] ! res_y | ||
| 2089 | st $t0, [$rp_real+32+4] | ||
| 2090 | st $acc1,[$rp_real+32+8] | ||
| 2091 | st $t1, [$rp_real+32+12] | ||
| 2092 | st $acc2,[$rp_real+32+16] | ||
| 2093 | st $t2, [$rp_real+32+20] | ||
| 2094 | st $acc3,[$rp_real+32+24] | ||
| 2095 | st $t3, [$rp_real+32+28] | ||
| 2096 | |||
| 2097 | srlx $a0,32,$t0 | ||
| 2098 | srlx $a1,32,$t1 | ||
| 2099 | st $a0,[$rp_real+64] ! res_z | ||
| 2100 | srlx $a2,32,$t2 | ||
| 2101 | st $t0,[$rp_real+64+4] | ||
| 2102 | srlx $a3,32,$t3 | ||
| 2103 | st $a1,[$rp_real+64+8] | ||
| 2104 | st $t1,[$rp_real+64+12] | ||
| 2105 | st $a2,[$rp_real+64+16] | ||
| 2106 | st $t2,[$rp_real+64+20] | ||
| 2107 | st $a3,[$rp_real+64+24] | ||
| 2108 | st $t3,[$rp_real+64+28] | ||
| 2109 | |||
| 2110 | ret | ||
| 2111 | restore | ||
| 2112 | .type ecp_nistz256_point_double_vis3,#function | ||
| 2113 | .size ecp_nistz256_point_double_vis3,.-ecp_nistz256_point_double_vis3 | ||
| 2114 | ___ | ||
| 2115 | } | ||
| 2116 | ######################################################################## | ||
| 2117 | # void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1, | ||
| 2118 | # const P256_POINT *in2); | ||
| 2119 | { | ||
| 2120 | my ($res_x,$res_y,$res_z, | ||
| 2121 | $in1_x,$in1_y,$in1_z, | ||
| 2122 | $in2_x,$in2_y,$in2_z, | ||
| 2123 | $H,$Hsqr,$R,$Rsqr,$Hcub, | ||
| 2124 | $U1,$U2,$S1,$S2)=map(32*$_,(0..17)); | ||
| 2125 | my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); | ||
| 2126 | |||
| 2127 | # above map() describes stack layout with 18 temporary | ||
| 2128 | # 256-bit vectors on top. Then we reserve some space for | ||
| 2129 | # !in1infty, !in2infty and result of check for zero. | ||
| 2130 | |||
| 2131 | $code.=<<___; | ||
| 2132 | .globl ecp_nistz256_point_add_vis3 | ||
| 2133 | .align 32 | ||
| 2134 | ecp_nistz256_point_add_vis3: | ||
| 2135 | save %sp,-STACK64_FRAME-32*18-32,%sp | ||
| 2136 | |||
| 2137 | mov $rp,$rp_real | ||
| 2138 | mov -1,$minus1 | ||
| 2139 | mov -2,$poly3 | ||
| 2140 | sllx $minus1,32,$poly1 ! 0xFFFFFFFF00000000 | ||
| 2141 | srl $poly3,0,$poly3 ! 0x00000000FFFFFFFE | ||
| 2142 | |||
| 2143 | ! convert input to uint64_t[4] | ||
| 2144 | ld [$bp],$a0 ! in2_x | ||
| 2145 | ld [$bp+4],$t0 | ||
| 2146 | ld [$bp+8],$a1 | ||
| 2147 | ld [$bp+12],$t1 | ||
| 2148 | ld [$bp+16],$a2 | ||
| 2149 | ld [$bp+20],$t2 | ||
| 2150 | ld [$bp+24],$a3 | ||
| 2151 | ld [$bp+28],$t3 | ||
| 2152 | sllx $t0,32,$t0 | ||
| 2153 | sllx $t1,32,$t1 | ||
| 2154 | ld [$bp+32],$acc0 ! in2_y | ||
| 2155 | or $a0,$t0,$a0 | ||
| 2156 | ld [$bp+32+4],$t0 | ||
| 2157 | sllx $t2,32,$t2 | ||
| 2158 | ld [$bp+32+8],$acc1 | ||
| 2159 | or $a1,$t1,$a1 | ||
| 2160 | ld [$bp+32+12],$t1 | ||
| 2161 | sllx $t3,32,$t3 | ||
| 2162 | ld [$bp+32+16],$acc2 | ||
| 2163 | or $a2,$t2,$a2 | ||
| 2164 | ld [$bp+32+20],$t2 | ||
| 2165 | or $a3,$t3,$a3 | ||
| 2166 | ld [$bp+32+24],$acc3 | ||
| 2167 | sllx $t0,32,$t0 | ||
| 2168 | ld [$bp+32+28],$t3 | ||
| 2169 | sllx $t1,32,$t1 | ||
| 2170 | stx $a0,[%sp+LOCALS64+$in2_x] | ||
| 2171 | sllx $t2,32,$t2 | ||
| 2172 | stx $a1,[%sp+LOCALS64+$in2_x+8] | ||
| 2173 | sllx $t3,32,$t3 | ||
| 2174 | stx $a2,[%sp+LOCALS64+$in2_x+16] | ||
| 2175 | or $acc0,$t0,$acc0 | ||
| 2176 | stx $a3,[%sp+LOCALS64+$in2_x+24] | ||
| 2177 | or $acc1,$t1,$acc1 | ||
| 2178 | stx $acc0,[%sp+LOCALS64+$in2_y] | ||
| 2179 | or $acc2,$t2,$acc2 | ||
| 2180 | stx $acc1,[%sp+LOCALS64+$in2_y+8] | ||
| 2181 | or $acc3,$t3,$acc3 | ||
| 2182 | stx $acc2,[%sp+LOCALS64+$in2_y+16] | ||
| 2183 | stx $acc3,[%sp+LOCALS64+$in2_y+24] | ||
| 2184 | |||
| 2185 | ld [$bp+64],$acc0 ! in2_z | ||
| 2186 | ld [$bp+64+4],$t0 | ||
| 2187 | ld [$bp+64+8],$acc1 | ||
| 2188 | ld [$bp+64+12],$t1 | ||
| 2189 | ld [$bp+64+16],$acc2 | ||
| 2190 | ld [$bp+64+20],$t2 | ||
| 2191 | ld [$bp+64+24],$acc3 | ||
| 2192 | ld [$bp+64+28],$t3 | ||
| 2193 | sllx $t0,32,$t0 | ||
| 2194 | sllx $t1,32,$t1 | ||
| 2195 | ld [$ap],$a0 ! in1_x | ||
| 2196 | or $acc0,$t0,$acc0 | ||
| 2197 | ld [$ap+4],$t0 | ||
| 2198 | sllx $t2,32,$t2 | ||
| 2199 | ld [$ap+8],$a1 | ||
| 2200 | or $acc1,$t1,$acc1 | ||
| 2201 | ld [$ap+12],$t1 | ||
| 2202 | sllx $t3,32,$t3 | ||
| 2203 | ld [$ap+16],$a2 | ||
| 2204 | or $acc2,$t2,$acc2 | ||
| 2205 | ld [$ap+20],$t2 | ||
| 2206 | or $acc3,$t3,$acc3 | ||
| 2207 | ld [$ap+24],$a3 | ||
| 2208 | sllx $t0,32,$t0 | ||
| 2209 | ld [$ap+28],$t3 | ||
| 2210 | sllx $t1,32,$t1 | ||
| 2211 | stx $acc0,[%sp+LOCALS64+$in2_z] | ||
| 2212 | sllx $t2,32,$t2 | ||
| 2213 | stx $acc1,[%sp+LOCALS64+$in2_z+8] | ||
| 2214 | sllx $t3,32,$t3 | ||
| 2215 | stx $acc2,[%sp+LOCALS64+$in2_z+16] | ||
| 2216 | stx $acc3,[%sp+LOCALS64+$in2_z+24] | ||
| 2217 | |||
| 2218 | or $acc1,$acc0,$acc0 | ||
| 2219 | or $acc3,$acc2,$acc2 | ||
| 2220 | or $acc2,$acc0,$acc0 | ||
| 2221 | movrnz $acc0,-1,$acc0 ! !in2infty | ||
| 2222 | stx $acc0,[%fp+STACK_BIAS-8] | ||
| 2223 | |||
| 2224 | or $a0,$t0,$a0 | ||
| 2225 | ld [$ap+32],$acc0 ! in1_y | ||
| 2226 | or $a1,$t1,$a1 | ||
| 2227 | ld [$ap+32+4],$t0 | ||
| 2228 | or $a2,$t2,$a2 | ||
| 2229 | ld [$ap+32+8],$acc1 | ||
| 2230 | or $a3,$t3,$a3 | ||
| 2231 | ld [$ap+32+12],$t1 | ||
| 2232 | ld [$ap+32+16],$acc2 | ||
| 2233 | ld [$ap+32+20],$t2 | ||
| 2234 | ld [$ap+32+24],$acc3 | ||
| 2235 | sllx $t0,32,$t0 | ||
| 2236 | ld [$ap+32+28],$t3 | ||
| 2237 | sllx $t1,32,$t1 | ||
| 2238 | stx $a0,[%sp+LOCALS64+$in1_x] | ||
| 2239 | sllx $t2,32,$t2 | ||
| 2240 | stx $a1,[%sp+LOCALS64+$in1_x+8] | ||
| 2241 | sllx $t3,32,$t3 | ||
| 2242 | stx $a2,[%sp+LOCALS64+$in1_x+16] | ||
| 2243 | or $acc0,$t0,$acc0 | ||
| 2244 | stx $a3,[%sp+LOCALS64+$in1_x+24] | ||
| 2245 | or $acc1,$t1,$acc1 | ||
| 2246 | stx $acc0,[%sp+LOCALS64+$in1_y] | ||
| 2247 | or $acc2,$t2,$acc2 | ||
| 2248 | stx $acc1,[%sp+LOCALS64+$in1_y+8] | ||
| 2249 | or $acc3,$t3,$acc3 | ||
| 2250 | stx $acc2,[%sp+LOCALS64+$in1_y+16] | ||
| 2251 | stx $acc3,[%sp+LOCALS64+$in1_y+24] | ||
| 2252 | |||
| 2253 | ldx [%sp+LOCALS64+$in2_z],$a0 ! forward load | ||
| 2254 | ldx [%sp+LOCALS64+$in2_z+8],$a1 | ||
| 2255 | ldx [%sp+LOCALS64+$in2_z+16],$a2 | ||
| 2256 | ldx [%sp+LOCALS64+$in2_z+24],$a3 | ||
| 2257 | |||
| 2258 | ld [$ap+64],$acc0 ! in1_z | ||
| 2259 | ld [$ap+64+4],$t0 | ||
| 2260 | ld [$ap+64+8],$acc1 | ||
| 2261 | ld [$ap+64+12],$t1 | ||
| 2262 | ld [$ap+64+16],$acc2 | ||
| 2263 | ld [$ap+64+20],$t2 | ||
| 2264 | ld [$ap+64+24],$acc3 | ||
| 2265 | ld [$ap+64+28],$t3 | ||
| 2266 | sllx $t0,32,$t0 | ||
| 2267 | sllx $t1,32,$t1 | ||
| 2268 | or $acc0,$t0,$acc0 | ||
| 2269 | sllx $t2,32,$t2 | ||
| 2270 | or $acc1,$t1,$acc1 | ||
| 2271 | sllx $t3,32,$t3 | ||
| 2272 | stx $acc0,[%sp+LOCALS64+$in1_z] | ||
| 2273 | or $acc2,$t2,$acc2 | ||
| 2274 | stx $acc1,[%sp+LOCALS64+$in1_z+8] | ||
| 2275 | or $acc3,$t3,$acc3 | ||
| 2276 | stx $acc2,[%sp+LOCALS64+$in1_z+16] | ||
| 2277 | stx $acc3,[%sp+LOCALS64+$in1_z+24] | ||
| 2278 | |||
| 2279 | or $acc1,$acc0,$acc0 | ||
| 2280 | or $acc3,$acc2,$acc2 | ||
| 2281 | or $acc2,$acc0,$acc0 | ||
| 2282 | movrnz $acc0,-1,$acc0 ! !in1infty | ||
| 2283 | stx $acc0,[%fp+STACK_BIAS-16] | ||
| 2284 | |||
| 2285 | call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Z2sqr, in2_z); | ||
| 2286 | add %sp,LOCALS64+$Z2sqr,$rp | ||
| 2287 | |||
| 2288 | ldx [%sp+LOCALS64+$in1_z],$a0 | ||
| 2289 | ldx [%sp+LOCALS64+$in1_z+8],$a1 | ||
| 2290 | ldx [%sp+LOCALS64+$in1_z+16],$a2 | ||
| 2291 | ldx [%sp+LOCALS64+$in1_z+24],$a3 | ||
| 2292 | call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Z1sqr, in1_z); | ||
| 2293 | add %sp,LOCALS64+$Z1sqr,$rp | ||
| 2294 | |||
| 2295 | ldx [%sp+LOCALS64+$Z2sqr],$bi | ||
| 2296 | ldx [%sp+LOCALS64+$in2_z],$a0 | ||
| 2297 | ldx [%sp+LOCALS64+$in2_z+8],$a1 | ||
| 2298 | ldx [%sp+LOCALS64+$in2_z+16],$a2 | ||
| 2299 | ldx [%sp+LOCALS64+$in2_z+24],$a3 | ||
| 2300 | add %sp,LOCALS64+$Z2sqr,$bp | ||
| 2301 | call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S1, Z2sqr, in2_z); | ||
| 2302 | add %sp,LOCALS64+$S1,$rp | ||
| 2303 | |||
| 2304 | ldx [%sp+LOCALS64+$Z1sqr],$bi | ||
| 2305 | ldx [%sp+LOCALS64+$in1_z],$a0 | ||
| 2306 | ldx [%sp+LOCALS64+$in1_z+8],$a1 | ||
| 2307 | ldx [%sp+LOCALS64+$in1_z+16],$a2 | ||
| 2308 | ldx [%sp+LOCALS64+$in1_z+24],$a3 | ||
| 2309 | add %sp,LOCALS64+$Z1sqr,$bp | ||
| 2310 | call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S2, Z1sqr, in1_z); | ||
| 2311 | add %sp,LOCALS64+$S2,$rp | ||
| 2312 | |||
| 2313 | ldx [%sp+LOCALS64+$S1],$bi | ||
| 2314 | ldx [%sp+LOCALS64+$in1_y],$a0 | ||
| 2315 | ldx [%sp+LOCALS64+$in1_y+8],$a1 | ||
| 2316 | ldx [%sp+LOCALS64+$in1_y+16],$a2 | ||
| 2317 | ldx [%sp+LOCALS64+$in1_y+24],$a3 | ||
| 2318 | add %sp,LOCALS64+$S1,$bp | ||
| 2319 | call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S1, S1, in1_y); | ||
| 2320 | add %sp,LOCALS64+$S1,$rp | ||
| 2321 | |||
| 2322 | ldx [%sp+LOCALS64+$S2],$bi | ||
| 2323 | ldx [%sp+LOCALS64+$in2_y],$a0 | ||
| 2324 | ldx [%sp+LOCALS64+$in2_y+8],$a1 | ||
| 2325 | ldx [%sp+LOCALS64+$in2_y+16],$a2 | ||
| 2326 | ldx [%sp+LOCALS64+$in2_y+24],$a3 | ||
| 2327 | add %sp,LOCALS64+$S2,$bp | ||
| 2328 | call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S2, S2, in2_y); | ||
| 2329 | add %sp,LOCALS64+$S2,$rp | ||
| 2330 | |||
| 2331 | ldx [%sp+LOCALS64+$Z2sqr],$bi ! forward load | ||
| 2332 | ldx [%sp+LOCALS64+$in1_x],$a0 | ||
| 2333 | ldx [%sp+LOCALS64+$in1_x+8],$a1 | ||
| 2334 | ldx [%sp+LOCALS64+$in1_x+16],$a2 | ||
| 2335 | ldx [%sp+LOCALS64+$in1_x+24],$a3 | ||
| 2336 | |||
| 2337 | add %sp,LOCALS64+$S1,$bp | ||
| 2338 | call __ecp_nistz256_sub_from_vis3 ! p256_sub(R, S2, S1); | ||
| 2339 | add %sp,LOCALS64+$R,$rp | ||
| 2340 | |||
| 2341 | or $acc1,$acc0,$acc0 ! see if result is zero | ||
| 2342 | or $acc3,$acc2,$acc2 | ||
| 2343 | or $acc2,$acc0,$acc0 | ||
| 2344 | stx $acc0,[%fp+STACK_BIAS-24] | ||
| 2345 | |||
| 2346 | add %sp,LOCALS64+$Z2sqr,$bp | ||
| 2347 | call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(U1, in1_x, Z2sqr); | ||
| 2348 | add %sp,LOCALS64+$U1,$rp | ||
| 2349 | |||
| 2350 | ldx [%sp+LOCALS64+$Z1sqr],$bi | ||
| 2351 | ldx [%sp+LOCALS64+$in2_x],$a0 | ||
| 2352 | ldx [%sp+LOCALS64+$in2_x+8],$a1 | ||
| 2353 | ldx [%sp+LOCALS64+$in2_x+16],$a2 | ||
| 2354 | ldx [%sp+LOCALS64+$in2_x+24],$a3 | ||
| 2355 | add %sp,LOCALS64+$Z1sqr,$bp | ||
| 2356 | call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(U2, in2_x, Z1sqr); | ||
| 2357 | add %sp,LOCALS64+$U2,$rp | ||
| 2358 | |||
| 2359 | ldx [%sp+LOCALS64+$R],$a0 ! forward load | ||
| 2360 | ldx [%sp+LOCALS64+$R+8],$a1 | ||
| 2361 | ldx [%sp+LOCALS64+$R+16],$a2 | ||
| 2362 | ldx [%sp+LOCALS64+$R+24],$a3 | ||
| 2363 | |||
| 2364 | add %sp,LOCALS64+$U1,$bp | ||
| 2365 | call __ecp_nistz256_sub_from_vis3 ! p256_sub(H, U2, U1); | ||
| 2366 | add %sp,LOCALS64+$H,$rp | ||
| 2367 | |||
| 2368 | or $acc1,$acc0,$acc0 ! see if result is zero | ||
| 2369 | or $acc3,$acc2,$acc2 | ||
| 2370 | orcc $acc2,$acc0,$acc0 | ||
| 2371 | |||
| 2372 | bne,pt %xcc,.Ladd_proceed_vis3 ! is_equal(U1,U2)? | ||
| 2373 | nop | ||
| 2374 | |||
| 2375 | ldx [%fp+STACK_BIAS-8],$t0 | ||
| 2376 | ldx [%fp+STACK_BIAS-16],$t1 | ||
| 2377 | ldx [%fp+STACK_BIAS-24],$t2 | ||
| 2378 | andcc $t0,$t1,%g0 | ||
| 2379 | be,pt %xcc,.Ladd_proceed_vis3 ! (in1infty || in2infty)? | ||
| 2380 | nop | ||
| 2381 | andcc $t2,$t2,%g0 | ||
| 2382 | be,a,pt %xcc,.Ldouble_shortcut_vis3 ! is_equal(S1,S2)? | ||
| 2383 | add %sp,32*(12-10)+32,%sp ! difference in frame sizes | ||
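The three-way branch above, restated as a standalone helper (a sketch with a hypothetical name; the booleans stand in for the or-accumulated limb tests):

sub add_dispatch {
    my ($H_is_zero, $R_is_zero, $in1infty, $in2infty) = @_;
    return "proceed"  if !$H_is_zero;             # U1 != U2: generic addition
    return "proceed"  if $in1infty || $in2infty;  # settled by the final cmovs
    return "double"   if $R_is_zero;              # S1 == S2, so P == Q
    return "infinity";                            # P == -Q: store the zero point
}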
| 2384 | |||
| 2385 | st %g0,[$rp_real] | ||
| 2386 | st %g0,[$rp_real+4] | ||
| 2387 | st %g0,[$rp_real+8] | ||
| 2388 | st %g0,[$rp_real+12] | ||
| 2389 | st %g0,[$rp_real+16] | ||
| 2390 | st %g0,[$rp_real+20] | ||
| 2391 | st %g0,[$rp_real+24] | ||
| 2392 | st %g0,[$rp_real+28] | ||
| 2393 | st %g0,[$rp_real+32] | ||
| 2394 | st %g0,[$rp_real+32+4] | ||
| 2395 | st %g0,[$rp_real+32+8] | ||
| 2396 | st %g0,[$rp_real+32+12] | ||
| 2397 | st %g0,[$rp_real+32+16] | ||
| 2398 | st %g0,[$rp_real+32+20] | ||
| 2399 | st %g0,[$rp_real+32+24] | ||
| 2400 | st %g0,[$rp_real+32+28] | ||
| 2401 | st %g0,[$rp_real+64] | ||
| 2402 | st %g0,[$rp_real+64+4] | ||
| 2403 | st %g0,[$rp_real+64+8] | ||
| 2404 | st %g0,[$rp_real+64+12] | ||
| 2405 | st %g0,[$rp_real+64+16] | ||
| 2406 | st %g0,[$rp_real+64+20] | ||
| 2407 | st %g0,[$rp_real+64+24] | ||
| 2408 | st %g0,[$rp_real+64+28] | ||
| 2409 | b .Ladd_done_vis3 | ||
| 2410 | nop | ||
| 2411 | |||
| 2412 | .align 16 | ||
| 2413 | .Ladd_proceed_vis3: | ||
| 2414 | call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Rsqr, R); | ||
| 2415 | add %sp,LOCALS64+$Rsqr,$rp | ||
| 2416 | |||
| 2417 | ldx [%sp+LOCALS64+$H],$bi | ||
| 2418 | ldx [%sp+LOCALS64+$in1_z],$a0 | ||
| 2419 | ldx [%sp+LOCALS64+$in1_z+8],$a1 | ||
| 2420 | ldx [%sp+LOCALS64+$in1_z+16],$a2 | ||
| 2421 | ldx [%sp+LOCALS64+$in1_z+24],$a3 | ||
| 2422 | add %sp,LOCALS64+$H,$bp | ||
| 2423 | call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(res_z, H, in1_z); | ||
| 2424 | add %sp,LOCALS64+$res_z,$rp | ||
| 2425 | |||
| 2426 | ldx [%sp+LOCALS64+$H],$a0 | ||
| 2427 | ldx [%sp+LOCALS64+$H+8],$a1 | ||
| 2428 | ldx [%sp+LOCALS64+$H+16],$a2 | ||
| 2429 | ldx [%sp+LOCALS64+$H+24],$a3 | ||
| 2430 | call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Hsqr, H); | ||
| 2431 | add %sp,LOCALS64+$Hsqr,$rp | ||
| 2432 | |||
| 2433 | ldx [%sp+LOCALS64+$res_z],$bi | ||
| 2434 | ldx [%sp+LOCALS64+$in2_z],$a0 | ||
| 2435 | ldx [%sp+LOCALS64+$in2_z+8],$a1 | ||
| 2436 | ldx [%sp+LOCALS64+$in2_z+16],$a2 | ||
| 2437 | ldx [%sp+LOCALS64+$in2_z+24],$a3 | ||
| 2438 | add %sp,LOCALS64+$res_z,$bp | ||
| 2439 | call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(res_z, res_z, in2_z); | ||
| 2440 | add %sp,LOCALS64+$res_z,$rp | ||
| 2441 | |||
| 2442 | ldx [%sp+LOCALS64+$H],$bi | ||
| 2443 | ldx [%sp+LOCALS64+$Hsqr],$a0 | ||
| 2444 | ldx [%sp+LOCALS64+$Hsqr+8],$a1 | ||
| 2445 | ldx [%sp+LOCALS64+$Hsqr+16],$a2 | ||
| 2446 | ldx [%sp+LOCALS64+$Hsqr+24],$a3 | ||
| 2447 | add %sp,LOCALS64+$H,$bp | ||
| 2448 | call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(Hcub, Hsqr, H); | ||
| 2449 | add %sp,LOCALS64+$Hcub,$rp | ||
| 2450 | |||
| 2451 | ldx [%sp+LOCALS64+$U1],$bi | ||
| 2452 | ldx [%sp+LOCALS64+$Hsqr],$a0 | ||
| 2453 | ldx [%sp+LOCALS64+$Hsqr+8],$a1 | ||
| 2454 | ldx [%sp+LOCALS64+$Hsqr+16],$a2 | ||
| 2455 | ldx [%sp+LOCALS64+$Hsqr+24],$a3 | ||
| 2456 | add %sp,LOCALS64+$U1,$bp | ||
| 2457 | call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(U2, U1, Hsqr); | ||
| 2458 | add %sp,LOCALS64+$U2,$rp | ||
| 2459 | |||
| 2460 | call __ecp_nistz256_mul_by_2_vis3 ! p256_mul_by_2(Hsqr, U2); | ||
| 2461 | add %sp,LOCALS64+$Hsqr,$rp | ||
| 2462 | |||
| 2463 | add %sp,LOCALS64+$Rsqr,$bp | ||
| 2464 | call __ecp_nistz256_sub_morf_vis3 ! p256_sub(res_x, Rsqr, Hsqr); | ||
| 2465 | add %sp,LOCALS64+$res_x,$rp | ||
| 2466 | |||
| 2467 | add %sp,LOCALS64+$Hcub,$bp | ||
| 2468 | call __ecp_nistz256_sub_from_vis3 ! p256_sub(res_x, res_x, Hcub); | ||
| 2469 | add %sp,LOCALS64+$res_x,$rp | ||
| 2470 | |||
| 2471 | ldx [%sp+LOCALS64+$S1],$bi ! forward load | ||
| 2472 | ldx [%sp+LOCALS64+$Hcub],$a0 | ||
| 2473 | ldx [%sp+LOCALS64+$Hcub+8],$a1 | ||
| 2474 | ldx [%sp+LOCALS64+$Hcub+16],$a2 | ||
| 2475 | ldx [%sp+LOCALS64+$Hcub+24],$a3 | ||
| 2476 | |||
| 2477 | add %sp,LOCALS64+$U2,$bp | ||
| 2478 | call __ecp_nistz256_sub_morf_vis3 ! p256_sub(res_y, U2, res_x); | ||
| 2479 | add %sp,LOCALS64+$res_y,$rp | ||
| 2480 | |||
| 2481 | add %sp,LOCALS64+$S1,$bp | ||
| 2482 | call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S2, S1, Hcub); | ||
| 2483 | add %sp,LOCALS64+$S2,$rp | ||
| 2484 | |||
| 2485 | ldx [%sp+LOCALS64+$R],$bi | ||
| 2486 | ldx [%sp+LOCALS64+$res_y],$a0 | ||
| 2487 | ldx [%sp+LOCALS64+$res_y+8],$a1 | ||
| 2488 | ldx [%sp+LOCALS64+$res_y+16],$a2 | ||
| 2489 | ldx [%sp+LOCALS64+$res_y+24],$a3 | ||
| 2490 | add %sp,LOCALS64+$R,$bp | ||
| 2491 | call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(res_y, res_y, R); | ||
| 2492 | add %sp,LOCALS64+$res_y,$rp | ||
| 2493 | |||
| 2494 | add %sp,LOCALS64+$S2,$bp | ||
| 2495 | call __ecp_nistz256_sub_from_vis3 ! p256_sub(res_y, res_y, S2); | ||
| 2496 | add %sp,LOCALS64+$res_y,$rp | ||
| 2497 | |||
| 2498 | ldx [%fp+STACK_BIAS-16],$t1 ! !in1infty | ||
| 2499 | ldx [%fp+STACK_BIAS-8],$t2 ! !in2infty | ||
| 2500 | ___ | ||
| 2501 | for($i=0;$i<96;$i+=16) { # conditional moves | ||
| 2502 | $code.=<<___; | ||
| 2503 | ldx [%sp+LOCALS64+$res_x+$i],$acc0 ! res | ||
| 2504 | ldx [%sp+LOCALS64+$res_x+$i+8],$acc1 | ||
| 2505 | ldx [%sp+LOCALS64+$in2_x+$i],$acc2 ! in2 | ||
| 2506 | ldx [%sp+LOCALS64+$in2_x+$i+8],$acc3 | ||
| 2507 | ldx [%sp+LOCALS64+$in1_x+$i],$acc4 ! in1 | ||
| 2508 | ldx [%sp+LOCALS64+$in1_x+$i+8],$acc5 | ||
| 2509 | movrz $t1,$acc2,$acc0 | ||
| 2510 | movrz $t1,$acc3,$acc1 | ||
| 2511 | movrz $t2,$acc4,$acc0 | ||
| 2512 | movrz $t2,$acc5,$acc1 | ||
| 2513 | srlx $acc0,32,$acc2 | ||
| 2514 | srlx $acc1,32,$acc3 | ||
| 2515 | st $acc0,[$rp_real+$i] | ||
| 2516 | st $acc2,[$rp_real+$i+4] | ||
| 2517 | st $acc1,[$rp_real+$i+8] | ||
| 2518 | st $acc3,[$rp_real+$i+12] | ||
| 2519 | ___ | ||
| 2520 | } | ||
| 2521 | $code.=<<___; | ||
| 2522 | .Ladd_done_vis3: | ||
| 2523 | ret | ||
| 2524 | restore | ||
| 2525 | .type ecp_nistz256_point_add_vis3,#function | ||
| 2526 | .size ecp_nistz256_point_add_vis3,.-ecp_nistz256_point_add_vis3 | ||
| 2527 | ___ | ||
| 2528 | } | ||
| 2529 | ######################################################################## | ||
| 2530 | # void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1, | ||
| 2531 | # const P256_POINT_AFFINE *in2); | ||
| 2532 | { | ||
| 2533 | my ($res_x,$res_y,$res_z, | ||
| 2534 | $in1_x,$in1_y,$in1_z, | ||
| 2535 | $in2_x,$in2_y, | ||
| 2536 | $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..14)); | ||
| 2537 | my $Z1sqr = $S2; | ||
| 2538 | # above map() describes stack layout with 15 temporary | ||
| 2539 | # 256-bit vectors on top. Then we reserve some space for | ||
| 2540 | # !in1infty and !in2infty. | ||
| 2541 | |||
| 2542 | $code.=<<___; | ||
| 2543 | .align 32 | ||
| 2544 | ecp_nistz256_point_add_affine_vis3: | ||
| 2545 | save %sp,-STACK64_FRAME-32*15-32,%sp | ||
| 2546 | |||
| 2547 | mov $rp,$rp_real | ||
| 2548 | mov -1,$minus1 | ||
| 2549 | mov -2,$poly3 | ||
| 2550 | sllx $minus1,32,$poly1 ! 0xFFFFFFFF00000000 | ||
| 2551 | srl $poly3,0,$poly3 ! 0x00000000FFFFFFFE | ||
| 2552 | |||
| 2553 | ! convert input to uint64_t[4] | ||
| 2554 | ld [$bp],$a0 ! in2_x | ||
| 2555 | ld [$bp+4],$t0 | ||
| 2556 | ld [$bp+8],$a1 | ||
| 2557 | ld [$bp+12],$t1 | ||
| 2558 | ld [$bp+16],$a2 | ||
| 2559 | ld [$bp+20],$t2 | ||
| 2560 | ld [$bp+24],$a3 | ||
| 2561 | ld [$bp+28],$t3 | ||
| 2562 | sllx $t0,32,$t0 | ||
| 2563 | sllx $t1,32,$t1 | ||
| 2564 | ld [$bp+32],$acc0 ! in2_y | ||
| 2565 | or $a0,$t0,$a0 | ||
| 2566 | ld [$bp+32+4],$t0 | ||
| 2567 | sllx $t2,32,$t2 | ||
| 2568 | ld [$bp+32+8],$acc1 | ||
| 2569 | or $a1,$t1,$a1 | ||
| 2570 | ld [$bp+32+12],$t1 | ||
| 2571 | sllx $t3,32,$t3 | ||
| 2572 | ld [$bp+32+16],$acc2 | ||
| 2573 | or $a2,$t2,$a2 | ||
| 2574 | ld [$bp+32+20],$t2 | ||
| 2575 | or $a3,$t3,$a3 | ||
| 2576 | ld [$bp+32+24],$acc3 | ||
| 2577 | sllx $t0,32,$t0 | ||
| 2578 | ld [$bp+32+28],$t3 | ||
| 2579 | sllx $t1,32,$t1 | ||
| 2580 | stx $a0,[%sp+LOCALS64+$in2_x] | ||
| 2581 | sllx $t2,32,$t2 | ||
| 2582 | stx $a1,[%sp+LOCALS64+$in2_x+8] | ||
| 2583 | sllx $t3,32,$t3 | ||
| 2584 | stx $a2,[%sp+LOCALS64+$in2_x+16] | ||
| 2585 | or $acc0,$t0,$acc0 | ||
| 2586 | stx $a3,[%sp+LOCALS64+$in2_x+24] | ||
| 2587 | or $acc1,$t1,$acc1 | ||
| 2588 | stx $acc0,[%sp+LOCALS64+$in2_y] | ||
| 2589 | or $acc2,$t2,$acc2 | ||
| 2590 | stx $acc1,[%sp+LOCALS64+$in2_y+8] | ||
| 2591 | or $acc3,$t3,$acc3 | ||
| 2592 | stx $acc2,[%sp+LOCALS64+$in2_y+16] | ||
| 2593 | stx $acc3,[%sp+LOCALS64+$in2_y+24] | ||
| 2594 | |||
| 2595 | or $a1,$a0,$a0 | ||
| 2596 | or $a3,$a2,$a2 | ||
| 2597 | or $acc1,$acc0,$acc0 | ||
| 2598 | or $acc3,$acc2,$acc2 | ||
| 2599 | or $a2,$a0,$a0 | ||
| 2600 | or $acc2,$acc0,$acc0 | ||
| 2601 | or $acc0,$a0,$a0 | ||
| 2602 | movrnz $a0,-1,$a0 ! !in2infty | ||
| 2603 | stx $a0,[%fp+STACK_BIAS-8] | ||
| 2604 | |||
| 2605 | ld [$ap],$a0 ! in1_x | ||
| 2606 | ld [$ap+4],$t0 | ||
| 2607 | ld [$ap+8],$a1 | ||
| 2608 | ld [$ap+12],$t1 | ||
| 2609 | ld [$ap+16],$a2 | ||
| 2610 | ld [$ap+20],$t2 | ||
| 2611 | ld [$ap+24],$a3 | ||
| 2612 | ld [$ap+28],$t3 | ||
| 2613 | sllx $t0,32,$t0 | ||
| 2614 | sllx $t1,32,$t1 | ||
| 2615 | ld [$ap+32],$acc0 ! in1_y | ||
| 2616 | or $a0,$t0,$a0 | ||
| 2617 | ld [$ap+32+4],$t0 | ||
| 2618 | sllx $t2,32,$t2 | ||
| 2619 | ld [$ap+32+8],$acc1 | ||
| 2620 | or $a1,$t1,$a1 | ||
| 2621 | ld [$ap+32+12],$t1 | ||
| 2622 | sllx $t3,32,$t3 | ||
| 2623 | ld [$ap+32+16],$acc2 | ||
| 2624 | or $a2,$t2,$a2 | ||
| 2625 | ld [$ap+32+20],$t2 | ||
| 2626 | or $a3,$t3,$a3 | ||
| 2627 | ld [$ap+32+24],$acc3 | ||
| 2628 | sllx $t0,32,$t0 | ||
| 2629 | ld [$ap+32+28],$t3 | ||
| 2630 | sllx $t1,32,$t1 | ||
| 2631 | stx $a0,[%sp+LOCALS64+$in1_x] | ||
| 2632 | sllx $t2,32,$t2 | ||
| 2633 | stx $a1,[%sp+LOCALS64+$in1_x+8] | ||
| 2634 | sllx $t3,32,$t3 | ||
| 2635 | stx $a2,[%sp+LOCALS64+$in1_x+16] | ||
| 2636 | or $acc0,$t0,$acc0 | ||
| 2637 | stx $a3,[%sp+LOCALS64+$in1_x+24] | ||
| 2638 | or $acc1,$t1,$acc1 | ||
| 2639 | stx $acc0,[%sp+LOCALS64+$in1_y] | ||
| 2640 | or $acc2,$t2,$acc2 | ||
| 2641 | stx $acc1,[%sp+LOCALS64+$in1_y+8] | ||
| 2642 | or $acc3,$t3,$acc3 | ||
| 2643 | stx $acc2,[%sp+LOCALS64+$in1_y+16] | ||
| 2644 | stx $acc3,[%sp+LOCALS64+$in1_y+24] | ||
| 2645 | |||
| 2646 | ld [$ap+64],$a0 ! in1_z | ||
| 2647 | ld [$ap+64+4],$t0 | ||
| 2648 | ld [$ap+64+8],$a1 | ||
| 2649 | ld [$ap+64+12],$t1 | ||
| 2650 | ld [$ap+64+16],$a2 | ||
| 2651 | ld [$ap+64+20],$t2 | ||
| 2652 | ld [$ap+64+24],$a3 | ||
| 2653 | ld [$ap+64+28],$t3 | ||
| 2654 | sllx $t0,32,$t0 | ||
| 2655 | sllx $t1,32,$t1 | ||
| 2656 | or $a0,$t0,$a0 | ||
| 2657 | sllx $t2,32,$t2 | ||
| 2658 | or $a1,$t1,$a1 | ||
| 2659 | sllx $t3,32,$t3 | ||
| 2660 | stx $a0,[%sp+LOCALS64+$in1_z] | ||
| 2661 | or $a2,$t2,$a2 | ||
| 2662 | stx $a1,[%sp+LOCALS64+$in1_z+8] | ||
| 2663 | or $a3,$t3,$a3 | ||
| 2664 | stx $a2,[%sp+LOCALS64+$in1_z+16] | ||
| 2665 | stx $a3,[%sp+LOCALS64+$in1_z+24] | ||
| 2666 | |||
| 2667 | or $a1,$a0,$t0 | ||
| 2668 | or $a3,$a2,$t2 | ||
| 2669 | or $t2,$t0,$t0 | ||
| 2670 | movrnz $t0,-1,$t0 ! !in1infty | ||
| 2671 | stx $t0,[%fp+STACK_BIAS-16] | ||
| 2672 | |||
| 2673 | call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Z1sqr, in1_z); | ||
| 2674 | add %sp,LOCALS64+$Z1sqr,$rp | ||
| 2675 | |||
| 2676 | ldx [%sp+LOCALS64+$in2_x],$bi | ||
| 2677 | mov $acc0,$a0 | ||
| 2678 | mov $acc1,$a1 | ||
| 2679 | mov $acc2,$a2 | ||
| 2680 | mov $acc3,$a3 | ||
| 2681 | add %sp,LOCALS64+$in2_x,$bp | ||
| 2682 | call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(U2, Z1sqr, in2_x); | ||
| 2683 | add %sp,LOCALS64+$U2,$rp | ||
| 2684 | |||
| 2685 | ldx [%sp+LOCALS64+$Z1sqr],$bi ! forward load | ||
| 2686 | ldx [%sp+LOCALS64+$in1_z],$a0 | ||
| 2687 | ldx [%sp+LOCALS64+$in1_z+8],$a1 | ||
| 2688 | ldx [%sp+LOCALS64+$in1_z+16],$a2 | ||
| 2689 | ldx [%sp+LOCALS64+$in1_z+24],$a3 | ||
| 2690 | |||
| 2691 | add %sp,LOCALS64+$in1_x,$bp | ||
| 2692 | call __ecp_nistz256_sub_from_vis3 ! p256_sub(H, U2, in1_x); | ||
| 2693 | add %sp,LOCALS64+$H,$rp | ||
| 2694 | |||
| 2695 | add %sp,LOCALS64+$Z1sqr,$bp | ||
| 2696 | call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S2, Z1sqr, in1_z); | ||
| 2697 | add %sp,LOCALS64+$S2,$rp | ||
| 2698 | |||
| 2699 | ldx [%sp+LOCALS64+$H],$bi | ||
| 2700 | ldx [%sp+LOCALS64+$in1_z],$a0 | ||
| 2701 | ldx [%sp+LOCALS64+$in1_z+8],$a1 | ||
| 2702 | ldx [%sp+LOCALS64+$in1_z+16],$a2 | ||
| 2703 | ldx [%sp+LOCALS64+$in1_z+24],$a3 | ||
| 2704 | add %sp,LOCALS64+$H,$bp | ||
| 2705 | call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(res_z, H, in1_z); | ||
| 2706 | add %sp,LOCALS64+$res_z,$rp | ||
| 2707 | |||
| 2708 | ldx [%sp+LOCALS64+$S2],$bi | ||
| 2709 | ldx [%sp+LOCALS64+$in2_y],$a0 | ||
| 2710 | ldx [%sp+LOCALS64+$in2_y+8],$a1 | ||
| 2711 | ldx [%sp+LOCALS64+$in2_y+16],$a2 | ||
| 2712 | ldx [%sp+LOCALS64+$in2_y+24],$a3 | ||
| 2713 | add %sp,LOCALS64+$S2,$bp | ||
| 2714 | call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S2, S2, in2_y); | ||
| 2715 | add %sp,LOCALS64+$S2,$rp | ||
| 2716 | |||
| 2717 | ldx [%sp+LOCALS64+$H],$a0 ! forward load | ||
| 2718 | ldx [%sp+LOCALS64+$H+8],$a1 | ||
| 2719 | ldx [%sp+LOCALS64+$H+16],$a2 | ||
| 2720 | ldx [%sp+LOCALS64+$H+24],$a3 | ||
| 2721 | |||
| 2722 | add %sp,LOCALS64+$in1_y,$bp | ||
| 2723 | call __ecp_nistz256_sub_from_vis3 ! p256_sub(R, S2, in1_y); | ||
| 2724 | add %sp,LOCALS64+$R,$rp | ||
| 2725 | |||
| 2726 | call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Hsqr, H); | ||
| 2727 | add %sp,LOCALS64+$Hsqr,$rp | ||
| 2728 | |||
| 2729 | ldx [%sp+LOCALS64+$R],$a0 | ||
| 2730 | ldx [%sp+LOCALS64+$R+8],$a1 | ||
| 2731 | ldx [%sp+LOCALS64+$R+16],$a2 | ||
| 2732 | ldx [%sp+LOCALS64+$R+24],$a3 | ||
| 2733 | call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Rsqr, R); | ||
| 2734 | add %sp,LOCALS64+$Rsqr,$rp | ||
| 2735 | |||
| 2736 | ldx [%sp+LOCALS64+$H],$bi | ||
| 2737 | ldx [%sp+LOCALS64+$Hsqr],$a0 | ||
| 2738 | ldx [%sp+LOCALS64+$Hsqr+8],$a1 | ||
| 2739 | ldx [%sp+LOCALS64+$Hsqr+16],$a2 | ||
| 2740 | ldx [%sp+LOCALS64+$Hsqr+24],$a3 | ||
| 2741 | add %sp,LOCALS64+$H,$bp | ||
| 2742 | call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(Hcub, Hsqr, H); | ||
| 2743 | add %sp,LOCALS64+$Hcub,$rp | ||
| 2744 | |||
| 2745 | ldx [%sp+LOCALS64+$Hsqr],$bi | ||
| 2746 | ldx [%sp+LOCALS64+$in1_x],$a0 | ||
| 2747 | ldx [%sp+LOCALS64+$in1_x+8],$a1 | ||
| 2748 | ldx [%sp+LOCALS64+$in1_x+16],$a2 | ||
| 2749 | ldx [%sp+LOCALS64+$in1_x+24],$a3 | ||
| 2750 | add %sp,LOCALS64+$Hsqr,$bp | ||
| 2751 | call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(U2, in1_x, Hsqr); | ||
| 2752 | add %sp,LOCALS64+$U2,$rp | ||
| 2753 | |||
| 2754 | call __ecp_nistz256_mul_by_2_vis3 ! p256_mul_by_2(Hsqr, U2); | ||
| 2755 | add %sp,LOCALS64+$Hsqr,$rp | ||
| 2756 | |||
| 2757 | add %sp,LOCALS64+$Rsqr,$bp | ||
| 2758 | call __ecp_nistz256_sub_morf_vis3 ! p256_sub(res_x, Rsqr, Hsqr); | ||
| 2759 | add %sp,LOCALS64+$res_x,$rp | ||
| 2760 | |||
| 2761 | add %sp,LOCALS64+$Hcub,$bp | ||
| 2762 | call __ecp_nistz256_sub_from_vis3 ! p256_sub(res_x, res_x, Hcub); | ||
| 2763 | add %sp,LOCALS64+$res_x,$rp | ||
| 2764 | |||
| 2765 | ldx [%sp+LOCALS64+$Hcub],$bi ! forward load | ||
| 2766 | ldx [%sp+LOCALS64+$in1_y],$a0 | ||
| 2767 | ldx [%sp+LOCALS64+$in1_y+8],$a1 | ||
| 2768 | ldx [%sp+LOCALS64+$in1_y+16],$a2 | ||
| 2769 | ldx [%sp+LOCALS64+$in1_y+24],$a3 | ||
| 2770 | |||
| 2771 | add %sp,LOCALS64+$U2,$bp | ||
| 2772 | call __ecp_nistz256_sub_morf_vis3 ! p256_sub(res_y, U2, res_x); | ||
| 2773 | add %sp,LOCALS64+$res_y,$rp | ||
| 2774 | |||
| 2775 | add %sp,LOCALS64+$Hcub,$bp | ||
| 2776 | call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S2, in1_y, Hcub); | ||
| 2777 | add %sp,LOCALS64+$S2,$rp | ||
| 2778 | |||
| 2779 | ldx [%sp+LOCALS64+$R],$bi | ||
| 2780 | ldx [%sp+LOCALS64+$res_y],$a0 | ||
| 2781 | ldx [%sp+LOCALS64+$res_y+8],$a1 | ||
| 2782 | ldx [%sp+LOCALS64+$res_y+16],$a2 | ||
| 2783 | ldx [%sp+LOCALS64+$res_y+24],$a3 | ||
| 2784 | add %sp,LOCALS64+$R,$bp | ||
| 2785 | call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(res_y, res_y, R); | ||
| 2786 | add %sp,LOCALS64+$res_y,$rp | ||
| 2787 | |||
| 2788 | add %sp,LOCALS64+$S2,$bp | ||
| 2789 | call __ecp_nistz256_sub_from_vis3 ! p256_sub(res_y, res_y, S2); | ||
| 2790 | add %sp,LOCALS64+$res_y,$rp | ||
| 2791 | |||
| 2792 | ldx [%fp+STACK_BIAS-16],$t1 ! !in1infty | ||
| 2793 | ldx [%fp+STACK_BIAS-8],$t2 ! !in2infty | ||
| 2794 | 1: call .+8 | ||
| 2795 | add %o7,.Lone_mont_vis3-1b,$bp | ||
| 2796 | ___ | ||
| 2797 | for($i=0;$i<64;$i+=16) { # conditional moves | ||
| 2798 | $code.=<<___; | ||
| 2799 | ldx [%sp+LOCALS64+$res_x+$i],$acc0 ! res | ||
| 2800 | ldx [%sp+LOCALS64+$res_x+$i+8],$acc1 | ||
| 2801 | ldx [%sp+LOCALS64+$in2_x+$i],$acc2 ! in2 | ||
| 2802 | ldx [%sp+LOCALS64+$in2_x+$i+8],$acc3 | ||
| 2803 | ldx [%sp+LOCALS64+$in1_x+$i],$acc4 ! in1 | ||
| 2804 | ldx [%sp+LOCALS64+$in1_x+$i+8],$acc5 | ||
| 2805 | movrz $t1,$acc2,$acc0 | ||
| 2806 | movrz $t1,$acc3,$acc1 | ||
| 2807 | movrz $t2,$acc4,$acc0 | ||
| 2808 | movrz $t2,$acc5,$acc1 | ||
| 2809 | srlx $acc0,32,$acc2 | ||
| 2810 | srlx $acc1,32,$acc3 | ||
| 2811 | st $acc0,[$rp_real+$i] | ||
| 2812 | st $acc2,[$rp_real+$i+4] | ||
| 2813 | st $acc1,[$rp_real+$i+8] | ||
| 2814 | st $acc3,[$rp_real+$i+12] | ||
| 2815 | ___ | ||
| 2816 | } | ||
| 2817 | for(;$i<96;$i+=16) { | ||
| 2818 | $code.=<<___; | ||
| 2819 | ldx [%sp+LOCALS64+$res_x+$i],$acc0 ! res | ||
| 2820 | ldx [%sp+LOCALS64+$res_x+$i+8],$acc1 | ||
| 2821 | ldx [$bp+$i-64],$acc2 ! "in2" | ||
| 2822 | ldx [$bp+$i-64+8],$acc3 | ||
| 2823 | ldx [%sp+LOCALS64+$in1_x+$i],$acc4 ! in1 | ||
| 2824 | ldx [%sp+LOCALS64+$in1_x+$i+8],$acc5 | ||
| 2825 | movrz $t1,$acc2,$acc0 | ||
| 2826 | movrz $t1,$acc3,$acc1 | ||
| 2827 | movrz $t2,$acc4,$acc0 | ||
| 2828 | movrz $t2,$acc5,$acc1 | ||
| 2829 | srlx $acc0,32,$acc2 | ||
| 2830 | srlx $acc1,32,$acc3 | ||
| 2831 | st $acc0,[$rp_real+$i] | ||
| 2832 | st $acc2,[$rp_real+$i+4] | ||
| 2833 | st $acc1,[$rp_real+$i+8] | ||
| 2834 | st $acc3,[$rp_real+$i+12] | ||
| 2835 | ___ | ||
| 2836 | } | ||
| 2837 | $code.=<<___; | ||
| 2838 | ret | ||
| 2839 | restore | ||
| 2840 | .type ecp_nistz256_point_add_affine_vis3,#function | ||
| 2841 | .size ecp_nistz256_point_add_affine_vis3,.-ecp_nistz256_point_add_affine_vis3 | ||
| 2842 | .align 64 | ||
| 2843 | .Lone_mont_vis3: | ||
| 2844 | .long 0x00000000,0x00000001, 0xffffffff,0x00000000 | ||
| 2845 | .long 0xffffffff,0xffffffff, 0x00000000,0xfffffffe | ||
| 2846 | .align 64 | ||
| 2847 | #endif | ||
| 2848 | ___ | ||
| 2849 | } }}} | ||
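For reference, the .Lone_mont_vis3 table just above is the number 1 in Montgomery representation, i.e. R mod p with R = 2^256 over the NIST P-256 prime. A minimal sketch that double-checks the constant (plain perl with the core Math::BigInt module):

    use Math::BigInt;
    # p = 2^256 - 2^224 + 2^192 + 2^96 - 1, the NIST P-256 prime
    my $p = Math::BigInt->from_hex(
        '0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff');
    my $R = Math::BigInt->new(2)->bpow(256);
    print $R->bmod($p)->as_hex(), "\n";
    # 0xfffffffeffffffffffffffffffffffff000000000000000000000001, i.e. the
    # little-endian 64-bit limbs 0x1, 0xffffffff00000000, 0xffffffffffffffff,
    # 0xfffffffe -- matching the .long data (SPARC is big-endian, so each
    # pair of .long words forms one 64-bit limb)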
| 2850 | |||
| 2851 | # The purpose of these subroutines is to explicitly encode VIS instructions, | ||
| 2852 | # so that one can compile the module without having to specify VIS | ||
| 2853 | # extensions on the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a. | ||
| 2854 | # The idea is to reserve the option of producing a "universal" binary and | ||
| 2855 | # let the programmer detect at run-time whether the current CPU is VIS-capable. | ||
| 2856 | sub unvis3 { | ||
| 2857 | my ($mnemonic,$rs1,$rs2,$rd)=@_; | ||
| 2858 | my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 ); | ||
| 2859 | my ($ref,$opf); | ||
| 2860 | my %visopf = ( "addxc" => 0x011, | ||
| 2861 | "addxccc" => 0x013, | ||
| 2862 | "umulxhi" => 0x016 ); | ||
| 2863 | |||
| 2864 | $ref = "$mnemonic\t$rs1,$rs2,$rd"; | ||
| 2865 | |||
| 2866 | if ($opf=$visopf{$mnemonic}) { | ||
| 2867 | foreach ($rs1,$rs2,$rd) { | ||
| 2868 | return $ref if (!/%([goli])([0-9])/); | ||
| 2869 | $_=$bias{$1}+$2; | ||
| 2870 | } | ||
| 2871 | |||
| 2872 | return sprintf ".word\t0x%08x !%s", | ||
| 2873 | 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2, | ||
| 2874 | $ref; | ||
| 2875 | } else { | ||
| 2876 | return $ref; | ||
| 2877 | } | ||
| 2878 | } | ||
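To make the encoding concrete: for addxc %o0,%o1,%o2 (opf 0x011, and per the bias table above %o0=8, %o1=9, %o2=10), the formula in unvis3() produces the word below; a one-liner sketch reproducing it:

    my ($rd, $rs1, $opf, $rs2) = (10, 8, 0x011, 9);    # addxc %o0,%o1,%o2
    printf ".word\t0x%08x\t! addxc %%o0,%%o1,%%o2\n",
        0x81b00000 | $rd<<25 | $rs1<<14 | $opf<<5 | $rs2;
    # prints: .word 0x95b20229 ! addxc %o0,%o1,%o2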
| 2879 | |||
| 2880 | foreach (split("\n",$code)) { | ||
| 2881 | s/\`([^\`]*)\`/eval $1/ge; | ||
| 2882 | |||
| 2883 | s/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/ | ||
| 2884 | &unvis3($1,$2,$3,$4) | ||
| 2885 | /ge; | ||
| 2886 | |||
| 2887 | print $_,"\n"; | ||
| 2888 | } | ||
| 2889 | |||
| 2890 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/ec/asm/ecp_nistz256-x86.pl b/src/lib/libcrypto/ec/asm/ecp_nistz256-x86.pl deleted file mode 100644 index 085d637e5d..0000000000 --- a/src/lib/libcrypto/ec/asm/ecp_nistz256-x86.pl +++ /dev/null | |||
| @@ -1,1740 +0,0 @@ | |||
| 1 | #! /usr/bin/env perl | ||
| 2 | # $OpenBSD: ecp_nistz256-x86.pl,v 1.1 2016/11/04 17:33:20 miod Exp $ | ||
| 3 | # | ||
| 4 | # Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved. | ||
| 5 | # | ||
| 6 | # Licensed under the OpenSSL license (the "License"). You may not use | ||
| 7 | # this file except in compliance with the License. You can obtain a copy | ||
| 8 | # in the file LICENSE in the source distribution or at | ||
| 9 | # https://www.openssl.org/source/license.html | ||
| 10 | |||
| 11 | |||
| 12 | # ==================================================================== | ||
| 13 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
| 14 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 15 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 16 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 17 | # ==================================================================== | ||
| 18 | # | ||
| 19 | # ECP_NISTZ256 module for x86/SSE2. | ||
| 20 | # | ||
| 21 | # October 2014. | ||
| 22 | # | ||
| 23 | # Original ECP_NISTZ256 submission targeting x86_64 is detailed in | ||
| 24 | # http://eprint.iacr.org/2013/816. In the process of adaptation | ||
| 25 | # original .c module was made 32-bit savvy in order to make this | ||
| 26 | # implementation possible. | ||
| 27 | # | ||
| 28 | # with/without -DECP_NISTZ256_ASM | ||
| 29 | # Pentium +66-163% | ||
| 30 | # PIII +72-172% | ||
| 31 | # P4 +65-132% | ||
| 32 | # Core2 +90-215% | ||
| 33 | # Sandy Bridge +105-265% (contemporary i[57]-* are all close to this) | ||
| 34 | # Atom +65-155% | ||
| 35 | # Opteron +54-110% | ||
| 36 | # Bulldozer +99-240% | ||
| 37 | # VIA Nano +93-290% | ||
| 38 | # | ||
| 39 | # Ranges denote minimum and maximum improvement coefficients depending | ||
| 40 | # on benchmark. Lower coefficients are for ECDSA sign, server-side | ||
| 41 | # operation. Keep in mind that +200% means 3x improvement. | ||
| 42 | |||
| 43 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
| 44 | push(@INC,"${dir}","${dir}../../perlasm"); | ||
| 45 | require "x86asm.pl"; | ||
| 46 | |||
| 47 | # Uncomment when all i386 assembly generators are updated to take the output | ||
| 48 | # file as last argument... | ||
| 49 | # $output=pop; | ||
| 50 | # open STDOUT,">$output"; | ||
| 51 | |||
| 52 | &asm_init($ARGV[0],"ecp_nistz256-x86.pl",$ARGV[$#ARGV] eq "386"); | ||
| 53 | |||
| 54 | $sse2=0; | ||
| 55 | for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } | ||
| 56 | |||
| 57 | &external_label("OPENSSL_ia32cap_P") if ($sse2); | ||
| 58 | |||
| 59 | |||
| 60 | ######################################################################## | ||
| 61 | # Keep in mind that constants are stored least to most significant word | ||
| 62 | &static_label("ONE"); | ||
| 63 | &set_label("ONE",64); | ||
| 64 | &data_word(1,0,0,0,0,0,0,0); | ||
| 65 | &align(64); | ||
| 66 | |||
| 67 | ######################################################################## | ||
| 68 | # void ecp_nistz256_mul_by_2(BN_ULONG edi[8],const BN_ULONG esi[8]); | ||
| 69 | &function_begin("ecp_nistz256_mul_by_2"); | ||
| 70 | &mov ("esi",&wparam(1)); | ||
| 71 | &mov ("edi",&wparam(0)); | ||
| 72 | &mov ("ebp","esi"); | ||
| 73 | ######################################################################## | ||
| 74 | # common pattern for internal functions is that %edi is result pointer, | ||
| 75 | # %esi and %ebp are input ones, %ebp being optional. %edi is preserved. | ||
| 76 | &call ("_ecp_nistz256_add"); | ||
| 77 | &function_end("ecp_nistz256_mul_by_2"); | ||
| 78 | |||
| 79 | ######################################################################## | ||
| 80 | # void ecp_nistz256_div_by_2(BN_ULONG edi[8],const BN_ULONG esi[8]); | ||
| 81 | &function_begin("ecp_nistz256_div_by_2"); | ||
| 82 | &mov ("esi",&wparam(1)); | ||
| 83 | &mov ("edi",&wparam(0)); | ||
| 84 | &call ("_ecp_nistz256_div_by_2"); | ||
| 85 | &function_end("ecp_nistz256_div_by_2"); | ||
| 86 | |||
| 87 | &function_begin_B("_ecp_nistz256_div_by_2"); | ||
| 88 | # tmp = a is odd ? a+mod : a | ||
| 89 | # | ||
| 90 | # note that because mod has special form, i.e. consists of | ||
| 91 | # 0xffffffff, 1 and 0s, we can conditionally synthesize it by | ||
| 92 | # assigning least significant bit of input to one register, | ||
| 93 | # %ebp, and its negative to another, %edx. | ||
| 94 | |||
| 95 | &mov ("ebp",&DWP(0,"esi")); | ||
| 96 | &xor ("edx","edx"); | ||
| 97 | &mov ("ebx",&DWP(4,"esi")); | ||
| 98 | &mov ("eax","ebp"); | ||
| 99 | &and ("ebp",1); | ||
| 100 | &mov ("ecx",&DWP(8,"esi")); | ||
| 101 | &sub ("edx","ebp"); | ||
| 102 | |||
| 103 | &add ("eax","edx"); | ||
| 104 | &adc ("ebx","edx"); | ||
| 105 | &mov (&DWP(0,"edi"),"eax"); | ||
| 106 | &adc ("ecx","edx"); | ||
| 107 | &mov (&DWP(4,"edi"),"ebx"); | ||
| 108 | &mov (&DWP(8,"edi"),"ecx"); | ||
| 109 | |||
| 110 | &mov ("eax",&DWP(12,"esi")); | ||
| 111 | &mov ("ebx",&DWP(16,"esi")); | ||
| 112 | &adc ("eax",0); | ||
| 113 | &mov ("ecx",&DWP(20,"esi")); | ||
| 114 | &adc ("ebx",0); | ||
| 115 | &mov (&DWP(12,"edi"),"eax"); | ||
| 116 | &adc ("ecx",0); | ||
| 117 | &mov (&DWP(16,"edi"),"ebx"); | ||
| 118 | &mov (&DWP(20,"edi"),"ecx"); | ||
| 119 | |||
| 120 | &mov ("eax",&DWP(24,"esi")); | ||
| 121 | &mov ("ebx",&DWP(28,"esi")); | ||
| 122 | &adc ("eax","ebp"); | ||
| 123 | &adc ("ebx","edx"); | ||
| 124 | &mov (&DWP(24,"edi"),"eax"); | ||
| 125 | &sbb ("esi","esi"); # broadcast carry bit | ||
| 126 | &mov (&DWP(28,"edi"),"ebx"); | ||
| 127 | |||
| 128 | # ret = tmp >> 1 | ||
| 129 | |||
| 130 | &mov ("eax",&DWP(0,"edi")); | ||
| 131 | &mov ("ebx",&DWP(4,"edi")); | ||
| 132 | &mov ("ecx",&DWP(8,"edi")); | ||
| 133 | &mov ("edx",&DWP(12,"edi")); | ||
| 134 | |||
| 135 | &shr ("eax",1); | ||
| 136 | &mov ("ebp","ebx"); | ||
| 137 | &shl ("ebx",31); | ||
| 138 | &or ("eax","ebx"); | ||
| 139 | |||
| 140 | &shr ("ebp",1); | ||
| 141 | &mov ("ebx","ecx"); | ||
| 142 | &shl ("ecx",31); | ||
| 143 | &mov (&DWP(0,"edi"),"eax"); | ||
| 144 | &or ("ebp","ecx"); | ||
| 145 | &mov ("eax",&DWP(16,"edi")); | ||
| 146 | |||
| 147 | &shr ("ebx",1); | ||
| 148 | &mov ("ecx","edx"); | ||
| 149 | &shl ("edx",31); | ||
| 150 | &mov (&DWP(4,"edi"),"ebp"); | ||
| 151 | &or ("ebx","edx"); | ||
| 152 | &mov ("ebp",&DWP(20,"edi")); | ||
| 153 | |||
| 154 | &shr ("ecx",1); | ||
| 155 | &mov ("edx","eax"); | ||
| 156 | &shl ("eax",31); | ||
| 157 | &mov (&DWP(8,"edi"),"ebx"); | ||
| 158 | &or ("ecx","eax"); | ||
| 159 | &mov ("ebx",&DWP(24,"edi")); | ||
| 160 | |||
| 161 | &shr ("edx",1); | ||
| 162 | &mov ("eax","ebp"); | ||
| 163 | &shl ("ebp",31); | ||
| 164 | &mov (&DWP(12,"edi"),"ecx"); | ||
| 165 | &or ("edx","ebp"); | ||
| 166 | &mov ("ecx",&DWP(28,"edi")); | ||
| 167 | |||
| 168 | &shr ("eax",1); | ||
| 169 | &mov ("ebp","ebx"); | ||
| 170 | &shl ("ebx",31); | ||
| 171 | &mov (&DWP(16,"edi"),"edx"); | ||
| 172 | &or ("eax","ebx"); | ||
| 173 | |||
| 174 | &shr ("ebp",1); | ||
| 175 | &mov ("ebx","ecx"); | ||
| 176 | &shl ("ecx",31); | ||
| 177 | &mov (&DWP(20,"edi"),"eax"); | ||
| 178 | &or ("ebp","ecx"); | ||
| 179 | |||
| 180 | &shr ("ebx",1); | ||
| 181 | &shl ("esi",31); | ||
| 182 | &mov (&DWP(24,"edi"),"ebp"); | ||
| 183 | &or ("ebx","esi"); # handle top-most carry bit | ||
| 184 | &mov (&DWP(28,"edi"),"ebx"); | ||
| 185 | |||
| 186 | &ret (); | ||
| 187 | &function_end_B("_ecp_nistz256_div_by_2"); | ||
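The register code above is a branch-free, constant-time rendering of a two-step bignum identity: since p is odd, exactly one of a and a+p is even, so halving mod p is "add p if odd, then shift right". A reference sketch (mod_div_by_2 is an illustrative name, not part of this file):

    use Math::BigInt;
    sub mod_div_by_2 {
        my ($a, $p) = @_;              # expects 0 <= a < p, p odd
        $a = $a->copy();
        $a->badd($p) if $a->is_odd();  # a+p is even whenever a is odd
        return $a->brsft(1);           # exact shift, no bit is lost
    }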
| 188 | |||
| 189 | ######################################################################## | ||
| 190 | # void ecp_nistz256_add(BN_ULONG edi[8],const BN_ULONG esi[8], | ||
| 191 | # const BN_ULONG ebp[8]); | ||
| 192 | &function_begin("ecp_nistz256_add"); | ||
| 193 | &mov ("esi",&wparam(1)); | ||
| 194 | &mov ("ebp",&wparam(2)); | ||
| 195 | &mov ("edi",&wparam(0)); | ||
| 196 | &call ("_ecp_nistz256_add"); | ||
| 197 | &function_end("ecp_nistz256_add"); | ||
| 198 | |||
| 199 | &function_begin_B("_ecp_nistz256_add"); | ||
| 200 | &mov ("eax",&DWP(0,"esi")); | ||
| 201 | &mov ("ebx",&DWP(4,"esi")); | ||
| 202 | &mov ("ecx",&DWP(8,"esi")); | ||
| 203 | &add ("eax",&DWP(0,"ebp")); | ||
| 204 | &mov ("edx",&DWP(12,"esi")); | ||
| 205 | &adc ("ebx",&DWP(4,"ebp")); | ||
| 206 | &mov (&DWP(0,"edi"),"eax"); | ||
| 207 | &adc ("ecx",&DWP(8,"ebp")); | ||
| 208 | &mov (&DWP(4,"edi"),"ebx"); | ||
| 209 | &adc ("edx",&DWP(12,"ebp")); | ||
| 210 | &mov (&DWP(8,"edi"),"ecx"); | ||
| 211 | &mov (&DWP(12,"edi"),"edx"); | ||
| 212 | |||
| 213 | &mov ("eax",&DWP(16,"esi")); | ||
| 214 | &mov ("ebx",&DWP(20,"esi")); | ||
| 215 | &mov ("ecx",&DWP(24,"esi")); | ||
| 216 | &adc ("eax",&DWP(16,"ebp")); | ||
| 217 | &mov ("edx",&DWP(28,"esi")); | ||
| 218 | &adc ("ebx",&DWP(20,"ebp")); | ||
| 219 | &mov (&DWP(16,"edi"),"eax"); | ||
| 220 | &adc ("ecx",&DWP(24,"ebp")); | ||
| 221 | &mov (&DWP(20,"edi"),"ebx"); | ||
| 222 | &mov ("esi",0); | ||
| 223 | &adc ("edx",&DWP(28,"ebp")); | ||
| 224 | &mov (&DWP(24,"edi"),"ecx"); | ||
| 225 | &adc ("esi",0); | ||
| 226 | &mov (&DWP(28,"edi"),"edx"); | ||
| 227 | |||
| 228 | # if a+b >= modulus, subtract modulus. | ||
| 229 | # | ||
| 230 | # But since comparison implies subtraction, we subtract modulus | ||
| 231 | # to see if it borrows, and then subtract it for real if | ||
| 232 | # subtraction didn't borrow. | ||
| 233 | |||
| 234 | &mov ("eax",&DWP(0,"edi")); | ||
| 235 | &mov ("ebx",&DWP(4,"edi")); | ||
| 236 | &mov ("ecx",&DWP(8,"edi")); | ||
| 237 | &sub ("eax",-1); | ||
| 238 | &mov ("edx",&DWP(12,"edi")); | ||
| 239 | &sbb ("ebx",-1); | ||
| 240 | &mov ("eax",&DWP(16,"edi")); | ||
| 241 | &sbb ("ecx",-1); | ||
| 242 | &mov ("ebx",&DWP(20,"edi")); | ||
| 243 | &sbb ("edx",0); | ||
| 244 | &mov ("ecx",&DWP(24,"edi")); | ||
| 245 | &sbb ("eax",0); | ||
| 246 | &mov ("edx",&DWP(28,"edi")); | ||
| 247 | &sbb ("ebx",0); | ||
| 248 | &sbb ("ecx",1); | ||
| 249 | &sbb ("edx",-1); | ||
| 250 | &sbb ("esi",0); | ||
| 251 | |||
| 252 | # Note that because mod has special form, i.e. consists of | ||
| 253 | # 0xffffffff, 1 and 0s, we can conditionally synthesize it | ||
| 254 | # by using the borrow. | ||
| 255 | |||
| 256 | ¬ ("esi"); | ||
| 257 | &mov ("eax",&DWP(0,"edi")); | ||
| 258 | &mov ("ebp","esi"); | ||
| 259 | &mov ("ebx",&DWP(4,"edi")); | ||
| 260 | &shr ("ebp",31); | ||
| 261 | &mov ("ecx",&DWP(8,"edi")); | ||
| 262 | &sub ("eax","esi"); | ||
| 263 | &mov ("edx",&DWP(12,"edi")); | ||
| 264 | &sbb ("ebx","esi"); | ||
| 265 | &mov (&DWP(0,"edi"),"eax"); | ||
| 266 | &sbb ("ecx","esi"); | ||
| 267 | &mov (&DWP(4,"edi"),"ebx"); | ||
| 268 | &sbb ("edx",0); | ||
| 269 | &mov (&DWP(8,"edi"),"ecx"); | ||
| 270 | &mov (&DWP(12,"edi"),"edx"); | ||
| 271 | |||
| 272 | &mov ("eax",&DWP(16,"edi")); | ||
| 273 | &mov ("ebx",&DWP(20,"edi")); | ||
| 274 | &mov ("ecx",&DWP(24,"edi")); | ||
| 275 | &sbb ("eax",0); | ||
| 276 | &mov ("edx",&DWP(28,"edi")); | ||
| 277 | &sbb ("ebx",0); | ||
| 278 | &mov (&DWP(16,"edi"),"eax"); | ||
| 279 | &sbb ("ecx","ebp"); | ||
| 280 | &mov (&DWP(20,"edi"),"ebx"); | ||
| 281 | &sbb ("edx","esi"); | ||
| 282 | &mov (&DWP(24,"edi"),"ecx"); | ||
| 283 | &mov (&DWP(28,"edi"),"edx"); | ||
| 284 | |||
| 285 | &ret (); | ||
| 286 | &function_end_B("_ecp_nistz256_add"); | ||
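In bignum terms the flow above is: compute t = a+b, perform the trial subtraction t-p (which doubles as the comparison), and keep t-p unless it borrowed. A reference sketch with an illustrative helper name; the register code makes the final select branchless by broadcasting the borrow into a 0/-1 mask and synthesizing the specially-shaped modulus from it:

    use Math::BigInt;
    sub mod_add {
        my ($a, $b, $p) = @_;          # expects 0 <= a, b < p
        my $t = $a + $b;               # at most one bit over p
        my $d = $t - $p;               # trial subtraction = comparison
        return $d->is_neg() ? $t : $d; # borrowed? keep t : keep t-p
    }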
| 287 | |||
| 288 | ######################################################################## | ||
| 289 | # void ecp_nistz256_sub(BN_ULONG edi[8],const BN_ULONG esi[8], | ||
| 290 | # const BN_ULONG ebp[8]); | ||
| 291 | &function_begin("ecp_nistz256_sub"); | ||
| 292 | &mov ("esi",&wparam(1)); | ||
| 293 | &mov ("ebp",&wparam(2)); | ||
| 294 | &mov ("edi",&wparam(0)); | ||
| 295 | &call ("_ecp_nistz256_sub"); | ||
| 296 | &function_end("ecp_nistz256_sub"); | ||
| 297 | |||
| 298 | &function_begin_B("_ecp_nistz256_sub"); | ||
| 299 | &mov ("eax",&DWP(0,"esi")); | ||
| 300 | &mov ("ebx",&DWP(4,"esi")); | ||
| 301 | &mov ("ecx",&DWP(8,"esi")); | ||
| 302 | &sub ("eax",&DWP(0,"ebp")); | ||
| 303 | &mov ("edx",&DWP(12,"esi")); | ||
| 304 | &sbb ("ebx",&DWP(4,"ebp")); | ||
| 305 | &mov (&DWP(0,"edi"),"eax"); | ||
| 306 | &sbb ("ecx",&DWP(8,"ebp")); | ||
| 307 | &mov (&DWP(4,"edi"),"ebx"); | ||
| 308 | &sbb ("edx",&DWP(12,"ebp")); | ||
| 309 | &mov (&DWP(8,"edi"),"ecx"); | ||
| 310 | &mov (&DWP(12,"edi"),"edx"); | ||
| 311 | |||
| 312 | &mov ("eax",&DWP(16,"esi")); | ||
| 313 | &mov ("ebx",&DWP(20,"esi")); | ||
| 314 | &mov ("ecx",&DWP(24,"esi")); | ||
| 315 | &sbb ("eax",&DWP(16,"ebp")); | ||
| 316 | &mov ("edx",&DWP(28,"esi")); | ||
| 317 | &sbb ("ebx",&DWP(20,"ebp")); | ||
| 318 | &sbb ("ecx",&DWP(24,"ebp")); | ||
| 319 | &mov (&DWP(16,"edi"),"eax"); | ||
| 320 | &sbb ("edx",&DWP(28,"ebp")); | ||
| 321 | &mov (&DWP(20,"edi"),"ebx"); | ||
| 322 | &sbb ("esi","esi"); # broadcast borrow bit | ||
| 323 | &mov (&DWP(24,"edi"),"ecx"); | ||
| 324 | &mov (&DWP(28,"edi"),"edx"); | ||
| 325 | |||
| 326 | # if a-b borrows, add modulus. | ||
| 327 | # | ||
| 328 | # Note that because mod has special form, i.e. consists of | ||
| 329 | # 0xffffffff, 1 and 0s, we can conditionally synthesize it by | ||
| 330 | # assigning borrow bit to one register, %ebp, and its negative | ||
| 331 | # to another, %esi. But we started by calculating %esi... | ||
| 332 | |||
| 333 | &mov ("eax",&DWP(0,"edi")); | ||
| 334 | &mov ("ebp","esi"); | ||
| 335 | &mov ("ebx",&DWP(4,"edi")); | ||
| 336 | &shr ("ebp",31); | ||
| 337 | &mov ("ecx",&DWP(8,"edi")); | ||
| 338 | &add ("eax","esi"); | ||
| 339 | &mov ("edx",&DWP(12,"edi")); | ||
| 340 | &adc ("ebx","esi"); | ||
| 341 | &mov (&DWP(0,"edi"),"eax"); | ||
| 342 | &adc ("ecx","esi"); | ||
| 343 | &mov (&DWP(4,"edi"),"ebx"); | ||
| 344 | &adc ("edx",0); | ||
| 345 | &mov (&DWP(8,"edi"),"ecx"); | ||
| 346 | &mov (&DWP(12,"edi"),"edx"); | ||
| 347 | |||
| 348 | &mov ("eax",&DWP(16,"edi")); | ||
| 349 | &mov ("ebx",&DWP(20,"edi")); | ||
| 350 | &mov ("ecx",&DWP(24,"edi")); | ||
| 351 | &adc ("eax",0); | ||
| 352 | &mov ("edx",&DWP(28,"edi")); | ||
| 353 | &adc ("ebx",0); | ||
| 354 | &mov (&DWP(16,"edi"),"eax"); | ||
| 355 | &adc ("ecx","ebp"); | ||
| 356 | &mov (&DWP(20,"edi"),"ebx"); | ||
| 357 | &adc ("edx","esi"); | ||
| 358 | &mov (&DWP(24,"edi"),"ecx"); | ||
| 359 | &mov (&DWP(28,"edi"),"edx"); | ||
| 360 | |||
| 361 | &ret (); | ||
| 362 | &function_end_B("_ecp_nistz256_sub"); | ||
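And the mirror image for subtraction, again as an illustrative bignum sketch: if the limb-wise subtraction borrows, a single addition of p brings the result back into range.

    use Math::BigInt;
    sub mod_sub {
        my ($a, $b, $p) = @_;          # expects 0 <= a, b < p
        my $d = $a - $b;
        $d->badd($p) if $d->is_neg();  # add modulus back on borrow
        return $d;
    }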
| 363 | |||
| 364 | ######################################################################## | ||
| 365 | # void ecp_nistz256_neg(BN_ULONG edi[8],const BN_ULONG esi[8]); | ||
| 366 | &function_begin("ecp_nistz256_neg"); | ||
| 367 | &mov ("ebp",&wparam(1)); | ||
| 368 | &mov ("edi",&wparam(0)); | ||
| 369 | |||
| 370 | &xor ("eax","eax"); | ||
| 371 | &stack_push(8); | ||
| 372 | &mov (&DWP(0,"esp"),"eax"); | ||
| 373 | &mov ("esi","esp"); | ||
| 374 | &mov (&DWP(4,"esp"),"eax"); | ||
| 375 | &mov (&DWP(8,"esp"),"eax"); | ||
| 376 | &mov (&DWP(12,"esp"),"eax"); | ||
| 377 | &mov (&DWP(16,"esp"),"eax"); | ||
| 378 | &mov (&DWP(20,"esp"),"eax"); | ||
| 379 | &mov (&DWP(24,"esp"),"eax"); | ||
| 380 | &mov (&DWP(28,"esp"),"eax"); | ||
| 381 | |||
| 382 | &call ("_ecp_nistz256_sub"); | ||
| 383 | |||
| 384 | &stack_pop(8); | ||
| 385 | &function_end("ecp_nistz256_neg"); | ||
| 386 | |||
| 387 | &function_begin_B("_picup_eax"); | ||
| 388 | &mov ("eax",&DWP(0,"esp")); | ||
| 389 | &ret (); | ||
| 390 | &function_end_B("_picup_eax"); | ||
| 391 | |||
| 392 | ######################################################################## | ||
| 393 | # void ecp_nistz256_from_mont(BN_ULONG edi[8],const BN_ULONG esi[8]); | ||
| 394 | &function_begin("ecp_nistz256_from_mont"); | ||
| 395 | &mov ("esi",&wparam(1)); | ||
| 396 | &call ("_picup_eax"); | ||
| 397 | &set_label("pic"); | ||
| 398 | &lea ("ebp",&DWP(&label("ONE")."-".&label("pic"),"eax")); | ||
| 399 | if ($sse2) { | ||
| 400 | &picmeup("eax","OPENSSL_ia32cap_P","eax",&label("pic")); | ||
| 401 | &mov ("eax",&DWP(0,"eax")); } | ||
| 402 | &mov ("edi",&wparam(0)); | ||
| 403 | &call ("_ecp_nistz256_mul_mont"); | ||
| 404 | &function_end("ecp_nistz256_from_mont"); | ||
| 405 | |||
| 406 | ######################################################################## | ||
| 407 | # void ecp_nistz256_mul_mont(BN_ULONG edi[8],const BN_ULONG esi[8], | ||
| 408 | # const BN_ULONG ebp[8]); | ||
| 409 | &function_begin("ecp_nistz256_mul_mont"); | ||
| 410 | &mov ("esi",&wparam(1)); | ||
| 411 | &mov ("ebp",&wparam(2)); | ||
| 412 | if ($sse2) { | ||
| 413 | &call ("_picup_eax"); | ||
| 414 | &set_label("pic"); | ||
| 415 | &picmeup("eax","OPENSSL_ia32cap_P","eax",&label("pic")); | ||
| 416 | &mov ("eax",&DWP(0,"eax")); } | ||
| 417 | &mov ("edi",&wparam(0)); | ||
| 418 | &call ("_ecp_nistz256_mul_mont"); | ||
| 419 | &function_end("ecp_nistz256_mul_mont"); | ||
| 420 | |||
| 421 | ######################################################################## | ||
| 422 | # void ecp_nistz256_sqr_mont(BN_ULONG edi[8],const BN_ULONG esi[8]); | ||
| 423 | &function_begin("ecp_nistz256_sqr_mont"); | ||
| 424 | &mov ("esi",&wparam(1)); | ||
| 425 | if ($sse2) { | ||
| 426 | &call ("_picup_eax"); | ||
| 427 | &set_label("pic"); | ||
| 428 | &picmeup("eax","OPENSSL_ia32cap_P","eax",&label("pic")); | ||
| 429 | &mov ("eax",&DWP(0,"eax")); } | ||
| 430 | &mov ("edi",&wparam(0)); | ||
| 431 | &mov ("ebp","esi"); | ||
| 432 | &call ("_ecp_nistz256_mul_mont"); | ||
| 433 | &function_end("ecp_nistz256_sqr_mont"); | ||
| 434 | |||
| 435 | &function_begin_B("_ecp_nistz256_mul_mont"); | ||
| 436 | if ($sse2) { | ||
| 437 | # see if XMM+SSE2 is on | ||
| 438 | &and ("eax","\$(IA32CAP_MASK0_FXSR | IA32CAP_MASK0_SSE2)"); | ||
| 439 | &cmp ("eax","\$(IA32CAP_MASK0_FXSR | IA32CAP_MASK0_SSE2)"); | ||
| 440 | &jne (&label("mul_mont_ialu")); | ||
| 441 | |||
| 442 | ######################################## | ||
| 443 | # SSE2 code path featuring 32x16-bit | ||
| 444 | # multiplications is ~2x faster than | ||
| 445 | # IALU counterpart (except on Atom)... | ||
| 446 | ######################################## | ||
| 447 | # stack layout: | ||
| 448 | # +------------------------------------+< %esp | ||
| 449 | # | 7 16-byte temporary XMM words, | | ||
| 450 | # | "sliding" toward lower address | | ||
| 451 | # . . | ||
| 452 | # +------------------------------------+ | ||
| 453 | # | unused XMM word | | ||
| 454 | # +------------------------------------+< +128,%ebx | ||
| 455 | # | 8 16-byte XMM words holding copies | | ||
| 456 | # | of a[i]<<64|a[i] | | ||
| 457 | # . . | ||
| 458 | # . . | ||
| 459 | # +------------------------------------+< +256 | ||
| 460 | &mov ("edx","esp"); | ||
| 461 | &sub ("esp",0x100); | ||
| 462 | |||
| 463 | &movd ("xmm7",&DWP(0,"ebp")); # b[0] -> 0000.00xy | ||
| 464 | &lea ("ebp",&DWP(4,"ebp")); | ||
| 465 | &pcmpeqd("xmm6","xmm6"); | ||
| 466 | &psrlq ("xmm6",48); # compose 0xffff<<64|0xffff | ||
| 467 | |||
| 468 | &pshuflw("xmm7","xmm7",0b11011100); # 0000.00xy -> 0000.0x0y | ||
| 469 | &and ("esp",-64); | ||
| 470 | &pshufd ("xmm7","xmm7",0b11011100); # 0000.0x0y -> 000x.000y | ||
| 471 | &lea ("ebx",&DWP(0x80,"esp")); | ||
| 472 | |||
| 473 | &movd ("xmm0",&DWP(4*0,"esi")); # a[0] -> 0000.00xy | ||
| 474 | &pshufd ("xmm0","xmm0",0b11001100); # 0000.00xy -> 00xy.00xy | ||
| 475 | &movd ("xmm1",&DWP(4*1,"esi")); # a[1] -> ... | ||
| 476 | &movdqa (&QWP(0x00,"ebx"),"xmm0"); # offload converted a[0] | ||
| 477 | &pmuludq("xmm0","xmm7"); # a[0]*b[0] | ||
| 478 | |||
| 479 | &movd ("xmm2",&DWP(4*2,"esi")); | ||
| 480 | &pshufd ("xmm1","xmm1",0b11001100); | ||
| 481 | &movdqa (&QWP(0x10,"ebx"),"xmm1"); | ||
| 482 | &pmuludq("xmm1","xmm7"); # a[1]*b[0] | ||
| 483 | |||
| 484 | &movq ("xmm4","xmm0"); # clear upper 64 bits | ||
| 485 | &pslldq("xmm4",6); | ||
| 486 | &paddq ("xmm4","xmm0"); | ||
| 487 | &movdqa("xmm5","xmm4"); | ||
| 488 | &psrldq("xmm4",10); # upper 32 bits of a[0]*b[0] | ||
| 489 | &pand ("xmm5","xmm6"); # lower 32 bits of a[0]*b[0] | ||
| 490 | |||
| 491 | # The upper half of a[0]*b[i] is carried into the next multiplication | ||
| 492 | # iteration, while the lower one "participates" in the actual reduction. | ||
| 493 | # Normally the latter is done by accumulating the result of multiplying | ||
| 494 | # the modulus by the "magic" digit, but thanks to the special form of the | ||
| 495 | # modulus and the "magic" digit it can be performed with additions and | ||
| 496 | # subtractions alone (see the note in the IALU section below). Note that | ||
| 497 | # we are not bothered with carry bits; they are accumulated in the | ||
| 498 | # "flatten" phase after all multiplications and reductions. | ||
| 499 | |||
| 500 | &movd ("xmm3",&DWP(4*3,"esi")); | ||
| 501 | &pshufd ("xmm2","xmm2",0b11001100); | ||
| 502 | &movdqa (&QWP(0x20,"ebx"),"xmm2"); | ||
| 503 | &pmuludq("xmm2","xmm7"); # a[2]*b[0] | ||
| 504 | &paddq ("xmm1","xmm4"); # a[1]*b[0]+hw(a[0]*b[0]), carry | ||
| 505 | &movdqa (&QWP(0x00,"esp"),"xmm1"); # t[0] | ||
| 506 | |||
| 507 | &movd ("xmm0",&DWP(4*4,"esi")); | ||
| 508 | &pshufd ("xmm3","xmm3",0b11001100); | ||
| 509 | &movdqa (&QWP(0x30,"ebx"),"xmm3"); | ||
| 510 | &pmuludq("xmm3","xmm7"); # a[3]*b[0] | ||
| 511 | &movdqa (&QWP(0x10,"esp"),"xmm2"); | ||
| 512 | |||
| 513 | &movd ("xmm1",&DWP(4*5,"esi")); | ||
| 514 | &pshufd ("xmm0","xmm0",0b11001100); | ||
| 515 | &movdqa (&QWP(0x40,"ebx"),"xmm0"); | ||
| 516 | &pmuludq("xmm0","xmm7"); # a[4]*b[0] | ||
| 517 | &paddq ("xmm3","xmm5"); # a[3]*b[0]+lw(a[0]*b[0]), reduction step | ||
| 518 | &movdqa (&QWP(0x20,"esp"),"xmm3"); | ||
| 519 | |||
| 520 | &movd ("xmm2",&DWP(4*6,"esi")); | ||
| 521 | &pshufd ("xmm1","xmm1",0b11001100); | ||
| 522 | &movdqa (&QWP(0x50,"ebx"),"xmm1"); | ||
| 523 | &pmuludq("xmm1","xmm7"); # a[5]*b[0] | ||
| 524 | &movdqa (&QWP(0x30,"esp"),"xmm0"); | ||
| 525 | &pshufd("xmm4","xmm5",0b10110001); # xmm4 = xmm5<<32, reduction step | ||
| 526 | |||
| 527 | &movd ("xmm3",&DWP(4*7,"esi")); | ||
| 528 | &pshufd ("xmm2","xmm2",0b11001100); | ||
| 529 | &movdqa (&QWP(0x60,"ebx"),"xmm2"); | ||
| 530 | &pmuludq("xmm2","xmm7"); # a[6]*b[0] | ||
| 531 | &movdqa (&QWP(0x40,"esp"),"xmm1"); | ||
| 532 | &psubq ("xmm4","xmm5"); # xmm4 = xmm5*0xffffffff, reduction step | ||
| 533 | |||
| 534 | &movd ("xmm0",&DWP(0,"ebp")); # b[1] -> 0000.00xy | ||
| 535 | &pshufd ("xmm3","xmm3",0b11001100); | ||
| 536 | &movdqa (&QWP(0x70,"ebx"),"xmm3"); | ||
| 537 | &pmuludq("xmm3","xmm7"); # a[7]*b[0] | ||
| 538 | |||
| 539 | &pshuflw("xmm7","xmm0",0b11011100); # 0000.00xy -> 0000.0x0y | ||
| 540 | &movdqa ("xmm0",&QWP(0x00,"ebx")); # pre-load converted a[0] | ||
| 541 | &pshufd ("xmm7","xmm7",0b11011100); # 0000.0x0y -> 000x.000y | ||
| 542 | |||
| 543 | &mov ("ecx",6); | ||
| 544 | &lea ("ebp",&DWP(4,"ebp")); | ||
| 545 | &jmp (&label("madd_sse2")); | ||
| 546 | |||
| 547 | &set_label("madd_sse2",16); | ||
| 548 | &paddq ("xmm2","xmm5"); # a[6]*b[i-1]+lw(a[0]*b[i-1]), reduction step [modulo-scheduled] | ||
| 549 | &paddq ("xmm3","xmm4"); # a[7]*b[i-1]+lw(a[0]*b[i-1])*0xffffffff, reduction step [modulo-scheduled] | ||
| 550 | &movdqa ("xmm1",&QWP(0x10,"ebx")); | ||
| 551 | &pmuludq("xmm0","xmm7"); # a[0]*b[i] | ||
| 552 | &movdqa(&QWP(0x50,"esp"),"xmm2"); | ||
| 553 | |||
| 554 | &movdqa ("xmm2",&QWP(0x20,"ebx")); | ||
| 555 | &pmuludq("xmm1","xmm7"); # a[1]*b[i] | ||
| 556 | &movdqa(&QWP(0x60,"esp"),"xmm3"); | ||
| 557 | &paddq ("xmm0",&QWP(0x00,"esp")); | ||
| 558 | |||
| 559 | &movdqa ("xmm3",&QWP(0x30,"ebx")); | ||
| 560 | &pmuludq("xmm2","xmm7"); # a[2]*b[i] | ||
| 561 | &movq ("xmm4","xmm0"); # clear upper 64 bits | ||
| 562 | &pslldq("xmm4",6); | ||
| 563 | &paddq ("xmm1",&QWP(0x10,"esp")); | ||
| 564 | &paddq ("xmm4","xmm0"); | ||
| 565 | &movdqa("xmm5","xmm4"); | ||
| 566 | &psrldq("xmm4",10); # upper 33 bits of a[0]*b[i]+t[0] | ||
| 567 | |||
| 568 | &movdqa ("xmm0",&QWP(0x40,"ebx")); | ||
| 569 | &pmuludq("xmm3","xmm7"); # a[3]*b[i] | ||
| 570 | &paddq ("xmm1","xmm4"); # a[1]*b[i]+hw(a[0]*b[i]), carry | ||
| 571 | &paddq ("xmm2",&QWP(0x20,"esp")); | ||
| 572 | &movdqa (&QWP(0x00,"esp"),"xmm1"); | ||
| 573 | |||
| 574 | &movdqa ("xmm1",&QWP(0x50,"ebx")); | ||
| 575 | &pmuludq("xmm0","xmm7"); # a[4]*b[i] | ||
| 576 | &paddq ("xmm3",&QWP(0x30,"esp")); | ||
| 577 | &movdqa (&QWP(0x10,"esp"),"xmm2"); | ||
| 578 | &pand ("xmm5","xmm6"); # lower 32 bits of a[0]*b[i] | ||
| 579 | |||
| 580 | &movdqa ("xmm2",&QWP(0x60,"ebx")); | ||
| 581 | &pmuludq("xmm1","xmm7"); # a[5]*b[i] | ||
| 582 | &paddq ("xmm3","xmm5"); # a[3]*b[i]+lw(a[0]*b[i]), reduction step | ||
| 583 | &paddq ("xmm0",&QWP(0x40,"esp")); | ||
| 584 | &movdqa (&QWP(0x20,"esp"),"xmm3"); | ||
| 585 | &pshufd("xmm4","xmm5",0b10110001); # xmm4 = xmm5<<32, reduction step | ||
| 586 | |||
| 587 | &movdqa ("xmm3","xmm7"); | ||
| 588 | &pmuludq("xmm2","xmm7"); # a[6]*b[i] | ||
| 589 | &movd ("xmm7",&DWP(0,"ebp")); # b[i++] -> 0000.00xy | ||
| 590 | &lea ("ebp",&DWP(4,"ebp")); | ||
| 591 | &paddq ("xmm1",&QWP(0x50,"esp")); | ||
| 592 | &psubq ("xmm4","xmm5"); # xmm4 = xmm5*0xffffffff, reduction step | ||
| 593 | &movdqa (&QWP(0x30,"esp"),"xmm0"); | ||
| 594 | &pshuflw("xmm7","xmm7",0b11011100); # 0000.00xy -> 0000.0x0y | ||
| 595 | |||
| 596 | &pmuludq("xmm3",&QWP(0x70,"ebx")); # a[7]*b[i] | ||
| 597 | &pshufd("xmm7","xmm7",0b11011100); # 0000.0x0y -> 000x.000y | ||
| 598 | &movdqa("xmm0",&QWP(0x00,"ebx")); # pre-load converted a[0] | ||
| 599 | &movdqa (&QWP(0x40,"esp"),"xmm1"); | ||
| 600 | &paddq ("xmm2",&QWP(0x60,"esp")); | ||
| 601 | |||
| 602 | &dec ("ecx"); | ||
| 603 | &jnz (&label("madd_sse2")); | ||
| 604 | |||
| 605 | &paddq ("xmm2","xmm5"); # a[6]*b[6]+lw(a[0]*b[6]), reduction step [modulo-scheduled] | ||
| 606 | &paddq ("xmm3","xmm4"); # a[7]*b[6]+lw(a[0]*b[6])*0xffffffff, reduction step [modulo-scheduled] | ||
| 607 | &movdqa ("xmm1",&QWP(0x10,"ebx")); | ||
| 608 | &pmuludq("xmm0","xmm7"); # a[0]*b[7] | ||
| 609 | &movdqa(&QWP(0x50,"esp"),"xmm2"); | ||
| 610 | |||
| 611 | &movdqa ("xmm2",&QWP(0x20,"ebx")); | ||
| 612 | &pmuludq("xmm1","xmm7"); # a[1]*b[7] | ||
| 613 | &movdqa(&QWP(0x60,"esp"),"xmm3"); | ||
| 614 | &paddq ("xmm0",&QWP(0x00,"esp")); | ||
| 615 | |||
| 616 | &movdqa ("xmm3",&QWP(0x30,"ebx")); | ||
| 617 | &pmuludq("xmm2","xmm7"); # a[2]*b[7] | ||
| 618 | &movq ("xmm4","xmm0"); # clear upper 64 bits | ||
| 619 | &pslldq("xmm4",6); | ||
| 620 | &paddq ("xmm1",&QWP(0x10,"esp")); | ||
| 621 | &paddq ("xmm4","xmm0"); | ||
| 622 | &movdqa("xmm5","xmm4"); | ||
| 623 | &psrldq("xmm4",10); # upper 33 bits of a[0]*b[i]+t[0] | ||
| 624 | |||
| 625 | &movdqa ("xmm0",&QWP(0x40,"ebx")); | ||
| 626 | &pmuludq("xmm3","xmm7"); # a[3]*b[7] | ||
| 627 | &paddq ("xmm1","xmm4"); # a[1]*b[7]+hw(a[0]*b[7]), carry | ||
| 628 | &paddq ("xmm2",&QWP(0x20,"esp")); | ||
| 629 | &movdqa (&QWP(0x00,"esp"),"xmm1"); | ||
| 630 | |||
| 631 | &movdqa ("xmm1",&QWP(0x50,"ebx")); | ||
| 632 | &pmuludq("xmm0","xmm7"); # a[4]*b[7] | ||
| 633 | &paddq ("xmm3",&QWP(0x30,"esp")); | ||
| 634 | &movdqa (&QWP(0x10,"esp"),"xmm2"); | ||
| 635 | &pand ("xmm5","xmm6"); # lower 32 bits of a[0]*b[i] | ||
| 636 | |||
| 637 | &movdqa ("xmm2",&QWP(0x60,"ebx")); | ||
| 638 | &pmuludq("xmm1","xmm7"); # a[5]*b[7] | ||
| 639 | &paddq ("xmm3","xmm5"); # reduction step | ||
| 640 | &paddq ("xmm0",&QWP(0x40,"esp")); | ||
| 641 | &movdqa (&QWP(0x20,"esp"),"xmm3"); | ||
| 642 | &pshufd("xmm4","xmm5",0b10110001); # xmm4 = xmm5<<32, reduction step | ||
| 643 | |||
| 644 | &movdqa ("xmm3",&QWP(0x70,"ebx")); | ||
| 645 | &pmuludq("xmm2","xmm7"); # a[6]*b[7] | ||
| 646 | &paddq ("xmm1",&QWP(0x50,"esp")); | ||
| 647 | &psubq ("xmm4","xmm5"); # xmm4 = xmm5*0xffffffff, reduction step | ||
| 648 | &movdqa (&QWP(0x30,"esp"),"xmm0"); | ||
| 649 | |||
| 650 | &pmuludq("xmm3","xmm7"); # a[7]*b[7] | ||
| 651 | &pcmpeqd("xmm7","xmm7"); | ||
| 652 | &movdqa ("xmm0",&QWP(0x00,"esp")); | ||
| 653 | &pslldq ("xmm7",8); | ||
| 654 | &movdqa (&QWP(0x40,"esp"),"xmm1"); | ||
| 655 | &paddq ("xmm2",&QWP(0x60,"esp")); | ||
| 656 | |||
| 657 | &paddq ("xmm2","xmm5"); # a[6]*b[7]+lw(a[0]*b[7]), reduction step | ||
| 658 | &paddq ("xmm3","xmm4"); # a[6]*b[7]+lw(a[0]*b[7])*0xffffffff, reduction step | ||
| 659 | &movdqa(&QWP(0x50,"esp"),"xmm2"); | ||
| 660 | &movdqa(&QWP(0x60,"esp"),"xmm3"); | ||
| 661 | |||
| 662 | &movdqa ("xmm1",&QWP(0x10,"esp")); | ||
| 663 | &movdqa ("xmm2",&QWP(0x20,"esp")); | ||
| 664 | &movdqa ("xmm3",&QWP(0x30,"esp")); | ||
| 665 | |||
| 666 | &movq ("xmm4","xmm0"); # "flatten" | ||
| 667 | &pand ("xmm0","xmm7"); | ||
| 668 | &xor ("ebp","ebp"); | ||
| 669 | &pslldq ("xmm4",6); | ||
| 670 | &movq ("xmm5","xmm1"); | ||
| 671 | &paddq ("xmm0","xmm4"); | ||
| 672 | &pand ("xmm1","xmm7"); | ||
| 673 | &psrldq ("xmm0",6); | ||
| 674 | &movd ("eax","xmm0"); | ||
| 675 | &psrldq ("xmm0",4); | ||
| 676 | |||
| 677 | &paddq ("xmm5","xmm0"); | ||
| 678 | &movdqa ("xmm0",&QWP(0x40,"esp")); | ||
| 679 | &sub ("eax",-1); # start subtracting modulus, | ||
| 680 | # this is used to determine | ||
| 681 | # if result is larger/smaller | ||
| 682 | # than modulus (see below) | ||
| 683 | &pslldq ("xmm5",6); | ||
| 684 | &movq ("xmm4","xmm2"); | ||
| 685 | &paddq ("xmm1","xmm5"); | ||
| 686 | &pand ("xmm2","xmm7"); | ||
| 687 | &psrldq ("xmm1",6); | ||
| 688 | &mov (&DWP(4*0,"edi"),"eax"); | ||
| 689 | &movd ("eax","xmm1"); | ||
| 690 | &psrldq ("xmm1",4); | ||
| 691 | |||
| 692 | &paddq ("xmm4","xmm1"); | ||
| 693 | &movdqa ("xmm1",&QWP(0x50,"esp")); | ||
| 694 | &sbb ("eax",-1); | ||
| 695 | &pslldq ("xmm4",6); | ||
| 696 | &movq ("xmm5","xmm3"); | ||
| 697 | &paddq ("xmm2","xmm4"); | ||
| 698 | &pand ("xmm3","xmm7"); | ||
| 699 | &psrldq ("xmm2",6); | ||
| 700 | &mov (&DWP(4*1,"edi"),"eax"); | ||
| 701 | &movd ("eax","xmm2"); | ||
| 702 | &psrldq ("xmm2",4); | ||
| 703 | |||
| 704 | &paddq ("xmm5","xmm2"); | ||
| 705 | &movdqa ("xmm2",&QWP(0x60,"esp")); | ||
| 706 | &sbb ("eax",-1); | ||
| 707 | &pslldq ("xmm5",6); | ||
| 708 | &movq ("xmm4","xmm0"); | ||
| 709 | &paddq ("xmm3","xmm5"); | ||
| 710 | &pand ("xmm0","xmm7"); | ||
| 711 | &psrldq ("xmm3",6); | ||
| 712 | &mov (&DWP(4*2,"edi"),"eax"); | ||
| 713 | &movd ("eax","xmm3"); | ||
| 714 | &psrldq ("xmm3",4); | ||
| 715 | |||
| 716 | &paddq ("xmm4","xmm3"); | ||
| 717 | &sbb ("eax",0); | ||
| 718 | &pslldq ("xmm4",6); | ||
| 719 | &movq ("xmm5","xmm1"); | ||
| 720 | &paddq ("xmm0","xmm4"); | ||
| 721 | &pand ("xmm1","xmm7"); | ||
| 722 | &psrldq ("xmm0",6); | ||
| 723 | &mov (&DWP(4*3,"edi"),"eax"); | ||
| 724 | &movd ("eax","xmm0"); | ||
| 725 | &psrldq ("xmm0",4); | ||
| 726 | |||
| 727 | &paddq ("xmm5","xmm0"); | ||
| 728 | &sbb ("eax",0); | ||
| 729 | &pslldq ("xmm5",6); | ||
| 730 | &movq ("xmm4","xmm2"); | ||
| 731 | &paddq ("xmm1","xmm5"); | ||
| 732 | &pand ("xmm2","xmm7"); | ||
| 733 | &psrldq ("xmm1",6); | ||
| 734 | &movd ("ebx","xmm1"); | ||
| 735 | &psrldq ("xmm1",4); | ||
| 736 | &mov ("esp","edx"); | ||
| 737 | |||
| 738 | &paddq ("xmm4","xmm1"); | ||
| 739 | &pslldq ("xmm4",6); | ||
| 740 | &paddq ("xmm2","xmm4"); | ||
| 741 | &psrldq ("xmm2",6); | ||
| 742 | &movd ("ecx","xmm2"); | ||
| 743 | &psrldq ("xmm2",4); | ||
| 744 | &sbb ("ebx",0); | ||
| 745 | &movd ("edx","xmm2"); | ||
| 746 | &pextrw ("esi","xmm2",2); # top-most overflow bit | ||
| 747 | &sbb ("ecx",1); | ||
| 748 | &sbb ("edx",-1); | ||
| 749 | &sbb ("esi",0); # borrow from subtraction | ||
| 750 | |||
| 751 | # Final step is "if result > mod, subtract mod", and at this point | ||
| 752 | # we have result - mod written to output buffer, as well as borrow | ||
| 753 | # bit from this subtraction, and if borrow bit is set, we add | ||
| 754 | # modulus back. | ||
| 755 | # | ||
| 756 | # Note that because mod has special form, i.e. consists of | ||
| 757 | # 0xffffffff, 1 and 0s, we can conditionally synthesize it by | ||
| 758 | # assigning borrow bit to one register, %ebp, and its negative | ||
| 759 | # to another, %esi. But we started by calculating %esi... | ||
| 760 | |||
| 761 | &sub ("ebp","esi"); | ||
| 762 | &add (&DWP(4*0,"edi"),"esi"); # add modulus or zero | ||
| 763 | &adc (&DWP(4*1,"edi"),"esi"); | ||
| 764 | &adc (&DWP(4*2,"edi"),"esi"); | ||
| 765 | &adc (&DWP(4*3,"edi"),0); | ||
| 766 | &adc ("eax",0); | ||
| 767 | &adc ("ebx",0); | ||
| 768 | &mov (&DWP(4*4,"edi"),"eax"); | ||
| 769 | &adc ("ecx","ebp"); | ||
| 770 | &mov (&DWP(4*5,"edi"),"ebx"); | ||
| 771 | &adc ("edx","esi"); | ||
| 772 | &mov (&DWP(4*6,"edi"),"ecx"); | ||
| 773 | &mov (&DWP(4*7,"edi"),"edx"); | ||
| 774 | |||
| 775 | &ret (); | ||
| 776 | |||
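The payoff of the 32x16-bit split in the path above: with every b half-word below 2^16, each pmuludq partial product fits in 48 bits, leaving 16 bits of headroom so partial sums can be accumulated across all eight b words without carry propagation until the final "flatten" phase. A standalone sketch of the arithmetic (assumes a 64-bit perl):

    my ($a, $b) = (0xffffffff, 0xffffffff);       # worst-case 32-bit words
    my ($b_lo, $b_hi) = ($b & 0xffff, $b >> 16);  # the 0000.0x0y split
    my $lo = $a * $b_lo;                          # fits in 48 bits
    my $hi = $a * $b_hi;                          # fits in 48 bits
    printf "0x%x\n", $lo + ($hi << 16);           # 0xfffffffe00000001 == a*b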
| 777 | &set_label("mul_mont_ialu",16); } | ||
| 778 | |||
| 779 | ######################################## | ||
| 780 | # IALU code path suitable for all CPUs. | ||
| 781 | ######################################## | ||
| 782 | # stack layout: | ||
| 783 | # +------------------------------------+< %esp | ||
| 784 | # | 8 32-bit temporary words, accessed | | ||
| 785 | # | as circular buffer | | ||
| 786 | # . . | ||
| 787 | # . . | ||
| 788 | # +------------------------------------+< +32 | ||
| 789 | # | offloaded destination pointer | | ||
| 790 | # +------------------------------------+ | ||
| 791 | # | unused | | ||
| 792 | # +------------------------------------+< +40 | ||
| 793 | &sub ("esp",10*4); | ||
| 794 | |||
| 795 | &mov ("eax",&DWP(0*4,"esi")); # a[0] | ||
| 796 | &mov ("ebx",&DWP(0*4,"ebp")); # b[0] | ||
| 797 | &mov (&DWP(8*4,"esp"),"edi"); # off-load dst ptr | ||
| 798 | |||
| 799 | &mul ("ebx"); # a[0]*b[0] | ||
| 800 | &mov (&DWP(0*4,"esp"),"eax"); # t[0] | ||
| 801 | &mov ("eax",&DWP(1*4,"esi")); | ||
| 802 | &mov ("ecx","edx"); | ||
| 803 | |||
| 804 | &mul ("ebx"); # a[1]*b[0] | ||
| 805 | &add ("ecx","eax"); | ||
| 806 | &mov ("eax",&DWP(2*4,"esi")); | ||
| 807 | &adc ("edx",0); | ||
| 808 | &mov (&DWP(1*4,"esp"),"ecx"); # t[1] | ||
| 809 | &mov ("ecx","edx"); | ||
| 810 | |||
| 811 | &mul ("ebx"); # a[2]*b[0] | ||
| 812 | &add ("ecx","eax"); | ||
| 813 | &mov ("eax",&DWP(3*4,"esi")); | ||
| 814 | &adc ("edx",0); | ||
| 815 | &mov (&DWP(2*4,"esp"),"ecx"); # t[2] | ||
| 816 | &mov ("ecx","edx"); | ||
| 817 | |||
| 818 | &mul ("ebx"); # a[3]*b[0] | ||
| 819 | &add ("ecx","eax"); | ||
| 820 | &mov ("eax",&DWP(4*4,"esi")); | ||
| 821 | &adc ("edx",0); | ||
| 822 | &mov (&DWP(3*4,"esp"),"ecx"); # t[3] | ||
| 823 | &mov ("ecx","edx"); | ||
| 824 | |||
| 825 | &mul ("ebx"); # a[4]*b[0] | ||
| 826 | &add ("ecx","eax"); | ||
| 827 | &mov ("eax",&DWP(5*4,"esi")); | ||
| 828 | &adc ("edx",0); | ||
| 829 | &mov (&DWP(4*4,"esp"),"ecx"); # t[4] | ||
| 830 | &mov ("ecx","edx"); | ||
| 831 | |||
| 832 | &mul ("ebx"); # a[5]*b[0] | ||
| 833 | &add ("ecx","eax"); | ||
| 834 | &mov ("eax",&DWP(6*4,"esi")); | ||
| 835 | &adc ("edx",0); | ||
| 836 | &mov (&DWP(5*4,"esp"),"ecx"); # t[5] | ||
| 837 | &mov ("ecx","edx"); | ||
| 838 | |||
| 839 | &mul ("ebx"); # a[6]*b[0] | ||
| 840 | &add ("ecx","eax"); | ||
| 841 | &mov ("eax",&DWP(7*4,"esi")); | ||
| 842 | &adc ("edx",0); | ||
| 843 | &mov (&DWP(6*4,"esp"),"ecx"); # t[6] | ||
| 844 | &mov ("ecx","edx"); | ||
| 845 | |||
| 846 | &xor ("edi","edi"); # initial top-most carry | ||
| 847 | &mul ("ebx"); # a[7]*b[0] | ||
| 848 | &add ("ecx","eax"); # t[7] | ||
| 849 | &mov ("eax",&DWP(0*4,"esp")); # t[0] | ||
| 850 | &adc ("edx",0); # t[8] | ||
| 851 | |||
| 852 | for ($i=0;$i<7;$i++) { | ||
| 853 | my $j=$i+1; | ||
| 854 | |||
| 855 | # Reduction iteration is normally performed by accumulating | ||
| 856 | # result of multiplication of modulus by "magic" digit [and | ||
| 857 | # omitting least significant word, which is guaranteed to | ||
| 858 | # be 0], but thanks to special form of modulus and "magic" | ||
| 859 | # digit being equal to least significant word, it can be | ||
| 860 | # performed with additions and subtractions alone. Indeed: | ||
| 861 | # | ||
| 862 | # ffff.0001.0000.0000.0000.ffff.ffff.ffff | ||
| 863 | # * abcd | ||
| 864 | # + xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd | ||
| 865 | # | ||
| 866 | # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we | ||
| 867 | # rewrite above as: | ||
| 868 | # | ||
| 869 | # xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd | ||
| 870 | # + abcd.0000.abcd.0000.0000.abcd.0000.0000.0000 | ||
| 871 | # - abcd.0000.0000.0000.0000.0000.0000.abcd | ||
| 872 | # | ||
| 873 | # or marking redundant operations: | ||
| 874 | # | ||
| 875 | # xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.---- | ||
| 876 | # + abcd.0000.abcd.0000.0000.abcd.----.----.---- | ||
| 877 | # - abcd.----.----.----.----.----.----.---- | ||
| 878 | |||
| 879 | &add (&DWP((($i+3)%8)*4,"esp"),"eax"); # t[3]+=t[0] | ||
| 880 | &adc (&DWP((($i+4)%8)*4,"esp"),0); # t[4]+=0 | ||
| 881 | &adc (&DWP((($i+5)%8)*4,"esp"),0); # t[5]+=0 | ||
| 882 | &adc (&DWP((($i+6)%8)*4,"esp"),"eax"); # t[6]+=t[0] | ||
| 883 | &adc ("ecx",0); # t[7]+=0 | ||
| 884 | &adc ("edx","eax"); # t[8]+=t[0] | ||
| 885 | &adc ("edi",0); # top-most carry | ||
| 886 | &mov ("ebx",&DWP($j*4,"ebp")); # b[i] | ||
| 887 | &sub ("ecx","eax"); # t[7]-=t[0] | ||
| 888 | &mov ("eax",&DWP(0*4,"esi")); # a[0] | ||
| 889 | &sbb ("edx",0); # t[8]-=0 | ||
| 890 | &mov (&DWP((($i+7)%8)*4,"esp"),"ecx"); | ||
| 891 | &sbb ("edi",0); # top-most carry, | ||
| 892 | # keep in mind that | ||
| 893 | # net result is | ||
| 894 | # *addition* of value | ||
| 895 | # with (abcd<<32)-abcd | ||
| 896 | # on top, so that | ||
| 897 | # underflow is | ||
| 898 | # impossible, because | ||
| 899 | # (abcd<<32)-abcd | ||
| 900 | # doesn't underflow | ||
| 901 | &mov (&DWP((($i+8)%8)*4,"esp"),"edx"); | ||
| 902 | |||
| 903 | &mul ("ebx"); # a[0]*b[i] | ||
| 904 | &add ("eax",&DWP((($j+0)%8)*4,"esp")); | ||
| 905 | &adc ("edx",0); | ||
| 906 | &mov (&DWP((($j+0)%8)*4,"esp"),"eax"); | ||
| 907 | &mov ("eax",&DWP(1*4,"esi")); | ||
| 908 | &mov ("ecx","edx"); | ||
| 909 | |||
| 910 | &mul ("ebx"); # a[1]*b[i] | ||
| 911 | &add ("ecx",&DWP((($j+1)%8)*4,"esp")); | ||
| 912 | &adc ("edx",0); | ||
| 913 | &add ("ecx","eax"); | ||
| 914 | &adc ("edx",0); | ||
| 915 | &mov ("eax",&DWP(2*4,"esi")); | ||
| 916 | &mov (&DWP((($j+1)%8)*4,"esp"),"ecx"); | ||
| 917 | &mov ("ecx","edx"); | ||
| 918 | |||
| 919 | &mul ("ebx"); # a[2]*b[i] | ||
| 920 | &add ("ecx",&DWP((($j+2)%8)*4,"esp")); | ||
| 921 | &adc ("edx",0); | ||
| 922 | &add ("ecx","eax"); | ||
| 923 | &adc ("edx",0); | ||
| 924 | &mov ("eax",&DWP(3*4,"esi")); | ||
| 925 | &mov (&DWP((($j+2)%8)*4,"esp"),"ecx"); | ||
| 926 | &mov ("ecx","edx"); | ||
| 927 | |||
| 928 | &mul ("ebx"); # a[3]*b[i] | ||
| 929 | &add ("ecx",&DWP((($j+3)%8)*4,"esp")); | ||
| 930 | &adc ("edx",0); | ||
| 931 | &add ("ecx","eax"); | ||
| 932 | &adc ("edx",0); | ||
| 933 | &mov ("eax",&DWP(4*4,"esi")); | ||
| 934 | &mov (&DWP((($j+3)%8)*4,"esp"),"ecx"); | ||
| 935 | &mov ("ecx","edx"); | ||
| 936 | |||
| 937 | &mul ("ebx"); # a[4]*b[i] | ||
| 938 | &add ("ecx",&DWP((($j+4)%8)*4,"esp")); | ||
| 939 | &adc ("edx",0); | ||
| 940 | &add ("ecx","eax"); | ||
| 941 | &adc ("edx",0); | ||
| 942 | &mov ("eax",&DWP(5*4,"esi")); | ||
| 943 | &mov (&DWP((($j+4)%8)*4,"esp"),"ecx"); | ||
| 944 | &mov ("ecx","edx"); | ||
| 945 | |||
| 946 | &mul ("ebx"); # a[5]*b[i] | ||
| 947 | &add ("ecx",&DWP((($j+5)%8)*4,"esp")); | ||
| 948 | &adc ("edx",0); | ||
| 949 | &add ("ecx","eax"); | ||
| 950 | &adc ("edx",0); | ||
| 951 | &mov ("eax",&DWP(6*4,"esi")); | ||
| 952 | &mov (&DWP((($j+5)%8)*4,"esp"),"ecx"); | ||
| 953 | &mov ("ecx","edx"); | ||
| 954 | |||
| 955 | &mul ("ebx"); # a[6]*b[i] | ||
| 956 | &add ("ecx",&DWP((($j+6)%8)*4,"esp")); | ||
| 957 | &adc ("edx",0); | ||
| 958 | &add ("ecx","eax"); | ||
| 959 | &adc ("edx",0); | ||
| 960 | &mov ("eax",&DWP(7*4,"esi")); | ||
| 961 | &mov (&DWP((($j+6)%8)*4,"esp"),"ecx"); | ||
| 962 | &mov ("ecx","edx"); | ||
| 963 | |||
| 964 | &mul ("ebx"); # a[7]*b[i] | ||
| 965 | &add ("ecx",&DWP((($j+7)%8)*4,"esp")); | ||
| 966 | &adc ("edx",0); | ||
| 967 | &add ("ecx","eax"); # t[7] | ||
| 968 | &mov ("eax",&DWP((($j+0)%8)*4,"esp")); # t[0] | ||
| 969 | &adc ("edx","edi"); # t[8] | ||
| 970 | &mov ("edi",0); | ||
| 971 | &adc ("edi",0); # top-most carry | ||
| 972 | } | ||
| 973 | &mov ("ebp",&DWP(8*4,"esp")); # restore dst ptr | ||
| 974 | &xor ("esi","esi"); | ||
| 975 | my $j=$i+1; | ||
| 976 | |||
| 977 | # last multiplication-less reduction | ||
| 978 | &add (&DWP((($i+3)%8)*4,"esp"),"eax"); # t[3]+=t[0] | ||
| 979 | &adc (&DWP((($i+4)%8)*4,"esp"),0); # t[4]+=0 | ||
| 980 | &adc (&DWP((($i+5)%8)*4,"esp"),0); # t[5]+=0 | ||
| 981 | &adc (&DWP((($i+6)%8)*4,"esp"),"eax"); # t[6]+=t[0] | ||
| 982 | &adc ("ecx",0); # t[7]+=0 | ||
| 983 | &adc ("edx","eax"); # t[8]+=t[0] | ||
| 984 | &adc ("edi",0); # top-most carry | ||
| 985 | &mov ("ebx",&DWP((($j+1)%8)*4,"esp")); | ||
| 986 | &sub ("ecx","eax"); # t[7]-=t[0] | ||
| 987 | &mov ("eax",&DWP((($j+0)%8)*4,"esp")); | ||
| 988 | &sbb ("edx",0); # t[8]-=0 | ||
| 989 | &mov (&DWP((($i+7)%8)*4,"esp"),"ecx"); | ||
| 990 | &sbb ("edi",0); # top-most carry | ||
| 991 | &mov (&DWP((($i+8)%8)*4,"esp"),"edx"); | ||
| 992 | |||
| 993 | # Final step is "if result > mod, subtract mod", but we do it | ||
| 994 | # "other way around", namely write result - mod to output buffer | ||
| 995 | # and if subtraction borrowed, add modulus back. | ||
| 996 | |||
| 997 | &mov ("ecx",&DWP((($j+2)%8)*4,"esp")); | ||
| 998 | &sub ("eax",-1); | ||
| 999 | &mov ("edx",&DWP((($j+3)%8)*4,"esp")); | ||
| 1000 | &sbb ("ebx",-1); | ||
| 1001 | &mov (&DWP(0*4,"ebp"),"eax"); | ||
| 1002 | &sbb ("ecx",-1); | ||
| 1003 | &mov (&DWP(1*4,"ebp"),"ebx"); | ||
| 1004 | &sbb ("edx",0); | ||
| 1005 | &mov (&DWP(2*4,"ebp"),"ecx"); | ||
| 1006 | &mov (&DWP(3*4,"ebp"),"edx"); | ||
| 1007 | |||
| 1008 | &mov ("eax",&DWP((($j+4)%8)*4,"esp")); | ||
| 1009 | &mov ("ebx",&DWP((($j+5)%8)*4,"esp")); | ||
| 1010 | &mov ("ecx",&DWP((($j+6)%8)*4,"esp")); | ||
| 1011 | &sbb ("eax",0); | ||
| 1012 | &mov ("edx",&DWP((($j+7)%8)*4,"esp")); | ||
| 1013 | &sbb ("ebx",0); | ||
| 1014 | &sbb ("ecx",1); | ||
| 1015 | &sbb ("edx",-1); | ||
| 1016 | &sbb ("edi",0); | ||
| 1017 | |||
| 1018 | # Note that because mod has special form, i.e. consists of | ||
| 1019 | # 0xffffffff, 1 and 0s, we can conditionally synthesize it by | ||
| 1020 | # assigning borrow bit to one register, %ebp, and its negative | ||
| 1021 | # to another, %esi. But we started by calculating %esi... | ||
| 1022 | |||
| 1023 | &sub ("esi","edi"); | ||
| 1024 | &add (&DWP(0*4,"ebp"),"edi"); # add modulus or zero | ||
| 1025 | &adc (&DWP(1*4,"ebp"),"edi"); | ||
| 1026 | &adc (&DWP(2*4,"ebp"),"edi"); | ||
| 1027 | &adc (&DWP(3*4,"ebp"),0); | ||
| 1028 | &adc ("eax",0); | ||
| 1029 | &adc ("ebx",0); | ||
| 1030 | &mov (&DWP(4*4,"ebp"),"eax"); | ||
| 1031 | &adc ("ecx","esi"); | ||
| 1032 | &mov (&DWP(5*4,"ebp"),"ebx"); | ||
| 1033 | &adc ("edx","edi"); | ||
| 1034 | &mov (&DWP(6*4,"ebp"),"ecx"); | ||
| 1035 | &mov ("edi","ebp"); # fulfill contract | ||
| 1036 | &mov (&DWP(7*4,"ebp"),"edx"); | ||
| 1037 | |||
| 1038 | &add ("esp",10*4); | ||
| 1039 | &ret (); | ||
| 1040 | &function_end_B("_ecp_nistz256_mul_mont"); | ||
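The reduction note inside the multiplication loop above boils down to one identity for the P-256 modulus: p*abcd = (abcd<<256) + (abcd<<192) + (abcd<<96) - (abcd<<224) - abcd, which is why each reduction step costs only a handful of additions and subtractions. A quick numeric check (sketch):

    use Math::BigInt;
    my $p = Math::BigInt->from_hex(
        '0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff');
    my $d = Math::BigInt->from_hex('0xabcd1234');  # any 32-bit "magic" digit
    my $lhs = $p * $d;
    my $rhs = ($d << 256) + ($d << 192) + ($d << 96) - ($d << 224) - $d;
    print $lhs == $rhs ? "identity holds\n" : "BUG\n";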
| 1041 | |||
| 1042 | ######################################################################## | ||
| 1043 | # void ecp_nistz256_select_w5(P256_POINT *edi,const void *esi, | ||
| 1044 | # int ebp); | ||
| 1045 | &function_begin("ecp_nistz256_select_w5"); | ||
| 1046 | &mov ("esi",&wparam(1)); | ||
| 1047 | &mov ("ebp",&wparam(2)); | ||
| 1048 | |||
| 1049 | &lea ("esi",&DWP(0,"esi","ebp",4)); | ||
| 1050 | &neg ("ebp"); | ||
| 1051 | &sar ("ebp",31); | ||
| 1052 | &mov ("edi",&wparam(0)); | ||
| 1053 | &lea ("esi",&DWP(0,"esi","ebp",4)); | ||
| 1054 | |||
| 1055 | for($i=0;$i<24;$i+=4) { | ||
| 1056 | &mov ("eax",&DWP(64*($i+0),"esi")); | ||
| 1057 | &mov ("ebx",&DWP(64*($i+1),"esi")); | ||
| 1058 | &mov ("ecx",&DWP(64*($i+2),"esi")); | ||
| 1059 | &mov ("edx",&DWP(64*($i+3),"esi")); | ||
| 1060 | &and ("eax","ebp"); | ||
| 1061 | &and ("ebx","ebp"); | ||
| 1062 | &and ("ecx","ebp"); | ||
| 1063 | &and ("edx","ebp"); | ||
| 1064 | &mov (&DWP(4*($i+0),"edi"),"eax"); | ||
| 1065 | &mov (&DWP(4*($i+1),"edi"),"ebx"); | ||
| 1066 | &mov (&DWP(4*($i+2),"edi"),"ecx"); | ||
| 1067 | &mov (&DWP(4*($i+3),"edi"),"edx"); | ||
| 1068 | } | ||
| 1069 | &function_end("ecp_nistz256_select_w5"); | ||
| 1070 | |||
| 1071 | ######################################################################## | ||
| 1072 | # void ecp_nistz256_select_w7(P256_POINT_AFFINE *edi,const void *esi, | ||
| 1073 | # int ebp); | ||
| 1074 | &function_begin("ecp_nistz256_select_w7"); | ||
| 1075 | &mov ("esi",&wparam(1)); | ||
| 1076 | &mov ("ebp",&wparam(2)); | ||
| 1077 | |||
| 1078 | &add ("esi","ebp"); | ||
| 1079 | &neg ("ebp"); | ||
| 1080 | &sar ("ebp",31); | ||
| 1081 | &mov ("edi",&wparam(0)); | ||
| 1082 | &lea ("esi",&DWP(0,"esi","ebp")); | ||
| 1083 | |||
| 1084 | for($i=0;$i<64;$i+=4) { | ||
| 1085 | &movz ("eax",&BP(64*($i+0),"esi")); | ||
| 1086 | &movz ("ebx",&BP(64*($i+1),"esi")); | ||
| 1087 | &movz ("ecx",&BP(64*($i+2),"esi")); | ||
| 1088 | &and ("eax","ebp"); | ||
| 1089 | &movz ("edx",&BP(64*($i+3),"esi")); | ||
| 1090 | &and ("ebx","ebp"); | ||
| 1091 | &mov (&BP($i+0,"edi"),"al"); | ||
| 1092 | &and ("ecx","ebp"); | ||
| 1093 | &mov (&BP($i+1,"edi"),"bl"); | ||
| 1094 | &and ("edx","ebp"); | ||
| 1095 | &mov (&BP($i+2,"edi"),"cl"); | ||
| 1096 | &mov (&BP($i+3,"edi"),"dl"); | ||
| 1097 | } | ||
| 1098 | &function_end("ecp_nistz256_select_w7"); | ||
| 1099 | |||
| 1100 | ######################################################################## | ||
| 1101 | # The following subroutines are a "literal" implementation of those found | ||
| 1102 | # in ecp_nistz256.c. | ||
| 1103 | # | ||
| 1104 | ######################################################################## | ||
| 1105 | # void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp); | ||
| 1106 | # | ||
| 1107 | &static_label("point_double_shortcut"); | ||
| 1108 | &function_begin("ecp_nistz256_point_double"); | ||
| 1109 | { my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4)); | ||
| 1110 | |||
| 1111 | &mov ("esi",&wparam(1)); | ||
| 1112 | |||
| 1113 | # The above map() describes the stack layout: 5 temporary | ||
| 1114 | # 256-bit vectors on top, plus one extra word for a copy of | ||
| 1115 | # OPENSSL_ia32cap_P. | ||
| 1116 | &stack_push(8*5+1); | ||
| 1117 | if ($sse2) { | ||
| 1118 | &call ("_picup_eax"); | ||
| 1119 | &set_label("pic"); | ||
| 1120 | &picmeup("edx","OPENSSL_ia32cap_P","eax",&label("pic")); | ||
| 1121 | &mov ("ebp",&DWP(0,"edx")); } | ||
| 1122 | |||
| 1123 | &set_label("point_double_shortcut"); | ||
| 1124 | &mov ("eax",&DWP(0,"esi")); # copy in_x | ||
| 1125 | &mov ("ebx",&DWP(4,"esi")); | ||
| 1126 | &mov ("ecx",&DWP(8,"esi")); | ||
| 1127 | &mov ("edx",&DWP(12,"esi")); | ||
| 1128 | &mov (&DWP($in_x+0,"esp"),"eax"); | ||
| 1129 | &mov (&DWP($in_x+4,"esp"),"ebx"); | ||
| 1130 | &mov (&DWP($in_x+8,"esp"),"ecx"); | ||
| 1131 | &mov (&DWP($in_x+12,"esp"),"edx"); | ||
| 1132 | &mov ("eax",&DWP(16,"esi")); | ||
| 1133 | &mov ("ebx",&DWP(20,"esi")); | ||
| 1134 | &mov ("ecx",&DWP(24,"esi")); | ||
| 1135 | &mov ("edx",&DWP(28,"esi")); | ||
| 1136 | &mov (&DWP($in_x+16,"esp"),"eax"); | ||
| 1137 | &mov (&DWP($in_x+20,"esp"),"ebx"); | ||
| 1138 | &mov (&DWP($in_x+24,"esp"),"ecx"); | ||
| 1139 | &mov (&DWP($in_x+28,"esp"),"edx"); | ||
| 1140 | &mov (&DWP(32*5,"esp"),"ebp"); # OPENSSL_ia32cap_P copy | ||
| 1141 | |||
| 1142 | &lea ("ebp",&DWP(32,"esi")); | ||
| 1143 | &lea ("esi",&DWP(32,"esi")); | ||
| 1144 | &lea ("edi",&DWP($S,"esp")); | ||
| 1145 | &call ("_ecp_nistz256_add"); # p256_mul_by_2(S, in_y); | ||
| 1146 | |||
| 1147 | &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1148 | &mov ("esi",64); | ||
| 1149 | &add ("esi",&wparam(1)); | ||
| 1150 | &lea ("edi",&DWP($Zsqr,"esp")); | ||
| 1151 | &mov ("ebp","esi"); | ||
| 1152 | &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Zsqr, in_z); | ||
| 1153 | |||
| 1154 | &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1155 | &lea ("esi",&DWP($S,"esp")); | ||
| 1156 | &lea ("ebp",&DWP($S,"esp")); | ||
| 1157 | &lea ("edi",&DWP($S,"esp")); | ||
| 1158 | &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(S, S); | ||
| 1159 | |||
| 1160 | &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1161 | &mov ("ebp",&wparam(1)); | ||
| 1162 | &lea ("esi",&DWP(32,"ebp")); | ||
| 1163 | &lea ("ebp",&DWP(64,"ebp")); | ||
| 1164 | &lea ("edi",&DWP($tmp0,"esp")); | ||
| 1165 | &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(tmp0, in_z, in_y); | ||
| 1166 | |||
| 1167 | &lea ("esi",&DWP($in_x,"esp")); | ||
| 1168 | &lea ("ebp",&DWP($Zsqr,"esp")); | ||
| 1169 | &lea ("edi",&DWP($M,"esp")); | ||
| 1170 | &call ("_ecp_nistz256_add"); # p256_add(M, in_x, Zsqr); | ||
| 1171 | |||
| 1172 | &mov ("edi",64); | ||
| 1173 | &lea ("esi",&DWP($tmp0,"esp")); | ||
| 1174 | &lea ("ebp",&DWP($tmp0,"esp")); | ||
| 1175 | &add ("edi",&wparam(0)); | ||
| 1176 | &call ("_ecp_nistz256_add"); # p256_mul_by_2(res_z, tmp0); | ||
| 1177 | |||
| 1178 | &lea ("esi",&DWP($in_x,"esp")); | ||
| 1179 | &lea ("ebp",&DWP($Zsqr,"esp")); | ||
| 1180 | &lea ("edi",&DWP($Zsqr,"esp")); | ||
| 1181 | &call ("_ecp_nistz256_sub"); # p256_sub(Zsqr, in_x, Zsqr); | ||
| 1182 | |||
| 1183 | &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1184 | &lea ("esi",&DWP($S,"esp")); | ||
| 1185 | &lea ("ebp",&DWP($S,"esp")); | ||
| 1186 | &lea ("edi",&DWP($tmp0,"esp")); | ||
| 1187 | &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(tmp0, S); | ||
| 1188 | |||
| 1189 | &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1190 | &lea ("esi",&DWP($M,"esp")); | ||
| 1191 | &lea ("ebp",&DWP($Zsqr,"esp")); | ||
| 1192 | &lea ("edi",&DWP($M,"esp")); | ||
| 1193 | &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(M, M, Zsqr); | ||
| 1194 | |||
| 1195 | &mov ("edi",32); | ||
| 1196 | &lea ("esi",&DWP($tmp0,"esp")); | ||
| 1197 | &add ("edi",&wparam(0)); | ||
| 1198 | &call ("_ecp_nistz256_div_by_2"); # p256_div_by_2(res_y, tmp0); | ||
| 1199 | |||
| 1200 | &lea ("esi",&DWP($M,"esp")); | ||
| 1201 | &lea ("ebp",&DWP($M,"esp")); | ||
| 1202 | &lea ("edi",&DWP($tmp0,"esp")); | ||
| 1203 | &call ("_ecp_nistz256_add"); # 1/2 p256_mul_by_3(M, M); | ||
| 1204 | |||
| 1205 | &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1206 | &lea ("esi",&DWP($in_x,"esp")); | ||
| 1207 | &lea ("ebp",&DWP($S,"esp")); | ||
| 1208 | &lea ("edi",&DWP($S,"esp")); | ||
| 1209 | &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S, S, in_x); | ||
| 1210 | |||
| 1211 | &lea ("esi",&DWP($tmp0,"esp")); | ||
| 1212 | &lea ("ebp",&DWP($M,"esp")); | ||
| 1213 | &lea ("edi",&DWP($M,"esp")); | ||
| 1214 | &call ("_ecp_nistz256_add"); # 2/2 p256_mul_by_3(M, M); | ||
| 1215 | |||
| 1216 | &lea ("esi",&DWP($S,"esp")); | ||
| 1217 | &lea ("ebp",&DWP($S,"esp")); | ||
| 1218 | &lea ("edi",&DWP($tmp0,"esp")); | ||
| 1219 | &call ("_ecp_nistz256_add"); # p256_mul_by_2(tmp0, S); | ||
| 1220 | |||
| 1221 | &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1222 | &lea ("esi",&DWP($M,"esp")); | ||
| 1223 | &lea ("ebp",&DWP($M,"esp")); | ||
| 1224 | &mov ("edi",&wparam(0)); | ||
| 1225 | &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(res_x, M); | ||
| 1226 | |||
| 1227 | &mov ("esi","edi"); # %edi is still res_x here | ||
| 1228 | &lea ("ebp",&DWP($tmp0,"esp")); | ||
| 1229 | &call ("_ecp_nistz256_sub"); # p256_sub(res_x, res_x, tmp0); | ||
| 1230 | |||
| 1231 | &lea ("esi",&DWP($S,"esp")); | ||
| 1232 | &mov ("ebp","edi"); # %edi is still res_x | ||
| 1233 | &lea ("edi",&DWP($S,"esp")); | ||
| 1234 | &call ("_ecp_nistz256_sub"); # p256_sub(S, S, res_x); | ||
| 1235 | |||
| 1236 | &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1237 | &mov ("esi","edi"); # %edi is still &S | ||
| 1238 | &lea ("ebp",&DWP($M,"esp")); | ||
| 1239 | &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S, S, M); | ||
| 1240 | |||
| 1241 | &mov ("ebp",32); | ||
| 1242 | &lea ("esi",&DWP($S,"esp")); | ||
| 1243 | &add ("ebp",&wparam(0)); | ||
| 1244 | &mov ("edi","ebp"); | ||
| 1245 | &call ("_ecp_nistz256_sub"); # p256_sub(res_y, S, res_y); | ||
| 1246 | |||
| 1247 | &stack_pop(8*5+1); | ||
| 1248 | } &function_end("ecp_nistz256_point_double"); | ||
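For cross-checking, the sequence of p256_* calls traced in the comments above can be replayed with plain bignum arithmetic. This is a sketch outside the Montgomery domain (mul_mont is modelled as an ordinary modular multiply, which preserves the shape of the formulas), and the helper names are illustrative only:

    use Math::BigInt;
    my $p = Math::BigInt->from_hex(
        '0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff');
    sub mmul { ($_[0] * $_[1]) % $p }     # stands in for p256_mul_mont
    sub madd { ($_[0] + $_[1]) % $p }     # p256_add / p256_mul_by_2
    sub msub { ($_[0] - $_[1]) % $p }     # p256_sub; % keeps the result >= 0
    sub mhlf { my $a = $_[0]->copy();     # p256_div_by_2
               $a->badd($p) if $a->is_odd(); $a->brsft(1) }

    sub point_double {                    # Jacobian (X,Y,Z), curve a = -3
        my ($in_x, $in_y, $in_z) = @_;
        my $S     = madd($in_y, $in_y);   # S = 2*in_y
        my $Zsqr  = mmul($in_z, $in_z);   # Zsqr = in_z^2
        $S        = mmul($S, $S);         # S = 4*in_y^2
        my $tmp0  = mmul($in_z, $in_y);   # tmp0 = in_z*in_y
        my $M     = madd($in_x, $Zsqr);   # M = in_x + Zsqr
        my $res_z = madd($tmp0, $tmp0);   # res_z = 2*in_y*in_z
        $Zsqr     = msub($in_x, $Zsqr);   # Zsqr = in_x - Zsqr
        $tmp0     = mmul($S, $S);         # tmp0 = 16*in_y^4
        $M        = mmul($M, $Zsqr);      # M = in_x^2 - in_z^4
        my $res_y = mhlf($tmp0);          # res_y = 8*in_y^4
        $tmp0     = madd($M, $M);         # 1/2 of mul_by_3
        $S        = mmul($S, $in_x);      # S = 4*in_x*in_y^2
        $M        = madd($tmp0, $M);      # 2/2: M = 3*(in_x^2 - in_z^4)
        $tmp0     = madd($S, $S);         # tmp0 = 2*S
        my $res_x = mmul($M, $M);         # res_x = M^2
        $res_x    = msub($res_x, $tmp0);  # res_x = M^2 - 2*S
        $S        = msub($S, $res_x);
        $S        = mmul($S, $M);         # S = M*(S - res_x)
        $res_y    = msub($S, $res_y);     # res_y = S - 8*in_y^4
        return ($res_x, $res_y, $res_z);
    }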
| 1249 | |||
| 1250 | ######################################################################## | ||
| 1251 | # void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1, | ||
| 1252 | # const P256_POINT *in2); | ||
| 1253 | &function_begin("ecp_nistz256_point_add"); | ||
| 1254 | { my ($res_x,$res_y,$res_z, | ||
| 1255 | $in1_x,$in1_y,$in1_z, | ||
| 1256 | $in2_x,$in2_y,$in2_z, | ||
| 1257 | $H,$Hsqr,$R,$Rsqr,$Hcub, | ||
| 1258 | $U1,$U2,$S1,$S2)=map(32*$_,(0..17)); | ||
| 1259 | my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); | ||
| 1260 | |||
| 1261 | &mov ("esi",&wparam(2)); | ||
| 1262 | |||
| 1263 | # The above map() describes the stack layout: 18 temporary | ||
| 1264 | # 256-bit vectors on top, plus extra words for !in1infty, | ||
| 1265 | # !in2infty, the result of the zero check, and a copy of | ||
| 1266 | # OPENSSL_ia32cap_P. [one unused word for padding] | ||
| 1267 | &stack_push(8*18+5); | ||
| 1268 | if ($sse2) { | ||
| 1269 | &call ("_picup_eax"); | ||
| 1270 | &set_label("pic"); | ||
| 1271 | &picmeup("edx","OPENSSL_ia32cap_P","eax",&label("pic")); | ||
| 1272 | &mov ("ebp",&DWP(0,"edx")); } | ||
| 1273 | |||
| 1274 | &lea ("edi",&DWP($in2_x,"esp")); | ||
| 1275 | for($i=0;$i<96;$i+=16) { | ||
| 1276 | &mov ("eax",&DWP($i+0,"esi")); # copy in2 | ||
| 1277 | &mov ("ebx",&DWP($i+4,"esi")); | ||
| 1278 | &mov ("ecx",&DWP($i+8,"esi")); | ||
| 1279 | &mov ("edx",&DWP($i+12,"esi")); | ||
| 1280 | &mov (&DWP($i+0,"edi"),"eax"); | ||
| 1281 | &mov (&DWP(32*18+12,"esp"),"ebp") if ($i==0); | ||
| 1282 | &mov ("ebp","eax") if ($i==64); | ||
| 1283 | &or ("ebp","eax") if ($i>64); | ||
| 1284 | &mov (&DWP($i+4,"edi"),"ebx"); | ||
| 1285 | &or ("ebp","ebx") if ($i>=64); | ||
| 1286 | &mov (&DWP($i+8,"edi"),"ecx"); | ||
| 1287 | &or ("ebp","ecx") if ($i>=64); | ||
| 1288 | &mov (&DWP($i+12,"edi"),"edx"); | ||
| 1289 | &or ("ebp","edx") if ($i>=64); | ||
| 1290 | } | ||
| 1291 | &xor ("eax","eax"); | ||
| 1292 | &mov ("esi",&wparam(1)); | ||
| 1293 | &sub ("eax","ebp"); | ||
| 1294 | &or ("ebp","eax"); | ||
| 1295 | &sar ("ebp",31); | ||
| 1296 | &mov (&DWP(32*18+4,"esp"),"ebp"); # !in2infty | ||
| 1297 | |||
| 1298 | &lea ("edi",&DWP($in1_x,"esp")); | ||
| 1299 | for($i=0;$i<96;$i+=16) { | ||
| 1300 | &mov ("eax",&DWP($i+0,"esi")); # copy in1 | ||
| 1301 | &mov ("ebx",&DWP($i+4,"esi")); | ||
| 1302 | &mov ("ecx",&DWP($i+8,"esi")); | ||
| 1303 | &mov ("edx",&DWP($i+12,"esi")); | ||
| 1304 | &mov (&DWP($i+0,"edi"),"eax"); | ||
| 1305 | &mov ("ebp","eax") if ($i==64); | ||
| 1306 | &or ("ebp","eax") if ($i>64); | ||
| 1307 | &mov (&DWP($i+4,"edi"),"ebx"); | ||
| 1308 | &or ("ebp","ebx") if ($i>=64); | ||
| 1309 | &mov (&DWP($i+8,"edi"),"ecx"); | ||
| 1310 | &or ("ebp","ecx") if ($i>=64); | ||
| 1311 | &mov (&DWP($i+12,"edi"),"edx"); | ||
| 1312 | &or ("ebp","edx") if ($i>=64); | ||
| 1313 | } | ||
| 1314 | &xor ("eax","eax"); | ||
| 1315 | &sub ("eax","ebp"); | ||
| 1316 | &or ("ebp","eax"); | ||
| 1317 | &sar ("ebp",31); | ||
| 1318 | &mov (&DWP(32*18+0,"esp"),"ebp"); # !in1infty | ||
| 1319 | |||
| 1320 | &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1321 | &lea ("esi",&DWP($in2_z,"esp")); | ||
| 1322 | &lea ("ebp",&DWP($in2_z,"esp")); | ||
| 1323 | &lea ("edi",&DWP($Z2sqr,"esp")); | ||
| 1324 | &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Z2sqr, in2_z); | ||
| 1325 | |||
| 1326 | &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1327 | &lea ("esi",&DWP($in1_z,"esp")); | ||
| 1328 | &lea ("ebp",&DWP($in1_z,"esp")); | ||
| 1329 | &lea ("edi",&DWP($Z1sqr,"esp")); | ||
| 1330 | &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Z1sqr, in1_z); | ||
| 1331 | |||
| 1332 | &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1333 | &lea ("esi",&DWP($Z2sqr,"esp")); | ||
| 1334 | &lea ("ebp",&DWP($in2_z,"esp")); | ||
| 1335 | &lea ("edi",&DWP($S1,"esp")); | ||
| 1336 | &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S1, Z2sqr, in2_z); | ||
| 1337 | |||
| 1338 | &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1339 | &lea ("esi",&DWP($Z1sqr,"esp")); | ||
| 1340 | &lea ("ebp",&DWP($in1_z,"esp")); | ||
| 1341 | &lea ("edi",&DWP($S2,"esp")); | ||
| 1342 | &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S2, Z1sqr, in1_z); | ||
| 1343 | |||
| 1344 | &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1345 | &lea ("esi",&DWP($in1_y,"esp")); | ||
| 1346 | &lea ("ebp",&DWP($S1,"esp")); | ||
| 1347 | &lea ("edi",&DWP($S1,"esp")); | ||
| 1348 | &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S1, S1, in1_y); | ||
| 1349 | |||
| 1350 | &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1351 | &lea ("esi",&DWP($in2_y,"esp")); | ||
| 1352 | &lea ("ebp",&DWP($S2,"esp")); | ||
| 1353 | &lea ("edi",&DWP($S2,"esp")); | ||
| 1354 | &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S2, S2, in2_y); | ||
| 1355 | |||
| 1356 | &lea ("esi",&DWP($S2,"esp")); | ||
| 1357 | &lea ("ebp",&DWP($S1,"esp")); | ||
| 1358 | &lea ("edi",&DWP($R,"esp")); | ||
| 1359 | &call ("_ecp_nistz256_sub"); # p256_sub(R, S2, S1); | ||
| 1360 | |||
| 1361 | &or ("ebx","eax"); # see if result is zero | ||
| 1362 | &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1363 | &or ("ebx","ecx"); | ||
| 1364 | &or ("ebx","edx"); | ||
| 1365 | &or ("ebx",&DWP(0,"edi")); | ||
| 1366 | &or ("ebx",&DWP(4,"edi")); | ||
| 1367 | &lea ("esi",&DWP($in1_x,"esp")); | ||
| 1368 | &or ("ebx",&DWP(8,"edi")); | ||
| 1369 | &lea ("ebp",&DWP($Z2sqr,"esp")); | ||
| 1370 | &or ("ebx",&DWP(12,"edi")); | ||
| 1371 | &lea ("edi",&DWP($U1,"esp")); | ||
| 1372 | &mov (&DWP(32*18+8,"esp"),"ebx"); | ||
| 1373 | |||
| 1374 | &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(U1, in1_x, Z2sqr); | ||
| 1375 | |||
| 1376 | &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1377 | &lea ("esi",&DWP($in2_x,"esp")); | ||
| 1378 | &lea ("ebp",&DWP($Z1sqr,"esp")); | ||
| 1379 | &lea ("edi",&DWP($U2,"esp")); | ||
| 1380 | &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(U2, in2_x, Z1sqr); | ||
| 1381 | |||
| 1382 | &lea ("esi",&DWP($U2,"esp")); | ||
| 1383 | &lea ("ebp",&DWP($U1,"esp")); | ||
| 1384 | &lea ("edi",&DWP($H,"esp")); | ||
| 1385 | &call ("_ecp_nistz256_sub"); # p256_sub(H, U2, U1); | ||
| 1386 | |||
| 1387 | &or ("eax","ebx"); # see if result is zero | ||
| 1388 | &or ("eax","ecx"); | ||
| 1389 | &or ("eax","edx"); | ||
| 1390 | &or ("eax",&DWP(0,"edi")); | ||
| 1391 | &or ("eax",&DWP(4,"edi")); | ||
| 1392 | &or ("eax",&DWP(8,"edi")); | ||
| 1393 | &or ("eax",&DWP(12,"edi")); | ||
| 1394 | |||
| 1395 | &data_byte(0x3e); # predict taken | ||
| 1396 | &jnz (&label("add_proceed")); # is_equal(U1,U2)? | ||
| 1397 | |||
| 1398 | &mov ("eax",&DWP(32*18+0,"esp")); | ||
| 1399 | &and ("eax",&DWP(32*18+4,"esp")); | ||
| 1400 | &mov ("ebx",&DWP(32*18+8,"esp")); | ||
| 1401 | &jz (&label("add_proceed")); # (in1infty || in2infty)? | ||
| 1402 | &test ("ebx","ebx"); | ||
| 1403 | &jz (&label("add_double")); # is_equal(S1,S2)? | ||
| 1404 | |||
| 1405 | &mov ("edi",&wparam(0)); | ||
| 1406 | &xor ("eax","eax"); | ||
| 1407 | &mov ("ecx",96/4); | ||
| 1408 | &data_byte(0xfc,0xf3,0xab); # cld; rep stosd | ||
| 1409 | &jmp (&label("add_done")); | ||
| 1410 | |||
| 1411 | &set_label("add_double",16); | ||
| 1412 | &mov ("esi",&wparam(1)); | ||
| 1413 | &mov ("ebp",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1414 | &add ("esp",4*((8*18+5)-(8*5+1))); # difference in frame sizes | ||
| 1415 | &jmp (&label("point_double_shortcut")); | ||
| 1416 | |||
| 1417 | &set_label("add_proceed",16); | ||
| 1418 | &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1419 | &lea ("esi",&DWP($R,"esp")); | ||
| 1420 | &lea ("ebp",&DWP($R,"esp")); | ||
| 1421 | &lea ("edi",&DWP($Rsqr,"esp")); | ||
| 1422 | &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Rsqr, R); | ||
| 1423 | |||
| 1424 | &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1425 | &lea ("esi",&DWP($H,"esp")); | ||
| 1426 | &lea ("ebp",&DWP($in1_z,"esp")); | ||
| 1427 | &lea ("edi",&DWP($res_z,"esp")); | ||
| 1428 | &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(res_z, H, in1_z); | ||
| 1429 | |||
| 1430 | &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1431 | &lea ("esi",&DWP($H,"esp")); | ||
| 1432 | &lea ("ebp",&DWP($H,"esp")); | ||
| 1433 | &lea ("edi",&DWP($Hsqr,"esp")); | ||
| 1434 | &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Hsqr, H); | ||
| 1435 | |||
| 1436 | &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1437 | &lea ("esi",&DWP($in2_z,"esp")); | ||
| 1438 | &lea ("ebp",&DWP($res_z,"esp")); | ||
| 1439 | &lea ("edi",&DWP($res_z,"esp")); | ||
| 1440 | &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(res_z, res_z, in2_z); | ||
| 1441 | |||
| 1442 | &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1443 | &lea ("esi",&DWP($Hsqr,"esp")); | ||
| 1444 | &lea ("ebp",&DWP($U1,"esp")); | ||
| 1445 | &lea ("edi",&DWP($U2,"esp")); | ||
| 1446 | &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(U2, U1, Hsqr); | ||
| 1447 | |||
| 1448 | &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1449 | &lea ("esi",&DWP($H,"esp")); | ||
| 1450 | &lea ("ebp",&DWP($Hsqr,"esp")); | ||
| 1451 | &lea ("edi",&DWP($Hcub,"esp")); | ||
| 1452 | &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(Hcub, Hsqr, H); | ||
| 1453 | |||
| 1454 | &lea ("esi",&DWP($U2,"esp")); | ||
| 1455 | &lea ("ebp",&DWP($U2,"esp")); | ||
| 1456 | &lea ("edi",&DWP($Hsqr,"esp")); | ||
| 1457 | &call ("_ecp_nistz256_add"); # p256_mul_by_2(Hsqr, U2); | ||
| 1458 | |||
| 1459 | &lea ("esi",&DWP($Rsqr,"esp")); | ||
| 1460 | &lea ("ebp",&DWP($Hsqr,"esp")); | ||
| 1461 | &lea ("edi",&DWP($res_x,"esp")); | ||
| 1462 | &call ("_ecp_nistz256_sub"); # p256_sub(res_x, Rsqr, Hsqr); | ||
| 1463 | |||
| 1464 | &lea ("esi",&DWP($res_x,"esp")); | ||
| 1465 | &lea ("ebp",&DWP($Hcub,"esp")); | ||
| 1466 | &lea ("edi",&DWP($res_x,"esp")); | ||
| 1467 | &call ("_ecp_nistz256_sub"); # p256_sub(res_x, res_x, Hcub); | ||
| 1468 | |||
| 1469 | &lea ("esi",&DWP($U2,"esp")); | ||
| 1470 | &lea ("ebp",&DWP($res_x,"esp")); | ||
| 1471 | &lea ("edi",&DWP($res_y,"esp")); | ||
| 1472 | &call ("_ecp_nistz256_sub"); # p256_sub(res_y, U2, res_x); | ||
| 1473 | |||
| 1474 | &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1475 | &lea ("esi",&DWP($Hcub,"esp")); | ||
| 1476 | &lea ("ebp",&DWP($S1,"esp")); | ||
| 1477 | &lea ("edi",&DWP($S2,"esp")); | ||
| 1478 | &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S2, S1, Hcub); | ||
| 1479 | |||
| 1480 | &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1481 | &lea ("esi",&DWP($R,"esp")); | ||
| 1482 | &lea ("ebp",&DWP($res_y,"esp")); | ||
| 1483 | &lea ("edi",&DWP($res_y,"esp")); | ||
| 1484 | &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(res_y, R, res_y); | ||
| 1485 | |||
| 1486 | &lea ("esi",&DWP($res_y,"esp")); | ||
| 1487 | &lea ("ebp",&DWP($S2,"esp")); | ||
| 1488 | &lea ("edi",&DWP($res_y,"esp")); | ||
| 1489 | &call ("_ecp_nistz256_sub"); # p256_sub(res_y, res_y, S2); | ||
| 1490 | |||
| 1491 | &mov ("ebp",&DWP(32*18+0,"esp")); # !in1infty | ||
| 1492 | &mov ("esi",&DWP(32*18+4,"esp")); # !in2infty | ||
| 1493 | &mov ("edi",&wparam(0)); | ||
| 1494 | &mov ("edx","ebp"); | ||
| 1495 | ¬ ("ebp"); | ||
| 1496 | &and ("edx","esi"); | ||
| 1497 | &and ("ebp","esi"); | ||
| 1498 | ¬ ("esi"); | ||
| 1499 | |||
| 1500 | ######################################## | ||
| 1501 | # conditional moves | ||
| 1502 | for($i=64;$i<96;$i+=4) { | ||
| 1503 | &mov ("eax","edx"); | ||
| 1504 | &and ("eax",&DWP($res_x+$i,"esp")); | ||
| 1505 | &mov ("ebx","ebp"); | ||
| 1506 | &and ("ebx",&DWP($in2_x+$i,"esp")); | ||
| 1507 | &mov ("ecx","esi"); | ||
| 1508 | &and ("ecx",&DWP($in1_x+$i,"esp")); | ||
| 1509 | &or ("eax","ebx"); | ||
| 1510 | &or ("eax","ecx"); | ||
| 1511 | &mov (&DWP($i,"edi"),"eax"); | ||
| 1512 | } | ||
| 1513 | for($i=0;$i<64;$i+=4) { | ||
| 1514 | &mov ("eax","edx"); | ||
| 1515 | &and ("eax",&DWP($res_x+$i,"esp")); | ||
| 1516 | &mov ("ebx","ebp"); | ||
| 1517 | &and ("ebx",&DWP($in2_x+$i,"esp")); | ||
| 1518 | &mov ("ecx","esi"); | ||
| 1519 | &and ("ecx",&DWP($in1_x+$i,"esp")); | ||
| 1520 | &or ("eax","ebx"); | ||
| 1521 | &or ("eax","ecx"); | ||
| 1522 | &mov (&DWP($i,"edi"),"eax"); | ||
| 1523 | } | ||
| 1524 | &set_label("add_done"); | ||
| 1525 | &stack_pop(8*18+5); | ||
| 1526 | } &function_end("ecp_nistz256_point_add"); | ||
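The two "conditional moves" loops above are a branchless three-way select. The words stored at 32*18+0 and 32*18+4 are 0/all-ones masks produced by the xor/sub/or/sar idiom (the sign bit of w | -w is set exactly when w != 0), and %edx/%ebp/%esi are derived from them so that the output is the computed sum when both inputs are finite, in2 when in1 is the point at infinity, and in1 when in2 is. A rough Perl model of both pieces, with illustrative names:

    # Sketch only: the infinity masks and the final blend of point_add.
    use strict;
    use warnings;

    sub nonzero_mask {    # xor eax,eax; sub eax,ebp; or ebp,eax; sar ebp,31
        my $w = shift;                        # OR of all Z-coordinate words
        my $neg = (-$w) & 0xffffffff;
        return (($w | $neg) & 0x80000000) ? 0xffffffff : 0;
    }

    sub select_point {    # the word-wise blend loops
        my ($res, $in1, $in2, $m1, $m2) = @_; # $m1 = !in1infty, $m2 = !in2infty
        my $mask_res = $m1 & $m2;                # both finite: keep computed sum
        my $mask_in2 = ~$m1 & $m2 & 0xffffffff;  # in1 at infinity: copy in2
        my $mask_in1 = ~$m2 & 0xffffffff;        # in2 at infinity: copy in1
        my @out;
        for my $i (0 .. $#$res) {                # word-by-word, no branches
            $out[$i] = ($res->[$i] & $mask_res)
                     | ($in2->[$i] & $mask_in2)
                     | ($in1->[$i] & $mask_in1);
        }
        return \@out;
    }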
| 1527 | |||
| 1528 | ######################################################################## | ||
| 1529 | # void ecp_nistz256_point_add_affine(P256_POINT *out, | ||
| 1530 | # const P256_POINT *in1, | ||
| 1531 | # const P256_POINT_AFFINE *in2); | ||
| 1532 | &function_begin("ecp_nistz256_point_add_affine"); | ||
| 1533 | { | ||
| 1534 | my ($res_x,$res_y,$res_z, | ||
| 1535 | $in1_x,$in1_y,$in1_z, | ||
| 1536 | $in2_x,$in2_y, | ||
| 1537 | $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..14)); | ||
| 1538 | my $Z1sqr = $S2; | ||
| 1539 | my @ONE_mont=(1,0,0,-1,-1,-1,-2,0); | ||
| 1540 | |||
| 1541 | &mov ("esi",&wparam(1)); | ||
| 1542 | |||
| 1543 | # above map() describes stack layout with 15 temporary | ||
| 1544 | # 256-bit vectors on top, then we take extra words for | ||
| 1545 | # !in1infty, !in2infty, and OPENSSL_ia32cap_P copy. | ||
| 1546 | &stack_push(8*15+3); | ||
| 1547 | if ($sse2) { | ||
| 1548 | &call ("_picup_eax"); | ||
| 1549 | &set_label("pic"); | ||
| 1550 | &picmeup("edx","OPENSSL_ia32cap_P","eax",&label("pic")); | ||
| 1551 | &mov ("ebp",&DWP(0,"edx")); } | ||
| 1552 | |||
| 1553 | &lea ("edi",&DWP($in1_x,"esp")); | ||
| 1554 | for($i=0;$i<96;$i+=16) { | ||
| 1555 | &mov ("eax",&DWP($i+0,"esi")); # copy in1 | ||
| 1556 | &mov ("ebx",&DWP($i+4,"esi")); | ||
| 1557 | &mov ("ecx",&DWP($i+8,"esi")); | ||
| 1558 | &mov ("edx",&DWP($i+12,"esi")); | ||
| 1559 | &mov (&DWP($i+0,"edi"),"eax"); | ||
| 1560 | &mov (&DWP(32*15+8,"esp"),"ebp") if ($i==0); | ||
| 1561 | &mov ("ebp","eax") if ($i==64); | ||
| 1562 | &or ("ebp","eax") if ($i>64); | ||
| 1563 | &mov (&DWP($i+4,"edi"),"ebx"); | ||
| 1564 | &or ("ebp","ebx") if ($i>=64); | ||
| 1565 | &mov (&DWP($i+8,"edi"),"ecx"); | ||
| 1566 | &or ("ebp","ecx") if ($i>=64); | ||
| 1567 | &mov (&DWP($i+12,"edi"),"edx"); | ||
| 1568 | &or ("ebp","edx") if ($i>=64); | ||
| 1569 | } | ||
| 1570 | &xor ("eax","eax"); | ||
| 1571 | &mov ("esi",&wparam(2)); | ||
| 1572 | &sub ("eax","ebp"); | ||
| 1573 | &or ("ebp","eax"); | ||
| 1574 | &sar ("ebp",31); | ||
| 1575 | &mov (&DWP(32*15+0,"esp"),"ebp"); # !in1infty | ||
| 1576 | |||
| 1577 | &lea ("edi",&DWP($in2_x,"esp")); | ||
| 1578 | for($i=0;$i<64;$i+=16) { | ||
| 1579 | &mov ("eax",&DWP($i+0,"esi")); # copy in2 | ||
| 1580 | &mov ("ebx",&DWP($i+4,"esi")); | ||
| 1581 | &mov ("ecx",&DWP($i+8,"esi")); | ||
| 1582 | &mov ("edx",&DWP($i+12,"esi")); | ||
| 1583 | &mov (&DWP($i+0,"edi"),"eax"); | ||
| 1584 | &mov ("ebp","eax") if ($i==0); | ||
| 1585 | &or ("ebp","eax") if ($i!=0); | ||
| 1586 | &mov (&DWP($i+4,"edi"),"ebx"); | ||
| 1587 | &or ("ebp","ebx"); | ||
| 1588 | &mov (&DWP($i+8,"edi"),"ecx"); | ||
| 1589 | &or ("ebp","ecx"); | ||
| 1590 | &mov (&DWP($i+12,"edi"),"edx"); | ||
| 1591 | &or ("ebp","edx"); | ||
| 1592 | } | ||
| 1593 | &xor ("ebx","ebx"); | ||
| 1594 | &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1595 | &sub ("ebx","ebp"); | ||
| 1596 | &lea ("esi",&DWP($in1_z,"esp")); | ||
| 1597 | &or ("ebx","ebp"); | ||
| 1598 | &lea ("ebp",&DWP($in1_z,"esp")); | ||
| 1599 | &sar ("ebx",31); | ||
| 1600 | &lea ("edi",&DWP($Z1sqr,"esp")); | ||
| 1601 | &mov (&DWP(32*15+4,"esp"),"ebx"); # !in2infty | ||
| 1602 | |||
| 1603 | &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Z1sqr, in1_z); | ||
| 1604 | |||
| 1605 | &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1606 | &lea ("esi",&DWP($in2_x,"esp")); | ||
| 1607 | &mov ("ebp","edi"); # %edi is still &Z1sqr | ||
| 1608 | &lea ("edi",&DWP($U2,"esp")); | ||
| 1609 | &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(U2, Z1sqr, in2_x); | ||
| 1610 | |||
| 1611 | &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1612 | &lea ("esi",&DWP($in1_z,"esp")); | ||
| 1613 | &lea ("ebp",&DWP($Z1sqr,"esp")); | ||
| 1614 | &lea ("edi",&DWP($S2,"esp")); | ||
| 1615 | &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S2, Z1sqr, in1_z); | ||
| 1616 | |||
| 1617 | &lea ("esi",&DWP($U2,"esp")); | ||
| 1618 | &lea ("ebp",&DWP($in1_x,"esp")); | ||
| 1619 | &lea ("edi",&DWP($H,"esp")); | ||
| 1620 | &call ("_ecp_nistz256_sub"); # p256_sub(H, U2, in1_x); | ||
| 1621 | |||
| 1622 | &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1623 | &lea ("esi",&DWP($in2_y,"esp")); | ||
| 1624 | &lea ("ebp",&DWP($S2,"esp")); | ||
| 1625 | &lea ("edi",&DWP($S2,"esp")); | ||
| 1626 | &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S2, S2, in2_y); | ||
| 1627 | |||
| 1628 | &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1629 | &lea ("esi",&DWP($in1_z,"esp")); | ||
| 1630 | &lea ("ebp",&DWP($H,"esp")); | ||
| 1631 | &lea ("edi",&DWP($res_z,"esp")); | ||
| 1632 | &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(res_z, H, in1_z); | ||
| 1633 | |||
| 1634 | &lea ("esi",&DWP($S2,"esp")); | ||
| 1635 | &lea ("ebp",&DWP($in1_y,"esp")); | ||
| 1636 | &lea ("edi",&DWP($R,"esp")); | ||
| 1637 | &call ("_ecp_nistz256_sub"); # p256_sub(R, S2, in1_y); | ||
| 1638 | |||
| 1639 | &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1640 | &lea ("esi",&DWP($H,"esp")); | ||
| 1641 | &lea ("ebp",&DWP($H,"esp")); | ||
| 1642 | &lea ("edi",&DWP($Hsqr,"esp")); | ||
| 1643 | &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Hsqr, H); | ||
| 1644 | |||
| 1645 | &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1646 | &lea ("esi",&DWP($R,"esp")); | ||
| 1647 | &lea ("ebp",&DWP($R,"esp")); | ||
| 1648 | &lea ("edi",&DWP($Rsqr,"esp")); | ||
| 1649 | &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Rsqr, R); | ||
| 1650 | |||
| 1651 | &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1652 | &lea ("esi",&DWP($in1_x,"esp")); | ||
| 1653 | &lea ("ebp",&DWP($Hsqr,"esp")); | ||
| 1654 | &lea ("edi",&DWP($U2,"esp")); | ||
| 1655 | &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(U2, in1_x, Hsqr); | ||
| 1656 | |||
| 1657 | &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1658 | &lea ("esi",&DWP($H,"esp")); | ||
| 1659 | &lea ("ebp",&DWP($Hsqr,"esp")); | ||
| 1660 | &lea ("edi",&DWP($Hcub,"esp")); | ||
| 1661 | &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(Hcub, Hsqr, H); | ||
| 1662 | |||
| 1663 | &lea ("esi",&DWP($U2,"esp")); | ||
| 1664 | &lea ("ebp",&DWP($U2,"esp")); | ||
| 1665 | &lea ("edi",&DWP($Hsqr,"esp")); | ||
| 1666 | &call ("_ecp_nistz256_add"); # p256_mul_by_2(Hsqr, U2); | ||
| 1667 | |||
| 1668 | &lea ("esi",&DWP($Rsqr,"esp")); | ||
| 1669 | &lea ("ebp",&DWP($Hsqr,"esp")); | ||
| 1670 | &lea ("edi",&DWP($res_x,"esp")); | ||
| 1671 | &call ("_ecp_nistz256_sub"); # p256_sub(res_x, Rsqr, Hsqr); | ||
| 1672 | |||
| 1673 | &lea ("esi",&DWP($res_x,"esp")); | ||
| 1674 | &lea ("ebp",&DWP($Hcub,"esp")); | ||
| 1675 | &lea ("edi",&DWP($res_x,"esp")); | ||
| 1676 | &call ("_ecp_nistz256_sub"); # p256_sub(res_x, res_x, Hcub); | ||
| 1677 | |||
| 1678 | &lea ("esi",&DWP($U2,"esp")); | ||
| 1679 | &lea ("ebp",&DWP($res_x,"esp")); | ||
| 1680 | &lea ("edi",&DWP($res_y,"esp")); | ||
| 1681 | &call ("_ecp_nistz256_sub"); # p256_sub(res_y, U2, res_x); | ||
| 1682 | |||
| 1683 | &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1684 | &lea ("esi",&DWP($Hcub,"esp")); | ||
| 1685 | &lea ("ebp",&DWP($in1_y,"esp")); | ||
| 1686 | &lea ("edi",&DWP($S2,"esp")); | ||
| 1687 | &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S2, Hcub, in1_y); | ||
| 1688 | |||
| 1689 | &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1690 | &lea ("esi",&DWP($R,"esp")); | ||
| 1691 | &lea ("ebp",&DWP($res_y,"esp")); | ||
| 1692 | &lea ("edi",&DWP($res_y,"esp")); | ||
| 1693 | &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(res_y, res_y, R); | ||
| 1694 | |||
| 1695 | &lea ("esi",&DWP($res_y,"esp")); | ||
| 1696 | &lea ("ebp",&DWP($S2,"esp")); | ||
| 1697 | &lea ("edi",&DWP($res_y,"esp")); | ||
| 1698 | &call ("_ecp_nistz256_sub"); # p256_sub(res_y, res_y, S2); | ||
| 1699 | |||
| 1700 | &mov ("ebp",&DWP(32*15+0,"esp")); # !in1infty | ||
| 1701 | &mov ("esi",&DWP(32*15+4,"esp")); # !in2infty | ||
| 1702 | &mov ("edi",&wparam(0)); | ||
| 1703 | &mov ("edx","ebp"); | ||
| 1704 | ¬ ("ebp"); | ||
| 1705 | &and ("edx","esi"); | ||
| 1706 | &and ("ebp","esi"); | ||
| 1707 | ¬ ("esi"); | ||
| 1708 | |||
| 1709 | ######################################## | ||
| 1710 | # conditional moves | ||
| 1711 | for($i=64;$i<96;$i+=4) { | ||
| 1712 | my $one=@ONE_mont[($i-64)/4]; | ||
| 1713 | |||
| 1714 | &mov ("eax","edx"); | ||
| 1715 | &and ("eax",&DWP($res_x+$i,"esp")); | ||
| 1716 | &mov ("ebx","ebp") if ($one && $one!=-1); | ||
| 1717 | &and ("ebx",$one) if ($one && $one!=-1); | ||
| 1718 | &mov ("ecx","esi"); | ||
| 1719 | &and ("ecx",&DWP($in1_x+$i,"esp")); | ||
| 1720 | &or ("eax",$one==-1?"ebp":"ebx") if ($one); | ||
| 1721 | &or ("eax","ecx"); | ||
| 1722 | &mov (&DWP($i,"edi"),"eax"); | ||
| 1723 | } | ||
| 1724 | for($i=0;$i<64;$i+=4) { | ||
| 1725 | &mov ("eax","edx"); | ||
| 1726 | &and ("eax",&DWP($res_x+$i,"esp")); | ||
| 1727 | &mov ("ebx","ebp"); | ||
| 1728 | &and ("ebx",&DWP($in2_x+$i,"esp")); | ||
| 1729 | &mov ("ecx","esi"); | ||
| 1730 | &and ("ecx",&DWP($in1_x+$i,"esp")); | ||
| 1731 | &or ("eax","ebx"); | ||
| 1732 | &or ("eax","ecx"); | ||
| 1733 | &mov (&DWP($i,"edi"),"eax"); | ||
| 1734 | } | ||
| 1735 | &stack_pop(8*15+3); | ||
| 1736 | } &function_end("ecp_nistz256_point_add_affine"); | ||
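In the affine variant the second input carries no Z coordinate; its Z is implicitly 1, i.e. R mod p in Montgomery form, which is what the @ONE_mont words encode (the $one==0 and $one==-1 special cases in the first blend loop merely skip dead and/mov instructions). A self-contained Math::BigInt check that the table really is 2^256 mod p:

    # Sketch only: verify @ONE_mont == 2^256 mod p (the Montgomery form of 1).
    use strict;
    use warnings;
    use Math::BigInt;

    my $p = Math::BigInt->from_hex(
        '0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff');
    my $R_mod_p = Math::BigInt->new(2)->bpow(256)->bmod($p);

    my @ONE_mont = (1, 0, 0, -1, -1, -1, -2, 0);   # signed 32-bit words, LE
    my $acc = Math::BigInt->new(0);
    for my $i (reverse 0 .. 7) {                   # most significant word first
        $acc = ($acc << 32) + ($ONE_mont[$i] & 0xffffffff);
    }
    print $acc == $R_mod_p ? "ONE_mont ok\n" : "ONE_mont mismatch\n";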
| 1737 | |||
| 1738 | &asm_finish(); | ||
| 1739 | |||
| 1740 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/ec/asm/ecp_nistz256-x86_64.pl b/src/lib/libcrypto/ec/asm/ecp_nistz256-x86_64.pl deleted file mode 100644 index b772aae742..0000000000 --- a/src/lib/libcrypto/ec/asm/ecp_nistz256-x86_64.pl +++ /dev/null | |||
| @@ -1,1971 +0,0 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | # $OpenBSD: ecp_nistz256-x86_64.pl,v 1.1 2016/11/04 17:33:20 miod Exp $ | ||
| 3 | # | ||
| 4 | # Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. | ||
| 5 | # | ||
| 6 | # Licensed under the OpenSSL license (the "License"). You may not use | ||
| 7 | # this file except in compliance with the License. You can obtain a copy | ||
| 8 | # in the file LICENSE in the source distribution or at | ||
| 9 | # https://www.openssl.org/source/license.html | ||
| 10 | |||
| 11 | # Copyright (c) 2014, Intel Corporation. | ||
| 12 | # | ||
| 13 | # Permission to use, copy, modify, and/or distribute this software for any | ||
| 14 | # purpose with or without fee is hereby granted, provided that the above | ||
| 15 | # copyright notice and this permission notice appear in all copies. | ||
| 16 | # | ||
| 17 | # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES | ||
| 18 | # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF | ||
| 19 | # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY | ||
| 20 | # SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES | ||
| 21 | # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION | ||
| 22 | # OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN | ||
| 23 | # CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | ||
| 24 | |||
| 25 | # Developers and authors: | ||
| 26 | # Shay Gueron (1, 2), and Vlad Krasnov (1) | ||
| 27 | # (1) Intel Corporation, Israel Development Center | ||
| 28 | # (2) University of Haifa | ||
| 29 | |||
| 30 | # Reference: | ||
| 31 | # S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with | ||
| 32 | # 256 Bit Primes" | ||
| 33 | |||
| 34 | # Further optimization by <appro@openssl.org>: | ||
| 35 | # | ||
| 36 | # this/original with/without -DECP_NISTZ256_ASM(*) | ||
| 37 | # Opteron +12-49% +110-150% | ||
| 38 | # Bulldozer +14-45% +175-210% | ||
| 39 | # P4 +18-46% n/a :-( | ||
| 40 | # Westmere +12-34% +80-87% | ||
| 41 | # Sandy Bridge +9-35% +110-120% | ||
| 42 | # Ivy Bridge +9-35% +110-125% | ||
| 43 | # Haswell +8-37% +140-160% | ||
| 44 | # Broadwell +18-58% +145-210% | ||
| 45 | # Atom +15-50% +130-180% | ||
| 46 | # VIA Nano +43-160% +300-480% | ||
| 47 | # | ||
| 48 | # (*) "without -DECP_NISTZ256_ASM" refers to build with | ||
| 49 | # "enable-ec_nistp_64_gcc_128"; | ||
| 50 | # | ||
| 51 | # Ranges denote minimum and maximum improvement coefficients depending | ||
| 52 | # on benchmark. Lower coefficients are for ECDSA sign, relatively fastest | ||
| 53 | # server-side operation. Keep in mind that +100% means 2x improvement. | ||
| 54 | |||
| 55 | $flavour = shift; | ||
| 56 | $output = shift; | ||
| 57 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } | ||
| 58 | |||
| 59 | $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); | ||
| 60 | |||
| 61 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
| 62 | ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | ||
| 63 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | ||
| 64 | die "can't locate x86_64-xlate.pl"; | ||
| 65 | |||
| 66 | open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; | ||
| 67 | *STDOUT=*OUT; | ||
| 68 | |||
| 69 | $code.=<<___; | ||
| 70 | .text | ||
| 71 | |||
| 72 | # The polynomial | ||
| 73 | .align 64 | ||
| 74 | .Lpoly: | ||
| 75 | .quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001 | ||
| 76 | |||
| 77 | .LOne: | ||
| 78 | .long 1,1,1,1,1,1,1,1 | ||
| 79 | .LTwo: | ||
| 80 | .long 2,2,2,2,2,2,2,2 | ||
| 81 | .LThree: | ||
| 82 | .long 3,3,3,3,3,3,3,3 | ||
| 83 | .LONE_mont: | ||
| 84 | .quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe | ||
| 85 | ___ | ||
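Spelled out, the quads above are the little-endian 64-bit limbs of the P-256 prime and of the Montgomery constant; one can check that

    p            = 2^256 - 2^224 + 2^192 + 2^96 - 1          (.Lpoly)
    2^256 mod p  = 2^256 - p = 2^224 - 2^192 - 2^96 + 1      (.LONE_mont)

while .LOne/.LTwo/.LThree are 8-lane 32-bit broadcast constants (.LOne drives the counters in the select_w5/select_w7 lookups further down).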
| 86 | |||
| 87 | { | ||
| 88 | ################################################################################ | ||
| 89 | # void ecp_nistz256_mul_by_2(uint64_t res[4], uint64_t a[4]); | ||
| 90 | |||
| 91 | my ($a0,$a1,$a2,$a3)=map("%r$_",(8..11)); | ||
| 92 | my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rdx","%rcx","%r12","%r13"); | ||
| 93 | my ($r_ptr,$a_ptr,$b_ptr)=("%rdi","%rsi","%rdx"); | ||
| 94 | |||
| 95 | $code.=<<___; | ||
| 96 | |||
| 97 | .globl ecp_nistz256_mul_by_2 | ||
| 98 | .type ecp_nistz256_mul_by_2,\@function,2 | ||
| 99 | .align 64 | ||
| 100 | ecp_nistz256_mul_by_2: | ||
| 101 | push %r12 | ||
| 102 | push %r13 | ||
| 103 | |||
| 104 | mov 8*0($a_ptr), $a0 | ||
| 105 | mov 8*1($a_ptr), $a1 | ||
| 106 | add $a0, $a0 # a0:a3+a0:a3 | ||
| 107 | mov 8*2($a_ptr), $a2 | ||
| 108 | adc $a1, $a1 | ||
| 109 | mov 8*3($a_ptr), $a3 | ||
| 110 | lea .Lpoly(%rip), $a_ptr | ||
| 111 | mov $a0, $t0 | ||
| 112 | adc $a2, $a2 | ||
| 113 | adc $a3, $a3 | ||
| 114 | mov $a1, $t1 | ||
| 115 | sbb $t4, $t4 | ||
| 116 | |||
| 117 | sub 8*0($a_ptr), $a0 | ||
| 118 | mov $a2, $t2 | ||
| 119 | sbb 8*1($a_ptr), $a1 | ||
| 120 | sbb 8*2($a_ptr), $a2 | ||
| 121 | mov $a3, $t3 | ||
| 122 | sbb 8*3($a_ptr), $a3 | ||
| 123 | test $t4, $t4 | ||
| 124 | |||
| 125 | cmovz $t0, $a0 | ||
| 126 | cmovz $t1, $a1 | ||
| 127 | mov $a0, 8*0($r_ptr) | ||
| 128 | cmovz $t2, $a2 | ||
| 129 | mov $a1, 8*1($r_ptr) | ||
| 130 | cmovz $t3, $a3 | ||
| 131 | mov $a2, 8*2($r_ptr) | ||
| 132 | mov $a3, 8*3($r_ptr) | ||
| 133 | |||
| 134 | pop %r13 | ||
| 135 | pop %r12 | ||
| 136 | ret | ||
| 137 | .size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2 | ||
| 138 | |||
| 139 | ################################################################################ | ||
| 140 | # void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]); | ||
| 141 | .globl ecp_nistz256_neg | ||
| 142 | .type ecp_nistz256_neg,\@function,2 | ||
| 143 | .align 32 | ||
| 144 | ecp_nistz256_neg: | ||
| 145 | push %r12 | ||
| 146 | push %r13 | ||
| 147 | |||
| 148 | xor $a0, $a0 | ||
| 149 | xor $a1, $a1 | ||
| 150 | xor $a2, $a2 | ||
| 151 | xor $a3, $a3 | ||
| 152 | xor $t4, $t4 | ||
| 153 | |||
| 154 | sub 8*0($a_ptr), $a0 | ||
| 155 | sbb 8*1($a_ptr), $a1 | ||
| 156 | sbb 8*2($a_ptr), $a2 | ||
| 157 | mov $a0, $t0 | ||
| 158 | sbb 8*3($a_ptr), $a3 | ||
| 159 | lea .Lpoly(%rip), $a_ptr | ||
| 160 | mov $a1, $t1 | ||
| 161 | sbb \$0, $t4 | ||
| 162 | |||
| 163 | add 8*0($a_ptr), $a0 | ||
| 164 | mov $a2, $t2 | ||
| 165 | adc 8*1($a_ptr), $a1 | ||
| 166 | adc 8*2($a_ptr), $a2 | ||
| 167 | mov $a3, $t3 | ||
| 168 | adc 8*3($a_ptr), $a3 | ||
| 169 | test $t4, $t4 | ||
| 170 | |||
| 171 | cmovz $t0, $a0 | ||
| 172 | cmovz $t1, $a1 | ||
| 173 | mov $a0, 8*0($r_ptr) | ||
| 174 | cmovz $t2, $a2 | ||
| 175 | mov $a1, 8*1($r_ptr) | ||
| 176 | cmovz $t3, $a3 | ||
| 177 | mov $a2, 8*2($r_ptr) | ||
| 178 | mov $a3, 8*3($r_ptr) | ||
| 179 | |||
| 180 | pop %r13 | ||
| 181 | pop %r12 | ||
| 182 | ret | ||
| 183 | .size ecp_nistz256_neg,.-ecp_nistz256_neg | ||
| 184 | ___ | ||
| 185 | } | ||
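Both routines above share one pattern: perform the raw add/subtract, capture the final carry or borrow in $t4 with sbb $t4,$t4, run a trial reduction against .Lpoly, and let test/cmovz pick the right set of limbs with no data-dependent branch. A compact reference model of what they compute, with illustrative names and inputs assumed already reduced mod p:

    # Sketch only: what mul_by_2/neg compute, minus the limb-level carries.
    use strict;
    use warnings;
    use Math::BigInt;

    my $p = Math::BigInt->from_hex(
        '0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff');

    sub mul_by_2_ref {
        my $a = shift;                   # 0 <= a < p
        my $t = $a + $a;                 # the add/adc chain; carry -> $t4
        return $t >= $p ? $t - $p : $t;  # asm selects this via test/cmovz
    }

    sub neg_ref {
        my $a = shift;                   # 0 <= a < p
        # asm: 0 - a (borrow unless a == 0), then + p, then cmovz
        return $a->is_zero() ? Math::BigInt->new(0) : $p - $a;
    }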
| 186 | { | ||
| 187 | my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx"); | ||
| 188 | my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15)); | ||
| 189 | my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax"); | ||
| 190 | my ($poly1,$poly3)=($acc6,$acc7); | ||
| 191 | |||
| 192 | $code.=<<___; | ||
| 193 | ################################################################################ | ||
| 194 | # void ecp_nistz256_mul_mont( | ||
| 195 | # uint64_t res[4], | ||
| 196 | # uint64_t a[4], | ||
| 197 | # uint64_t b[4]); | ||
| 198 | |||
| 199 | .globl ecp_nistz256_mul_mont | ||
| 200 | .type ecp_nistz256_mul_mont,\@function,3 | ||
| 201 | .align 32 | ||
| 202 | ecp_nistz256_mul_mont: | ||
| 203 | .Lmul_mont: | ||
| 204 | push %rbp | ||
| 205 | push %rbx | ||
| 206 | push %r12 | ||
| 207 | push %r13 | ||
| 208 | push %r14 | ||
| 209 | push %r15 | ||
| 210 | |||
| 211 | mov $b_org, $b_ptr | ||
| 212 | mov 8*0($b_org), %rax | ||
| 213 | mov 8*0($a_ptr), $acc1 | ||
| 214 | mov 8*1($a_ptr), $acc2 | ||
| 215 | mov 8*2($a_ptr), $acc3 | ||
| 216 | mov 8*3($a_ptr), $acc4 | ||
| 217 | |||
| 218 | call __ecp_nistz256_mul_montq | ||
| 219 | |||
| 220 | pop %r15 | ||
| 221 | pop %r14 | ||
| 222 | pop %r13 | ||
| 223 | pop %r12 | ||
| 224 | pop %rbx | ||
| 225 | pop %rbp | ||
| 226 | ret | ||
| 227 | .size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont | ||
| 228 | |||
| 229 | .type __ecp_nistz256_mul_montq,\@abi-omnipotent | ||
| 230 | .align 32 | ||
| 231 | __ecp_nistz256_mul_montq: | ||
| 232 | ######################################################################## | ||
| 233 | # Multiply a by b[0] | ||
| 234 | mov %rax, $t1 | ||
| 235 | mulq $acc1 | ||
| 236 | mov .Lpoly+8*1(%rip),$poly1 | ||
| 237 | mov %rax, $acc0 | ||
| 238 | mov $t1, %rax | ||
| 239 | mov %rdx, $acc1 | ||
| 240 | |||
| 241 | mulq $acc2 | ||
| 242 | mov .Lpoly+8*3(%rip),$poly3 | ||
| 243 | add %rax, $acc1 | ||
| 244 | mov $t1, %rax | ||
| 245 | adc \$0, %rdx | ||
| 246 | mov %rdx, $acc2 | ||
| 247 | |||
| 248 | mulq $acc3 | ||
| 249 | add %rax, $acc2 | ||
| 250 | mov $t1, %rax | ||
| 251 | adc \$0, %rdx | ||
| 252 | mov %rdx, $acc3 | ||
| 253 | |||
| 254 | mulq $acc4 | ||
| 255 | add %rax, $acc3 | ||
| 256 | mov $acc0, %rax | ||
| 257 | adc \$0, %rdx | ||
| 258 | xor $acc5, $acc5 | ||
| 259 | mov %rdx, $acc4 | ||
| 260 | |||
| 261 | ######################################################################## | ||
| 262 | # First reduction step | ||
| 263 | # Basically we now want to multiply acc[0] by p256 | ||
| 264 | # and add the result to the accumulator. | ||
| 265 | # Due to the special form of p256 we can apply some optimizations: | ||
| 266 | # | ||
| 267 | # acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0] | ||
| 268 | # then we add acc[0] and get acc[0] x 2^96 | ||
| 269 | |||
| 270 | mov $acc0, $t1 | ||
| 271 | shl \$32, $acc0 | ||
| 272 | mulq $poly3 | ||
| 273 | shr \$32, $t1 | ||
| 274 | add $acc0, $acc1 # +=acc[0]<<96 | ||
| 275 | adc $t1, $acc2 | ||
| 276 | adc %rax, $acc3 | ||
| 277 | mov 8*1($b_ptr), %rax | ||
| 278 | adc %rdx, $acc4 | ||
| 279 | adc \$0, $acc5 | ||
| 280 | xor $acc0, $acc0 | ||
| 281 | |||
| 282 | ######################################################################## | ||
| 283 | # Multiply by b[1] | ||
| 284 | mov %rax, $t1 | ||
| 285 | mulq 8*0($a_ptr) | ||
| 286 | add %rax, $acc1 | ||
| 287 | mov $t1, %rax | ||
| 288 | adc \$0, %rdx | ||
| 289 | mov %rdx, $t0 | ||
| 290 | |||
| 291 | mulq 8*1($a_ptr) | ||
| 292 | add $t0, $acc2 | ||
| 293 | adc \$0, %rdx | ||
| 294 | add %rax, $acc2 | ||
| 295 | mov $t1, %rax | ||
| 296 | adc \$0, %rdx | ||
| 297 | mov %rdx, $t0 | ||
| 298 | |||
| 299 | mulq 8*2($a_ptr) | ||
| 300 | add $t0, $acc3 | ||
| 301 | adc \$0, %rdx | ||
| 302 | add %rax, $acc3 | ||
| 303 | mov $t1, %rax | ||
| 304 | adc \$0, %rdx | ||
| 305 | mov %rdx, $t0 | ||
| 306 | |||
| 307 | mulq 8*3($a_ptr) | ||
| 308 | add $t0, $acc4 | ||
| 309 | adc \$0, %rdx | ||
| 310 | add %rax, $acc4 | ||
| 311 | mov $acc1, %rax | ||
| 312 | adc %rdx, $acc5 | ||
| 313 | adc \$0, $acc0 | ||
| 314 | |||
| 315 | ######################################################################## | ||
| 316 | # Second reduction step | ||
| 317 | mov $acc1, $t1 | ||
| 318 | shl \$32, $acc1 | ||
| 319 | mulq $poly3 | ||
| 320 | shr \$32, $t1 | ||
| 321 | add $acc1, $acc2 | ||
| 322 | adc $t1, $acc3 | ||
| 323 | adc %rax, $acc4 | ||
| 324 | mov 8*2($b_ptr), %rax | ||
| 325 | adc %rdx, $acc5 | ||
| 326 | adc \$0, $acc0 | ||
| 327 | xor $acc1, $acc1 | ||
| 328 | |||
| 329 | ######################################################################## | ||
| 330 | # Multiply by b[2] | ||
| 331 | mov %rax, $t1 | ||
| 332 | mulq 8*0($a_ptr) | ||
| 333 | add %rax, $acc2 | ||
| 334 | mov $t1, %rax | ||
| 335 | adc \$0, %rdx | ||
| 336 | mov %rdx, $t0 | ||
| 337 | |||
| 338 | mulq 8*1($a_ptr) | ||
| 339 | add $t0, $acc3 | ||
| 340 | adc \$0, %rdx | ||
| 341 | add %rax, $acc3 | ||
| 342 | mov $t1, %rax | ||
| 343 | adc \$0, %rdx | ||
| 344 | mov %rdx, $t0 | ||
| 345 | |||
| 346 | mulq 8*2($a_ptr) | ||
| 347 | add $t0, $acc4 | ||
| 348 | adc \$0, %rdx | ||
| 349 | add %rax, $acc4 | ||
| 350 | mov $t1, %rax | ||
| 351 | adc \$0, %rdx | ||
| 352 | mov %rdx, $t0 | ||
| 353 | |||
| 354 | mulq 8*3($a_ptr) | ||
| 355 | add $t0, $acc5 | ||
| 356 | adc \$0, %rdx | ||
| 357 | add %rax, $acc5 | ||
| 358 | mov $acc2, %rax | ||
| 359 | adc %rdx, $acc0 | ||
| 360 | adc \$0, $acc1 | ||
| 361 | |||
| 362 | ######################################################################## | ||
| 363 | # Third reduction step | ||
| 364 | mov $acc2, $t1 | ||
| 365 | shl \$32, $acc2 | ||
| 366 | mulq $poly3 | ||
| 367 | shr \$32, $t1 | ||
| 368 | add $acc2, $acc3 | ||
| 369 | adc $t1, $acc4 | ||
| 370 | adc %rax, $acc5 | ||
| 371 | mov 8*3($b_ptr), %rax | ||
| 372 | adc %rdx, $acc0 | ||
| 373 | adc \$0, $acc1 | ||
| 374 | xor $acc2, $acc2 | ||
| 375 | |||
| 376 | ######################################################################## | ||
| 377 | # Multiply by b[3] | ||
| 378 | mov %rax, $t1 | ||
| 379 | mulq 8*0($a_ptr) | ||
| 380 | add %rax, $acc3 | ||
| 381 | mov $t1, %rax | ||
| 382 | adc \$0, %rdx | ||
| 383 | mov %rdx, $t0 | ||
| 384 | |||
| 385 | mulq 8*1($a_ptr) | ||
| 386 | add $t0, $acc4 | ||
| 387 | adc \$0, %rdx | ||
| 388 | add %rax, $acc4 | ||
| 389 | mov $t1, %rax | ||
| 390 | adc \$0, %rdx | ||
| 391 | mov %rdx, $t0 | ||
| 392 | |||
| 393 | mulq 8*2($a_ptr) | ||
| 394 | add $t0, $acc5 | ||
| 395 | adc \$0, %rdx | ||
| 396 | add %rax, $acc5 | ||
| 397 | mov $t1, %rax | ||
| 398 | adc \$0, %rdx | ||
| 399 | mov %rdx, $t0 | ||
| 400 | |||
| 401 | mulq 8*3($a_ptr) | ||
| 402 | add $t0, $acc0 | ||
| 403 | adc \$0, %rdx | ||
| 404 | add %rax, $acc0 | ||
| 405 | mov $acc3, %rax | ||
| 406 | adc %rdx, $acc1 | ||
| 407 | adc \$0, $acc2 | ||
| 408 | |||
| 409 | ######################################################################## | ||
| 410 | # Final reduction step | ||
| 411 | mov $acc3, $t1 | ||
| 412 | shl \$32, $acc3 | ||
| 413 | mulq $poly3 | ||
| 414 | shr \$32, $t1 | ||
| 415 | add $acc3, $acc4 | ||
| 416 | adc $t1, $acc5 | ||
| 417 | mov $acc4, $t0 | ||
| 418 | adc %rax, $acc0 | ||
| 419 | adc %rdx, $acc1 | ||
| 420 | mov $acc5, $t1 | ||
| 421 | adc \$0, $acc2 | ||
| 422 | |||
| 423 | ######################################################################## | ||
| 424 | # Branch-less conditional subtraction of P | ||
| 425 | sub \$-1, $acc4 # .Lpoly[0] | ||
| 426 | mov $acc0, $t2 | ||
| 427 | sbb $poly1, $acc5 # .Lpoly[1] | ||
| 428 | sbb \$0, $acc0 # .Lpoly[2] | ||
| 429 | mov $acc1, $t3 | ||
| 430 | sbb $poly3, $acc1 # .Lpoly[3] | ||
| 431 | sbb \$0, $acc2 | ||
| 432 | |||
| 433 | cmovc $t0, $acc4 | ||
| 434 | cmovc $t1, $acc5 | ||
| 435 | mov $acc4, 8*0($r_ptr) | ||
| 436 | cmovc $t2, $acc0 | ||
| 437 | mov $acc5, 8*1($r_ptr) | ||
| 438 | cmovc $t3, $acc1 | ||
| 439 | mov $acc0, 8*2($r_ptr) | ||
| 440 | mov $acc1, 8*3($r_ptr) | ||
| 441 | |||
| 442 | ret | ||
| 443 | .size __ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq | ||
| 444 | |||
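The per-step reduction above leans on the shape of p256. Writing the prime limb-wise, acc[0]*p256 splits into a low part covered by two shifts and a high part covered by one multiplication:

    acc[0] * p256 = acc[0]*(2^96 - 1) + acc[0]*0xffffffff00000001*2^192

    acc[0]*(2^96 - 1) + acc[0] = acc[0]*2^96
                               = (acc[0]<<32)*2^64 + (acc[0]>>32)*2^128

so the shl/shr pair lands in acc1/acc2, the single mulq by .Lpoly[3] lands in acc3/acc4 (plus carries), and the original acc[0] limb is consumed exactly, which is why it simply disappears from the accumulator.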
| 445 | ################################################################################ | ||
| 446 | # void ecp_nistz256_sqr_mont( | ||
| 447 | # uint64_t res[4], | ||
| 448 | # uint64_t a[4]); | ||
| 449 | |||
| 450 | # we optimize the square according to S.Gueron and V.Krasnov, | ||
| 451 | # "Speeding up Big-Number Squaring" | ||
| 452 | .globl ecp_nistz256_sqr_mont | ||
| 453 | .type ecp_nistz256_sqr_mont,\@function,2 | ||
| 454 | .align 32 | ||
| 455 | ecp_nistz256_sqr_mont: | ||
| 456 | push %rbp | ||
| 457 | push %rbx | ||
| 458 | push %r12 | ||
| 459 | push %r13 | ||
| 460 | push %r14 | ||
| 461 | push %r15 | ||
| 462 | |||
| 463 | mov 8*0($a_ptr), %rax | ||
| 464 | mov 8*1($a_ptr), $acc6 | ||
| 465 | mov 8*2($a_ptr), $acc7 | ||
| 466 | mov 8*3($a_ptr), $acc0 | ||
| 467 | |||
| 468 | call __ecp_nistz256_sqr_montq | ||
| 469 | |||
| 470 | pop %r15 | ||
| 471 | pop %r14 | ||
| 472 | pop %r13 | ||
| 473 | pop %r12 | ||
| 474 | pop %rbx | ||
| 475 | pop %rbp | ||
| 476 | ret | ||
| 477 | .size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont | ||
| 478 | |||
| 479 | .type __ecp_nistz256_sqr_montq,\@abi-omnipotent | ||
| 480 | .align 32 | ||
| 481 | __ecp_nistz256_sqr_montq: | ||
| 482 | mov %rax, $acc5 | ||
| 483 | mulq $acc6 # a[1]*a[0] | ||
| 484 | mov %rax, $acc1 | ||
| 485 | mov $acc7, %rax | ||
| 486 | mov %rdx, $acc2 | ||
| 487 | |||
| 488 | mulq $acc5 # a[0]*a[2] | ||
| 489 | add %rax, $acc2 | ||
| 490 | mov $acc0, %rax | ||
| 491 | adc \$0, %rdx | ||
| 492 | mov %rdx, $acc3 | ||
| 493 | |||
| 494 | mulq $acc5 # a[0]*a[3] | ||
| 495 | add %rax, $acc3 | ||
| 496 | mov $acc7, %rax | ||
| 497 | adc \$0, %rdx | ||
| 498 | mov %rdx, $acc4 | ||
| 499 | |||
| 500 | ################################# | ||
| 501 | mulq $acc6 # a[1]*a[2] | ||
| 502 | add %rax, $acc3 | ||
| 503 | mov $acc0, %rax | ||
| 504 | adc \$0, %rdx | ||
| 505 | mov %rdx, $t1 | ||
| 506 | |||
| 507 | mulq $acc6 # a[1]*a[3] | ||
| 508 | add %rax, $acc4 | ||
| 509 | mov $acc0, %rax | ||
| 510 | adc \$0, %rdx | ||
| 511 | add $t1, $acc4 | ||
| 512 | mov %rdx, $acc5 | ||
| 513 | adc \$0, $acc5 | ||
| 514 | |||
| 515 | ################################# | ||
| 516 | mulq $acc7 # a[2]*a[3] | ||
| 517 | xor $acc7, $acc7 | ||
| 518 | add %rax, $acc5 | ||
| 519 | mov 8*0($a_ptr), %rax | ||
| 520 | mov %rdx, $acc6 | ||
| 521 | adc \$0, $acc6 | ||
| 522 | |||
| 523 | add $acc1, $acc1 # acc1:6<<1 | ||
| 524 | adc $acc2, $acc2 | ||
| 525 | adc $acc3, $acc3 | ||
| 526 | adc $acc4, $acc4 | ||
| 527 | adc $acc5, $acc5 | ||
| 528 | adc $acc6, $acc6 | ||
| 529 | adc \$0, $acc7 | ||
| 530 | |||
| 531 | mulq %rax | ||
| 532 | mov %rax, $acc0 | ||
| 533 | mov 8*1($a_ptr), %rax | ||
| 534 | mov %rdx, $t0 | ||
| 535 | |||
| 536 | mulq %rax | ||
| 537 | add $t0, $acc1 | ||
| 538 | adc %rax, $acc2 | ||
| 539 | mov 8*2($a_ptr), %rax | ||
| 540 | adc \$0, %rdx | ||
| 541 | mov %rdx, $t0 | ||
| 542 | |||
| 543 | mulq %rax | ||
| 544 | add $t0, $acc3 | ||
| 545 | adc %rax, $acc4 | ||
| 546 | mov 8*3($a_ptr), %rax | ||
| 547 | adc \$0, %rdx | ||
| 548 | mov %rdx, $t0 | ||
| 549 | |||
| 550 | mulq %rax | ||
| 551 | add $t0, $acc5 | ||
| 552 | adc %rax, $acc6 | ||
| 553 | mov $acc0, %rax | ||
| 554 | adc %rdx, $acc7 | ||
| 555 | |||
| 556 | mov .Lpoly+8*1(%rip), $a_ptr | ||
| 557 | mov .Lpoly+8*3(%rip), $t1 | ||
| 558 | |||
| 559 | ########################################## | ||
| 560 | # Now the reduction | ||
| 561 | # First iteration | ||
| 562 | mov $acc0, $t0 | ||
| 563 | shl \$32, $acc0 | ||
| 564 | mulq $t1 | ||
| 565 | shr \$32, $t0 | ||
| 566 | add $acc0, $acc1 # +=acc[0]<<96 | ||
| 567 | adc $t0, $acc2 | ||
| 568 | adc %rax, $acc3 | ||
| 569 | mov $acc1, %rax | ||
| 570 | adc \$0, %rdx | ||
| 571 | |||
| 572 | ########################################## | ||
| 573 | # Second iteration | ||
| 574 | mov $acc1, $t0 | ||
| 575 | shl \$32, $acc1 | ||
| 576 | mov %rdx, $acc0 | ||
| 577 | mulq $t1 | ||
| 578 | shr \$32, $t0 | ||
| 579 | add $acc1, $acc2 | ||
| 580 | adc $t0, $acc3 | ||
| 581 | adc %rax, $acc0 | ||
| 582 | mov $acc2, %rax | ||
| 583 | adc \$0, %rdx | ||
| 584 | |||
| 585 | ########################################## | ||
| 586 | # Third iteration | ||
| 587 | mov $acc2, $t0 | ||
| 588 | shl \$32, $acc2 | ||
| 589 | mov %rdx, $acc1 | ||
| 590 | mulq $t1 | ||
| 591 | shr \$32, $t0 | ||
| 592 | add $acc2, $acc3 | ||
| 593 | adc $t0, $acc0 | ||
| 594 | adc %rax, $acc1 | ||
| 595 | mov $acc3, %rax | ||
| 596 | adc \$0, %rdx | ||
| 597 | |||
| 598 | ########################################### | ||
| 599 | # Last iteration | ||
| 600 | mov $acc3, $t0 | ||
| 601 | shl \$32, $acc3 | ||
| 602 | mov %rdx, $acc2 | ||
| 603 | mulq $t1 | ||
| 604 | shr \$32, $t0 | ||
| 605 | add $acc3, $acc0 | ||
| 606 | adc $t0, $acc1 | ||
| 607 | adc %rax, $acc2 | ||
| 608 | adc \$0, %rdx | ||
| 609 | xor $acc3, $acc3 | ||
| 610 | |||
| 611 | ############################################ | ||
| 612 | # Add the rest of the acc | ||
| 613 | add $acc0, $acc4 | ||
| 614 | adc $acc1, $acc5 | ||
| 615 | mov $acc4, $acc0 | ||
| 616 | adc $acc2, $acc6 | ||
| 617 | adc %rdx, $acc7 | ||
| 618 | mov $acc5, $acc1 | ||
| 619 | adc \$0, $acc3 | ||
| 620 | |||
| 621 | sub \$-1, $acc4 # .Lpoly[0] | ||
| 622 | mov $acc6, $acc2 | ||
| 623 | sbb $a_ptr, $acc5 # .Lpoly[1] | ||
| 624 | sbb \$0, $acc6 # .Lpoly[2] | ||
| 625 | mov $acc7, $t0 | ||
| 626 | sbb $t1, $acc7 # .Lpoly[3] | ||
| 627 | sbb \$0, $acc3 | ||
| 628 | |||
| 629 | cmovc $acc0, $acc4 | ||
| 630 | cmovc $acc1, $acc5 | ||
| 631 | mov $acc4, 8*0($r_ptr) | ||
| 632 | cmovc $acc2, $acc6 | ||
| 633 | mov $acc5, 8*1($r_ptr) | ||
| 634 | cmovc $t0, $acc7 | ||
| 635 | mov $acc6, 8*2($r_ptr) | ||
| 636 | mov $acc7, 8*3($r_ptr) | ||
| 637 | |||
| 638 | ret | ||
| 639 | .size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq | ||
| 640 | ___ | ||
| 641 | |||
| 642 | } | ||
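The squaring above follows the Gueron-Krasnov shape: each off-diagonal product a[i]*a[j] (i < j) is computed once, the whole partial sum is doubled with a single add/adc chain, and the diagonal squares a[i]^2 are folded in afterwards. A language-level sketch of that shape (limb carries elided; the asm interleaves them):

    # Sketch only: squaring via single cross-products, one doubling pass,
    # then the diagonal. Carry propagation between limbs is left out.
    use strict;
    use warnings;

    sub sqr_shape {
        my @a = @_;                        # limbs, little-endian
        my $n = scalar @a;
        my @acc = (0) x (2 * $n);
        for my $i (0 .. $n - 2) {          # each a[i]*a[j] computed once
            for my $j ($i + 1 .. $n - 1) {
                $acc[$i + $j] += $a[$i] * $a[$j];
            }
        }
        $_ *= 2 for @acc;                  # the adc-chain doubling
        for my $i (0 .. $n - 1) {          # diagonal squares
            $acc[2 * $i] += $a[$i] * $a[$i];
        }
        return @acc;                       # sum(acc[k]*B^k) == (sum a[i]*B^i)^2
    }

For small limb values the invariant in the final comment holds for any base B, independent of carries, since it is just the polynomial square.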
| 643 | { | ||
| 644 | my ($r_ptr,$in_ptr)=("%rdi","%rsi"); | ||
| 645 | my ($acc0,$acc1,$acc2,$acc3)=map("%r$_",(8..11)); | ||
| 646 | my ($t0,$t1,$t2)=("%rcx","%r12","%r13"); | ||
| 647 | |||
| 648 | $code.=<<___; | ||
| 649 | ################################################################################ | ||
| 650 | # void ecp_nistz256_from_mont( | ||
| 651 | # uint64_t res[4], | ||
| 652 | # uint64_t in[4]); | ||
| 653 | # This one performs Montgomery multiplication by 1, so we only need the reduction | ||
| 654 | |||
| 655 | .globl ecp_nistz256_from_mont | ||
| 656 | .type ecp_nistz256_from_mont,\@function,2 | ||
| 657 | .align 32 | ||
| 658 | ecp_nistz256_from_mont: | ||
| 659 | push %r12 | ||
| 660 | push %r13 | ||
| 661 | |||
| 662 | mov 8*0($in_ptr), %rax | ||
| 663 | mov .Lpoly+8*3(%rip), $t2 | ||
| 664 | mov 8*1($in_ptr), $acc1 | ||
| 665 | mov 8*2($in_ptr), $acc2 | ||
| 666 | mov 8*3($in_ptr), $acc3 | ||
| 667 | mov %rax, $acc0 | ||
| 668 | mov .Lpoly+8*1(%rip), $t1 | ||
| 669 | |||
| 670 | ######################################### | ||
| 671 | # First iteration | ||
| 672 | mov %rax, $t0 | ||
| 673 | shl \$32, $acc0 | ||
| 674 | mulq $t2 | ||
| 675 | shr \$32, $t0 | ||
| 676 | add $acc0, $acc1 | ||
| 677 | adc $t0, $acc2 | ||
| 678 | adc %rax, $acc3 | ||
| 679 | mov $acc1, %rax | ||
| 680 | adc \$0, %rdx | ||
| 681 | |||
| 682 | ######################################### | ||
| 683 | # Second iteration | ||
| 684 | mov $acc1, $t0 | ||
| 685 | shl \$32, $acc1 | ||
| 686 | mov %rdx, $acc0 | ||
| 687 | mulq $t2 | ||
| 688 | shr \$32, $t0 | ||
| 689 | add $acc1, $acc2 | ||
| 690 | adc $t0, $acc3 | ||
| 691 | adc %rax, $acc0 | ||
| 692 | mov $acc2, %rax | ||
| 693 | adc \$0, %rdx | ||
| 694 | |||
| 695 | ########################################## | ||
| 696 | # Third iteration | ||
| 697 | mov $acc2, $t0 | ||
| 698 | shl \$32, $acc2 | ||
| 699 | mov %rdx, $acc1 | ||
| 700 | mulq $t2 | ||
| 701 | shr \$32, $t0 | ||
| 702 | add $acc2, $acc3 | ||
| 703 | adc $t0, $acc0 | ||
| 704 | adc %rax, $acc1 | ||
| 705 | mov $acc3, %rax | ||
| 706 | adc \$0, %rdx | ||
| 707 | |||
| 708 | ########################################### | ||
| 709 | # Last iteration | ||
| 710 | mov $acc3, $t0 | ||
| 711 | shl \$32, $acc3 | ||
| 712 | mov %rdx, $acc2 | ||
| 713 | mulq $t2 | ||
| 714 | shr \$32, $t0 | ||
| 715 | add $acc3, $acc0 | ||
| 716 | adc $t0, $acc1 | ||
| 717 | mov $acc0, $t0 | ||
| 718 | adc %rax, $acc2 | ||
| 719 | mov $acc1, $in_ptr | ||
| 720 | adc \$0, %rdx | ||
| 721 | |||
| 722 | ########################################### | ||
| 723 | # Branch-less conditional subtraction | ||
| 724 | sub \$-1, $acc0 | ||
| 725 | mov $acc2, %rax | ||
| 726 | sbb $t1, $acc1 | ||
| 727 | sbb \$0, $acc2 | ||
| 728 | mov %rdx, $acc3 | ||
| 729 | sbb $t2, %rdx | ||
| 730 | sbb $t2, $t2 | ||
| 731 | |||
| 732 | cmovnz $t0, $acc0 | ||
| 733 | cmovnz $in_ptr, $acc1 | ||
| 734 | mov $acc0, 8*0($r_ptr) | ||
| 735 | cmovnz %rax, $acc2 | ||
| 736 | mov $acc1, 8*1($r_ptr) | ||
| 737 | cmovz %rdx, $acc3 | ||
| 738 | mov $acc2, 8*2($r_ptr) | ||
| 739 | mov $acc3, 8*3($r_ptr) | ||
| 740 | |||
| 741 | pop %r13 | ||
| 742 | pop %r12 | ||
| 743 | ret | ||
| 744 | .size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont | ||
| 745 | ___ | ||
| 746 | } | ||
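Since from_mont is a Montgomery multiplication by 1, it returns in*R^-1 mod p with R = 2^256, undoing the factor introduced by the to-Montgomery conversion. A tiny Math::BigInt sanity check of that semantic, with illustrative names:

    # Sketch only: from_mont(x*R mod p) == x, with R = 2^256.
    use strict;
    use warnings;
    use Math::BigInt;

    my $p = Math::BigInt->from_hex(
        '0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff');
    my $R      = Math::BigInt->new(2)->bpow(256);
    my $x      = Math::BigInt->new(123456789);
    my $x_mont = ($x * $R) % $p;                 # to Montgomery form
    my $r_inv  = $R->copy()->bmodinv($p);        # R^-1 mod p
    print((($x_mont * $r_inv) % $p) == $x ? "ok\n" : "bad\n");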
| 747 | { | ||
| 748 | my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx"); | ||
| 749 | my ($ONE,$INDEX,$Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("%xmm$_",(0..7)); | ||
| 750 | my ($M0,$T0a,$T0b,$T0c,$T0d,$T0e,$T0f,$TMP0)=map("%xmm$_",(8..15)); | ||
| 751 | my ($M1,$T1a,$T1b,$TMP1,$M2,$T2a,$T2b,$TMP2)=map("%xmm$_",(8..15)); | ||
| 752 | |||
| 753 | $code.=<<___; | ||
| 754 | ################################################################################ | ||
| 755 | # void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index); | ||
| 756 | .globl ecp_nistz256_select_w5 | ||
| 757 | .type ecp_nistz256_select_w5,\@abi-omnipotent | ||
| 758 | .align 32 | ||
| 759 | ecp_nistz256_select_w5: | ||
| 760 | ___ | ||
| 761 | $code.=<<___ if ($win64); | ||
| 762 | lea -0x88(%rsp), %rax | ||
| 763 | .LSEH_begin_ecp_nistz256_select_w5: | ||
| 764 | .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp | ||
| 765 | .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax) | ||
| 766 | .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax) | ||
| 767 | .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8, 0(%rax) | ||
| 768 | .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9, 0x10(%rax) | ||
| 769 | .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10, 0x20(%rax) | ||
| 770 | .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11, 0x30(%rax) | ||
| 771 | .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12, 0x40(%rax) | ||
| 772 | .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax) | ||
| 773 | .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax) | ||
| 774 | .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax) | ||
| 775 | ___ | ||
| 776 | $code.=<<___; | ||
| 777 | movdqa .LOne(%rip), $ONE | ||
| 778 | movd $index, $INDEX | ||
| 779 | |||
| 780 | pxor $Ra, $Ra | ||
| 781 | pxor $Rb, $Rb | ||
| 782 | pxor $Rc, $Rc | ||
| 783 | pxor $Rd, $Rd | ||
| 784 | pxor $Re, $Re | ||
| 785 | pxor $Rf, $Rf | ||
| 786 | |||
| 787 | movdqa $ONE, $M0 | ||
| 788 | pshufd \$0, $INDEX, $INDEX | ||
| 789 | |||
| 790 | mov \$16, %rax | ||
| 791 | .Lselect_loop_sse_w5: | ||
| 792 | |||
| 793 | movdqa $M0, $TMP0 | ||
| 794 | paddd $ONE, $M0 | ||
| 795 | pcmpeqd $INDEX, $TMP0 | ||
| 796 | |||
| 797 | movdqa 16*0($in_t), $T0a | ||
| 798 | movdqa 16*1($in_t), $T0b | ||
| 799 | movdqa 16*2($in_t), $T0c | ||
| 800 | movdqa 16*3($in_t), $T0d | ||
| 801 | movdqa 16*4($in_t), $T0e | ||
| 802 | movdqa 16*5($in_t), $T0f | ||
| 803 | lea 16*6($in_t), $in_t | ||
| 804 | |||
| 805 | pand $TMP0, $T0a | ||
| 806 | pand $TMP0, $T0b | ||
| 807 | por $T0a, $Ra | ||
| 808 | pand $TMP0, $T0c | ||
| 809 | por $T0b, $Rb | ||
| 810 | pand $TMP0, $T0d | ||
| 811 | por $T0c, $Rc | ||
| 812 | pand $TMP0, $T0e | ||
| 813 | por $T0d, $Rd | ||
| 814 | pand $TMP0, $T0f | ||
| 815 | por $T0e, $Re | ||
| 816 | por $T0f, $Rf | ||
| 817 | |||
| 818 | dec %rax | ||
| 819 | jnz .Lselect_loop_sse_w5 | ||
| 820 | |||
| 821 | movdqu $Ra, 16*0($val) | ||
| 822 | movdqu $Rb, 16*1($val) | ||
| 823 | movdqu $Rc, 16*2($val) | ||
| 824 | movdqu $Rd, 16*3($val) | ||
| 825 | movdqu $Re, 16*4($val) | ||
| 826 | movdqu $Rf, 16*5($val) | ||
| 827 | ___ | ||
| 828 | $code.=<<___ if ($win64); | ||
| 829 | movaps (%rsp), %xmm6 | ||
| 830 | movaps 0x10(%rsp), %xmm7 | ||
| 831 | movaps 0x20(%rsp), %xmm8 | ||
| 832 | movaps 0x30(%rsp), %xmm9 | ||
| 833 | movaps 0x40(%rsp), %xmm10 | ||
| 834 | movaps 0x50(%rsp), %xmm11 | ||
| 835 | movaps 0x60(%rsp), %xmm12 | ||
| 836 | movaps 0x70(%rsp), %xmm13 | ||
| 837 | movaps 0x80(%rsp), %xmm14 | ||
| 838 | movaps 0x90(%rsp), %xmm15 | ||
| 839 | lea 0xa8(%rsp), %rsp | ||
| 840 | .LSEH_end_ecp_nistz256_select_w5: | ||
| 841 | ___ | ||
| 842 | $code.=<<___; | ||
| 843 | ret | ||
| 844 | .size ecp_nistz256_select_w5,.-ecp_nistz256_select_w5 | ||
| 845 | |||
| 846 | ################################################################################ | ||
| 847 | # void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index); | ||
| 848 | .globl ecp_nistz256_select_w7 | ||
| 849 | .type ecp_nistz256_select_w7,\@abi-omnipotent | ||
| 850 | .align 32 | ||
| 851 | ecp_nistz256_select_w7: | ||
| 852 | ___ | ||
| 853 | $code.=<<___ if ($win64); | ||
| 854 | lea -0x88(%rsp), %rax | ||
| 855 | .LSEH_begin_ecp_nistz256_select_w7: | ||
| 856 | .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp | ||
| 857 | .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax) | ||
| 858 | .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax) | ||
| 859 | .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8, 0(%rax) | ||
| 860 | .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9, 0x10(%rax) | ||
| 861 | .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10, 0x20(%rax) | ||
| 862 | .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11, 0x30(%rax) | ||
| 863 | .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12, 0x40(%rax) | ||
| 864 | .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax) | ||
| 865 | .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax) | ||
| 866 | .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax) | ||
| 867 | ___ | ||
| 868 | $code.=<<___; | ||
| 869 | movdqa .LOne(%rip), $M0 | ||
| 870 | movd $index, $INDEX | ||
| 871 | |||
| 872 | pxor $Ra, $Ra | ||
| 873 | pxor $Rb, $Rb | ||
| 874 | pxor $Rc, $Rc | ||
| 875 | pxor $Rd, $Rd | ||
| 876 | |||
| 877 | movdqa $M0, $ONE | ||
| 878 | pshufd \$0, $INDEX, $INDEX | ||
| 879 | mov \$64, %rax | ||
| 880 | |||
| 881 | .Lselect_loop_sse_w7: | ||
| 882 | movdqa $M0, $TMP0 | ||
| 883 | paddd $ONE, $M0 | ||
| 884 | movdqa 16*0($in_t), $T0a | ||
| 885 | movdqa 16*1($in_t), $T0b | ||
| 886 | pcmpeqd $INDEX, $TMP0 | ||
| 887 | movdqa 16*2($in_t), $T0c | ||
| 888 | movdqa 16*3($in_t), $T0d | ||
| 889 | lea 16*4($in_t), $in_t | ||
| 890 | |||
| 891 | pand $TMP0, $T0a | ||
| 892 | pand $TMP0, $T0b | ||
| 893 | por $T0a, $Ra | ||
| 894 | pand $TMP0, $T0c | ||
| 895 | por $T0b, $Rb | ||
| 896 | pand $TMP0, $T0d | ||
| 897 | por $T0c, $Rc | ||
| 898 | prefetcht0 255($in_t) | ||
| 899 | por $T0d, $Rd | ||
| 900 | |||
| 901 | dec %rax | ||
| 902 | jnz .Lselect_loop_sse_w7 | ||
| 903 | |||
| 904 | movdqu $Ra, 16*0($val) | ||
| 905 | movdqu $Rb, 16*1($val) | ||
| 906 | movdqu $Rc, 16*2($val) | ||
| 907 | movdqu $Rd, 16*3($val) | ||
| 908 | ___ | ||
| 909 | $code.=<<___ if ($win64); | ||
| 910 | movaps (%rsp), %xmm6 | ||
| 911 | movaps 0x10(%rsp), %xmm7 | ||
| 912 | movaps 0x20(%rsp), %xmm8 | ||
| 913 | movaps 0x30(%rsp), %xmm9 | ||
| 914 | movaps 0x40(%rsp), %xmm10 | ||
| 915 | movaps 0x50(%rsp), %xmm11 | ||
| 916 | movaps 0x60(%rsp), %xmm12 | ||
| 917 | movaps 0x70(%rsp), %xmm13 | ||
| 918 | movaps 0x80(%rsp), %xmm14 | ||
| 919 | movaps 0x90(%rsp), %xmm15 | ||
| 920 | lea 0xa8(%rsp), %rsp | ||
| 921 | .LSEH_end_ecp_nistz256_select_w7: | ||
| 922 | ___ | ||
| 923 | $code.=<<___; | ||
| 924 | ret | ||
| 925 | .size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7 | ||
| 926 | ___ | ||
| 927 | } | ||
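Both select routines above read the entire table regardless of index: pcmpeqd builds a 0/all-ones lane mask per entry, pand/por accumulate the single matching entry, and index 0 (used for the point at infinity) matches nothing and yields all-zero output, so the memory access pattern is independent of the secret index. A scalar model of the same shape, names illustrative:

    # Sketch only: constant-time table lookup as in select_w5/select_w7.
    use strict;
    use warnings;

    sub select_ct {
        my ($table, $index) = @_;          # $table: ref of refs of 32-bit words
        my @out = (0) x scalar @{ $table->[0] };
        for my $i (0 .. $#$table) {        # always walks the whole table
            # the real code builds this mask branchlessly with pcmpeqd
            my $mask = (($i + 1) == $index) ? 0xffffffff : 0;
            for my $k (0 .. $#out) {
                $out[$k] |= $table->[$i][$k] & $mask;
            }
        }
        return @out;                       # all zero when $index == 0
    }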
| 928 | {{{ | ||
| 929 | ######################################################################## | ||
| 930 | # This block implements the higher-level point_double, point_add and | ||
| 931 | # point_add_affine. The key to performance in this case is to allow | ||
| 932 | # out-of-order execution logic to overlap computations from the next | ||
| 933 | # step with tail processing from the current step. By using a tailored | ||
| 934 | # calling sequence we minimize inter-step overhead and give the | ||
| 935 | # processor a better shot at overlapping operations... | ||
| 936 | # | ||
| 937 | # You will notice that input data is copied to the stack. Trouble is | ||
| 938 | # that there are no registers to spare for holding the original | ||
| 939 | # pointers, and reloading them would create undesired dependencies on | ||
| 940 | # the effective-address calculation paths. In other words, it's done | ||
| 941 | # this way to favour out-of-order execution logic. | ||
| 942 | # <appro@openssl.org> | ||
| 943 | |||
| 944 | my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx"); | ||
| 945 | my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15)); | ||
| 946 | my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rbp","%rcx",$acc4,$acc4); | ||
| 947 | my ($poly1,$poly3)=($acc6,$acc7); | ||
| 948 | |||
| 949 | sub load_for_mul () { | ||
| 950 | my ($a,$b,$src0) = @_; | ||
| 951 | my $bias = $src0 eq "%rax" ? 0 : -128; | ||
| 952 | |||
| 953 | " mov $b, $src0 | ||
| 954 | lea $b, $b_ptr | ||
| 955 | mov 8*0+$a, $acc1 | ||
| 956 | mov 8*1+$a, $acc2 | ||
| 957 | lea $bias+$a, $a_ptr | ||
| 958 | mov 8*2+$a, $acc3 | ||
| 959 | mov 8*3+$a, $acc4" | ||
| 960 | } | ||
| 961 | |||
| 962 | sub load_for_sqr () { | ||
| 963 | my ($a,$src0) = @_; | ||
| 964 | my $bias = $src0 eq "%rax" ? 0 : -128; | ||
| 965 | |||
| 966 | " mov 8*0+$a, $src0 | ||
| 967 | mov 8*1+$a, $acc6 | ||
| 968 | lea $bias+$a, $a_ptr | ||
| 969 | mov 8*2+$a, $acc7 | ||
| 970 | mov 8*3+$a, $acc0" | ||
| 971 | } | ||
| 972 | |||
| 973 | { | ||
| 974 | ######################################################################## | ||
| 975 | # operate in 4-5-0-1 "name space" that matches multiplication output | ||
| 976 | # | ||
| 977 | my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); | ||
| 978 | |||
| 979 | $code.=<<___; | ||
| 980 | .type __ecp_nistz256_add_toq,\@abi-omnipotent | ||
| 981 | .align 32 | ||
| 982 | __ecp_nistz256_add_toq: | ||
| 983 | add 8*0($b_ptr), $a0 | ||
| 984 | adc 8*1($b_ptr), $a1 | ||
| 985 | mov $a0, $t0 | ||
| 986 | adc 8*2($b_ptr), $a2 | ||
| 987 | adc 8*3($b_ptr), $a3 | ||
| 988 | mov $a1, $t1 | ||
| 989 | sbb $t4, $t4 | ||
| 990 | |||
| 991 | sub \$-1, $a0 | ||
| 992 | mov $a2, $t2 | ||
| 993 | sbb $poly1, $a1 | ||
| 994 | sbb \$0, $a2 | ||
| 995 | mov $a3, $t3 | ||
| 996 | sbb $poly3, $a3 | ||
| 997 | test $t4, $t4 | ||
| 998 | |||
| 999 | cmovz $t0, $a0 | ||
| 1000 | cmovz $t1, $a1 | ||
| 1001 | mov $a0, 8*0($r_ptr) | ||
| 1002 | cmovz $t2, $a2 | ||
| 1003 | mov $a1, 8*1($r_ptr) | ||
| 1004 | cmovz $t3, $a3 | ||
| 1005 | mov $a2, 8*2($r_ptr) | ||
| 1006 | mov $a3, 8*3($r_ptr) | ||
| 1007 | |||
| 1008 | ret | ||
| 1009 | .size __ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq | ||
| 1010 | |||
| 1011 | .type __ecp_nistz256_sub_fromq,\@abi-omnipotent | ||
| 1012 | .align 32 | ||
| 1013 | __ecp_nistz256_sub_fromq: | ||
| 1014 | sub 8*0($b_ptr), $a0 | ||
| 1015 | sbb 8*1($b_ptr), $a1 | ||
| 1016 | mov $a0, $t0 | ||
| 1017 | sbb 8*2($b_ptr), $a2 | ||
| 1018 | sbb 8*3($b_ptr), $a3 | ||
| 1019 | mov $a1, $t1 | ||
| 1020 | sbb $t4, $t4 | ||
| 1021 | |||
| 1022 | add \$-1, $a0 | ||
| 1023 | mov $a2, $t2 | ||
| 1024 | adc $poly1, $a1 | ||
| 1025 | adc \$0, $a2 | ||
| 1026 | mov $a3, $t3 | ||
| 1027 | adc $poly3, $a3 | ||
| 1028 | test $t4, $t4 | ||
| 1029 | |||
| 1030 | cmovz $t0, $a0 | ||
| 1031 | cmovz $t1, $a1 | ||
| 1032 | mov $a0, 8*0($r_ptr) | ||
| 1033 | cmovz $t2, $a2 | ||
| 1034 | mov $a1, 8*1($r_ptr) | ||
| 1035 | cmovz $t3, $a3 | ||
| 1036 | mov $a2, 8*2($r_ptr) | ||
| 1037 | mov $a3, 8*3($r_ptr) | ||
| 1038 | |||
| 1039 | ret | ||
| 1040 | .size __ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq | ||
| 1041 | |||
| 1042 | .type __ecp_nistz256_subq,\@abi-omnipotent | ||
| 1043 | .align 32 | ||
| 1044 | __ecp_nistz256_subq: | ||
| 1045 | sub $a0, $t0 | ||
| 1046 | sbb $a1, $t1 | ||
| 1047 | mov $t0, $a0 | ||
| 1048 | sbb $a2, $t2 | ||
| 1049 | sbb $a3, $t3 | ||
| 1050 | mov $t1, $a1 | ||
| 1051 | sbb $t4, $t4 | ||
| 1052 | |||
| 1053 | add \$-1, $t0 | ||
| 1054 | mov $t2, $a2 | ||
| 1055 | adc $poly1, $t1 | ||
| 1056 | adc \$0, $t2 | ||
| 1057 | mov $t3, $a3 | ||
| 1058 | adc $poly3, $t3 | ||
| 1059 | test $t4, $t4 | ||
| 1060 | |||
| 1061 | cmovnz $t0, $a0 | ||
| 1062 | cmovnz $t1, $a1 | ||
| 1063 | cmovnz $t2, $a2 | ||
| 1064 | cmovnz $t3, $a3 | ||
| 1065 | |||
| 1066 | ret | ||
| 1067 | .size __ecp_nistz256_subq,.-__ecp_nistz256_subq | ||
| 1068 | |||
| 1069 | .type __ecp_nistz256_mul_by_2q,\@abi-omnipotent | ||
| 1070 | .align 32 | ||
| 1071 | __ecp_nistz256_mul_by_2q: | ||
| 1072 | add $a0, $a0 # a0:a3+a0:a3 | ||
| 1073 | adc $a1, $a1 | ||
| 1074 | mov $a0, $t0 | ||
| 1075 | adc $a2, $a2 | ||
| 1076 | adc $a3, $a3 | ||
| 1077 | mov $a1, $t1 | ||
| 1078 | sbb $t4, $t4 | ||
| 1079 | |||
| 1080 | sub \$-1, $a0 | ||
| 1081 | mov $a2, $t2 | ||
| 1082 | sbb $poly1, $a1 | ||
| 1083 | sbb \$0, $a2 | ||
| 1084 | mov $a3, $t3 | ||
| 1085 | sbb $poly3, $a3 | ||
| 1086 | test $t4, $t4 | ||
| 1087 | |||
| 1088 | cmovz $t0, $a0 | ||
| 1089 | cmovz $t1, $a1 | ||
| 1090 | mov $a0, 8*0($r_ptr) | ||
| 1091 | cmovz $t2, $a2 | ||
| 1092 | mov $a1, 8*1($r_ptr) | ||
| 1093 | cmovz $t3, $a3 | ||
| 1094 | mov $a2, 8*2($r_ptr) | ||
| 1095 | mov $a3, 8*3($r_ptr) | ||
| 1096 | |||
| 1097 | ret | ||
| 1098 | .size __ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q | ||
| 1099 | ___ | ||
| 1100 | } | ||
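For readers who prefer C, here is a rough model of the reduction pattern
shared by the four helpers above. It is an illustration only: the names
(P256, p256_add_model, p256_sub_model) are invented, unsigned __int128
stands in for the adc/sbb chains, and the ternary selects are not
constant-time the way the cmov chains are. __ecp_nistz256_mul_by_2q is
the add model with a == b, and __ecp_nistz256_subq is the sub model with
its result left in registers rather than stored.

    #include <stdint.h>

    /* p = 2^256 - 2^224 + 2^192 + 2^96 - 1, little-endian 64-bit words */
    static const uint64_t P256[4] = {
            0xffffffffffffffffULL, 0x00000000ffffffffULL,
            0x0000000000000000ULL, 0xffffffff00000001ULL,
    };

    /* __ecp_nistz256_add_toq: r = a + b, keeping the p-subtracted copy
     * only when the addition carried out of 2^256 */
    static void p256_add_model(uint64_t r[4], const uint64_t a[4],
        const uint64_t b[4])
    {
            uint64_t sum[4], red[4];
            unsigned __int128 acc = 0, brw = 0;

            for (int i = 0; i < 4; i++) {   /* the add/adc chain */
                    acc += (unsigned __int128)a[i] + b[i];
                    sum[i] = (uint64_t)acc;
                    acc >>= 64;
            }
            /* "sbb $t4,$t4" materializes the carry as a 0/-1 mask */
            uint64_t carry = (uint64_t)acc;

            for (int i = 0; i < 4; i++) {   /* trial subtraction of p */
                    unsigned __int128 d =
                        (unsigned __int128)sum[i] - P256[i] - (uint64_t)brw;
                    red[i] = (uint64_t)d;
                    brw = (d >> 64) & 1;
            }
            for (int i = 0; i < 4; i++)     /* the cmovz chain */
                    r[i] = carry ? red[i] : sum[i];
    }

    /* __ecp_nistz256_sub_fromq: r = a - b, adding p back only when the
     * subtraction borrowed */
    static void p256_sub_model(uint64_t r[4], const uint64_t a[4],
        const uint64_t b[4])
    {
            uint64_t dif[4], fix[4];
            unsigned __int128 acc = 0, brw = 0;

            for (int i = 0; i < 4; i++) {   /* the sub/sbb chain */
                    unsigned __int128 d =
                        (unsigned __int128)a[i] - b[i] - (uint64_t)brw;
                    dif[i] = (uint64_t)d;
                    brw = (d >> 64) & 1;
            }
            uint64_t borrow = (uint64_t)brw;

            for (int i = 0; i < 4; i++) {   /* add p back */
                    acc += (unsigned __int128)dif[i] + P256[i];
                    fix[i] = (uint64_t)acc;
                    acc >>= 64;
            }
            for (int i = 0; i < 4; i++)     /* the cmovz chain */
                    r[i] = borrow ? fix[i] : dif[i];
    }

Note that when a + b lands in [p, 2^256) without carrying out, both the
asm and this model leave the value unreduced; the surrounding Montgomery
arithmetic evidently tolerates such representatives.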
| 1101 | sub gen_double () { | ||
| 1102 | my $x = shift; | ||
| 1103 | my ($src0,$sfx,$bias); | ||
| 1104 | my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4)); | ||
| 1105 | |||
| 1106 | if ($x ne "x") { | ||
| 1107 | $src0 = "%rax"; | ||
| 1108 | $sfx = ""; | ||
| 1109 | $bias = 0; | ||
| 1110 | |||
| 1111 | $code.=<<___; | ||
| 1112 | .globl ecp_nistz256_point_double | ||
| 1113 | .type ecp_nistz256_point_double,\@function,2 | ||
| 1114 | .align 32 | ||
| 1115 | ecp_nistz256_point_double: | ||
| 1116 | ___ | ||
| 1117 | } else { | ||
| 1118 | $src0 = "%rdx"; | ||
| 1119 | $sfx = "x"; | ||
| 1120 | $bias = 128; | ||
| 1121 | |||
| 1122 | $code.=<<___; | ||
| 1123 | .type ecp_nistz256_point_doublex,\@function,2 | ||
| 1124 | .align 32 | ||
| 1125 | ecp_nistz256_point_doublex: | ||
| 1126 | .Lpoint_doublex: | ||
| 1127 | ___ | ||
| 1128 | } | ||
| 1129 | $code.=<<___; | ||
| 1130 | push %rbp | ||
| 1131 | push %rbx | ||
| 1132 | push %r12 | ||
| 1133 | push %r13 | ||
| 1134 | push %r14 | ||
| 1135 | push %r15 | ||
| 1136 | sub \$32*5+8, %rsp | ||
| 1137 | |||
| 1138 | .Lpoint_double_shortcut$x: | ||
| 1139 | movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr.x | ||
| 1140 | mov $a_ptr, $b_ptr # backup copy | ||
| 1141 | movdqu 0x10($a_ptr), %xmm1 | ||
| 1142 | mov 0x20+8*0($a_ptr), $acc4 # load in_y in "4-5-0-1" order | ||
| 1143 | mov 0x20+8*1($a_ptr), $acc5 | ||
| 1144 | mov 0x20+8*2($a_ptr), $acc0 | ||
| 1145 | mov 0x20+8*3($a_ptr), $acc1 | ||
| 1146 | mov .Lpoly+8*1(%rip), $poly1 | ||
| 1147 | mov .Lpoly+8*3(%rip), $poly3 | ||
| 1148 | movdqa %xmm0, $in_x(%rsp) | ||
| 1149 | movdqa %xmm1, $in_x+0x10(%rsp) | ||
| 1150 | lea 0x20($r_ptr), $acc2 | ||
| 1151 | lea 0x40($r_ptr), $acc3 | ||
| 1152 | movq $r_ptr, %xmm0 | ||
| 1153 | movq $acc2, %xmm1 | ||
| 1154 | movq $acc3, %xmm2 | ||
| 1155 | |||
| 1156 | lea $S(%rsp), $r_ptr | ||
| 1157 | call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(S, in_y); | ||
| 1158 | |||
| 1159 | mov 0x40+8*0($a_ptr), $src0 | ||
| 1160 | mov 0x40+8*1($a_ptr), $acc6 | ||
| 1161 | mov 0x40+8*2($a_ptr), $acc7 | ||
| 1162 | mov 0x40+8*3($a_ptr), $acc0 | ||
| 1163 | lea 0x40-$bias($a_ptr), $a_ptr | ||
| 1164 | lea $Zsqr(%rsp), $r_ptr | ||
| 1165 | call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Zsqr, in_z); | ||
| 1166 | |||
| 1167 | `&load_for_sqr("$S(%rsp)", "$src0")` | ||
| 1168 | lea $S(%rsp), $r_ptr | ||
| 1169 | call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(S, S); | ||
| 1170 | |||
| 1171 | mov 0x20($b_ptr), $src0 # $b_ptr is still valid | ||
| 1172 | mov 0x40+8*0($b_ptr), $acc1 | ||
| 1173 | mov 0x40+8*1($b_ptr), $acc2 | ||
| 1174 | mov 0x40+8*2($b_ptr), $acc3 | ||
| 1175 | mov 0x40+8*3($b_ptr), $acc4 | ||
| 1176 | lea 0x40-$bias($b_ptr), $a_ptr | ||
| 1177 | lea 0x20($b_ptr), $b_ptr | ||
| 1178 | movq %xmm2, $r_ptr | ||
| 1179 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, in_z, in_y); | ||
| 1180 | call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(res_z, res_z); | ||
| 1181 | |||
| 1182 | mov $in_x+8*0(%rsp), $acc4 # "4-5-0-1" order | ||
| 1183 | mov $in_x+8*1(%rsp), $acc5 | ||
| 1184 | lea $Zsqr(%rsp), $b_ptr | ||
| 1185 | mov $in_x+8*2(%rsp), $acc0 | ||
| 1186 | mov $in_x+8*3(%rsp), $acc1 | ||
| 1187 | lea $M(%rsp), $r_ptr | ||
| 1188 | call __ecp_nistz256_add_to$x # p256_add(M, in_x, Zsqr); | ||
| 1189 | |||
| 1190 | mov $in_x+8*0(%rsp), $acc4 # "4-5-0-1" order | ||
| 1191 | mov $in_x+8*1(%rsp), $acc5 | ||
| 1192 | lea $Zsqr(%rsp), $b_ptr | ||
| 1193 | mov $in_x+8*2(%rsp), $acc0 | ||
| 1194 | mov $in_x+8*3(%rsp), $acc1 | ||
| 1195 | lea $Zsqr(%rsp), $r_ptr | ||
| 1196 | call __ecp_nistz256_sub_from$x # p256_sub(Zsqr, in_x, Zsqr); | ||
| 1197 | |||
| 1198 | `&load_for_sqr("$S(%rsp)", "$src0")` | ||
| 1199 | movq %xmm1, $r_ptr | ||
| 1200 | call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_y, S); | ||
| 1201 | ___ | ||
| 1202 | { | ||
| 1203 | ######## ecp_nistz256_div_by_2(res_y, res_y); ########################## | ||
| 1204 | # operate in 4-5-6-7 "name space" that matches squaring output | ||
| 1205 | # | ||
| 1206 | my ($poly1,$poly3)=($a_ptr,$t1); | ||
| 1207 | my ($a0,$a1,$a2,$a3,$t3,$t4,$t1)=($acc4,$acc5,$acc6,$acc7,$acc0,$acc1,$acc2); | ||
| 1208 | |||
| 1209 | $code.=<<___; | ||
| 1210 | xor $t4, $t4 | ||
| 1211 | mov $a0, $t0 | ||
| 1212 | add \$-1, $a0 | ||
| 1213 | mov $a1, $t1 | ||
| 1214 | adc $poly1, $a1 | ||
| 1215 | mov $a2, $t2 | ||
| 1216 | adc \$0, $a2 | ||
| 1217 | mov $a3, $t3 | ||
| 1218 | adc $poly3, $a3 | ||
| 1219 | adc \$0, $t4 | ||
| 1220 | xor $a_ptr, $a_ptr # borrow $a_ptr | ||
| 1221 | test \$1, $t0 | ||
| 1222 | |||
| 1223 | cmovz $t0, $a0 | ||
| 1224 | cmovz $t1, $a1 | ||
| 1225 | cmovz $t2, $a2 | ||
| 1226 | cmovz $t3, $a3 | ||
| 1227 | cmovz $a_ptr, $t4 | ||
| 1228 | |||
| 1229 | mov $a1, $t0 # a0:a3>>1 | ||
| 1230 | shr \$1, $a0 | ||
| 1231 | shl \$63, $t0 | ||
| 1232 | mov $a2, $t1 | ||
| 1233 | shr \$1, $a1 | ||
| 1234 | or $t0, $a0 | ||
| 1235 | shl \$63, $t1 | ||
| 1236 | mov $a3, $t2 | ||
| 1237 | shr \$1, $a2 | ||
| 1238 | or $t1, $a1 | ||
| 1239 | shl \$63, $t2 | ||
| 1240 | mov $a0, 8*0($r_ptr) | ||
| 1241 | shr \$1, $a3 | ||
| 1242 | mov $a1, 8*1($r_ptr) | ||
| 1243 | shl \$63, $t4 | ||
| 1244 | or $t2, $a2 | ||
| 1245 | or $t4, $a3 | ||
| 1246 | mov $a2, 8*2($r_ptr) | ||
| 1247 | mov $a3, 8*3($r_ptr) | ||
| 1248 | ___ | ||
| 1249 | } | ||
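The inlined block above is the only place this halving pattern appears,
so a hedged C model may help: it computes res_y/2 mod p as
(a odd ? a + p : a) >> 1 over five 64-bit words. The names are invented,
and as before the branch stands in for a cmovz chain:

    #include <stdint.h>

    static const uint64_t P256[4] = {
            0xffffffffffffffffULL, 0x00000000ffffffffULL,
            0x0000000000000000ULL, 0xffffffff00000001ULL,
    };

    static void p256_div_by_2_model(uint64_t r[4], const uint64_t a[4])
    {
            uint64_t t[5];
            unsigned __int128 acc = 0;

            for (int i = 0; i < 4; i++) {   /* unconditionally form a + p */
                    acc += (unsigned __int128)a[i] + P256[i];
                    t[i] = (uint64_t)acc;
                    acc >>= 64;
            }
            t[4] = (uint64_t)acc;           /* the 257th bit */

            if (!(a[0] & 1)) {              /* cmovz chain: even a stays a */
                    for (int i = 0; i < 4; i++)
                            t[i] = a[i];
                    t[4] = 0;
            }
            for (int i = 0; i < 4; i++)     /* 257-bit right shift by one */
                    r[i] = (t[i] >> 1) | (t[i + 1] << 63);
    }

Since p is odd, a + p is even whenever a is odd, so the shift is exact.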
| 1250 | $code.=<<___; | ||
| 1251 | `&load_for_mul("$M(%rsp)", "$Zsqr(%rsp)", "$src0")` | ||
| 1252 | lea $M(%rsp), $r_ptr | ||
| 1253 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(M, M, Zsqr); | ||
| 1254 | |||
| 1255 | lea $tmp0(%rsp), $r_ptr | ||
| 1256 | call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(tmp0, M); | ||
| 1257 | |||
| 1258 | lea $M(%rsp), $b_ptr | ||
| 1259 | lea $M(%rsp), $r_ptr | ||
| 1260 | call __ecp_nistz256_add_to$x # p256_mul_by_3(M, M); | ||
| 1261 | |||
| 1262 | `&load_for_mul("$S(%rsp)", "$in_x(%rsp)", "$src0")` | ||
| 1263 | lea $S(%rsp), $r_ptr | ||
| 1264 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, in_x); | ||
| 1265 | |||
| 1266 | lea $tmp0(%rsp), $r_ptr | ||
| 1267 | call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(tmp0, S); | ||
| 1268 | |||
| 1269 | `&load_for_sqr("$M(%rsp)", "$src0")` | ||
| 1270 | movq %xmm0, $r_ptr | ||
| 1271 | call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_x, M); | ||
| 1272 | |||
| 1273 | lea $tmp0(%rsp), $b_ptr | ||
| 1274 | mov $acc6, $acc0 # harmonize sqr output and sub input | ||
| 1275 | mov $acc7, $acc1 | ||
| 1276 | mov $a_ptr, $poly1 | ||
| 1277 | mov $t1, $poly3 | ||
| 1278 | call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, tmp0); | ||
| 1279 | |||
| 1280 | mov $S+8*0(%rsp), $t0 | ||
| 1281 | mov $S+8*1(%rsp), $t1 | ||
| 1282 | mov $S+8*2(%rsp), $t2 | ||
| 1283 | mov $S+8*3(%rsp), $acc2 # "4-5-0-1" order | ||
| 1284 | lea $S(%rsp), $r_ptr | ||
| 1285 | call __ecp_nistz256_sub$x # p256_sub(S, S, res_x); | ||
| 1286 | |||
| 1287 | mov $M(%rsp), $src0 | ||
| 1288 | lea $M(%rsp), $b_ptr | ||
| 1289 | mov $acc4, $acc6 # harmonize sub output and mul input | ||
| 1290 | xor %ecx, %ecx # ZF=1, so the cmovz below are plain moves | ||
| 1291 | mov $acc4, $S+8*0(%rsp) # have to save:-( | ||
| 1292 | mov $acc5, $acc2 | ||
| 1293 | mov $acc5, $S+8*1(%rsp) | ||
| 1294 | cmovz $acc0, $acc3 | ||
| 1295 | mov $acc0, $S+8*2(%rsp) | ||
| 1296 | lea $S-$bias(%rsp), $a_ptr | ||
| 1297 | cmovz $acc1, $acc4 | ||
| 1298 | mov $acc1, $S+8*3(%rsp) | ||
| 1299 | mov $acc6, $acc1 | ||
| 1300 | lea $S(%rsp), $r_ptr | ||
| 1301 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, M); | ||
| 1302 | |||
| 1303 | movq %xmm1, $b_ptr | ||
| 1304 | movq %xmm1, $r_ptr | ||
| 1305 | call __ecp_nistz256_sub_from$x # p256_sub(res_y, S, res_y); | ||
| 1306 | |||
| 1307 | add \$32*5+8, %rsp | ||
| 1308 | pop %r15 | ||
| 1309 | pop %r14 | ||
| 1310 | pop %r13 | ||
| 1311 | pop %r12 | ||
| 1312 | pop %rbx | ||
| 1313 | pop %rbp | ||
| 1314 | ret | ||
| 1315 | .size ecp_nistz256_point_double$sfx,.-ecp_nistz256_point_double$sfx | ||
| 1316 | ___ | ||
| 1317 | } | ||
| 1318 | &gen_double("q"); | ||
| 1319 | |||
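Collecting the comments above into one place: gen_double emits the
standard Jacobian doubling sequence. The sketch below simply transcribes
those annotations into C; the felem/P256_POINT types and the p256_*
prototypes are hypothetical stand-ins for the __ecp_nistz256_* routines
in this file, and r must not alias a (the asm copies its input to the
stack first).

    #include <stdint.h>

    typedef uint64_t felem[4];
    typedef struct { felem X, Y, Z; } P256_POINT;

    void p256_mul_mont(felem r, const felem a, const felem b);
    void p256_sqr_mont(felem r, const felem a);
    void p256_add(felem r, const felem a, const felem b);
    void p256_sub(felem r, const felem a, const felem b);
    void p256_mul_by_2(felem r, const felem a);
    void p256_mul_by_3(felem r, const felem a);
    void p256_div_by_2(felem r, const felem a);

    void point_double_sketch(P256_POINT *r, const P256_POINT *a)
    {
            felem S, M, Zsqr, tmp0;

            p256_mul_by_2(S, a->Y);                 /* S = 2*Y */
            p256_sqr_mont(Zsqr, a->Z);              /* Zsqr = Z^2 */
            p256_sqr_mont(S, S);                    /* S = 4*Y^2 */
            p256_mul_mont(r->Z, a->Z, a->Y);        /* res_z = Y*Z */
            p256_mul_by_2(r->Z, r->Z);              /* res_z = 2*Y*Z */
            p256_add(M, a->X, Zsqr);                /* M = X + Z^2 */
            p256_sub(Zsqr, a->X, Zsqr);             /* Zsqr = X - Z^2 */
            p256_sqr_mont(r->Y, S);                 /* res_y = 16*Y^4 */
            p256_div_by_2(r->Y, r->Y);              /* res_y = 8*Y^4 */
            p256_mul_mont(M, M, Zsqr);              /* M = (X+Z^2)*(X-Z^2) */
            p256_mul_by_3(M, M);                    /* M = 3*(X+Z^2)*(X-Z^2) */
            p256_mul_mont(S, S, a->X);              /* S = 4*X*Y^2 */
            p256_mul_by_2(tmp0, S);                 /* tmp0 = 8*X*Y^2 */
            p256_sqr_mont(r->X, M);                 /* res_x = M^2 */
            p256_sub(r->X, r->X, tmp0);             /* res_x = M^2 - 8*X*Y^2 */
            p256_sub(S, S, r->X);                   /* S = 4*X*Y^2 - res_x */
            p256_mul_mont(S, S, M);                 /* S = M*(4*X*Y^2 - res_x) */
            p256_sub(r->Y, S, r->Y);                /* res_y = S - 8*Y^4 */
    }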
| 1320 | sub gen_add () { | ||
| 1321 | my $x = shift; | ||
| 1322 | my ($src0,$sfx,$bias); | ||
| 1323 | my ($H,$Hsqr,$R,$Rsqr,$Hcub, | ||
| 1324 | $U1,$U2,$S1,$S2, | ||
| 1325 | $res_x,$res_y,$res_z, | ||
| 1326 | $in1_x,$in1_y,$in1_z, | ||
| 1327 | $in2_x,$in2_y,$in2_z)=map(32*$_,(0..17)); | ||
| 1328 | my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); | ||
| 1329 | |||
| 1330 | if ($x ne "x") { | ||
| 1331 | $src0 = "%rax"; | ||
| 1332 | $sfx = ""; | ||
| 1333 | $bias = 0; | ||
| 1334 | |||
| 1335 | $code.=<<___; | ||
| 1336 | .globl ecp_nistz256_point_add | ||
| 1337 | .type ecp_nistz256_point_add,\@function,3 | ||
| 1338 | .align 32 | ||
| 1339 | ecp_nistz256_point_add: | ||
| 1340 | ___ | ||
| 1341 | } else { | ||
| 1342 | $src0 = "%rdx"; | ||
| 1343 | $sfx = "x"; | ||
| 1344 | $bias = 128; | ||
| 1345 | } | ||
| 1346 | $code.=<<___; | ||
| 1347 | push %rbp | ||
| 1348 | push %rbx | ||
| 1349 | push %r12 | ||
| 1350 | push %r13 | ||
| 1351 | push %r14 | ||
| 1352 | push %r15 | ||
| 1353 | sub \$32*18+8, %rsp | ||
| 1354 | |||
| 1355 | movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr | ||
| 1356 | movdqu 0x10($a_ptr), %xmm1 | ||
| 1357 | movdqu 0x20($a_ptr), %xmm2 | ||
| 1358 | movdqu 0x30($a_ptr), %xmm3 | ||
| 1359 | movdqu 0x40($a_ptr), %xmm4 | ||
| 1360 | movdqu 0x50($a_ptr), %xmm5 | ||
| 1361 | mov $a_ptr, $b_ptr # reassign | ||
| 1362 | mov $b_org, $a_ptr # reassign | ||
| 1363 | movdqa %xmm0, $in1_x(%rsp) | ||
| 1364 | movdqa %xmm1, $in1_x+0x10(%rsp) | ||
| 1365 | por %xmm0, %xmm1 | ||
| 1366 | movdqa %xmm2, $in1_y(%rsp) | ||
| 1367 | movdqa %xmm3, $in1_y+0x10(%rsp) | ||
| 1368 | por %xmm2, %xmm3 | ||
| 1369 | movdqa %xmm4, $in1_z(%rsp) | ||
| 1370 | movdqa %xmm5, $in1_z+0x10(%rsp) | ||
| 1371 | por %xmm1, %xmm3 | ||
| 1372 | |||
| 1373 | movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$b_org | ||
| 1374 | pshufd \$0xb1, %xmm3, %xmm5 | ||
| 1375 | movdqu 0x10($a_ptr), %xmm1 | ||
| 1376 | movdqu 0x20($a_ptr), %xmm2 | ||
| 1377 | por %xmm3, %xmm5 | ||
| 1378 | movdqu 0x30($a_ptr), %xmm3 | ||
| 1379 | mov 0x40+8*0($a_ptr), $src0 # load original in2_z | ||
| 1380 | mov 0x40+8*1($a_ptr), $acc6 | ||
| 1381 | mov 0x40+8*2($a_ptr), $acc7 | ||
| 1382 | mov 0x40+8*3($a_ptr), $acc0 | ||
| 1383 | movdqa %xmm0, $in2_x(%rsp) | ||
| 1384 | pshufd \$0x1e, %xmm5, %xmm4 | ||
| 1385 | movdqa %xmm1, $in2_x+0x10(%rsp) | ||
| 1386 | por %xmm0, %xmm1 | ||
| 1387 | movq $r_ptr, %xmm0 # save $r_ptr | ||
| 1388 | movdqa %xmm2, $in2_y(%rsp) | ||
| 1389 | movdqa %xmm3, $in2_y+0x10(%rsp) | ||
| 1390 | por %xmm2, %xmm3 | ||
| 1391 | por %xmm4, %xmm5 | ||
| 1392 | pxor %xmm4, %xmm4 | ||
| 1393 | por %xmm1, %xmm3 | ||
| 1394 | |||
| 1395 | lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid | ||
| 1396 | mov $src0, $in2_z+8*0(%rsp) # make in2_z copy | ||
| 1397 | mov $acc6, $in2_z+8*1(%rsp) | ||
| 1398 | mov $acc7, $in2_z+8*2(%rsp) | ||
| 1399 | mov $acc0, $in2_z+8*3(%rsp) | ||
| 1400 | lea $Z2sqr(%rsp), $r_ptr # Z2^2 | ||
| 1401 | call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z2sqr, in2_z); | ||
| 1402 | |||
| 1403 | pcmpeqd %xmm4, %xmm5 | ||
| 1404 | pshufd \$0xb1, %xmm3, %xmm4 | ||
| 1405 | por %xmm3, %xmm4 | ||
| 1406 | pshufd \$0, %xmm5, %xmm5 # in1infty | ||
| 1407 | pshufd \$0x1e, %xmm4, %xmm3 | ||
| 1408 | por %xmm3, %xmm4 | ||
| 1409 | pxor %xmm3, %xmm3 | ||
| 1410 | pcmpeqd %xmm3, %xmm4 | ||
| 1411 | pshufd \$0, %xmm4, %xmm4 # in2infty | ||
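	# The pshufd/por rounds above OR together all eight 32-bit words of
	# each input's x|y coordinates; pcmpeqd then turns "all zero" into an
	# all-ones lane and the final broadcast spreads it, so %xmm5 and %xmm4
	# are now all-ones masks iff in1 resp. in2 encode the point at
	# infinity (all-zero x and y).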
| 1412 | mov 0x40+8*0($b_ptr), $src0 # load original in1_z | ||
| 1413 | mov 0x40+8*1($b_ptr), $acc6 | ||
| 1414 | mov 0x40+8*2($b_ptr), $acc7 | ||
| 1415 | mov 0x40+8*3($b_ptr), $acc0 | ||
| 1416 | movq $b_ptr, %xmm1 | ||
| 1417 | |||
| 1418 | lea 0x40-$bias($b_ptr), $a_ptr | ||
| 1419 | lea $Z1sqr(%rsp), $r_ptr # Z1^2 | ||
| 1420 | call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z); | ||
| 1421 | |||
| 1422 | `&load_for_mul("$Z2sqr(%rsp)", "$in2_z(%rsp)", "$src0")` | ||
| 1423 | lea $S1(%rsp), $r_ptr # S1 = Z2^3 | ||
| 1424 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, Z2sqr, in2_z); | ||
| 1425 | |||
| 1426 | `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")` | ||
| 1427 | lea $S2(%rsp), $r_ptr # S2 = Z1^3 | ||
| 1428 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z); | ||
| 1429 | |||
| 1430 | `&load_for_mul("$S1(%rsp)", "$in1_y(%rsp)", "$src0")` | ||
| 1431 | lea $S1(%rsp), $r_ptr # S1 = Y1*Z2^3 | ||
| 1432 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, S1, in1_y); | ||
| 1433 | |||
| 1434 | `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")` | ||
| 1435 | lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3 | ||
| 1436 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y); | ||
| 1437 | |||
| 1438 | lea $S1(%rsp), $b_ptr | ||
| 1439 | lea $R(%rsp), $r_ptr # R = S2 - S1 | ||
| 1440 | call __ecp_nistz256_sub_from$x # p256_sub(R, S2, S1); | ||
| 1441 | |||
| 1442 | or $acc5, $acc4 # see if result is zero | ||
| 1443 | movdqa %xmm4, %xmm2 | ||
| 1444 | or $acc0, $acc4 | ||
| 1445 | or $acc1, $acc4 | ||
| 1446 | por %xmm5, %xmm2 # in1infty || in2infty | ||
| 1447 | movq $acc4, %xmm3 | ||
| 1448 | |||
| 1449 | `&load_for_mul("$Z2sqr(%rsp)", "$in1_x(%rsp)", "$src0")` | ||
| 1450 | lea $U1(%rsp), $r_ptr # U1 = X1*Z2^2 | ||
| 1451 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(U1, in1_x, Z2sqr); | ||
| 1452 | |||
| 1453 | `&load_for_mul("$Z1sqr(%rsp)", "$in2_x(%rsp)", "$src0")` | ||
| 1454 | lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2 | ||
| 1455 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in2_x, Z1sqr); | ||
| 1456 | |||
| 1457 | lea $U1(%rsp), $b_ptr | ||
| 1458 | lea $H(%rsp), $r_ptr # H = U2 - U1 | ||
| 1459 | call __ecp_nistz256_sub_from$x # p256_sub(H, U2, U1); | ||
| 1460 | |||
| 1461 | or $acc5, $acc4 # see if result is zero | ||
| 1462 | or $acc0, $acc4 | ||
| 1463 | or $acc1, $acc4 | ||
| 1464 | |||
| 1465 | .byte 0x3e # predict taken | ||
| 1466 | jnz .Ladd_proceed$x # is_equal(U1,U2)? | ||
| 1467 | movq %xmm2, $acc0 | ||
| 1468 | movq %xmm3, $acc1 | ||
| 1469 | test $acc0, $acc0 | ||
| 1470 | jnz .Ladd_proceed$x # (in1infty || in2infty)? | ||
| 1471 | test $acc1, $acc1 | ||
| 1472 | jz .Ladd_double$x # is_equal(S1,S2)? | ||
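	# Reaching here: H == 0, both inputs finite, and S1 != S2, i.e.
	# P + (-P).  Return the point at infinity by zeroing the result.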
| 1473 | |||
| 1474 | movq %xmm0, $r_ptr # restore $r_ptr | ||
| 1475 | pxor %xmm0, %xmm0 | ||
| 1476 | movdqu %xmm0, 0x00($r_ptr) | ||
| 1477 | movdqu %xmm0, 0x10($r_ptr) | ||
| 1478 | movdqu %xmm0, 0x20($r_ptr) | ||
| 1479 | movdqu %xmm0, 0x30($r_ptr) | ||
| 1480 | movdqu %xmm0, 0x40($r_ptr) | ||
| 1481 | movdqu %xmm0, 0x50($r_ptr) | ||
| 1482 | jmp .Ladd_done$x | ||
| 1483 | |||
| 1484 | .align 32 | ||
| 1485 | .Ladd_double$x: | ||
| 1486 | movq %xmm1, $a_ptr # restore $a_ptr | ||
| 1487 | movq %xmm0, $r_ptr # restore $r_ptr | ||
| 1488 | add \$`32*(18-5)`, %rsp # difference in frame sizes | ||
| 1489 | jmp .Lpoint_double_shortcut$x | ||
| 1490 | |||
| 1491 | .align 32 | ||
| 1492 | .Ladd_proceed$x: | ||
| 1493 | `&load_for_sqr("$R(%rsp)", "$src0")` | ||
| 1494 | lea $Rsqr(%rsp), $r_ptr # R^2 | ||
| 1495 | call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R); | ||
| 1496 | |||
| 1497 | `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")` | ||
| 1498 | lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2 | ||
| 1499 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z); | ||
| 1500 | |||
| 1501 | `&load_for_sqr("$H(%rsp)", "$src0")` | ||
| 1502 | lea $Hsqr(%rsp), $r_ptr # H^2 | ||
| 1503 | call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H); | ||
| 1504 | |||
| 1505 | `&load_for_mul("$res_z(%rsp)", "$in2_z(%rsp)", "$src0")` | ||
| 1506 | lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2 | ||
| 1507 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, res_z, in2_z); | ||
| 1508 | |||
| 1509 | `&load_for_mul("$Hsqr(%rsp)", "$H(%rsp)", "$src0")` | ||
| 1510 | lea $Hcub(%rsp), $r_ptr # H^3 | ||
| 1511 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H); | ||
| 1512 | |||
| 1513 | `&load_for_mul("$Hsqr(%rsp)", "$U1(%rsp)", "$src0")` | ||
| 1514 | lea $U2(%rsp), $r_ptr # U1*H^2 | ||
| 1515 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, U1, Hsqr); | ||
| 1516 | ___ | ||
| 1517 | { | ||
| 1518 | ####################################################################### | ||
| 1519 | # operate in 4-5-0-1 "name space" that matches multiplication output | ||
| 1520 | # | ||
| 1521 | my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); | ||
| 1522 | my ($poly1, $poly3)=($acc6,$acc7); | ||
| 1523 | |||
| 1524 | $code.=<<___; | ||
| 1525 | #lea $U2(%rsp), $a_ptr | ||
| 1526 | #lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2 | ||
| 1527 | #call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2); | ||
| 1528 | |||
| 1529 | add $acc0, $acc0 # a0:a3+a0:a3 | ||
| 1530 | lea $Rsqr(%rsp), $a_ptr | ||
| 1531 | adc $acc1, $acc1 | ||
| 1532 | mov $acc0, $t0 | ||
| 1533 | adc $acc2, $acc2 | ||
| 1534 | adc $acc3, $acc3 | ||
| 1535 | mov $acc1, $t1 | ||
| 1536 | sbb $t4, $t4 | ||
| 1537 | |||
| 1538 | sub \$-1, $acc0 | ||
| 1539 | mov $acc2, $t2 | ||
| 1540 | sbb $poly1, $acc1 | ||
| 1541 | sbb \$0, $acc2 | ||
| 1542 | mov $acc3, $t3 | ||
| 1543 | sbb $poly3, $acc3 | ||
| 1544 | test $t4, $t4 | ||
| 1545 | |||
| 1546 | cmovz $t0, $acc0 | ||
| 1547 | mov 8*0($a_ptr), $t0 | ||
| 1548 | cmovz $t1, $acc1 | ||
| 1549 | mov 8*1($a_ptr), $t1 | ||
| 1550 | cmovz $t2, $acc2 | ||
| 1551 | mov 8*2($a_ptr), $t2 | ||
| 1552 | cmovz $t3, $acc3 | ||
| 1553 | mov 8*3($a_ptr), $t3 | ||
| 1554 | |||
| 1555 | call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr); | ||
| 1556 | |||
| 1557 | lea $Hcub(%rsp), $b_ptr | ||
| 1558 | lea $res_x(%rsp), $r_ptr | ||
| 1559 | call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub); | ||
| 1560 | |||
| 1561 | mov $U2+8*0(%rsp), $t0 | ||
| 1562 | mov $U2+8*1(%rsp), $t1 | ||
| 1563 | mov $U2+8*2(%rsp), $t2 | ||
| 1564 | mov $U2+8*3(%rsp), $t3 | ||
| 1565 | lea $res_y(%rsp), $r_ptr | ||
| 1566 | |||
| 1567 | call __ecp_nistz256_sub$x # p256_sub(res_y, U2, res_x); | ||
| 1568 | |||
| 1569 | mov $acc0, 8*0($r_ptr) # save the result, as | ||
| 1570 | mov $acc1, 8*1($r_ptr) # __ecp_nistz256_sub doesn't store it | ||
| 1571 | mov $acc2, 8*2($r_ptr) | ||
| 1572 | mov $acc3, 8*3($r_ptr) | ||
| 1573 | ___ | ||
| 1574 | } | ||
| 1575 | $code.=<<___; | ||
| 1576 | `&load_for_mul("$S1(%rsp)", "$Hcub(%rsp)", "$src0")` | ||
| 1577 | lea $S2(%rsp), $r_ptr | ||
| 1578 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S1, Hcub); | ||
| 1579 | |||
| 1580 | `&load_for_mul("$R(%rsp)", "$res_y(%rsp)", "$src0")` | ||
| 1581 | lea $res_y(%rsp), $r_ptr | ||
| 1582 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_y, R, res_y); | ||
| 1583 | |||
| 1584 | lea $S2(%rsp), $b_ptr | ||
| 1585 | lea $res_y(%rsp), $r_ptr | ||
| 1586 | call __ecp_nistz256_sub_from$x # p256_sub(res_y, res_y, S2); | ||
| 1587 | |||
| 1588 | movq %xmm0, $r_ptr # restore $r_ptr | ||
| 1589 | |||
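	# The four pand/pandn/por cascades below are constant-time selects,
	#     res = (infty_mask & other_input) | (~infty_mask & computed),
	# applied per 128-bit half to the z, x and y coordinates in turn
	# (copy_conditional in the C reference code).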
| 1590 | movdqa %xmm5, %xmm0 # copy_conditional(res_z, in2_z, in1infty); | ||
| 1591 | movdqa %xmm5, %xmm1 | ||
| 1592 | pandn $res_z(%rsp), %xmm0 | ||
| 1593 | movdqa %xmm5, %xmm2 | ||
| 1594 | pandn $res_z+0x10(%rsp), %xmm1 | ||
| 1595 | movdqa %xmm5, %xmm3 | ||
| 1596 | pand $in2_z(%rsp), %xmm2 | ||
| 1597 | pand $in2_z+0x10(%rsp), %xmm3 | ||
| 1598 | por %xmm0, %xmm2 | ||
| 1599 | por %xmm1, %xmm3 | ||
| 1600 | |||
| 1601 | movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty); | ||
| 1602 | movdqa %xmm4, %xmm1 | ||
| 1603 | pandn %xmm2, %xmm0 | ||
| 1604 | movdqa %xmm4, %xmm2 | ||
| 1605 | pandn %xmm3, %xmm1 | ||
| 1606 | movdqa %xmm4, %xmm3 | ||
| 1607 | pand $in1_z(%rsp), %xmm2 | ||
| 1608 | pand $in1_z+0x10(%rsp), %xmm3 | ||
| 1609 | por %xmm0, %xmm2 | ||
| 1610 | por %xmm1, %xmm3 | ||
| 1611 | movdqu %xmm2, 0x40($r_ptr) | ||
| 1612 | movdqu %xmm3, 0x50($r_ptr) | ||
| 1613 | |||
| 1614 | movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty); | ||
| 1615 | movdqa %xmm5, %xmm1 | ||
| 1616 | pandn $res_x(%rsp), %xmm0 | ||
| 1617 | movdqa %xmm5, %xmm2 | ||
| 1618 | pandn $res_x+0x10(%rsp), %xmm1 | ||
| 1619 | movdqa %xmm5, %xmm3 | ||
| 1620 | pand $in2_x(%rsp), %xmm2 | ||
| 1621 | pand $in2_x+0x10(%rsp), %xmm3 | ||
| 1622 | por %xmm0, %xmm2 | ||
| 1623 | por %xmm1, %xmm3 | ||
| 1624 | |||
| 1625 | movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty); | ||
| 1626 | movdqa %xmm4, %xmm1 | ||
| 1627 | pandn %xmm2, %xmm0 | ||
| 1628 | movdqa %xmm4, %xmm2 | ||
| 1629 | pandn %xmm3, %xmm1 | ||
| 1630 | movdqa %xmm4, %xmm3 | ||
| 1631 | pand $in1_x(%rsp), %xmm2 | ||
| 1632 | pand $in1_x+0x10(%rsp), %xmm3 | ||
| 1633 | por %xmm0, %xmm2 | ||
| 1634 | por %xmm1, %xmm3 | ||
| 1635 | movdqu %xmm2, 0x00($r_ptr) | ||
| 1636 | movdqu %xmm3, 0x10($r_ptr) | ||
| 1637 | |||
| 1638 | movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty); | ||
| 1639 | movdqa %xmm5, %xmm1 | ||
| 1640 | pandn $res_y(%rsp), %xmm0 | ||
| 1641 | movdqa %xmm5, %xmm2 | ||
| 1642 | pandn $res_y+0x10(%rsp), %xmm1 | ||
| 1643 | movdqa %xmm5, %xmm3 | ||
| 1644 | pand $in2_y(%rsp), %xmm2 | ||
| 1645 | pand $in2_y+0x10(%rsp), %xmm3 | ||
| 1646 | por %xmm0, %xmm2 | ||
| 1647 | por %xmm1, %xmm3 | ||
| 1648 | |||
| 1649 | movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty); | ||
| 1650 | movdqa %xmm4, %xmm1 | ||
| 1651 | pandn %xmm2, %xmm0 | ||
| 1652 | movdqa %xmm4, %xmm2 | ||
| 1653 | pandn %xmm3, %xmm1 | ||
| 1654 | movdqa %xmm4, %xmm3 | ||
| 1655 | pand $in1_y(%rsp), %xmm2 | ||
| 1656 | pand $in1_y+0x10(%rsp), %xmm3 | ||
| 1657 | por %xmm0, %xmm2 | ||
| 1658 | por %xmm1, %xmm3 | ||
| 1659 | movdqu %xmm2, 0x20($r_ptr) | ||
| 1660 | movdqu %xmm3, 0x30($r_ptr) | ||
| 1661 | |||
| 1662 | .Ladd_done$x: | ||
| 1663 | add \$32*18+8, %rsp | ||
| 1664 | pop %r15 | ||
| 1665 | pop %r14 | ||
| 1666 | pop %r13 | ||
| 1667 | pop %r12 | ||
| 1668 | pop %rbx | ||
| 1669 | pop %rbp | ||
| 1670 | ret | ||
| 1671 | .size ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx | ||
| 1672 | ___ | ||
| 1673 | } | ||
| 1674 | &gen_add("q"); | ||
| 1675 | |||
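As with doubling, the annotations in gen_add can be collected into a
C-level sketch. It reuses the hypothetical types and p256_* prototypes
from the doubling sketch above; r must not alias the inputs, and the
special cases (the branches to .Ladd_double$x and the zeroed result) are
shown as comments rather than modelled.

    void point_add_sketch(P256_POINT *r, const P256_POINT *a,
        const P256_POINT *b)
    {
            felem Z1sqr, Z2sqr, S1, S2, U1, U2, H, R, Hsqr, Hcub, Rsqr;

            p256_sqr_mont(Z2sqr, b->Z);             /* Z2^2 */
            p256_sqr_mont(Z1sqr, a->Z);             /* Z1^2 */
            p256_mul_mont(S1, Z2sqr, b->Z);         /* Z2^3 */
            p256_mul_mont(S2, Z1sqr, a->Z);         /* Z1^3 */
            p256_mul_mont(S1, S1, a->Y);            /* S1 = Y1*Z2^3 */
            p256_mul_mont(S2, S2, b->Y);            /* S2 = Y2*Z1^3 */
            p256_sub(R, S2, S1);                    /* R = S2 - S1 */
            p256_mul_mont(U1, a->X, Z2sqr);         /* U1 = X1*Z2^2 */
            p256_mul_mont(U2, b->X, Z1sqr);         /* U2 = X2*Z1^2 */
            p256_sub(H, U2, U1);                    /* H = U2 - U1 */

            /* if H == 0 and neither input is infinity: R == 0 means
             * a == b (take the doubling shortcut), R != 0 means
             * a == -b (return the point at infinity) */

            p256_sqr_mont(Rsqr, R);                 /* R^2 */
            p256_mul_mont(r->Z, H, a->Z);           /* Z3 = H*Z1 */
            p256_sqr_mont(Hsqr, H);                 /* H^2 */
            p256_mul_mont(r->Z, r->Z, b->Z);        /* Z3 = H*Z1*Z2 */
            p256_mul_mont(Hcub, Hsqr, H);           /* H^3 */
            p256_mul_mont(U2, U1, Hsqr);            /* U1*H^2 */
            p256_mul_by_2(Hsqr, U2);                /* 2*U1*H^2 */
            p256_sub(r->X, Rsqr, Hsqr);             /* X3 = R^2 - 2*U1*H^2 */
            p256_sub(r->X, r->X, Hcub);             /* X3 -= H^3 */
            p256_sub(r->Y, U2, r->X);               /* U1*H^2 - X3 */
            p256_mul_mont(S2, S1, Hcub);            /* S1*H^3 */
            p256_mul_mont(r->Y, R, r->Y);           /* R*(U1*H^2 - X3) */
            p256_sub(r->Y, r->Y, S2);               /* Y3 = R*(...) - S1*H^3 */

            /* the mask selects then substitute b when a was infinity
             * and a when b was infinity */
    }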
| 1676 | sub gen_add_affine () { | ||
| 1677 | my $x = shift; | ||
| 1678 | my ($src0,$sfx,$bias); | ||
| 1679 | my ($U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr, | ||
| 1680 | $res_x,$res_y,$res_z, | ||
| 1681 | $in1_x,$in1_y,$in1_z, | ||
| 1682 | $in2_x,$in2_y)=map(32*$_,(0..14)); | ||
| 1683 | my $Z1sqr = $S2; | ||
| 1684 | |||
| 1685 | if ($x ne "x") { | ||
| 1686 | $src0 = "%rax"; | ||
| 1687 | $sfx = ""; | ||
| 1688 | $bias = 0; | ||
| 1689 | |||
| 1690 | $code.=<<___; | ||
| 1691 | .globl ecp_nistz256_point_add_affine | ||
| 1692 | .type ecp_nistz256_point_add_affine,\@function,3 | ||
| 1693 | .align 32 | ||
| 1694 | ecp_nistz256_point_add_affine: | ||
| 1695 | ___ | ||
| 1696 | } else { | ||
| 1697 | $src0 = "%rdx"; | ||
| 1698 | $sfx = "x"; | ||
| 1699 | $bias = 128; | ||
| 1700 | } | ||
| 1701 | $code.=<<___; | ||
| 1702 | push %rbp | ||
| 1703 | push %rbx | ||
| 1704 | push %r12 | ||
| 1705 | push %r13 | ||
| 1706 | push %r14 | ||
| 1707 | push %r15 | ||
| 1708 | sub \$32*15+8, %rsp | ||
| 1709 | |||
| 1710 | movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr | ||
| 1711 | mov $b_org, $b_ptr # reassign | ||
| 1712 | movdqu 0x10($a_ptr), %xmm1 | ||
| 1713 | movdqu 0x20($a_ptr), %xmm2 | ||
| 1714 | movdqu 0x30($a_ptr), %xmm3 | ||
| 1715 | movdqu 0x40($a_ptr), %xmm4 | ||
| 1716 | movdqu 0x50($a_ptr), %xmm5 | ||
| 1717 | mov 0x40+8*0($a_ptr), $src0 # load original in1_z | ||
| 1718 | mov 0x40+8*1($a_ptr), $acc6 | ||
| 1719 | mov 0x40+8*2($a_ptr), $acc7 | ||
| 1720 | mov 0x40+8*3($a_ptr), $acc0 | ||
| 1721 | movdqa %xmm0, $in1_x(%rsp) | ||
| 1722 | movdqa %xmm1, $in1_x+0x10(%rsp) | ||
| 1723 | por %xmm0, %xmm1 | ||
| 1724 | movdqa %xmm2, $in1_y(%rsp) | ||
| 1725 | movdqa %xmm3, $in1_y+0x10(%rsp) | ||
| 1726 | por %xmm2, %xmm3 | ||
| 1727 | movdqa %xmm4, $in1_z(%rsp) | ||
| 1728 | movdqa %xmm5, $in1_z+0x10(%rsp) | ||
| 1729 | por %xmm1, %xmm3 | ||
| 1730 | |||
| 1731 | movdqu 0x00($b_ptr), %xmm0 # copy *(P256_POINT_AFFINE *)$b_ptr | ||
| 1732 | pshufd \$0xb1, %xmm3, %xmm5 | ||
| 1733 | movdqu 0x10($b_ptr), %xmm1 | ||
| 1734 | movdqu 0x20($b_ptr), %xmm2 | ||
| 1735 | por %xmm3, %xmm5 | ||
| 1736 | movdqu 0x30($b_ptr), %xmm3 | ||
| 1737 | movdqa %xmm0, $in2_x(%rsp) | ||
| 1738 | pshufd \$0x1e, %xmm5, %xmm4 | ||
| 1739 | movdqa %xmm1, $in2_x+0x10(%rsp) | ||
| 1740 | por %xmm0, %xmm1 | ||
| 1741 | movq $r_ptr, %xmm0 # save $r_ptr | ||
| 1742 | movdqa %xmm2, $in2_y(%rsp) | ||
| 1743 | movdqa %xmm3, $in2_y+0x10(%rsp) | ||
| 1744 | por %xmm2, %xmm3 | ||
| 1745 | por %xmm4, %xmm5 | ||
| 1746 | pxor %xmm4, %xmm4 | ||
| 1747 | por %xmm1, %xmm3 | ||
| 1748 | |||
| 1749 | lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid | ||
| 1750 | lea $Z1sqr(%rsp), $r_ptr # Z1^2 | ||
| 1751 | call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z); | ||
| 1752 | |||
| 1753 | pcmpeqd %xmm4, %xmm5 | ||
| 1754 | pshufd \$0xb1, %xmm3, %xmm4 | ||
| 1755 | mov 0x00($b_ptr), $src0 # $b_ptr is still valid | ||
| 1756 | #lea 0x00($b_ptr), $b_ptr | ||
| 1757 | mov $acc4, $acc1 # harmonize sqr output and mul input | ||
| 1758 | por %xmm3, %xmm4 | ||
| 1759 | pshufd \$0, %xmm5, %xmm5 # in1infty | ||
| 1760 | pshufd \$0x1e, %xmm4, %xmm3 | ||
| 1761 | mov $acc5, $acc2 | ||
| 1762 | por %xmm3, %xmm4 | ||
| 1763 | pxor %xmm3, %xmm3 | ||
| 1764 | mov $acc6, $acc3 | ||
| 1765 | pcmpeqd %xmm3, %xmm4 | ||
| 1766 | pshufd \$0, %xmm4, %xmm4 # in2infty | ||
| 1767 | |||
| 1768 | lea $Z1sqr-$bias(%rsp), $a_ptr | ||
| 1769 | mov $acc7, $acc4 | ||
| 1770 | lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2 | ||
| 1771 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, Z1sqr, in2_x); | ||
| 1772 | |||
| 1773 | lea $in1_x(%rsp), $b_ptr | ||
| 1774 | lea $H(%rsp), $r_ptr # H = U2 - U1 | ||
| 1775 | call __ecp_nistz256_sub_from$x # p256_sub(H, U2, in1_x); | ||
| 1776 | |||
| 1777 | `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")` | ||
| 1778 | lea $S2(%rsp), $r_ptr # S2 = Z1^3 | ||
| 1779 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z); | ||
| 1780 | |||
| 1781 | `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")` | ||
| 1782 | lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2 | ||
| 1783 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z); | ||
| 1784 | |||
| 1785 | `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")` | ||
| 1786 | lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3 | ||
| 1787 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y); | ||
| 1788 | |||
| 1789 | lea $in1_y(%rsp), $b_ptr | ||
| 1790 | lea $R(%rsp), $r_ptr # R = S2 - S1 | ||
| 1791 | call __ecp_nistz256_sub_from$x # p256_sub(R, S2, in1_y); | ||
| 1792 | |||
| 1793 | `&load_for_sqr("$H(%rsp)", "$src0")` | ||
| 1794 | lea $Hsqr(%rsp), $r_ptr # H^2 | ||
| 1795 | call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H); | ||
| 1796 | |||
| 1797 | `&load_for_sqr("$R(%rsp)", "$src0")` | ||
| 1798 | lea $Rsqr(%rsp), $r_ptr # R^2 | ||
| 1799 | call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R); | ||
| 1800 | |||
| 1801 | `&load_for_mul("$H(%rsp)", "$Hsqr(%rsp)", "$src0")` | ||
| 1802 | lea $Hcub(%rsp), $r_ptr # H^3 | ||
| 1803 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H); | ||
| 1804 | |||
| 1805 | `&load_for_mul("$Hsqr(%rsp)", "$in1_x(%rsp)", "$src0")` | ||
| 1806 | lea $U2(%rsp), $r_ptr # U1*H^2 | ||
| 1807 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in1_x, Hsqr); | ||
| 1808 | ___ | ||
| 1809 | { | ||
| 1810 | ####################################################################### | ||
| 1811 | # operate in 4-5-0-1 "name space" that matches multiplication output | ||
| 1812 | # | ||
| 1813 | my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); | ||
| 1814 | my ($poly1, $poly3)=($acc6,$acc7); | ||
| 1815 | |||
| 1816 | $code.=<<___; | ||
| 1817 | #lea $U2(%rsp), $a_ptr | ||
| 1818 | #lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2 | ||
| 1819 | #call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2); | ||
| 1820 | |||
| 1821 | add $acc0, $acc0 # a0:a3+a0:a3 | ||
| 1822 | lea $Rsqr(%rsp), $a_ptr | ||
| 1823 | adc $acc1, $acc1 | ||
| 1824 | mov $acc0, $t0 | ||
| 1825 | adc $acc2, $acc2 | ||
| 1826 | adc $acc3, $acc3 | ||
| 1827 | mov $acc1, $t1 | ||
| 1828 | sbb $t4, $t4 | ||
| 1829 | |||
| 1830 | sub \$-1, $acc0 | ||
| 1831 | mov $acc2, $t2 | ||
| 1832 | sbb $poly1, $acc1 | ||
| 1833 | sbb \$0, $acc2 | ||
| 1834 | mov $acc3, $t3 | ||
| 1835 | sbb $poly3, $acc3 | ||
| 1836 | test $t4, $t4 | ||
| 1837 | |||
| 1838 | cmovz $t0, $acc0 | ||
| 1839 | mov 8*0($a_ptr), $t0 | ||
| 1840 | cmovz $t1, $acc1 | ||
| 1841 | mov 8*1($a_ptr), $t1 | ||
| 1842 | cmovz $t2, $acc2 | ||
| 1843 | mov 8*2($a_ptr), $t2 | ||
| 1844 | cmovz $t3, $acc3 | ||
| 1845 | mov 8*3($a_ptr), $t3 | ||
| 1846 | |||
| 1847 | call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr); | ||
| 1848 | |||
| 1849 | lea $Hcub(%rsp), $b_ptr | ||
| 1850 | lea $res_x(%rsp), $r_ptr | ||
| 1851 | call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub); | ||
| 1852 | |||
| 1853 | mov $U2+8*0(%rsp), $t0 | ||
| 1854 | mov $U2+8*1(%rsp), $t1 | ||
| 1855 | mov $U2+8*2(%rsp), $t2 | ||
| 1856 | mov $U2+8*3(%rsp), $t3 | ||
| 1857 | lea $H(%rsp), $r_ptr | ||
| 1858 | |||
| 1859 | call __ecp_nistz256_sub$x # p256_sub(H, U2, res_x); | ||
| 1860 | |||
| 1861 | mov $acc0, 8*0($r_ptr) # save the result, as | ||
| 1862 | mov $acc1, 8*1($r_ptr) # __ecp_nistz256_sub doesn't store it | ||
| 1863 | mov $acc2, 8*2($r_ptr) | ||
| 1864 | mov $acc3, 8*3($r_ptr) | ||
| 1865 | ___ | ||
| 1866 | } | ||
| 1867 | $code.=<<___; | ||
| 1868 | `&load_for_mul("$Hcub(%rsp)", "$in1_y(%rsp)", "$src0")` | ||
| 1869 | lea $S2(%rsp), $r_ptr | ||
| 1870 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Hcub, in1_y); | ||
| 1871 | |||
| 1872 | `&load_for_mul("$H(%rsp)", "$R(%rsp)", "$src0")` | ||
| 1873 | lea $H(%rsp), $r_ptr | ||
| 1874 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(H, H, R); | ||
| 1875 | |||
| 1876 | lea $S2(%rsp), $b_ptr | ||
| 1877 | lea $res_y(%rsp), $r_ptr | ||
| 1878 | call __ecp_nistz256_sub_from$x # p256_sub(res_y, H, S2); | ||
| 1879 | |||
| 1880 | movq %xmm0, $r_ptr # restore $r_ptr | ||
| 1881 | |||
| 1882 | movdqa %xmm5, %xmm0 # copy_conditional(res_z, ONE, in1infty); | ||
| 1883 | movdqa %xmm5, %xmm1 | ||
| 1884 | pandn $res_z(%rsp), %xmm0 | ||
| 1885 | movdqa %xmm5, %xmm2 | ||
| 1886 | pandn $res_z+0x10(%rsp), %xmm1 | ||
| 1887 | movdqa %xmm5, %xmm3 | ||
| 1888 | pand .LONE_mont(%rip), %xmm2 | ||
| 1889 | pand .LONE_mont+0x10(%rip), %xmm3 | ||
| 1890 | por %xmm0, %xmm2 | ||
| 1891 | por %xmm1, %xmm3 | ||
| 1892 | |||
| 1893 | movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty); | ||
| 1894 | movdqa %xmm4, %xmm1 | ||
| 1895 | pandn %xmm2, %xmm0 | ||
| 1896 | movdqa %xmm4, %xmm2 | ||
| 1897 | pandn %xmm3, %xmm1 | ||
| 1898 | movdqa %xmm4, %xmm3 | ||
| 1899 | pand $in1_z(%rsp), %xmm2 | ||
| 1900 | pand $in1_z+0x10(%rsp), %xmm3 | ||
| 1901 | por %xmm0, %xmm2 | ||
| 1902 | por %xmm1, %xmm3 | ||
| 1903 | movdqu %xmm2, 0x40($r_ptr) | ||
| 1904 | movdqu %xmm3, 0x50($r_ptr) | ||
| 1905 | |||
| 1906 | movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty); | ||
| 1907 | movdqa %xmm5, %xmm1 | ||
| 1908 | pandn $res_x(%rsp), %xmm0 | ||
| 1909 | movdqa %xmm5, %xmm2 | ||
| 1910 | pandn $res_x+0x10(%rsp), %xmm1 | ||
| 1911 | movdqa %xmm5, %xmm3 | ||
| 1912 | pand $in2_x(%rsp), %xmm2 | ||
| 1913 | pand $in2_x+0x10(%rsp), %xmm3 | ||
| 1914 | por %xmm0, %xmm2 | ||
| 1915 | por %xmm1, %xmm3 | ||
| 1916 | |||
| 1917 | movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty); | ||
| 1918 | movdqa %xmm4, %xmm1 | ||
| 1919 | pandn %xmm2, %xmm0 | ||
| 1920 | movdqa %xmm4, %xmm2 | ||
| 1921 | pandn %xmm3, %xmm1 | ||
| 1922 | movdqa %xmm4, %xmm3 | ||
| 1923 | pand $in1_x(%rsp), %xmm2 | ||
| 1924 | pand $in1_x+0x10(%rsp), %xmm3 | ||
| 1925 | por %xmm0, %xmm2 | ||
| 1926 | por %xmm1, %xmm3 | ||
| 1927 | movdqu %xmm2, 0x00($r_ptr) | ||
| 1928 | movdqu %xmm3, 0x10($r_ptr) | ||
| 1929 | |||
| 1930 | movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty); | ||
| 1931 | movdqa %xmm5, %xmm1 | ||
| 1932 | pandn $res_y(%rsp), %xmm0 | ||
| 1933 | movdqa %xmm5, %xmm2 | ||
| 1934 | pandn $res_y+0x10(%rsp), %xmm1 | ||
| 1935 | movdqa %xmm5, %xmm3 | ||
| 1936 | pand $in2_y(%rsp), %xmm2 | ||
| 1937 | pand $in2_y+0x10(%rsp), %xmm3 | ||
| 1938 | por %xmm0, %xmm2 | ||
| 1939 | por %xmm1, %xmm3 | ||
| 1940 | |||
| 1941 | movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty); | ||
| 1942 | movdqa %xmm4, %xmm1 | ||
| 1943 | pandn %xmm2, %xmm0 | ||
| 1944 | movdqa %xmm4, %xmm2 | ||
| 1945 | pandn %xmm3, %xmm1 | ||
| 1946 | movdqa %xmm4, %xmm3 | ||
| 1947 | pand $in1_y(%rsp), %xmm2 | ||
| 1948 | pand $in1_y+0x10(%rsp), %xmm3 | ||
| 1949 | por %xmm0, %xmm2 | ||
| 1950 | por %xmm1, %xmm3 | ||
| 1951 | movdqu %xmm2, 0x20($r_ptr) | ||
| 1952 | movdqu %xmm3, 0x30($r_ptr) | ||
| 1953 | |||
| 1954 | add \$32*15+8, %rsp | ||
| 1955 | pop %r15 | ||
| 1956 | pop %r14 | ||
| 1957 | pop %r13 | ||
| 1958 | pop %r12 | ||
| 1959 | pop %rbx | ||
| 1960 | pop %rbp | ||
| 1961 | ret | ||
| 1962 | .size ecp_nistz256_point_add_affine$sfx,.-ecp_nistz256_point_add_affine$sfx | ||
| 1963 | ___ | ||
| 1964 | } | ||
| 1965 | &gen_add_affine("q"); | ||
| 1966 | |||
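Both point_add flavours finish with the same mask-select cascade, and
gen_add_affine additionally substitutes .LONE_mont (1 in Montgomery
form) for the missing Z2 when in1 is infinity, since an affine second
input has an implicit Z = 1. A minimal C model of one select, with
invented names:

    #include <stdint.h>

    /* models one pand/pandn/por cascade: r = mask ? a : r, where mask
     * is the all-ones/all-zeros in1infty/in2infty value */
    static void copy_conditional_model(uint64_t r[4], const uint64_t a[4],
        uint64_t mask)
    {
            for (int i = 0; i < 4; i++)
                    r[i] = (mask & a[i]) | (~mask & r[i]);
    }

In the asm the mask lives in an XMM register and the select is done 128
bits at a time; the scalar loop above is only meant to show the dataflow.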
| 1967 | }}} | ||
| 1968 | |||
| 1969 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | ||
| 1970 | print $code; | ||
| 1971 | close STDOUT; | ||
