author     jsing <>    2023-01-14 15:45:44 +0000
committer  jsing <>    2023-01-14 15:45:44 +0000
commit     2caf68e3ec46ff4ba172978eb728e2aa23948684 (patch)
tree       2beaa725f80865b4e4c7d3384563e3ed4940579e /src/lib/libcrypto/ec/asm
parent     e182204c3487929ef9f6791554e79586f4d30335 (diff)
download   openbsd-2caf68e3ec46ff4ba172978eb728e2aa23948684.tar.gz
           openbsd-2caf68e3ec46ff4ba172978eb728e2aa23948684.tar.bz2
           openbsd-2caf68e3ec46ff4ba172978eb728e2aa23948684.zip
Remove unused Elliptic Curve code.
For various reasons, the ecp_nistp* and ecp_nistz* code is unused. While
ecp_nistp* was being compiled, it was disabled because
OPENSSL_NO_EC_NISTP_64_GCC_128 is defined. ecp_nistz*, on the other hand,
was not being built at all.
We will bring in new or alternative versions of this code if we end up
enabling it in the future. For now it only adds complexity (and grep
noise) while we try to improve the EC code.
Discussed with tb@
Diffstat (limited to 'src/lib/libcrypto/ec/asm')
 -rw-r--r--  src/lib/libcrypto/ec/asm/ecp_nistz256-armv4.pl    | 1733
 -rw-r--r--  src/lib/libcrypto/ec/asm/ecp_nistz256-sparcv9.pl  | 2890
 -rw-r--r--  src/lib/libcrypto/ec/asm/ecp_nistz256-x86.pl      | 1740
 -rw-r--r--  src/lib/libcrypto/ec/asm/ecp_nistz256-x86_64.pl   | 1971
 4 files changed, 0 insertions, 8334 deletions
diff --git a/src/lib/libcrypto/ec/asm/ecp_nistz256-armv4.pl b/src/lib/libcrypto/ec/asm/ecp_nistz256-armv4.pl
deleted file mode 100644
index 9e6c65905f..0000000000
--- a/src/lib/libcrypto/ec/asm/ecp_nistz256-armv4.pl
+++ /dev/null
@@ -1,1733 +0,0 @@
#! /usr/bin/env perl
# $OpenBSD: ecp_nistz256-armv4.pl,v 1.2 2022/12/26 07:18:51 jmc Exp $
#
# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# ECP_NISTZ256 module for ARMv4.
#
# October 2014.
#
# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
# http://eprint.iacr.org/2013/816. In the process of adaptation
# original .c module was made 32-bit savvy in order to make this
# implementation possible.
#
#			with/without -DECP_NISTZ256_ASM
# Cortex-A8		+53-170%
# Cortex-A9		+76-205%
# Cortex-A15		+100-316%
# Snapdragon S4		+66-187%
#
# Ranges denote minimum and maximum improvement coefficients depending
# on benchmark. Lower coefficients are for ECDSA sign, server-side
# operation. Keep in mind that +200% means 3x improvement.

$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
	$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
	( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
	( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
	die "can't locate arm-xlate.pl";

	open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
	open STDOUT,">$output";
}

$code.=<<___;
#include "arm_arch.h"

.text
#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif
___

$code.=<<___;
.Lone:
.long	1,0,0,0,0,0,0,0
.align	6
___

########################################################################
# common register layout, note that $t2 is link register, so that if
# internal subroutine uses $t2, then it has to offload lr...

($r_ptr,$a_ptr,$b_ptr,$ff,$a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7,$t1,$t2)=
		map("r$_",(0..12,14));
($t0,$t3)=($ff,$a_ptr);

$code.=<<___;
@ void	ecp_nistz256_from_mont(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_from_mont
.type	ecp_nistz256_from_mont,%function
ecp_nistz256_from_mont:
	adr	$b_ptr,.Lone
	b	.Lecp_nistz256_mul_mont
.size	ecp_nistz256_from_mont,.-ecp_nistz256_from_mont

@ void	ecp_nistz256_mul_by_2(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_mul_by_2
.type	ecp_nistz256_mul_by_2,%function
.align	4
ecp_nistz256_mul_by_2:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_mul_by_2
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2

.type	__ecp_nistz256_mul_by_2,%function
.align	4
__ecp_nistz256_mul_by_2:
	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	adds	$a0,$a0,$a0		@ a[0:7]+=a[0:7], i.e. add with itself
	ldr	$a3,[$a_ptr,#12]
	adcs	$a1,$a1,$a1
	ldr	$a4,[$a_ptr,#16]
	adcs	$a2,$a2,$a2
	ldr	$a5,[$a_ptr,#20]
	adcs	$a3,$a3,$a3
	ldr	$a6,[$a_ptr,#24]
	adcs	$a4,$a4,$a4
	ldr	$a7,[$a_ptr,#28]
	adcs	$a5,$a5,$a5
	adcs	$a6,$a6,$a6
	mov	$ff,#0
	adcs	$a7,$a7,$a7
	adc	$ff,$ff,#0

	b	.Lreduce_by_sub
.size	__ecp_nistz256_mul_by_2,.-__ecp_nistz256_mul_by_2

@ void	ecp_nistz256_add(BN_ULONG r0[8],const BN_ULONG r1[8],
@				 const BN_ULONG r2[8]);
.globl	ecp_nistz256_add
.type	ecp_nistz256_add,%function
.align	4
ecp_nistz256_add:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_add
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_add,.-ecp_nistz256_add

.type	__ecp_nistz256_add,%function
.align	4
__ecp_nistz256_add:
	str	lr,[sp,#-4]!		@ push lr

	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	ldr	$a3,[$a_ptr,#12]
	ldr	$a4,[$a_ptr,#16]
	ldr	$t0,[$b_ptr,#0]
	ldr	$a5,[$a_ptr,#20]
	ldr	$t1,[$b_ptr,#4]
	ldr	$a6,[$a_ptr,#24]
	ldr	$t2,[$b_ptr,#8]
	ldr	$a7,[$a_ptr,#28]
	ldr	$t3,[$b_ptr,#12]
	adds	$a0,$a0,$t0
	ldr	$t0,[$b_ptr,#16]
	adcs	$a1,$a1,$t1
	ldr	$t1,[$b_ptr,#20]
	adcs	$a2,$a2,$t2
	ldr	$t2,[$b_ptr,#24]
	adcs	$a3,$a3,$t3
	ldr	$t3,[$b_ptr,#28]
	adcs	$a4,$a4,$t0
	adcs	$a5,$a5,$t1
	adcs	$a6,$a6,$t2
	mov	$ff,#0
	adcs	$a7,$a7,$t3
	adc	$ff,$ff,#0
	ldr	lr,[sp],#4		@ pop lr

.Lreduce_by_sub:

	@ if a+b >= modulus, subtract modulus.
	@
	@ But since comparison implies subtraction, we subtract
	@ modulus and then add it back if subtraction borrowed.

	subs	$a0,$a0,#-1
	sbcs	$a1,$a1,#-1
	sbcs	$a2,$a2,#-1
	sbcs	$a3,$a3,#0
	sbcs	$a4,$a4,#0
	sbcs	$a5,$a5,#0
	sbcs	$a6,$a6,#1
	sbcs	$a7,$a7,#-1
	sbc	$ff,$ff,#0

	@ Note that because mod has special form, i.e. consists of
	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	@ using value of borrow as a whole or extracting single bit.
	@ Follow $ff register...

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	str	$a0,[$r_ptr,#0]
	adcs	$a2,$a2,$ff
	str	$a1,[$r_ptr,#4]
	adcs	$a3,$a3,#0
	str	$a2,[$r_ptr,#8]
	adcs	$a4,$a4,#0
	str	$a3,[$r_ptr,#12]
	adcs	$a5,$a5,#0
	str	$a4,[$r_ptr,#16]
	adcs	$a6,$a6,$ff,lsr#31
	str	$a5,[$r_ptr,#20]
	adcs	$a7,$a7,$ff
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_add,.-__ecp_nistz256_add
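
	@ Worked example of .Lreduce_by_sub: with p = 2^256-2^224+2^192+
	@ 2^96-1, the subs/sbcs chain above computes {$ff,a[7:0]} minus
	@ {0,p} word by word (#-1 being the 0xffffffff words of p). For
	@ a+b = p+5 this leaves 5 with no borrow, so the adds/adcs chain
	@ adds 0; for a+b = p-5 it borrows into $ff and the synthesized
	@ p is added back, restoring p-5.
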
@ void	ecp_nistz256_mul_by_3(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_mul_by_3
.type	ecp_nistz256_mul_by_3,%function
.align	4
ecp_nistz256_mul_by_3:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_mul_by_3
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3

.type	__ecp_nistz256_mul_by_3,%function
.align	4
__ecp_nistz256_mul_by_3:
	str	lr,[sp,#-4]!		@ push lr

	@ As multiplication by 3 is performed as 2*n+n, below are inline
	@ copies of __ecp_nistz256_mul_by_2 and __ecp_nistz256_add, see
	@ corresponding subroutines for details.

	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	adds	$a0,$a0,$a0		@ a[0:7]+=a[0:7]
	ldr	$a3,[$a_ptr,#12]
	adcs	$a1,$a1,$a1
	ldr	$a4,[$a_ptr,#16]
	adcs	$a2,$a2,$a2
	ldr	$a5,[$a_ptr,#20]
	adcs	$a3,$a3,$a3
	ldr	$a6,[$a_ptr,#24]
	adcs	$a4,$a4,$a4
	ldr	$a7,[$a_ptr,#28]
	adcs	$a5,$a5,$a5
	adcs	$a6,$a6,$a6
	mov	$ff,#0
	adcs	$a7,$a7,$a7
	adc	$ff,$ff,#0

	subs	$a0,$a0,#-1		@ .Lreduce_by_sub but without stores
	sbcs	$a1,$a1,#-1
	sbcs	$a2,$a2,#-1
	sbcs	$a3,$a3,#0
	sbcs	$a4,$a4,#0
	sbcs	$a5,$a5,#0
	sbcs	$a6,$a6,#1
	sbcs	$a7,$a7,#-1
	sbc	$ff,$ff,#0

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	adcs	$a2,$a2,$ff
	adcs	$a3,$a3,#0
	adcs	$a4,$a4,#0
	ldr	$b_ptr,[$a_ptr,#0]
	adcs	$a5,$a5,#0
	ldr	$t1,[$a_ptr,#4]
	adcs	$a6,$a6,$ff,lsr#31
	ldr	$t2,[$a_ptr,#8]
	adc	$a7,$a7,$ff

	ldr	$t0,[$a_ptr,#12]
	adds	$a0,$a0,$b_ptr		@ 2*a[0:7]+=a[0:7]
	ldr	$b_ptr,[$a_ptr,#16]
	adcs	$a1,$a1,$t1
	ldr	$t1,[$a_ptr,#20]
	adcs	$a2,$a2,$t2
	ldr	$t2,[$a_ptr,#24]
	adcs	$a3,$a3,$t0
	ldr	$t3,[$a_ptr,#28]
	adcs	$a4,$a4,$b_ptr
	adcs	$a5,$a5,$t1
	adcs	$a6,$a6,$t2
	mov	$ff,#0
	adcs	$a7,$a7,$t3
	adc	$ff,$ff,#0
	ldr	lr,[sp],#4		@ pop lr

	b	.Lreduce_by_sub
.size	__ecp_nistz256_mul_by_3,.-__ecp_nistz256_mul_by_3

@ void	ecp_nistz256_div_by_2(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_div_by_2
.type	ecp_nistz256_div_by_2,%function
.align	4
ecp_nistz256_div_by_2:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_div_by_2
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2

.type	__ecp_nistz256_div_by_2,%function
.align	4
__ecp_nistz256_div_by_2:
	@ ret = (a is odd ? a+mod : a) >> 1

	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	mov	$ff,$a0,lsl#31		@ place least significant bit to most
					@ significant position, now arithmetic
					@ right shift by 31 will produce -1 or
					@ 0, while logical right shift 1 or 0,
					@ this is how modulus is conditionally
					@ synthesized in this case...
	ldr	$a3,[$a_ptr,#12]
	adds	$a0,$a0,$ff,asr#31
	ldr	$a4,[$a_ptr,#16]
	adcs	$a1,$a1,$ff,asr#31
	ldr	$a5,[$a_ptr,#20]
	adcs	$a2,$a2,$ff,asr#31
	ldr	$a6,[$a_ptr,#24]
	adcs	$a3,$a3,#0
	ldr	$a7,[$a_ptr,#28]
	adcs	$a4,$a4,#0
	mov	$a0,$a0,lsr#1		@ a[0:7]>>=1, we can start early
					@ because it doesn't affect flags
	adcs	$a5,$a5,#0
	orr	$a0,$a0,$a1,lsl#31
	adcs	$a6,$a6,$ff,lsr#31
	mov	$b_ptr,#0
	adcs	$a7,$a7,$ff,asr#31
	mov	$a1,$a1,lsr#1
	adc	$b_ptr,$b_ptr,#0	@ top-most carry bit from addition

	orr	$a1,$a1,$a2,lsl#31
	mov	$a2,$a2,lsr#1
	str	$a0,[$r_ptr,#0]
	orr	$a2,$a2,$a3,lsl#31
	mov	$a3,$a3,lsr#1
	str	$a1,[$r_ptr,#4]
	orr	$a3,$a3,$a4,lsl#31
	mov	$a4,$a4,lsr#1
	str	$a2,[$r_ptr,#8]
	orr	$a4,$a4,$a5,lsl#31
	mov	$a5,$a5,lsr#1
	str	$a3,[$r_ptr,#12]
	orr	$a5,$a5,$a6,lsl#31
	mov	$a6,$a6,lsr#1
	str	$a4,[$r_ptr,#16]
	orr	$a6,$a6,$a7,lsl#31
	mov	$a7,$a7,lsr#1
	str	$a5,[$r_ptr,#20]
	orr	$a7,$a7,$b_ptr,lsl#31	@ don't forget the top-most carry bit
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
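
	@ Worked example of the halving above: for odd a, a+p is even and
	@ (a+p)/2 == a * 2^-1 mod p, so halving never loses the low bit.
	@ The 33-bit sum {$b_ptr,a[7:0]} is then shifted right one bit,
	@ each word donating its low bit to the word below.
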
@ void	ecp_nistz256_sub(BN_ULONG r0[8],const BN_ULONG r1[8],
@				 const BN_ULONG r2[8]);
.globl	ecp_nistz256_sub
.type	ecp_nistz256_sub,%function
.align	4
ecp_nistz256_sub:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_sub
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_sub,.-ecp_nistz256_sub

.type	__ecp_nistz256_sub,%function
.align	4
__ecp_nistz256_sub:
	str	lr,[sp,#-4]!		@ push lr

	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	ldr	$a3,[$a_ptr,#12]
	ldr	$a4,[$a_ptr,#16]
	ldr	$t0,[$b_ptr,#0]
	ldr	$a5,[$a_ptr,#20]
	ldr	$t1,[$b_ptr,#4]
	ldr	$a6,[$a_ptr,#24]
	ldr	$t2,[$b_ptr,#8]
	ldr	$a7,[$a_ptr,#28]
	ldr	$t3,[$b_ptr,#12]
	subs	$a0,$a0,$t0
	ldr	$t0,[$b_ptr,#16]
	sbcs	$a1,$a1,$t1
	ldr	$t1,[$b_ptr,#20]
	sbcs	$a2,$a2,$t2
	ldr	$t2,[$b_ptr,#24]
	sbcs	$a3,$a3,$t3
	ldr	$t3,[$b_ptr,#28]
	sbcs	$a4,$a4,$t0
	sbcs	$a5,$a5,$t1
	sbcs	$a6,$a6,$t2
	sbcs	$a7,$a7,$t3
	sbc	$ff,$ff,$ff		@ broadcast borrow bit
	ldr	lr,[sp],#4		@ pop lr

.Lreduce_by_add:

	@ if a-b borrows, add modulus.
	@
	@ Note that because mod has special form, i.e. consists of
	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	@ broadcasting borrow bit to a register, $ff, and using it as
	@ a whole or extracting single bit.

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	str	$a0,[$r_ptr,#0]
	adcs	$a2,$a2,$ff
	str	$a1,[$r_ptr,#4]
	adcs	$a3,$a3,#0
	str	$a2,[$r_ptr,#8]
	adcs	$a4,$a4,#0
	str	$a3,[$r_ptr,#12]
	adcs	$a5,$a5,#0
	str	$a4,[$r_ptr,#16]
	adcs	$a6,$a6,$ff,lsr#31
	str	$a5,[$r_ptr,#20]
	adcs	$a7,$a7,$ff
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_sub,.-__ecp_nistz256_sub
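
	@ After "sbc $ff,$ff,$ff", $ff is 0 (no borrow) or 0xffffffff
	@ (borrow), so the eight words of p are synthesized as
	@ $ff,$ff,$ff,0,0,0,$ff>>31,$ff: exactly 0 or p, added in a
	@ single carry chain.
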
@ void	ecp_nistz256_neg(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_neg
.type	ecp_nistz256_neg,%function
.align	4
ecp_nistz256_neg:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_neg
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_neg,.-ecp_nistz256_neg

.type	__ecp_nistz256_neg,%function
.align	4
__ecp_nistz256_neg:
	ldr	$a0,[$a_ptr,#0]
	eor	$ff,$ff,$ff
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	subs	$a0,$ff,$a0
	ldr	$a3,[$a_ptr,#12]
	sbcs	$a1,$ff,$a1
	ldr	$a4,[$a_ptr,#16]
	sbcs	$a2,$ff,$a2
	ldr	$a5,[$a_ptr,#20]
	sbcs	$a3,$ff,$a3
	ldr	$a6,[$a_ptr,#24]
	sbcs	$a4,$ff,$a4
	ldr	$a7,[$a_ptr,#28]
	sbcs	$a5,$ff,$a5
	sbcs	$a6,$ff,$a6
	sbcs	$a7,$ff,$a7
	sbc	$ff,$ff,$ff

	b	.Lreduce_by_add
.size	__ecp_nistz256_neg,.-__ecp_nistz256_neg
___
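
# Reference versions of the three reductions above in plain Perl, kept
# compiled-out like the NEON experiment further down (a hedged sketch;
# the sub names are hypothetical and nothing here is emitted):
if (0) {
	use Math::BigInt;
	my $p = Math::BigInt->new(
	    "0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff");
	sub reduce_by_sub {		# for sums a+b with a,b < p
		my ($x) = @_;
		my $t = $x - $p;	# always subtract the modulus...
		return $t->is_neg() ? $x : $t;	# ...and undo it on borrow
	}
	sub reduce_by_add {		# for differences a-b with a,b < p
		my ($x) = @_;
		return $x->is_neg() ? $x + $p : $x;
	}
	sub div_by_2 {			# (a odd ? a+p : a) >> 1
		my ($x) = @_;
		$x = $x + $p if $x->is_odd();	# even now, same value mod p
		return $x >> 1;
	}
}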
{
my @acc=map("r$_",(3..11));
my ($t0,$t1,$bj,$t2,$t3)=map("r$_",(0,1,2,12,14));

$code.=<<___;
@ void	ecp_nistz256_sqr_mont(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_sqr_mont
.type	ecp_nistz256_sqr_mont,%function
.align	4
ecp_nistz256_sqr_mont:
	mov	$b_ptr,$a_ptr
	b	.Lecp_nistz256_mul_mont
.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont

@ void	ecp_nistz256_mul_mont(BN_ULONG r0[8],const BN_ULONG r1[8],
@				      const BN_ULONG r2[8]);
.globl	ecp_nistz256_mul_mont
.type	ecp_nistz256_mul_mont,%function
.align	4
ecp_nistz256_mul_mont:
.Lecp_nistz256_mul_mont:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_mul_mont
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont

.type	__ecp_nistz256_mul_mont,%function
.align	4
__ecp_nistz256_mul_mont:
	stmdb	sp!,{r0-r2,lr}		@ make a copy of arguments too

	ldr	$bj,[$b_ptr,#0]		@ b[0]
	ldmia	$a_ptr,{@acc[1]-@acc[8]}

	umull	@acc[0],$t3,@acc[1],$bj	@ r[0]=a[0]*b[0]
	stmdb	sp!,{@acc[1]-@acc[8]}	@ copy a[0-7] to stack, so
					@ that it can be addressed
					@ without spending register
					@ on address
	umull	@acc[1],$t0,@acc[2],$bj	@ r[1]=a[1]*b[0]
	umull	@acc[2],$t1,@acc[3],$bj
	adds	@acc[1],@acc[1],$t3	@ accumulate high part of mult
	umull	@acc[3],$t2,@acc[4],$bj
	adcs	@acc[2],@acc[2],$t0
	umull	@acc[4],$t3,@acc[5],$bj
	adcs	@acc[3],@acc[3],$t1
	umull	@acc[5],$t0,@acc[6],$bj
	adcs	@acc[4],@acc[4],$t2
	umull	@acc[6],$t1,@acc[7],$bj
	adcs	@acc[5],@acc[5],$t3
	umull	@acc[7],$t2,@acc[8],$bj
	adcs	@acc[6],@acc[6],$t0
	adcs	@acc[7],@acc[7],$t1
	eor	$t3,$t3,$t3		@ first overflow bit is zero
	adc	@acc[8],$t2,#0
___
for(my $i=1;$i<8;$i++) {
my $t4=@acc[0];

	# Reduction iteration is normally performed by accumulating
	# result of multiplication of modulus by "magic" digit [and
	# omitting least significant word, which is guaranteed to
	# be 0], but thanks to special form of modulus and "magic"
	# digit being equal to least significant word, it can be
	# performed with additions and subtractions alone. Indeed:
	#
	#        ffff.0001.0000.0000.0000.ffff.ffff.ffff
	# *                                         abcd
	# + xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
	#
	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
	# rewrite above as:
	#
	#   xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
	# + abcd.0000.abcd.0000.0000.abcd.0000.0000.0000
	# -      abcd.0000.0000.0000.0000.0000.0000.abcd
	#
	# or marking redundant operations:
	#
	#   xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.----
	# + abcd.0000.abcd.0000.0000.abcd.----.----.----
	# -      abcd.----.----.----.----.----.----.----

$code.=<<___;
	@ multiplication-less reduction $i
	adds	@acc[3],@acc[3],@acc[0]	@ r[3]+=r[0]
	ldr	$bj,[sp,#40]		@ restore b_ptr
	adcs	@acc[4],@acc[4],#0	@ r[4]+=0
	adcs	@acc[5],@acc[5],#0	@ r[5]+=0
	adcs	@acc[6],@acc[6],@acc[0]	@ r[6]+=r[0]
	ldr	$t1,[sp,#0]		@ load a[0]
	adcs	@acc[7],@acc[7],#0	@ r[7]+=0
	ldr	$bj,[$bj,#4*$i]		@ load b[i]
	adcs	@acc[8],@acc[8],@acc[0]	@ r[8]+=r[0]
	eor	$t0,$t0,$t0
	adc	$t3,$t3,#0		@ overflow bit
	subs	@acc[7],@acc[7],@acc[0]	@ r[7]-=r[0]
	ldr	$t2,[sp,#4]		@ a[1]
	sbcs	@acc[8],@acc[8],#0	@ r[8]-=0
	umlal	@acc[1],$t0,$t1,$bj	@ "r[0]"+=a[0]*b[i]
	eor	$t1,$t1,$t1
	sbc	@acc[0],$t3,#0		@ overflow bit, keep in mind
					@ that netto result is
					@ addition of a value which
					@ makes underflow impossible

	ldr	$t3,[sp,#8]		@ a[2]
	umlal	@acc[2],$t1,$t2,$bj	@ "r[1]"+=a[1]*b[i]
	str	@acc[0],[sp,#36]	@ temporarily offload overflow
	eor	$t2,$t2,$t2
	ldr	$t4,[sp,#12]		@ a[3], $t4 is alias @acc[0]
	umlal	@acc[3],$t2,$t3,$bj	@ "r[2]"+=a[2]*b[i]
	eor	$t3,$t3,$t3
	adds	@acc[2],@acc[2],$t0	@ accumulate high part of mult
	ldr	$t0,[sp,#16]		@ a[4]
	umlal	@acc[4],$t3,$t4,$bj	@ "r[3]"+=a[3]*b[i]
	eor	$t4,$t4,$t4
	adcs	@acc[3],@acc[3],$t1
	ldr	$t1,[sp,#20]		@ a[5]
	umlal	@acc[5],$t4,$t0,$bj	@ "r[4]"+=a[4]*b[i]
	eor	$t0,$t0,$t0
	adcs	@acc[4],@acc[4],$t2
	ldr	$t2,[sp,#24]		@ a[6]
	umlal	@acc[6],$t0,$t1,$bj	@ "r[5]"+=a[5]*b[i]
	eor	$t1,$t1,$t1
	adcs	@acc[5],@acc[5],$t3
	ldr	$t3,[sp,#28]		@ a[7]
	umlal	@acc[7],$t1,$t2,$bj	@ "r[6]"+=a[6]*b[i]
	eor	$t2,$t2,$t2
	adcs	@acc[6],@acc[6],$t4
	ldr	@acc[0],[sp,#36]	@ restore overflow bit
	umlal	@acc[8],$t2,$t3,$bj	@ "r[7]"+=a[7]*b[i]
	eor	$t3,$t3,$t3
	adcs	@acc[7],@acc[7],$t0
	adcs	@acc[8],@acc[8],$t1
	adcs	@acc[0],@acc[0],$t2
	adc	$t3,$t3,#0		@ new overflow bit
___
	push(@acc,shift(@acc));		# rotate registers, so that
					# "r[i]" becomes r[i]
}
$code.=<<___;
	@ last multiplication-less reduction
	adds	@acc[3],@acc[3],@acc[0]
	ldr	$r_ptr,[sp,#32]		@ restore r_ptr
	adcs	@acc[4],@acc[4],#0
	adcs	@acc[5],@acc[5],#0
	adcs	@acc[6],@acc[6],@acc[0]
	adcs	@acc[7],@acc[7],#0
	adcs	@acc[8],@acc[8],@acc[0]
	adc	$t3,$t3,#0
	subs	@acc[7],@acc[7],@acc[0]
	sbcs	@acc[8],@acc[8],#0
	sbc	@acc[0],$t3,#0		@ overflow bit

	@ Final step is "if result > mod, subtract mod", but we do it
	@ "other way around", namely subtract modulus from result
	@ and if it borrowed, add modulus back.

	adds	@acc[1],@acc[1],#1	@ subs @acc[1],@acc[1],#-1
	adcs	@acc[2],@acc[2],#0	@ sbcs @acc[2],@acc[2],#-1
	adcs	@acc[3],@acc[3],#0	@ sbcs @acc[3],@acc[3],#-1
	sbcs	@acc[4],@acc[4],#0
	sbcs	@acc[5],@acc[5],#0
	sbcs	@acc[6],@acc[6],#0
	sbcs	@acc[7],@acc[7],#1
	adcs	@acc[8],@acc[8],#0	@ sbcs @acc[8],@acc[8],#-1
	ldr	lr,[sp,#44]		@ restore lr
	sbc	@acc[0],@acc[0],#0	@ broadcast borrow bit
	add	sp,sp,#48

	@ Note that because mod has special form, i.e. consists of
	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	@ broadcasting borrow bit to a register, @acc[0], and using it as
	@ a whole or extracting single bit.

	adds	@acc[1],@acc[1],@acc[0]	@ add modulus or zero
	adcs	@acc[2],@acc[2],@acc[0]
	str	@acc[1],[$r_ptr,#0]
	adcs	@acc[3],@acc[3],@acc[0]
	str	@acc[2],[$r_ptr,#4]
	adcs	@acc[4],@acc[4],#0
	str	@acc[3],[$r_ptr,#8]
	adcs	@acc[5],@acc[5],#0
	str	@acc[4],[$r_ptr,#12]
	adcs	@acc[6],@acc[6],#0
	str	@acc[5],[$r_ptr,#16]
	adcs	@acc[7],@acc[7],@acc[0],lsr#31
	str	@acc[6],[$r_ptr,#20]
	adc	@acc[8],@acc[8],@acc[0]
	str	@acc[7],[$r_ptr,#24]
	str	@acc[8],[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont
___
}
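
# The "multiplication-less reduction" identity used above, checked with
# bignums (compiled-out sketch; the variable names are illustrative only):
if (0) {
	use Math::BigInt;
	my $two = Math::BigInt->new(2);
	my $p  = ($two**256) - ($two**224) + ($two**192) + ($two**96) - 1;
	my $r0 = Math::BigInt->new(0xabcd);	# the "magic" digit
	die "identity broken" unless $r0*$p ==
	    ($r0<<256) - ($r0<<224) + ($r0<<192) + ($r0<<96) - $r0;
}
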
{
my ($out,$inp,$index,$mask)=map("r$_",(0..3));
$code.=<<___;
@ void	ecp_nistz256_select_w5(P256_POINT *r0,const void *r1,
@				       int r2);
.globl	ecp_nistz256_select_w5
.type	ecp_nistz256_select_w5,%function
.align	5
ecp_nistz256_select_w5:
	stmdb	sp!,{r4-r11}

	cmp	$index,#0
	mov	$mask,#0
#ifdef __thumb2__
	itt	ne
#endif
	subne	$index,$index,#1
	movne	$mask,#-1
	add	$inp,$inp,$index,lsl#2

	ldr	r4,[$inp,#64*0]
	ldr	r5,[$inp,#64*1]
	ldr	r6,[$inp,#64*2]
	and	r4,r4,$mask
	ldr	r7,[$inp,#64*3]
	and	r5,r5,$mask
	ldr	r8,[$inp,#64*4]
	and	r6,r6,$mask
	ldr	r9,[$inp,#64*5]
	and	r7,r7,$mask
	ldr	r10,[$inp,#64*6]
	and	r8,r8,$mask
	ldr	r11,[$inp,#64*7]
	add	$inp,$inp,#64*8
	and	r9,r9,$mask
	and	r10,r10,$mask
	and	r11,r11,$mask
	stmia	$out!,{r4-r11}	@ X

	ldr	r4,[$inp,#64*0]
	ldr	r5,[$inp,#64*1]
	ldr	r6,[$inp,#64*2]
	and	r4,r4,$mask
	ldr	r7,[$inp,#64*3]
	and	r5,r5,$mask
	ldr	r8,[$inp,#64*4]
	and	r6,r6,$mask
	ldr	r9,[$inp,#64*5]
	and	r7,r7,$mask
	ldr	r10,[$inp,#64*6]
	and	r8,r8,$mask
	ldr	r11,[$inp,#64*7]
	add	$inp,$inp,#64*8
	and	r9,r9,$mask
	and	r10,r10,$mask
	and	r11,r11,$mask
	stmia	$out!,{r4-r11}	@ Y

	ldr	r4,[$inp,#64*0]
	ldr	r5,[$inp,#64*1]
	ldr	r6,[$inp,#64*2]
	and	r4,r4,$mask
	ldr	r7,[$inp,#64*3]
	and	r5,r5,$mask
	ldr	r8,[$inp,#64*4]
	and	r6,r6,$mask
	ldr	r9,[$inp,#64*5]
	and	r7,r7,$mask
	ldr	r10,[$inp,#64*6]
	and	r8,r8,$mask
	ldr	r11,[$inp,#64*7]
	and	r9,r9,$mask
	and	r10,r10,$mask
	and	r11,r11,$mask
	stmia	$out,{r4-r11}	@ Z

	ldmia	sp!,{r4-r11}
#if __ARM_ARCH__>=5 || defined(__thumb__)
	bx	lr
#else
	mov	pc,lr
#endif
.size	ecp_nistz256_select_w5,.-ecp_nistz256_select_w5

@ void	ecp_nistz256_select_w7(P256_POINT_AFFINE *r0,const void *r1,
@				       int r2);
.globl	ecp_nistz256_select_w7
.type	ecp_nistz256_select_w7,%function
.align	5
ecp_nistz256_select_w7:
	stmdb	sp!,{r4-r7}

	cmp	$index,#0
	mov	$mask,#0
#ifdef __thumb2__
	itt	ne
#endif
	subne	$index,$index,#1
	movne	$mask,#-1
	add	$inp,$inp,$index
	mov	$index,#64/4
	nop
.Loop_select_w7:
	ldrb	r4,[$inp,#64*0]
	subs	$index,$index,#1
	ldrb	r5,[$inp,#64*1]
	ldrb	r6,[$inp,#64*2]
	ldrb	r7,[$inp,#64*3]
	add	$inp,$inp,#64*4
	orr	r4,r4,r5,lsl#8
	orr	r4,r4,r6,lsl#16
	orr	r4,r4,r7,lsl#24
	and	r4,r4,$mask
	str	r4,[$out],#4
	bne	.Loop_select_w7

	ldmia	sp!,{r4-r7}
#if __ARM_ARCH__>=5 || defined(__thumb__)
	bx	lr
#else
	mov	pc,lr
#endif
.size	ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
___
}
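
# Both select routines keep the lookup index out of observable timing: the
# tables are stored word-interleaved (for w5, word w of entry i lives at
# byte offset w*64 + i*4), so every index lands in the same 64-byte blocks,
# and an index of 0 returns all-zero via $mask. Compiled-out scalar model
# of the addressing (hypothetical name, illustrative only):
if (0) {
	sub select_w5_ref {
		my ($index, @tbl) = @_;	# 16 points flattened to 24*16 words
		my $mask = $index ? 0xffffffff : 0;
		$index -= 1 if $index;
		# word w of the selected point sits at flat offset w*16+index
		return map { $tbl[$_*16 + $index] & $mask } (0 .. 23);
	}
}
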
if (0) {
# In comparison to integer-only equivalent of below subroutine:
#
# Cortex-A8	+10%
# Cortex-A9	-10%
# Snapdragon S4	+5%
#
# As not all time is spent in multiplication, overall impact is deemed
# too low to care about.

my ($A0,$A1,$A2,$A3,$Bi,$zero,$temp)=map("d$_",(0..7));
my $mask="q4";
my $mult="q5";
my @AxB=map("q$_",(8..15));

my ($rptr,$aptr,$bptr,$toutptr)=map("r$_",(0..3));

$code.=<<___;
#if __ARM_ARCH__>=7
.fpu	neon

.globl	ecp_nistz256_mul_mont_neon
.type	ecp_nistz256_mul_mont_neon,%function
.align	5
ecp_nistz256_mul_mont_neon:
	mov	ip,sp
	stmdb	sp!,{r4-r9}
	vstmdb	sp!,{q4-q5}		@ ABI specification says so

	sub	$toutptr,sp,#40
	vld1.32	{${Bi}[0]},[$bptr,:32]!
	veor	$zero,$zero,$zero
	vld1.32	{$A0-$A3}, [$aptr]	@ can't specify :32 :-(
	vzip.16	$Bi,$zero
	mov	sp,$toutptr		@ alloca
	vmov.i64 $mask,#0xffff

	vmull.u32 @AxB[0],$Bi,${A0}[0]
	vmull.u32 @AxB[1],$Bi,${A0}[1]
	vmull.u32 @AxB[2],$Bi,${A1}[0]
	vmull.u32 @AxB[3],$Bi,${A1}[1]
	vshr.u64 $temp,@AxB[0]#lo,#16
	vmull.u32 @AxB[4],$Bi,${A2}[0]
	vadd.u64 @AxB[0]#hi,@AxB[0]#hi,$temp
	vmull.u32 @AxB[5],$Bi,${A2}[1]
	vshr.u64 $temp,@AxB[0]#hi,#16	@ upper 32 bits of a[0]*b[0]
	vmull.u32 @AxB[6],$Bi,${A3}[0]
	vand.u64 @AxB[0],@AxB[0],$mask	@ lower 32 bits of a[0]*b[0]
	vmull.u32 @AxB[7],$Bi,${A3}[1]
___
for($i=1;$i<8;$i++) {
$code.=<<___;
	vld1.32	{${Bi}[0]},[$bptr,:32]!
	veor	$zero,$zero,$zero
	vadd.u64 @AxB[1]#lo,@AxB[1]#lo,$temp	@ reduction
	vshl.u64 $mult,@AxB[0],#32
	vadd.u64 @AxB[3],@AxB[3],@AxB[0]
	vsub.u64 $mult,$mult,@AxB[0]
	vzip.16	$Bi,$zero
	vadd.u64 @AxB[6],@AxB[6],@AxB[0]
	vadd.u64 @AxB[7],@AxB[7],$mult
___
push(@AxB,shift(@AxB));
$code.=<<___;
	vmlal.u32 @AxB[0],$Bi,${A0}[0]
	vmlal.u32 @AxB[1],$Bi,${A0}[1]
	vmlal.u32 @AxB[2],$Bi,${A1}[0]
	vmlal.u32 @AxB[3],$Bi,${A1}[1]
	vshr.u64 $temp,@AxB[0]#lo,#16
	vmlal.u32 @AxB[4],$Bi,${A2}[0]
	vadd.u64 @AxB[0]#hi,@AxB[0]#hi,$temp
	vmlal.u32 @AxB[5],$Bi,${A2}[1]
	vshr.u64 $temp,@AxB[0]#hi,#16	@ upper 33 bits of a[0]*b[i]+t[0]
	vmlal.u32 @AxB[6],$Bi,${A3}[0]
	vand.u64 @AxB[0],@AxB[0],$mask	@ lower 32 bits of a[0]*b[0]
	vmull.u32 @AxB[7],$Bi,${A3}[1]
___
}
$code.=<<___;
	vadd.u64 @AxB[1]#lo,@AxB[1]#lo,$temp	@ last reduction
	vshl.u64 $mult,@AxB[0],#32
	vadd.u64 @AxB[3],@AxB[3],@AxB[0]
	vsub.u64 $mult,$mult,@AxB[0]
	vadd.u64 @AxB[6],@AxB[6],@AxB[0]
	vadd.u64 @AxB[7],@AxB[7],$mult

	vshr.u64 $temp,@AxB[1]#lo,#16	@ convert
	vadd.u64 @AxB[1]#hi,@AxB[1]#hi,$temp
	vshr.u64 $temp,@AxB[1]#hi,#16
	vzip.16	@AxB[1]#lo,@AxB[1]#hi
___
foreach (2..7) {
$code.=<<___;
	vadd.u64 @AxB[$_]#lo,@AxB[$_]#lo,$temp
	vst1.32	{@AxB[$_-1]#lo[0]},[$toutptr,:32]!
	vshr.u64 $temp,@AxB[$_]#lo,#16
	vadd.u64 @AxB[$_]#hi,@AxB[$_]#hi,$temp
	vshr.u64 $temp,@AxB[$_]#hi,#16
	vzip.16	@AxB[$_]#lo,@AxB[$_]#hi
___
}
$code.=<<___;
	vst1.32	{@AxB[7]#lo[0]},[$toutptr,:32]!
	vst1.32	{$temp},[$toutptr]	@ upper 33 bits

	ldr	r1,[sp,#0]
	ldr	r2,[sp,#4]
	ldr	r3,[sp,#8]
	subs	r1,r1,#-1
	ldr	r4,[sp,#12]
	sbcs	r2,r2,#-1
	ldr	r5,[sp,#16]
	sbcs	r3,r3,#-1
	ldr	r6,[sp,#20]
	sbcs	r4,r4,#0
	ldr	r7,[sp,#24]
	sbcs	r5,r5,#0
	ldr	r8,[sp,#28]
	sbcs	r6,r6,#0
	ldr	r9,[sp,#32]		@ top-most bit
	sbcs	r7,r7,#1
	sub	sp,ip,#40+16
	sbcs	r8,r8,#-1
	sbc	r9,r9,#0
	vldmia	sp!,{q4-q5}

	adds	r1,r1,r9
	adcs	r2,r2,r9
	str	r1,[$rptr,#0]
	adcs	r3,r3,r9
	str	r2,[$rptr,#4]
	adcs	r4,r4,#0
	str	r3,[$rptr,#8]
	adcs	r5,r5,#0
	str	r4,[$rptr,#12]
	adcs	r6,r6,#0
	str	r5,[$rptr,#16]
	adcs	r7,r7,r9,lsr#31
	str	r6,[$rptr,#20]
	adcs	r8,r8,r9
	str	r7,[$rptr,#24]
	str	r8,[$rptr,#28]

	ldmia	sp!,{r4-r9}
	bx	lr
.size	ecp_nistz256_mul_mont_neon,.-ecp_nistz256_mul_mont_neon
#endif
___
}

{{{
########################################################################
# Below $aN assignment matches order in which 256-bit result appears in
# register bank at return from __ecp_nistz256_mul_mont, so that we can
# skip over reloading it from memory. This means that below functions
# use custom calling sequence accepting 256-bit input in registers,
# output pointer in r0, $r_ptr, and optional pointer in r2, $b_ptr.
#
# See their "normal" counterparts for insights on calculations.

my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7,
    $t0,$t1,$t2,$t3)=map("r$_",(11,3..10,12,14,1));
my $ff=$b_ptr;

$code.=<<___;
.type	__ecp_nistz256_sub_from,%function
.align	5
__ecp_nistz256_sub_from:
	str	lr,[sp,#-4]!		@ push lr

	ldr	$t0,[$b_ptr,#0]
	ldr	$t1,[$b_ptr,#4]
	ldr	$t2,[$b_ptr,#8]
	ldr	$t3,[$b_ptr,#12]
	subs	$a0,$a0,$t0
	ldr	$t0,[$b_ptr,#16]
	sbcs	$a1,$a1,$t1
	ldr	$t1,[$b_ptr,#20]
	sbcs	$a2,$a2,$t2
	ldr	$t2,[$b_ptr,#24]
	sbcs	$a3,$a3,$t3
	ldr	$t3,[$b_ptr,#28]
	sbcs	$a4,$a4,$t0
	sbcs	$a5,$a5,$t1
	sbcs	$a6,$a6,$t2
	sbcs	$a7,$a7,$t3
	sbc	$ff,$ff,$ff		@ broadcast borrow bit
	ldr	lr,[sp],#4		@ pop lr

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	str	$a0,[$r_ptr,#0]
	adcs	$a2,$a2,$ff
	str	$a1,[$r_ptr,#4]
	adcs	$a3,$a3,#0
	str	$a2,[$r_ptr,#8]
	adcs	$a4,$a4,#0
	str	$a3,[$r_ptr,#12]
	adcs	$a5,$a5,#0
	str	$a4,[$r_ptr,#16]
	adcs	$a6,$a6,$ff,lsr#31
	str	$a5,[$r_ptr,#20]
	adcs	$a7,$a7,$ff
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from

.type	__ecp_nistz256_sub_morf,%function
.align	5
__ecp_nistz256_sub_morf:
	str	lr,[sp,#-4]!		@ push lr

	ldr	$t0,[$b_ptr,#0]
	ldr	$t1,[$b_ptr,#4]
	ldr	$t2,[$b_ptr,#8]
	ldr	$t3,[$b_ptr,#12]
	subs	$a0,$t0,$a0
	ldr	$t0,[$b_ptr,#16]
	sbcs	$a1,$t1,$a1
	ldr	$t1,[$b_ptr,#20]
	sbcs	$a2,$t2,$a2
	ldr	$t2,[$b_ptr,#24]
	sbcs	$a3,$t3,$a3
	ldr	$t3,[$b_ptr,#28]
	sbcs	$a4,$t0,$a4
	sbcs	$a5,$t1,$a5
	sbcs	$a6,$t2,$a6
	sbcs	$a7,$t3,$a7
	sbc	$ff,$ff,$ff		@ broadcast borrow bit
	ldr	lr,[sp],#4		@ pop lr

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	str	$a0,[$r_ptr,#0]
	adcs	$a2,$a2,$ff
	str	$a1,[$r_ptr,#4]
	adcs	$a3,$a3,#0
	str	$a2,[$r_ptr,#8]
	adcs	$a4,$a4,#0
	str	$a3,[$r_ptr,#12]
	adcs	$a5,$a5,#0
	str	$a4,[$r_ptr,#16]
	adcs	$a6,$a6,$ff,lsr#31
	str	$a5,[$r_ptr,#20]
	adcs	$a7,$a7,$ff
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf

.type	__ecp_nistz256_add_self,%function
.align	4
__ecp_nistz256_add_self:
	adds	$a0,$a0,$a0		@ a[0:7]+=a[0:7]
	adcs	$a1,$a1,$a1
	adcs	$a2,$a2,$a2
	adcs	$a3,$a3,$a3
	adcs	$a4,$a4,$a4
	adcs	$a5,$a5,$a5
	adcs	$a6,$a6,$a6
	mov	$ff,#0
	adcs	$a7,$a7,$a7
	adc	$ff,$ff,#0

	@ if a+b >= modulus, subtract modulus.
	@
	@ But since comparison implies subtraction, we subtract
	@ modulus and then add it back if subtraction borrowed.

	subs	$a0,$a0,#-1
	sbcs	$a1,$a1,#-1
	sbcs	$a2,$a2,#-1
	sbcs	$a3,$a3,#0
	sbcs	$a4,$a4,#0
	sbcs	$a5,$a5,#0
	sbcs	$a6,$a6,#1
	sbcs	$a7,$a7,#-1
	sbc	$ff,$ff,#0

	@ Note that because mod has special form, i.e. consists of
	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	@ using value of borrow as a whole or extracting single bit.
	@ Follow $ff register...

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	str	$a0,[$r_ptr,#0]
	adcs	$a2,$a2,$ff
	str	$a1,[$r_ptr,#4]
	adcs	$a3,$a3,#0
	str	$a2,[$r_ptr,#8]
	adcs	$a4,$a4,#0
	str	$a3,[$r_ptr,#12]
	adcs	$a5,$a5,#0
	str	$a4,[$r_ptr,#16]
	adcs	$a6,$a6,$ff,lsr#31
	str	$a5,[$r_ptr,#20]
	adcs	$a7,$a7,$ff
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_add_self,.-__ecp_nistz256_add_self

___

########################################################################
# following subroutines are "literal" implementation of those found in
# ecp_nistz256.c
#
########################################################################
# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
#
{
my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));
# above map() describes stack layout with 5 temporary
# 256-bit vectors on top. Then note that we push
# starting from r0, which means that we have copy of
# input arguments just below these temporary vectors.

$code.=<<___;
.globl	ecp_nistz256_point_double
.type	ecp_nistz256_point_double,%function
.align	5
ecp_nistz256_point_double:
	stmdb	sp!,{r0-r12,lr}		@ push from r0, unusual, but intentional
	sub	sp,sp,#32*5

.Lpoint_double_shortcut:
	add	r3,sp,#$in_x
	ldmia	$a_ptr!,{r4-r11}	@ copy in_x
	stmia	r3,{r4-r11}

	add	$r_ptr,sp,#$S
	bl	__ecp_nistz256_mul_by_2	@ p256_mul_by_2(S, in_y);

	add	$b_ptr,$a_ptr,#32
	add	$a_ptr,$a_ptr,#32
	add	$r_ptr,sp,#$Zsqr
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Zsqr, in_z);

	add	$a_ptr,sp,#$S
	add	$b_ptr,sp,#$S
	add	$r_ptr,sp,#$S
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(S, S);

	ldr	$b_ptr,[sp,#32*5+4]
	add	$a_ptr,$b_ptr,#32
	add	$b_ptr,$b_ptr,#64
	add	$r_ptr,sp,#$tmp0
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(tmp0, in_z, in_y);

	ldr	$r_ptr,[sp,#32*5]
	add	$r_ptr,$r_ptr,#64
	bl	__ecp_nistz256_add_self	@ p256_mul_by_2(res_z, tmp0);

	add	$a_ptr,sp,#$in_x
	add	$b_ptr,sp,#$Zsqr
	add	$r_ptr,sp,#$M
	bl	__ecp_nistz256_add	@ p256_add(M, in_x, Zsqr);

	add	$a_ptr,sp,#$in_x
	add	$b_ptr,sp,#$Zsqr
	add	$r_ptr,sp,#$Zsqr
	bl	__ecp_nistz256_sub	@ p256_sub(Zsqr, in_x, Zsqr);

	add	$a_ptr,sp,#$S
	add	$b_ptr,sp,#$S
	add	$r_ptr,sp,#$tmp0
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(tmp0, S);

	add	$a_ptr,sp,#$Zsqr
	add	$b_ptr,sp,#$M
	add	$r_ptr,sp,#$M
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(M, M, Zsqr);

	ldr	$r_ptr,[sp,#32*5]
	add	$a_ptr,sp,#$tmp0
	add	$r_ptr,$r_ptr,#32
	bl	__ecp_nistz256_div_by_2	@ p256_div_by_2(res_y, tmp0);

	add	$a_ptr,sp,#$M
	add	$r_ptr,sp,#$M
	bl	__ecp_nistz256_mul_by_3	@ p256_mul_by_3(M, M);

	add	$a_ptr,sp,#$in_x
	add	$b_ptr,sp,#$S
	add	$r_ptr,sp,#$S
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S, S, in_x);

	add	$r_ptr,sp,#$tmp0
	bl	__ecp_nistz256_add_self	@ p256_mul_by_2(tmp0, S);

	ldr	$r_ptr,[sp,#32*5]
	add	$a_ptr,sp,#$M
	add	$b_ptr,sp,#$M
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(res_x, M);

	add	$b_ptr,sp,#$tmp0
	bl	__ecp_nistz256_sub_from	@ p256_sub(res_x, res_x, tmp0);

	add	$b_ptr,sp,#$S
	add	$r_ptr,sp,#$S
	bl	__ecp_nistz256_sub_morf	@ p256_sub(S, S, res_x);

	add	$a_ptr,sp,#$M
	add	$b_ptr,sp,#$S
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S, S, M);

	ldr	$r_ptr,[sp,#32*5]
	add	$b_ptr,$r_ptr,#32
	add	$r_ptr,$r_ptr,#32
	bl	__ecp_nistz256_sub_from	@ p256_sub(res_y, S, res_y);

	add	sp,sp,#32*5+16		@ +16 means "skip even over saved r0-r3"
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_point_double,.-ecp_nistz256_point_double
___
}
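
# For reference, the call sequence above computes the standard a = -3
# Jacobian doubling from ecp_nistz256.c:
#
#	M  = 3*(X - Z^2)*(X + Z^2)
#	X' = M^2 - 8*X*Y^2
#	Y' = M*(4*X*Y^2 - X') - 8*Y^4
#	Z' = 2*Y*Z
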
########################################################################
# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
#			      const P256_POINT *in2);
{
my ($res_x,$res_y,$res_z,
    $in1_x,$in1_y,$in1_z,
    $in2_x,$in2_y,$in2_z,
    $H,$Hsqr,$R,$Rsqr,$Hcub,
    $U1,$U2,$S1,$S2)=map(32*$_,(0..17));
my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
# above map() describes stack layout with 18 temporary
# 256-bit vectors on top. Then note that we push
# starting from r0, which means that we have copy of
# input arguments just below these temporary vectors.
# We use three of them for !in1infty, !in2infty and
# result of check for zero.

$code.=<<___;
.globl	ecp_nistz256_point_add
.type	ecp_nistz256_point_add,%function
.align	5
ecp_nistz256_point_add:
	stmdb	sp!,{r0-r12,lr}		@ push from r0, unusual, but intentional
	sub	sp,sp,#32*18+16

	ldmia	$b_ptr!,{r4-r11}	@ copy in2_x
	add	r3,sp,#$in2_x
	stmia	r3!,{r4-r11}
	ldmia	$b_ptr!,{r4-r11}	@ copy in2_y
	stmia	r3!,{r4-r11}
	ldmia	$b_ptr,{r4-r11}		@ copy in2_z
	orr	r12,r4,r5
	orr	r12,r12,r6
	orr	r12,r12,r7
	orr	r12,r12,r8
	orr	r12,r12,r9
	orr	r12,r12,r10
	orr	r12,r12,r11
	cmp	r12,#0
#ifdef __thumb2__
	it	ne
#endif
	movne	r12,#-1
	stmia	r3,{r4-r11}
	str	r12,[sp,#32*18+8]	@ !in2infty

	ldmia	$a_ptr!,{r4-r11}	@ copy in1_x
	add	r3,sp,#$in1_x
	stmia	r3!,{r4-r11}
	ldmia	$a_ptr!,{r4-r11}	@ copy in1_y
	stmia	r3!,{r4-r11}
	ldmia	$a_ptr,{r4-r11}		@ copy in1_z
	orr	r12,r4,r5
	orr	r12,r12,r6
	orr	r12,r12,r7
	orr	r12,r12,r8
	orr	r12,r12,r9
	orr	r12,r12,r10
	orr	r12,r12,r11
	cmp	r12,#0
#ifdef __thumb2__
	it	ne
#endif
	movne	r12,#-1
	stmia	r3,{r4-r11}
	str	r12,[sp,#32*18+4]	@ !in1infty

	add	$a_ptr,sp,#$in2_z
	add	$b_ptr,sp,#$in2_z
	add	$r_ptr,sp,#$Z2sqr
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Z2sqr, in2_z);

	add	$a_ptr,sp,#$in1_z
	add	$b_ptr,sp,#$in1_z
	add	$r_ptr,sp,#$Z1sqr
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Z1sqr, in1_z);

	add	$a_ptr,sp,#$in2_z
	add	$b_ptr,sp,#$Z2sqr
	add	$r_ptr,sp,#$S1
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S1, Z2sqr, in2_z);

	add	$a_ptr,sp,#$in1_z
	add	$b_ptr,sp,#$Z1sqr
	add	$r_ptr,sp,#$S2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, Z1sqr, in1_z);

	add	$a_ptr,sp,#$in1_y
	add	$b_ptr,sp,#$S1
	add	$r_ptr,sp,#$S1
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S1, S1, in1_y);

	add	$a_ptr,sp,#$in2_y
	add	$b_ptr,sp,#$S2
	add	$r_ptr,sp,#$S2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, S2, in2_y);

	add	$b_ptr,sp,#$S1
	add	$r_ptr,sp,#$R
	bl	__ecp_nistz256_sub_from	@ p256_sub(R, S2, S1);

	orr	$a0,$a0,$a1		@ see if result is zero
	orr	$a2,$a2,$a3
	orr	$a4,$a4,$a5
	orr	$a0,$a0,$a2
	orr	$a4,$a4,$a6
	orr	$a0,$a0,$a7
	add	$a_ptr,sp,#$in1_x
	orr	$a0,$a0,$a4
	add	$b_ptr,sp,#$Z2sqr
	str	$a0,[sp,#32*18+12]

	add	$r_ptr,sp,#$U1
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(U1, in1_x, Z2sqr);

	add	$a_ptr,sp,#$in2_x
	add	$b_ptr,sp,#$Z1sqr
	add	$r_ptr,sp,#$U2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(U2, in2_x, Z1sqr);

	add	$b_ptr,sp,#$U1
	add	$r_ptr,sp,#$H
	bl	__ecp_nistz256_sub_from	@ p256_sub(H, U2, U1);

	orr	$a0,$a0,$a1		@ see if result is zero
	orr	$a2,$a2,$a3
	orr	$a4,$a4,$a5
	orr	$a0,$a0,$a2
	orr	$a4,$a4,$a6
	orr	$a0,$a0,$a7
	orrs	$a0,$a0,$a4

	bne	.Ladd_proceed		@ is_equal(U1,U2)?

	ldr	$t0,[sp,#32*18+4]
	ldr	$t1,[sp,#32*18+8]
	ldr	$t2,[sp,#32*18+12]
	tst	$t0,$t1
	beq	.Ladd_proceed		@ (in1infty || in2infty)?
	tst	$t2,$t2
	beq	.Ladd_double		@ is_equal(S1,S2)?

	ldr	$r_ptr,[sp,#32*18+16]
	eor	r4,r4,r4
	eor	r5,r5,r5
	eor	r6,r6,r6
	eor	r7,r7,r7
	eor	r8,r8,r8
	eor	r9,r9,r9
	eor	r10,r10,r10
	eor	r11,r11,r11
	stmia	$r_ptr!,{r4-r11}
	stmia	$r_ptr!,{r4-r11}
	stmia	$r_ptr!,{r4-r11}
	b	.Ladd_done

.align	4
.Ladd_double:
	ldr	$a_ptr,[sp,#32*18+20]
	add	sp,sp,#32*(18-5)+16	@ difference in frame sizes
	b	.Lpoint_double_shortcut

.align	4
.Ladd_proceed:
	add	$a_ptr,sp,#$R
	add	$b_ptr,sp,#$R
	add	$r_ptr,sp,#$Rsqr
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Rsqr, R);

	add	$a_ptr,sp,#$H
	add	$b_ptr,sp,#$in1_z
	add	$r_ptr,sp,#$res_z
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(res_z, H, in1_z);

	add	$a_ptr,sp,#$H
	add	$b_ptr,sp,#$H
	add	$r_ptr,sp,#$Hsqr
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Hsqr, H);

	add	$a_ptr,sp,#$in2_z
	add	$b_ptr,sp,#$res_z
	add	$r_ptr,sp,#$res_z
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(res_z, res_z, in2_z);

	add	$a_ptr,sp,#$H
	add	$b_ptr,sp,#$Hsqr
	add	$r_ptr,sp,#$Hcub
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(Hcub, Hsqr, H);

	add	$a_ptr,sp,#$Hsqr
	add	$b_ptr,sp,#$U1
	add	$r_ptr,sp,#$U2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(U2, U1, Hsqr);

	add	$r_ptr,sp,#$Hsqr
	bl	__ecp_nistz256_add_self	@ p256_mul_by_2(Hsqr, U2);

	add	$b_ptr,sp,#$Rsqr
	add	$r_ptr,sp,#$res_x
	bl	__ecp_nistz256_sub_morf	@ p256_sub(res_x, Rsqr, Hsqr);

	add	$b_ptr,sp,#$Hcub
	bl	__ecp_nistz256_sub_from	@ p256_sub(res_x, res_x, Hcub);

	add	$b_ptr,sp,#$U2
	add	$r_ptr,sp,#$res_y
	bl	__ecp_nistz256_sub_morf	@ p256_sub(res_y, U2, res_x);

	add	$a_ptr,sp,#$Hcub
	add	$b_ptr,sp,#$S1
	add	$r_ptr,sp,#$S2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, S1, Hcub);

	add	$a_ptr,sp,#$R
	add	$b_ptr,sp,#$res_y
	add	$r_ptr,sp,#$res_y
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(res_y, res_y, R);

	add	$b_ptr,sp,#$S2
	bl	__ecp_nistz256_sub_from	@ p256_sub(res_y, res_y, S2);

	ldr	r11,[sp,#32*18+4]	@ !in1infty
	ldr	r12,[sp,#32*18+8]	@ !in2infty
	add	r1,sp,#$res_x
	add	r2,sp,#$in2_x
	and	r10,r11,r12
	mvn	r11,r11
	add	r3,sp,#$in1_x
	and	r11,r11,r12
	mvn	r12,r12
	ldr	$r_ptr,[sp,#32*18+16]
___
for($i=0;$i<96;$i+=8) {			# conditional moves
$code.=<<___;
	ldmia	r1!,{r4-r5}		@ res_x
	ldmia	r2!,{r6-r7}		@ in2_x
	ldmia	r3!,{r8-r9}		@ in1_x
	and	r4,r4,r10
	and	r5,r5,r10
	and	r6,r6,r11
	and	r7,r7,r11
	and	r8,r8,r12
	and	r9,r9,r12
	orr	r4,r4,r6
	orr	r5,r5,r7
	orr	r4,r4,r8
	orr	r5,r5,r9
	stmia	$r_ptr!,{r4-r5}
___
}
$code.=<<___;
.Ladd_done:
	add	sp,sp,#32*18+16+16	@ +16 means "skip even over saved r0-r3"
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_point_add,.-ecp_nistz256_point_add
___
}
| 1513 | |||
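The conditional-move loop above merges three candidate results without branching: r11 and r12 start out as the saved !in1infty/!in2infty masks (all-ones for a finite input, zero for the point at infinity), and the three derived masks pick the computed sum, in2, or in1 respectively. A minimal C sketch of the same word-by-word selection, with hypothetical names (this is not the library's API):

```c
#include <stddef.h>
#include <stdint.h>

/* Branch-free selection among the computed result and the two inputs,
 * mirroring the mask setup above: r10 = r11 & r12 picks res, the new
 * r11 = ~r11 & r12 picks in2, and the new r12 = ~r12 picks in1. */
static void select_result(uint32_t *out, const uint32_t *res,
    const uint32_t *in1, const uint32_t *in2,
    uint32_t in1ok, uint32_t in2ok, size_t nwords)
{
	uint32_t m_res = in1ok & in2ok;   /* both finite: computed sum */
	uint32_t m_in2 = ~in1ok & in2ok;  /* in1 at infinity: take in2 */
	uint32_t m_in1 = ~in2ok;          /* in2 at infinity: take in1 */

	for (size_t i = 0; i < nwords; i++)
		out[i] = (res[i] & m_res) | (in2[i] & m_in2) | (in1[i] & m_in1);
}
```

Here nwords would be 24, the 96 bytes the $i<96 loop copies (X, Y and Z).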
| 1514 | ######################################################################## | ||
| 1515 | # void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1, | ||
| 1516 | # const P256_POINT_AFFINE *in2); | ||
| 1517 | { | ||
| 1518 | my ($res_x,$res_y,$res_z, | ||
| 1519 | $in1_x,$in1_y,$in1_z, | ||
| 1520 | $in2_x,$in2_y, | ||
| 1521 | $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..14)); | ||
| 1522 | my $Z1sqr = $S2; | ||
| 1523 | # above map() describes stack layout with 15 temporary | ||
| 1524 | # 256-bit vectors on top. Then note that we push | ||
| 1525 | # starting from r0, which means that we have a copy of | ||
| 1526 | # the input arguments just below these temporary vectors. | ||
| 1527 | # We use two of them for !in1infty, !in2infty. | ||
| 1528 | |||
| 1529 | my @ONE_mont=(1,0,0,-1,-1,-1,-2,0); | ||
| 1530 | |||
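Here @ONE_mont is the number 1 in the Montgomery domain. Since p256 < 2^256 < 2*p256,

```latex
2^{256} \bmod p_{256} \;=\; 2^{256} - p_{256} \;=\; 2^{224} - 2^{192} - 2^{96} + 1,
\qquad p_{256} \;=\; 2^{256} - 2^{224} + 2^{192} + 2^{96} - 1,
```

which, written as little-endian 32-bit words with the borrows folded in, is (1, 0, 0, 0xffffffff, 0xffffffff, 0xffffffff, 0xfffffffe, 0), i.e. the (1,0,0,-1,-1,-1,-2,0) above. The second conditional-move loop at the end of this function substitutes these words for the in2_z coordinate that the affine representation does not store.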
| 1531 | $code.=<<___; | ||
| 1532 | .globl ecp_nistz256_point_add_affine | ||
| 1533 | .type ecp_nistz256_point_add_affine,%function | ||
| 1534 | .align 5 | ||
| 1535 | ecp_nistz256_point_add_affine: | ||
| 1536 | stmdb sp!,{r0-r12,lr} @ push from r0, unusual, but intentional | ||
| 1537 | sub sp,sp,#32*15 | ||
| 1538 | |||
| 1539 | ldmia $a_ptr!,{r4-r11} @ copy in1_x | ||
| 1540 | add r3,sp,#$in1_x | ||
| 1541 | stmia r3!,{r4-r11} | ||
| 1542 | ldmia $a_ptr!,{r4-r11} @ copy in1_y | ||
| 1543 | stmia r3!,{r4-r11} | ||
| 1544 | ldmia $a_ptr,{r4-r11} @ copy in1_z | ||
| 1545 | orr r12,r4,r5 | ||
| 1546 | orr r12,r12,r6 | ||
| 1547 | orr r12,r12,r7 | ||
| 1548 | orr r12,r12,r8 | ||
| 1549 | orr r12,r12,r9 | ||
| 1550 | orr r12,r12,r10 | ||
| 1551 | orr r12,r12,r11 | ||
| 1552 | cmp r12,#0 | ||
| 1553 | #ifdef __thumb2__ | ||
| 1554 | it ne | ||
| 1555 | #endif | ||
| 1556 | movne r12,#-1 | ||
| 1557 | stmia r3,{r4-r11} | ||
| 1558 | str r12,[sp,#32*15+4] @ !in1infty | ||
| 1559 | |||
| 1560 | ldmia $b_ptr!,{r4-r11} @ copy in2_x | ||
| 1561 | add r3,sp,#$in2_x | ||
| 1562 | orr r12,r4,r5 | ||
| 1563 | orr r12,r12,r6 | ||
| 1564 | orr r12,r12,r7 | ||
| 1565 | orr r12,r12,r8 | ||
| 1566 | orr r12,r12,r9 | ||
| 1567 | orr r12,r12,r10 | ||
| 1568 | orr r12,r12,r11 | ||
| 1569 | stmia r3!,{r4-r11} | ||
| 1570 | ldmia $b_ptr!,{r4-r11} @ copy in2_y | ||
| 1571 | orr r12,r12,r4 | ||
| 1572 | orr r12,r12,r5 | ||
| 1573 | orr r12,r12,r6 | ||
| 1574 | orr r12,r12,r7 | ||
| 1575 | orr r12,r12,r8 | ||
| 1576 | orr r12,r12,r9 | ||
| 1577 | orr r12,r12,r10 | ||
| 1578 | orr r12,r12,r11 | ||
| 1579 | stmia r3!,{r4-r11} | ||
| 1580 | cmp r12,#0 | ||
| 1581 | #ifdef __thumb2__ | ||
| 1582 | it ne | ||
| 1583 | #endif | ||
| 1584 | movne r12,#-1 | ||
| 1585 | str r12,[sp,#32*15+8] @ !in2infty | ||
| 1586 | |||
| 1587 | add $a_ptr,sp,#$in1_z | ||
| 1588 | add $b_ptr,sp,#$in1_z | ||
| 1589 | add $r_ptr,sp,#$Z1sqr | ||
| 1590 | bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Z1sqr, in1_z); | ||
| 1591 | |||
| 1592 | add $a_ptr,sp,#$Z1sqr | ||
| 1593 | add $b_ptr,sp,#$in2_x | ||
| 1594 | add $r_ptr,sp,#$U2 | ||
| 1595 | bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, Z1sqr, in2_x); | ||
| 1596 | |||
| 1597 | add $b_ptr,sp,#$in1_x | ||
| 1598 | add $r_ptr,sp,#$H | ||
| 1599 | bl __ecp_nistz256_sub_from @ p256_sub(H, U2, in1_x); | ||
| 1600 | |||
| 1601 | add $a_ptr,sp,#$Z1sqr | ||
| 1602 | add $b_ptr,sp,#$in1_z | ||
| 1603 | add $r_ptr,sp,#$S2 | ||
| 1604 | bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, Z1sqr, in1_z); | ||
| 1605 | |||
| 1606 | add $a_ptr,sp,#$H | ||
| 1607 | add $b_ptr,sp,#$in1_z | ||
| 1608 | add $r_ptr,sp,#$res_z | ||
| 1609 | bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_z, H, in1_z); | ||
| 1610 | |||
| 1611 | add $a_ptr,sp,#$in2_y | ||
| 1612 | add $b_ptr,sp,#$S2 | ||
| 1613 | add $r_ptr,sp,#$S2 | ||
| 1614 | bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, S2, in2_y); | ||
| 1615 | |||
| 1616 | add $b_ptr,sp,#$in1_y | ||
| 1617 | add $r_ptr,sp,#$R | ||
| 1618 | bl __ecp_nistz256_sub_from @ p256_sub(R, S2, in1_y); | ||
| 1619 | |||
| 1620 | add $a_ptr,sp,#$H | ||
| 1621 | add $b_ptr,sp,#$H | ||
| 1622 | add $r_ptr,sp,#$Hsqr | ||
| 1623 | bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Hsqr, H); | ||
| 1624 | |||
| 1625 | add $a_ptr,sp,#$R | ||
| 1626 | add $b_ptr,sp,#$R | ||
| 1627 | add $r_ptr,sp,#$Rsqr | ||
| 1628 | bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Rsqr, R); | ||
| 1629 | |||
| 1630 | add $a_ptr,sp,#$H | ||
| 1631 | add $b_ptr,sp,#$Hsqr | ||
| 1632 | add $r_ptr,sp,#$Hcub | ||
| 1633 | bl __ecp_nistz256_mul_mont @ p256_mul_mont(Hcub, Hsqr, H); | ||
| 1634 | |||
| 1635 | add $a_ptr,sp,#$Hsqr | ||
| 1636 | add $b_ptr,sp,#$in1_x | ||
| 1637 | add $r_ptr,sp,#$U2 | ||
| 1638 | bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, in1_x, Hsqr); | ||
| 1639 | |||
| 1640 | add $r_ptr,sp,#$Hsqr | ||
| 1641 | bl __ecp_nistz256_add_self @ p256_mul_by_2(Hsqr, U2); | ||
| 1642 | |||
| 1643 | add $b_ptr,sp,#$Rsqr | ||
| 1644 | add $r_ptr,sp,#$res_x | ||
| 1645 | bl __ecp_nistz256_sub_morf @ p256_sub(res_x, Rsqr, Hsqr); | ||
| 1646 | |||
| 1647 | add $b_ptr,sp,#$Hcub | ||
| 1648 | bl __ecp_nistz256_sub_from @ p256_sub(res_x, res_x, Hcub); | ||
| 1649 | |||
| 1650 | add $b_ptr,sp,#$U2 | ||
| 1651 | add $r_ptr,sp,#$res_y | ||
| 1652 | bl __ecp_nistz256_sub_morf @ p256_sub(res_y, U2, res_x); | ||
| 1653 | |||
| 1654 | add $a_ptr,sp,#$Hcub | ||
| 1655 | add $b_ptr,sp,#$in1_y | ||
| 1656 | add $r_ptr,sp,#$S2 | ||
| 1657 | bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, in1_y, Hcub); | ||
| 1658 | |||
| 1659 | add $a_ptr,sp,#$R | ||
| 1660 | add $b_ptr,sp,#$res_y | ||
| 1661 | add $r_ptr,sp,#$res_y | ||
| 1662 | bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_y, res_y, R); | ||
| 1663 | |||
| 1664 | add $b_ptr,sp,#$S2 | ||
| 1665 | bl __ecp_nistz256_sub_from @ p256_sub(res_y, res_y, S2); | ||
| 1666 | |||
| 1667 | ldr r11,[sp,#32*15+4] @ !in1infty | ||
| 1668 | ldr r12,[sp,#32*15+8] @ !in2infty | ||
| 1669 | add r1,sp,#$res_x | ||
| 1670 | add r2,sp,#$in2_x | ||
| 1671 | and r10,r11,r12 | ||
| 1672 | mvn r11,r11 | ||
| 1673 | add r3,sp,#$in1_x | ||
| 1674 | and r11,r11,r12 | ||
| 1675 | mvn r12,r12 | ||
| 1676 | ldr $r_ptr,[sp,#32*15] | ||
| 1677 | ___ | ||
| 1678 | for($i=0;$i<64;$i+=8) { # conditional moves | ||
| 1679 | $code.=<<___; | ||
| 1680 | ldmia r1!,{r4-r5} @ res_x | ||
| 1681 | ldmia r2!,{r6-r7} @ in2_x | ||
| 1682 | ldmia r3!,{r8-r9} @ in1_x | ||
| 1683 | and r4,r4,r10 | ||
| 1684 | and r5,r5,r10 | ||
| 1685 | and r6,r6,r11 | ||
| 1686 | and r7,r7,r11 | ||
| 1687 | and r8,r8,r12 | ||
| 1688 | and r9,r9,r12 | ||
| 1689 | orr r4,r4,r6 | ||
| 1690 | orr r5,r5,r7 | ||
| 1691 | orr r4,r4,r8 | ||
| 1692 | orr r5,r5,r9 | ||
| 1693 | stmia $r_ptr!,{r4-r5} | ||
| 1694 | ___ | ||
| 1695 | } | ||
| 1696 | for(;$i<96;$i+=8) { | ||
| 1697 | my $j=($i-64)/4; | ||
| 1698 | $code.=<<___; | ||
| 1699 | ldmia r1!,{r4-r5} @ res_z | ||
| 1700 | ldmia r3!,{r8-r9} @ in1_z | ||
| 1701 | and r4,r4,r10 | ||
| 1702 | and r5,r5,r10 | ||
| 1703 | and r6,r11,#@ONE_mont[$j] | ||
| 1704 | and r7,r11,#@ONE_mont[$j+1] | ||
| 1705 | and r8,r8,r12 | ||
| 1706 | and r9,r9,r12 | ||
| 1707 | orr r4,r4,r6 | ||
| 1708 | orr r5,r5,r7 | ||
| 1709 | orr r4,r4,r8 | ||
| 1710 | orr r5,r5,r9 | ||
| 1711 | stmia $r_ptr!,{r4-r5} | ||
| 1712 | ___ | ||
| 1713 | } | ||
| 1714 | $code.=<<___; | ||
| 1715 | add sp,sp,#32*15+16 @ +16 means "skip even over saved r0-r3" | ||
| 1716 | #if __ARM_ARCH__>=5 || !defined(__thumb__) | ||
| 1717 | ldmia sp!,{r4-r12,pc} | ||
| 1718 | #else | ||
| 1719 | ldmia sp!,{r4-r12,lr} | ||
| 1720 | bx lr @ interoperable with Thumb ISA:-) | ||
| 1721 | #endif | ||
| 1722 | .size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine | ||
| 1723 | ___ | ||
| 1724 | } }}} | ||
| 1725 | |||
| 1726 | foreach (split("\n",$code)) { | ||
| 1727 | s/\`([^\`]*)\`/eval $1/geo; | ||
| 1728 | |||
| 1729 | s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo; | ||
| 1730 | |||
| 1731 | print $_,"\n"; | ||
| 1732 | } | ||
| 1733 | close STDOUT; # enforce flush | ||
diff --git a/src/lib/libcrypto/ec/asm/ecp_nistz256-sparcv9.pl b/src/lib/libcrypto/ec/asm/ecp_nistz256-sparcv9.pl deleted file mode 100644 index 49460fefdc..0000000000 --- a/src/lib/libcrypto/ec/asm/ecp_nistz256-sparcv9.pl +++ /dev/null | |||
| @@ -1,2890 +0,0 @@ | |||
| 1 | #! /usr/bin/env perl | ||
| 2 | # $OpenBSD: ecp_nistz256-sparcv9.pl,v 1.2 2022/12/26 07:18:51 jmc Exp $ | ||
| 3 | # | ||
| 4 | # Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved. | ||
| 5 | # | ||
| 6 | # Licensed under the OpenSSL license (the "License"). You may not use | ||
| 7 | # this file except in compliance with the License. You can obtain a copy | ||
| 8 | # in the file LICENSE in the source distribution or at | ||
| 9 | # https://www.openssl.org/source/license.html | ||
| 10 | |||
| 11 | |||
| 12 | # ==================================================================== | ||
| 13 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
| 14 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 15 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 16 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 17 | # ==================================================================== | ||
| 18 | # | ||
| 19 | # ECP_NISTZ256 module for SPARCv9. | ||
| 20 | # | ||
| 21 | # February 2015. | ||
| 22 | # | ||
| 23 | # Original ECP_NISTZ256 submission targeting x86_64 is detailed in | ||
| 24 | # http://eprint.iacr.org/2013/816. In the process of adaptation | ||
| 25 | # original .c module was made 32-bit savvy in order to make this | ||
| 26 | # implementation possible. | ||
| 27 | # | ||
| 28 | # with/without -DECP_NISTZ256_ASM | ||
| 29 | # UltraSPARC III +12-18% | ||
| 30 | # SPARC T4 +99-550% (+66-150% on 32-bit Solaris) | ||
| 31 | # | ||
| 32 | # Ranges denote minimum and maximum improvement coefficients depending | ||
| 33 | # on benchmark. Lower coefficients are for ECDSA sign, server-side | ||
| 34 | # operation. Keep in mind that +200% means 3x improvement. | ||
| 35 | |||
| 36 | # Uncomment when all sparcv9 assembly generators are updated to take the output | ||
| 37 | # file as last argument... | ||
| 38 | # $output = pop; | ||
| 39 | # open STDOUT,">$output"; | ||
| 40 | |||
| 41 | $code.=<<___; | ||
| 42 | #define STACK_FRAME 192 | ||
| 43 | #define STACK_BIAS 2047 | ||
| 44 | |||
| 45 | #define LOCALS (STACK_BIAS+STACK_FRAME) | ||
| 46 | .register %g2,#scratch | ||
| 47 | .register %g3,#scratch | ||
| 48 | # define STACK64_FRAME STACK_FRAME | ||
| 49 | # define LOCALS64 LOCALS | ||
| 50 | |||
| 51 | .section ".text",#alloc,#execinstr | ||
| 52 | ___ | ||
| 53 | |||
| 54 | {{{ | ||
| 55 | my ($rp,$ap,$bp)=map("%i$_",(0..2)); | ||
| 56 | my @acc=map("%l$_",(0..7)); | ||
| 57 | my ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7)=(map("%o$_",(0..5)),"%g4","%g5"); | ||
| 58 | my ($bi,$a0,$mask,$carry)=(map("%i$_",(3..5)),"%g1"); | ||
| 59 | my ($rp_real,$ap_real)=("%g2","%g3"); | ||
| 60 | |||
| 61 | $code.=<<___; | ||
| 62 | .align 64 | ||
| 63 | .Lone: | ||
| 64 | .long 1,0,0,0,0,0,0,0 | ||
| 65 | |||
| 66 | ! void ecp_nistz256_from_mont(BN_ULONG %i0[8],const BN_ULONG %i1[8]); | ||
| 67 | .globl ecp_nistz256_from_mont | ||
| 68 | .align 32 | ||
| 69 | ecp_nistz256_from_mont: | ||
| 70 | save %sp,-STACK_FRAME,%sp | ||
| 71 | nop | ||
| 72 | 1: call .+8 | ||
| 73 | add %o7,.Lone-1b,$bp | ||
| 74 | call __ecp_nistz256_mul_mont | ||
| 75 | nop | ||
| 76 | ret | ||
| 77 | restore | ||
| 78 | .type ecp_nistz256_from_mont,#function | ||
| 79 | .size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont | ||
| 80 | |||
| 81 | ! void ecp_nistz256_mul_mont(BN_ULONG %i0[8],const BN_ULONG %i1[8], | ||
| 82 | ! const BN_ULONG %i2[8]); | ||
| 83 | .globl ecp_nistz256_mul_mont | ||
| 84 | .align 32 | ||
| 85 | ecp_nistz256_mul_mont: | ||
| 86 | save %sp,-STACK_FRAME,%sp | ||
| 87 | nop | ||
| 88 | call __ecp_nistz256_mul_mont | ||
| 89 | nop | ||
| 90 | ret | ||
| 91 | restore | ||
| 92 | .type ecp_nistz256_mul_mont,#function | ||
| 93 | .size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont | ||
| 94 | |||
| 95 | ! void ecp_nistz256_sqr_mont(BN_ULONG %i0[8],const BN_ULONG %i2[8]); | ||
| 96 | .globl ecp_nistz256_sqr_mont | ||
| 97 | .align 32 | ||
| 98 | ecp_nistz256_sqr_mont: | ||
| 99 | save %sp,-STACK_FRAME,%sp | ||
| 100 | mov $ap,$bp | ||
| 101 | call __ecp_nistz256_mul_mont | ||
| 102 | nop | ||
| 103 | ret | ||
| 104 | restore | ||
| 105 | .type ecp_nistz256_sqr_mont,#function | ||
| 106 | .size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont | ||
| 107 | ___ | ||
| 108 | |||
| 109 | ######################################################################## | ||
| 110 | # A special thing to keep in mind is that $t0-$t7 hold 64-bit values, | ||
| 111 | # while all others are meant to hold 32-bit ones. "Meant to" means | ||
| 112 | # that additions to @acc[0-7] do "contaminate" the upper bits, but | ||
| 113 | # these are cleared before they can affect the outcome (follow 'and' | ||
| 114 | # with $mask). Also keep in mind that addition with carry is addition | ||
| 115 | # with a 32-bit carry, even though the CPU is 64-bit. [Addition with | ||
| 116 | # 64-bit carry was introduced in T3, see below for VIS3 code paths.] | ||
| 117 | |||
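A minimal sketch of the lane convention just described, with a hypothetical helper name: each 32-bit word travels in a 64-bit register, an addition may spill into the upper half, and the explicit AND with $mask (0xffffffff) clears that contamination before it matters.

```c
#include <stdint.h>

/* One 32-bit-lane addition as performed on pre-VIS3 parts: the sum may
 * exceed 32 bits, the carry is extracted by shifting, and the mask
 * restores a clean 32-bit word (the 'and' with $mask in the code). */
static inline uint64_t lane_add(uint64_t a, uint64_t b, uint64_t *carry)
{
	uint64_t s = a + b + *carry;
	*carry = s >> 32;
	return s & 0xffffffff;
}
```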
| 118 | $code.=<<___; | ||
| 119 | .align 32 | ||
| 120 | __ecp_nistz256_mul_mont: | ||
| 121 | ld [$bp+0],$bi ! b[0] | ||
| 122 | mov -1,$mask | ||
| 123 | ld [$ap+0],$a0 | ||
| 124 | srl $mask,0,$mask ! 0xffffffff | ||
| 125 | ld [$ap+4],$t1 | ||
| 126 | ld [$ap+8],$t2 | ||
| 127 | ld [$ap+12],$t3 | ||
| 128 | ld [$ap+16],$t4 | ||
| 129 | ld [$ap+20],$t5 | ||
| 130 | ld [$ap+24],$t6 | ||
| 131 | ld [$ap+28],$t7 | ||
| 132 | mulx $a0,$bi,$t0 ! a[0-7]*b[0], 64-bit results | ||
| 133 | mulx $t1,$bi,$t1 | ||
| 134 | mulx $t2,$bi,$t2 | ||
| 135 | mulx $t3,$bi,$t3 | ||
| 136 | mulx $t4,$bi,$t4 | ||
| 137 | mulx $t5,$bi,$t5 | ||
| 138 | mulx $t6,$bi,$t6 | ||
| 139 | mulx $t7,$bi,$t7 | ||
| 140 | srlx $t0,32,@acc[1] ! extract high parts | ||
| 141 | srlx $t1,32,@acc[2] | ||
| 142 | srlx $t2,32,@acc[3] | ||
| 143 | srlx $t3,32,@acc[4] | ||
| 144 | srlx $t4,32,@acc[5] | ||
| 145 | srlx $t5,32,@acc[6] | ||
| 146 | srlx $t6,32,@acc[7] | ||
| 147 | srlx $t7,32,@acc[0] ! "@acc[8]" | ||
| 148 | mov 0,$carry | ||
| 149 | ___ | ||
| 150 | for($i=1;$i<8;$i++) { | ||
| 151 | $code.=<<___; | ||
| 152 | addcc @acc[1],$t1,@acc[1] ! accumulate high parts | ||
| 153 | ld [$bp+4*$i],$bi ! b[$i] | ||
| 154 | ld [$ap+4],$t1 ! re-load a[1-7] | ||
| 155 | addccc @acc[2],$t2,@acc[2] | ||
| 156 | addccc @acc[3],$t3,@acc[3] | ||
| 157 | ld [$ap+8],$t2 | ||
| 158 | ld [$ap+12],$t3 | ||
| 159 | addccc @acc[4],$t4,@acc[4] | ||
| 160 | addccc @acc[5],$t5,@acc[5] | ||
| 161 | ld [$ap+16],$t4 | ||
| 162 | ld [$ap+20],$t5 | ||
| 163 | addccc @acc[6],$t6,@acc[6] | ||
| 164 | addccc @acc[7],$t7,@acc[7] | ||
| 165 | ld [$ap+24],$t6 | ||
| 166 | ld [$ap+28],$t7 | ||
| 167 | addccc @acc[0],$carry,@acc[0] ! "@acc[8]" | ||
| 168 | addc %g0,%g0,$carry | ||
| 169 | ___ | ||
| 170 | # A reduction iteration is normally performed by accumulating the | ||
| 171 | # result of multiplying the modulus by a "magic" digit [and | ||
| 172 | # omitting the least significant word, which is guaranteed to | ||
| 173 | # be 0], but thanks to the special form of the modulus, and the | ||
| 174 | # "magic" digit being equal to the least significant word, it can | ||
| 175 | # be performed with additions and subtractions alone. Indeed: | ||
| 176 | # | ||
| 177 | # ffff.0001.0000.0000.0000.ffff.ffff.ffff | ||
| 178 | # * abcd | ||
| 179 | # + xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd | ||
| 180 | # | ||
| 181 | # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we | ||
| 182 | # rewrite above as: | ||
| 183 | # | ||
| 184 | # xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd | ||
| 185 | # + abcd.0000.abcd.0000.0000.abcd.0000.0000.0000 | ||
| 186 | # - abcd.0000.0000.0000.0000.0000.0000.abcd | ||
| 187 | # | ||
| 188 | # or marking redundant operations: | ||
| 189 | # | ||
| 190 | # xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.---- | ||
| 191 | # + abcd.0000.abcd.0000.0000.abcd.----.----.---- | ||
| 192 | # - abcd.----.----.----.----.----.----.---- | ||
| 193 | |||
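In word-level terms the iteration adds d*p256, for d equal to the lowest accumulator word, which zeroes that word so it can be dropped. A minimal C sketch of one such step, as a hypothetical helper (the generated code interleaves this with the next multiplication and tracks the top carry in $carry):

```c
#include <stdint.h>

/* One multiplication-less reduction step.  acc[] holds nine 32-bit
 * words, little-endian, in 64-bit lanes; d = acc[0] is the "magic"
 * digit.  Adding d * p256 = d * (2^256 - 2^224 + 2^192 + 2^96 - 1)
 * zeroes acc[0], so the value shifts down one word while staying
 * congruent mod p256. */
static void reduce_step(uint64_t acc[9])
{
	uint64_t d = acc[0];

	acc[0] -= d;	/* - d: the low word becomes 0 by construction */
	acc[3] += d;	/* + d * 2^96 */
	acc[6] += d;	/* + d * 2^192 */
	acc[7] -= d;	/* - d * 2^224 (may borrow) */
	acc[8] += d;	/* + d * 2^256 */

	/* renormalize the lanes to 32 bits; the signed shift propagates
	 * borrows as well as carries (a carry left in acc[8] is what the
	 * assembly keeps in $carry) */
	for (int i = 0; i < 8; i++) {
		acc[i + 1] += (uint64_t)((int64_t)acc[i] >> 32);
		acc[i] &= 0xffffffff;
	}
}
```

Dropping the now-zero acc[0] corresponds to the push(@acc,shift(@acc)) register rotation below.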
| 194 | $code.=<<___; | ||
| 195 | ! multiplication-less reduction | ||
| 196 | addcc @acc[3],$t0,@acc[3] ! r[3]+=r[0] | ||
| 197 | addccc @acc[4],%g0,@acc[4] ! r[4]+=0 | ||
| 198 | and @acc[1],$mask,@acc[1] | ||
| 199 | and @acc[2],$mask,@acc[2] | ||
| 200 | addccc @acc[5],%g0,@acc[5] ! r[5]+=0 | ||
| 201 | addccc @acc[6],$t0,@acc[6] ! r[6]+=r[0] | ||
| 202 | and @acc[3],$mask,@acc[3] | ||
| 203 | and @acc[4],$mask,@acc[4] | ||
| 204 | addccc @acc[7],%g0,@acc[7] ! r[7]+=0 | ||
| 205 | addccc @acc[0],$t0,@acc[0] ! r[8]+=r[0] "@acc[8]" | ||
| 206 | and @acc[5],$mask,@acc[5] | ||
| 207 | and @acc[6],$mask,@acc[6] | ||
| 208 | addc $carry,%g0,$carry ! top-most carry | ||
| 209 | subcc @acc[7],$t0,@acc[7] ! r[7]-=r[0] | ||
| 210 | subccc @acc[0],%g0,@acc[0] ! r[8]-=0 "@acc[8]" | ||
| 211 | subc $carry,%g0,$carry ! top-most carry | ||
| 212 | and @acc[7],$mask,@acc[7] | ||
| 213 | and @acc[0],$mask,@acc[0] ! "@acc[8]" | ||
| 214 | ___ | ||
| 215 | push(@acc,shift(@acc)); # rotate registers to "omit" acc[0] | ||
| 216 | $code.=<<___; | ||
| 217 | mulx $a0,$bi,$t0 ! a[0-7]*b[$i], 64-bit results | ||
| 218 | mulx $t1,$bi,$t1 | ||
| 219 | mulx $t2,$bi,$t2 | ||
| 220 | mulx $t3,$bi,$t3 | ||
| 221 | mulx $t4,$bi,$t4 | ||
| 222 | mulx $t5,$bi,$t5 | ||
| 223 | mulx $t6,$bi,$t6 | ||
| 224 | mulx $t7,$bi,$t7 | ||
| 225 | add @acc[0],$t0,$t0 ! accumulate low parts, can't overflow | ||
| 226 | add @acc[1],$t1,$t1 | ||
| 227 | srlx $t0,32,@acc[1] ! extract high parts | ||
| 228 | add @acc[2],$t2,$t2 | ||
| 229 | srlx $t1,32,@acc[2] | ||
| 230 | add @acc[3],$t3,$t3 | ||
| 231 | srlx $t2,32,@acc[3] | ||
| 232 | add @acc[4],$t4,$t4 | ||
| 233 | srlx $t3,32,@acc[4] | ||
| 234 | add @acc[5],$t5,$t5 | ||
| 235 | srlx $t4,32,@acc[5] | ||
| 236 | add @acc[6],$t6,$t6 | ||
| 237 | srlx $t5,32,@acc[6] | ||
| 238 | add @acc[7],$t7,$t7 | ||
| 239 | srlx $t6,32,@acc[7] | ||
| 240 | srlx $t7,32,@acc[0] ! "@acc[8]" | ||
| 241 | ___ | ||
| 242 | } | ||
| 243 | $code.=<<___; | ||
| 244 | addcc @acc[1],$t1,@acc[1] ! accumulate high parts | ||
| 245 | addccc @acc[2],$t2,@acc[2] | ||
| 246 | addccc @acc[3],$t3,@acc[3] | ||
| 247 | addccc @acc[4],$t4,@acc[4] | ||
| 248 | addccc @acc[5],$t5,@acc[5] | ||
| 249 | addccc @acc[6],$t6,@acc[6] | ||
| 250 | addccc @acc[7],$t7,@acc[7] | ||
| 251 | addccc @acc[0],$carry,@acc[0] ! "@acc[8]" | ||
| 252 | addc %g0,%g0,$carry | ||
| 253 | |||
| 254 | addcc @acc[3],$t0,@acc[3] ! multiplication-less reduction | ||
| 255 | addccc @acc[4],%g0,@acc[4] | ||
| 256 | addccc @acc[5],%g0,@acc[5] | ||
| 257 | addccc @acc[6],$t0,@acc[6] | ||
| 258 | addccc @acc[7],%g0,@acc[7] | ||
| 259 | addccc @acc[0],$t0,@acc[0] ! "@acc[8]" | ||
| 260 | addc $carry,%g0,$carry | ||
| 261 | subcc @acc[7],$t0,@acc[7] | ||
| 262 | subccc @acc[0],%g0,@acc[0] ! "@acc[8]" | ||
| 263 | subc $carry,%g0,$carry ! top-most carry | ||
| 264 | ___ | ||
| 265 | push(@acc,shift(@acc)); # rotate registers to omit acc[0] | ||
| 266 | $code.=<<___; | ||
| 267 | ! Final step is "if result >= mod, subtract mod", but we do it | ||
| 268 | ! the "other way around", namely subtract the modulus from the | ||
| 269 | ! result and, if that borrowed, add the modulus back. | ||
| 270 | |||
| 271 | subcc @acc[0],-1,@acc[0] ! subtract modulus | ||
| 272 | subccc @acc[1],-1,@acc[1] | ||
| 273 | subccc @acc[2],-1,@acc[2] | ||
| 274 | subccc @acc[3],0,@acc[3] | ||
| 275 | subccc @acc[4],0,@acc[4] | ||
| 276 | subccc @acc[5],0,@acc[5] | ||
| 277 | subccc @acc[6],1,@acc[6] | ||
| 278 | subccc @acc[7],-1,@acc[7] | ||
| 279 | subc $carry,0,$carry ! broadcast borrow bit | ||
| 280 | |||
| 281 | ! Note that because mod has special form, i.e. consists of | ||
| 282 | ! 0xffffffff, 1 and 0s, we can conditionally synthesize it by | ||
| 283 | ! using value of broadcasted borrow and the borrow bit itself. | ||
| 284 | ! To minimize dependency chain we first broadcast and then | ||
| 285 | ! extract the bit by negating (follow $bi). | ||
| 286 | |||
| 287 | addcc @acc[0],$carry,@acc[0] ! add modulus or zero | ||
| 288 | addccc @acc[1],$carry,@acc[1] | ||
| 289 | neg $carry,$bi | ||
| 290 | st @acc[0],[$rp] | ||
| 291 | addccc @acc[2],$carry,@acc[2] | ||
| 292 | st @acc[1],[$rp+4] | ||
| 293 | addccc @acc[3],0,@acc[3] | ||
| 294 | st @acc[2],[$rp+8] | ||
| 295 | addccc @acc[4],0,@acc[4] | ||
| 296 | st @acc[3],[$rp+12] | ||
| 297 | addccc @acc[5],0,@acc[5] | ||
| 298 | st @acc[4],[$rp+16] | ||
| 299 | addccc @acc[6],$bi,@acc[6] | ||
| 300 | st @acc[5],[$rp+20] | ||
| 301 | addc @acc[7],$carry,@acc[7] | ||
| 302 | st @acc[6],[$rp+24] | ||
| 303 | retl | ||
| 304 | st @acc[7],[$rp+28] | ||
| 305 | .type __ecp_nistz256_mul_mont,#function | ||
| 306 | .size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont | ||
| 307 | |||
| 308 | ! void ecp_nistz256_add(BN_ULONG %i0[8],const BN_ULONG %i1[8], | ||
| 309 | ! const BN_ULONG %i2[8]); | ||
| 310 | .globl ecp_nistz256_add | ||
| 311 | .align 32 | ||
| 312 | ecp_nistz256_add: | ||
| 313 | save %sp,-STACK_FRAME,%sp | ||
| 314 | ld [$ap],@acc[0] | ||
| 315 | ld [$ap+4],@acc[1] | ||
| 316 | ld [$ap+8],@acc[2] | ||
| 317 | ld [$ap+12],@acc[3] | ||
| 318 | ld [$ap+16],@acc[4] | ||
| 319 | ld [$ap+20],@acc[5] | ||
| 320 | ld [$ap+24],@acc[6] | ||
| 321 | call __ecp_nistz256_add | ||
| 322 | ld [$ap+28],@acc[7] | ||
| 323 | ret | ||
| 324 | restore | ||
| 325 | .type ecp_nistz256_add,#function | ||
| 326 | .size ecp_nistz256_add,.-ecp_nistz256_add | ||
| 327 | |||
| 328 | .align 32 | ||
| 329 | __ecp_nistz256_add: | ||
| 330 | ld [$bp+0],$t0 ! b[0] | ||
| 331 | ld [$bp+4],$t1 | ||
| 332 | ld [$bp+8],$t2 | ||
| 333 | ld [$bp+12],$t3 | ||
| 334 | addcc @acc[0],$t0,@acc[0] | ||
| 335 | ld [$bp+16],$t4 | ||
| 336 | ld [$bp+20],$t5 | ||
| 337 | addccc @acc[1],$t1,@acc[1] | ||
| 338 | ld [$bp+24],$t6 | ||
| 339 | ld [$bp+28],$t7 | ||
| 340 | addccc @acc[2],$t2,@acc[2] | ||
| 341 | addccc @acc[3],$t3,@acc[3] | ||
| 342 | addccc @acc[4],$t4,@acc[4] | ||
| 343 | addccc @acc[5],$t5,@acc[5] | ||
| 344 | addccc @acc[6],$t6,@acc[6] | ||
| 345 | addccc @acc[7],$t7,@acc[7] | ||
| 346 | addc %g0,%g0,$carry | ||
| 347 | |||
| 348 | .Lreduce_by_sub: | ||
| 349 | |||
| 350 | ! if a+b >= modulus, subtract modulus. | ||
| 351 | ! | ||
| 352 | ! But since comparison implies subtraction, we subtract | ||
| 353 | ! modulus and then add it back if subtraction borrowed. | ||
| 354 | |||
| 355 | subcc @acc[0],-1,@acc[0] | ||
| 356 | subccc @acc[1],-1,@acc[1] | ||
| 357 | subccc @acc[2],-1,@acc[2] | ||
| 358 | subccc @acc[3], 0,@acc[3] | ||
| 359 | subccc @acc[4], 0,@acc[4] | ||
| 360 | subccc @acc[5], 0,@acc[5] | ||
| 361 | subccc @acc[6], 1,@acc[6] | ||
| 362 | subccc @acc[7],-1,@acc[7] | ||
| 363 | subc $carry,0,$carry | ||
| 364 | |||
| 365 | ! Note that because mod has special form, i.e. consists of | ||
| 366 | ! 0xffffffff, 1 and 0s, we can conditionally synthesize it by | ||
| 367 | ! using value of borrow and its negative. | ||
| 368 | |||
| 369 | addcc @acc[0],$carry,@acc[0] ! add synthesized modulus | ||
| 370 | addccc @acc[1],$carry,@acc[1] | ||
| 371 | neg $carry,$bi | ||
| 372 | st @acc[0],[$rp] | ||
| 373 | addccc @acc[2],$carry,@acc[2] | ||
| 374 | st @acc[1],[$rp+4] | ||
| 375 | addccc @acc[3],0,@acc[3] | ||
| 376 | st @acc[2],[$rp+8] | ||
| 377 | addccc @acc[4],0,@acc[4] | ||
| 378 | st @acc[3],[$rp+12] | ||
| 379 | addccc @acc[5],0,@acc[5] | ||
| 380 | st @acc[4],[$rp+16] | ||
| 381 | addccc @acc[6],$bi,@acc[6] | ||
| 382 | st @acc[5],[$rp+20] | ||
| 383 | addc @acc[7],$carry,@acc[7] | ||
| 384 | st @acc[6],[$rp+24] | ||
| 385 | retl | ||
| 386 | st @acc[7],[$rp+28] | ||
| 387 | .type __ecp_nistz256_add,#function | ||
| 388 | .size __ecp_nistz256_add,.-__ecp_nistz256_add | ||
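The .Lreduce_by_sub tail shared by the addition routines can be sketched in C as follows (hypothetical helper, not the library's API). The interesting part is that the add-back never loads the modulus: because p256's little-endian words are (-1,-1,-1,0,0,0,1,-1), they are synthesized from the broadcast borrow mask and its negation, exactly as the neg $carry,$bi instruction does.

```c
#include <stdint.h>

/* a[] holds the 8-word sum, top_carry the carry out of the addition.
 * Subtract the modulus unconditionally; if that went below zero (and
 * no top carry compensates), add a synthesized copy of p256 back. */
static void reduce_by_sub(uint32_t a[8], uint32_t top_carry)
{
	static const uint32_t p[8] = {
		0xffffffff, 0xffffffff, 0xffffffff, 0, 0, 0, 1, 0xffffffff
	};
	uint64_t t, borrow = 0, c = 0;
	int i;

	for (i = 0; i < 8; i++) {		/* a -= p, tracking borrow */
		t = (uint64_t)a[i] - p[i] - borrow;
		a[i] = (uint32_t)t;
		borrow = (t >> 32) & 1;
	}

	/* m broadcasts "add-back needed"; bit = -m is 1 iff m is all-ones */
	uint32_t m = (uint32_t)0 - (uint32_t)(borrow & (top_carry ^ 1));
	uint32_t bit = (uint32_t)0 - m;
	const uint32_t synth[8] = { m, m, m, 0, 0, 0, bit, m };

	for (i = 0; i < 8; i++) {		/* a += (m ? p256 : 0) */
		t = (uint64_t)a[i] + synth[i] + c;
		a[i] = (uint32_t)t;
		c = t >> 32;
	}
}
```

__ecp_nistz256_sub_from below inverts the trick (.Lreduce_by_add): there the borrow out of a-b decides whether the synthesized modulus is added.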
| 389 | |||
| 390 | ! void ecp_nistz256_mul_by_2(BN_ULONG %i0[8],const BN_ULONG %i1[8]); | ||
| 391 | .globl ecp_nistz256_mul_by_2 | ||
| 392 | .align 32 | ||
| 393 | ecp_nistz256_mul_by_2: | ||
| 394 | save %sp,-STACK_FRAME,%sp | ||
| 395 | ld [$ap],@acc[0] | ||
| 396 | ld [$ap+4],@acc[1] | ||
| 397 | ld [$ap+8],@acc[2] | ||
| 398 | ld [$ap+12],@acc[3] | ||
| 399 | ld [$ap+16],@acc[4] | ||
| 400 | ld [$ap+20],@acc[5] | ||
| 401 | ld [$ap+24],@acc[6] | ||
| 402 | call __ecp_nistz256_mul_by_2 | ||
| 403 | ld [$ap+28],@acc[7] | ||
| 404 | ret | ||
| 405 | restore | ||
| 406 | .type ecp_nistz256_mul_by_2,#function | ||
| 407 | .size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2 | ||
| 408 | |||
| 409 | .align 32 | ||
| 410 | __ecp_nistz256_mul_by_2: | ||
| 411 | addcc @acc[0],@acc[0],@acc[0] ! a+a=2*a | ||
| 412 | addccc @acc[1],@acc[1],@acc[1] | ||
| 413 | addccc @acc[2],@acc[2],@acc[2] | ||
| 414 | addccc @acc[3],@acc[3],@acc[3] | ||
| 415 | addccc @acc[4],@acc[4],@acc[4] | ||
| 416 | addccc @acc[5],@acc[5],@acc[5] | ||
| 417 | addccc @acc[6],@acc[6],@acc[6] | ||
| 418 | addccc @acc[7],@acc[7],@acc[7] | ||
| 419 | b .Lreduce_by_sub | ||
| 420 | addc %g0,%g0,$carry | ||
| 421 | .type __ecp_nistz256_mul_by_2,#function | ||
| 422 | .size __ecp_nistz256_mul_by_2,.-__ecp_nistz256_mul_by_2 | ||
| 423 | |||
| 424 | ! void ecp_nistz256_mul_by_3(BN_ULONG %i0[8],const BN_ULONG %i1[8]); | ||
| 425 | .globl ecp_nistz256_mul_by_3 | ||
| 426 | .align 32 | ||
| 427 | ecp_nistz256_mul_by_3: | ||
| 428 | save %sp,-STACK_FRAME,%sp | ||
| 429 | ld [$ap],@acc[0] | ||
| 430 | ld [$ap+4],@acc[1] | ||
| 431 | ld [$ap+8],@acc[2] | ||
| 432 | ld [$ap+12],@acc[3] | ||
| 433 | ld [$ap+16],@acc[4] | ||
| 434 | ld [$ap+20],@acc[5] | ||
| 435 | ld [$ap+24],@acc[6] | ||
| 436 | call __ecp_nistz256_mul_by_3 | ||
| 437 | ld [$ap+28],@acc[7] | ||
| 438 | ret | ||
| 439 | restore | ||
| 440 | .type ecp_nistz256_mul_by_3,#function | ||
| 441 | .size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3 | ||
| 442 | |||
| 443 | .align 32 | ||
| 444 | __ecp_nistz256_mul_by_3: | ||
| 445 | addcc @acc[0],@acc[0],$t0 ! a+a=2*a | ||
| 446 | addccc @acc[1],@acc[1],$t1 | ||
| 447 | addccc @acc[2],@acc[2],$t2 | ||
| 448 | addccc @acc[3],@acc[3],$t3 | ||
| 449 | addccc @acc[4],@acc[4],$t4 | ||
| 450 | addccc @acc[5],@acc[5],$t5 | ||
| 451 | addccc @acc[6],@acc[6],$t6 | ||
| 452 | addccc @acc[7],@acc[7],$t7 | ||
| 453 | addc %g0,%g0,$carry | ||
| 454 | |||
| 455 | subcc $t0,-1,$t0 ! .Lreduce_by_sub but without stores | ||
| 456 | subccc $t1,-1,$t1 | ||
| 457 | subccc $t2,-1,$t2 | ||
| 458 | subccc $t3, 0,$t3 | ||
| 459 | subccc $t4, 0,$t4 | ||
| 460 | subccc $t5, 0,$t5 | ||
| 461 | subccc $t6, 1,$t6 | ||
| 462 | subccc $t7,-1,$t7 | ||
| 463 | subc $carry,0,$carry | ||
| 464 | |||
| 465 | addcc $t0,$carry,$t0 ! add synthesized modulus | ||
| 466 | addccc $t1,$carry,$t1 | ||
| 467 | neg $carry,$bi | ||
| 468 | addccc $t2,$carry,$t2 | ||
| 469 | addccc $t3,0,$t3 | ||
| 470 | addccc $t4,0,$t4 | ||
| 471 | addccc $t5,0,$t5 | ||
| 472 | addccc $t6,$bi,$t6 | ||
| 473 | addc $t7,$carry,$t7 | ||
| 474 | |||
| 475 | addcc $t0,@acc[0],@acc[0] ! 2*a+a=3*a | ||
| 476 | addccc $t1,@acc[1],@acc[1] | ||
| 477 | addccc $t2,@acc[2],@acc[2] | ||
| 478 | addccc $t3,@acc[3],@acc[3] | ||
| 479 | addccc $t4,@acc[4],@acc[4] | ||
| 480 | addccc $t5,@acc[5],@acc[5] | ||
| 481 | addccc $t6,@acc[6],@acc[6] | ||
| 482 | addccc $t7,@acc[7],@acc[7] | ||
| 483 | b .Lreduce_by_sub | ||
| 484 | addc %g0,%g0,$carry | ||
| 485 | .type __ecp_nistz256_mul_by_3,#function | ||
| 486 | .size __ecp_nistz256_mul_by_3,.-__ecp_nistz256_mul_by_3 | ||
| 487 | |||
| 488 | ! void ecp_nistz256_neg(BN_ULONG %i0[8],const BN_ULONG %i1[8]); | ||
| 489 | .globl ecp_nistz256_neg | ||
| 490 | .align 32 | ||
| 491 | ecp_nistz256_neg: | ||
| 492 | save %sp,-STACK_FRAME,%sp | ||
| 493 | mov $ap,$bp | ||
| 494 | mov 0,@acc[0] | ||
| 495 | mov 0,@acc[1] | ||
| 496 | mov 0,@acc[2] | ||
| 497 | mov 0,@acc[3] | ||
| 498 | mov 0,@acc[4] | ||
| 499 | mov 0,@acc[5] | ||
| 500 | mov 0,@acc[6] | ||
| 501 | call __ecp_nistz256_sub_from | ||
| 502 | mov 0,@acc[7] | ||
| 503 | ret | ||
| 504 | restore | ||
| 505 | .type ecp_nistz256_neg,#function | ||
| 506 | .size ecp_nistz256_neg,.-ecp_nistz256_neg | ||
| 507 | |||
| 508 | .align 32 | ||
| 509 | __ecp_nistz256_sub_from: | ||
| 510 | ld [$bp+0],$t0 ! b[0] | ||
| 511 | ld [$bp+4],$t1 | ||
| 512 | ld [$bp+8],$t2 | ||
| 513 | ld [$bp+12],$t3 | ||
| 514 | subcc @acc[0],$t0,@acc[0] | ||
| 515 | ld [$bp+16],$t4 | ||
| 516 | ld [$bp+20],$t5 | ||
| 517 | subccc @acc[1],$t1,@acc[1] | ||
| 518 | subccc @acc[2],$t2,@acc[2] | ||
| 519 | ld [$bp+24],$t6 | ||
| 520 | ld [$bp+28],$t7 | ||
| 521 | subccc @acc[3],$t3,@acc[3] | ||
| 522 | subccc @acc[4],$t4,@acc[4] | ||
| 523 | subccc @acc[5],$t5,@acc[5] | ||
| 524 | subccc @acc[6],$t6,@acc[6] | ||
| 525 | subccc @acc[7],$t7,@acc[7] | ||
| 526 | subc %g0,%g0,$carry ! broadcast borrow bit | ||
| 527 | |||
| 528 | .Lreduce_by_add: | ||
| 529 | |||
| 530 | ! if a-b borrows, add modulus. | ||
| 531 | ! | ||
| 532 | ! Note that because mod has special form, i.e. consists of | ||
| 533 | ! 0xffffffff, 1 and 0s, we can conditionally synthesize it by | ||
| 534 | ! using value of broadcasted borrow and the borrow bit itself. | ||
| 535 | ! To minimize dependency chain we first broadcast and then | ||
| 536 | ! extract the bit by negating (follow $bi). | ||
| 537 | |||
| 538 | addcc @acc[0],$carry,@acc[0] ! add synthesized modulus | ||
| 539 | addccc @acc[1],$carry,@acc[1] | ||
| 540 | neg $carry,$bi | ||
| 541 | st @acc[0],[$rp] | ||
| 542 | addccc @acc[2],$carry,@acc[2] | ||
| 543 | st @acc[1],[$rp+4] | ||
| 544 | addccc @acc[3],0,@acc[3] | ||
| 545 | st @acc[2],[$rp+8] | ||
| 546 | addccc @acc[4],0,@acc[4] | ||
| 547 | st @acc[3],[$rp+12] | ||
| 548 | addccc @acc[5],0,@acc[5] | ||
| 549 | st @acc[4],[$rp+16] | ||
| 550 | addccc @acc[6],$bi,@acc[6] | ||
| 551 | st @acc[5],[$rp+20] | ||
| 552 | addc @acc[7],$carry,@acc[7] | ||
| 553 | st @acc[6],[$rp+24] | ||
| 554 | retl | ||
| 555 | st @acc[7],[$rp+28] | ||
| 556 | .type __ecp_nistz256_sub_from,#function | ||
| 557 | .size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from | ||
| 558 | |||
| 559 | .align 32 | ||
| 560 | __ecp_nistz256_sub_morf: | ||
| 561 | ld [$bp+0],$t0 ! b[0] | ||
| 562 | ld [$bp+4],$t1 | ||
| 563 | ld [$bp+8],$t2 | ||
| 564 | ld [$bp+12],$t3 | ||
| 565 | subcc $t0,@acc[0],@acc[0] | ||
| 566 | ld [$bp+16],$t4 | ||
| 567 | ld [$bp+20],$t5 | ||
| 568 | subccc $t1,@acc[1],@acc[1] | ||
| 569 | subccc $t2,@acc[2],@acc[2] | ||
| 570 | ld [$bp+24],$t6 | ||
| 571 | ld [$bp+28],$t7 | ||
| 572 | subccc $t3,@acc[3],@acc[3] | ||
| 573 | subccc $t4,@acc[4],@acc[4] | ||
| 574 | subccc $t5,@acc[5],@acc[5] | ||
| 575 | subccc $t6,@acc[6],@acc[6] | ||
| 576 | subccc $t7,@acc[7],@acc[7] | ||
| 577 | b .Lreduce_by_add | ||
| 578 | subc %g0,%g0,$carry ! broadcast borrow bit | ||
| 579 | .type __ecp_nistz256_sub_morf,#function | ||
| 580 | .size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf | ||
| 581 | |||
| 582 | ! void ecp_nistz256_div_by_2(BN_ULONG %i0[8],const BN_ULONG %i1[8]); | ||
| 583 | .globl ecp_nistz256_div_by_2 | ||
| 584 | .align 32 | ||
| 585 | ecp_nistz256_div_by_2: | ||
| 586 | save %sp,-STACK_FRAME,%sp | ||
| 587 | ld [$ap],@acc[0] | ||
| 588 | ld [$ap+4],@acc[1] | ||
| 589 | ld [$ap+8],@acc[2] | ||
| 590 | ld [$ap+12],@acc[3] | ||
| 591 | ld [$ap+16],@acc[4] | ||
| 592 | ld [$ap+20],@acc[5] | ||
| 593 | ld [$ap+24],@acc[6] | ||
| 594 | call __ecp_nistz256_div_by_2 | ||
| 595 | ld [$ap+28],@acc[7] | ||
| 596 | ret | ||
| 597 | restore | ||
| 598 | .type ecp_nistz256_div_by_2,#function | ||
| 599 | .size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2 | ||
| 600 | |||
| 601 | .align 32 | ||
| 602 | __ecp_nistz256_div_by_2: | ||
| 603 | ! ret = (a is odd ? a+mod : a) >> 1 | ||
| 604 | |||
| 605 | and @acc[0],1,$bi | ||
| 606 | neg $bi,$carry | ||
| 607 | addcc @acc[0],$carry,@acc[0] | ||
| 608 | addccc @acc[1],$carry,@acc[1] | ||
| 609 | addccc @acc[2],$carry,@acc[2] | ||
| 610 | addccc @acc[3],0,@acc[3] | ||
| 611 | addccc @acc[4],0,@acc[4] | ||
| 612 | addccc @acc[5],0,@acc[5] | ||
| 613 | addccc @acc[6],$bi,@acc[6] | ||
| 614 | addccc @acc[7],$carry,@acc[7] | ||
| 615 | addc %g0,%g0,$carry | ||
| 616 | |||
| 617 | ! ret >>= 1 | ||
| 618 | |||
| 619 | srl @acc[0],1,@acc[0] | ||
| 620 | sll @acc[1],31,$t0 | ||
| 621 | srl @acc[1],1,@acc[1] | ||
| 622 | or @acc[0],$t0,@acc[0] | ||
| 623 | sll @acc[2],31,$t1 | ||
| 624 | srl @acc[2],1,@acc[2] | ||
| 625 | or @acc[1],$t1,@acc[1] | ||
| 626 | sll @acc[3],31,$t2 | ||
| 627 | st @acc[0],[$rp] | ||
| 628 | srl @acc[3],1,@acc[3] | ||
| 629 | or @acc[2],$t2,@acc[2] | ||
| 630 | sll @acc[4],31,$t3 | ||
| 631 | st @acc[1],[$rp+4] | ||
| 632 | srl @acc[4],1,@acc[4] | ||
| 633 | or @acc[3],$t3,@acc[3] | ||
| 634 | sll @acc[5],31,$t4 | ||
| 635 | st @acc[2],[$rp+8] | ||
| 636 | srl @acc[5],1,@acc[5] | ||
| 637 | or @acc[4],$t4,@acc[4] | ||
| 638 | sll @acc[6],31,$t5 | ||
| 639 | st @acc[3],[$rp+12] | ||
| 640 | srl @acc[6],1,@acc[6] | ||
| 641 | or @acc[5],$t5,@acc[5] | ||
| 642 | sll @acc[7],31,$t6 | ||
| 643 | st @acc[4],[$rp+16] | ||
| 644 | srl @acc[7],1,@acc[7] | ||
| 645 | or @acc[6],$t6,@acc[6] | ||
| 646 | sll $carry,31,$t7 | ||
| 647 | st @acc[5],[$rp+20] | ||
| 648 | or @acc[7],$t7,@acc[7] | ||
| 649 | st @acc[6],[$rp+24] | ||
| 650 | retl | ||
| 651 | st @acc[7],[$rp+28] | ||
| 652 | .type __ecp_nistz256_div_by_2,#function | ||
| 653 | .size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2 | ||
| 654 | ___ | ||
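The halving routine follows directly from its one-line comment: adding p256 to an odd value makes it even without changing it mod p256, so the subsequent shift is exact. A minimal C sketch under the same hypothetical conventions as above:

```c
#include <stdint.h>

/* ret = (a is odd ? a + p256 : a) >> 1, branch-free: the low bit of a
 * is broadcast into a mask and the modulus is synthesized from it,
 * just as in the reduction tails. */
static void div_by_2(uint32_t a[8])
{
	uint32_t odd = (uint32_t)0 - (a[0] & 1);	/* broadcast low bit */
	uint32_t bit = (uint32_t)0 - odd;		/* 1 iff a is odd */
	const uint32_t synth[8] = { odd, odd, odd, 0, 0, 0, bit, odd };
	uint64_t t, c = 0;
	int i;

	for (i = 0; i < 8; i++) {		/* a += (a odd ? p256 : 0) */
		t = (uint64_t)a[i] + synth[i] + c;
		a[i] = (uint32_t)t;
		c = t >> 32;
	}
	for (i = 0; i < 8; i++) {		/* a = (c:a) >> 1 */
		uint64_t next = (i < 7) ? a[i + 1] : c;
		a[i] = (a[i] >> 1) | (uint32_t)(next << 31);
	}
}
```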
| 655 | |||
| 656 | ######################################################################## | ||
| 657 | # the following subroutines are "literal" implementations of those | ||
| 658 | # found in ecp_nistz256.c | ||
| 659 | # | ||
| 660 | ######################################################################## | ||
| 661 | # void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp); | ||
| 662 | # | ||
| 663 | { | ||
| 664 | my ($S,$M,$Zsqr,$tmp0)=map(32*$_,(0..3)); | ||
| 665 | # above map() describes stack layout with 4 temporary | ||
| 666 | # 256-bit vectors on top. | ||
| 667 | |||
| 668 | $code.=<<___; | ||
| 669 | #if 0 | ||
| 670 | #ifdef __PIC__ | ||
| 671 | SPARC_PIC_THUNK(%g1) | ||
| 672 | #endif | ||
| 673 | #endif | ||
| 674 | |||
| 675 | .globl ecp_nistz256_point_double | ||
| 676 | .align 32 | ||
| 677 | ecp_nistz256_point_double: | ||
| 678 | #if 0 | ||
| 679 | SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5) | ||
| 680 | ld [%g1],%g1 ! OPENSSL_sparcv9cap_P[0] | ||
| 681 | and %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK),%g1 | ||
| 682 | cmp %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK) | ||
| 683 | be ecp_nistz256_point_double_vis3 | ||
| 684 | nop | ||
| 685 | #endif | ||
| 686 | |||
| 687 | save %sp,-STACK_FRAME-32*4,%sp | ||
| 688 | |||
| 689 | mov $rp,$rp_real | ||
| 690 | mov $ap,$ap_real | ||
| 691 | |||
| 692 | .Lpoint_double_shortcut: | ||
| 693 | ld [$ap+32],@acc[0] | ||
| 694 | ld [$ap+32+4],@acc[1] | ||
| 695 | ld [$ap+32+8],@acc[2] | ||
| 696 | ld [$ap+32+12],@acc[3] | ||
| 697 | ld [$ap+32+16],@acc[4] | ||
| 698 | ld [$ap+32+20],@acc[5] | ||
| 699 | ld [$ap+32+24],@acc[6] | ||
| 700 | ld [$ap+32+28],@acc[7] | ||
| 701 | call __ecp_nistz256_mul_by_2 ! p256_mul_by_2(S, in_y); | ||
| 702 | add %sp,LOCALS+$S,$rp | ||
| 703 | |||
| 704 | add $ap_real,64,$bp | ||
| 705 | add $ap_real,64,$ap | ||
| 706 | call __ecp_nistz256_mul_mont ! p256_sqr_mont(Zsqr, in_z); | ||
| 707 | add %sp,LOCALS+$Zsqr,$rp | ||
| 708 | |||
| 709 | add $ap_real,0,$bp | ||
| 710 | call __ecp_nistz256_add ! p256_add(M, Zsqr, in_x); | ||
| 711 | add %sp,LOCALS+$M,$rp | ||
| 712 | |||
| 713 | add %sp,LOCALS+$S,$bp | ||
| 714 | add %sp,LOCALS+$S,$ap | ||
| 715 | call __ecp_nistz256_mul_mont ! p256_sqr_mont(S, S); | ||
| 716 | add %sp,LOCALS+$S,$rp | ||
| 717 | |||
| 718 | ld [$ap_real],@acc[0] | ||
| 719 | add %sp,LOCALS+$Zsqr,$bp | ||
| 720 | ld [$ap_real+4],@acc[1] | ||
| 721 | ld [$ap_real+8],@acc[2] | ||
| 722 | ld [$ap_real+12],@acc[3] | ||
| 723 | ld [$ap_real+16],@acc[4] | ||
| 724 | ld [$ap_real+20],@acc[5] | ||
| 725 | ld [$ap_real+24],@acc[6] | ||
| 726 | ld [$ap_real+28],@acc[7] | ||
| 727 | call __ecp_nistz256_sub_from ! p256_sub(Zsqr, in_x, Zsqr); | ||
| 728 | add %sp,LOCALS+$Zsqr,$rp | ||
| 729 | |||
| 730 | add $ap_real,32,$bp | ||
| 731 | add $ap_real,64,$ap | ||
| 732 | call __ecp_nistz256_mul_mont ! p256_mul_mont(tmp0, in_z, in_y); | ||
| 733 | add %sp,LOCALS+$tmp0,$rp | ||
| 734 | |||
| 735 | call __ecp_nistz256_mul_by_2 ! p256_mul_by_2(res_z, tmp0); | ||
| 736 | add $rp_real,64,$rp | ||
| 737 | |||
| 738 | add %sp,LOCALS+$Zsqr,$bp | ||
| 739 | add %sp,LOCALS+$M,$ap | ||
| 740 | call __ecp_nistz256_mul_mont ! p256_mul_mont(M, M, Zsqr); | ||
| 741 | add %sp,LOCALS+$M,$rp | ||
| 742 | |||
| 743 | call __ecp_nistz256_mul_by_3 ! p256_mul_by_3(M, M); | ||
| 744 | add %sp,LOCALS+$M,$rp | ||
| 745 | |||
| 746 | add %sp,LOCALS+$S,$bp | ||
| 747 | add %sp,LOCALS+$S,$ap | ||
| 748 | call __ecp_nistz256_mul_mont ! p256_sqr_mont(tmp0, S); | ||
| 749 | add %sp,LOCALS+$tmp0,$rp | ||
| 750 | |||
| 751 | call __ecp_nistz256_div_by_2 ! p256_div_by_2(res_y, tmp0); | ||
| 752 | add $rp_real,32,$rp | ||
| 753 | |||
| 754 | add $ap_real,0,$bp | ||
| 755 | add %sp,LOCALS+$S,$ap | ||
| 756 | call __ecp_nistz256_mul_mont ! p256_mul_mont(S, S, in_x); | ||
| 757 | add %sp,LOCALS+$S,$rp | ||
| 758 | |||
| 759 | call __ecp_nistz256_mul_by_2 ! p256_mul_by_2(tmp0, S); | ||
| 760 | add %sp,LOCALS+$tmp0,$rp | ||
| 761 | |||
| 762 | add %sp,LOCALS+$M,$bp | ||
| 763 | add %sp,LOCALS+$M,$ap | ||
| 764 | call __ecp_nistz256_mul_mont ! p256_sqr_mont(res_x, M); | ||
| 765 | add $rp_real,0,$rp | ||
| 766 | |||
| 767 | add %sp,LOCALS+$tmp0,$bp | ||
| 768 | call __ecp_nistz256_sub_from ! p256_sub(res_x, res_x, tmp0); | ||
| 769 | add $rp_real,0,$rp | ||
| 770 | |||
| 771 | add %sp,LOCALS+$S,$bp | ||
| 772 | call __ecp_nistz256_sub_morf ! p256_sub(S, S, res_x); | ||
| 773 | add %sp,LOCALS+$S,$rp | ||
| 774 | |||
| 775 | add %sp,LOCALS+$M,$bp | ||
| 776 | add %sp,LOCALS+$S,$ap | ||
| 777 | call __ecp_nistz256_mul_mont ! p256_mul_mont(S, S, M); | ||
| 778 | add %sp,LOCALS+$S,$rp | ||
| 779 | |||
| 780 | add $rp_real,32,$bp | ||
| 781 | call __ecp_nistz256_sub_from ! p256_sub(res_y, S, res_y); | ||
| 782 | add $rp_real,32,$rp | ||
| 783 | |||
| 784 | ret | ||
| 785 | restore | ||
| 786 | .type ecp_nistz256_point_double,#function | ||
| 787 | .size ecp_nistz256_point_double,.-ecp_nistz256_point_double | ||
| 788 | ___ | ||
| 789 | } | ||
| 790 | |||
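Since the subroutines are billed as literal transliterations of ecp_nistz256.c, the doubling sequence above reads back into C almost mechanically. The following sketch mirrors the call comments one-for-one; the typedefs and p256_* prototypes are assumptions made for self-containment, with the real C counterpart living in ecp_nistz256.c.

```c
#include <stdint.h>

typedef uint32_t BN_ULONG;	/* 32-bit limbs, as in this module */
typedef struct { BN_ULONG X[8], Y[8], Z[8]; } P256_POINT;

/* primitives implemented by the assembly above (prototypes only) */
void p256_mul_mont(BN_ULONG r[8], const BN_ULONG a[8], const BN_ULONG b[8]);
void p256_sqr_mont(BN_ULONG r[8], const BN_ULONG a[8]);
void p256_add(BN_ULONG r[8], const BN_ULONG a[8], const BN_ULONG b[8]);
void p256_sub(BN_ULONG r[8], const BN_ULONG a[8], const BN_ULONG b[8]);
void p256_mul_by_2(BN_ULONG r[8], const BN_ULONG a[8]);
void p256_mul_by_3(BN_ULONG r[8], const BN_ULONG a[8]);
void p256_div_by_2(BN_ULONG r[8], const BN_ULONG a[8]);

/* the exact sequence of the listing, S/M/Zsqr/tmp0 being the four
 * stack temporaries from the map() above */
void point_double_sketch(P256_POINT *out, const P256_POINT *in)
{
	BN_ULONG S[8], M[8], Zsqr[8], tmp0[8];

	p256_mul_by_2(S, in->Y);
	p256_sqr_mont(Zsqr, in->Z);
	p256_add(M, Zsqr, in->X);
	p256_sqr_mont(S, S);
	p256_sub(Zsqr, in->X, Zsqr);
	p256_mul_mont(tmp0, in->Z, in->Y);
	p256_mul_by_2(out->Z, tmp0);
	p256_mul_mont(M, M, Zsqr);
	p256_mul_by_3(M, M);
	p256_sqr_mont(tmp0, S);
	p256_div_by_2(out->Y, tmp0);
	p256_mul_mont(S, S, in->X);
	p256_mul_by_2(tmp0, S);
	p256_sqr_mont(out->X, M);
	p256_sub(out->X, out->X, tmp0);
	p256_sub(S, S, out->X);
	p256_mul_mont(S, S, M);
	p256_sub(out->Y, S, out->Y);
}
```

Note the early stores to out->Z and out->Y; they precede the last reads of in->X exactly as in the assembly.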
| 791 | ######################################################################## | ||
| 792 | # void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1, | ||
| 793 | # const P256_POINT *in2); | ||
| 794 | { | ||
| 795 | my ($res_x,$res_y,$res_z, | ||
| 796 | $H,$Hsqr,$R,$Rsqr,$Hcub, | ||
| 797 | $U1,$U2,$S1,$S2)=map(32*$_,(0..11)); | ||
| 798 | my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); | ||
| 799 | |||
| 800 | # above map() describes stack layout with 12 temporary | ||
| 801 | # 256-bit vectors on top. Then we reserve some space for | ||
| 802 | # !in1infty, !in2infty, result of check for zero and return pointer. | ||
| 803 | |||
| 804 | my $bp_real=$rp_real; | ||
| 805 | |||
| 806 | $code.=<<___; | ||
| 807 | .globl ecp_nistz256_point_add | ||
| 808 | .align 32 | ||
| 809 | ecp_nistz256_point_add: | ||
| 810 | #if 0 | ||
| 811 | SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5) | ||
| 812 | ld [%g1],%g1 ! OPENSSL_sparcv9cap_P[0] | ||
| 813 | and %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK),%g1 | ||
| 814 | cmp %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK) | ||
| 815 | be ecp_nistz256_point_add_vis3 | ||
| 816 | nop | ||
| 817 | #endif | ||
| 818 | |||
| 819 | save %sp,-STACK_FRAME-32*12-32,%sp | ||
| 820 | |||
| 821 | stx $rp,[%fp+STACK_BIAS-8] ! off-load $rp | ||
| 822 | mov $ap,$ap_real | ||
| 823 | mov $bp,$bp_real | ||
| 824 | |||
| 825 | ld [$bp+64],$t0 ! in2_z | ||
| 826 | ld [$bp+64+4],$t1 | ||
| 827 | ld [$bp+64+8],$t2 | ||
| 828 | ld [$bp+64+12],$t3 | ||
| 829 | ld [$bp+64+16],$t4 | ||
| 830 | ld [$bp+64+20],$t5 | ||
| 831 | ld [$bp+64+24],$t6 | ||
| 832 | ld [$bp+64+28],$t7 | ||
| 833 | or $t1,$t0,$t0 | ||
| 834 | or $t3,$t2,$t2 | ||
| 835 | or $t5,$t4,$t4 | ||
| 836 | or $t7,$t6,$t6 | ||
| 837 | or $t2,$t0,$t0 | ||
| 838 | or $t6,$t4,$t4 | ||
| 839 | or $t4,$t0,$t0 ! !in2infty | ||
| 840 | movrnz $t0,-1,$t0 | ||
| 841 | st $t0,[%fp+STACK_BIAS-12] | ||
| 842 | |||
| 843 | ld [$ap+64],$t0 ! in1_z | ||
| 844 | ld [$ap+64+4],$t1 | ||
| 845 | ld [$ap+64+8],$t2 | ||
| 846 | ld [$ap+64+12],$t3 | ||
| 847 | ld [$ap+64+16],$t4 | ||
| 848 | ld [$ap+64+20],$t5 | ||
| 849 | ld [$ap+64+24],$t6 | ||
| 850 | ld [$ap+64+28],$t7 | ||
| 851 | or $t1,$t0,$t0 | ||
| 852 | or $t3,$t2,$t2 | ||
| 853 | or $t5,$t4,$t4 | ||
| 854 | or $t7,$t6,$t6 | ||
| 855 | or $t2,$t0,$t0 | ||
| 856 | or $t6,$t4,$t4 | ||
| 857 | or $t4,$t0,$t0 ! !in1infty | ||
| 858 | movrnz $t0,-1,$t0 | ||
| 859 | st $t0,[%fp+STACK_BIAS-16] | ||
| 860 | |||
| 861 | add $bp_real,64,$bp | ||
| 862 | add $bp_real,64,$ap | ||
| 863 | call __ecp_nistz256_mul_mont ! p256_sqr_mont(Z2sqr, in2_z); | ||
| 864 | add %sp,LOCALS+$Z2sqr,$rp | ||
| 865 | |||
| 866 | add $ap_real,64,$bp | ||
| 867 | add $ap_real,64,$ap | ||
| 868 | call __ecp_nistz256_mul_mont ! p256_sqr_mont(Z1sqr, in1_z); | ||
| 869 | add %sp,LOCALS+$Z1sqr,$rp | ||
| 870 | |||
| 871 | add $bp_real,64,$bp | ||
| 872 | add %sp,LOCALS+$Z2sqr,$ap | ||
| 873 | call __ecp_nistz256_mul_mont ! p256_mul_mont(S1, Z2sqr, in2_z); | ||
| 874 | add %sp,LOCALS+$S1,$rp | ||
| 875 | |||
| 876 | add $ap_real,64,$bp | ||
| 877 | add %sp,LOCALS+$Z1sqr,$ap | ||
| 878 | call __ecp_nistz256_mul_mont ! p256_mul_mont(S2, Z1sqr, in1_z); | ||
| 879 | add %sp,LOCALS+$S2,$rp | ||
| 880 | |||
| 881 | add $ap_real,32,$bp | ||
| 882 | add %sp,LOCALS+$S1,$ap | ||
| 883 | call __ecp_nistz256_mul_mont ! p256_mul_mont(S1, S1, in1_y); | ||
| 884 | add %sp,LOCALS+$S1,$rp | ||
| 885 | |||
| 886 | add $bp_real,32,$bp | ||
| 887 | add %sp,LOCALS+$S2,$ap | ||
| 888 | call __ecp_nistz256_mul_mont ! p256_mul_mont(S2, S2, in2_y); | ||
| 889 | add %sp,LOCALS+$S2,$rp | ||
| 890 | |||
| 891 | add %sp,LOCALS+$S1,$bp | ||
| 892 | call __ecp_nistz256_sub_from ! p256_sub(R, S2, S1); | ||
| 893 | add %sp,LOCALS+$R,$rp | ||
| 894 | |||
| 895 | or @acc[1],@acc[0],@acc[0] ! see if result is zero | ||
| 896 | or @acc[3],@acc[2],@acc[2] | ||
| 897 | or @acc[5],@acc[4],@acc[4] | ||
| 898 | or @acc[7],@acc[6],@acc[6] | ||
| 899 | or @acc[2],@acc[0],@acc[0] | ||
| 900 | or @acc[6],@acc[4],@acc[4] | ||
| 901 | or @acc[4],@acc[0],@acc[0] | ||
| 902 | st @acc[0],[%fp+STACK_BIAS-20] | ||
| 903 | |||
| 904 | add $ap_real,0,$bp | ||
| 905 | add %sp,LOCALS+$Z2sqr,$ap | ||
| 906 | call __ecp_nistz256_mul_mont ! p256_mul_mont(U1, in1_x, Z2sqr); | ||
| 907 | add %sp,LOCALS+$U1,$rp | ||
| 908 | |||
| 909 | add $bp_real,0,$bp | ||
| 910 | add %sp,LOCALS+$Z1sqr,$ap | ||
| 911 | call __ecp_nistz256_mul_mont ! p256_mul_mont(U2, in2_x, Z1sqr); | ||
| 912 | add %sp,LOCALS+$U2,$rp | ||
| 913 | |||
| 914 | add %sp,LOCALS+$U1,$bp | ||
| 915 | call __ecp_nistz256_sub_from ! p256_sub(H, U2, U1); | ||
| 916 | add %sp,LOCALS+$H,$rp | ||
| 917 | |||
| 918 | or @acc[1],@acc[0],@acc[0] ! see if result is zero | ||
| 919 | or @acc[3],@acc[2],@acc[2] | ||
| 920 | or @acc[5],@acc[4],@acc[4] | ||
| 921 | or @acc[7],@acc[6],@acc[6] | ||
| 922 | or @acc[2],@acc[0],@acc[0] | ||
| 923 | or @acc[6],@acc[4],@acc[4] | ||
| 924 | orcc @acc[4],@acc[0],@acc[0] | ||
| 925 | |||
| 926 | bne,pt %icc,.Ladd_proceed ! is_equal(U1,U2)? | ||
| 927 | nop | ||
| 928 | |||
| 929 | ld [%fp+STACK_BIAS-12],$t0 | ||
| 930 | ld [%fp+STACK_BIAS-16],$t1 | ||
| 931 | ld [%fp+STACK_BIAS-20],$t2 | ||
| 932 | andcc $t0,$t1,%g0 | ||
| 933 | be,pt %icc,.Ladd_proceed ! (in1infty || in2infty)? | ||
| 934 | nop | ||
| 935 | andcc $t2,$t2,%g0 | ||
| 936 | be,pt %icc,.Ladd_double ! is_equal(S1,S2)? | ||
| 937 | nop | ||
| 938 | |||
| 939 | ldx [%fp+STACK_BIAS-8],$rp | ||
| 940 | st %g0,[$rp] | ||
| 941 | st %g0,[$rp+4] | ||
| 942 | st %g0,[$rp+8] | ||
| 943 | st %g0,[$rp+12] | ||
| 944 | st %g0,[$rp+16] | ||
| 945 | st %g0,[$rp+20] | ||
| 946 | st %g0,[$rp+24] | ||
| 947 | st %g0,[$rp+28] | ||
| 948 | st %g0,[$rp+32] | ||
| 949 | st %g0,[$rp+32+4] | ||
| 950 | st %g0,[$rp+32+8] | ||
| 951 | st %g0,[$rp+32+12] | ||
| 952 | st %g0,[$rp+32+16] | ||
| 953 | st %g0,[$rp+32+20] | ||
| 954 | st %g0,[$rp+32+24] | ||
| 955 | st %g0,[$rp+32+28] | ||
| 956 | st %g0,[$rp+64] | ||
| 957 | st %g0,[$rp+64+4] | ||
| 958 | st %g0,[$rp+64+8] | ||
| 959 | st %g0,[$rp+64+12] | ||
| 960 | st %g0,[$rp+64+16] | ||
| 961 | st %g0,[$rp+64+20] | ||
| 962 | st %g0,[$rp+64+24] | ||
| 963 | st %g0,[$rp+64+28] | ||
| 964 | b .Ladd_done | ||
| 965 | nop | ||
| 966 | |||
| 967 | .align 16 | ||
| 968 | .Ladd_double: | ||
| 969 | ldx [%fp+STACK_BIAS-8],$rp_real | ||
| 970 | mov $ap_real,$ap | ||
| 971 | b .Lpoint_double_shortcut | ||
| 972 | add %sp,32*(12-4)+32,%sp ! difference in frame sizes | ||
| 973 | |||
| 974 | .align 16 | ||
| 975 | .Ladd_proceed: | ||
| 976 | add %sp,LOCALS+$R,$bp | ||
| 977 | add %sp,LOCALS+$R,$ap | ||
| 978 | call __ecp_nistz256_mul_mont ! p256_sqr_mont(Rsqr, R); | ||
| 979 | add %sp,LOCALS+$Rsqr,$rp | ||
| 980 | |||
| 981 | add $ap_real,64,$bp | ||
| 982 | add %sp,LOCALS+$H,$ap | ||
| 983 | call __ecp_nistz256_mul_mont ! p256_mul_mont(res_z, H, in1_z); | ||
| 984 | add %sp,LOCALS+$res_z,$rp | ||
| 985 | |||
| 986 | add %sp,LOCALS+$H,$bp | ||
| 987 | add %sp,LOCALS+$H,$ap | ||
| 988 | call __ecp_nistz256_mul_mont ! p256_sqr_mont(Hsqr, H); | ||
| 989 | add %sp,LOCALS+$Hsqr,$rp | ||
| 990 | |||
| 991 | add $bp_real,64,$bp | ||
| 992 | add %sp,LOCALS+$res_z,$ap | ||
| 993 | call __ecp_nistz256_mul_mont ! p256_mul_mont(res_z, res_z, in2_z); | ||
| 994 | add %sp,LOCALS+$res_z,$rp | ||
| 995 | |||
| 996 | add %sp,LOCALS+$H,$bp | ||
| 997 | add %sp,LOCALS+$Hsqr,$ap | ||
| 998 | call __ecp_nistz256_mul_mont ! p256_mul_mont(Hcub, Hsqr, H); | ||
| 999 | add %sp,LOCALS+$Hcub,$rp | ||
| 1000 | |||
| 1001 | add %sp,LOCALS+$U1,$bp | ||
| 1002 | add %sp,LOCALS+$Hsqr,$ap | ||
| 1003 | call __ecp_nistz256_mul_mont ! p256_mul_mont(U2, U1, Hsqr); | ||
| 1004 | add %sp,LOCALS+$U2,$rp | ||
| 1005 | |||
| 1006 | call __ecp_nistz256_mul_by_2 ! p256_mul_by_2(Hsqr, U2); | ||
| 1007 | add %sp,LOCALS+$Hsqr,$rp | ||
| 1008 | |||
| 1009 | add %sp,LOCALS+$Rsqr,$bp | ||
| 1010 | call __ecp_nistz256_sub_morf ! p256_sub(res_x, Rsqr, Hsqr); | ||
| 1011 | add %sp,LOCALS+$res_x,$rp | ||
| 1012 | |||
| 1013 | add %sp,LOCALS+$Hcub,$bp | ||
| 1014 | call __ecp_nistz256_sub_from ! p256_sub(res_x, res_x, Hcub); | ||
| 1015 | add %sp,LOCALS+$res_x,$rp | ||
| 1016 | |||
| 1017 | add %sp,LOCALS+$U2,$bp | ||
| 1018 | call __ecp_nistz256_sub_morf ! p256_sub(res_y, U2, res_x); | ||
| 1019 | add %sp,LOCALS+$res_y,$rp | ||
| 1020 | |||
| 1021 | add %sp,LOCALS+$Hcub,$bp | ||
| 1022 | add %sp,LOCALS+$S1,$ap | ||
| 1023 | call __ecp_nistz256_mul_mont ! p256_mul_mont(S2, S1, Hcub); | ||
| 1024 | add %sp,LOCALS+$S2,$rp | ||
| 1025 | |||
| 1026 | add %sp,LOCALS+$R,$bp | ||
| 1027 | add %sp,LOCALS+$res_y,$ap | ||
| 1028 | call __ecp_nistz256_mul_mont ! p256_mul_mont(res_y, res_y, R); | ||
| 1029 | add %sp,LOCALS+$res_y,$rp | ||
| 1030 | |||
| 1031 | add %sp,LOCALS+$S2,$bp | ||
| 1032 | call __ecp_nistz256_sub_from ! p256_sub(res_y, res_y, S2); | ||
| 1033 | add %sp,LOCALS+$res_y,$rp | ||
| 1034 | |||
| 1035 | ld [%fp+STACK_BIAS-16],$t1 ! !in1infty | ||
| 1036 | ld [%fp+STACK_BIAS-12],$t2 ! !in2infty | ||
| 1037 | ldx [%fp+STACK_BIAS-8],$rp | ||
| 1038 | ___ | ||
| 1039 | for($i=0;$i<96;$i+=8) { # conditional moves | ||
| 1040 | $code.=<<___; | ||
| 1041 | ld [%sp+LOCALS+$i],@acc[0] ! res | ||
| 1042 | ld [%sp+LOCALS+$i+4],@acc[1] | ||
| 1043 | ld [$bp_real+$i],@acc[2] ! in2 | ||
| 1044 | ld [$bp_real+$i+4],@acc[3] | ||
| 1045 | ld [$ap_real+$i],@acc[4] ! in1 | ||
| 1046 | ld [$ap_real+$i+4],@acc[5] | ||
| 1047 | movrz $t1,@acc[2],@acc[0] | ||
| 1048 | movrz $t1,@acc[3],@acc[1] | ||
| 1049 | movrz $t2,@acc[4],@acc[0] | ||
| 1050 | movrz $t2,@acc[5],@acc[1] | ||
| 1051 | st @acc[0],[$rp+$i] | ||
| 1052 | st @acc[1],[$rp+$i+4] | ||
| 1053 | ___ | ||
| 1054 | } | ||
| 1055 | $code.=<<___; | ||
| 1056 | .Ladd_done: | ||
| 1057 | ret | ||
| 1058 | restore | ||
| 1059 | .type ecp_nistz256_point_add,#function | ||
| 1060 | .size ecp_nistz256_point_add,.-ecp_nistz256_point_add | ||
| 1061 | ___ | ||
| 1062 | } | ||
| 1063 | |||
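The branch ladder before .Ladd_proceed implements the doubling and infinity special cases of the addition law. A C sketch of that logic, with hypothetical names: H = U2-U1 and R = S2-S1 have already been computed, and in1ok/in2ok are the saved !in1infty/!in2infty masks.

```c
#include <stdint.h>
#include <string.h>

typedef uint32_t BN_ULONG;
typedef struct { BN_ULONG X[8], Y[8], Z[8]; } P256_POINT;

void point_double_shortcut(P256_POINT *out, const P256_POINT *in);

static int is_zero(const BN_ULONG a[8])
{
	BN_ULONG acc = 0;
	for (int i = 0; i < 8; i++)
		acc |= a[i];
	return acc == 0;
}

/* returns 1 if a special case consumed the operation */
static int add_special_cases(P256_POINT *out, const P256_POINT *in1,
    const BN_ULONG H[8], const BN_ULONG R[8],
    BN_ULONG in1ok, BN_ULONG in2ok)
{
	if (is_zero(H) && in1ok && in2ok) {	/* U1 == U2, both finite */
		if (is_zero(R)) {		/* S1 == S2: P == Q */
			point_double_shortcut(out, in1);
			return 1;		/* .Ladd_double */
		}
		memset(out, 0, sizeof(*out));	/* P == -Q: infinity */
		return 1;
	}
	return 0;	/* .Ladd_proceed: generic addition; an infinite
			 * input is fixed up by the movrz loop at the end */
}
```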
| 1064 | ######################################################################## | ||
| 1065 | # void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1, | ||
| 1066 | # const P256_POINT_AFFINE *in2); | ||
| 1067 | { | ||
| 1068 | my ($res_x,$res_y,$res_z, | ||
| 1069 | $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..9)); | ||
| 1070 | my $Z1sqr = $S2; | ||
| 1071 | # above map() describes stack layout with 10 temporary | ||
| 1072 | # 256-bit vectors on top. Then we reserve some space for | ||
| 1073 | # !in1infty, !in2infty, result of check for zero and return pointer. | ||
| 1074 | |||
| 1075 | my @ONE_mont=(1,0,0,-1,-1,-1,-2,0); | ||
| 1076 | my $bp_real=$rp_real; | ||
| 1077 | |||
| 1078 | $code.=<<___; | ||
| 1079 | .globl ecp_nistz256_point_add_affine | ||
| 1080 | .align 32 | ||
| 1081 | ecp_nistz256_point_add_affine: | ||
| 1082 | #if 0 | ||
| 1083 | SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5) | ||
| 1084 | ld [%g1],%g1 ! OPENSSL_sparcv9cap_P[0] | ||
| 1085 | and %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK),%g1 | ||
| 1086 | cmp %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK) | ||
| 1087 | be ecp_nistz256_point_add_affine_vis3 | ||
| 1088 | nop | ||
| 1089 | #endif | ||
| 1090 | |||
| 1091 | save %sp,-STACK_FRAME-32*10-32,%sp | ||
| 1092 | |||
| 1093 | stx $rp,[%fp+STACK_BIAS-8] ! off-load $rp | ||
| 1094 | mov $ap,$ap_real | ||
| 1095 | mov $bp,$bp_real | ||
| 1096 | |||
| 1097 | ld [$ap+64],$t0 ! in1_z | ||
| 1098 | ld [$ap+64+4],$t1 | ||
| 1099 | ld [$ap+64+8],$t2 | ||
| 1100 | ld [$ap+64+12],$t3 | ||
| 1101 | ld [$ap+64+16],$t4 | ||
| 1102 | ld [$ap+64+20],$t5 | ||
| 1103 | ld [$ap+64+24],$t6 | ||
| 1104 | ld [$ap+64+28],$t7 | ||
| 1105 | or $t1,$t0,$t0 | ||
| 1106 | or $t3,$t2,$t2 | ||
| 1107 | or $t5,$t4,$t4 | ||
| 1108 | or $t7,$t6,$t6 | ||
| 1109 | or $t2,$t0,$t0 | ||
| 1110 | or $t6,$t4,$t4 | ||
| 1111 | or $t4,$t0,$t0 ! !in1infty | ||
| 1112 | movrnz $t0,-1,$t0 | ||
| 1113 | st $t0,[%fp+STACK_BIAS-16] | ||
| 1114 | |||
| 1115 | ld [$bp],@acc[0] ! in2_x | ||
| 1116 | ld [$bp+4],@acc[1] | ||
| 1117 | ld [$bp+8],@acc[2] | ||
| 1118 | ld [$bp+12],@acc[3] | ||
| 1119 | ld [$bp+16],@acc[4] | ||
| 1120 | ld [$bp+20],@acc[5] | ||
| 1121 | ld [$bp+24],@acc[6] | ||
| 1122 | ld [$bp+28],@acc[7] | ||
| 1123 | ld [$bp+32],$t0 ! in2_y | ||
| 1124 | ld [$bp+32+4],$t1 | ||
| 1125 | ld [$bp+32+8],$t2 | ||
| 1126 | ld [$bp+32+12],$t3 | ||
| 1127 | ld [$bp+32+16],$t4 | ||
| 1128 | ld [$bp+32+20],$t5 | ||
| 1129 | ld [$bp+32+24],$t6 | ||
| 1130 | ld [$bp+32+28],$t7 | ||
| 1131 | or @acc[1],@acc[0],@acc[0] | ||
| 1132 | or @acc[3],@acc[2],@acc[2] | ||
| 1133 | or @acc[5],@acc[4],@acc[4] | ||
| 1134 | or @acc[7],@acc[6],@acc[6] | ||
| 1135 | or @acc[2],@acc[0],@acc[0] | ||
| 1136 | or @acc[6],@acc[4],@acc[4] | ||
| 1137 | or @acc[4],@acc[0],@acc[0] | ||
| 1138 | or $t1,$t0,$t0 | ||
| 1139 | or $t3,$t2,$t2 | ||
| 1140 | or $t5,$t4,$t4 | ||
| 1141 | or $t7,$t6,$t6 | ||
| 1142 | or $t2,$t0,$t0 | ||
| 1143 | or $t6,$t4,$t4 | ||
| 1144 | or $t4,$t0,$t0 | ||
| 1145 | or @acc[0],$t0,$t0 ! !in2infty | ||
| 1146 | movrnz $t0,-1,$t0 | ||
| 1147 | st $t0,[%fp+STACK_BIAS-12] | ||
| 1148 | |||
| 1149 | add $ap_real,64,$bp | ||
| 1150 | add $ap_real,64,$ap | ||
| 1151 | call __ecp_nistz256_mul_mont ! p256_sqr_mont(Z1sqr, in1_z); | ||
| 1152 | add %sp,LOCALS+$Z1sqr,$rp | ||
| 1153 | |||
| 1154 | add $bp_real,0,$bp | ||
| 1155 | add %sp,LOCALS+$Z1sqr,$ap | ||
| 1156 | call __ecp_nistz256_mul_mont ! p256_mul_mont(U2, Z1sqr, in2_x); | ||
| 1157 | add %sp,LOCALS+$U2,$rp | ||
| 1158 | |||
| 1159 | add $ap_real,0,$bp | ||
| 1160 | call __ecp_nistz256_sub_from ! p256_sub(H, U2, in1_x); | ||
| 1161 | add %sp,LOCALS+$H,$rp | ||
| 1162 | |||
| 1163 | add $ap_real,64,$bp | ||
| 1164 | add %sp,LOCALS+$Z1sqr,$ap | ||
| 1165 | call __ecp_nistz256_mul_mont ! p256_mul_mont(S2, Z1sqr, in1_z); | ||
| 1166 | add %sp,LOCALS+$S2,$rp | ||
| 1167 | |||
| 1168 | add $ap_real,64,$bp | ||
| 1169 | add %sp,LOCALS+$H,$ap | ||
| 1170 | call __ecp_nistz256_mul_mont ! p256_mul_mont(res_z, H, in1_z); | ||
| 1171 | add %sp,LOCALS+$res_z,$rp | ||
| 1172 | |||
| 1173 | add $bp_real,32,$bp | ||
| 1174 | add %sp,LOCALS+$S2,$ap | ||
| 1175 | call __ecp_nistz256_mul_mont ! p256_mul_mont(S2, S2, in2_y); | ||
| 1176 | add %sp,LOCALS+$S2,$rp | ||
| 1177 | |||
| 1178 | add $ap_real,32,$bp | ||
| 1179 | call __ecp_nistz256_sub_from ! p256_sub(R, S2, in1_y); | ||
| 1180 | add %sp,LOCALS+$R,$rp | ||
| 1181 | |||
| 1182 | add %sp,LOCALS+$H,$bp | ||
| 1183 | add %sp,LOCALS+$H,$ap | ||
| 1184 | call __ecp_nistz256_mul_mont ! p256_sqr_mont(Hsqr, H); | ||
| 1185 | add %sp,LOCALS+$Hsqr,$rp | ||
| 1186 | |||
| 1187 | add %sp,LOCALS+$R,$bp | ||
| 1188 | add %sp,LOCALS+$R,$ap | ||
| 1189 | call __ecp_nistz256_mul_mont ! p256_sqr_mont(Rsqr, R); | ||
| 1190 | add %sp,LOCALS+$Rsqr,$rp | ||
| 1191 | |||
| 1192 | add %sp,LOCALS+$H,$bp | ||
| 1193 | add %sp,LOCALS+$Hsqr,$ap | ||
| 1194 | call __ecp_nistz256_mul_mont ! p256_mul_mont(Hcub, Hsqr, H); | ||
| 1195 | add %sp,LOCALS+$Hcub,$rp | ||
| 1196 | |||
| 1197 | add $ap_real,0,$bp | ||
| 1198 | add %sp,LOCALS+$Hsqr,$ap | ||
| 1199 | call __ecp_nistz256_mul_mont ! p256_mul_mont(U2, in1_x, Hsqr); | ||
| 1200 | add %sp,LOCALS+$U2,$rp | ||
| 1201 | |||
| 1202 | call __ecp_nistz256_mul_by_2 ! p256_mul_by_2(Hsqr, U2); | ||
| 1203 | add %sp,LOCALS+$Hsqr,$rp | ||
| 1204 | |||
| 1205 | add %sp,LOCALS+$Rsqr,$bp | ||
| 1206 | call __ecp_nistz256_sub_morf ! p256_sub(res_x, Rsqr, Hsqr); | ||
| 1207 | add %sp,LOCALS+$res_x,$rp | ||
| 1208 | |||
| 1209 | add %sp,LOCALS+$Hcub,$bp | ||
| 1210 | call __ecp_nistz256_sub_from ! p256_sub(res_x, res_x, Hcub); | ||
| 1211 | add %sp,LOCALS+$res_x,$rp | ||
| 1212 | |||
| 1213 | add %sp,LOCALS+$U2,$bp | ||
| 1214 | call __ecp_nistz256_sub_morf ! p256_sub(res_y, U2, res_x); | ||
| 1215 | add %sp,LOCALS+$res_y,$rp | ||
| 1216 | |||
| 1217 | add $ap_real,32,$bp | ||
| 1218 | add %sp,LOCALS+$Hcub,$ap | ||
| 1219 | call __ecp_nistz256_mul_mont ! p256_mul_mont(S2, in1_y, Hcub); | ||
| 1220 | add %sp,LOCALS+$S2,$rp | ||
| 1221 | |||
| 1222 | add %sp,LOCALS+$R,$bp | ||
| 1223 | add %sp,LOCALS+$res_y,$ap | ||
| 1224 | call __ecp_nistz256_mul_mont ! p256_mul_mont(res_y, res_y, R); | ||
| 1225 | add %sp,LOCALS+$res_y,$rp | ||
| 1226 | |||
| 1227 | add %sp,LOCALS+$S2,$bp | ||
| 1228 | call __ecp_nistz256_sub_from ! p256_sub(res_y, res_y, S2); | ||
| 1229 | add %sp,LOCALS+$res_y,$rp | ||
| 1230 | |||
| 1231 | ld [%fp+STACK_BIAS-16],$t1 ! !in1infty | ||
| 1232 | ld [%fp+STACK_BIAS-12],$t2 ! !in2infty | ||
| 1233 | ldx [%fp+STACK_BIAS-8],$rp | ||
| 1234 | ___ | ||
| 1235 | for($i=0;$i<64;$i+=8) { # conditional moves | ||
| 1236 | $code.=<<___; | ||
| 1237 | ld [%sp+LOCALS+$i],@acc[0] ! res | ||
| 1238 | ld [%sp+LOCALS+$i+4],@acc[1] | ||
| 1239 | ld [$bp_real+$i],@acc[2] ! in2 | ||
| 1240 | ld [$bp_real+$i+4],@acc[3] | ||
| 1241 | ld [$ap_real+$i],@acc[4] ! in1 | ||
| 1242 | ld [$ap_real+$i+4],@acc[5] | ||
| 1243 | movrz $t1,@acc[2],@acc[0] | ||
| 1244 | movrz $t1,@acc[3],@acc[1] | ||
| 1245 | movrz $t2,@acc[4],@acc[0] | ||
| 1246 | movrz $t2,@acc[5],@acc[1] | ||
| 1247 | st @acc[0],[$rp+$i] | ||
| 1248 | st @acc[1],[$rp+$i+4] | ||
| 1249 | ___ | ||
| 1250 | } | ||
| 1251 | for(;$i<96;$i+=8) { | ||
| 1252 | my $j=($i-64)/4; | ||
| 1253 | $code.=<<___; | ||
| 1254 | ld [%sp+LOCALS+$i],@acc[0] ! res | ||
| 1255 | ld [%sp+LOCALS+$i+4],@acc[1] | ||
| 1256 | ld [$ap_real+$i],@acc[4] ! in1 | ||
| 1257 | ld [$ap_real+$i+4],@acc[5] | ||
| 1258 | movrz $t1,@ONE_mont[$j],@acc[0] | ||
| 1259 | movrz $t1,@ONE_mont[$j+1],@acc[1] | ||
| 1260 | movrz $t2,@acc[4],@acc[0] | ||
| 1261 | movrz $t2,@acc[5],@acc[1] | ||
| 1262 | st @acc[0],[$rp+$i] | ||
| 1263 | st @acc[1],[$rp+$i+4] | ||
| 1264 | ___ | ||
| 1265 | } | ||
| 1266 | $code.=<<___; | ||
| 1267 | ret | ||
| 1268 | restore | ||
| 1269 | .type ecp_nistz256_point_add_affine,#function | ||
| 1270 | .size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine | ||
| 1271 | ___ | ||
| 1272 | } }}} | ||
| 1273 | {{{ | ||
| 1274 | my ($out,$inp,$index)=map("%i$_",(0..2)); | ||
| 1275 | my $mask="%o0"; | ||
| 1276 | |||
| 1277 | $code.=<<___; | ||
| 1278 | ! void ecp_nistz256_select_w5(P256_POINT *%i0,const void *%i1, | ||
| 1279 | ! int %i2); | ||
| 1280 | .globl ecp_nistz256_select_w5 | ||
| 1281 | .align 32 | ||
| 1282 | ecp_nistz256_select_w5: | ||
| 1283 | save %sp,-STACK_FRAME,%sp | ||
| 1284 | |||
| 1285 | neg $index,$mask | ||
| 1286 | srax $mask,63,$mask | ||
| 1287 | |||
| 1288 | add $index,$mask,$index | ||
| 1289 | sll $index,2,$index | ||
| 1290 | add $inp,$index,$inp | ||
| 1291 | |||
| 1292 | ld [$inp+64*0],%l0 | ||
| 1293 | ld [$inp+64*1],%l1 | ||
| 1294 | ld [$inp+64*2],%l2 | ||
| 1295 | ld [$inp+64*3],%l3 | ||
| 1296 | ld [$inp+64*4],%l4 | ||
| 1297 | ld [$inp+64*5],%l5 | ||
| 1298 | ld [$inp+64*6],%l6 | ||
| 1299 | ld [$inp+64*7],%l7 | ||
| 1300 | add $inp,64*8,$inp | ||
| 1301 | and %l0,$mask,%l0 | ||
| 1302 | and %l1,$mask,%l1 | ||
| 1303 | st %l0,[$out] ! X | ||
| 1304 | and %l2,$mask,%l2 | ||
| 1305 | st %l1,[$out+4] | ||
| 1306 | and %l3,$mask,%l3 | ||
| 1307 | st %l2,[$out+8] | ||
| 1308 | and %l4,$mask,%l4 | ||
| 1309 | st %l3,[$out+12] | ||
| 1310 | and %l5,$mask,%l5 | ||
| 1311 | st %l4,[$out+16] | ||
| 1312 | and %l6,$mask,%l6 | ||
| 1313 | st %l5,[$out+20] | ||
| 1314 | and %l7,$mask,%l7 | ||
| 1315 | st %l6,[$out+24] | ||
| 1316 | st %l7,[$out+28] | ||
| 1317 | add $out,32,$out | ||
| 1318 | |||
| 1319 | ld [$inp+64*0],%l0 | ||
| 1320 | ld [$inp+64*1],%l1 | ||
| 1321 | ld [$inp+64*2],%l2 | ||
| 1322 | ld [$inp+64*3],%l3 | ||
| 1323 | ld [$inp+64*4],%l4 | ||
| 1324 | ld [$inp+64*5],%l5 | ||
| 1325 | ld [$inp+64*6],%l6 | ||
| 1326 | ld [$inp+64*7],%l7 | ||
| 1327 | add $inp,64*8,$inp | ||
| 1328 | and %l0,$mask,%l0 | ||
| 1329 | and %l1,$mask,%l1 | ||
| 1330 | st %l0,[$out] ! Y | ||
| 1331 | and %l2,$mask,%l2 | ||
| 1332 | st %l1,[$out+4] | ||
| 1333 | and %l3,$mask,%l3 | ||
| 1334 | st %l2,[$out+8] | ||
| 1335 | and %l4,$mask,%l4 | ||
| 1336 | st %l3,[$out+12] | ||
| 1337 | and %l5,$mask,%l5 | ||
| 1338 | st %l4,[$out+16] | ||
| 1339 | and %l6,$mask,%l6 | ||
| 1340 | st %l5,[$out+20] | ||
| 1341 | and %l7,$mask,%l7 | ||
| 1342 | st %l6,[$out+24] | ||
| 1343 | st %l7,[$out+28] | ||
| 1344 | add $out,32,$out | ||
| 1345 | |||
| 1346 | ld [$inp+64*0],%l0 | ||
| 1347 | ld [$inp+64*1],%l1 | ||
| 1348 | ld [$inp+64*2],%l2 | ||
| 1349 | ld [$inp+64*3],%l3 | ||
| 1350 | ld [$inp+64*4],%l4 | ||
| 1351 | ld [$inp+64*5],%l5 | ||
| 1352 | ld [$inp+64*6],%l6 | ||
| 1353 | ld [$inp+64*7],%l7 | ||
| 1354 | and %l0,$mask,%l0 | ||
| 1355 | and %l1,$mask,%l1 | ||
| 1356 | st %l0,[$out] ! Z | ||
| 1357 | and %l2,$mask,%l2 | ||
| 1358 | st %l1,[$out+4] | ||
| 1359 | and %l3,$mask,%l3 | ||
| 1360 | st %l2,[$out+8] | ||
| 1361 | and %l4,$mask,%l4 | ||
| 1362 | st %l3,[$out+12] | ||
| 1363 | and %l5,$mask,%l5 | ||
| 1364 | st %l4,[$out+16] | ||
| 1365 | and %l6,$mask,%l6 | ||
| 1366 | st %l5,[$out+20] | ||
| 1367 | and %l7,$mask,%l7 | ||
| 1368 | st %l6,[$out+24] | ||
| 1369 | st %l7,[$out+28] | ||
| 1370 | |||
| 1371 | ret | ||
| 1372 | restore | ||
| 1373 | .type ecp_nistz256_select_w5,#function | ||
| 1374 | .size ecp_nistz256_select_w5,.-ecp_nistz256_select_w5 | ||
| 1375 | |||
| 1376 | ! void ecp_nistz256_select_w7(P256_POINT_AFFINE *%i0,const void *%i1, | ||
| 1377 | ! int %i2); | ||
| 1378 | .globl ecp_nistz256_select_w7 | ||
| 1379 | .align 32 | ||
| 1380 | ecp_nistz256_select_w7: | ||
| 1381 | save %sp,-STACK_FRAME,%sp | ||
| 1382 | |||
| 1383 | neg $index,$mask | ||
| 1384 | srax $mask,63,$mask | ||
| 1385 | |||
| 1386 | add $index,$mask,$index | ||
| 1387 | add $inp,$index,$inp | ||
| 1388 | mov 64/4,$index | ||
| 1389 | |||
| 1390 | .Loop_select_w7: | ||
| 1391 | ldub [$inp+64*0],%l0 | ||
| 1392 | prefetch [$inp+3840+64*0],1 | ||
| 1393 | subcc $index,1,$index | ||
| 1394 | ldub [$inp+64*1],%l1 | ||
| 1395 | prefetch [$inp+3840+64*1],1 | ||
| 1396 | ldub [$inp+64*2],%l2 | ||
| 1397 | prefetch [$inp+3840+64*2],1 | ||
| 1398 | ldub [$inp+64*3],%l3 | ||
| 1399 | prefetch [$inp+3840+64*3],1 | ||
| 1400 | add $inp,64*4,$inp | ||
| 1401 | sll %l1,8,%l1 | ||
| 1402 | sll %l2,16,%l2 | ||
| 1403 | or %l0,%l1,%l0 | ||
| 1404 | sll %l3,24,%l3 | ||
| 1405 | or %l0,%l2,%l0 | ||
| 1406 | or %l0,%l3,%l0 | ||
| 1407 | and %l0,$mask,%l0 | ||
| 1408 | st %l0,[$out] | ||
| 1409 | bne .Loop_select_w7 | ||
| 1410 | add $out,4,$out | ||
| 1411 | |||
| 1412 | ret | ||
| 1413 | restore | ||
| 1414 | .type ecp_nistz256_select_w7,#function | ||
| 1415 | .size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7 | ||
| 1416 | ___ | ||
| 1417 | }}} | ||
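Both select routines above build their mask branchlessly: `neg` followed by `srax` by 63 turns any positive index into all-ones and leaves index 0 as zero, so index 0 reads back as the point at infinity rather than a table entry. A minimal sketch, assuming a 64-bit perl (where `use integer` makes `>>` sign-extend like `srax`):

```perl
# mask = (-index) >> 63 with an arithmetic shift: -1 for index > 0, else 0.
use strict;
use warnings;
use integer;    # signed semantics: >> sign-extends, like srax

for my $index (0, 1, 16) {
    my $mask = (-$index) >> 63;          # neg + srax
    my $word = 0xdeadbeef & $mask;       # and %lN,$mask,%lN
    printf "index=%2d word=%08x\n", $index, $word & 0xffffffff;
}
```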
| 1418 | {{{ | ||
| 1419 | ######################################################################## | ||
| 1420 | # Following subroutines are VIS3 counterparts of those above that | ||
| 1421 | # implement the ones found in ecp_nistz256.c. The key difference is | ||
| 1422 | # that they use 128-bit multiplication and addition with 64-bit carry, | ||
| 1423 | # and in order to do that they convert from uint32_t[8] to uint64_t[4] | ||
| 1424 | # upon entry and back again on return. | ||
| 1425 | # | ||
| 1426 | my ($rp,$ap,$bp)=map("%i$_",(0..2)); | ||
| 1427 | my ($t0,$t1,$t2,$t3,$a0,$a1,$a2,$a3)=map("%l$_",(0..7)); | ||
| 1428 | my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5)=map("%o$_",(0..5)); | ||
| 1429 | my ($bi,$poly1,$poly3,$minus1)=(map("%i$_",(3..5)),"%g1"); | ||
| 1430 | my ($rp_real,$ap_real)=("%g2","%g3"); | ||
| 1431 | my ($acc6,$acc7)=($bp,$bi); # used in squaring | ||
| 1432 | |||
| 1433 | $code.=<<___; | ||
| 1434 | #if 0 | ||
| 1435 | .align 32 | ||
| 1436 | __ecp_nistz256_mul_by_2_vis3: | ||
| 1437 | addcc $acc0,$acc0,$acc0 | ||
| 1438 | addxccc $acc1,$acc1,$acc1 | ||
| 1439 | addxccc $acc2,$acc2,$acc2 | ||
| 1440 | addxccc $acc3,$acc3,$acc3 | ||
| 1441 | b .Lreduce_by_sub_vis3 | ||
| 1442 | addxc %g0,%g0,$acc4 ! did it carry? | ||
| 1443 | .type __ecp_nistz256_mul_by_2_vis3,#function | ||
| 1444 | .size __ecp_nistz256_mul_by_2_vis3,.-__ecp_nistz256_mul_by_2_vis3 | ||
| 1445 | |||
| 1446 | .align 32 | ||
| 1447 | __ecp_nistz256_add_vis3: | ||
| 1448 | ldx [$bp+0],$t0 | ||
| 1449 | ldx [$bp+8],$t1 | ||
| 1450 | ldx [$bp+16],$t2 | ||
| 1451 | ldx [$bp+24],$t3 | ||
| 1452 | |||
| 1453 | __ecp_nistz256_add_noload_vis3: | ||
| 1454 | |||
| 1455 | addcc $t0,$acc0,$acc0 | ||
| 1456 | addxccc $t1,$acc1,$acc1 | ||
| 1457 | addxccc $t2,$acc2,$acc2 | ||
| 1458 | addxccc $t3,$acc3,$acc3 | ||
| 1459 | addxc %g0,%g0,$acc4 ! did it carry? | ||
| 1460 | |||
| 1461 | .Lreduce_by_sub_vis3: | ||
| 1462 | |||
| 1463 | addcc $acc0,1,$t0 ! add -modulus, i.e. subtract | ||
| 1464 | addxccc $acc1,$poly1,$t1 | ||
| 1465 | addxccc $acc2,$minus1,$t2 | ||
| 1466 | addxccc $acc3,$poly3,$t3 | ||
| 1467 | addxc $acc4,$minus1,$acc4 | ||
| 1468 | |||
| 1469 | movrz $acc4,$t0,$acc0 ! ret = borrow ? ret : ret-modulus | ||
| 1470 | movrz $acc4,$t1,$acc1 | ||
| 1471 | stx $acc0,[$rp] | ||
| 1472 | movrz $acc4,$t2,$acc2 | ||
| 1473 | stx $acc1,[$rp+8] | ||
| 1474 | movrz $acc4,$t3,$acc3 | ||
| 1475 | stx $acc2,[$rp+16] | ||
| 1476 | retl | ||
| 1477 | stx $acc3,[$rp+24] | ||
| 1478 | .type __ecp_nistz256_add_vis3,#function | ||
| 1479 | .size __ecp_nistz256_add_vis3,.-__ecp_nistz256_add_vis3 | ||
| 1480 | |||
| 1481 | ! The trouble with subtraction is that there is no subtract with 64-bit | ||
| 1482 | ! borrow, only a 32-bit one. For this reason we "decompose" the 64-bit | ||
| 1483 | ! $acc0-$acc3 into 32-bit values and pick up b[4] in 32-bit pieces. But | ||
| 1484 | ! recall that SPARC is big-endian, which is why you'll observe that | ||
| 1485 | ! b[4] is accessed as 4-0-12-8-20-16-28-24. And prior to reduction we | ||
| 1486 | ! "collect" the result back into 64-bit $acc0-$acc3 (modeled below). | ||
| 1487 | .align 32 | ||
| 1488 | __ecp_nistz256_sub_from_vis3: | ||
| 1489 | ld [$bp+4],$t0 | ||
| 1490 | ld [$bp+0],$t1 | ||
| 1491 | ld [$bp+12],$t2 | ||
| 1492 | ld [$bp+8],$t3 | ||
| 1493 | |||
| 1494 | srlx $acc0,32,$acc4 | ||
| 1495 | not $poly1,$poly1 | ||
| 1496 | srlx $acc1,32,$acc5 | ||
| 1497 | subcc $acc0,$t0,$acc0 | ||
| 1498 | ld [$bp+20],$t0 | ||
| 1499 | subccc $acc4,$t1,$acc4 | ||
| 1500 | ld [$bp+16],$t1 | ||
| 1501 | subccc $acc1,$t2,$acc1 | ||
| 1502 | ld [$bp+28],$t2 | ||
| 1503 | and $acc0,$poly1,$acc0 | ||
| 1504 | subccc $acc5,$t3,$acc5 | ||
| 1505 | ld [$bp+24],$t3 | ||
| 1506 | sllx $acc4,32,$acc4 | ||
| 1507 | and $acc1,$poly1,$acc1 | ||
| 1508 | sllx $acc5,32,$acc5 | ||
| 1509 | or $acc0,$acc4,$acc0 | ||
| 1510 | srlx $acc2,32,$acc4 | ||
| 1511 | or $acc1,$acc5,$acc1 | ||
| 1512 | srlx $acc3,32,$acc5 | ||
| 1513 | subccc $acc2,$t0,$acc2 | ||
| 1514 | subccc $acc4,$t1,$acc4 | ||
| 1515 | subccc $acc3,$t2,$acc3 | ||
| 1516 | and $acc2,$poly1,$acc2 | ||
| 1517 | subccc $acc5,$t3,$acc5 | ||
| 1518 | sllx $acc4,32,$acc4 | ||
| 1519 | and $acc3,$poly1,$acc3 | ||
| 1520 | sllx $acc5,32,$acc5 | ||
| 1521 | or $acc2,$acc4,$acc2 | ||
| 1522 | subc %g0,%g0,$acc4 ! did it borrow? | ||
| 1523 | b .Lreduce_by_add_vis3 | ||
| 1524 | or $acc3,$acc5,$acc3 | ||
| 1525 | .type __ecp_nistz256_sub_from_vis3,#function | ||
| 1526 | .size __ecp_nistz256_sub_from_vis3,.-__ecp_nistz256_sub_from_vis3 | ||
| 1527 | |||
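The scheme described before `__ecp_nistz256_sub_from_vis3` can be cross-checked in a few lines of host Perl: subtract in 32-bit pieces with a propagated borrow, then add the modulus back when the top borrow is set (`.Lreduce_by_add_vis3`). A minimal sketch; Math::BigInt serves only as the reference, and plain little-endian word order is used for readability (the 4-0-12-8 access pattern in the assembly is a big-endian memory detail):

```perl
use strict;
use warnings;
use Math::BigInt;

my $p = Math::BigInt->from_hex(
    'ffffffff00000001000000000000000000000000ffffffffffffffffffffffff');

sub int2words {    # split into eight 32-bit words, least significant first
    my $x = $_[0]->copy;
    return [ map { my $w = ($x & 0xffffffff)->numify; $x >>= 32; $w } 0 .. 7 ];
}
sub words2int {    # reassemble words into one integer
    my $x = Math::BigInt->new(0);
    $x = ($x << 32) + $_ for reverse @{ $_[0] };
    return $x;
}

my $a = Math::BigInt->new(12345);                  # a < b forces a borrow
my $b = Math::BigInt->from_hex('1' . '0' x 40);    # 2^160
my ($aw, $bw) = (int2words($a), int2words($b));

my ($borrow, @r) = (0);
for my $i (0 .. 7) {                               # the subcc/subccc chain
    my $d = $aw->[$i] - $bw->[$i] - $borrow;
    $borrow = $d < 0 ? 1 : 0;
    $r[$i] = $d % 2**32;
}
my $res = words2int(\@r);
$res = ($res + $p) % (Math::BigInt->new(2)**256) if $borrow;   # add mod back
print $res == ($a - $b)->bmod($p) ? "ok\n" : "mismatch\n";
```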
| 1528 | .align 32 | ||
| 1529 | __ecp_nistz256_sub_morf_vis3: | ||
| 1530 | ld [$bp+4],$t0 | ||
| 1531 | ld [$bp+0],$t1 | ||
| 1532 | ld [$bp+12],$t2 | ||
| 1533 | ld [$bp+8],$t3 | ||
| 1534 | |||
| 1535 | srlx $acc0,32,$acc4 | ||
| 1536 | not $poly1,$poly1 | ||
| 1537 | srlx $acc1,32,$acc5 | ||
| 1538 | subcc $t0,$acc0,$acc0 | ||
| 1539 | ld [$bp+20],$t0 | ||
| 1540 | subccc $t1,$acc4,$acc4 | ||
| 1541 | ld [$bp+16],$t1 | ||
| 1542 | subccc $t2,$acc1,$acc1 | ||
| 1543 | ld [$bp+28],$t2 | ||
| 1544 | and $acc0,$poly1,$acc0 | ||
| 1545 | subccc $t3,$acc5,$acc5 | ||
| 1546 | ld [$bp+24],$t3 | ||
| 1547 | sllx $acc4,32,$acc4 | ||
| 1548 | and $acc1,$poly1,$acc1 | ||
| 1549 | sllx $acc5,32,$acc5 | ||
| 1550 | or $acc0,$acc4,$acc0 | ||
| 1551 | srlx $acc2,32,$acc4 | ||
| 1552 | or $acc1,$acc5,$acc1 | ||
| 1553 | srlx $acc3,32,$acc5 | ||
| 1554 | subccc $t0,$acc2,$acc2 | ||
| 1555 | subccc $t1,$acc4,$acc4 | ||
| 1556 | subccc $t2,$acc3,$acc3 | ||
| 1557 | and $acc2,$poly1,$acc2 | ||
| 1558 | subccc $t3,$acc5,$acc5 | ||
| 1559 | sllx $acc4,32,$acc4 | ||
| 1560 | and $acc3,$poly1,$acc3 | ||
| 1561 | sllx $acc5,32,$acc5 | ||
| 1562 | or $acc2,$acc4,$acc2 | ||
| 1563 | subc %g0,%g0,$acc4 ! did it borrow? | ||
| 1564 | or $acc3,$acc5,$acc3 | ||
| 1565 | |||
| 1566 | .Lreduce_by_add_vis3: | ||
| 1567 | |||
| 1568 | addcc $acc0,-1,$t0 ! add modulus | ||
| 1569 | not $poly3,$t3 | ||
| 1570 | addxccc $acc1,$poly1,$t1 | ||
| 1571 | not $poly1,$poly1 ! restore $poly1 | ||
| 1572 | addxccc $acc2,%g0,$t2 | ||
| 1573 | addxc $acc3,$t3,$t3 | ||
| 1574 | |||
| 1575 | movrnz $acc4,$t0,$acc0 ! if a-b borrowed, ret = ret+mod | ||
| 1576 | movrnz $acc4,$t1,$acc1 | ||
| 1577 | stx $acc0,[$rp] | ||
| 1578 | movrnz $acc4,$t2,$acc2 | ||
| 1579 | stx $acc1,[$rp+8] | ||
| 1580 | movrnz $acc4,$t3,$acc3 | ||
| 1581 | stx $acc2,[$rp+16] | ||
| 1582 | retl | ||
| 1583 | stx $acc3,[$rp+24] | ||
| 1584 | .type __ecp_nistz256_sub_morf_vis3,#function | ||
| 1585 | .size __ecp_nistz256_sub_morf_vis3,.-__ecp_nistz256_sub_morf_vis3 | ||
| 1586 | |||
| 1587 | .align 32 | ||
| 1588 | __ecp_nistz256_div_by_2_vis3: | ||
| 1589 | ! ret = (a is odd ? a+mod : a) >> 1 | ||
| 1590 | |||
| 1591 | not $poly1,$t1 | ||
| 1592 | not $poly3,$t3 | ||
| 1593 | and $acc0,1,$acc5 | ||
| 1594 | addcc $acc0,-1,$t0 ! add modulus | ||
| 1595 | addxccc $acc1,$t1,$t1 | ||
| 1596 | addxccc $acc2,%g0,$t2 | ||
| 1597 | addxccc $acc3,$t3,$t3 | ||
| 1598 | addxc %g0,%g0,$acc4 ! carry bit | ||
| 1599 | |||
| 1600 | movrnz $acc5,$t0,$acc0 | ||
| 1601 | movrnz $acc5,$t1,$acc1 | ||
| 1602 | movrnz $acc5,$t2,$acc2 | ||
| 1603 | movrnz $acc5,$t3,$acc3 | ||
| 1604 | movrz $acc5,%g0,$acc4 | ||
| 1605 | |||
| 1606 | ! ret >>= 1 | ||
| 1607 | |||
| 1608 | srlx $acc0,1,$acc0 | ||
| 1609 | sllx $acc1,63,$t0 | ||
| 1610 | srlx $acc1,1,$acc1 | ||
| 1611 | or $acc0,$t0,$acc0 | ||
| 1612 | sllx $acc2,63,$t1 | ||
| 1613 | srlx $acc2,1,$acc2 | ||
| 1614 | or $acc1,$t1,$acc1 | ||
| 1615 | sllx $acc3,63,$t2 | ||
| 1616 | stx $acc0,[$rp] | ||
| 1617 | srlx $acc3,1,$acc3 | ||
| 1618 | or $acc2,$t2,$acc2 | ||
| 1619 | sllx $acc4,63,$t3 ! don't forget carry bit | ||
| 1620 | stx $acc1,[$rp+8] | ||
| 1621 | or $acc3,$t3,$acc3 | ||
| 1622 | stx $acc2,[$rp+16] | ||
| 1623 | retl | ||
| 1624 | stx $acc3,[$rp+24] | ||
| 1625 | .type __ecp_nistz256_div_by_2_vis3,#function | ||
| 1626 | .size __ecp_nistz256_div_by_2_vis3,.-__ecp_nistz256_div_by_2_vis3 | ||
| 1627 | |||
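The halving rule stated at the top of `__ecp_nistz256_div_by_2_vis3` rests on p being odd: exactly one of a and a+p is even, so `(a odd ? a+p : a) >> 1` is a/2 (mod p), with the carry out of a+p supplying the 257th bit that the assembly keeps in `$acc4`. A minimal check:

```perl
use strict;
use warnings;
use Math::BigInt;

my $p = Math::BigInt->from_hex(
    'ffffffff00000001000000000000000000000000ffffffffffffffffffffffff');

# Verify that doubling the computed half gives back a (mod p).
for my $a (map { Math::BigInt->new($_) } 1, 2, 0xdeadbeef) {
    my $half = ($a->is_odd ? $a + $p : $a->copy) >> 1;
    print +(2 * $half) % $p == $a ? "ok\n" : "bad\n";
}
```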
| 1628 | ! compared to __ecp_nistz256_mul_mont it's almost 4x smaller and | ||
| 1629 | ! 4x faster [on T4]... | ||
| 1630 | .align 32 | ||
| 1631 | __ecp_nistz256_mul_mont_vis3: | ||
| 1632 | mulx $a0,$bi,$acc0 | ||
| 1633 | not $poly3,$poly3 ! 0xFFFFFFFF00000001 | ||
| 1634 | umulxhi $a0,$bi,$t0 | ||
| 1635 | mulx $a1,$bi,$acc1 | ||
| 1636 | umulxhi $a1,$bi,$t1 | ||
| 1637 | mulx $a2,$bi,$acc2 | ||
| 1638 | umulxhi $a2,$bi,$t2 | ||
| 1639 | mulx $a3,$bi,$acc3 | ||
| 1640 | umulxhi $a3,$bi,$t3 | ||
| 1641 | ldx [$bp+8],$bi ! b[1] | ||
| 1642 | |||
| 1643 | addcc $acc1,$t0,$acc1 ! accumulate high parts of multiplication | ||
| 1644 | sllx $acc0,32,$t0 | ||
| 1645 | addxccc $acc2,$t1,$acc2 | ||
| 1646 | srlx $acc0,32,$t1 | ||
| 1647 | addxccc $acc3,$t2,$acc3 | ||
| 1648 | addxc %g0,$t3,$acc4 | ||
| 1649 | mov 0,$acc5 | ||
| 1650 | ___ | ||
| 1651 | for($i=1;$i<4;$i++) { | ||
| 1652 | # A reduction iteration is normally performed by accumulating the | ||
| 1653 | # result of multiplying the modulus by the "magic" digit [and | ||
| 1654 | # omitting the least significant word, which is guaranteed to | ||
| 1655 | # be 0], but thanks to the special form of the modulus, and the "magic" | ||
| 1656 | # digit being equal to the least significant word, it can be | ||
| 1657 | # performed with additions and subtractions alone. Indeed: | ||
| 1658 | # | ||
| 1659 | # ffff0001.00000000.0000ffff.ffffffff | ||
| 1660 | # * abcdefgh | ||
| 1661 | # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh | ||
| 1662 | # | ||
| 1663 | # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we | ||
| 1664 | # rewrite above as: | ||
| 1665 | # | ||
| 1666 | # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh | ||
| 1667 | # + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000 | ||
| 1668 | # - 0000abcd.efgh0000.00000000.00000000.abcdefgh | ||
| 1669 | # | ||
| 1670 | # or marking redundant operations: | ||
| 1671 | # | ||
| 1672 | # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.-------- | ||
| 1673 | # + abcdefgh.abcdefgh.0000abcd.efgh0000.-------- | ||
| 1674 | # - 0000abcd.efgh0000.--------.--------.-------- | ||
| 1675 | # ^^^^^^^^ but this word is calculated with umulxhi, because | ||
| 1676 | # there is no subtract with 64-bit borrow:-( (identity checked below) | ||
| 1677 | |||
| 1678 | $code.=<<___; | ||
| 1679 | sub $acc0,$t0,$t2 ! acc0*0xFFFFFFFF00000001, low part | ||
| 1680 | umulxhi $acc0,$poly3,$t3 ! acc0*0xFFFFFFFF00000001, high part | ||
| 1681 | addcc $acc1,$t0,$acc0 ! +=acc[0]<<96 and omit acc[0] | ||
| 1682 | mulx $a0,$bi,$t0 | ||
| 1683 | addxccc $acc2,$t1,$acc1 | ||
| 1684 | mulx $a1,$bi,$t1 | ||
| 1685 | addxccc $acc3,$t2,$acc2 ! +=acc[0]*0xFFFFFFFF00000001 | ||
| 1686 | mulx $a2,$bi,$t2 | ||
| 1687 | addxccc $acc4,$t3,$acc3 | ||
| 1688 | mulx $a3,$bi,$t3 | ||
| 1689 | addxc $acc5,%g0,$acc4 | ||
| 1690 | |||
| 1691 | addcc $acc0,$t0,$acc0 ! accumulate low parts of multiplication | ||
| 1692 | umulxhi $a0,$bi,$t0 | ||
| 1693 | addxccc $acc1,$t1,$acc1 | ||
| 1694 | umulxhi $a1,$bi,$t1 | ||
| 1695 | addxccc $acc2,$t2,$acc2 | ||
| 1696 | umulxhi $a2,$bi,$t2 | ||
| 1697 | addxccc $acc3,$t3,$acc3 | ||
| 1698 | umulxhi $a3,$bi,$t3 | ||
| 1699 | addxc $acc4,%g0,$acc4 | ||
| 1700 | ___ | ||
| 1701 | $code.=<<___ if ($i<3); | ||
| 1702 | ldx [$bp+8*($i+1)],$bi ! bp[$i+1] | ||
| 1703 | ___ | ||
| 1704 | $code.=<<___; | ||
| 1705 | addcc $acc1,$t0,$acc1 ! accumulate high parts of multiplication | ||
| 1706 | sllx $acc0,32,$t0 | ||
| 1707 | addxccc $acc2,$t1,$acc2 | ||
| 1708 | srlx $acc0,32,$t1 | ||
| 1709 | addxccc $acc3,$t2,$acc3 | ||
| 1710 | addxccc $acc4,$t3,$acc4 | ||
| 1711 | addxc %g0,%g0,$acc5 | ||
| 1712 | ___ | ||
| 1713 | } | ||
| 1714 | $code.=<<___; | ||
| 1715 | sub $acc0,$t0,$t2 ! acc0*0xFFFFFFFF00000001, low part | ||
| 1716 | umulxhi $acc0,$poly3,$t3 ! acc0*0xFFFFFFFF00000001, high part | ||
| 1717 | addcc $acc1,$t0,$acc0 ! +=acc[0]<<96 and omit acc[0] | ||
| 1718 | addxccc $acc2,$t1,$acc1 | ||
| 1719 | addxccc $acc3,$t2,$acc2 ! +=acc[0]*0xFFFFFFFF00000001 | ||
| 1720 | addxccc $acc4,$t3,$acc3 | ||
| 1721 | b .Lmul_final_vis3 ! see below | ||
| 1722 | addxc $acc5,%g0,$acc4 | ||
| 1723 | .type __ecp_nistz256_mul_mont_vis3,#function | ||
| 1724 | .size __ecp_nistz256_mul_mont_vis3,.-__ecp_nistz256_mul_mont_vis3 | ||
| 1725 | |||
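The identity behind the reduction steps above is easy to confirm numerically: one iteration adds acc0&lt;&lt;96, adds acc0*0xffffffff00000001 at bit 192, and drops acc0 itself, which together is exactly acc0*p, so the accumulator is unchanged modulo p. Likewise the low word of acc0*0xffffffff00000001 is acc0-(acc0&lt;&lt;32) mod 2^64, matching the `sub $acc0,$t0,$t2` shortcut. A minimal Math::BigInt check:

```perl
use strict;
use warnings;
use Math::BigInt;

my $p     = Math::BigInt->from_hex(
    'ffffffff00000001000000000000000000000000ffffffffffffffffffffffff');
my $magic = Math::BigInt->from_hex('ffffffff00000001');
my $w64   = Math::BigInt->new(2)**64;

my $d     = Math::BigInt->from_hex('89abcdef01234567');    # any 64-bit digit
my $added = ($d << 96) + (($d * $magic) << 192) - $d;
print $added == $d * $p ? "step adds d*p\n" : "broken\n";

my $low = ($d - ($d << 32)) % $w64;                        # sub $acc0,$t0,$t2
print $low == ($d * $magic) % $w64 ? "low word ok\n" : "broken\n";
```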
| 1726 | ! compared to __ecp_nistz256_mul_mont_vis3 above it's 21% fewer | ||
| 1727 | ! instructions, but only 14% faster [on T4]... | ||
| 1728 | .align 32 | ||
| 1729 | __ecp_nistz256_sqr_mont_vis3: | ||
| 1730 | ! | | | | | |a1*a0| | | ||
| 1731 | ! | | | | |a2*a0| | | | ||
| 1732 | ! | |a3*a2|a3*a0| | | | | ||
| 1733 | ! | | | |a2*a1| | | | | ||
| 1734 | ! | | |a3*a1| | | | | | ||
| 1735 | ! *| | | | | | | | 2| | ||
| 1736 | ! +|a3*a3|a2*a2|a1*a1|a0*a0| | ||
| 1737 | ! |--+--+--+--+--+--+--+--| | ||
| 1738 | ! |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx | ||
| 1739 | ! | ||
| 1740 | ! "can't overflow" below mark carrying into high part of | ||
| 1741 | ! multiplication result, which can't overflow, because it | ||
| 1742 | ! can never be all ones. | ||
| 1743 | |||
| 1744 | mulx $a1,$a0,$acc1 ! a[1]*a[0] | ||
| 1745 | umulxhi $a1,$a0,$t1 | ||
| 1746 | mulx $a2,$a0,$acc2 ! a[2]*a[0] | ||
| 1747 | umulxhi $a2,$a0,$t2 | ||
| 1748 | mulx $a3,$a0,$acc3 ! a[3]*a[0] | ||
| 1749 | umulxhi $a3,$a0,$acc4 | ||
| 1750 | |||
| 1751 | addcc $acc2,$t1,$acc2 ! accumulate high parts of multiplication | ||
| 1752 | mulx $a2,$a1,$t0 ! a[2]*a[1] | ||
| 1753 | umulxhi $a2,$a1,$t1 | ||
| 1754 | addxccc $acc3,$t2,$acc3 | ||
| 1755 | mulx $a3,$a1,$t2 ! a[3]*a[1] | ||
| 1756 | umulxhi $a3,$a1,$t3 | ||
| 1757 | addxc $acc4,%g0,$acc4 ! can't overflow | ||
| 1758 | |||
| 1759 | mulx $a3,$a2,$acc5 ! a[3]*a[2] | ||
| 1760 | not $poly3,$poly3 ! 0xFFFFFFFF00000001 | ||
| 1761 | umulxhi $a3,$a2,$acc6 | ||
| 1762 | |||
| 1763 | addcc $t2,$t1,$t1 ! accumulate high parts of multiplication | ||
| 1764 | mulx $a0,$a0,$acc0 ! a[0]*a[0] | ||
| 1765 | addxc $t3,%g0,$t2 ! can't overflow | ||
| 1766 | |||
| 1767 | addcc $acc3,$t0,$acc3 ! accumulate low parts of multiplication | ||
| 1768 | umulxhi $a0,$a0,$a0 | ||
| 1769 | addxccc $acc4,$t1,$acc4 | ||
| 1770 | mulx $a1,$a1,$t1 ! a[1]*a[1] | ||
| 1771 | addxccc $acc5,$t2,$acc5 | ||
| 1772 | umulxhi $a1,$a1,$a1 | ||
| 1773 | addxc $acc6,%g0,$acc6 ! can't overflow | ||
| 1774 | |||
| 1775 | addcc $acc1,$acc1,$acc1 ! acc[1-6]*=2 | ||
| 1776 | mulx $a2,$a2,$t2 ! a[2]*a[2] | ||
| 1777 | addxccc $acc2,$acc2,$acc2 | ||
| 1778 | umulxhi $a2,$a2,$a2 | ||
| 1779 | addxccc $acc3,$acc3,$acc3 | ||
| 1780 | mulx $a3,$a3,$t3 ! a[3]*a[3] | ||
| 1781 | addxccc $acc4,$acc4,$acc4 | ||
| 1782 | umulxhi $a3,$a3,$a3 | ||
| 1783 | addxccc $acc5,$acc5,$acc5 | ||
| 1784 | addxccc $acc6,$acc6,$acc6 | ||
| 1785 | addxc %g0,%g0,$acc7 | ||
| 1786 | |||
| 1787 | addcc $acc1,$a0,$acc1 ! +a[i]*a[i] | ||
| 1788 | addxccc $acc2,$t1,$acc2 | ||
| 1789 | addxccc $acc3,$a1,$acc3 | ||
| 1790 | addxccc $acc4,$t2,$acc4 | ||
| 1791 | sllx $acc0,32,$t0 | ||
| 1792 | addxccc $acc5,$a2,$acc5 | ||
| 1793 | srlx $acc0,32,$t1 | ||
| 1794 | addxccc $acc6,$t3,$acc6 | ||
| 1795 | sub $acc0,$t0,$t2 ! acc0*0xFFFFFFFF00000001, low part | ||
| 1796 | addxc $acc7,$a3,$acc7 | ||
| 1797 | ___ | ||
| 1798 | for($i=0;$i<3;$i++) { # reductions, see commentary | ||
| 1799 | # in multiplication for details | ||
| 1800 | $code.=<<___; | ||
| 1801 | umulxhi $acc0,$poly3,$t3 ! acc0*0xFFFFFFFF00000001, high part | ||
| 1802 | addcc $acc1,$t0,$acc0 ! +=acc[0]<<96 and omit acc[0] | ||
| 1803 | sllx $acc0,32,$t0 | ||
| 1804 | addxccc $acc2,$t1,$acc1 | ||
| 1805 | srlx $acc0,32,$t1 | ||
| 1806 | addxccc $acc3,$t2,$acc2 ! +=acc[0]*0xFFFFFFFF00000001 | ||
| 1807 | sub $acc0,$t0,$t2 ! acc0*0xFFFFFFFF00000001, low part | ||
| 1808 | addxc %g0,$t3,$acc3 ! can't overflow | ||
| 1809 | ___ | ||
| 1810 | } | ||
| 1811 | $code.=<<___; | ||
| 1812 | umulxhi $acc0,$poly3,$t3 ! acc0*0xFFFFFFFF00000001, high part | ||
| 1813 | addcc $acc1,$t0,$acc0 ! +=acc[0]<<96 and omit acc[0] | ||
| 1814 | addxccc $acc2,$t1,$acc1 | ||
| 1815 | addxccc $acc3,$t2,$acc2 ! +=acc[0]*0xFFFFFFFF00000001 | ||
| 1816 | addxc %g0,$t3,$acc3 ! can't overflow | ||
| 1817 | |||
| 1818 | addcc $acc0,$acc4,$acc0 ! accumulate upper half | ||
| 1819 | addxccc $acc1,$acc5,$acc1 | ||
| 1820 | addxccc $acc2,$acc6,$acc2 | ||
| 1821 | addxccc $acc3,$acc7,$acc3 | ||
| 1822 | addxc %g0,%g0,$acc4 | ||
| 1823 | |||
| 1824 | .Lmul_final_vis3: | ||
| 1825 | |||
| 1826 | ! The final step is "if result >= mod, subtract mod", but as comparison | ||
| 1827 | ! means subtraction, we do the subtraction and then copy the outcome | ||
| 1828 | ! if it didn't borrow. But note that since we [have to] replace | ||
| 1829 | ! subtraction with addition of the negative, the carry/borrow logic | ||
| 1830 | ! is inverted. | ||
| 1831 | |||
| 1832 | addcc $acc0,1,$t0 ! add -modulus, i.e. subtract | ||
| 1833 | not $poly3,$poly3 ! restore 0x00000000FFFFFFFE | ||
| 1834 | addxccc $acc1,$poly1,$t1 | ||
| 1835 | addxccc $acc2,$minus1,$t2 | ||
| 1836 | addxccc $acc3,$poly3,$t3 | ||
| 1837 | addxccc $acc4,$minus1,%g0 ! did it carry? | ||
| 1838 | |||
| 1839 | movcs %xcc,$t0,$acc0 | ||
| 1840 | movcs %xcc,$t1,$acc1 | ||
| 1841 | stx $acc0,[$rp] | ||
| 1842 | movcs %xcc,$t2,$acc2 | ||
| 1843 | stx $acc1,[$rp+8] | ||
| 1844 | movcs %xcc,$t3,$acc3 | ||
| 1845 | stx $acc2,[$rp+16] | ||
| 1846 | retl | ||
| 1847 | stx $acc3,[$rp+24] | ||
| 1848 | .type __ecp_nistz256_sqr_mont_vis3,#function | ||
| 1849 | .size __ecp_nistz256_sqr_mont_vis3,.-__ecp_nistz256_sqr_mont_vis3 | ||
| 1850 | ___ | ||
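The diagram at the top of `__ecp_nistz256_sqr_mont_vis3` is schoolbook squaring with the cross products computed once and doubled: n^2 equals the sum of the a[i]^2 terms on even 64-bit boundaries plus twice the a[i]*a[j] (i&lt;j) diagonals. A minimal numeric check of that decomposition:

```perl
use strict;
use warnings;
use Math::BigInt;

my @a = map { Math::BigInt->from_hex($_) }
        qw(0123456789abcdef 02468ace13579bdf fedcba9876543210 0f1e2d3c4b5a6978);

my $cross = Math::BigInt->new(0);          # a[i]*a[j] for i < j, summed once
for my $i (0 .. 3) {
    for my $j ($i + 1 .. 3) {
        $cross += ($a[$i] * $a[$j]) << (64 * ($i + $j));
    }
}
my $squares = Math::BigInt->new(0);        # a[i]^2 on even limb boundaries
$squares += ($a[$_] * $a[$_]) << (128 * $_) for 0 .. 3;

my $n = ($a[3] << 192) + ($a[2] << 128) + ($a[1] << 64) + $a[0];
print(($cross * 2 + $squares) == $n * $n ? "ok\n" : "bad\n");
```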
| 1851 | |||
| 1852 | ######################################################################## | ||
| 1853 | # void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp); | ||
| 1854 | # | ||
| 1855 | { | ||
| 1856 | my ($res_x,$res_y,$res_z, | ||
| 1857 | $in_x,$in_y,$in_z, | ||
| 1858 | $S,$M,$Zsqr,$tmp0)=map(32*$_,(0..9)); | ||
| 1859 | # above map() describes stack layout with 10 temporary | ||
| 1860 | # 256-bit vectors on top. | ||
| 1861 | |||
| 1862 | $code.=<<___; | ||
| 1863 | .align 32 | ||
| 1864 | ecp_nistz256_point_double_vis3: | ||
| 1865 | save %sp,-STACK64_FRAME-32*10,%sp | ||
| 1866 | |||
| 1867 | mov $rp,$rp_real | ||
| 1868 | .Ldouble_shortcut_vis3: | ||
| 1869 | mov -1,$minus1 | ||
| 1870 | mov -2,$poly3 | ||
| 1871 | sllx $minus1,32,$poly1 ! 0xFFFFFFFF00000000 | ||
| 1872 | srl $poly3,0,$poly3 ! 0x00000000FFFFFFFE | ||
| 1873 | |||
| 1874 | ! convert input to uint64_t[4] | ||
| 1875 | ld [$ap],$a0 ! in_x | ||
| 1876 | ld [$ap+4],$t0 | ||
| 1877 | ld [$ap+8],$a1 | ||
| 1878 | ld [$ap+12],$t1 | ||
| 1879 | ld [$ap+16],$a2 | ||
| 1880 | ld [$ap+20],$t2 | ||
| 1881 | ld [$ap+24],$a3 | ||
| 1882 | ld [$ap+28],$t3 | ||
| 1883 | sllx $t0,32,$t0 | ||
| 1884 | sllx $t1,32,$t1 | ||
| 1885 | ld [$ap+32],$acc0 ! in_y | ||
| 1886 | or $a0,$t0,$a0 | ||
| 1887 | ld [$ap+32+4],$t0 | ||
| 1888 | sllx $t2,32,$t2 | ||
| 1889 | ld [$ap+32+8],$acc1 | ||
| 1890 | or $a1,$t1,$a1 | ||
| 1891 | ld [$ap+32+12],$t1 | ||
| 1892 | sllx $t3,32,$t3 | ||
| 1893 | ld [$ap+32+16],$acc2 | ||
| 1894 | or $a2,$t2,$a2 | ||
| 1895 | ld [$ap+32+20],$t2 | ||
| 1896 | or $a3,$t3,$a3 | ||
| 1897 | ld [$ap+32+24],$acc3 | ||
| 1898 | sllx $t0,32,$t0 | ||
| 1899 | ld [$ap+32+28],$t3 | ||
| 1900 | sllx $t1,32,$t1 | ||
| 1901 | stx $a0,[%sp+LOCALS64+$in_x] | ||
| 1902 | sllx $t2,32,$t2 | ||
| 1903 | stx $a1,[%sp+LOCALS64+$in_x+8] | ||
| 1904 | sllx $t3,32,$t3 | ||
| 1905 | stx $a2,[%sp+LOCALS64+$in_x+16] | ||
| 1906 | or $acc0,$t0,$acc0 | ||
| 1907 | stx $a3,[%sp+LOCALS64+$in_x+24] | ||
| 1908 | or $acc1,$t1,$acc1 | ||
| 1909 | stx $acc0,[%sp+LOCALS64+$in_y] | ||
| 1910 | or $acc2,$t2,$acc2 | ||
| 1911 | stx $acc1,[%sp+LOCALS64+$in_y+8] | ||
| 1912 | or $acc3,$t3,$acc3 | ||
| 1913 | stx $acc2,[%sp+LOCALS64+$in_y+16] | ||
| 1914 | stx $acc3,[%sp+LOCALS64+$in_y+24] | ||
| 1915 | |||
| 1916 | ld [$ap+64],$a0 ! in_z | ||
| 1917 | ld [$ap+64+4],$t0 | ||
| 1918 | ld [$ap+64+8],$a1 | ||
| 1919 | ld [$ap+64+12],$t1 | ||
| 1920 | ld [$ap+64+16],$a2 | ||
| 1921 | ld [$ap+64+20],$t2 | ||
| 1922 | ld [$ap+64+24],$a3 | ||
| 1923 | ld [$ap+64+28],$t3 | ||
| 1924 | sllx $t0,32,$t0 | ||
| 1925 | sllx $t1,32,$t1 | ||
| 1926 | or $a0,$t0,$a0 | ||
| 1927 | sllx $t2,32,$t2 | ||
| 1928 | or $a1,$t1,$a1 | ||
| 1929 | sllx $t3,32,$t3 | ||
| 1930 | or $a2,$t2,$a2 | ||
| 1931 | or $a3,$t3,$a3 | ||
| 1932 | sllx $t0,32,$t0 | ||
| 1933 | sllx $t1,32,$t1 | ||
| 1934 | stx $a0,[%sp+LOCALS64+$in_z] | ||
| 1935 | sllx $t2,32,$t2 | ||
| 1936 | stx $a1,[%sp+LOCALS64+$in_z+8] | ||
| 1937 | sllx $t3,32,$t3 | ||
| 1938 | stx $a2,[%sp+LOCALS64+$in_z+16] | ||
| 1939 | stx $a3,[%sp+LOCALS64+$in_z+24] | ||
| 1940 | |||
| 1941 | ! in_y is still in $acc0-$acc3 | ||
| 1942 | call __ecp_nistz256_mul_by_2_vis3 ! p256_mul_by_2(S, in_y); | ||
| 1943 | add %sp,LOCALS64+$S,$rp | ||
| 1944 | |||
| 1945 | ! in_z is still in $a0-$a3 | ||
| 1946 | call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Zsqr, in_z); | ||
| 1947 | add %sp,LOCALS64+$Zsqr,$rp | ||
| 1948 | |||
| 1949 | mov $acc0,$a0 ! put Zsqr aside | ||
| 1950 | mov $acc1,$a1 | ||
| 1951 | mov $acc2,$a2 | ||
| 1952 | mov $acc3,$a3 | ||
| 1953 | |||
| 1954 | add %sp,LOCALS64+$in_x,$bp | ||
| 1955 | call __ecp_nistz256_add_vis3 ! p256_add(M, Zsqr, in_x); | ||
| 1956 | add %sp,LOCALS64+$M,$rp | ||
| 1957 | |||
| 1958 | mov $a0,$acc0 ! restore Zsqr | ||
| 1959 | ldx [%sp+LOCALS64+$S],$a0 ! forward load | ||
| 1960 | mov $a1,$acc1 | ||
| 1961 | ldx [%sp+LOCALS64+$S+8],$a1 | ||
| 1962 | mov $a2,$acc2 | ||
| 1963 | ldx [%sp+LOCALS64+$S+16],$a2 | ||
| 1964 | mov $a3,$acc3 | ||
| 1965 | ldx [%sp+LOCALS64+$S+24],$a3 | ||
| 1966 | |||
| 1967 | add %sp,LOCALS64+$in_x,$bp | ||
| 1968 | call __ecp_nistz256_sub_morf_vis3 ! p256_sub(Zsqr, in_x, Zsqr); | ||
| 1969 | add %sp,LOCALS64+$Zsqr,$rp | ||
| 1970 | |||
| 1971 | call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(S, S); | ||
| 1972 | add %sp,LOCALS64+$S,$rp | ||
| 1973 | |||
| 1974 | ldx [%sp+LOCALS64+$in_z],$bi | ||
| 1975 | ldx [%sp+LOCALS64+$in_y],$a0 | ||
| 1976 | ldx [%sp+LOCALS64+$in_y+8],$a1 | ||
| 1977 | ldx [%sp+LOCALS64+$in_y+16],$a2 | ||
| 1978 | ldx [%sp+LOCALS64+$in_y+24],$a3 | ||
| 1979 | add %sp,LOCALS64+$in_z,$bp | ||
| 1980 | call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(tmp0, in_z, in_y); | ||
| 1981 | add %sp,LOCALS64+$tmp0,$rp | ||
| 1982 | |||
| 1983 | ldx [%sp+LOCALS64+$M],$bi ! forward load | ||
| 1984 | ldx [%sp+LOCALS64+$Zsqr],$a0 | ||
| 1985 | ldx [%sp+LOCALS64+$Zsqr+8],$a1 | ||
| 1986 | ldx [%sp+LOCALS64+$Zsqr+16],$a2 | ||
| 1987 | ldx [%sp+LOCALS64+$Zsqr+24],$a3 | ||
| 1988 | |||
| 1989 | call __ecp_nistz256_mul_by_2_vis3 ! p256_mul_by_2(res_z, tmp0); | ||
| 1990 | add %sp,LOCALS64+$res_z,$rp | ||
| 1991 | |||
| 1992 | add %sp,LOCALS64+$M,$bp | ||
| 1993 | call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(M, M, Zsqr); | ||
| 1994 | add %sp,LOCALS64+$M,$rp | ||
| 1995 | |||
| 1996 | mov $acc0,$a0 ! put aside M | ||
| 1997 | mov $acc1,$a1 | ||
| 1998 | mov $acc2,$a2 | ||
| 1999 | mov $acc3,$a3 | ||
| 2000 | call __ecp_nistz256_mul_by_2_vis3 | ||
| 2001 | add %sp,LOCALS64+$M,$rp | ||
| 2002 | mov $a0,$t0 ! copy M | ||
| 2003 | ldx [%sp+LOCALS64+$S],$a0 ! forward load | ||
| 2004 | mov $a1,$t1 | ||
| 2005 | ldx [%sp+LOCALS64+$S+8],$a1 | ||
| 2006 | mov $a2,$t2 | ||
| 2007 | ldx [%sp+LOCALS64+$S+16],$a2 | ||
| 2008 | mov $a3,$t3 | ||
| 2009 | ldx [%sp+LOCALS64+$S+24],$a3 | ||
| 2010 | call __ecp_nistz256_add_noload_vis3 ! p256_mul_by_3(M, M); | ||
| 2011 | add %sp,LOCALS64+$M,$rp | ||
| 2012 | |||
| 2013 | call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(tmp0, S); | ||
| 2014 | add %sp,LOCALS64+$tmp0,$rp | ||
| 2015 | |||
| 2016 | ldx [%sp+LOCALS64+$S],$bi ! forward load | ||
| 2017 | ldx [%sp+LOCALS64+$in_x],$a0 | ||
| 2018 | ldx [%sp+LOCALS64+$in_x+8],$a1 | ||
| 2019 | ldx [%sp+LOCALS64+$in_x+16],$a2 | ||
| 2020 | ldx [%sp+LOCALS64+$in_x+24],$a3 | ||
| 2021 | |||
| 2022 | call __ecp_nistz256_div_by_2_vis3 ! p256_div_by_2(res_y, tmp0); | ||
| 2023 | add %sp,LOCALS64+$res_y,$rp | ||
| 2024 | |||
| 2025 | add %sp,LOCALS64+$S,$bp | ||
| 2026 | call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S, S, in_x); | ||
| 2027 | add %sp,LOCALS64+$S,$rp | ||
| 2028 | |||
| 2029 | ldx [%sp+LOCALS64+$M],$a0 ! forward load | ||
| 2030 | ldx [%sp+LOCALS64+$M+8],$a1 | ||
| 2031 | ldx [%sp+LOCALS64+$M+16],$a2 | ||
| 2032 | ldx [%sp+LOCALS64+$M+24],$a3 | ||
| 2033 | |||
| 2034 | call __ecp_nistz256_mul_by_2_vis3 ! p256_mul_by_2(tmp0, S); | ||
| 2035 | add %sp,LOCALS64+$tmp0,$rp | ||
| 2036 | |||
| 2037 | call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(res_x, M); | ||
| 2038 | add %sp,LOCALS64+$res_x,$rp | ||
| 2039 | |||
| 2040 | add %sp,LOCALS64+$tmp0,$bp | ||
| 2041 | call __ecp_nistz256_sub_from_vis3 ! p256_sub(res_x, res_x, tmp0); | ||
| 2042 | add %sp,LOCALS64+$res_x,$rp | ||
| 2043 | |||
| 2044 | ldx [%sp+LOCALS64+$M],$a0 ! forward load | ||
| 2045 | ldx [%sp+LOCALS64+$M+8],$a1 | ||
| 2046 | ldx [%sp+LOCALS64+$M+16],$a2 | ||
| 2047 | ldx [%sp+LOCALS64+$M+24],$a3 | ||
| 2048 | |||
| 2049 | add %sp,LOCALS64+$S,$bp | ||
| 2050 | call __ecp_nistz256_sub_morf_vis3 ! p256_sub(S, S, res_x); | ||
| 2051 | add %sp,LOCALS64+$S,$rp | ||
| 2052 | |||
| 2053 | mov $acc0,$bi | ||
| 2054 | call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S, S, M); | ||
| 2055 | add %sp,LOCALS64+$S,$rp | ||
| 2056 | |||
| 2057 | ldx [%sp+LOCALS64+$res_x],$a0 ! forward load | ||
| 2058 | ldx [%sp+LOCALS64+$res_x+8],$a1 | ||
| 2059 | ldx [%sp+LOCALS64+$res_x+16],$a2 | ||
| 2060 | ldx [%sp+LOCALS64+$res_x+24],$a3 | ||
| 2061 | |||
| 2062 | add %sp,LOCALS64+$res_y,$bp | ||
| 2063 | call __ecp_nistz256_sub_from_vis3 ! p256_sub(res_y, S, res_y); | ||
| 2064 | add %sp,LOCALS64+$res_y,$rp | ||
| 2065 | |||
| 2066 | ! convert output to uint32_t[8] | ||
| 2067 | srlx $a0,32,$t0 | ||
| 2068 | srlx $a1,32,$t1 | ||
| 2069 | st $a0,[$rp_real] ! res_x | ||
| 2070 | srlx $a2,32,$t2 | ||
| 2071 | st $t0,[$rp_real+4] | ||
| 2072 | srlx $a3,32,$t3 | ||
| 2073 | st $a1,[$rp_real+8] | ||
| 2074 | st $t1,[$rp_real+12] | ||
| 2075 | st $a2,[$rp_real+16] | ||
| 2076 | st $t2,[$rp_real+20] | ||
| 2077 | st $a3,[$rp_real+24] | ||
| 2078 | st $t3,[$rp_real+28] | ||
| 2079 | |||
| 2080 | ldx [%sp+LOCALS64+$res_z],$a0 ! forward load | ||
| 2081 | srlx $acc0,32,$t0 | ||
| 2082 | ldx [%sp+LOCALS64+$res_z+8],$a1 | ||
| 2083 | srlx $acc1,32,$t1 | ||
| 2084 | ldx [%sp+LOCALS64+$res_z+16],$a2 | ||
| 2085 | srlx $acc2,32,$t2 | ||
| 2086 | ldx [%sp+LOCALS64+$res_z+24],$a3 | ||
| 2087 | srlx $acc3,32,$t3 | ||
| 2088 | st $acc0,[$rp_real+32] ! res_y | ||
| 2089 | st $t0, [$rp_real+32+4] | ||
| 2090 | st $acc1,[$rp_real+32+8] | ||
| 2091 | st $t1, [$rp_real+32+12] | ||
| 2092 | st $acc2,[$rp_real+32+16] | ||
| 2093 | st $t2, [$rp_real+32+20] | ||
| 2094 | st $acc3,[$rp_real+32+24] | ||
| 2095 | st $t3, [$rp_real+32+28] | ||
| 2096 | |||
| 2097 | srlx $a0,32,$t0 | ||
| 2098 | srlx $a1,32,$t1 | ||
| 2099 | st $a0,[$rp_real+64] ! res_z | ||
| 2100 | srlx $a2,32,$t2 | ||
| 2101 | st $t0,[$rp_real+64+4] | ||
| 2102 | srlx $a3,32,$t3 | ||
| 2103 | st $a1,[$rp_real+64+8] | ||
| 2104 | st $t1,[$rp_real+64+12] | ||
| 2105 | st $a2,[$rp_real+64+16] | ||
| 2106 | st $t2,[$rp_real+64+20] | ||
| 2107 | st $a3,[$rp_real+64+24] | ||
| 2108 | st $t3,[$rp_real+64+28] | ||
| 2109 | |||
| 2110 | ret | ||
| 2111 | restore | ||
| 2112 | .type ecp_nistz256_point_double_vis3,#function | ||
| 2113 | .size ecp_nistz256_point_double_vis3,.-ecp_nistz256_point_double_vis3 | ||
| 2114 | ___ | ||
| 2115 | } | ||
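Stripped of the Montgomery conversions and register scheduling, the subroutine above is plain Jacobian point doubling with M = 3(X - Z^2)(X + Z^2). A minimal sketch in host Perl following the same sequence of field operations, assuming inputs are ordinary residues rather than Montgomery form (the formulas are identical either way):

```perl
use strict;
use warnings;
use Math::BigInt;

my $p = Math::BigInt->from_hex(
    'ffffffff00000001000000000000000000000000ffffffffffffffffffffffff');

sub point_double {                                 # same op sequence as above
    my ($x, $y, $z) = @_;
    my $s    = (2 * $y) % $p;                      # p256_mul_by_2(S, in_y)
    my $zsqr = ($z * $z) % $p;                     # p256_sqr_mont(Zsqr, in_z)
    my $m    = ($zsqr + $x) % $p;                  # p256_add(M, Zsqr, in_x)
    $zsqr    = ($x - $zsqr) % $p;                  # p256_sub(Zsqr, in_x, Zsqr)
    $s       = ($s * $s) % $p;                     # p256_sqr_mont(S, S)
    my $rz   = (2 * $z * $y) % $p;                 # tmp0 = Z*Y; res_z = 2*tmp0
    $m       = (3 * $m * $zsqr) % $p;              # p256_mul_by_3(M, M*Zsqr)
    my $ry   = ($s * $s) % $p;                     # p256_sqr_mont(tmp0, S)
    $ry      = ($ry->is_odd ? $ry + $p : $ry) / 2; # p256_div_by_2(res_y, tmp0)
    $s       = ($s * $x) % $p;                     # p256_mul_mont(S, S, in_x)
    my $rx   = ($m * $m - 2 * $s) % $p;            # res_x = M^2 - 2*S
    $s       = (($s - $rx) * $m) % $p;             # S = (S - res_x)*M
    $ry      = ($s - $ry) % $p;                    # res_y = S - res_y
    return ($rx, $ry, $rz);
}

# Double the standard P-256 generator and confirm the result satisfies the
# Jacobian curve equation Y^2 = X^3 - 3*X*Z^4 + b*Z^6 (mod p).
my ($gx, $gy) = map { Math::BigInt->from_hex($_) }
    qw(6b17d1f2e12c4247f8bce6e563a440f277037d812deb33a0f4a13945d898c296
       4fe342e2fe1a7f9b8ee7eb4a7c0f9e162bce33576b315ececbb6406837bf51f5);
my $b = Math::BigInt->from_hex(
    '5ac635d8aa3a93e7b3ebbd55769886bc651d06b0cc53b0f63bce3c3e27d2604b');

my ($x3, $y3, $z3) = point_double($gx, $gy, Math::BigInt->new(1));
my $lhs = ($y3 * $y3) % $p;
my $rhs = ($x3**3 - 3 * $x3 * $z3**4 + $b * $z3**6) % $p;
print $lhs == $rhs ? "2G is on the curve\n" : "bug\n";
```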
| 2116 | ######################################################################## | ||
| 2117 | # void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1, | ||
| 2118 | # const P256_POINT *in2); | ||
| 2119 | { | ||
| 2120 | my ($res_x,$res_y,$res_z, | ||
| 2121 | $in1_x,$in1_y,$in1_z, | ||
| 2122 | $in2_x,$in2_y,$in2_z, | ||
| 2123 | $H,$Hsqr,$R,$Rsqr,$Hcub, | ||
| 2124 | $U1,$U2,$S1,$S2)=map(32*$_,(0..17)); | ||
| 2125 | my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); | ||
| 2126 | |||
| 2127 | # above map() describes stack layout with 18 temporary | ||
| 2128 | # 256-bit vectors on top. Then we reserve some space for | ||
| 2129 | # !in1infty, !in2infty and result of check for zero. | ||
| 2130 | |||
| 2131 | $code.=<<___; | ||
| 2132 | .globl ecp_nistz256_point_add_vis3 | ||
| 2133 | .align 32 | ||
| 2134 | ecp_nistz256_point_add_vis3: | ||
| 2135 | save %sp,-STACK64_FRAME-32*18-32,%sp | ||
| 2136 | |||
| 2137 | mov $rp,$rp_real | ||
| 2138 | mov -1,$minus1 | ||
| 2139 | mov -2,$poly3 | ||
| 2140 | sllx $minus1,32,$poly1 ! 0xFFFFFFFF00000000 | ||
| 2141 | srl $poly3,0,$poly3 ! 0x00000000FFFFFFFE | ||
| 2142 | |||
| 2143 | ! convert input to uint64_t[4] | ||
| 2144 | ld [$bp],$a0 ! in2_x | ||
| 2145 | ld [$bp+4],$t0 | ||
| 2146 | ld [$bp+8],$a1 | ||
| 2147 | ld [$bp+12],$t1 | ||
| 2148 | ld [$bp+16],$a2 | ||
| 2149 | ld [$bp+20],$t2 | ||
| 2150 | ld [$bp+24],$a3 | ||
| 2151 | ld [$bp+28],$t3 | ||
| 2152 | sllx $t0,32,$t0 | ||
| 2153 | sllx $t1,32,$t1 | ||
| 2154 | ld [$bp+32],$acc0 ! in2_y | ||
| 2155 | or $a0,$t0,$a0 | ||
| 2156 | ld [$bp+32+4],$t0 | ||
| 2157 | sllx $t2,32,$t2 | ||
| 2158 | ld [$bp+32+8],$acc1 | ||
| 2159 | or $a1,$t1,$a1 | ||
| 2160 | ld [$bp+32+12],$t1 | ||
| 2161 | sllx $t3,32,$t3 | ||
| 2162 | ld [$bp+32+16],$acc2 | ||
| 2163 | or $a2,$t2,$a2 | ||
| 2164 | ld [$bp+32+20],$t2 | ||
| 2165 | or $a3,$t3,$a3 | ||
| 2166 | ld [$bp+32+24],$acc3 | ||
| 2167 | sllx $t0,32,$t0 | ||
| 2168 | ld [$bp+32+28],$t3 | ||
| 2169 | sllx $t1,32,$t1 | ||
| 2170 | stx $a0,[%sp+LOCALS64+$in2_x] | ||
| 2171 | sllx $t2,32,$t2 | ||
| 2172 | stx $a1,[%sp+LOCALS64+$in2_x+8] | ||
| 2173 | sllx $t3,32,$t3 | ||
| 2174 | stx $a2,[%sp+LOCALS64+$in2_x+16] | ||
| 2175 | or $acc0,$t0,$acc0 | ||
| 2176 | stx $a3,[%sp+LOCALS64+$in2_x+24] | ||
| 2177 | or $acc1,$t1,$acc1 | ||
| 2178 | stx $acc0,[%sp+LOCALS64+$in2_y] | ||
| 2179 | or $acc2,$t2,$acc2 | ||
| 2180 | stx $acc1,[%sp+LOCALS64+$in2_y+8] | ||
| 2181 | or $acc3,$t3,$acc3 | ||
| 2182 | stx $acc2,[%sp+LOCALS64+$in2_y+16] | ||
| 2183 | stx $acc3,[%sp+LOCALS64+$in2_y+24] | ||
| 2184 | |||
| 2185 | ld [$bp+64],$acc0 ! in2_z | ||
| 2186 | ld [$bp+64+4],$t0 | ||
| 2187 | ld [$bp+64+8],$acc1 | ||
| 2188 | ld [$bp+64+12],$t1 | ||
| 2189 | ld [$bp+64+16],$acc2 | ||
| 2190 | ld [$bp+64+20],$t2 | ||
| 2191 | ld [$bp+64+24],$acc3 | ||
| 2192 | ld [$bp+64+28],$t3 | ||
| 2193 | sllx $t0,32,$t0 | ||
| 2194 | sllx $t1,32,$t1 | ||
| 2195 | ld [$ap],$a0 ! in1_x | ||
| 2196 | or $acc0,$t0,$acc0 | ||
| 2197 | ld [$ap+4],$t0 | ||
| 2198 | sllx $t2,32,$t2 | ||
| 2199 | ld [$ap+8],$a1 | ||
| 2200 | or $acc1,$t1,$acc1 | ||
| 2201 | ld [$ap+12],$t1 | ||
| 2202 | sllx $t3,32,$t3 | ||
| 2203 | ld [$ap+16],$a2 | ||
| 2204 | or $acc2,$t2,$acc2 | ||
| 2205 | ld [$ap+20],$t2 | ||
| 2206 | or $acc3,$t3,$acc3 | ||
| 2207 | ld [$ap+24],$a3 | ||
| 2208 | sllx $t0,32,$t0 | ||
| 2209 | ld [$ap+28],$t3 | ||
| 2210 | sllx $t1,32,$t1 | ||
| 2211 | stx $acc0,[%sp+LOCALS64+$in2_z] | ||
| 2212 | sllx $t2,32,$t2 | ||
| 2213 | stx $acc1,[%sp+LOCALS64+$in2_z+8] | ||
| 2214 | sllx $t3,32,$t3 | ||
| 2215 | stx $acc2,[%sp+LOCALS64+$in2_z+16] | ||
| 2216 | stx $acc3,[%sp+LOCALS64+$in2_z+24] | ||
| 2217 | |||
| 2218 | or $acc1,$acc0,$acc0 | ||
| 2219 | or $acc3,$acc2,$acc2 | ||
| 2220 | or $acc2,$acc0,$acc0 | ||
| 2221 | movrnz $acc0,-1,$acc0 ! !in2infty | ||
| 2222 | stx $acc0,[%fp+STACK_BIAS-8] | ||
| 2223 | |||
| 2224 | or $a0,$t0,$a0 | ||
| 2225 | ld [$ap+32],$acc0 ! in1_y | ||
| 2226 | or $a1,$t1,$a1 | ||
| 2227 | ld [$ap+32+4],$t0 | ||
| 2228 | or $a2,$t2,$a2 | ||
| 2229 | ld [$ap+32+8],$acc1 | ||
| 2230 | or $a3,$t3,$a3 | ||
| 2231 | ld [$ap+32+12],$t1 | ||
| 2232 | ld [$ap+32+16],$acc2 | ||
| 2233 | ld [$ap+32+20],$t2 | ||
| 2234 | ld [$ap+32+24],$acc3 | ||
| 2235 | sllx $t0,32,$t0 | ||
| 2236 | ld [$ap+32+28],$t3 | ||
| 2237 | sllx $t1,32,$t1 | ||
| 2238 | stx $a0,[%sp+LOCALS64+$in1_x] | ||
| 2239 | sllx $t2,32,$t2 | ||
| 2240 | stx $a1,[%sp+LOCALS64+$in1_x+8] | ||
| 2241 | sllx $t3,32,$t3 | ||
| 2242 | stx $a2,[%sp+LOCALS64+$in1_x+16] | ||
| 2243 | or $acc0,$t0,$acc0 | ||
| 2244 | stx $a3,[%sp+LOCALS64+$in1_x+24] | ||
| 2245 | or $acc1,$t1,$acc1 | ||
| 2246 | stx $acc0,[%sp+LOCALS64+$in1_y] | ||
| 2247 | or $acc2,$t2,$acc2 | ||
| 2248 | stx $acc1,[%sp+LOCALS64+$in1_y+8] | ||
| 2249 | or $acc3,$t3,$acc3 | ||
| 2250 | stx $acc2,[%sp+LOCALS64+$in1_y+16] | ||
| 2251 | stx $acc3,[%sp+LOCALS64+$in1_y+24] | ||
| 2252 | |||
| 2253 | ldx [%sp+LOCALS64+$in2_z],$a0 ! forward load | ||
| 2254 | ldx [%sp+LOCALS64+$in2_z+8],$a1 | ||
| 2255 | ldx [%sp+LOCALS64+$in2_z+16],$a2 | ||
| 2256 | ldx [%sp+LOCALS64+$in2_z+24],$a3 | ||
| 2257 | |||
| 2258 | ld [$ap+64],$acc0 ! in1_z | ||
| 2259 | ld [$ap+64+4],$t0 | ||
| 2260 | ld [$ap+64+8],$acc1 | ||
| 2261 | ld [$ap+64+12],$t1 | ||
| 2262 | ld [$ap+64+16],$acc2 | ||
| 2263 | ld [$ap+64+20],$t2 | ||
| 2264 | ld [$ap+64+24],$acc3 | ||
| 2265 | ld [$ap+64+28],$t3 | ||
| 2266 | sllx $t0,32,$t0 | ||
| 2267 | sllx $t1,32,$t1 | ||
| 2268 | or $acc0,$t0,$acc0 | ||
| 2269 | sllx $t2,32,$t2 | ||
| 2270 | or $acc1,$t1,$acc1 | ||
| 2271 | sllx $t3,32,$t3 | ||
| 2272 | stx $acc0,[%sp+LOCALS64+$in1_z] | ||
| 2273 | or $acc2,$t2,$acc2 | ||
| 2274 | stx $acc1,[%sp+LOCALS64+$in1_z+8] | ||
| 2275 | or $acc3,$t3,$acc3 | ||
| 2276 | stx $acc2,[%sp+LOCALS64+$in1_z+16] | ||
| 2277 | stx $acc3,[%sp+LOCALS64+$in1_z+24] | ||
| 2278 | |||
| 2279 | or $acc1,$acc0,$acc0 | ||
| 2280 | or $acc3,$acc2,$acc2 | ||
| 2281 | or $acc2,$acc0,$acc0 | ||
| 2282 | movrnz $acc0,-1,$acc0 ! !in1infty | ||
| 2283 | stx $acc0,[%fp+STACK_BIAS-16] | ||
| 2284 | |||
| 2285 | call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Z2sqr, in2_z); | ||
| 2286 | add %sp,LOCALS64+$Z2sqr,$rp | ||
| 2287 | |||
| 2288 | ldx [%sp+LOCALS64+$in1_z],$a0 | ||
| 2289 | ldx [%sp+LOCALS64+$in1_z+8],$a1 | ||
| 2290 | ldx [%sp+LOCALS64+$in1_z+16],$a2 | ||
| 2291 | ldx [%sp+LOCALS64+$in1_z+24],$a3 | ||
| 2292 | call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Z1sqr, in1_z); | ||
| 2293 | add %sp,LOCALS64+$Z1sqr,$rp | ||
| 2294 | |||
| 2295 | ldx [%sp+LOCALS64+$Z2sqr],$bi | ||
| 2296 | ldx [%sp+LOCALS64+$in2_z],$a0 | ||
| 2297 | ldx [%sp+LOCALS64+$in2_z+8],$a1 | ||
| 2298 | ldx [%sp+LOCALS64+$in2_z+16],$a2 | ||
| 2299 | ldx [%sp+LOCALS64+$in2_z+24],$a3 | ||
| 2300 | add %sp,LOCALS64+$Z2sqr,$bp | ||
| 2301 | call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S1, Z2sqr, in2_z); | ||
| 2302 | add %sp,LOCALS64+$S1,$rp | ||
| 2303 | |||
| 2304 | ldx [%sp+LOCALS64+$Z1sqr],$bi | ||
| 2305 | ldx [%sp+LOCALS64+$in1_z],$a0 | ||
| 2306 | ldx [%sp+LOCALS64+$in1_z+8],$a1 | ||
| 2307 | ldx [%sp+LOCALS64+$in1_z+16],$a2 | ||
| 2308 | ldx [%sp+LOCALS64+$in1_z+24],$a3 | ||
| 2309 | add %sp,LOCALS64+$Z1sqr,$bp | ||
| 2310 | call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S2, Z1sqr, in1_z); | ||
| 2311 | add %sp,LOCALS64+$S2,$rp | ||
| 2312 | |||
| 2313 | ldx [%sp+LOCALS64+$S1],$bi | ||
| 2314 | ldx [%sp+LOCALS64+$in1_y],$a0 | ||
| 2315 | ldx [%sp+LOCALS64+$in1_y+8],$a1 | ||
| 2316 | ldx [%sp+LOCALS64+$in1_y+16],$a2 | ||
| 2317 | ldx [%sp+LOCALS64+$in1_y+24],$a3 | ||
| 2318 | add %sp,LOCALS64+$S1,$bp | ||
| 2319 | call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S1, S1, in1_y); | ||
| 2320 | add %sp,LOCALS64+$S1,$rp | ||
| 2321 | |||
| 2322 | ldx [%sp+LOCALS64+$S2],$bi | ||
| 2323 | ldx [%sp+LOCALS64+$in2_y],$a0 | ||
| 2324 | ldx [%sp+LOCALS64+$in2_y+8],$a1 | ||
| 2325 | ldx [%sp+LOCALS64+$in2_y+16],$a2 | ||
| 2326 | ldx [%sp+LOCALS64+$in2_y+24],$a3 | ||
| 2327 | add %sp,LOCALS64+$S2,$bp | ||
| 2328 | call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S2, S2, in2_y); | ||
| 2329 | add %sp,LOCALS64+$S2,$rp | ||
| 2330 | |||
| 2331 | ldx [%sp+LOCALS64+$Z2sqr],$bi ! forward load | ||
| 2332 | ldx [%sp+LOCALS64+$in1_x],$a0 | ||
| 2333 | ldx [%sp+LOCALS64+$in1_x+8],$a1 | ||
| 2334 | ldx [%sp+LOCALS64+$in1_x+16],$a2 | ||
| 2335 | ldx [%sp+LOCALS64+$in1_x+24],$a3 | ||
| 2336 | |||
| 2337 | add %sp,LOCALS64+$S1,$bp | ||
| 2338 | call __ecp_nistz256_sub_from_vis3 ! p256_sub(R, S2, S1); | ||
| 2339 | add %sp,LOCALS64+$R,$rp | ||
| 2340 | |||
| 2341 | or $acc1,$acc0,$acc0 ! see if result is zero | ||
| 2342 | or $acc3,$acc2,$acc2 | ||
| 2343 | or $acc2,$acc0,$acc0 | ||
| 2344 | stx $acc0,[%fp+STACK_BIAS-24] | ||
| 2345 | |||
| 2346 | add %sp,LOCALS64+$Z2sqr,$bp | ||
| 2347 | call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(U1, in1_x, Z2sqr); | ||
| 2348 | add %sp,LOCALS64+$U1,$rp | ||
| 2349 | |||
| 2350 | ldx [%sp+LOCALS64+$Z1sqr],$bi | ||
| 2351 | ldx [%sp+LOCALS64+$in2_x],$a0 | ||
| 2352 | ldx [%sp+LOCALS64+$in2_x+8],$a1 | ||
| 2353 | ldx [%sp+LOCALS64+$in2_x+16],$a2 | ||
| 2354 | ldx [%sp+LOCALS64+$in2_x+24],$a3 | ||
| 2355 | add %sp,LOCALS64+$Z1sqr,$bp | ||
| 2356 | call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(U2, in2_x, Z1sqr); | ||
| 2357 | add %sp,LOCALS64+$U2,$rp | ||
| 2358 | |||
| 2359 | ldx [%sp+LOCALS64+$R],$a0 ! forward load | ||
| 2360 | ldx [%sp+LOCALS64+$R+8],$a1 | ||
| 2361 | ldx [%sp+LOCALS64+$R+16],$a2 | ||
| 2362 | ldx [%sp+LOCALS64+$R+24],$a3 | ||
| 2363 | |||
| 2364 | add %sp,LOCALS64+$U1,$bp | ||
| 2365 | call __ecp_nistz256_sub_from_vis3 ! p256_sub(H, U2, U1); | ||
| 2366 | add %sp,LOCALS64+$H,$rp | ||
| 2367 | |||
| 2368 | or $acc1,$acc0,$acc0 ! see if result is zero | ||
| 2369 | or $acc3,$acc2,$acc2 | ||
| 2370 | orcc $acc2,$acc0,$acc0 | ||
| 2371 | |||
| 2372 | bne,pt %xcc,.Ladd_proceed_vis3 ! is_equal(U1,U2)? | ||
| 2373 | nop | ||
| 2374 | |||
| 2375 | ldx [%fp+STACK_BIAS-8],$t0 | ||
| 2376 | ldx [%fp+STACK_BIAS-16],$t1 | ||
| 2377 | ldx [%fp+STACK_BIAS-24],$t2 | ||
| 2378 | andcc $t0,$t1,%g0 | ||
| 2379 | be,pt %xcc,.Ladd_proceed_vis3 ! (in1infty || in2infty)? | ||
| 2380 | nop | ||
| 2381 | andcc $t2,$t2,%g0 | ||
| 2382 | be,a,pt %xcc,.Ldouble_shortcut_vis3 ! is_equal(S1,S2)? | ||
| 2383 | add %sp,32*(12-10)+32,%sp ! difference in frame sizes | ||
| 2384 | |||
| 2385 | st %g0,[$rp_real] | ||
| 2386 | st %g0,[$rp_real+4] | ||
| 2387 | st %g0,[$rp_real+8] | ||
| 2388 | st %g0,[$rp_real+12] | ||
| 2389 | st %g0,[$rp_real+16] | ||
| 2390 | st %g0,[$rp_real+20] | ||
| 2391 | st %g0,[$rp_real+24] | ||
| 2392 | st %g0,[$rp_real+28] | ||
| 2393 | st %g0,[$rp_real+32] | ||
| 2394 | st %g0,[$rp_real+32+4] | ||
| 2395 | st %g0,[$rp_real+32+8] | ||
| 2396 | st %g0,[$rp_real+32+12] | ||
| 2397 | st %g0,[$rp_real+32+16] | ||
| 2398 | st %g0,[$rp_real+32+20] | ||
| 2399 | st %g0,[$rp_real+32+24] | ||
| 2400 | st %g0,[$rp_real+32+28] | ||
| 2401 | st %g0,[$rp_real+64] | ||
| 2402 | st %g0,[$rp_real+64+4] | ||
| 2403 | st %g0,[$rp_real+64+8] | ||
| 2404 | st %g0,[$rp_real+64+12] | ||
| 2405 | st %g0,[$rp_real+64+16] | ||
| 2406 | st %g0,[$rp_real+64+20] | ||
| 2407 | st %g0,[$rp_real+64+24] | ||
| 2408 | st %g0,[$rp_real+64+28] | ||
| 2409 | b .Ladd_done_vis3 | ||
| 2410 | nop | ||
| 2411 | |||
| 2412 | .align 16 | ||
| 2413 | .Ladd_proceed_vis3: | ||
| 2414 | call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Rsqr, R); | ||
| 2415 | add %sp,LOCALS64+$Rsqr,$rp | ||
| 2416 | |||
| 2417 | ldx [%sp+LOCALS64+$H],$bi | ||
| 2418 | ldx [%sp+LOCALS64+$in1_z],$a0 | ||
| 2419 | ldx [%sp+LOCALS64+$in1_z+8],$a1 | ||
| 2420 | ldx [%sp+LOCALS64+$in1_z+16],$a2 | ||
| 2421 | ldx [%sp+LOCALS64+$in1_z+24],$a3 | ||
| 2422 | add %sp,LOCALS64+$H,$bp | ||
| 2423 | call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(res_z, H, in1_z); | ||
| 2424 | add %sp,LOCALS64+$res_z,$rp | ||
| 2425 | |||
| 2426 | ldx [%sp+LOCALS64+$H],$a0 | ||
| 2427 | ldx [%sp+LOCALS64+$H+8],$a1 | ||
| 2428 | ldx [%sp+LOCALS64+$H+16],$a2 | ||
| 2429 | ldx [%sp+LOCALS64+$H+24],$a3 | ||
| 2430 | call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Hsqr, H); | ||
| 2431 | add %sp,LOCALS64+$Hsqr,$rp | ||
| 2432 | |||
| 2433 | ldx [%sp+LOCALS64+$res_z],$bi | ||
| 2434 | ldx [%sp+LOCALS64+$in2_z],$a0 | ||
| 2435 | ldx [%sp+LOCALS64+$in2_z+8],$a1 | ||
| 2436 | ldx [%sp+LOCALS64+$in2_z+16],$a2 | ||
| 2437 | ldx [%sp+LOCALS64+$in2_z+24],$a3 | ||
| 2438 | add %sp,LOCALS64+$res_z,$bp | ||
| 2439 | call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(res_z, res_z, in2_z); | ||
| 2440 | add %sp,LOCALS64+$res_z,$rp | ||
| 2441 | |||
| 2442 | ldx [%sp+LOCALS64+$H],$bi | ||
| 2443 | ldx [%sp+LOCALS64+$Hsqr],$a0 | ||
| 2444 | ldx [%sp+LOCALS64+$Hsqr+8],$a1 | ||
| 2445 | ldx [%sp+LOCALS64+$Hsqr+16],$a2 | ||
| 2446 | ldx [%sp+LOCALS64+$Hsqr+24],$a3 | ||
| 2447 | add %sp,LOCALS64+$H,$bp | ||
| 2448 | call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(Hcub, Hsqr, H); | ||
| 2449 | add %sp,LOCALS64+$Hcub,$rp | ||
| 2450 | |||
| 2451 | ldx [%sp+LOCALS64+$U1],$bi | ||
| 2452 | ldx [%sp+LOCALS64+$Hsqr],$a0 | ||
| 2453 | ldx [%sp+LOCALS64+$Hsqr+8],$a1 | ||
| 2454 | ldx [%sp+LOCALS64+$Hsqr+16],$a2 | ||
| 2455 | ldx [%sp+LOCALS64+$Hsqr+24],$a3 | ||
| 2456 | add %sp,LOCALS64+$U1,$bp | ||
| 2457 | call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(U2, U1, Hsqr); | ||
| 2458 | add %sp,LOCALS64+$U2,$rp | ||
| 2459 | |||
| 2460 | call __ecp_nistz256_mul_by_2_vis3 ! p256_mul_by_2(Hsqr, U2); | ||
| 2461 | add %sp,LOCALS64+$Hsqr,$rp | ||
| 2462 | |||
| 2463 | add %sp,LOCALS64+$Rsqr,$bp | ||
| 2464 | call __ecp_nistz256_sub_morf_vis3 ! p256_sub(res_x, Rsqr, Hsqr); | ||
| 2465 | add %sp,LOCALS64+$res_x,$rp | ||
| 2466 | |||
| 2467 | add %sp,LOCALS64+$Hcub,$bp | ||
| 2468 | call __ecp_nistz256_sub_from_vis3 ! p256_sub(res_x, res_x, Hcub); | ||
| 2469 | add %sp,LOCALS64+$res_x,$rp | ||
| 2470 | |||
| 2471 | ldx [%sp+LOCALS64+$S1],$bi ! forward load | ||
| 2472 | ldx [%sp+LOCALS64+$Hcub],$a0 | ||
| 2473 | ldx [%sp+LOCALS64+$Hcub+8],$a1 | ||
| 2474 | ldx [%sp+LOCALS64+$Hcub+16],$a2 | ||
| 2475 | ldx [%sp+LOCALS64+$Hcub+24],$a3 | ||
| 2476 | |||
| 2477 | add %sp,LOCALS64+$U2,$bp | ||
| 2478 | call __ecp_nistz256_sub_morf_vis3 ! p256_sub(res_y, U2, res_x); | ||
| 2479 | add %sp,LOCALS64+$res_y,$rp | ||
| 2480 | |||
| 2481 | add %sp,LOCALS64+$S1,$bp | ||
| 2482 | call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S2, S1, Hcub); | ||
| 2483 | add %sp,LOCALS64+$S2,$rp | ||
| 2484 | |||
| 2485 | ldx [%sp+LOCALS64+$R],$bi | ||
| 2486 | ldx [%sp+LOCALS64+$res_y],$a0 | ||
| 2487 | ldx [%sp+LOCALS64+$res_y+8],$a1 | ||
| 2488 | ldx [%sp+LOCALS64+$res_y+16],$a2 | ||
| 2489 | ldx [%sp+LOCALS64+$res_y+24],$a3 | ||
| 2490 | add %sp,LOCALS64+$R,$bp | ||
| 2491 | call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(res_y, res_y, R); | ||
| 2492 | add %sp,LOCALS64+$res_y,$rp | ||
| 2493 | |||
| 2494 | add %sp,LOCALS64+$S2,$bp | ||
| 2495 | call __ecp_nistz256_sub_from_vis3 ! p256_sub(res_y, res_y, S2); | ||
| 2496 | add %sp,LOCALS64+$res_y,$rp | ||
| 2497 | |||
| 2498 | ldx [%fp+STACK_BIAS-16],$t1 ! !in1infty | ||
| 2499 | ldx [%fp+STACK_BIAS-8],$t2 ! !in2infty | ||
| 2500 | ___ | ||
| 2501 | for($i=0;$i<96;$i+=16) { # conditional moves | ||
| 2502 | $code.=<<___; | ||
| 2503 | ldx [%sp+LOCALS64+$res_x+$i],$acc0 ! res | ||
| 2504 | ldx [%sp+LOCALS64+$res_x+$i+8],$acc1 | ||
| 2505 | ldx [%sp+LOCALS64+$in2_x+$i],$acc2 ! in2 | ||
| 2506 | ldx [%sp+LOCALS64+$in2_x+$i+8],$acc3 | ||
| 2507 | ldx [%sp+LOCALS64+$in1_x+$i],$acc4 ! in1 | ||
| 2508 | ldx [%sp+LOCALS64+$in1_x+$i+8],$acc5 | ||
| 2509 | movrz $t1,$acc2,$acc0 | ||
| 2510 | movrz $t1,$acc3,$acc1 | ||
| 2511 | movrz $t2,$acc4,$acc0 | ||
| 2512 | movrz $t2,$acc5,$acc1 | ||
| 2513 | srlx $acc0,32,$acc2 | ||
| 2514 | srlx $acc1,32,$acc3 | ||
| 2515 | st $acc0,[$rp_real+$i] | ||
| 2516 | st $acc2,[$rp_real+$i+4] | ||
| 2517 | st $acc1,[$rp_real+$i+8] | ||
| 2518 | st $acc3,[$rp_real+$i+12] | ||
| 2519 | ___ | ||
| 2520 | } | ||
| 2521 | $code.=<<___; | ||
| 2522 | .Ladd_done_vis3: | ||
| 2523 | ret | ||
| 2524 | restore | ||
| 2525 | .type ecp_nistz256_point_add_vis3,#function | ||
| 2526 | .size ecp_nistz256_point_add_vis3,.-ecp_nistz256_point_add_vis3 | ||
| 2527 | ___ | ||
| 2528 | } | ||
| 2529 | ######################################################################## | ||
| 2530 | # void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1, | ||
| 2531 | # const P256_POINT_AFFINE *in2); | ||
| 2532 | { | ||
| 2533 | my ($res_x,$res_y,$res_z, | ||
| 2534 | $in1_x,$in1_y,$in1_z, | ||
| 2535 | $in2_x,$in2_y, | ||
| 2536 | $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..14)); | ||
| 2537 | my $Z1sqr = $S2; | ||
| 2538 | # above map() describes stack layout with 15 temporary | ||
| 2539 | # 256-bit vectors on top. Then we reserve some space for | ||
| 2540 | # !in1infty and !in2infty. | ||
| 2541 | |||
| 2542 | $code.=<<___; | ||
| 2543 | .align 32 | ||
| 2544 | ecp_nistz256_point_add_affine_vis3: | ||
| 2545 | save %sp,-STACK64_FRAME-32*15-32,%sp | ||
| 2546 | |||
| 2547 | mov $rp,$rp_real | ||
| 2548 | mov -1,$minus1 | ||
| 2549 | mov -2,$poly3 | ||
| 2550 | sllx $minus1,32,$poly1 ! 0xFFFFFFFF00000000 | ||
| 2551 | srl $poly3,0,$poly3 ! 0x00000000FFFFFFFE | ||
| 2552 | |||
| 2553 | ! convert input to uint64_t[4] | ||
| 2554 | ld [$bp],$a0 ! in2_x | ||
| 2555 | ld [$bp+4],$t0 | ||
| 2556 | ld [$bp+8],$a1 | ||
| 2557 | ld [$bp+12],$t1 | ||
| 2558 | ld [$bp+16],$a2 | ||
| 2559 | ld [$bp+20],$t2 | ||
| 2560 | ld [$bp+24],$a3 | ||
| 2561 | ld [$bp+28],$t3 | ||
| 2562 | sllx $t0,32,$t0 | ||
| 2563 | sllx $t1,32,$t1 | ||
| 2564 | ld [$bp+32],$acc0 ! in2_y | ||
| 2565 | or $a0,$t0,$a0 | ||
| 2566 | ld [$bp+32+4],$t0 | ||
| 2567 | sllx $t2,32,$t2 | ||
| 2568 | ld [$bp+32+8],$acc1 | ||
| 2569 | or $a1,$t1,$a1 | ||
| 2570 | ld [$bp+32+12],$t1 | ||
| 2571 | sllx $t3,32,$t3 | ||
| 2572 | ld [$bp+32+16],$acc2 | ||
| 2573 | or $a2,$t2,$a2 | ||
| 2574 | ld [$bp+32+20],$t2 | ||
| 2575 | or $a3,$t3,$a3 | ||
| 2576 | ld [$bp+32+24],$acc3 | ||
| 2577 | sllx $t0,32,$t0 | ||
| 2578 | ld [$bp+32+28],$t3 | ||
| 2579 | sllx $t1,32,$t1 | ||
| 2580 | stx $a0,[%sp+LOCALS64+$in2_x] | ||
| 2581 | sllx $t2,32,$t2 | ||
| 2582 | stx $a1,[%sp+LOCALS64+$in2_x+8] | ||
| 2583 | sllx $t3,32,$t3 | ||
| 2584 | stx $a2,[%sp+LOCALS64+$in2_x+16] | ||
| 2585 | or $acc0,$t0,$acc0 | ||
| 2586 | stx $a3,[%sp+LOCALS64+$in2_x+24] | ||
| 2587 | or $acc1,$t1,$acc1 | ||
| 2588 | stx $acc0,[%sp+LOCALS64+$in2_y] | ||
| 2589 | or $acc2,$t2,$acc2 | ||
| 2590 | stx $acc1,[%sp+LOCALS64+$in2_y+8] | ||
| 2591 | or $acc3,$t3,$acc3 | ||
| 2592 | stx $acc2,[%sp+LOCALS64+$in2_y+16] | ||
| 2593 | stx $acc3,[%sp+LOCALS64+$in2_y+24] | ||
| 2594 | |||
| 2595 | or $a1,$a0,$a0 | ||
| 2596 | or $a3,$a2,$a2 | ||
| 2597 | or $acc1,$acc0,$acc0 | ||
| 2598 | or $acc3,$acc2,$acc2 | ||
| 2599 | or $a2,$a0,$a0 | ||
| 2600 | or $acc2,$acc0,$acc0 | ||
| 2601 | or $acc0,$a0,$a0 | ||
| 2602 | movrnz $a0,-1,$a0 ! !in2infty | ||
| 2603 | stx $a0,[%fp+STACK_BIAS-8] | ||
| 2604 | |||
| 2605 | ld [$ap],$a0 ! in1_x | ||
| 2606 | ld [$ap+4],$t0 | ||
| 2607 | ld [$ap+8],$a1 | ||
| 2608 | ld [$ap+12],$t1 | ||
| 2609 | ld [$ap+16],$a2 | ||
| 2610 | ld [$ap+20],$t2 | ||
| 2611 | ld [$ap+24],$a3 | ||
| 2612 | ld [$ap+28],$t3 | ||
| 2613 | sllx $t0,32,$t0 | ||
| 2614 | sllx $t1,32,$t1 | ||
| 2615 | ld [$ap+32],$acc0 ! in1_y | ||
| 2616 | or $a0,$t0,$a0 | ||
| 2617 | ld [$ap+32+4],$t0 | ||
| 2618 | sllx $t2,32,$t2 | ||
| 2619 | ld [$ap+32+8],$acc1 | ||
| 2620 | or $a1,$t1,$a1 | ||
| 2621 | ld [$ap+32+12],$t1 | ||
| 2622 | sllx $t3,32,$t3 | ||
| 2623 | ld [$ap+32+16],$acc2 | ||
| 2624 | or $a2,$t2,$a2 | ||
| 2625 | ld [$ap+32+20],$t2 | ||
| 2626 | or $a3,$t3,$a3 | ||
| 2627 | ld [$ap+32+24],$acc3 | ||
| 2628 | sllx $t0,32,$t0 | ||
| 2629 | ld [$ap+32+28],$t3 | ||
| 2630 | sllx $t1,32,$t1 | ||
| 2631 | stx $a0,[%sp+LOCALS64+$in1_x] | ||
| 2632 | sllx $t2,32,$t2 | ||
| 2633 | stx $a1,[%sp+LOCALS64+$in1_x+8] | ||
| 2634 | sllx $t3,32,$t3 | ||
| 2635 | stx $a2,[%sp+LOCALS64+$in1_x+16] | ||
| 2636 | or $acc0,$t0,$acc0 | ||
| 2637 | stx $a3,[%sp+LOCALS64+$in1_x+24] | ||
| 2638 | or $acc1,$t1,$acc1 | ||
| 2639 | stx $acc0,[%sp+LOCALS64+$in1_y] | ||
| 2640 | or $acc2,$t2,$acc2 | ||
| 2641 | stx $acc1,[%sp+LOCALS64+$in1_y+8] | ||
| 2642 | or $acc3,$t3,$acc3 | ||
| 2643 | stx $acc2,[%sp+LOCALS64+$in1_y+16] | ||
| 2644 | stx $acc3,[%sp+LOCALS64+$in1_y+24] | ||
| 2645 | |||
| 2646 | ld [$ap+64],$a0 ! in1_z | ||
| 2647 | ld [$ap+64+4],$t0 | ||
| 2648 | ld [$ap+64+8],$a1 | ||
| 2649 | ld [$ap+64+12],$t1 | ||
| 2650 | ld [$ap+64+16],$a2 | ||
| 2651 | ld [$ap+64+20],$t2 | ||
| 2652 | ld [$ap+64+24],$a3 | ||
| 2653 | ld [$ap+64+28],$t3 | ||
| 2654 | sllx $t0,32,$t0 | ||
| 2655 | sllx $t1,32,$t1 | ||
| 2656 | or $a0,$t0,$a0 | ||
| 2657 | sllx $t2,32,$t2 | ||
| 2658 | or $a1,$t1,$a1 | ||
| 2659 | sllx $t3,32,$t3 | ||
| 2660 | stx $a0,[%sp+LOCALS64+$in1_z] | ||
| 2661 | or $a2,$t2,$a2 | ||
| 2662 | stx $a1,[%sp+LOCALS64+$in1_z+8] | ||
| 2663 | or $a3,$t3,$a3 | ||
| 2664 | stx $a2,[%sp+LOCALS64+$in1_z+16] | ||
| 2665 | stx $a3,[%sp+LOCALS64+$in1_z+24] | ||
| 2666 | |||
| 2667 | or $a1,$a0,$t0 | ||
| 2668 | or $a3,$a2,$t2 | ||
| 2669 | or $t2,$t0,$t0 | ||
| 2670 | movrnz $t0,-1,$t0 ! !in1infty | ||
| 2671 | stx $t0,[%fp+STACK_BIAS-16] | ||
| 2672 | |||
| 2673 | call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Z1sqr, in1_z); | ||
| 2674 | add %sp,LOCALS64+$Z1sqr,$rp | ||
| 2675 | |||
| 2676 | ldx [%sp+LOCALS64+$in2_x],$bi | ||
| 2677 | mov $acc0,$a0 | ||
| 2678 | mov $acc1,$a1 | ||
| 2679 | mov $acc2,$a2 | ||
| 2680 | mov $acc3,$a3 | ||
| 2681 | add %sp,LOCALS64+$in2_x,$bp | ||
| 2682 | call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(U2, Z1sqr, in2_x); | ||
| 2683 | add %sp,LOCALS64+$U2,$rp | ||
| 2684 | |||
| 2685 | ldx [%sp+LOCALS64+$Z1sqr],$bi ! forward load | ||
| 2686 | ldx [%sp+LOCALS64+$in1_z],$a0 | ||
| 2687 | ldx [%sp+LOCALS64+$in1_z+8],$a1 | ||
| 2688 | ldx [%sp+LOCALS64+$in1_z+16],$a2 | ||
| 2689 | ldx [%sp+LOCALS64+$in1_z+24],$a3 | ||
| 2690 | |||
| 2691 | add %sp,LOCALS64+$in1_x,$bp | ||
| 2692 | call __ecp_nistz256_sub_from_vis3 ! p256_sub(H, U2, in1_x); | ||
| 2693 | add %sp,LOCALS64+$H,$rp | ||
| 2694 | |||
| 2695 | add %sp,LOCALS64+$Z1sqr,$bp | ||
| 2696 | call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S2, Z1sqr, in1_z); | ||
| 2697 | add %sp,LOCALS64+$S2,$rp | ||
| 2698 | |||
| 2699 | ldx [%sp+LOCALS64+$H],$bi | ||
| 2700 | ldx [%sp+LOCALS64+$in1_z],$a0 | ||
| 2701 | ldx [%sp+LOCALS64+$in1_z+8],$a1 | ||
| 2702 | ldx [%sp+LOCALS64+$in1_z+16],$a2 | ||
| 2703 | ldx [%sp+LOCALS64+$in1_z+24],$a3 | ||
| 2704 | add %sp,LOCALS64+$H,$bp | ||
| 2705 | call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(res_z, H, in1_z); | ||
| 2706 | add %sp,LOCALS64+$res_z,$rp | ||
| 2707 | |||
| 2708 | ldx [%sp+LOCALS64+$S2],$bi | ||
| 2709 | ldx [%sp+LOCALS64+$in2_y],$a0 | ||
| 2710 | ldx [%sp+LOCALS64+$in2_y+8],$a1 | ||
| 2711 | ldx [%sp+LOCALS64+$in2_y+16],$a2 | ||
| 2712 | ldx [%sp+LOCALS64+$in2_y+24],$a3 | ||
| 2713 | add %sp,LOCALS64+$S2,$bp | ||
| 2714 | call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S2, S2, in2_y); | ||
| 2715 | add %sp,LOCALS64+$S2,$rp | ||
| 2716 | |||
| 2717 | ldx [%sp+LOCALS64+$H],$a0 ! forward load | ||
| 2718 | ldx [%sp+LOCALS64+$H+8],$a1 | ||
| 2719 | ldx [%sp+LOCALS64+$H+16],$a2 | ||
| 2720 | ldx [%sp+LOCALS64+$H+24],$a3 | ||
| 2721 | |||
| 2722 | add %sp,LOCALS64+$in1_y,$bp | ||
| 2723 | call __ecp_nistz256_sub_from_vis3 ! p256_sub(R, S2, in1_y); | ||
| 2724 | add %sp,LOCALS64+$R,$rp | ||
| 2725 | |||
| 2726 | call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Hsqr, H); | ||
| 2727 | add %sp,LOCALS64+$Hsqr,$rp | ||
| 2728 | |||
| 2729 | ldx [%sp+LOCALS64+$R],$a0 | ||
| 2730 | ldx [%sp+LOCALS64+$R+8],$a1 | ||
| 2731 | ldx [%sp+LOCALS64+$R+16],$a2 | ||
| 2732 | ldx [%sp+LOCALS64+$R+24],$a3 | ||
| 2733 | call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Rsqr, R); | ||
| 2734 | add %sp,LOCALS64+$Rsqr,$rp | ||
| 2735 | |||
| 2736 | ldx [%sp+LOCALS64+$H],$bi | ||
| 2737 | ldx [%sp+LOCALS64+$Hsqr],$a0 | ||
| 2738 | ldx [%sp+LOCALS64+$Hsqr+8],$a1 | ||
| 2739 | ldx [%sp+LOCALS64+$Hsqr+16],$a2 | ||
| 2740 | ldx [%sp+LOCALS64+$Hsqr+24],$a3 | ||
| 2741 | add %sp,LOCALS64+$H,$bp | ||
| 2742 | call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(Hcub, Hsqr, H); | ||
| 2743 | add %sp,LOCALS64+$Hcub,$rp | ||
| 2744 | |||
| 2745 | ldx [%sp+LOCALS64+$Hsqr],$bi | ||
| 2746 | ldx [%sp+LOCALS64+$in1_x],$a0 | ||
| 2747 | ldx [%sp+LOCALS64+$in1_x+8],$a1 | ||
| 2748 | ldx [%sp+LOCALS64+$in1_x+16],$a2 | ||
| 2749 | ldx [%sp+LOCALS64+$in1_x+24],$a3 | ||
| 2750 | add %sp,LOCALS64+$Hsqr,$bp | ||
| 2751 | call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(U2, in1_x, Hsqr); | ||
| 2752 | add %sp,LOCALS64+$U2,$rp | ||
| 2753 | |||
| 2754 | call __ecp_nistz256_mul_by_2_vis3 ! p256_mul_by_2(Hsqr, U2); | ||
| 2755 | add %sp,LOCALS64+$Hsqr,$rp | ||
| 2756 | |||
| 2757 | add %sp,LOCALS64+$Rsqr,$bp | ||
| 2758 | call __ecp_nistz256_sub_morf_vis3 ! p256_sub(res_x, Rsqr, Hsqr); | ||
| 2759 | add %sp,LOCALS64+$res_x,$rp | ||
| 2760 | |||
| 2761 | add %sp,LOCALS64+$Hcub,$bp | ||
| 2762 | call __ecp_nistz256_sub_from_vis3 ! p256_sub(res_x, res_x, Hcub); | ||
| 2763 | add %sp,LOCALS64+$res_x,$rp | ||
| 2764 | |||
| 2765 | ldx [%sp+LOCALS64+$Hcub],$bi ! forward load | ||
| 2766 | ldx [%sp+LOCALS64+$in1_y],$a0 | ||
| 2767 | ldx [%sp+LOCALS64+$in1_y+8],$a1 | ||
| 2768 | ldx [%sp+LOCALS64+$in1_y+16],$a2 | ||
| 2769 | ldx [%sp+LOCALS64+$in1_y+24],$a3 | ||
| 2770 | |||
| 2771 | add %sp,LOCALS64+$U2,$bp | ||
| 2772 | call __ecp_nistz256_sub_morf_vis3 ! p256_sub(res_y, U2, res_x); | ||
| 2773 | add %sp,LOCALS64+$res_y,$rp | ||
| 2774 | |||
| 2775 | add %sp,LOCALS64+$Hcub,$bp | ||
| 2776 | call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S2, in1_y, Hcub); | ||
| 2777 | add %sp,LOCALS64+$S2,$rp | ||
| 2778 | |||
| 2779 | ldx [%sp+LOCALS64+$R],$bi | ||
| 2780 | ldx [%sp+LOCALS64+$res_y],$a0 | ||
| 2781 | ldx [%sp+LOCALS64+$res_y+8],$a1 | ||
| 2782 | ldx [%sp+LOCALS64+$res_y+16],$a2 | ||
| 2783 | ldx [%sp+LOCALS64+$res_y+24],$a3 | ||
| 2784 | add %sp,LOCALS64+$R,$bp | ||
| 2785 | call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(res_y, res_y, R); | ||
| 2786 | add %sp,LOCALS64+$res_y,$rp | ||
| 2787 | |||
| 2788 | add %sp,LOCALS64+$S2,$bp | ||
| 2789 | call __ecp_nistz256_sub_from_vis3 ! p256_sub(res_y, res_y, S2); | ||
| 2790 | add %sp,LOCALS64+$res_y,$rp | ||
| 2791 | |||
| 2792 | ldx [%fp+STACK_BIAS-16],$t1 ! !in1infty | ||
| 2793 | ldx [%fp+STACK_BIAS-8],$t2 ! !in2infty | ||
| 2794 | 1: call .+8 | ||
| 2795 | add %o7,.Lone_mont_vis3-1b,$bp | ||
| 2796 | ___ | ||
| 2797 | for($i=0;$i<64;$i+=16) { # conditional moves | ||
| 2798 | $code.=<<___; | ||
| 2799 | ldx [%sp+LOCALS64+$res_x+$i],$acc0 ! res | ||
| 2800 | ldx [%sp+LOCALS64+$res_x+$i+8],$acc1 | ||
| 2801 | ldx [%sp+LOCALS64+$in2_x+$i],$acc2 ! in2 | ||
| 2802 | ldx [%sp+LOCALS64+$in2_x+$i+8],$acc3 | ||
| 2803 | ldx [%sp+LOCALS64+$in1_x+$i],$acc4 ! in1 | ||
| 2804 | ldx [%sp+LOCALS64+$in1_x+$i+8],$acc5 | ||
| 2805 | movrz $t1,$acc2,$acc0 | ||
| 2806 | movrz $t1,$acc3,$acc1 | ||
| 2807 | movrz $t2,$acc4,$acc0 | ||
| 2808 | movrz $t2,$acc5,$acc1 | ||
| 2809 | srlx $acc0,32,$acc2 | ||
| 2810 | srlx $acc1,32,$acc3 | ||
| 2811 | st $acc0,[$rp_real+$i] | ||
| 2812 | st $acc2,[$rp_real+$i+4] | ||
| 2813 | st $acc1,[$rp_real+$i+8] | ||
| 2814 | st $acc3,[$rp_real+$i+12] | ||
| 2815 | ___ | ||
| 2816 | } | ||
| 2817 | for(;$i<96;$i+=16) { | ||
| 2818 | $code.=<<___; | ||
| 2819 | ldx [%sp+LOCALS64+$res_x+$i],$acc0 ! res | ||
| 2820 | ldx [%sp+LOCALS64+$res_x+$i+8],$acc1 | ||
| 2821 | ldx [$bp+$i-64],$acc2 ! "in2" | ||
| 2822 | ldx [$bp+$i-64+8],$acc3 | ||
| 2823 | ldx [%sp+LOCALS64+$in1_x+$i],$acc4 ! in1 | ||
| 2824 | ldx [%sp+LOCALS64+$in1_x+$i+8],$acc5 | ||
| 2825 | movrz $t1,$acc2,$acc0 | ||
| 2826 | movrz $t1,$acc3,$acc1 | ||
| 2827 | movrz $t2,$acc4,$acc0 | ||
| 2828 | movrz $t2,$acc5,$acc1 | ||
| 2829 | srlx $acc0,32,$acc2 | ||
| 2830 | srlx $acc1,32,$acc3 | ||
| 2831 | st $acc0,[$rp_real+$i] | ||
| 2832 | st $acc2,[$rp_real+$i+4] | ||
| 2833 | st $acc1,[$rp_real+$i+8] | ||
| 2834 | st $acc3,[$rp_real+$i+12] | ||
| 2835 | ___ | ||
| 2836 | } | ||
| 2837 | $code.=<<___; | ||
| 2838 | ret | ||
| 2839 | restore | ||
| 2840 | .type ecp_nistz256_point_add_affine_vis3,#function | ||
| 2841 | .size ecp_nistz256_point_add_affine_vis3,.-ecp_nistz256_point_add_affine_vis3 | ||
| 2842 | .align 64 | ||
| 2843 | .Lone_mont_vis3: | ||
| 2844 | .long 0x00000000,0x00000001, 0xffffffff,0x00000000 | ||
| 2845 | .long 0xffffffff,0xffffffff, 0x00000000,0xfffffffe | ||
| 2846 | .align 64 | ||
| 2847 | #endif | ||
| 2848 | ___ | ||
| 2849 | } }}} | ||
| 2850 | |||
| 2851 | # The purpose of these subroutines is to encode VIS instructions | ||
| 2852 | # explicitly, so that the module can be compiled without specifying VIS | ||
| 2853 | # extensions on the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a. | ||
| 2854 | # The idea is to keep open the option of producing a "universal" binary | ||
| 2855 | # and let the programmer detect at run-time whether the CPU is VIS capable. | ||
| 2856 | sub unvis3 { | ||
| 2857 | my ($mnemonic,$rs1,$rs2,$rd)=@_; | ||
| 2858 | my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 ); | ||
| 2859 | my ($ref,$opf); | ||
| 2860 | my %visopf = ( "addxc" => 0x011, | ||
| 2861 | "addxccc" => 0x013, | ||
| 2862 | "umulxhi" => 0x016 ); | ||
| 2863 | |||
| 2864 | $ref = "$mnemonic\t$rs1,$rs2,$rd"; | ||
| 2865 | |||
| 2866 | if ($opf=$visopf{$mnemonic}) { | ||
| 2867 | foreach ($rs1,$rs2,$rd) { | ||
| 2868 | return $ref if (!/%([goli])([0-9])/); | ||
| 2869 | $_=$bias{$1}+$2; | ||
| 2870 | } | ||
| 2871 | |||
| 2872 | return sprintf ".word\t0x%08x !%s", | ||
| 2873 | 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2, | ||
| 2874 | $ref; | ||
| 2875 | } else { | ||
| 2876 | return $ref; | ||
| 2877 | } | ||
| 2878 | } | ||
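For reference, the encoding performed by unvis3() above can be reproduced standalone; a minimal sketch (the helper name encode_vis3 and the sample operands are ours, not the module's):

```perl
#!/usr/bin/env perl
# Standalone sketch of the VIS3 encoding done by unvis3() above:
# .word = 0x81b00000 | rd<<25 | rs1<<14 | opf<<5 | rs2, with integer
# registers numbered %g0-7=0-7, %o0-7=8-15, %l0-7=16-23, %i0-7=24-31.
use strict;
use warnings;

my %bias   = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my %visopf = ( "addxc" => 0x011, "addxccc" => 0x013, "umulxhi" => 0x016 );

sub encode_vis3 {
    my ($mnemonic, @regs) = @_;
    my @n = map {
        my ($c, $d) = /%([goli])([0-7])/ or die "bad register: $_";
        $bias{$c} + $d;
    } @regs;
    my ($rs1, $rs2, $rd) = @n;
    return 0x81b00000 | $rd << 25 | $rs1 << 14 | $visopf{$mnemonic} << 5 | $rs2;
}

# "addxc %o0,%o1,%o2" encodes as .word 0x95b20229
printf ".word\t0x%08x\n", encode_vis3("addxc", "%o0", "%o1", "%o2");
```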
| 2879 | |||
| 2880 | foreach (split("\n",$code)) { | ||
| 2881 | s/\`([^\`]*)\`/eval $1/ge; | ||
| 2882 | |||
| 2883 | s/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/ | ||
| 2884 | &unvis3($1,$2,$3,$4) | ||
| 2885 | /ge; | ||
| 2886 | |||
| 2887 | print $_,"\n"; | ||
| 2888 | } | ||
| 2889 | |||
| 2890 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/ec/asm/ecp_nistz256-x86.pl b/src/lib/libcrypto/ec/asm/ecp_nistz256-x86.pl deleted file mode 100644 index 085d637e5d..0000000000 --- a/src/lib/libcrypto/ec/asm/ecp_nistz256-x86.pl +++ /dev/null | |||
| @@ -1,1740 +0,0 @@ | |||
| 1 | #! /usr/bin/env perl | ||
| 2 | # $OpenBSD: ecp_nistz256-x86.pl,v 1.1 2016/11/04 17:33:20 miod Exp $ | ||
| 3 | # | ||
| 4 | # Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved. | ||
| 5 | # | ||
| 6 | # Licensed under the OpenSSL license (the "License"). You may not use | ||
| 7 | # this file except in compliance with the License. You can obtain a copy | ||
| 8 | # in the file LICENSE in the source distribution or at | ||
| 9 | # https://www.openssl.org/source/license.html | ||
| 10 | |||
| 11 | |||
| 12 | # ==================================================================== | ||
| 13 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
| 14 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 15 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 16 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 17 | # ==================================================================== | ||
| 18 | # | ||
| 19 | # ECP_NISTZ256 module for x86/SSE2. | ||
| 20 | # | ||
| 21 | # October 2014. | ||
| 22 | # | ||
| 23 | # Original ECP_NISTZ256 submission targeting x86_64 is detailed in | ||
| 24 | # http://eprint.iacr.org/2013/816. In the process of adaptation | ||
| 25 | # original .c module was made 32-bit savvy in order to make this | ||
| 26 | # implementation possible. | ||
| 27 | # | ||
| 28 | # with/without -DECP_NISTZ256_ASM | ||
| 29 | # Pentium +66-163% | ||
| 30 | # PIII +72-172% | ||
| 31 | # P4 +65-132% | ||
| 32 | # Core2 +90-215% | ||
| 33 | # Sandy Bridge +105-265% (contemporary i[57]-* are all close to this) | ||
| 34 | # Atom +65-155% | ||
| 35 | # Opteron +54-110% | ||
| 36 | # Bulldozer +99-240% | ||
| 37 | # VIA Nano +93-290% | ||
| 38 | # | ||
| 39 | # Ranges denote minimum and maximum improvement coefficients depending | ||
| 40 | # on benchmark. Lower coefficients are for ECDSA sign, server-side | ||
| 41 | # operation. Keep in mind that +200% means 3x improvement. | ||
| 42 | |||
| 43 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
| 44 | push(@INC,"${dir}","${dir}../../perlasm"); | ||
| 45 | require "x86asm.pl"; | ||
| 46 | |||
| 47 | # Uncomment when all i386 assembly generators are updated to take the output | ||
| 48 | # file as last argument... | ||
| 49 | # $output=pop; | ||
| 50 | # open STDOUT,">$output"; | ||
| 51 | |||
| 52 | &asm_init($ARGV[0],"ecp_nistz256-x86.pl",$ARGV[$#ARGV] eq "386"); | ||
| 53 | |||
| 54 | $sse2=0; | ||
| 55 | for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } | ||
| 56 | |||
| 57 | &external_label("OPENSSL_ia32cap_P") if ($sse2); | ||
| 58 | |||
| 59 | |||
| 60 | ######################################################################## | ||
| 61 | # Keep in mind that constants are stored least to most significant word | ||
| 62 | &static_label("ONE"); | ||
| 63 | &set_label("ONE",64); | ||
| 64 | &data_word(1,0,0,0,0,0,0,0); | ||
| 65 | &align(64); | ||
| 66 | |||
| 67 | ######################################################################## | ||
| 68 | # void ecp_nistz256_mul_by_2(BN_ULONG edi[8],const BN_ULONG esi[8]); | ||
| 69 | &function_begin("ecp_nistz256_mul_by_2"); | ||
| 70 | &mov ("esi",&wparam(1)); | ||
| 71 | &mov ("edi",&wparam(0)); | ||
| 72 | &mov ("ebp","esi"); | ||
| 73 | ######################################################################## | ||
| 74 | # common pattern for internal functions is that %edi is result pointer, | ||
| 75 | # %esi and %ebp are input ones, %ebp being optional. %edi is preserved. | ||
| 76 | &call ("_ecp_nistz256_add"); | ||
| 77 | &function_end("ecp_nistz256_mul_by_2"); | ||
| 78 | |||
| 79 | ######################################################################## | ||
| 80 | # void ecp_nistz256_div_by_2(BN_ULONG edi[8],const BN_ULONG esi[8]); | ||
| 81 | &function_begin("ecp_nistz256_div_by_2"); | ||
| 82 | &mov ("esi",&wparam(1)); | ||
| 83 | &mov ("edi",&wparam(0)); | ||
| 84 | &call ("_ecp_nistz256_div_by_2"); | ||
| 85 | &function_end("ecp_nistz256_div_by_2"); | ||
| 86 | |||
| 87 | &function_begin_B("_ecp_nistz256_div_by_2"); | ||
| 88 | # tmp = a is odd ? a+mod : a | ||
| 89 | # | ||
| 90 | # note that because mod has special form, i.e. consists of | ||
| 91 | # 0xffffffff, 1 and 0s, we can conditionally synthesize it by | ||
| 92 | # assigning least significant bit of input to one register, | ||
| 93 | # %ebp, and its negative to another, %edx. | ||
| 94 | |||
| 95 | &mov ("ebp",&DWP(0,"esi")); | ||
| 96 | &xor ("edx","edx"); | ||
| 97 | &mov ("ebx",&DWP(4,"esi")); | ||
| 98 | &mov ("eax","ebp"); | ||
| 99 | &and ("ebp",1); | ||
| 100 | &mov ("ecx",&DWP(8,"esi")); | ||
| 101 | &sub ("edx","ebp"); | ||
| 102 | |||
| 103 | &add ("eax","edx"); | ||
| 104 | &adc ("ebx","edx"); | ||
| 105 | &mov (&DWP(0,"edi"),"eax"); | ||
| 106 | &adc ("ecx","edx"); | ||
| 107 | &mov (&DWP(4,"edi"),"ebx"); | ||
| 108 | &mov (&DWP(8,"edi"),"ecx"); | ||
| 109 | |||
| 110 | &mov ("eax",&DWP(12,"esi")); | ||
| 111 | &mov ("ebx",&DWP(16,"esi")); | ||
| 112 | &adc ("eax",0); | ||
| 113 | &mov ("ecx",&DWP(20,"esi")); | ||
| 114 | &adc ("ebx",0); | ||
| 115 | &mov (&DWP(12,"edi"),"eax"); | ||
| 116 | &adc ("ecx",0); | ||
| 117 | &mov (&DWP(16,"edi"),"ebx"); | ||
| 118 | &mov (&DWP(20,"edi"),"ecx"); | ||
| 119 | |||
| 120 | &mov ("eax",&DWP(24,"esi")); | ||
| 121 | &mov ("ebx",&DWP(28,"esi")); | ||
| 122 | &adc ("eax","ebp"); | ||
| 123 | &adc ("ebx","edx"); | ||
| 124 | &mov (&DWP(24,"edi"),"eax"); | ||
| 125 | &sbb ("esi","esi"); # broadcast carry bit | ||
| 126 | &mov (&DWP(28,"edi"),"ebx"); | ||
| 127 | |||
| 128 | # ret = tmp >> 1 | ||
| 129 | |||
| 130 | &mov ("eax",&DWP(0,"edi")); | ||
| 131 | &mov ("ebx",&DWP(4,"edi")); | ||
| 132 | &mov ("ecx",&DWP(8,"edi")); | ||
| 133 | &mov ("edx",&DWP(12,"edi")); | ||
| 134 | |||
| 135 | &shr ("eax",1); | ||
| 136 | &mov ("ebp","ebx"); | ||
| 137 | &shl ("ebx",31); | ||
| 138 | &or ("eax","ebx"); | ||
| 139 | |||
| 140 | &shr ("ebp",1); | ||
| 141 | &mov ("ebx","ecx"); | ||
| 142 | &shl ("ecx",31); | ||
| 143 | &mov (&DWP(0,"edi"),"eax"); | ||
| 144 | &or ("ebp","ecx"); | ||
| 145 | &mov ("eax",&DWP(16,"edi")); | ||
| 146 | |||
| 147 | &shr ("ebx",1); | ||
| 148 | &mov ("ecx","edx"); | ||
| 149 | &shl ("edx",31); | ||
| 150 | &mov (&DWP(4,"edi"),"ebp"); | ||
| 151 | &or ("ebx","edx"); | ||
| 152 | &mov ("ebp",&DWP(20,"edi")); | ||
| 153 | |||
| 154 | &shr ("ecx",1); | ||
| 155 | &mov ("edx","eax"); | ||
| 156 | &shl ("eax",31); | ||
| 157 | &mov (&DWP(8,"edi"),"ebx"); | ||
| 158 | &or ("ecx","eax"); | ||
| 159 | &mov ("ebx",&DWP(24,"edi")); | ||
| 160 | |||
| 161 | &shr ("edx",1); | ||
| 162 | &mov ("eax","ebp"); | ||
| 163 | &shl ("ebp",31); | ||
| 164 | &mov (&DWP(12,"edi"),"ecx"); | ||
| 165 | &or ("edx","ebp"); | ||
| 166 | &mov ("ecx",&DWP(28,"edi")); | ||
| 167 | |||
| 168 | &shr ("eax",1); | ||
| 169 | &mov ("ebp","ebx"); | ||
| 170 | &shl ("ebx",31); | ||
| 171 | &mov (&DWP(16,"edi"),"edx"); | ||
| 172 | &or ("eax","ebx"); | ||
| 173 | |||
| 174 | &shr ("ebp",1); | ||
| 175 | &mov ("ebx","ecx"); | ||
| 176 | &shl ("ecx",31); | ||
| 177 | &mov (&DWP(20,"edi"),"eax"); | ||
| 178 | &or ("ebp","ecx"); | ||
| 179 | |||
| 180 | &shr ("ebx",1); | ||
| 181 | &shl ("esi",31); | ||
| 182 | &mov (&DWP(24,"edi"),"ebp"); | ||
| 183 | &or ("ebx","esi"); # handle top-most carry bit | ||
| 184 | &mov (&DWP(28,"edi"),"ebx"); | ||
| 185 | |||
| 186 | &ret (); | ||
| 187 | &function_end_B("_ecp_nistz256_div_by_2"); | ||
| 188 | |||
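In bignum terms, the halving trick above is `a odd ? (a + p) >> 1 : a >> 1`, which must agree with multiplication by the inverse of two mod p; a minimal Math::BigInt sketch checking this (the test value is arbitrary):

```perl
#!/usr/bin/env perl
# Check that (a odd ? a + p : a) >> 1 equals a * 2^-1 mod p for P-256.
use strict;
use warnings;
use Math::BigInt;

my $p = Math::BigInt->new(
    "0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff");
my $a = Math::BigInt->new("0x123456789abcdef")->bmod($p);  # arbitrary input

# The assembly's approach: conditionally add p, then shift right by one.
my $half = $a->copy;
$half->badd($p) if $half->is_odd;    # a + p is even, and (a + p)/2 < p
$half->brsft(1);

# Reference result: multiply by the inverse of 2 mod p.
my $ref = $a->copy->bmul(Math::BigInt->new(2)->bmodinv($p))->bmod($p);

print $half == $ref ? "ok\n" : "MISMATCH\n";
```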
| 189 | ######################################################################## | ||
| 190 | # void ecp_nistz256_add(BN_ULONG edi[8],const BN_ULONG esi[8], | ||
| 191 | # const BN_ULONG ebp[8]); | ||
| 192 | &function_begin("ecp_nistz256_add"); | ||
| 193 | &mov ("esi",&wparam(1)); | ||
| 194 | &mov ("ebp",&wparam(2)); | ||
| 195 | &mov ("edi",&wparam(0)); | ||
| 196 | &call ("_ecp_nistz256_add"); | ||
| 197 | &function_end("ecp_nistz256_add"); | ||
| 198 | |||
| 199 | &function_begin_B("_ecp_nistz256_add"); | ||
| 200 | &mov ("eax",&DWP(0,"esi")); | ||
| 201 | &mov ("ebx",&DWP(4,"esi")); | ||
| 202 | &mov ("ecx",&DWP(8,"esi")); | ||
| 203 | &add ("eax",&DWP(0,"ebp")); | ||
| 204 | &mov ("edx",&DWP(12,"esi")); | ||
| 205 | &adc ("ebx",&DWP(4,"ebp")); | ||
| 206 | &mov (&DWP(0,"edi"),"eax"); | ||
| 207 | &adc ("ecx",&DWP(8,"ebp")); | ||
| 208 | &mov (&DWP(4,"edi"),"ebx"); | ||
| 209 | &adc ("edx",&DWP(12,"ebp")); | ||
| 210 | &mov (&DWP(8,"edi"),"ecx"); | ||
| 211 | &mov (&DWP(12,"edi"),"edx"); | ||
| 212 | |||
| 213 | &mov ("eax",&DWP(16,"esi")); | ||
| 214 | &mov ("ebx",&DWP(20,"esi")); | ||
| 215 | &mov ("ecx",&DWP(24,"esi")); | ||
| 216 | &adc ("eax",&DWP(16,"ebp")); | ||
| 217 | &mov ("edx",&DWP(28,"esi")); | ||
| 218 | &adc ("ebx",&DWP(20,"ebp")); | ||
| 219 | &mov (&DWP(16,"edi"),"eax"); | ||
| 220 | &adc ("ecx",&DWP(24,"ebp")); | ||
| 221 | &mov (&DWP(20,"edi"),"ebx"); | ||
| 222 | &mov ("esi",0); | ||
| 223 | &adc ("edx",&DWP(28,"ebp")); | ||
| 224 | &mov (&DWP(24,"edi"),"ecx"); | ||
| 225 | &adc ("esi",0); | ||
| 226 | &mov (&DWP(28,"edi"),"edx"); | ||
| 227 | |||
| 228 | # if a+b >= modulus, subtract modulus. | ||
| 229 | # | ||
| 230 | # But since comparison implies subtraction, we subtract modulus | ||
| 231 | # to see if it borrows, and then subtract it for real if | ||
| 232 | # subtraction didn't borrow. | ||
| 233 | |||
| 234 | &mov ("eax",&DWP(0,"edi")); | ||
| 235 | &mov ("ebx",&DWP(4,"edi")); | ||
| 236 | &mov ("ecx",&DWP(8,"edi")); | ||
| 237 | &sub ("eax",-1); | ||
| 238 | &mov ("edx",&DWP(12,"edi")); | ||
| 239 | &sbb ("ebx",-1); | ||
| 240 | &mov ("eax",&DWP(16,"edi")); | ||
| 241 | &sbb ("ecx",-1); | ||
| 242 | &mov ("ebx",&DWP(20,"edi")); | ||
| 243 | &sbb ("edx",0); | ||
| 244 | &mov ("ecx",&DWP(24,"edi")); | ||
| 245 | &sbb ("eax",0); | ||
| 246 | &mov ("edx",&DWP(28,"edi")); | ||
| 247 | &sbb ("ebx",0); | ||
| 248 | &sbb ("ecx",1); | ||
| 249 | &sbb ("edx",-1); | ||
| 250 | &sbb ("esi",0); | ||
| 251 | |||
| 252 | # Note that because mod has special form, i.e. consists of | ||
| 253 | # 0xffffffff, 1 and 0s, we can conditionally synthesize it by | ||
| 254 | # using the borrow. | ||
| 255 | |||
| 256 | ¬ ("esi"); | ||
| 257 | &mov ("eax",&DWP(0,"edi")); | ||
| 258 | &mov ("ebp","esi"); | ||
| 259 | &mov ("ebx",&DWP(4,"edi")); | ||
| 260 | &shr ("ebp",31); | ||
| 261 | &mov ("ecx",&DWP(8,"edi")); | ||
| 262 | &sub ("eax","esi"); | ||
| 263 | &mov ("edx",&DWP(12,"edi")); | ||
| 264 | &sbb ("ebx","esi"); | ||
| 265 | &mov (&DWP(0,"edi"),"eax"); | ||
| 266 | &sbb ("ecx","esi"); | ||
| 267 | &mov (&DWP(4,"edi"),"ebx"); | ||
| 268 | &sbb ("edx",0); | ||
| 269 | &mov (&DWP(8,"edi"),"ecx"); | ||
| 270 | &mov (&DWP(12,"edi"),"edx"); | ||
| 271 | |||
| 272 | &mov ("eax",&DWP(16,"edi")); | ||
| 273 | &mov ("ebx",&DWP(20,"edi")); | ||
| 274 | &mov ("ecx",&DWP(24,"edi")); | ||
| 275 | &sbb ("eax",0); | ||
| 276 | &mov ("edx",&DWP(28,"edi")); | ||
| 277 | &sbb ("ebx",0); | ||
| 278 | &mov (&DWP(16,"edi"),"eax"); | ||
| 279 | &sbb ("ecx","ebp"); | ||
| 280 | &mov (&DWP(20,"edi"),"ebx"); | ||
| 281 | &sbb ("edx","esi"); | ||
| 282 | &mov (&DWP(24,"edi"),"ecx"); | ||
| 283 | &mov (&DWP(28,"edi"),"edx"); | ||
| 284 | |||
| 285 | &ret (); | ||
| 286 | &function_end_B("_ecp_nistz256_add"); | ||
| 287 | |||
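What the generated code computes is (a + b) mod p with a single conditional subtraction of the modulus, driven by the borrow of the trial subtraction described in the comment; a minimal Math::BigInt sketch of the same arithmetic (test values are arbitrary):

```perl
#!/usr/bin/env perl
# (a + b) mod p via one conditional subtraction, as in _ecp_nistz256_add.
use strict;
use warnings;
use Math::BigInt;

my $p = Math::BigInt->new(
    "0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff");
my $a = $p->copy->bsub(1);      # p - 1, forces the subtraction path
my $b = Math::BigInt->new(5);

my $r = $a->copy->badd($b);     # full-width sum; a,b < p so r <= 2p - 2
$r->bsub($p) if $r >= $p;       # the asm learns r >= p from the trial borrow

print $r == $a->copy->badd($b)->bmod($p) ? "ok\n" : "MISMATCH\n";
```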
| 288 | ######################################################################## | ||
| 289 | # void ecp_nistz256_sub(BN_ULONG edi[8],const BN_ULONG esi[8], | ||
| 290 | # const BN_ULONG ebp[8]); | ||
| 291 | &function_begin("ecp_nistz256_sub"); | ||
| 292 | &mov ("esi",&wparam(1)); | ||
| 293 | &mov ("ebp",&wparam(2)); | ||
| 294 | &mov ("edi",&wparam(0)); | ||
| 295 | &call ("_ecp_nistz256_sub"); | ||
| 296 | &function_end("ecp_nistz256_sub"); | ||
| 297 | |||
| 298 | &function_begin_B("_ecp_nistz256_sub"); | ||
| 299 | &mov ("eax",&DWP(0,"esi")); | ||
| 300 | &mov ("ebx",&DWP(4,"esi")); | ||
| 301 | &mov ("ecx",&DWP(8,"esi")); | ||
| 302 | &sub ("eax",&DWP(0,"ebp")); | ||
| 303 | &mov ("edx",&DWP(12,"esi")); | ||
| 304 | &sbb ("ebx",&DWP(4,"ebp")); | ||
| 305 | &mov (&DWP(0,"edi"),"eax"); | ||
| 306 | &sbb ("ecx",&DWP(8,"ebp")); | ||
| 307 | &mov (&DWP(4,"edi"),"ebx"); | ||
| 308 | &sbb ("edx",&DWP(12,"ebp")); | ||
| 309 | &mov (&DWP(8,"edi"),"ecx"); | ||
| 310 | &mov (&DWP(12,"edi"),"edx"); | ||
| 311 | |||
| 312 | &mov ("eax",&DWP(16,"esi")); | ||
| 313 | &mov ("ebx",&DWP(20,"esi")); | ||
| 314 | &mov ("ecx",&DWP(24,"esi")); | ||
| 315 | &sbb ("eax",&DWP(16,"ebp")); | ||
| 316 | &mov ("edx",&DWP(28,"esi")); | ||
| 317 | &sbb ("ebx",&DWP(20,"ebp")); | ||
| 318 | &sbb ("ecx",&DWP(24,"ebp")); | ||
| 319 | &mov (&DWP(16,"edi"),"eax"); | ||
| 320 | &sbb ("edx",&DWP(28,"ebp")); | ||
| 321 | &mov (&DWP(20,"edi"),"ebx"); | ||
| 322 | &sbb ("esi","esi"); # broadcast borrow bit | ||
| 323 | &mov (&DWP(24,"edi"),"ecx"); | ||
| 324 | &mov (&DWP(28,"edi"),"edx"); | ||
| 325 | |||
| 326 | # if a-b borrows, add modulus. | ||
| 327 | # | ||
| 328 | # Note that because mod has special form, i.e. consists of | ||
| 329 | # 0xffffffff, 1 and 0s, we can conditionally synthesize it by | ||
| 330 | # assigning borrow bit to one register, %ebp, and its negative | ||
| 331 | # to another, %esi. But we started by calculating %esi... | ||
| 332 | |||
| 333 | &mov ("eax",&DWP(0,"edi")); | ||
| 334 | &mov ("ebp","esi"); | ||
| 335 | &mov ("ebx",&DWP(4,"edi")); | ||
| 336 | &shr ("ebp",31); | ||
| 337 | &mov ("ecx",&DWP(8,"edi")); | ||
| 338 | &add ("eax","esi"); | ||
| 339 | &mov ("edx",&DWP(12,"edi")); | ||
| 340 | &adc ("ebx","esi"); | ||
| 341 | &mov (&DWP(0,"edi"),"eax"); | ||
| 342 | &adc ("ecx","esi"); | ||
| 343 | &mov (&DWP(4,"edi"),"ebx"); | ||
| 344 | &adc ("edx",0); | ||
| 345 | &mov (&DWP(8,"edi"),"ecx"); | ||
| 346 | &mov (&DWP(12,"edi"),"edx"); | ||
| 347 | |||
| 348 | &mov ("eax",&DWP(16,"edi")); | ||
| 349 | &mov ("ebx",&DWP(20,"edi")); | ||
| 350 | &mov ("ecx",&DWP(24,"edi")); | ||
| 351 | &adc ("eax",0); | ||
| 352 | &mov ("edx",&DWP(28,"edi")); | ||
| 353 | &adc ("ebx",0); | ||
| 354 | &mov (&DWP(16,"edi"),"eax"); | ||
| 355 | &adc ("ecx","ebp"); | ||
| 356 | &mov (&DWP(20,"edi"),"ebx"); | ||
| 357 | &adc ("edx","esi"); | ||
| 358 | &mov (&DWP(24,"edi"),"ecx"); | ||
| 359 | &mov (&DWP(28,"edi"),"edx"); | ||
| 360 | |||
| 361 | &ret (); | ||
| 362 | &function_end_B("_ecp_nistz256_sub"); | ||
| 363 | |||
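The "synthesize the modulus from the borrow" remark amounts to this: with mask = 0 - borrow, the word pattern (mask, mask, mask, 0, 0, 0, mask >> 31, mask), least significant word first, equals borrow * p, which is exactly what the code adds back. A minimal sketch checking that claim:

```perl
#!/usr/bin/env perl
# Verify that the masked word pattern used by _ecp_nistz256_sub equals b*p.
use strict;
use warnings;
use Math::BigInt;

my $p = Math::BigInt->new(
    "0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff");

for my $borrow (0, 1) {
    my $mask = $borrow ? 0xffffffff : 0;    # %esi: 0 - borrow, 32-bit
    my $top  = $mask >> 31;                 # %ebp: 0 or 1
    # Words least-significant first, as they are added back in the code.
    my @w = ($mask, $mask, $mask, 0, 0, 0, $top, $mask);
    my $v = Math::BigInt->new(0);
    $v->blsft(32)->badd($w[$_]) for reverse 0 .. 7;
    die "mismatch for borrow=$borrow" unless $v == $p->copy->bmul($borrow);
}
print "ok\n";
```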
| 364 | ######################################################################## | ||
| 365 | # void ecp_nistz256_neg(BN_ULONG edi[8],const BN_ULONG esi[8]); | ||
| 366 | &function_begin("ecp_nistz256_neg"); | ||
| 367 | &mov ("ebp",&wparam(1)); | ||
| 368 | &mov ("edi",&wparam(0)); | ||
| 369 | |||
| 370 | &xor ("eax","eax"); | ||
| 371 | &stack_push(8); | ||
| 372 | &mov (&DWP(0,"esp"),"eax"); | ||
| 373 | &mov ("esi","esp"); | ||
| 374 | &mov (&DWP(4,"esp"),"eax"); | ||
| 375 | &mov (&DWP(8,"esp"),"eax"); | ||
| 376 | &mov (&DWP(12,"esp"),"eax"); | ||
| 377 | &mov (&DWP(16,"esp"),"eax"); | ||
| 378 | &mov (&DWP(20,"esp"),"eax"); | ||
| 379 | &mov (&DWP(24,"esp"),"eax"); | ||
| 380 | &mov (&DWP(28,"esp"),"eax"); | ||
| 381 | |||
| 382 | &call ("_ecp_nistz256_sub"); | ||
| 383 | |||
| 384 | &stack_pop(8); | ||
| 385 | &function_end("ecp_nistz256_neg"); | ||
| 386 | |||
| 387 | &function_begin_B("_picup_eax"); | ||
| 388 | &mov ("eax",&DWP(0,"esp")); | ||
| 389 | &ret (); | ||
| 390 | &function_end_B("_picup_eax"); | ||
| 391 | |||
| 392 | ######################################################################## | ||
| 393 | # void ecp_nistz256_from_mont(BN_ULONG edi[8],const BN_ULONG esi[8]); | ||
| 394 | &function_begin("ecp_nistz256_from_mont"); | ||
| 395 | &mov ("esi",&wparam(1)); | ||
| 396 | &call ("_picup_eax"); | ||
| 397 | &set_label("pic"); | ||
| 398 | &lea ("ebp",&DWP(&label("ONE")."-".&label("pic"),"eax")); | ||
| 399 | if ($sse2) { | ||
| 400 | &picmeup("eax","OPENSSL_ia32cap_P","eax",&label("pic")); | ||
| 401 | &mov ("eax",&DWP(0,"eax")); } | ||
| 402 | &mov ("edi",&wparam(0)); | ||
| 403 | &call ("_ecp_nistz256_mul_mont"); | ||
| 404 | &function_end("ecp_nistz256_from_mont"); | ||
| 405 | |||
| 406 | ######################################################################## | ||
| 407 | # void ecp_nistz256_mul_mont(BN_ULONG edi[8],const BN_ULONG esi[8], | ||
| 408 | # const BN_ULONG ebp[8]); | ||
| 409 | &function_begin("ecp_nistz256_mul_mont"); | ||
| 410 | &mov ("esi",&wparam(1)); | ||
| 411 | &mov ("ebp",&wparam(2)); | ||
| 412 | if ($sse2) { | ||
| 413 | &call ("_picup_eax"); | ||
| 414 | &set_label("pic"); | ||
| 415 | &picmeup("eax","OPENSSL_ia32cap_P","eax",&label("pic")); | ||
| 416 | &mov ("eax",&DWP(0,"eax")); } | ||
| 417 | &mov ("edi",&wparam(0)); | ||
| 418 | &call ("_ecp_nistz256_mul_mont"); | ||
| 419 | &function_end("ecp_nistz256_mul_mont"); | ||
| 420 | |||
| 421 | ######################################################################## | ||
| 422 | # void ecp_nistz256_sqr_mont(BN_ULONG edi[8],const BN_ULONG esi[8]); | ||
| 423 | &function_begin("ecp_nistz256_sqr_mont"); | ||
| 424 | &mov ("esi",&wparam(1)); | ||
| 425 | if ($sse2) { | ||
| 426 | &call ("_picup_eax"); | ||
| 427 | &set_label("pic"); | ||
| 428 | &picmeup("eax","OPENSSL_ia32cap_P","eax",&label("pic")); | ||
| 429 | &mov ("eax",&DWP(0,"eax")); } | ||
| 430 | &mov ("edi",&wparam(0)); | ||
| 431 | &mov ("ebp","esi"); | ||
| 432 | &call ("_ecp_nistz256_mul_mont"); | ||
| 433 | &function_end("ecp_nistz256_sqr_mont"); | ||
| 434 | |||
| 435 | &function_begin_B("_ecp_nistz256_mul_mont"); | ||
| 436 | if ($sse2) { | ||
| 437 | # see if XMM+SSE2 is on | ||
| 438 | &and ("eax","\$(IA32CAP_MASK0_FXSR | IA32CAP_MASK0_SSE2)"); | ||
| 439 | &cmp ("eax","\$(IA32CAP_MASK0_FXSR | IA32CAP_MASK0_SSE2)"); | ||
| 440 | &jne (&label("mul_mont_ialu")); | ||
| 441 | |||
| 442 | ######################################## | ||
| 443 | # SSE2 code path featuring 32x16-bit | ||
| 444 | # multiplications is ~2x faster than | ||
| 445 | # IALU counterpart (except on Atom)... | ||
| 446 | ######################################## | ||
| 447 | # stack layout: | ||
| 448 | # +------------------------------------+< %esp | ||
| 449 | # | 7 16-byte temporary XMM words, | | ||
| 450 | # | "sliding" toward lower address | | ||
| 451 | # . . | ||
| 452 | # +------------------------------------+ | ||
| 453 | # | unused XMM word | | ||
| 454 | # +------------------------------------+< +128,%ebx | ||
| 455 | # | 8 16-byte XMM words holding copies | | ||
| 456 | # | of a[i]<<64|a[i] | | ||
| 457 | # . . | ||
| 458 | # . . | ||
| 459 | # +------------------------------------+< +256 | ||
| 460 | &mov ("edx","esp"); | ||
| 461 | &sub ("esp",0x100); | ||
| 462 | |||
| 463 | &movd ("xmm7",&DWP(0,"ebp")); # b[0] -> 0000.00xy | ||
| 464 | &lea ("ebp",&DWP(4,"ebp")); | ||
| 465 | &pcmpeqd("xmm6","xmm6"); | ||
| 466 | &psrlq ("xmm6",48); # compose 0xffff<<64|0xffff | ||
| 467 | |||
| 468 | &pshuflw("xmm7","xmm7",0b11011100); # 0000.00xy -> 0000.0x0y | ||
| 469 | &and ("esp",-64); | ||
| 470 | &pshufd ("xmm7","xmm7",0b11011100); # 0000.0x0y -> 000x.000y | ||
| 471 | &lea ("ebx",&DWP(0x80,"esp")); | ||
| 472 | |||
| 473 | &movd ("xmm0",&DWP(4*0,"esi")); # a[0] -> 0000.00xy | ||
| 474 | &pshufd ("xmm0","xmm0",0b11001100); # 0000.00xy -> 00xy.00xy | ||
| 475 | &movd ("xmm1",&DWP(4*1,"esi")); # a[1] -> ... | ||
| 476 | &movdqa (&QWP(0x00,"ebx"),"xmm0"); # offload converted a[0] | ||
| 477 | &pmuludq("xmm0","xmm7"); # a[0]*b[0] | ||
| 478 | |||
| 479 | &movd ("xmm2",&DWP(4*2,"esi")); | ||
| 480 | &pshufd ("xmm1","xmm1",0b11001100); | ||
| 481 | &movdqa (&QWP(0x10,"ebx"),"xmm1"); | ||
| 482 | &pmuludq("xmm1","xmm7"); # a[1]*b[0] | ||
| 483 | |||
| 484 | &movq ("xmm4","xmm0"); # clear upper 64 bits | ||
| 485 | &pslldq("xmm4",6); | ||
| 486 | &paddq ("xmm4","xmm0"); | ||
| 487 | &movdqa("xmm5","xmm4"); | ||
| 488 | &psrldq("xmm4",10); # upper 32 bits of a[0]*b[0] | ||
| 489 | &pand ("xmm5","xmm6"); # lower 32 bits of a[0]*b[0] | ||
| 490 | |||
| 491 | # The upper half of a[0]*b[i] is carried into the next multiplication | ||
| 492 | # iteration, while the lower one "participates" in the actual reduction. | ||
| 493 | # Normally the latter is done by accumulating the result of multiplying | ||
| 494 | # the modulus by the "magic" digit, but thanks to the special form of the | ||
| 495 | # modulus and "magic" digit it can be performed with additions and | ||
| 496 | # subtractions alone (see the note in the IALU section below). We are not | ||
| 497 | # bothered with carry bits; they are accumulated in the "flatten" | ||
| 498 | # phase after all multiplications and reductions. | ||
| 499 | |||
| 500 | &movd ("xmm3",&DWP(4*3,"esi")); | ||
| 501 | &pshufd ("xmm2","xmm2",0b11001100); | ||
| 502 | &movdqa (&QWP(0x20,"ebx"),"xmm2"); | ||
| 503 | &pmuludq("xmm2","xmm7"); # a[2]*b[0] | ||
| 504 | &paddq ("xmm1","xmm4"); # a[1]*b[0]+hw(a[0]*b[0]), carry | ||
| 505 | &movdqa (&QWP(0x00,"esp"),"xmm1"); # t[0] | ||
| 506 | |||
| 507 | &movd ("xmm0",&DWP(4*4,"esi")); | ||
| 508 | &pshufd ("xmm3","xmm3",0b11001100); | ||
| 509 | &movdqa (&QWP(0x30,"ebx"),"xmm3"); | ||
| 510 | &pmuludq("xmm3","xmm7"); # a[3]*b[0] | ||
| 511 | &movdqa (&QWP(0x10,"esp"),"xmm2"); | ||
| 512 | |||
| 513 | &movd ("xmm1",&DWP(4*5,"esi")); | ||
| 514 | &pshufd ("xmm0","xmm0",0b11001100); | ||
| 515 | &movdqa (&QWP(0x40,"ebx"),"xmm0"); | ||
| 516 | &pmuludq("xmm0","xmm7"); # a[4]*b[0] | ||
| 517 | &paddq ("xmm3","xmm5"); # a[3]*b[0]+lw(a[0]*b[0]), reduction step | ||
| 518 | &movdqa (&QWP(0x20,"esp"),"xmm3"); | ||
| 519 | |||
| 520 | &movd ("xmm2",&DWP(4*6,"esi")); | ||
| 521 | &pshufd ("xmm1","xmm1",0b11001100); | ||
| 522 | &movdqa (&QWP(0x50,"ebx"),"xmm1"); | ||
| 523 | &pmuludq("xmm1","xmm7"); # a[5]*b[0] | ||
| 524 | &movdqa (&QWP(0x30,"esp"),"xmm0"); | ||
| 525 | &pshufd("xmm4","xmm5",0b10110001); # xmm4 = xmm5<<32, reduction step | ||
| 526 | |||
| 527 | &movd ("xmm3",&DWP(4*7,"esi")); | ||
| 528 | &pshufd ("xmm2","xmm2",0b11001100); | ||
| 529 | &movdqa (&QWP(0x60,"ebx"),"xmm2"); | ||
| 530 | &pmuludq("xmm2","xmm7"); # a[6]*b[0] | ||
| 531 | &movdqa (&QWP(0x40,"esp"),"xmm1"); | ||
| 532 | &psubq ("xmm4","xmm5"); # xmm4 = xmm5*0xffffffff, reduction step | ||
| 533 | |||
| 534 | &movd ("xmm0",&DWP(0,"ebp")); # b[1] -> 0000.00xy | ||
| 535 | &pshufd ("xmm3","xmm3",0b11001100); | ||
| 536 | &movdqa (&QWP(0x70,"ebx"),"xmm3"); | ||
| 537 | &pmuludq("xmm3","xmm7"); # a[7]*b[0] | ||
| 538 | |||
| 539 | &pshuflw("xmm7","xmm0",0b11011100); # 0000.00xy -> 0000.0x0y | ||
| 540 | &movdqa ("xmm0",&QWP(0x00,"ebx")); # pre-load converted a[0] | ||
| 541 | &pshufd ("xmm7","xmm7",0b11011100); # 0000.0x0y -> 000x.000y | ||
| 542 | |||
| 543 | &mov ("ecx",6); | ||
| 544 | &lea ("ebp",&DWP(4,"ebp")); | ||
| 545 | &jmp (&label("madd_sse2")); | ||
| 546 | |||
| 547 | &set_label("madd_sse2",16); | ||
| 548 | &paddq ("xmm2","xmm5"); # a[6]*b[i-1]+lw(a[0]*b[i-1]), reduction step [modulo-scheduled] | ||
| 549 | &paddq ("xmm3","xmm4"); # a[7]*b[i-1]+lw(a[0]*b[i-1])*0xffffffff, reduction step [modulo-scheduled] | ||
| 550 | &movdqa ("xmm1",&QWP(0x10,"ebx")); | ||
| 551 | &pmuludq("xmm0","xmm7"); # a[0]*b[i] | ||
| 552 | &movdqa(&QWP(0x50,"esp"),"xmm2"); | ||
| 553 | |||
| 554 | &movdqa ("xmm2",&QWP(0x20,"ebx")); | ||
| 555 | &pmuludq("xmm1","xmm7"); # a[1]*b[i] | ||
| 556 | &movdqa(&QWP(0x60,"esp"),"xmm3"); | ||
| 557 | &paddq ("xmm0",&QWP(0x00,"esp")); | ||
| 558 | |||
| 559 | &movdqa ("xmm3",&QWP(0x30,"ebx")); | ||
| 560 | &pmuludq("xmm2","xmm7"); # a[2]*b[i] | ||
| 561 | &movq ("xmm4","xmm0"); # clear upper 64 bits | ||
| 562 | &pslldq("xmm4",6); | ||
| 563 | &paddq ("xmm1",&QWP(0x10,"esp")); | ||
| 564 | &paddq ("xmm4","xmm0"); | ||
| 565 | &movdqa("xmm5","xmm4"); | ||
| 566 | &psrldq("xmm4",10); # upper 33 bits of a[0]*b[i]+t[0] | ||
| 567 | |||
| 568 | &movdqa ("xmm0",&QWP(0x40,"ebx")); | ||
| 569 | &pmuludq("xmm3","xmm7"); # a[3]*b[i] | ||
| 570 | &paddq ("xmm1","xmm4"); # a[1]*b[i]+hw(a[0]*b[i]), carry | ||
| 571 | &paddq ("xmm2",&QWP(0x20,"esp")); | ||
| 572 | &movdqa (&QWP(0x00,"esp"),"xmm1"); | ||
| 573 | |||
| 574 | &movdqa ("xmm1",&QWP(0x50,"ebx")); | ||
| 575 | &pmuludq("xmm0","xmm7"); # a[4]*b[i] | ||
| 576 | &paddq ("xmm3",&QWP(0x30,"esp")); | ||
| 577 | &movdqa (&QWP(0x10,"esp"),"xmm2"); | ||
| 578 | &pand ("xmm5","xmm6"); # lower 32 bits of a[0]*b[i] | ||
| 579 | |||
| 580 | &movdqa ("xmm2",&QWP(0x60,"ebx")); | ||
| 581 | &pmuludq("xmm1","xmm7"); # a[5]*b[i] | ||
| 582 | &paddq ("xmm3","xmm5"); # a[3]*b[i]+lw(a[0]*b[i]), reduction step | ||
| 583 | &paddq ("xmm0",&QWP(0x40,"esp")); | ||
| 584 | &movdqa (&QWP(0x20,"esp"),"xmm3"); | ||
| 585 | &pshufd("xmm4","xmm5",0b10110001); # xmm4 = xmm5<<32, reduction step | ||
| 586 | |||
| 587 | &movdqa ("xmm3","xmm7"); | ||
| 588 | &pmuludq("xmm2","xmm7"); # a[6]*b[i] | ||
| 589 | &movd ("xmm7",&DWP(0,"ebp")); # b[i++] -> 0000.00xy | ||
| 590 | &lea ("ebp",&DWP(4,"ebp")); | ||
| 591 | &paddq ("xmm1",&QWP(0x50,"esp")); | ||
| 592 | &psubq ("xmm4","xmm5"); # xmm4 = xmm5*0xffffffff, reduction step | ||
| 593 | &movdqa (&QWP(0x30,"esp"),"xmm0"); | ||
| 594 | &pshuflw("xmm7","xmm7",0b11011100); # 0000.00xy -> 0000.0x0y | ||
| 595 | |||
| 596 | &pmuludq("xmm3",&QWP(0x70,"ebx")); # a[7]*b[i] | ||
| 597 | &pshufd("xmm7","xmm7",0b11011100); # 0000.0x0y -> 000x.000y | ||
| 598 | &movdqa("xmm0",&QWP(0x00,"ebx")); # pre-load converted a[0] | ||
| 599 | &movdqa (&QWP(0x40,"esp"),"xmm1"); | ||
| 600 | &paddq ("xmm2",&QWP(0x60,"esp")); | ||
| 601 | |||
| 602 | &dec ("ecx"); | ||
| 603 | &jnz (&label("madd_sse2")); | ||
| 604 | |||
| 605 | &paddq ("xmm2","xmm5"); # a[6]*b[6]+lw(a[0]*b[6]), reduction step [modulo-scheduled] | ||
| 606 | &paddq ("xmm3","xmm4"); # a[7]*b[6]+lw(a[0]*b[6])*0xffffffff, reduction step [modulo-scheduled] | ||
| 607 | &movdqa ("xmm1",&QWP(0x10,"ebx")); | ||
| 608 | &pmuludq("xmm0","xmm7"); # a[0]*b[7] | ||
| 609 | &movdqa(&QWP(0x50,"esp"),"xmm2"); | ||
| 610 | |||
| 611 | &movdqa ("xmm2",&QWP(0x20,"ebx")); | ||
| 612 | &pmuludq("xmm1","xmm7"); # a[1]*b[7] | ||
| 613 | &movdqa(&QWP(0x60,"esp"),"xmm3"); | ||
| 614 | &paddq ("xmm0",&QWP(0x00,"esp")); | ||
| 615 | |||
| 616 | &movdqa ("xmm3",&QWP(0x30,"ebx")); | ||
| 617 | &pmuludq("xmm2","xmm7"); # a[2]*b[7] | ||
| 618 | &movq ("xmm4","xmm0"); # clear upper 64 bits | ||
| 619 | &pslldq("xmm4",6); | ||
| 620 | &paddq ("xmm1",&QWP(0x10,"esp")); | ||
| 621 | &paddq ("xmm4","xmm0"); | ||
| 622 | &movdqa("xmm5","xmm4"); | ||
| 623 | &psrldq("xmm4",10); # upper 33 bits of a[0]*b[i]+t[0] | ||
| 624 | |||
| 625 | &movdqa ("xmm0",&QWP(0x40,"ebx")); | ||
| 626 | &pmuludq("xmm3","xmm7"); # a[3]*b[7] | ||
| 627 | &paddq ("xmm1","xmm4"); # a[1]*b[7]+hw(a[0]*b[7]), carry | ||
| 628 | &paddq ("xmm2",&QWP(0x20,"esp")); | ||
| 629 | &movdqa (&QWP(0x00,"esp"),"xmm1"); | ||
| 630 | |||
| 631 | &movdqa ("xmm1",&QWP(0x50,"ebx")); | ||
| 632 | &pmuludq("xmm0","xmm7"); # a[4]*b[7] | ||
| 633 | &paddq ("xmm3",&QWP(0x30,"esp")); | ||
| 634 | &movdqa (&QWP(0x10,"esp"),"xmm2"); | ||
| 635 | &pand ("xmm5","xmm6"); # lower 32 bits of a[0]*b[i] | ||
| 636 | |||
| 637 | &movdqa ("xmm2",&QWP(0x60,"ebx")); | ||
| 638 | &pmuludq("xmm1","xmm7"); # a[5]*b[7] | ||
| 639 | &paddq ("xmm3","xmm5"); # reduction step | ||
| 640 | &paddq ("xmm0",&QWP(0x40,"esp")); | ||
| 641 | &movdqa (&QWP(0x20,"esp"),"xmm3"); | ||
| 642 | &pshufd("xmm4","xmm5",0b10110001); # xmm4 = xmm5<<32, reduction step | ||
| 643 | |||
| 644 | &movdqa ("xmm3",&QWP(0x70,"ebx")); | ||
| 645 | &pmuludq("xmm2","xmm7"); # a[6]*b[7] | ||
| 646 | &paddq ("xmm1",&QWP(0x50,"esp")); | ||
| 647 | &psubq ("xmm4","xmm5"); # xmm4 = xmm5*0xffffffff, reduction step | ||
| 648 | &movdqa (&QWP(0x30,"esp"),"xmm0"); | ||
| 649 | |||
| 650 | &pmuludq("xmm3","xmm7"); # a[7]*b[7] | ||
| 651 | &pcmpeqd("xmm7","xmm7"); | ||
| 652 | &movdqa ("xmm0",&QWP(0x00,"esp")); | ||
| 653 | &pslldq ("xmm7",8); | ||
| 654 | &movdqa (&QWP(0x40,"esp"),"xmm1"); | ||
| 655 | &paddq ("xmm2",&QWP(0x60,"esp")); | ||
| 656 | |||
| 657 | &paddq ("xmm2","xmm5"); # a[6]*b[7]+lw(a[0]*b[7]), reduction step | ||
| 658 | &paddq ("xmm3","xmm4"); # a[6]*b[7]+lw(a[0]*b[7])*0xffffffff, reduction step | ||
| 659 | &movdqa(&QWP(0x50,"esp"),"xmm2"); | ||
| 660 | &movdqa(&QWP(0x60,"esp"),"xmm3"); | ||
| 661 | |||
| 662 | &movdqa ("xmm1",&QWP(0x10,"esp")); | ||
| 663 | &movdqa ("xmm2",&QWP(0x20,"esp")); | ||
| 664 | &movdqa ("xmm3",&QWP(0x30,"esp")); | ||
| 665 | |||
| 666 | &movq ("xmm4","xmm0"); # "flatten" | ||
| 667 | &pand ("xmm0","xmm7"); | ||
| 668 | &xor ("ebp","ebp"); | ||
| 669 | &pslldq ("xmm4",6); | ||
| 670 | &movq ("xmm5","xmm1"); | ||
| 671 | &paddq ("xmm0","xmm4"); | ||
| 672 | &pand ("xmm1","xmm7"); | ||
| 673 | &psrldq ("xmm0",6); | ||
| 674 | &movd ("eax","xmm0"); | ||
| 675 | &psrldq ("xmm0",4); | ||
| 676 | |||
| 677 | &paddq ("xmm5","xmm0"); | ||
| 678 | &movdqa ("xmm0",&QWP(0x40,"esp")); | ||
| 679 | &sub ("eax",-1); # start subtracting modulus, | ||
| 680 | # this is used to determine | ||
| 681 | # if result is larger/smaller | ||
| 682 | # than modulus (see below) | ||
| 683 | &pslldq ("xmm5",6); | ||
| 684 | &movq ("xmm4","xmm2"); | ||
| 685 | &paddq ("xmm1","xmm5"); | ||
| 686 | &pand ("xmm2","xmm7"); | ||
| 687 | &psrldq ("xmm1",6); | ||
| 688 | &mov (&DWP(4*0,"edi"),"eax"); | ||
| 689 | &movd ("eax","xmm1"); | ||
| 690 | &psrldq ("xmm1",4); | ||
| 691 | |||
| 692 | &paddq ("xmm4","xmm1"); | ||
| 693 | &movdqa ("xmm1",&QWP(0x50,"esp")); | ||
| 694 | &sbb ("eax",-1); | ||
| 695 | &pslldq ("xmm4",6); | ||
| 696 | &movq ("xmm5","xmm3"); | ||
| 697 | &paddq ("xmm2","xmm4"); | ||
| 698 | &pand ("xmm3","xmm7"); | ||
| 699 | &psrldq ("xmm2",6); | ||
| 700 | &mov (&DWP(4*1,"edi"),"eax"); | ||
| 701 | &movd ("eax","xmm2"); | ||
| 702 | &psrldq ("xmm2",4); | ||
| 703 | |||
| 704 | &paddq ("xmm5","xmm2"); | ||
| 705 | &movdqa ("xmm2",&QWP(0x60,"esp")); | ||
| 706 | &sbb ("eax",-1); | ||
| 707 | &pslldq ("xmm5",6); | ||
| 708 | &movq ("xmm4","xmm0"); | ||
| 709 | &paddq ("xmm3","xmm5"); | ||
| 710 | &pand ("xmm0","xmm7"); | ||
| 711 | &psrldq ("xmm3",6); | ||
| 712 | &mov (&DWP(4*2,"edi"),"eax"); | ||
| 713 | &movd ("eax","xmm3"); | ||
| 714 | &psrldq ("xmm3",4); | ||
| 715 | |||
| 716 | &paddq ("xmm4","xmm3"); | ||
| 717 | &sbb ("eax",0); | ||
| 718 | &pslldq ("xmm4",6); | ||
| 719 | &movq ("xmm5","xmm1"); | ||
| 720 | &paddq ("xmm0","xmm4"); | ||
| 721 | &pand ("xmm1","xmm7"); | ||
| 722 | &psrldq ("xmm0",6); | ||
| 723 | &mov (&DWP(4*3,"edi"),"eax"); | ||
| 724 | &movd ("eax","xmm0"); | ||
| 725 | &psrldq ("xmm0",4); | ||
| 726 | |||
| 727 | &paddq ("xmm5","xmm0"); | ||
| 728 | &sbb ("eax",0); | ||
| 729 | &pslldq ("xmm5",6); | ||
| 730 | &movq ("xmm4","xmm2"); | ||
| 731 | &paddq ("xmm1","xmm5"); | ||
| 732 | &pand ("xmm2","xmm7"); | ||
| 733 | &psrldq ("xmm1",6); | ||
| 734 | &movd ("ebx","xmm1"); | ||
| 735 | &psrldq ("xmm1",4); | ||
| 736 | &mov ("esp","edx"); | ||
| 737 | |||
| 738 | &paddq ("xmm4","xmm1"); | ||
| 739 | &pslldq ("xmm4",6); | ||
| 740 | &paddq ("xmm2","xmm4"); | ||
| 741 | &psrldq ("xmm2",6); | ||
| 742 | &movd ("ecx","xmm2"); | ||
| 743 | &psrldq ("xmm2",4); | ||
| 744 | &sbb ("ebx",0); | ||
| 745 | &movd ("edx","xmm2"); | ||
| 746 | &pextrw ("esi","xmm2",2); # top-most overflow bit | ||
| 747 | &sbb ("ecx",1); | ||
| 748 | &sbb ("edx",-1); | ||
| 749 | &sbb ("esi",0); # borrow from subtraction | ||
| 750 | |||
| 751 | # Final step is "if result > mod, subtract mod", and at this point | ||
| 752 | # we have result - mod written to output buffer, as well as borrow | ||
| 753 | # bit from this subtraction, and if borrow bit is set, we add | ||
| 754 | # modulus back. | ||
| 755 | # | ||
| 756 | # Note that because mod has special form, i.e. consists of | ||
| 757 | # 0xffffffff, 1 and 0s, we can conditionally synthesize it by | ||
| 758 | # assigning borrow bit to one register, %ebp, and its negative | ||
| 759 | # to another, %esi. But we started by calculating %esi... | ||
| 760 | |||
| 761 | &sub ("ebp","esi"); | ||
| 762 | &add (&DWP(4*0,"edi"),"esi"); # add modulus or zero | ||
| 763 | &adc (&DWP(4*1,"edi"),"esi"); | ||
| 764 | &adc (&DWP(4*2,"edi"),"esi"); | ||
| 765 | &adc (&DWP(4*3,"edi"),0); | ||
| 766 | &adc ("eax",0); | ||
| 767 | &adc ("ebx",0); | ||
| 768 | &mov (&DWP(4*4,"edi"),"eax"); | ||
| 769 | &adc ("ecx","ebp"); | ||
| 770 | &mov (&DWP(4*5,"edi"),"ebx"); | ||
| 771 | &adc ("edx","esi"); | ||
| 772 | &mov (&DWP(4*6,"edi"),"ecx"); | ||
| 773 | &mov (&DWP(4*7,"edi"),"edx"); | ||
| 774 | |||
| 775 | &ret (); | ||
| 776 | |||
| 777 | &set_label("mul_mont_ialu",16); } | ||
| 778 | |||
| 779 | ######################################## | ||
| 780 | # IALU code path suitable for all CPUs. | ||
| 781 | ######################################## | ||
| 782 | # stack layout: | ||
| 783 | # +------------------------------------+< %esp | ||
| 784 | # | 8 32-bit temporary words, accessed | | ||
| 785 | # | as circular buffer | | ||
| 786 | # . . | ||
| 787 | # . . | ||
| 788 | # +------------------------------------+< +32 | ||
| 789 | # | offloaded destination pointer | | ||
| 790 | # +------------------------------------+ | ||
| 791 | # | unused | | ||
| 792 | # +------------------------------------+< +40 | ||
| 793 | &sub ("esp",10*4); | ||
| 794 | |||
| 795 | &mov ("eax",&DWP(0*4,"esi")); # a[0] | ||
| 796 | &mov ("ebx",&DWP(0*4,"ebp")); # b[0] | ||
| 797 | &mov (&DWP(8*4,"esp"),"edi"); # off-load dst ptr | ||
| 798 | |||
| 799 | &mul ("ebx"); # a[0]*b[0] | ||
| 800 | &mov (&DWP(0*4,"esp"),"eax"); # t[0] | ||
| 801 | &mov ("eax",&DWP(1*4,"esi")); | ||
| 802 | &mov ("ecx","edx"); | ||
| 803 | |||
| 804 | &mul ("ebx"); # a[1]*b[0] | ||
| 805 | &add ("ecx","eax"); | ||
| 806 | &mov ("eax",&DWP(2*4,"esi")); | ||
| 807 | &adc ("edx",0); | ||
| 808 | &mov (&DWP(1*4,"esp"),"ecx"); # t[1] | ||
| 809 | &mov ("ecx","edx"); | ||
| 810 | |||
| 811 | &mul ("ebx"); # a[2]*b[0] | ||
| 812 | &add ("ecx","eax"); | ||
| 813 | &mov ("eax",&DWP(3*4,"esi")); | ||
| 814 | &adc ("edx",0); | ||
| 815 | &mov (&DWP(2*4,"esp"),"ecx"); # t[2] | ||
| 816 | &mov ("ecx","edx"); | ||
| 817 | |||
| 818 | &mul ("ebx"); # a[3]*b[0] | ||
| 819 | &add ("ecx","eax"); | ||
| 820 | &mov ("eax",&DWP(4*4,"esi")); | ||
| 821 | &adc ("edx",0); | ||
| 822 | &mov (&DWP(3*4,"esp"),"ecx"); # t[3] | ||
| 823 | &mov ("ecx","edx"); | ||
| 824 | |||
| 825 | &mul ("ebx"); # a[4]*b[0] | ||
| 826 | &add ("ecx","eax"); | ||
| 827 | &mov ("eax",&DWP(5*4,"esi")); | ||
| 828 | &adc ("edx",0); | ||
| 829 | &mov (&DWP(4*4,"esp"),"ecx"); # t[4] | ||
| 830 | &mov ("ecx","edx"); | ||
| 831 | |||
| 832 | &mul ("ebx"); # a[5]*b[0] | ||
| 833 | &add ("ecx","eax"); | ||
| 834 | &mov ("eax",&DWP(6*4,"esi")); | ||
| 835 | &adc ("edx",0); | ||
| 836 | &mov (&DWP(5*4,"esp"),"ecx"); # t[5] | ||
| 837 | &mov ("ecx","edx"); | ||
| 838 | |||
| 839 | &mul ("ebx"); # a[6]*b[0] | ||
| 840 | &add ("ecx","eax"); | ||
| 841 | &mov ("eax",&DWP(7*4,"esi")); | ||
| 842 | &adc ("edx",0); | ||
| 843 | &mov (&DWP(6*4,"esp"),"ecx"); # t[6] | ||
| 844 | &mov ("ecx","edx"); | ||
| 845 | |||
| 846 | &xor ("edi","edi"); # initial top-most carry | ||
| 847 | &mul ("ebx"); # a[7]*b[0] | ||
| 848 | &add ("ecx","eax"); # t[7] | ||
| 849 | &mov ("eax",&DWP(0*4,"esp")); # t[0] | ||
| 850 | &adc ("edx",0); # t[8] | ||
| 851 | |||
| 852 | for ($i=0;$i<7;$i++) { | ||
| 853 | my $j=$i+1; | ||
| 854 | |||
| 855 | # Reduction iteration is normally performed by accumulating | ||
| 856 | # result of multiplication of modulus by "magic" digit [and | ||
| 857 | # omitting least significant word, which is guaranteed to | ||
| 858 | # be 0], but thanks to special form of modulus and "magic" | ||
| 859 | # digit being equal to least significant word, it can be | ||
| 860 | # performed with additions and subtractions alone. Indeed: | ||
| 861 | # | ||
| 862 | # ffff.0001.0000.0000.0000.ffff.ffff.ffff | ||
| 863 | # * abcd | ||
| 864 | # + xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd | ||
| 865 | # | ||
| 866 | # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we | ||
| 867 | # rewrite above as: | ||
| 868 | # | ||
| 869 | # xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd | ||
| 870 | # + abcd.0000.abcd.0000.0000.abcd.0000.0000.0000 | ||
| 871 | # - abcd.0000.0000.0000.0000.0000.0000.abcd | ||
| 872 | # | ||
| 873 | # or marking redundant operations: | ||
| 874 | # | ||
| 875 | # xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.---- | ||
| 876 | # + abcd.0000.abcd.0000.0000.abcd.----.----.---- | ||
| 877 | # - abcd.----.----.----.----.----.----.---- | ||
| 878 | |||
| 879 | &add (&DWP((($i+3)%8)*4,"esp"),"eax"); # t[3]+=t[0] | ||
| 880 | &adc (&DWP((($i+4)%8)*4,"esp"),0); # t[4]+=0 | ||
| 881 | &adc (&DWP((($i+5)%8)*4,"esp"),0); # t[5]+=0 | ||
| 882 | &adc (&DWP((($i+6)%8)*4,"esp"),"eax"); # t[6]+=t[0] | ||
| 883 | &adc ("ecx",0); # t[7]+=0 | ||
| 884 | &adc ("edx","eax"); # t[8]+=t[0] | ||
| 885 | &adc ("edi",0); # top-most carry | ||
| 886 | &mov ("ebx",&DWP($j*4,"ebp")); # b[i] | ||
| 887 | &sub ("ecx","eax"); # t[7]-=t[0] | ||
| 888 | &mov ("eax",&DWP(0*4,"esi")); # a[0] | ||
| 889 | &sbb ("edx",0); # t[8]-=0 | ||
| 890 | &mov (&DWP((($i+7)%8)*4,"esp"),"ecx"); | ||
| 891 | &sbb ("edi",0); # top-most carry, | ||
| 892 | # keep in mind that | ||
| 893 | # net result is | ||
| 894 | # *addition* of value | ||
| 895 | # with (abcd<<32)-abcd | ||
| 896 | # on top, so that | ||
| 897 | # underflow is | ||
| 898 | # impossible, because | ||
| 899 | # (abcd<<32)-abcd | ||
| 900 | # doesn't underflow | ||
| 901 | &mov (&DWP((($i+8)%8)*4,"esp"),"edx"); | ||
| 902 | |||
| 903 | &mul ("ebx"); # a[0]*b[i] | ||
| 904 | &add ("eax",&DWP((($j+0)%8)*4,"esp")); | ||
| 905 | &adc ("edx",0); | ||
| 906 | &mov (&DWP((($j+0)%8)*4,"esp"),"eax"); | ||
| 907 | &mov ("eax",&DWP(1*4,"esi")); | ||
| 908 | &mov ("ecx","edx"); | ||
| 909 | |||
| 910 | &mul ("ebx"); # a[1]*b[i] | ||
| 911 | &add ("ecx",&DWP((($j+1)%8)*4,"esp")); | ||
| 912 | &adc ("edx",0); | ||
| 913 | &add ("ecx","eax"); | ||
| 914 | &adc ("edx",0); | ||
| 915 | &mov ("eax",&DWP(2*4,"esi")); | ||
| 916 | &mov (&DWP((($j+1)%8)*4,"esp"),"ecx"); | ||
| 917 | &mov ("ecx","edx"); | ||
| 918 | |||
| 919 | &mul ("ebx"); # a[2]*b[i] | ||
| 920 | &add ("ecx",&DWP((($j+2)%8)*4,"esp")); | ||
| 921 | &adc ("edx",0); | ||
| 922 | &add ("ecx","eax"); | ||
| 923 | &adc ("edx",0); | ||
| 924 | &mov ("eax",&DWP(3*4,"esi")); | ||
| 925 | &mov (&DWP((($j+2)%8)*4,"esp"),"ecx"); | ||
| 926 | &mov ("ecx","edx"); | ||
| 927 | |||
| 928 | &mul ("ebx"); # a[3]*b[i] | ||
| 929 | &add ("ecx",&DWP((($j+3)%8)*4,"esp")); | ||
| 930 | &adc ("edx",0); | ||
| 931 | &add ("ecx","eax"); | ||
| 932 | &adc ("edx",0); | ||
| 933 | &mov ("eax",&DWP(4*4,"esi")); | ||
| 934 | &mov (&DWP((($j+3)%8)*4,"esp"),"ecx"); | ||
| 935 | &mov ("ecx","edx"); | ||
| 936 | |||
| 937 | &mul ("ebx"); # a[4]*b[i] | ||
| 938 | &add ("ecx",&DWP((($j+4)%8)*4,"esp")); | ||
| 939 | &adc ("edx",0); | ||
| 940 | &add ("ecx","eax"); | ||
| 941 | &adc ("edx",0); | ||
| 942 | &mov ("eax",&DWP(5*4,"esi")); | ||
| 943 | &mov (&DWP((($j+4)%8)*4,"esp"),"ecx"); | ||
| 944 | &mov ("ecx","edx"); | ||
| 945 | |||
| 946 | &mul ("ebx"); # a[5]*b[i] | ||
| 947 | &add ("ecx",&DWP((($j+5)%8)*4,"esp")); | ||
| 948 | &adc ("edx",0); | ||
| 949 | &add ("ecx","eax"); | ||
| 950 | &adc ("edx",0); | ||
| 951 | &mov ("eax",&DWP(6*4,"esi")); | ||
| 952 | &mov (&DWP((($j+5)%8)*4,"esp"),"ecx"); | ||
| 953 | &mov ("ecx","edx"); | ||
| 954 | |||
| 955 | &mul ("ebx"); # a[6]*b[i] | ||
| 956 | &add ("ecx",&DWP((($j+6)%8)*4,"esp")); | ||
| 957 | &adc ("edx",0); | ||
| 958 | &add ("ecx","eax"); | ||
| 959 | &adc ("edx",0); | ||
| 960 | &mov ("eax",&DWP(7*4,"esi")); | ||
| 961 | &mov (&DWP((($j+6)%8)*4,"esp"),"ecx"); | ||
| 962 | &mov ("ecx","edx"); | ||
| 963 | |||
| 964 | &mul ("ebx"); # a[7]*b[i] | ||
| 965 | &add ("ecx",&DWP((($j+7)%8)*4,"esp")); | ||
| 966 | &adc ("edx",0); | ||
| 967 | &add ("ecx","eax"); # t[7] | ||
| 968 | &mov ("eax",&DWP((($j+0)%8)*4,"esp")); # t[0] | ||
| 969 | &adc ("edx","edi"); # t[8] | ||
| 970 | &mov ("edi",0); | ||
| 971 | &adc ("edi",0); # top-most carry | ||
| 972 | } | ||
| 973 | &mov ("ebp",&DWP(8*4,"esp")); # restore dst ptr | ||
| 974 | &xor ("esi","esi"); | ||
| 975 | my $j=$i+1; | ||
| 976 | |||
| 977 | # last multiplication-less reduction | ||
| 978 | &add (&DWP((($i+3)%8)*4,"esp"),"eax"); # t[3]+=t[0] | ||
| 979 | &adc (&DWP((($i+4)%8)*4,"esp"),0); # t[4]+=0 | ||
| 980 | &adc (&DWP((($i+5)%8)*4,"esp"),0); # t[5]+=0 | ||
| 981 | &adc (&DWP((($i+6)%8)*4,"esp"),"eax"); # t[6]+=t[0] | ||
| 982 | &adc ("ecx",0); # t[7]+=0 | ||
| 983 | &adc ("edx","eax"); # t[8]+=t[0] | ||
| 984 | &adc ("edi",0); # top-most carry | ||
| 985 | &mov ("ebx",&DWP((($j+1)%8)*4,"esp")); | ||
| 986 | &sub ("ecx","eax"); # t[7]-=t[0] | ||
| 987 | &mov ("eax",&DWP((($j+0)%8)*4,"esp")); | ||
| 988 | &sbb ("edx",0); # t[8]-=0 | ||
| 989 | &mov (&DWP((($i+7)%8)*4,"esp"),"ecx"); | ||
| 990 | &sbb ("edi",0); # top-most carry | ||
| 991 | &mov (&DWP((($i+8)%8)*4,"esp"),"edx"); | ||
| 992 | |||
| 993 | # Final step is "if result > mod, subtract mod", but we do it | ||
| 994 | # "other way around", namely write result - mod to output buffer | ||
| 995 | # and if subtraction borrowed, add modulus back. | ||
| 996 | |||
| 997 | &mov ("ecx",&DWP((($j+2)%8)*4,"esp")); | ||
| 998 | &sub ("eax",-1); | ||
| 999 | &mov ("edx",&DWP((($j+3)%8)*4,"esp")); | ||
| 1000 | &sbb ("ebx",-1); | ||
| 1001 | &mov (&DWP(0*4,"ebp"),"eax"); | ||
| 1002 | &sbb ("ecx",-1); | ||
| 1003 | &mov (&DWP(1*4,"ebp"),"ebx"); | ||
| 1004 | &sbb ("edx",0); | ||
| 1005 | &mov (&DWP(2*4,"ebp"),"ecx"); | ||
| 1006 | &mov (&DWP(3*4,"ebp"),"edx"); | ||
| 1007 | |||
| 1008 | &mov ("eax",&DWP((($j+4)%8)*4,"esp")); | ||
| 1009 | &mov ("ebx",&DWP((($j+5)%8)*4,"esp")); | ||
| 1010 | &mov ("ecx",&DWP((($j+6)%8)*4,"esp")); | ||
| 1011 | &sbb ("eax",0); | ||
| 1012 | &mov ("edx",&DWP((($j+7)%8)*4,"esp")); | ||
| 1013 | &sbb ("ebx",0); | ||
| 1014 | &sbb ("ecx",1); | ||
| 1015 | &sbb ("edx",-1); | ||
| 1016 | &sbb ("edi",0); | ||
| 1017 | |||
| 1018 | # Note that because mod has special form, i.e. consists of | ||
| 1019 | # 0xffffffff, 1 and 0s, we can conditionally synthesize it by | ||
| 1020 | # assigning borrow bit to one register, %ebp, and its negative | ||
| 1021 | # to another, %esi. But we started by calculating %esi... | ||
| 1022 | |||
| 1023 | &sub ("esi","edi"); | ||
| 1024 | &add (&DWP(0*4,"ebp"),"edi"); # add modulus or zero | ||
| 1025 | &adc (&DWP(1*4,"ebp"),"edi"); | ||
| 1026 | &adc (&DWP(2*4,"ebp"),"edi"); | ||
| 1027 | &adc (&DWP(3*4,"ebp"),0); | ||
| 1028 | &adc ("eax",0); | ||
| 1029 | &adc ("ebx",0); | ||
| 1030 | &mov (&DWP(4*4,"ebp"),"eax"); | ||
| 1031 | &adc ("ecx","esi"); | ||
| 1032 | &mov (&DWP(5*4,"ebp"),"ebx"); | ||
| 1033 | &adc ("edx","edi"); | ||
| 1034 | &mov (&DWP(6*4,"ebp"),"ecx"); | ||
| 1035 | &mov ("edi","ebp"); # fulfill contract | ||
| 1036 | &mov (&DWP(7*4,"ebp"),"edx"); | ||
| 1037 | |||
| 1038 | &add ("esp",10*4); | ||
| 1039 | &ret (); | ||
| 1040 | &function_end_B("_ecp_nistz256_mul_mont"); | ||
| 1041 | |||
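The word diagrams in the reduction comment above rest on the identity m*p = m*2^256 - m*2^224 + m*2^192 + m*2^96 - m, which is why each reduction step needs only the adds and subs t[3] += m, t[6] += m, t[8] += m, t[7] -= m (the low word t[0] - m cancels to zero by the choice of the magic digit, since -p^-1 mod 2^32 = 1). A minimal Math::BigInt check of the identity:

```perl
#!/usr/bin/env perl
# Check the word-level reduction identity used by _ecp_nistz256_mul_mont:
# for any 32-bit m,  m*p == (m<<256) - (m<<224) + (m<<192) + (m<<96) - m.
use strict;
use warnings;
use Math::BigInt;

my $p = Math::BigInt->new(
    "0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff");
my $m = Math::BigInt->new("0xabcd1234");    # arbitrary "magic" digit

my $lhs = $m->copy->bmul($p);
my $rhs = $m->copy->blsft(256)
        ->bsub($m->copy->blsft(224))
        ->badd($m->copy->blsft(192))
        ->badd($m->copy->blsft(96))
        ->bsub($m);
print $lhs == $rhs ? "ok\n" : "MISMATCH\n";
```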
| 1042 | ######################################################################## | ||
| 1043 | # void ecp_nistz256_select_w5(P256_POINT *edi,const void *esi, | ||
| 1044 | # int ebp); | ||
| 1045 | &function_begin("ecp_nistz256_select_w5"); | ||
| 1046 | &mov ("esi",&wparam(1)); | ||
| 1047 | &mov ("ebp",&wparam(2)); | ||
| 1048 | |||
| 1049 | &lea ("esi",&DWP(0,"esi","ebp",4)); | ||
| 1050 | &neg ("ebp"); | ||
| 1051 | &sar ("ebp",31); | ||
| 1052 | &mov ("edi",&wparam(0)); | ||
| 1053 | &lea ("esi",&DWP(0,"esi","ebp",4)); | ||
| 1054 | |||
| 1055 | for($i=0;$i<24;$i+=4) { | ||
| 1056 | &mov ("eax",&DWP(64*($i+0),"esi")); | ||
| 1057 | &mov ("ebx",&DWP(64*($i+1),"esi")); | ||
| 1058 | &mov ("ecx",&DWP(64*($i+2),"esi")); | ||
| 1059 | &mov ("edx",&DWP(64*($i+3),"esi")); | ||
| 1060 | &and ("eax","ebp"); | ||
| 1061 | &and ("ebx","ebp"); | ||
| 1062 | &and ("ecx","ebp"); | ||
| 1063 | &and ("edx","ebp"); | ||
| 1064 | &mov (&DWP(4*($i+0),"edi"),"eax"); | ||
| 1065 | &mov (&DWP(4*($i+1),"edi"),"ebx"); | ||
| 1066 | &mov (&DWP(4*($i+2),"edi"),"ecx"); | ||
| 1067 | &mov (&DWP(4*($i+3),"edi"),"edx"); | ||
| 1068 | } | ||
| 1069 | &function_end("ecp_nistz256_select_w5"); | ||
| 1070 | |||
| 1071 | ######################################################################## | ||
| 1072 | # void ecp_nistz256_select_w7(P256_POINT_AFFINE *edi,const void *esi, | ||
| 1073 | # int ebp); | ||
| 1074 | &function_begin("ecp_nistz256_select_w7"); | ||
| 1075 | &mov ("esi",&wparam(1)); | ||
| 1076 | &mov ("ebp",&wparam(2)); | ||
| 1077 | |||
| 1078 | &add ("esi","ebp"); | ||
| 1079 | &neg ("ebp"); | ||
| 1080 | &sar ("ebp",31); | ||
| 1081 | &mov ("edi",&wparam(0)); | ||
| 1082 | &lea ("esi",&DWP(0,"esi","ebp")); | ||
| 1083 | |||
| 1084 | for($i=0;$i<64;$i+=4) { | ||
| 1085 | &movz ("eax",&BP(64*($i+0),"esi")); | ||
| 1086 | &movz ("ebx",&BP(64*($i+1),"esi")); | ||
| 1087 | &movz ("ecx",&BP(64*($i+2),"esi")); | ||
| 1088 | &and ("eax","ebp"); | ||
| 1089 | &movz ("edx",&BP(64*($i+3),"esi")); | ||
| 1090 | &and ("ebx","ebp"); | ||
| 1091 | &mov (&BP($i+0,"edi"),"al"); | ||
| 1092 | &and ("ecx","ebp"); | ||
| 1093 | &mov (&BP($i+1,"edi"),"bl"); | ||
| 1094 | &and ("edx","ebp"); | ||
| 1095 | &mov (&BP($i+2,"edi"),"cl"); | ||
| 1096 | &mov (&BP($i+3,"edi"),"dl"); | ||
| 1097 | } | ||
| 1098 | &function_end("ecp_nistz256_select_w7"); | ||
| 1099 | |||
| 1100 | ######################################################################## | ||
| 1101 | # the following subroutines are a "literal" implementation of those found | ||
| 1102 | # ecp_nistz256.c | ||
| 1103 | # | ||
| 1104 | ######################################################################## | ||
| 1105 | # void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp); | ||
| 1106 | # | ||
| 1107 | &static_label("point_double_shortcut"); | ||
| 1108 | &function_begin("ecp_nistz256_point_double"); | ||
| 1109 | { my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4)); | ||
| 1110 | |||
| 1111 | &mov ("esi",&wparam(1)); | ||
| 1112 | |||
| 1113 | # above map() describes stack layout with 5 temporary | ||
| 1114 | # 256-bit vectors on top, then we take extra word for | ||
| 1115 | # OPENSSL_ia32cap_P copy. | ||
| 1116 | &stack_push(8*5+1); | ||
| 1117 | if ($sse2) { | ||
| 1118 | &call ("_picup_eax"); | ||
| 1119 | &set_label("pic"); | ||
| 1120 | &picmeup("edx","OPENSSL_ia32cap_P","eax",&label("pic")); | ||
| 1121 | &mov ("ebp",&DWP(0,"edx")); } | ||
| 1122 | |||
| 1123 | &set_label("point_double_shortcut"); | ||
| 1124 | &mov ("eax",&DWP(0,"esi")); # copy in_x | ||
| 1125 | &mov ("ebx",&DWP(4,"esi")); | ||
| 1126 | &mov ("ecx",&DWP(8,"esi")); | ||
| 1127 | &mov ("edx",&DWP(12,"esi")); | ||
| 1128 | &mov (&DWP($in_x+0,"esp"),"eax"); | ||
| 1129 | &mov (&DWP($in_x+4,"esp"),"ebx"); | ||
| 1130 | &mov (&DWP($in_x+8,"esp"),"ecx"); | ||
| 1131 | &mov (&DWP($in_x+12,"esp"),"edx"); | ||
| 1132 | &mov ("eax",&DWP(16,"esi")); | ||
| 1133 | &mov ("ebx",&DWP(20,"esi")); | ||
| 1134 | &mov ("ecx",&DWP(24,"esi")); | ||
| 1135 | &mov ("edx",&DWP(28,"esi")); | ||
| 1136 | &mov (&DWP($in_x+16,"esp"),"eax"); | ||
| 1137 | &mov (&DWP($in_x+20,"esp"),"ebx"); | ||
| 1138 | &mov (&DWP($in_x+24,"esp"),"ecx"); | ||
| 1139 | &mov (&DWP($in_x+28,"esp"),"edx"); | ||
| 1140 | &mov (&DWP(32*5,"esp"),"ebp"); # OPENSSL_ia32cap_P copy | ||
| 1141 | |||
| 1142 | &lea ("ebp",&DWP(32,"esi")); | ||
| 1143 | &lea ("esi",&DWP(32,"esi")); | ||
| 1144 | &lea ("edi",&DWP($S,"esp")); | ||
| 1145 | &call ("_ecp_nistz256_add"); # p256_mul_by_2(S, in_y); | ||
| 1146 | |||
| 1147 | &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1148 | &mov ("esi",64); | ||
| 1149 | &add ("esi",&wparam(1)); | ||
| 1150 | &lea ("edi",&DWP($Zsqr,"esp")); | ||
| 1151 | &mov ("ebp","esi"); | ||
| 1152 | &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Zsqr, in_z); | ||
| 1153 | |||
| 1154 | &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1155 | &lea ("esi",&DWP($S,"esp")); | ||
| 1156 | &lea ("ebp",&DWP($S,"esp")); | ||
| 1157 | &lea ("edi",&DWP($S,"esp")); | ||
| 1158 | &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(S, S); | ||
| 1159 | |||
| 1160 | &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1161 | &mov ("ebp",&wparam(1)); | ||
| 1162 | &lea ("esi",&DWP(32,"ebp")); | ||
| 1163 | &lea ("ebp",&DWP(64,"ebp")); | ||
| 1164 | &lea ("edi",&DWP($tmp0,"esp")); | ||
| 1165 | &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(tmp0, in_z, in_y); | ||
| 1166 | |||
| 1167 | &lea ("esi",&DWP($in_x,"esp")); | ||
| 1168 | &lea ("ebp",&DWP($Zsqr,"esp")); | ||
| 1169 | &lea ("edi",&DWP($M,"esp")); | ||
| 1170 | &call ("_ecp_nistz256_add"); # p256_add(M, in_x, Zsqr); | ||
| 1171 | |||
| 1172 | &mov ("edi",64); | ||
| 1173 | &lea ("esi",&DWP($tmp0,"esp")); | ||
| 1174 | &lea ("ebp",&DWP($tmp0,"esp")); | ||
| 1175 | &add ("edi",&wparam(0)); | ||
| 1176 | &call ("_ecp_nistz256_add"); # p256_mul_by_2(res_z, tmp0); | ||
| 1177 | |||
| 1178 | &lea ("esi",&DWP($in_x,"esp")); | ||
| 1179 | &lea ("ebp",&DWP($Zsqr,"esp")); | ||
| 1180 | &lea ("edi",&DWP($Zsqr,"esp")); | ||
| 1181 | &call ("_ecp_nistz256_sub"); # p256_sub(Zsqr, in_x, Zsqr); | ||
| 1182 | |||
| 1183 | &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1184 | &lea ("esi",&DWP($S,"esp")); | ||
| 1185 | &lea ("ebp",&DWP($S,"esp")); | ||
| 1186 | &lea ("edi",&DWP($tmp0,"esp")); | ||
| 1187 | &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(tmp0, S); | ||
| 1188 | |||
| 1189 | &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1190 | &lea ("esi",&DWP($M,"esp")); | ||
| 1191 | &lea ("ebp",&DWP($Zsqr,"esp")); | ||
| 1192 | &lea ("edi",&DWP($M,"esp")); | ||
| 1193 | &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(M, M, Zsqr); | ||
| 1194 | |||
| 1195 | &mov ("edi",32); | ||
| 1196 | &lea ("esi",&DWP($tmp0,"esp")); | ||
| 1197 | &add ("edi",&wparam(0)); | ||
| 1198 | &call ("_ecp_nistz256_div_by_2"); # p256_div_by_2(res_y, tmp0); | ||
| 1199 | |||
| 1200 | &lea ("esi",&DWP($M,"esp")); | ||
| 1201 | &lea ("ebp",&DWP($M,"esp")); | ||
| 1202 | &lea ("edi",&DWP($tmp0,"esp")); | ||
| 1203 | &call ("_ecp_nistz256_add"); # 1/2 p256_mul_by_3(M, M); | ||
| 1204 | |||
| 1205 | &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1206 | &lea ("esi",&DWP($in_x,"esp")); | ||
| 1207 | &lea ("ebp",&DWP($S,"esp")); | ||
| 1208 | &lea ("edi",&DWP($S,"esp")); | ||
| 1209 | &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S, S, in_x); | ||
| 1210 | |||
| 1211 | &lea ("esi",&DWP($tmp0,"esp")); | ||
| 1212 | &lea ("ebp",&DWP($M,"esp")); | ||
| 1213 | &lea ("edi",&DWP($M,"esp")); | ||
| 1214 | &call ("_ecp_nistz256_add"); # 2/2 p256_mul_by_3(M, M); | ||
| 1215 | |||
| 1216 | &lea ("esi",&DWP($S,"esp")); | ||
| 1217 | &lea ("ebp",&DWP($S,"esp")); | ||
| 1218 | &lea ("edi",&DWP($tmp0,"esp")); | ||
| 1219 | &call ("_ecp_nistz256_add"); # p256_mul_by_2(tmp0, S); | ||
| 1220 | |||
| 1221 | &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1222 | &lea ("esi",&DWP($M,"esp")); | ||
| 1223 | &lea ("ebp",&DWP($M,"esp")); | ||
| 1224 | &mov ("edi",&wparam(0)); | ||
| 1225 | &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(res_x, M); | ||
| 1226 | |||
| 1227 | &mov ("esi","edi"); # %edi is still res_x here | ||
| 1228 | &lea ("ebp",&DWP($tmp0,"esp")); | ||
| 1229 | &call ("_ecp_nistz256_sub"); # p256_sub(res_x, res_x, tmp0); | ||
| 1230 | |||
| 1231 | &lea ("esi",&DWP($S,"esp")); | ||
| 1232 | &mov ("ebp","edi"); # %edi is still res_x | ||
| 1233 | &lea ("edi",&DWP($S,"esp")); | ||
| 1234 | &call ("_ecp_nistz256_sub"); # p256_sub(S, S, res_x); | ||
| 1235 | |||
| 1236 | &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1237 | &mov ("esi","edi"); # %edi is still &S | ||
| 1238 | &lea ("ebp",&DWP($M,"esp")); | ||
| 1239 | &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S, S, M); | ||
| 1240 | |||
| 1241 | &mov ("ebp",32); | ||
| 1242 | &lea ("esi",&DWP($S,"esp")); | ||
| 1243 | &add ("ebp",&wparam(0)); | ||
| 1244 | &mov ("edi","ebp"); | ||
| 1245 | &call ("_ecp_nistz256_sub"); # p256_sub(res_y, S, res_y); | ||
| 1246 | |||
| 1247 | &stack_pop(8*5+1); | ||
| 1248 | } &function_end("ecp_nistz256_point_double"); | ||
| 1249 | |||
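The call sequence above is the standard Jacobian doubling: M = 3(X + Z^2)(X - Z^2), S = 4XY^2, X' = M^2 - 2S, Y' = M(S - X') - 8Y^4, Z' = 2YZ. As a minimal illustrative sketch (not part of the removed module), the same computation in plain Math::BigInt arithmetic, with hypothetical `mulm`/`addm`/`subm`/`$half` helpers standing in for the Montgomery-domain `_ecp_nistz256_*` primitives:

```perl
#!/usr/bin/env perl
# Illustrative sketch only -- not part of the removed module.
use strict;
use warnings;
use Math::BigInt;

my $p = Math::BigInt->from_hex(     # the P-256 prime
    "ffffffff00000001000000000000000000000000ffffffffffffffffffffffff");
my $half = Math::BigInt->new(2)->bmodinv($p);   # plays _ecp_nistz256_div_by_2

sub mulm { ($_[0] * $_[1]) % $p }   # plays _ecp_nistz256_mul_mont
sub addm { ($_[0] + $_[1]) % $p }   # plays _ecp_nistz256_add
sub subm { ($_[0] - $_[1]) % $p }   # plays _ecp_nistz256_sub

sub point_double {
    my ($X, $Y, $Z) = @_;                              # Jacobian coordinates
    my $S    = mulm(addm($Y, $Y), addm($Y, $Y));       # S = (2Y)^2 = 4Y^2
    my $Zsqr = mulm($Z, $Z);                           # Zsqr = Z^2
    my $Zout = addm(mulm($Z, $Y), mulm($Z, $Y));       # res_z = 2YZ
    my $M    = mulm(addm($X, $Zsqr), subm($X, $Zsqr)); # (X+Z^2)(X-Z^2)
    $M = addm(addm($M, $M), $M);                       # M = 3(X^2 - Z^4)
    my $Y8 = mulm(mulm($S, $S), $half);                # res_y = S^2/2 = 8Y^4
    $S = mulm($S, $X);                                 # S = 4XY^2
    my $Xout = subm(mulm($M, $M), addm($S, $S));       # res_x = M^2 - 2S
    my $Yout = subm(mulm($M, subm($S, $Xout)), $Y8);   # M(S - res_x) - 8Y^4
    return ($Xout, $Yout, $Zout);
}
```

Working modulo p rather than in Montgomery representation changes nothing here: every intermediate carries the same single factor of R, so the group operation is identical.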
| 1250 | ######################################################################## | ||
| 1251 | # void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1, | ||
| 1252 | # const P256_POINT *in2); | ||
| 1253 | &function_begin("ecp_nistz256_point_add"); | ||
| 1254 | { my ($res_x,$res_y,$res_z, | ||
| 1255 | $in1_x,$in1_y,$in1_z, | ||
| 1256 | $in2_x,$in2_y,$in2_z, | ||
| 1257 | $H,$Hsqr,$R,$Rsqr,$Hcub, | ||
| 1258 | $U1,$U2,$S1,$S2)=map(32*$_,(0..17)); | ||
| 1259 | my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); | ||
| 1260 | |||
| 1261 | &mov ("esi",&wparam(2)); | ||
| 1262 | |||
| 1263 | # above map() describes stack layout with 18 temporary | ||
| 1264 | # 256-bit vectors on top, then we take extra words for | ||
| 1265 | # !in1infty, !in2infty, result of check for zero and | ||
| 1266 | # OPENSSL_ia32cap_P copy. [one unused word for padding] | ||
| 1267 | &stack_push(8*18+5); | ||
| 1268 | if ($sse2) { | ||
| 1269 | &call ("_picup_eax"); | ||
| 1270 | &set_label("pic"); | ||
| 1271 | &picmeup("edx","OPENSSL_ia32cap_P","eax",&label("pic")); | ||
| 1272 | &mov ("ebp",&DWP(0,"edx")); } | ||
| 1273 | |||
| 1274 | &lea ("edi",&DWP($in2_x,"esp")); | ||
| 1275 | for($i=0;$i<96;$i+=16) { | ||
| 1276 | &mov ("eax",&DWP($i+0,"esi")); # copy in2 | ||
| 1277 | &mov ("ebx",&DWP($i+4,"esi")); | ||
| 1278 | &mov ("ecx",&DWP($i+8,"esi")); | ||
| 1279 | &mov ("edx",&DWP($i+12,"esi")); | ||
| 1280 | &mov (&DWP($i+0,"edi"),"eax"); | ||
| 1281 | &mov (&DWP(32*18+12,"esp"),"ebp") if ($i==0); | ||
| 1282 | &mov ("ebp","eax") if ($i==64); | ||
| 1283 | &or ("ebp","eax") if ($i>64); | ||
| 1284 | &mov (&DWP($i+4,"edi"),"ebx"); | ||
| 1285 | &or ("ebp","ebx") if ($i>=64); | ||
| 1286 | &mov (&DWP($i+8,"edi"),"ecx"); | ||
| 1287 | &or ("ebp","ecx") if ($i>=64); | ||
| 1288 | &mov (&DWP($i+12,"edi"),"edx"); | ||
| 1289 | &or ("ebp","edx") if ($i>=64); | ||
| 1290 | } | ||
| 1291 | &xor ("eax","eax"); | ||
| 1292 | &mov ("esi",&wparam(1)); | ||
| 1293 | &sub ("eax","ebp"); | ||
| 1294 | &or ("ebp","eax"); | ||
| 1295 | &sar ("ebp",31); | ||
| 1296 | &mov (&DWP(32*18+4,"esp"),"ebp"); # !in2infty | ||
| 1297 | |||
| 1298 | &lea ("edi",&DWP($in1_x,"esp")); | ||
| 1299 | for($i=0;$i<96;$i+=16) { | ||
| 1300 | &mov ("eax",&DWP($i+0,"esi")); # copy in1 | ||
| 1301 | &mov ("ebx",&DWP($i+4,"esi")); | ||
| 1302 | &mov ("ecx",&DWP($i+8,"esi")); | ||
| 1303 | &mov ("edx",&DWP($i+12,"esi")); | ||
| 1304 | &mov (&DWP($i+0,"edi"),"eax"); | ||
| 1305 | &mov ("ebp","eax") if ($i==64); | ||
| 1306 | &or ("ebp","eax") if ($i>64); | ||
| 1307 | &mov (&DWP($i+4,"edi"),"ebx"); | ||
| 1308 | &or ("ebp","ebx") if ($i>=64); | ||
| 1309 | &mov (&DWP($i+8,"edi"),"ecx"); | ||
| 1310 | &or ("ebp","ecx") if ($i>=64); | ||
| 1311 | &mov (&DWP($i+12,"edi"),"edx"); | ||
| 1312 | &or ("ebp","edx") if ($i>=64); | ||
| 1313 | } | ||
| 1314 | &xor ("eax","eax"); | ||
| 1315 | &sub ("eax","ebp"); | ||
| 1316 | &or ("ebp","eax"); | ||
| 1317 | &sar ("ebp",31); | ||
| 1318 | &mov (&DWP(32*18+0,"esp"),"ebp"); # !in1infty | ||
| 1319 | |||
| 1320 | &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1321 | &lea ("esi",&DWP($in2_z,"esp")); | ||
| 1322 | &lea ("ebp",&DWP($in2_z,"esp")); | ||
| 1323 | &lea ("edi",&DWP($Z2sqr,"esp")); | ||
| 1324 | &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Z2sqr, in2_z); | ||
| 1325 | |||
| 1326 | &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1327 | &lea ("esi",&DWP($in1_z,"esp")); | ||
| 1328 | &lea ("ebp",&DWP($in1_z,"esp")); | ||
| 1329 | &lea ("edi",&DWP($Z1sqr,"esp")); | ||
| 1330 | &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Z1sqr, in1_z); | ||
| 1331 | |||
| 1332 | &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1333 | &lea ("esi",&DWP($Z2sqr,"esp")); | ||
| 1334 | &lea ("ebp",&DWP($in2_z,"esp")); | ||
| 1335 | &lea ("edi",&DWP($S1,"esp")); | ||
| 1336 | &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S1, Z2sqr, in2_z); | ||
| 1337 | |||
| 1338 | &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1339 | &lea ("esi",&DWP($Z1sqr,"esp")); | ||
| 1340 | &lea ("ebp",&DWP($in1_z,"esp")); | ||
| 1341 | &lea ("edi",&DWP($S2,"esp")); | ||
| 1342 | &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S2, Z1sqr, in1_z); | ||
| 1343 | |||
| 1344 | &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1345 | &lea ("esi",&DWP($in1_y,"esp")); | ||
| 1346 | &lea ("ebp",&DWP($S1,"esp")); | ||
| 1347 | &lea ("edi",&DWP($S1,"esp")); | ||
| 1348 | &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S1, S1, in1_y); | ||
| 1349 | |||
| 1350 | &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1351 | &lea ("esi",&DWP($in2_y,"esp")); | ||
| 1352 | &lea ("ebp",&DWP($S2,"esp")); | ||
| 1353 | &lea ("edi",&DWP($S2,"esp")); | ||
| 1354 | &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S2, S2, in2_y); | ||
| 1355 | |||
| 1356 | &lea ("esi",&DWP($S2,"esp")); | ||
| 1357 | &lea ("ebp",&DWP($S1,"esp")); | ||
| 1358 | &lea ("edi",&DWP($R,"esp")); | ||
| 1359 | &call ("_ecp_nistz256_sub"); # p256_sub(R, S2, S1); | ||
| 1360 | |||
| 1361 | &or ("ebx","eax"); # see if result is zero | ||
| 1362 | &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1363 | &or ("ebx","ecx"); | ||
| 1364 | &or ("ebx","edx"); | ||
| 1365 | &or ("ebx",&DWP(0,"edi")); | ||
| 1366 | &or ("ebx",&DWP(4,"edi")); | ||
| 1367 | &lea ("esi",&DWP($in1_x,"esp")); | ||
| 1368 | &or ("ebx",&DWP(8,"edi")); | ||
| 1369 | &lea ("ebp",&DWP($Z2sqr,"esp")); | ||
| 1370 | &or ("ebx",&DWP(12,"edi")); | ||
| 1371 | &lea ("edi",&DWP($U1,"esp")); | ||
| 1372 | &mov (&DWP(32*18+8,"esp"),"ebx"); | ||
| 1373 | |||
| 1374 | &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(U1, in1_x, Z2sqr); | ||
| 1375 | |||
| 1376 | &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1377 | &lea ("esi",&DWP($in2_x,"esp")); | ||
| 1378 | &lea ("ebp",&DWP($Z1sqr,"esp")); | ||
| 1379 | &lea ("edi",&DWP($U2,"esp")); | ||
| 1380 | &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(U2, in2_x, Z1sqr); | ||
| 1381 | |||
| 1382 | &lea ("esi",&DWP($U2,"esp")); | ||
| 1383 | &lea ("ebp",&DWP($U1,"esp")); | ||
| 1384 | &lea ("edi",&DWP($H,"esp")); | ||
| 1385 | &call ("_ecp_nistz256_sub"); # p256_sub(H, U2, U1); | ||
| 1386 | |||
| 1387 | &or ("eax","ebx"); # see if result is zero | ||
| 1388 | &or ("eax","ecx"); | ||
| 1389 | &or ("eax","edx"); | ||
| 1390 | &or ("eax",&DWP(0,"edi")); | ||
| 1391 | &or ("eax",&DWP(4,"edi")); | ||
| 1392 | &or ("eax",&DWP(8,"edi")); | ||
| 1393 | &or ("eax",&DWP(12,"edi")); | ||
| 1394 | |||
| 1395 | &data_byte(0x3e); # predict taken | ||
| 1396 | &jnz (&label("add_proceed")); # is_equal(U1,U2)? | ||
| 1397 | |||
| 1398 | &mov ("eax",&DWP(32*18+0,"esp")); | ||
| 1399 | &and ("eax",&DWP(32*18+4,"esp")); | ||
| 1400 | &mov ("ebx",&DWP(32*18+8,"esp")); | ||
| 1401 | &jz (&label("add_proceed")); # (in1infty || in2infty)? | ||
| 1402 | &test ("ebx","ebx"); | ||
| 1403 | &jz (&label("add_double")); # is_equal(S1,S2)? | ||
| 1404 | |||
| 1405 | &mov ("edi",&wparam(0)); | ||
| 1406 | &xor ("eax","eax"); | ||
| 1407 | &mov ("ecx",96/4); | ||
| 1408 | &data_byte(0xfc,0xf3,0xab); # cld; rep stosd | ||
| 1409 | &jmp (&label("add_done")); | ||
| 1410 | |||
| 1411 | &set_label("add_double",16); | ||
| 1412 | &mov ("esi",&wparam(1)); | ||
| 1413 | &mov ("ebp",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1414 | &add ("esp",4*((8*18+5)-(8*5+1))); # difference in frame sizes | ||
| 1415 | &jmp (&label("point_double_shortcut")); | ||
| 1416 | |||
| 1417 | &set_label("add_proceed",16); | ||
| 1418 | &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1419 | &lea ("esi",&DWP($R,"esp")); | ||
| 1420 | &lea ("ebp",&DWP($R,"esp")); | ||
| 1421 | &lea ("edi",&DWP($Rsqr,"esp")); | ||
| 1422 | &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Rsqr, R); | ||
| 1423 | |||
| 1424 | &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1425 | &lea ("esi",&DWP($H,"esp")); | ||
| 1426 | &lea ("ebp",&DWP($in1_z,"esp")); | ||
| 1427 | &lea ("edi",&DWP($res_z,"esp")); | ||
| 1428 | &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(res_z, H, in1_z); | ||
| 1429 | |||
| 1430 | &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1431 | &lea ("esi",&DWP($H,"esp")); | ||
| 1432 | &lea ("ebp",&DWP($H,"esp")); | ||
| 1433 | &lea ("edi",&DWP($Hsqr,"esp")); | ||
| 1434 | &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Hsqr, H); | ||
| 1435 | |||
| 1436 | &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1437 | &lea ("esi",&DWP($in2_z,"esp")); | ||
| 1438 | &lea ("ebp",&DWP($res_z,"esp")); | ||
| 1439 | &lea ("edi",&DWP($res_z,"esp")); | ||
| 1440 | &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(res_z, res_z, in2_z); | ||
| 1441 | |||
| 1442 | &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1443 | &lea ("esi",&DWP($Hsqr,"esp")); | ||
| 1444 | &lea ("ebp",&DWP($U1,"esp")); | ||
| 1445 | &lea ("edi",&DWP($U2,"esp")); | ||
| 1446 | &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(U2, U1, Hsqr); | ||
| 1447 | |||
| 1448 | &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1449 | &lea ("esi",&DWP($H,"esp")); | ||
| 1450 | &lea ("ebp",&DWP($Hsqr,"esp")); | ||
| 1451 | &lea ("edi",&DWP($Hcub,"esp")); | ||
| 1452 | &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(Hcub, Hsqr, H); | ||
| 1453 | |||
| 1454 | &lea ("esi",&DWP($U2,"esp")); | ||
| 1455 | &lea ("ebp",&DWP($U2,"esp")); | ||
| 1456 | &lea ("edi",&DWP($Hsqr,"esp")); | ||
| 1457 | &call ("_ecp_nistz256_add"); # p256_mul_by_2(Hsqr, U2); | ||
| 1458 | |||
| 1459 | &lea ("esi",&DWP($Rsqr,"esp")); | ||
| 1460 | &lea ("ebp",&DWP($Hsqr,"esp")); | ||
| 1461 | &lea ("edi",&DWP($res_x,"esp")); | ||
| 1462 | &call ("_ecp_nistz256_sub"); # p256_sub(res_x, Rsqr, Hsqr); | ||
| 1463 | |||
| 1464 | &lea ("esi",&DWP($res_x,"esp")); | ||
| 1465 | &lea ("ebp",&DWP($Hcub,"esp")); | ||
| 1466 | &lea ("edi",&DWP($res_x,"esp")); | ||
| 1467 | &call ("_ecp_nistz256_sub"); # p256_sub(res_x, res_x, Hcub); | ||
| 1468 | |||
| 1469 | &lea ("esi",&DWP($U2,"esp")); | ||
| 1470 | &lea ("ebp",&DWP($res_x,"esp")); | ||
| 1471 | &lea ("edi",&DWP($res_y,"esp")); | ||
| 1472 | &call ("_ecp_nistz256_sub"); # p256_sub(res_y, U2, res_x); | ||
| 1473 | |||
| 1474 | &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1475 | &lea ("esi",&DWP($Hcub,"esp")); | ||
| 1476 | &lea ("ebp",&DWP($S1,"esp")); | ||
| 1477 | &lea ("edi",&DWP($S2,"esp")); | ||
| 1478 | &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S2, S1, Hcub); | ||
| 1479 | |||
| 1480 | &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1481 | &lea ("esi",&DWP($R,"esp")); | ||
| 1482 | &lea ("ebp",&DWP($res_y,"esp")); | ||
| 1483 | &lea ("edi",&DWP($res_y,"esp")); | ||
| 1484 | &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(res_y, R, res_y); | ||
| 1485 | |||
| 1486 | &lea ("esi",&DWP($res_y,"esp")); | ||
| 1487 | &lea ("ebp",&DWP($S2,"esp")); | ||
| 1488 | &lea ("edi",&DWP($res_y,"esp")); | ||
| 1489 | &call ("_ecp_nistz256_sub"); # p256_sub(res_y, res_y, S2); | ||
| 1490 | |||
| 1491 | &mov ("ebp",&DWP(32*18+0,"esp")); # !in1infty | ||
| 1492 | &mov ("esi",&DWP(32*18+4,"esp")); # !in2infty | ||
| 1493 | &mov ("edi",&wparam(0)); | ||
| 1494 | &mov ("edx","ebp"); | ||
| 1495 | ¬ ("ebp"); | ||
| 1496 | &and ("edx","esi"); | ||
| 1497 | &and ("ebp","esi"); | ||
| 1498 | ¬ ("esi"); | ||
| 1499 | |||
| 1500 | ######################################## | ||
| 1501 | # conditional moves | ||
| 1502 | for($i=64;$i<96;$i+=4) { | ||
| 1503 | &mov ("eax","edx"); | ||
| 1504 | &and ("eax",&DWP($res_x+$i,"esp")); | ||
| 1505 | &mov ("ebx","ebp"); | ||
| 1506 | &and ("ebx",&DWP($in2_x+$i,"esp")); | ||
| 1507 | &mov ("ecx","esi"); | ||
| 1508 | &and ("ecx",&DWP($in1_x+$i,"esp")); | ||
| 1509 | &or ("eax","ebx"); | ||
| 1510 | &or ("eax","ecx"); | ||
| 1511 | &mov (&DWP($i,"edi"),"eax"); | ||
| 1512 | } | ||
| 1513 | for($i=0;$i<64;$i+=4) { | ||
| 1514 | &mov ("eax","edx"); | ||
| 1515 | &and ("eax",&DWP($res_x+$i,"esp")); | ||
| 1516 | &mov ("ebx","ebp"); | ||
| 1517 | &and ("ebx",&DWP($in2_x+$i,"esp")); | ||
| 1518 | &mov ("ecx","esi"); | ||
| 1519 | &and ("ecx",&DWP($in1_x+$i,"esp")); | ||
| 1520 | &or ("eax","ebx"); | ||
| 1521 | &or ("eax","ecx"); | ||
| 1522 | &mov (&DWP($i,"edi"),"eax"); | ||
| 1523 | } | ||
| 1524 | &set_label("add_done"); | ||
| 1525 | &stack_pop(8*18+5); | ||
| 1526 | } &function_end("ecp_nistz256_point_add"); | ||
| 1527 | |||
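Stripped of the infinity masks and the `add_double` shortcut, the `add_proceed` path computes the textbook Jacobian addition: H = U2 - U1, R = S2 - S1, X3 = R^2 - H^3 - 2*U1*H^2, Y3 = R(U1*H^2 - X3) - S1*H^3, Z3 = H*Z1*Z2. An illustrative sketch (not part of the removed module), with the same hypothetical `mulm`/`addm`/`subm` stand-ins as in the doubling sketch above:

```perl
# Illustrative sketch only -- the infinity and doubling special cases
# are handled by the masks and the add_double shortcut above.
use strict;
use warnings;
use Math::BigInt;

my $p = Math::BigInt->from_hex(
    "ffffffff00000001000000000000000000000000ffffffffffffffffffffffff");

sub mulm { ($_[0] * $_[1]) % $p }
sub addm { ($_[0] + $_[1]) % $p }
sub subm { ($_[0] - $_[1]) % $p }

sub point_add {
    my ($X1, $Y1, $Z1, $X2, $Y2, $Z2) = @_;
    my $Z1sqr = mulm($Z1, $Z1);
    my $Z2sqr = mulm($Z2, $Z2);
    my $S1 = mulm(mulm($Z2sqr, $Z2), $Y1);     # S1 = Y1*Z2^3
    my $S2 = mulm(mulm($Z1sqr, $Z1), $Y2);     # S2 = Y2*Z1^3
    my $R  = subm($S2, $S1);
    my $U1 = mulm($X1, $Z2sqr);                # U1 = X1*Z2^2
    my $U2 = mulm($X2, $Z1sqr);                # U2 = X2*Z1^2
    my $H  = subm($U2, $U1);
    my $Hsqr = mulm($H, $H);
    my $Hcub = mulm($Hsqr, $H);
    my $U1H2 = mulm($U1, $Hsqr);               # the asm reuses U2 for this
    my $X3 = subm(subm(mulm($R, $R), $Hcub), addm($U1H2, $U1H2));
    my $Y3 = subm(mulm($R, subm($U1H2, $X3)), mulm($S1, $Hcub));
    my $Z3 = mulm(mulm($H, $Z1), $Z2);         # res_z = H*Z1*Z2
    return ($X3, $Y3, $Z3);
}
```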
| 1528 | ######################################################################## | ||
| 1529 | # void ecp_nistz256_point_add_affine(P256_POINT *out, | ||
| 1530 | # const P256_POINT *in1, | ||
| 1531 | # const P256_POINT_AFFINE *in2); | ||
| 1532 | &function_begin("ecp_nistz256_point_add_affine"); | ||
| 1533 | { | ||
| 1534 | my ($res_x,$res_y,$res_z, | ||
| 1535 | $in1_x,$in1_y,$in1_z, | ||
| 1536 | $in2_x,$in2_y, | ||
| 1537 | $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..14)); | ||
| 1538 | my $Z1sqr = $S2; | ||
| 1539 | my @ONE_mont=(1,0,0,-1,-1,-1,-2,0); | ||
| 1540 | |||
| 1541 | &mov ("esi",&wparam(1)); | ||
| 1542 | |||
| 1543 | # above map() describes stack layout with 15 temporary | ||
| 1544 | # 256-bit vectors on top, then we take extra words for | ||
| 1545 | # !in1infty, !in2infty, and OPENSSL_ia32cap_P copy. | ||
| 1546 | &stack_push(8*15+3); | ||
| 1547 | if ($sse2) { | ||
| 1548 | &call ("_picup_eax"); | ||
| 1549 | &set_label("pic"); | ||
| 1550 | &picmeup("edx","OPENSSL_ia32cap_P","eax",&label("pic")); | ||
| 1551 | &mov ("ebp",&DWP(0,"edx")); } | ||
| 1552 | |||
| 1553 | &lea ("edi",&DWP($in1_x,"esp")); | ||
| 1554 | for($i=0;$i<96;$i+=16) { | ||
| 1555 | &mov ("eax",&DWP($i+0,"esi")); # copy in1 | ||
| 1556 | &mov ("ebx",&DWP($i+4,"esi")); | ||
| 1557 | &mov ("ecx",&DWP($i+8,"esi")); | ||
| 1558 | &mov ("edx",&DWP($i+12,"esi")); | ||
| 1559 | &mov (&DWP($i+0,"edi"),"eax"); | ||
| 1560 | &mov (&DWP(32*15+8,"esp"),"ebp") if ($i==0); | ||
| 1561 | &mov ("ebp","eax") if ($i==64); | ||
| 1562 | &or ("ebp","eax") if ($i>64); | ||
| 1563 | &mov (&DWP($i+4,"edi"),"ebx"); | ||
| 1564 | &or ("ebp","ebx") if ($i>=64); | ||
| 1565 | &mov (&DWP($i+8,"edi"),"ecx"); | ||
| 1566 | &or ("ebp","ecx") if ($i>=64); | ||
| 1567 | &mov (&DWP($i+12,"edi"),"edx"); | ||
| 1568 | &or ("ebp","edx") if ($i>=64); | ||
| 1569 | } | ||
| 1570 | &xor ("eax","eax"); | ||
| 1571 | &mov ("esi",&wparam(2)); | ||
| 1572 | &sub ("eax","ebp"); | ||
| 1573 | &or ("ebp","eax"); | ||
| 1574 | &sar ("ebp",31); | ||
| 1575 | &mov (&DWP(32*15+0,"esp"),"ebp"); # !in1infty | ||
| 1576 | |||
| 1577 | &lea ("edi",&DWP($in2_x,"esp")); | ||
| 1578 | for($i=0;$i<64;$i+=16) { | ||
| 1579 | &mov ("eax",&DWP($i+0,"esi")); # copy in2 | ||
| 1580 | &mov ("ebx",&DWP($i+4,"esi")); | ||
| 1581 | &mov ("ecx",&DWP($i+8,"esi")); | ||
| 1582 | &mov ("edx",&DWP($i+12,"esi")); | ||
| 1583 | &mov (&DWP($i+0,"edi"),"eax"); | ||
| 1584 | &mov ("ebp","eax") if ($i==0); | ||
| 1585 | &or ("ebp","eax") if ($i!=0); | ||
| 1586 | &mov (&DWP($i+4,"edi"),"ebx"); | ||
| 1587 | &or ("ebp","ebx"); | ||
| 1588 | &mov (&DWP($i+8,"edi"),"ecx"); | ||
| 1589 | &or ("ebp","ecx"); | ||
| 1590 | &mov (&DWP($i+12,"edi"),"edx"); | ||
| 1591 | &or ("ebp","edx"); | ||
| 1592 | } | ||
| 1593 | &xor ("ebx","ebx"); | ||
| 1594 | &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1595 | &sub ("ebx","ebp"); | ||
| 1596 | &lea ("esi",&DWP($in1_z,"esp")); | ||
| 1597 | &or ("ebx","ebp"); | ||
| 1598 | &lea ("ebp",&DWP($in1_z,"esp")); | ||
| 1599 | &sar ("ebx",31); | ||
| 1600 | &lea ("edi",&DWP($Z1sqr,"esp")); | ||
| 1601 | &mov (&DWP(32*15+4,"esp"),"ebx"); # !in2infty | ||
| 1602 | |||
| 1603 | &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Z1sqr, in1_z); | ||
| 1604 | |||
| 1605 | &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1606 | &lea ("esi",&DWP($in2_x,"esp")); | ||
| 1607 | &mov ("ebp","edi"); # %edi is still &Z1sqr | ||
| 1608 | &lea ("edi",&DWP($U2,"esp")); | ||
| 1609 | &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(U2, Z1sqr, in2_x); | ||
| 1610 | |||
| 1611 | &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1612 | &lea ("esi",&DWP($in1_z,"esp")); | ||
| 1613 | &lea ("ebp",&DWP($Z1sqr,"esp")); | ||
| 1614 | &lea ("edi",&DWP($S2,"esp")); | ||
| 1615 | &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S2, Z1sqr, in1_z); | ||
| 1616 | |||
| 1617 | &lea ("esi",&DWP($U2,"esp")); | ||
| 1618 | &lea ("ebp",&DWP($in1_x,"esp")); | ||
| 1619 | &lea ("edi",&DWP($H,"esp")); | ||
| 1620 | &call ("_ecp_nistz256_sub"); # p256_sub(H, U2, in1_x); | ||
| 1621 | |||
| 1622 | &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1623 | &lea ("esi",&DWP($in2_y,"esp")); | ||
| 1624 | &lea ("ebp",&DWP($S2,"esp")); | ||
| 1625 | &lea ("edi",&DWP($S2,"esp")); | ||
| 1626 | &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S2, S2, in2_y); | ||
| 1627 | |||
| 1628 | &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1629 | &lea ("esi",&DWP($in1_z,"esp")); | ||
| 1630 | &lea ("ebp",&DWP($H,"esp")); | ||
| 1631 | &lea ("edi",&DWP($res_z,"esp")); | ||
| 1632 | &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(res_z, H, in1_z); | ||
| 1633 | |||
| 1634 | &lea ("esi",&DWP($S2,"esp")); | ||
| 1635 | &lea ("ebp",&DWP($in1_y,"esp")); | ||
| 1636 | &lea ("edi",&DWP($R,"esp")); | ||
| 1637 | &call ("_ecp_nistz256_sub"); # p256_sub(R, S2, in1_y); | ||
| 1638 | |||
| 1639 | &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1640 | &lea ("esi",&DWP($H,"esp")); | ||
| 1641 | &lea ("ebp",&DWP($H,"esp")); | ||
| 1642 | &lea ("edi",&DWP($Hsqr,"esp")); | ||
| 1643 | &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Hsqr, H); | ||
| 1644 | |||
| 1645 | &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1646 | &lea ("esi",&DWP($R,"esp")); | ||
| 1647 | &lea ("ebp",&DWP($R,"esp")); | ||
| 1648 | &lea ("edi",&DWP($Rsqr,"esp")); | ||
| 1649 | &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Rsqr, R); | ||
| 1650 | |||
| 1651 | &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1652 | &lea ("esi",&DWP($in1_x,"esp")); | ||
| 1653 | &lea ("ebp",&DWP($Hsqr,"esp")); | ||
| 1654 | &lea ("edi",&DWP($U2,"esp")); | ||
| 1655 | &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(U2, in1_x, Hsqr); | ||
| 1656 | |||
| 1657 | &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1658 | &lea ("esi",&DWP($H,"esp")); | ||
| 1659 | &lea ("ebp",&DWP($Hsqr,"esp")); | ||
| 1660 | &lea ("edi",&DWP($Hcub,"esp")); | ||
| 1661 | &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(Hcub, Hsqr, H); | ||
| 1662 | |||
| 1663 | &lea ("esi",&DWP($U2,"esp")); | ||
| 1664 | &lea ("ebp",&DWP($U2,"esp")); | ||
| 1665 | &lea ("edi",&DWP($Hsqr,"esp")); | ||
| 1666 | &call ("_ecp_nistz256_add"); # p256_mul_by_2(Hsqr, U2); | ||
| 1667 | |||
| 1668 | &lea ("esi",&DWP($Rsqr,"esp")); | ||
| 1669 | &lea ("ebp",&DWP($Hsqr,"esp")); | ||
| 1670 | &lea ("edi",&DWP($res_x,"esp")); | ||
| 1671 | &call ("_ecp_nistz256_sub"); # p256_sub(res_x, Rsqr, Hsqr); | ||
| 1672 | |||
| 1673 | &lea ("esi",&DWP($res_x,"esp")); | ||
| 1674 | &lea ("ebp",&DWP($Hcub,"esp")); | ||
| 1675 | &lea ("edi",&DWP($res_x,"esp")); | ||
| 1676 | &call ("_ecp_nistz256_sub"); # p256_sub(res_x, res_x, Hcub); | ||
| 1677 | |||
| 1678 | &lea ("esi",&DWP($U2,"esp")); | ||
| 1679 | &lea ("ebp",&DWP($res_x,"esp")); | ||
| 1680 | &lea ("edi",&DWP($res_y,"esp")); | ||
| 1681 | &call ("_ecp_nistz256_sub"); # p256_sub(res_y, U2, res_x); | ||
| 1682 | |||
| 1683 | &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1684 | &lea ("esi",&DWP($Hcub,"esp")); | ||
| 1685 | &lea ("ebp",&DWP($in1_y,"esp")); | ||
| 1686 | &lea ("edi",&DWP($S2,"esp")); | ||
| 1687 | &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S2, Hcub, in1_y); | ||
| 1688 | |||
| 1689 | &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy | ||
| 1690 | &lea ("esi",&DWP($R,"esp")); | ||
| 1691 | &lea ("ebp",&DWP($res_y,"esp")); | ||
| 1692 | &lea ("edi",&DWP($res_y,"esp")); | ||
| 1693 | &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(res_y, res_y, R); | ||
| 1694 | |||
| 1695 | &lea ("esi",&DWP($res_y,"esp")); | ||
| 1696 | &lea ("ebp",&DWP($S2,"esp")); | ||
| 1697 | &lea ("edi",&DWP($res_y,"esp")); | ||
| 1698 | &call ("_ecp_nistz256_sub"); # p256_sub(res_y, res_y, S2); | ||
| 1699 | |||
| 1700 | &mov ("ebp",&DWP(32*15+0,"esp")); # !in1infty | ||
| 1701 | &mov ("esi",&DWP(32*15+4,"esp")); # !in2infty | ||
| 1702 | &mov ("edi",&wparam(0)); | ||
| 1703 | &mov ("edx","ebp"); | ||
| 1704 | ¬ ("ebp"); | ||
| 1705 | &and ("edx","esi"); | ||
| 1706 | &and ("ebp","esi"); | ||
| 1707 | ¬ ("esi"); | ||
| 1708 | |||
| 1709 | ######################################## | ||
| 1710 | # conditional moves | ||
| 1711 | for($i=64;$i<96;$i+=4) { | ||
| 1712 | my $one=@ONE_mont[($i-64)/4]; | ||
| 1713 | |||
| 1714 | &mov ("eax","edx"); | ||
| 1715 | &and ("eax",&DWP($res_x+$i,"esp")); | ||
| 1716 | &mov ("ebx","ebp") if ($one && $one!=-1); | ||
| 1717 | &and ("ebx",$one) if ($one && $one!=-1); | ||
| 1718 | &mov ("ecx","esi"); | ||
| 1719 | &and ("ecx",&DWP($in1_x+$i,"esp")); | ||
| 1720 | &or ("eax",$one==-1?"ebp":"ebx") if ($one); | ||
| 1721 | &or ("eax","ecx"); | ||
| 1722 | &mov (&DWP($i,"edi"),"eax"); | ||
| 1723 | } | ||
| 1724 | for($i=0;$i<64;$i+=4) { | ||
| 1725 | &mov ("eax","edx"); | ||
| 1726 | &and ("eax",&DWP($res_x+$i,"esp")); | ||
| 1727 | &mov ("ebx","ebp"); | ||
| 1728 | &and ("ebx",&DWP($in2_x+$i,"esp")); | ||
| 1729 | &mov ("ecx","esi"); | ||
| 1730 | &and ("ecx",&DWP($in1_x+$i,"esp")); | ||
| 1731 | &or ("eax","ebx"); | ||
| 1732 | &or ("eax","ecx"); | ||
| 1733 | &mov (&DWP($i,"edi"),"eax"); | ||
| 1734 | } | ||
| 1735 | &stack_pop(8*15+3); | ||
| 1736 | } &function_end("ecp_nistz256_point_add_affine"); | ||
| 1737 | |||
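The conditional-move loops that end both point_add and point_add_affine implement a branchless three-way select: `edx`, `ebp` and `esi` hold all-ones/all-zero masks derived from !in1infty and !in2infty, and every output word is assembled with AND/OR only. (In the affine case the in2 point has no Z coordinate, so for that slice the words of @ONE_mont are folded in as immediates instead.) A scalar model of the mask algebra, with a hypothetical `merge_word` helper (illustrative only):

```perl
# Illustrative sketch only.
use strict;
use warnings;

sub merge_word {
    my ($res, $in2, $in1, $not_in1inf, $not_in2inf) = @_;  # masks: ~0 or 0
    my $m_res = $not_in1inf & $not_in2inf;   # both inputs finite: result
    my $m_in2 = ~$not_in1inf & $not_in2inf;  # in1 at infinity: copy in2
    my $m_in1 = ~$not_in2inf;                # in2 at infinity: copy in1
    return ($res & $m_res) | ($in2 & $m_in2) | ($in1 & $m_in1);
}

# in1 at infinity, in2 finite: the in2 word wins.
printf "0x%x\n", merge_word(0xaaaa, 0xbbbb, 0xcccc, 0, ~0);   # 0xbbbb
```

No branch ever depends on whether either input was the point at infinity, which keeps the whole point operation constant-time.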
| 1738 | &asm_finish(); | ||
| 1739 | |||
| 1740 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/ec/asm/ecp_nistz256-x86_64.pl b/src/lib/libcrypto/ec/asm/ecp_nistz256-x86_64.pl deleted file mode 100644 index b772aae742..0000000000 --- a/src/lib/libcrypto/ec/asm/ecp_nistz256-x86_64.pl +++ /dev/null | |||
| @@ -1,1971 +0,0 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | # $OpenBSD: ecp_nistz256-x86_64.pl,v 1.1 2016/11/04 17:33:20 miod Exp $ | ||
| 3 | # | ||
| 4 | # Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. | ||
| 5 | # | ||
| 6 | # Licensed under the OpenSSL license (the "License"). You may not use | ||
| 7 | # this file except in compliance with the License. You can obtain a copy | ||
| 8 | # in the file LICENSE in the source distribution or at | ||
| 9 | # https://www.openssl.org/source/license.html | ||
| 10 | |||
| 11 | # Copyright (c) 2014, Intel Corporation. | ||
| 12 | # | ||
| 13 | # Permission to use, copy, modify, and/or distribute this software for any | ||
| 14 | # purpose with or without fee is hereby granted, provided that the above | ||
| 15 | # copyright notice and this permission notice appear in all copies. | ||
| 16 | # | ||
| 17 | # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES | ||
| 18 | # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF | ||
| 19 | # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY | ||
| 20 | # SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES | ||
| 21 | # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION | ||
| 22 | # OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN | ||
| 23 | # CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | ||
| 24 | |||
| 25 | # Developers and authors: | ||
| 26 | # Shay Gueron (1, 2), and Vlad Krasnov (1) | ||
| 27 | # (1) Intel Corporation, Israel Development Center | ||
| 28 | # (2) University of Haifa | ||
| 29 | |||
| 30 | # Reference: | ||
| 31 | # S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with | ||
| 32 | # 256 Bit Primes" | ||
| 33 | |||
| 34 | # Further optimization by <appro@openssl.org>: | ||
| 35 | # | ||
| 36 | # this/original with/without -DECP_NISTZ256_ASM(*) | ||
| 37 | # Opteron +12-49% +110-150% | ||
| 38 | # Bulldozer +14-45% +175-210% | ||
| 39 | # P4 +18-46% n/a :-( | ||
| 40 | # Westmere +12-34% +80-87% | ||
| 41 | # Sandy Bridge +9-35% +110-120% | ||
| 42 | # Ivy Bridge +9-35% +110-125% | ||
| 43 | # Haswell +8-37% +140-160% | ||
| 44 | # Broadwell +18-58% +145-210% | ||
| 45 | # Atom +15-50% +130-180% | ||
| 46 | # VIA Nano +43-160% +300-480% | ||
| 47 | # | ||
| 48 | # (*) "without -DECP_NISTZ256_ASM" refers to build with | ||
| 49 | # "enable-ec_nistp_64_gcc_128"; | ||
| 50 | # | ||
| 51 | # Ranges denote minimum and maximum improvement coefficients depending | ||
| 52 | # on benchmark. Lower coefficients are for ECDSA sign, the relatively fastest | ||
| 53 | # server-side operation. Keep in mind that +100% means 2x improvement. | ||
| 54 | |||
| 55 | $flavour = shift; | ||
| 56 | $output = shift; | ||
| 57 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } | ||
| 58 | |||
| 59 | $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); | ||
| 60 | |||
| 61 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
| 62 | ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | ||
| 63 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | ||
| 64 | die "can't locate x86_64-xlate.pl"; | ||
| 65 | |||
| 66 | open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; | ||
| 67 | *STDOUT=*OUT; | ||
| 68 | |||
| 69 | $code.=<<___; | ||
| 70 | .text | ||
| 71 | |||
| 72 | # The polynomial | ||
| 73 | .align 64 | ||
| 74 | .Lpoly: | ||
| 75 | .quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001 | ||
| 76 | |||
| 77 | .LOne: | ||
| 78 | .long 1,1,1,1,1,1,1,1 | ||
| 79 | .LTwo: | ||
| 80 | .long 2,2,2,2,2,2,2,2 | ||
| 81 | .LThree: | ||
| 82 | .long 3,3,3,3,3,3,3,3 | ||
| 83 | .LONE_mont: | ||
| 84 | .quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe | ||
| 85 | ___ | ||
| 86 | |||
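.LONE_mont is 1 in Montgomery form, i.e. R mod p for R = 2^256 (@ONE_mont in the 386 module above is the same value split into 32-bit words). A quick Math::BigInt check, not part of the module, reproduces the four quads:

```perl
# Illustrative check only -- not part of the module.
use strict;
use warnings;
use Math::BigInt;

my $p = Math::BigInt->from_hex(   # .Lpoly = 2^256 - 2^224 + 2^192 + 2^96 - 1
    "ffffffff00000001000000000000000000000000ffffffffffffffffffffffff");
my $one_mont = Math::BigInt->new(2)->bpow(256)->bmod($p);   # R mod p

my $hex = sprintf "%064s", substr($one_mont->as_hex, 2);
# least significant quad first, matching the .quad order of .LONE_mont
print join(", ", map { "0x" . substr($hex, 48 - 16 * $_, 16) } 0 .. 3), "\n";
# 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
```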
| 87 | { | ||
| 88 | ################################################################################ | ||
| 89 | # void ecp_nistz256_mul_by_2(uint64_t res[4], uint64_t a[4]); | ||
| 90 | |||
| 91 | my ($a0,$a1,$a2,$a3)=map("%r$_",(8..11)); | ||
| 92 | my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rdx","%rcx","%r12","%r13"); | ||
| 93 | my ($r_ptr,$a_ptr,$b_ptr)=("%rdi","%rsi","%rdx"); | ||
| 94 | |||
| 95 | $code.=<<___; | ||
| 96 | |||
| 97 | .globl ecp_nistz256_mul_by_2 | ||
| 98 | .type ecp_nistz256_mul_by_2,\@function,2 | ||
| 99 | .align 64 | ||
| 100 | ecp_nistz256_mul_by_2: | ||
| 101 | push %r12 | ||
| 102 | push %r13 | ||
| 103 | |||
| 104 | mov 8*0($a_ptr), $a0 | ||
| 105 | mov 8*1($a_ptr), $a1 | ||
| 106 | add $a0, $a0 # a0:a3+a0:a3 | ||
| 107 | mov 8*2($a_ptr), $a2 | ||
| 108 | adc $a1, $a1 | ||
| 109 | mov 8*3($a_ptr), $a3 | ||
| 110 | lea .Lpoly(%rip), $a_ptr | ||
| 111 | mov $a0, $t0 | ||
| 112 | adc $a2, $a2 | ||
| 113 | adc $a3, $a3 | ||
| 114 | mov $a1, $t1 | ||
| 115 | sbb $t4, $t4 | ||
| 116 | |||
| 117 | sub 8*0($a_ptr), $a0 | ||
| 118 | mov $a2, $t2 | ||
| 119 | sbb 8*1($a_ptr), $a1 | ||
| 120 | sbb 8*2($a_ptr), $a2 | ||
| 121 | mov $a3, $t3 | ||
| 122 | sbb 8*3($a_ptr), $a3 | ||
| 123 | test $t4, $t4 | ||
| 124 | |||
| 125 | cmovz $t0, $a0 | ||
| 126 | cmovz $t1, $a1 | ||
| 127 | mov $a0, 8*0($r_ptr) | ||
| 128 | cmovz $t2, $a2 | ||
| 129 | mov $a1, 8*1($r_ptr) | ||
| 130 | cmovz $t3, $a3 | ||
| 131 | mov $a2, 8*2($r_ptr) | ||
| 132 | mov $a3, 8*3($r_ptr) | ||
| 133 | |||
| 134 | pop %r13 | ||
| 135 | pop %r12 | ||
| 136 | ret | ||
| 137 | .size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2 | ||
| 138 | |||
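Since the input is fully reduced, a + a is at most 2p - 2, so a single conditional subtraction of p suffices; the `sbb $t4, $t4` / `test` / `cmovz` sequence performs it without a data-dependent branch. The same computation as a big-integer sketch (illustrative only, hypothetical `mul_by_2` helper):

```perl
# Illustrative sketch only; the asm does the subtraction branchlessly.
use strict;
use warnings;
use Math::BigInt;

my $p = Math::BigInt->from_hex(
    "ffffffff00000001000000000000000000000000ffffffffffffffffffffffff");

sub mul_by_2 {                           # assumes 0 <= $a < $p on entry
    my ($a) = @_;
    my $r = $a->copy->badd($a);          # the add/adc chain: up to 257 bits
    $r->bsub($p) if $r->bcmp($p) >= 0;   # at most one subtraction of p
    return $r;
}

print mul_by_2(Math::BigInt->new(1)), "\n";   # 2
```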
| 139 | ################################################################################ | ||
| 140 | # void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]); | ||
| 141 | .globl ecp_nistz256_neg | ||
| 142 | .type ecp_nistz256_neg,\@function,2 | ||
| 143 | .align 32 | ||
| 144 | ecp_nistz256_neg: | ||
| 145 | push %r12 | ||
| 146 | push %r13 | ||
| 147 | |||
| 148 | xor $a0, $a0 | ||
| 149 | xor $a1, $a1 | ||
| 150 | xor $a2, $a2 | ||
| 151 | xor $a3, $a3 | ||
| 152 | xor $t4, $t4 | ||
| 153 | |||
| 154 | sub 8*0($a_ptr), $a0 | ||
| 155 | sbb 8*1($a_ptr), $a1 | ||
| 156 | sbb 8*2($a_ptr), $a2 | ||
| 157 | mov $a0, $t0 | ||
| 158 | sbb 8*3($a_ptr), $a3 | ||
| 159 | lea .Lpoly(%rip), $a_ptr | ||
| 160 | mov $a1, $t1 | ||
| 161 | sbb \$0, $t4 | ||
| 162 | |||
| 163 | add 8*0($a_ptr), $a0 | ||
| 164 | mov $a2, $t2 | ||
| 165 | adc 8*1($a_ptr), $a1 | ||
| 166 | adc 8*2($a_ptr), $a2 | ||
| 167 | mov $a3, $t3 | ||
| 168 | adc 8*3($a_ptr), $a3 | ||
| 169 | test $t4, $t4 | ||
| 170 | |||
| 171 | cmovz $t0, $a0 | ||
| 172 | cmovz $t1, $a1 | ||
| 173 | mov $a0, 8*0($r_ptr) | ||
| 174 | cmovz $t2, $a2 | ||
| 175 | mov $a1, 8*1($r_ptr) | ||
| 176 | cmovz $t3, $a3 | ||
| 177 | mov $a2, 8*2($r_ptr) | ||
| 178 | mov $a3, 8*3($r_ptr) | ||
| 179 | |||
| 180 | pop %r13 | ||
| 181 | pop %r12 | ||
| 182 | ret | ||
| 183 | .size ecp_nistz256_neg,.-ecp_nistz256_neg | ||
| 184 | ___ | ||
| 185 | } | ||
| 186 | { | ||
| 187 | my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx"); | ||
| 188 | my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15)); | ||
| 189 | my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax"); | ||
| 190 | my ($poly1,$poly3)=($acc6,$acc7); | ||
| 191 | |||
| 192 | $code.=<<___; | ||
| 193 | ################################################################################ | ||
| 194 | # void ecp_nistz256_mul_mont( | ||
| 195 | # uint64_t res[4], | ||
| 196 | # uint64_t a[4], | ||
| 197 | # uint64_t b[4]); | ||
| 198 | |||
| 199 | .globl ecp_nistz256_mul_mont | ||
| 200 | .type ecp_nistz256_mul_mont,\@function,3 | ||
| 201 | .align 32 | ||
| 202 | ecp_nistz256_mul_mont: | ||
| 203 | .Lmul_mont: | ||
| 204 | push %rbp | ||
| 205 | push %rbx | ||
| 206 | push %r12 | ||
| 207 | push %r13 | ||
| 208 | push %r14 | ||
| 209 | push %r15 | ||
| 210 | |||
| 211 | mov $b_org, $b_ptr | ||
| 212 | mov 8*0($b_org), %rax | ||
| 213 | mov 8*0($a_ptr), $acc1 | ||
| 214 | mov 8*1($a_ptr), $acc2 | ||
| 215 | mov 8*2($a_ptr), $acc3 | ||
| 216 | mov 8*3($a_ptr), $acc4 | ||
| 217 | |||
| 218 | call __ecp_nistz256_mul_montq | ||
| 219 | |||
| 220 | pop %r15 | ||
| 221 | pop %r14 | ||
| 222 | pop %r13 | ||
| 223 | pop %r12 | ||
| 224 | pop %rbx | ||
| 225 | pop %rbp | ||
| 226 | ret | ||
| 227 | .size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont | ||
| 228 | |||
| 229 | .type __ecp_nistz256_mul_montq,\@abi-omnipotent | ||
| 230 | .align 32 | ||
| 231 | __ecp_nistz256_mul_montq: | ||
| 232 | ######################################################################## | ||
| 233 | # Multiply a by b[0] | ||
| 234 | mov %rax, $t1 | ||
| 235 | mulq $acc1 | ||
| 236 | mov .Lpoly+8*1(%rip),$poly1 | ||
| 237 | mov %rax, $acc0 | ||
| 238 | mov $t1, %rax | ||
| 239 | mov %rdx, $acc1 | ||
| 240 | |||
| 241 | mulq $acc2 | ||
| 242 | mov .Lpoly+8*3(%rip),$poly3 | ||
| 243 | add %rax, $acc1 | ||
| 244 | mov $t1, %rax | ||
| 245 | adc \$0, %rdx | ||
| 246 | mov %rdx, $acc2 | ||
| 247 | |||
| 248 | mulq $acc3 | ||
| 249 | add %rax, $acc2 | ||
| 250 | mov $t1, %rax | ||
| 251 | adc \$0, %rdx | ||
| 252 | mov %rdx, $acc3 | ||
| 253 | |||
| 254 | mulq $acc4 | ||
| 255 | add %rax, $acc3 | ||
| 256 | mov $acc0, %rax | ||
| 257 | adc \$0, %rdx | ||
| 258 | xor $acc5, $acc5 | ||
| 259 | mov %rdx, $acc4 | ||
| 260 | |||
| 261 | ######################################################################## | ||
| 262 | # First reduction step | ||
| 263 | # Basically now we want to multiply acc[0] by p256, | ||
| 264 | # and add the result to the acc. | ||
| 265 | # Due to the special form of p256 we do some optimizations | ||
| 266 | # | ||
| 267 | # acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0] | ||
| 268 | # then we add acc[0] and get acc[0] x 2^96 | ||
| 269 | |||
| 270 | mov $acc0, $t1 | ||
| 271 | shl \$32, $acc0 | ||
| 272 | mulq $poly3 | ||
| 273 | shr \$32, $t1 | ||
| 274 | add $acc0, $acc1 # +=acc[0]<<96 | ||
| 275 | adc $t1, $acc2 | ||
| 276 | adc %rax, $acc3 | ||
| 277 | mov 8*1($b_ptr), %rax | ||
| 278 | adc %rdx, $acc4 | ||
| 279 | adc \$0, $acc5 | ||
| 280 | xor $acc0, $acc0 | ||
| 281 | |||
| 282 | ######################################################################## | ||
| 283 | # Multiply by b[1] | ||
| 284 | mov %rax, $t1 | ||
| 285 | mulq 8*0($a_ptr) | ||
| 286 | add %rax, $acc1 | ||
| 287 | mov $t1, %rax | ||
| 288 | adc \$0, %rdx | ||
| 289 | mov %rdx, $t0 | ||
| 290 | |||
| 291 | mulq 8*1($a_ptr) | ||
| 292 | add $t0, $acc2 | ||
| 293 | adc \$0, %rdx | ||
| 294 | add %rax, $acc2 | ||
| 295 | mov $t1, %rax | ||
| 296 | adc \$0, %rdx | ||
| 297 | mov %rdx, $t0 | ||
| 298 | |||
| 299 | mulq 8*2($a_ptr) | ||
| 300 | add $t0, $acc3 | ||
| 301 | adc \$0, %rdx | ||
| 302 | add %rax, $acc3 | ||
| 303 | mov $t1, %rax | ||
| 304 | adc \$0, %rdx | ||
| 305 | mov %rdx, $t0 | ||
| 306 | |||
| 307 | mulq 8*3($a_ptr) | ||
| 308 | add $t0, $acc4 | ||
| 309 | adc \$0, %rdx | ||
| 310 | add %rax, $acc4 | ||
| 311 | mov $acc1, %rax | ||
| 312 | adc %rdx, $acc5 | ||
| 313 | adc \$0, $acc0 | ||
| 314 | |||
| 315 | ######################################################################## | ||
| 316 | # Second reduction step | ||
| 317 | mov $acc1, $t1 | ||
| 318 | shl \$32, $acc1 | ||
| 319 | mulq $poly3 | ||
| 320 | shr \$32, $t1 | ||
| 321 | add $acc1, $acc2 | ||
| 322 | adc $t1, $acc3 | ||
| 323 | adc %rax, $acc4 | ||
| 324 | mov 8*2($b_ptr), %rax | ||
| 325 | adc %rdx, $acc5 | ||
| 326 | adc \$0, $acc0 | ||
| 327 | xor $acc1, $acc1 | ||
| 328 | |||
| 329 | ######################################################################## | ||
| 330 | # Multiply by b[2] | ||
| 331 | mov %rax, $t1 | ||
| 332 | mulq 8*0($a_ptr) | ||
| 333 | add %rax, $acc2 | ||
| 334 | mov $t1, %rax | ||
| 335 | adc \$0, %rdx | ||
| 336 | mov %rdx, $t0 | ||
| 337 | |||
| 338 | mulq 8*1($a_ptr) | ||
| 339 | add $t0, $acc3 | ||
| 340 | adc \$0, %rdx | ||
| 341 | add %rax, $acc3 | ||
| 342 | mov $t1, %rax | ||
| 343 | adc \$0, %rdx | ||
| 344 | mov %rdx, $t0 | ||
| 345 | |||
| 346 | mulq 8*2($a_ptr) | ||
| 347 | add $t0, $acc4 | ||
| 348 | adc \$0, %rdx | ||
| 349 | add %rax, $acc4 | ||
| 350 | mov $t1, %rax | ||
| 351 | adc \$0, %rdx | ||
| 352 | mov %rdx, $t0 | ||
| 353 | |||
| 354 | mulq 8*3($a_ptr) | ||
| 355 | add $t0, $acc5 | ||
| 356 | adc \$0, %rdx | ||
| 357 | add %rax, $acc5 | ||
| 358 | mov $acc2, %rax | ||
| 359 | adc %rdx, $acc0 | ||
| 360 | adc \$0, $acc1 | ||
| 361 | |||
| 362 | ######################################################################## | ||
| 363 | # Third reduction step | ||
| 364 | mov $acc2, $t1 | ||
| 365 | shl \$32, $acc2 | ||
| 366 | mulq $poly3 | ||
| 367 | shr \$32, $t1 | ||
| 368 | add $acc2, $acc3 | ||
| 369 | adc $t1, $acc4 | ||
| 370 | adc %rax, $acc5 | ||
| 371 | mov 8*3($b_ptr), %rax | ||
| 372 | adc %rdx, $acc0 | ||
| 373 | adc \$0, $acc1 | ||
| 374 | xor $acc2, $acc2 | ||
| 375 | |||
| 376 | ######################################################################## | ||
| 377 | # Multiply by b[3] | ||
| 378 | mov %rax, $t1 | ||
| 379 | mulq 8*0($a_ptr) | ||
| 380 | add %rax, $acc3 | ||
| 381 | mov $t1, %rax | ||
| 382 | adc \$0, %rdx | ||
| 383 | mov %rdx, $t0 | ||
| 384 | |||
| 385 | mulq 8*1($a_ptr) | ||
| 386 | add $t0, $acc4 | ||
| 387 | adc \$0, %rdx | ||
| 388 | add %rax, $acc4 | ||
| 389 | mov $t1, %rax | ||
| 390 | adc \$0, %rdx | ||
| 391 | mov %rdx, $t0 | ||
| 392 | |||
| 393 | mulq 8*2($a_ptr) | ||
| 394 | add $t0, $acc5 | ||
| 395 | adc \$0, %rdx | ||
| 396 | add %rax, $acc5 | ||
| 397 | mov $t1, %rax | ||
| 398 | adc \$0, %rdx | ||
| 399 | mov %rdx, $t0 | ||
| 400 | |||
| 401 | mulq 8*3($a_ptr) | ||
| 402 | add $t0, $acc0 | ||
| 403 | adc \$0, %rdx | ||
| 404 | add %rax, $acc0 | ||
| 405 | mov $acc3, %rax | ||
| 406 | adc %rdx, $acc1 | ||
| 407 | adc \$0, $acc2 | ||
| 408 | |||
| 409 | ######################################################################## | ||
| 410 | # Final reduction step | ||
| 411 | mov $acc3, $t1 | ||
| 412 | shl \$32, $acc3 | ||
| 413 | mulq $poly3 | ||
| 414 | shr \$32, $t1 | ||
| 415 | add $acc3, $acc4 | ||
| 416 | adc $t1, $acc5 | ||
| 417 | mov $acc4, $t0 | ||
| 418 | adc %rax, $acc0 | ||
| 419 | adc %rdx, $acc1 | ||
| 420 | mov $acc5, $t1 | ||
| 421 | adc \$0, $acc2 | ||
| 422 | |||
| 423 | ######################################################################## | ||
| 424 | # Branch-less conditional subtraction of P | ||
| 425 | sub \$-1, $acc4 # .Lpoly[0] | ||
| 426 | mov $acc0, $t2 | ||
| 427 | sbb $poly1, $acc5 # .Lpoly[1] | ||
| 428 | sbb \$0, $acc0 # .Lpoly[2] | ||
| 429 | mov $acc1, $t3 | ||
| 430 | sbb $poly3, $acc1 # .Lpoly[3] | ||
| 431 | sbb \$0, $acc2 | ||
| 432 | |||
| 433 | cmovc $t0, $acc4 | ||
| 434 | cmovc $t1, $acc5 | ||
| 435 | mov $acc4, 8*0($r_ptr) | ||
| 436 | cmovc $t2, $acc0 | ||
| 437 | mov $acc5, 8*1($r_ptr) | ||
| 438 | cmovc $t3, $acc1 | ||
| 439 | mov $acc0, 8*2($r_ptr) | ||
| 440 | mov $acc1, 8*3($r_ptr) | ||
| 441 | |||
| 442 | ret | ||
| 443 | .size __ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq | ||
| 444 | |||
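Each reduction step leans on the identity acc0 * p = (acc0 * .Lpoly[3]) * 2^192 + acc0 * 2^96 - acc0. Because p is -1 (mod 2^64), adding acc0 * p cancels the accumulator's low limb outright, leaving only the `shl`/`shr` pair (the 2^96 term) and a single `mulq` by .Lpoly[3]. A check of the identity (illustrative only; $acc0 is an arbitrary limb value):

```perl
# Illustrative check only; $acc0 is an arbitrary limb value.
use strict;
use warnings;
use Math::BigInt;

my $p = Math::BigInt->from_hex(
    "ffffffff00000001000000000000000000000000ffffffffffffffffffffffff");
my $poly3 = Math::BigInt->from_hex("ffffffff00000001");   # .Lpoly[3]
my $acc0  = Math::BigInt->from_hex("0123456789abcdef");

my $lhs = $acc0 * $p;
my $rhs = (($acc0 * $poly3) << 192) + ($acc0 << 96) - $acc0;
print $lhs == $rhs ? "reduction identity holds\n" : "mismatch\n";
```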
| 445 | ################################################################################ | ||
| 446 | # void ecp_nistz256_sqr_mont( | ||
| 447 | # uint64_t res[4], | ||
| 448 | # uint64_t a[4]); | ||
| 449 | |||
| 450 | # we optimize the square according to S.Gueron and V.Krasnov, | ||
| 451 | # "Speeding up Big-Number Squaring" | ||
| 452 | .globl ecp_nistz256_sqr_mont | ||
| 453 | .type ecp_nistz256_sqr_mont,\@function,2 | ||
| 454 | .align 32 | ||
| 455 | ecp_nistz256_sqr_mont: | ||
| 456 | push %rbp | ||
| 457 | push %rbx | ||
| 458 | push %r12 | ||
| 459 | push %r13 | ||
| 460 | push %r14 | ||
| 461 | push %r15 | ||
| 462 | |||
| 463 | mov 8*0($a_ptr), %rax | ||
| 464 | mov 8*1($a_ptr), $acc6 | ||
| 465 | mov 8*2($a_ptr), $acc7 | ||
| 466 | mov 8*3($a_ptr), $acc0 | ||
| 467 | |||
| 468 | call __ecp_nistz256_sqr_montq | ||
| 469 | |||
| 470 | pop %r15 | ||
| 471 | pop %r14 | ||
| 472 | pop %r13 | ||
| 473 | pop %r12 | ||
| 474 | pop %rbx | ||
| 475 | pop %rbp | ||
| 476 | ret | ||
| 477 | .size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont | ||
| 478 | |||
| 479 | .type __ecp_nistz256_sqr_montq,\@abi-omnipotent | ||
| 480 | .align 32 | ||
| 481 | __ecp_nistz256_sqr_montq: | ||
| 482 | mov %rax, $acc5 | ||
| 483 | mulq $acc6 # a[1]*a[0] | ||
| 484 | mov %rax, $acc1 | ||
| 485 | mov $acc7, %rax | ||
| 486 | mov %rdx, $acc2 | ||
| 487 | |||
| 488 | mulq $acc5 # a[0]*a[2] | ||
| 489 | add %rax, $acc2 | ||
| 490 | mov $acc0, %rax | ||
| 491 | adc \$0, %rdx | ||
| 492 | mov %rdx, $acc3 | ||
| 493 | |||
| 494 | mulq $acc5 # a[0]*a[3] | ||
| 495 | add %rax, $acc3 | ||
| 496 | mov $acc7, %rax | ||
| 497 | adc \$0, %rdx | ||
| 498 | mov %rdx, $acc4 | ||
| 499 | |||
| 500 | ################################# | ||
| 501 | mulq $acc6 # a[1]*a[2] | ||
| 502 | add %rax, $acc3 | ||
| 503 | mov $acc0, %rax | ||
| 504 | adc \$0, %rdx | ||
| 505 | mov %rdx, $t1 | ||
| 506 | |||
| 507 | mulq $acc6 # a[1]*a[3] | ||
| 508 | add %rax, $acc4 | ||
| 509 | mov $acc0, %rax | ||
| 510 | adc \$0, %rdx | ||
| 511 | add $t1, $acc4 | ||
| 512 | mov %rdx, $acc5 | ||
| 513 | adc \$0, $acc5 | ||
| 514 | |||
| 515 | ################################# | ||
| 516 | mulq $acc7 # a[2]*a[3] | ||
| 517 | xor $acc7, $acc7 | ||
| 518 | add %rax, $acc5 | ||
| 519 | mov 8*0($a_ptr), %rax | ||
| 520 | mov %rdx, $acc6 | ||
| 521 | adc \$0, $acc6 | ||
| 522 | |||
| 523 | add $acc1, $acc1 # acc1:6<<1 | ||
| 524 | adc $acc2, $acc2 | ||
| 525 | adc $acc3, $acc3 | ||
| 526 | adc $acc4, $acc4 | ||
| 527 | adc $acc5, $acc5 | ||
| 528 | adc $acc6, $acc6 | ||
| 529 | adc \$0, $acc7 | ||
| 530 | |||
| 531 | mulq %rax | ||
| 532 | mov %rax, $acc0 | ||
| 533 | mov 8*1($a_ptr), %rax | ||
| 534 | mov %rdx, $t0 | ||
| 535 | |||
| 536 | mulq %rax | ||
| 537 | add $t0, $acc1 | ||
| 538 | adc %rax, $acc2 | ||
| 539 | mov 8*2($a_ptr), %rax | ||
| 540 | adc \$0, %rdx | ||
| 541 | mov %rdx, $t0 | ||
| 542 | |||
| 543 | mulq %rax | ||
| 544 | add $t0, $acc3 | ||
| 545 | adc %rax, $acc4 | ||
| 546 | mov 8*3($a_ptr), %rax | ||
| 547 | adc \$0, %rdx | ||
| 548 | mov %rdx, $t0 | ||
| 549 | |||
| 550 | mulq %rax | ||
| 551 | add $t0, $acc5 | ||
| 552 | adc %rax, $acc6 | ||
| 553 | mov $acc0, %rax | ||
| 554 | adc %rdx, $acc7 | ||
| 555 | |||
| 556 | mov .Lpoly+8*1(%rip), $a_ptr | ||
| 557 | mov .Lpoly+8*3(%rip), $t1 | ||
| 558 | |||
| 559 | ########################################## | ||
| 560 | # Now the reduction | ||
| 561 | # First iteration | ||
| 562 | mov $acc0, $t0 | ||
| 563 | shl \$32, $acc0 | ||
| 564 | mulq $t1 | ||
| 565 | shr \$32, $t0 | ||
| 566 | add $acc0, $acc1 # +=acc[0]<<96 | ||
| 567 | adc $t0, $acc2 | ||
| 568 | adc %rax, $acc3 | ||
| 569 | mov $acc1, %rax | ||
| 570 | adc \$0, %rdx | ||
| 571 | |||
| 572 | ########################################## | ||
| 573 | # Second iteration | ||
| 574 | mov $acc1, $t0 | ||
| 575 | shl \$32, $acc1 | ||
| 576 | mov %rdx, $acc0 | ||
| 577 | mulq $t1 | ||
| 578 | shr \$32, $t0 | ||
| 579 | add $acc1, $acc2 | ||
| 580 | adc $t0, $acc3 | ||
| 581 | adc %rax, $acc0 | ||
| 582 | mov $acc2, %rax | ||
| 583 | adc \$0, %rdx | ||
| 584 | |||
| 585 | ########################################## | ||
| 586 | # Third iteration | ||
| 587 | mov $acc2, $t0 | ||
| 588 | shl \$32, $acc2 | ||
| 589 | mov %rdx, $acc1 | ||
| 590 | mulq $t1 | ||
| 591 | shr \$32, $t0 | ||
| 592 | add $acc2, $acc3 | ||
| 593 | adc $t0, $acc0 | ||
| 594 | adc %rax, $acc1 | ||
| 595 | mov $acc3, %rax | ||
| 596 | adc \$0, %rdx | ||
| 597 | |||
| 598 | ########################################### | ||
| 599 | # Last iteration | ||
| 600 | mov $acc3, $t0 | ||
| 601 | shl \$32, $acc3 | ||
| 602 | mov %rdx, $acc2 | ||
| 603 | mulq $t1 | ||
| 604 | shr \$32, $t0 | ||
| 605 | add $acc3, $acc0 | ||
| 606 | adc $t0, $acc1 | ||
| 607 | adc %rax, $acc2 | ||
| 608 | adc \$0, %rdx | ||
| 609 | xor $acc3, $acc3 | ||
| 610 | |||
| 611 | ############################################ | ||
| 612 | # Add the rest of the acc | ||
| 613 | add $acc0, $acc4 | ||
| 614 | adc $acc1, $acc5 | ||
| 615 | mov $acc4, $acc0 | ||
| 616 | adc $acc2, $acc6 | ||
| 617 | adc %rdx, $acc7 | ||
| 618 | mov $acc5, $acc1 | ||
| 619 | adc \$0, $acc3 | ||
| 620 | |||
| 621 | sub \$-1, $acc4 # .Lpoly[0] | ||
| 622 | mov $acc6, $acc2 | ||
| 623 | sbb $a_ptr, $acc5 # .Lpoly[1] | ||
| 624 | sbb \$0, $acc6 # .Lpoly[2] | ||
| 625 | mov $acc7, $t0 | ||
| 626 | sbb $t1, $acc7 # .Lpoly[3] | ||
| 627 | sbb \$0, $acc3 | ||
| 628 | |||
| 629 | cmovc $acc0, $acc4 | ||
| 630 | cmovc $acc1, $acc5 | ||
| 631 | mov $acc4, 8*0($r_ptr) | ||
| 632 | cmovc $acc2, $acc6 | ||
| 633 | mov $acc5, 8*1($r_ptr) | ||
| 634 | cmovc $t0, $acc7 | ||
| 635 | mov $acc6, 8*2($r_ptr) | ||
| 636 | mov $acc7, 8*3($r_ptr) | ||
| 637 | |||
| 638 | ret | ||
| 639 | .size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq | ||
| 640 | ___ | ||
| 641 | |||
| 642 | } | ||
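The Gueron-Krasnov schedule above computes each off-diagonal product a[i]*a[j] (i < j) exactly once, doubles that whole strip with one adc chain, and then folds in the diagonal squares a[i]^2, saving four of the ten limb products a generic multiply would need. The identity it relies on, checked with small hypothetical limb values (illustrative only):

```perl
# Illustrative check only, with small hypothetical limb values.
use strict;
use warnings;
use Math::BigInt;

my @a = map { Math::BigInt->new($_) } (3, 5, 7, 11);   # a[0]..a[3]

my $n = Math::BigInt->new(0);                          # n = a3:a2:a1:a0
$n += $a[$_] << (64 * $_) for 0 .. 3;

my $cross = Math::BigInt->new(0);                      # each a[i]*a[j] once
for my $i (0 .. 2) {
    for my $j ($i + 1 .. 3) {
        $cross += ($a[$i] * $a[$j]) << (64 * ($i + $j));
    }
}
my $diag = Math::BigInt->new(0);                       # diagonal squares
$diag += ($a[$_] * $a[$_]) << (128 * $_) for 0 .. 3;

print +($cross << 1) + $diag == $n * $n ? "square ok\n" : "mismatch\n";
```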
| 643 | { | ||
| 644 | my ($r_ptr,$in_ptr)=("%rdi","%rsi"); | ||
| 645 | my ($acc0,$acc1,$acc2,$acc3)=map("%r$_",(8..11)); | ||
| 646 | my ($t0,$t1,$t2)=("%rcx","%r12","%r13"); | ||
| 647 | |||
| 648 | $code.=<<___; | ||
| 649 | ################################################################################ | ||
| 650 | # void ecp_nistz256_from_mont( | ||
| 651 | # uint64_t res[4], | ||
| 652 | # uint64_t in[4]); | ||
| 653 | # This one performs Montgomery multiplication by 1, so we only need the reduction | ||
| 654 | |||
| 655 | .globl ecp_nistz256_from_mont | ||
| 656 | .type ecp_nistz256_from_mont,\@function,2 | ||
| 657 | .align 32 | ||
| 658 | ecp_nistz256_from_mont: | ||
| 659 | push %r12 | ||
| 660 | push %r13 | ||
| 661 | |||
| 662 | mov 8*0($in_ptr), %rax | ||
| 663 | mov .Lpoly+8*3(%rip), $t2 | ||
| 664 | mov 8*1($in_ptr), $acc1 | ||
| 665 | mov 8*2($in_ptr), $acc2 | ||
| 666 | mov 8*3($in_ptr), $acc3 | ||
| 667 | mov %rax, $acc0 | ||
| 668 | mov .Lpoly+8*1(%rip), $t1 | ||
| 669 | |||
| 670 | ######################################### | ||
| 671 | # First iteration | ||
| 672 | mov %rax, $t0 | ||
| 673 | shl \$32, $acc0 | ||
| 674 | mulq $t2 | ||
| 675 | shr \$32, $t0 | ||
| 676 | add $acc0, $acc1 | ||
| 677 | adc $t0, $acc2 | ||
| 678 | adc %rax, $acc3 | ||
| 679 | mov $acc1, %rax | ||
| 680 | adc \$0, %rdx | ||
| 681 | |||
| 682 | ######################################### | ||
| 683 | # Second iteration | ||
| 684 | mov $acc1, $t0 | ||
| 685 | shl \$32, $acc1 | ||
| 686 | mov %rdx, $acc0 | ||
| 687 | mulq $t2 | ||
| 688 | shr \$32, $t0 | ||
| 689 | add $acc1, $acc2 | ||
| 690 | adc $t0, $acc3 | ||
| 691 | adc %rax, $acc0 | ||
| 692 | mov $acc2, %rax | ||
| 693 | adc \$0, %rdx | ||
| 694 | |||
| 695 | ########################################## | ||
| 696 | # Third iteration | ||
| 697 | mov $acc2, $t0 | ||
| 698 | shl \$32, $acc2 | ||
| 699 | mov %rdx, $acc1 | ||
| 700 | mulq $t2 | ||
| 701 | shr \$32, $t0 | ||
| 702 | add $acc2, $acc3 | ||
| 703 | adc $t0, $acc0 | ||
| 704 | adc %rax, $acc1 | ||
| 705 | mov $acc3, %rax | ||
| 706 | adc \$0, %rdx | ||
| 707 | |||
| 708 | ########################################### | ||
| 709 | # Last iteration | ||
| 710 | mov $acc3, $t0 | ||
| 711 | shl \$32, $acc3 | ||
| 712 | mov %rdx, $acc2 | ||
| 713 | mulq $t2 | ||
| 714 | shr \$32, $t0 | ||
| 715 | add $acc3, $acc0 | ||
| 716 | adc $t0, $acc1 | ||
| 717 | mov $acc0, $t0 | ||
| 718 | adc %rax, $acc2 | ||
| 719 | mov $acc1, $in_ptr | ||
| 720 | adc \$0, %rdx | ||
| 721 | |||
| 722 | ########################################### | ||
| 723 | # Branch-less conditional subtraction | ||
| 724 | sub \$-1, $acc0 | ||
| 725 | mov $acc2, %rax | ||
| 726 | sbb $t1, $acc1 | ||
| 727 | sbb \$0, $acc2 | ||
| 728 | mov %rdx, $acc3 | ||
| 729 | sbb $t2, %rdx | ||
| 730 | sbb $t2, $t2 | ||
| 731 | |||
| 732 | cmovnz $t0, $acc0 | ||
| 733 | cmovnz $in_ptr, $acc1 | ||
| 734 | mov $acc0, 8*0($r_ptr) | ||
| 735 | cmovnz %rax, $acc2 | ||
| 736 | mov $acc1, 8*1($r_ptr) | ||
| 737 | cmovz %rdx, $acc3 | ||
| 738 | mov $acc2, 8*2($r_ptr) | ||
| 739 | mov $acc3, 8*3($r_ptr) | ||
| 740 | |||
| 741 | pop %r13 | ||
| 742 | pop %r12 | ||
| 743 | ret | ||
| 744 | .size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont | ||
| 745 | ___ | ||
| 746 | } | ||
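Since a Montgomery multiplication returns a*b*R^-1 mod p, taking b = 1 leaves only the reduction, so the four iterations above are equivalent to this one-line sketch (illustrative only, hypothetical `from_mont`):

```perl
# Illustrative sketch only.
use strict;
use warnings;
use Math::BigInt;

my $p = Math::BigInt->from_hex(
    "ffffffff00000001000000000000000000000000ffffffffffffffffffffffff");
my $Rinv = Math::BigInt->new(2)->bpow(256)->bmodinv($p);   # R^-1 mod p

sub from_mont { ($_[0] * $Rinv) % $p }   # what the four iterations compute

# converting the Montgomery form of 1 back gives 1
print from_mont(Math::BigInt->new(2)->bpow(256)->bmod($p)), "\n";
```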
| 747 | { | ||
| 748 | my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx"); | ||
| 749 | my ($ONE,$INDEX,$Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("%xmm$_",(0..7)); | ||
| 750 | my ($M0,$T0a,$T0b,$T0c,$T0d,$T0e,$T0f,$TMP0)=map("%xmm$_",(8..15)); | ||
| 751 | my ($M1,$T2a,$T2b,$TMP2,$M2,$T2a,$T2b,$TMP2)=map("%xmm$_",(8..15)); | ||
| 752 | |||
| 753 | $code.=<<___; | ||
| 754 | ################################################################################ | ||
| 755 | # void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index); | ||
| 756 | .globl ecp_nistz256_select_w5 | ||
| 757 | .type ecp_nistz256_select_w5,\@abi-omnipotent | ||
| 758 | .align 32 | ||
| 759 | ecp_nistz256_select_w5: | ||
| 760 | ___ | ||
| 761 | $code.=<<___ if ($win64); | ||
| 762 | lea -0x88(%rsp), %rax | ||
| 763 | .LSEH_begin_ecp_nistz256_select_w5: | ||
| 764 | .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp | ||
| 765 | .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax) | ||
| 766 | .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax) | ||
| 767 | .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8, 0(%rax) | ||
| 768 | .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9, 0x10(%rax) | ||
| 769 | .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10, 0x20(%rax) | ||
| 770 | .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11, 0x30(%rax) | ||
| 771 | .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12, 0x40(%rax) | ||
| 772 | .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax) | ||
| 773 | .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax) | ||
| 774 | .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax) | ||
| 775 | ___ | ||
| 776 | $code.=<<___; | ||
| 777 | movdqa .LOne(%rip), $ONE | ||
| 778 | movd $index, $INDEX | ||
| 779 | |||
| 780 | pxor $Ra, $Ra | ||
| 781 | pxor $Rb, $Rb | ||
| 782 | pxor $Rc, $Rc | ||
| 783 | pxor $Rd, $Rd | ||
| 784 | pxor $Re, $Re | ||
| 785 | pxor $Rf, $Rf | ||
| 786 | |||
| 787 | movdqa $ONE, $M0 | ||
| 788 | pshufd \$0, $INDEX, $INDEX | ||
| 789 | |||
| 790 | mov \$16, %rax | ||
| 791 | .Lselect_loop_sse_w5: | ||
| 792 | |||
| 793 | movdqa $M0, $TMP0 | ||
| 794 | paddd $ONE, $M0 | ||
| 795 | pcmpeqd $INDEX, $TMP0 | ||
| 796 | |||
| 797 | movdqa 16*0($in_t), $T0a | ||
| 798 | movdqa 16*1($in_t), $T0b | ||
| 799 | movdqa 16*2($in_t), $T0c | ||
| 800 | movdqa 16*3($in_t), $T0d | ||
| 801 | movdqa 16*4($in_t), $T0e | ||
| 802 | movdqa 16*5($in_t), $T0f | ||
| 803 | lea 16*6($in_t), $in_t | ||
| 804 | |||
| 805 | pand $TMP0, $T0a | ||
| 806 | pand $TMP0, $T0b | ||
| 807 | por $T0a, $Ra | ||
| 808 | pand $TMP0, $T0c | ||
| 809 | por $T0b, $Rb | ||
| 810 | pand $TMP0, $T0d | ||
| 811 | por $T0c, $Rc | ||
| 812 | pand $TMP0, $T0e | ||
| 813 | por $T0d, $Rd | ||
| 814 | pand $TMP0, $T0f | ||
| 815 | por $T0e, $Re | ||
| 816 | por $T0f, $Rf | ||
| 817 | |||
| 818 | dec %rax | ||
| 819 | jnz .Lselect_loop_sse_w5 | ||
| 820 | |||
| 821 | movdqu $Ra, 16*0($val) | ||
| 822 | movdqu $Rb, 16*1($val) | ||
| 823 | movdqu $Rc, 16*2($val) | ||
| 824 | movdqu $Rd, 16*3($val) | ||
| 825 | movdqu $Re, 16*4($val) | ||
| 826 | movdqu $Rf, 16*5($val) | ||
| 827 | ___ | ||
| 828 | $code.=<<___ if ($win64); | ||
| 829 | movaps (%rsp), %xmm6 | ||
| 830 | movaps 0x10(%rsp), %xmm7 | ||
| 831 | movaps 0x20(%rsp), %xmm8 | ||
| 832 | movaps 0x30(%rsp), %xmm9 | ||
| 833 | movaps 0x40(%rsp), %xmm10 | ||
| 834 | movaps 0x50(%rsp), %xmm11 | ||
| 835 | movaps 0x60(%rsp), %xmm12 | ||
| 836 | movaps 0x70(%rsp), %xmm13 | ||
| 837 | movaps 0x80(%rsp), %xmm14 | ||
| 838 | movaps 0x90(%rsp), %xmm15 | ||
| 839 | lea 0xa8(%rsp), %rsp | ||
| 840 | .LSEH_end_ecp_nistz256_select_w5: | ||
| 841 | ___ | ||
| 842 | $code.=<<___; | ||
| 843 | ret | ||
| 844 | .size ecp_nistz256_select_w5,.-ecp_nistz256_select_w5 | ||
| 845 | |||
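The loop above is a classic constant-time table scan: all 16 entries are always read, `pcmpeqd` turns the index comparison into an all-ones or all-zero lane mask, and `pand`/`por` keep only the matching entry, so cache and branch behaviour are independent of the secret index. A scalar model with a hypothetical `select_ct` helper (illustrative only):

```perl
# Illustrative scalar model only.
use strict;
use warnings;

sub select_ct {
    my ($table, $index) = @_;            # 1-based index; 0 selects nothing
    my $acc = 0;
    for my $i (1 .. scalar @$table) {    # every entry is always touched
        my $mask = ($i == $index) ? ~0 : 0;   # pcmpeqd analogue
        $acc |= $table->[$i - 1] & $mask;     # pand + por
    }
    return $acc;
}

print select_ct([10, 20, 30, 40], 3), "\n";   # 30
```

ecp_nistz256_select_w7 below is the same idea over 64 smaller (affine) entries.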
| 846 | ################################################################################ | ||
| 847 | # void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index); | ||
| 848 | .globl ecp_nistz256_select_w7 | ||
| 849 | .type ecp_nistz256_select_w7,\@abi-omnipotent | ||
| 850 | .align 32 | ||
| 851 | ecp_nistz256_select_w7: | ||
| 852 | ___ | ||
| 853 | $code.=<<___ if ($win64); | ||
| 854 | lea -0x88(%rsp), %rax | ||
| 855 | .LSEH_begin_ecp_nistz256_select_w7: | ||
| 856 | .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp | ||
| 857 | .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax) | ||
| 858 | .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax) | ||
| 859 | .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8, 0(%rax) | ||
| 860 | .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9, 0x10(%rax) | ||
| 861 | .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10, 0x20(%rax) | ||
| 862 | .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11, 0x30(%rax) | ||
| 863 | .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12, 0x40(%rax) | ||
| 864 | .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax) | ||
| 865 | .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax) | ||
| 866 | .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax) | ||
| 867 | ___ | ||
| 868 | $code.=<<___; | ||
| 869 | movdqa .LOne(%rip), $M0 | ||
| 870 | movd $index, $INDEX | ||
| 871 | |||
| 872 | pxor $Ra, $Ra | ||
| 873 | pxor $Rb, $Rb | ||
| 874 | pxor $Rc, $Rc | ||
| 875 | pxor $Rd, $Rd | ||
| 876 | |||
| 877 | movdqa $M0, $ONE | ||
| 878 | pshufd \$0, $INDEX, $INDEX | ||
| 879 | mov \$64, %rax | ||
| 880 | |||
| 881 | .Lselect_loop_sse_w7: | ||
| 882 | movdqa $M0, $TMP0 | ||
| 883 | paddd $ONE, $M0 | ||
| 884 | movdqa 16*0($in_t), $T0a | ||
| 885 | movdqa 16*1($in_t), $T0b | ||
| 886 | pcmpeqd $INDEX, $TMP0 | ||
| 887 | movdqa 16*2($in_t), $T0c | ||
| 888 | movdqa 16*3($in_t), $T0d | ||
| 889 | lea 16*4($in_t), $in_t | ||
| 890 | |||
| 891 | pand $TMP0, $T0a | ||
| 892 | pand $TMP0, $T0b | ||
| 893 | por $T0a, $Ra | ||
| 894 | pand $TMP0, $T0c | ||
| 895 | por $T0b, $Rb | ||
| 896 | pand $TMP0, $T0d | ||
| 897 | por $T0c, $Rc | ||
| 898 | prefetcht0 255($in_t) | ||
| 899 | por $T0d, $Rd | ||
| 900 | |||
| 901 | dec %rax | ||
| 902 | jnz .Lselect_loop_sse_w7 | ||
| 903 | |||
| 904 | movdqu $Ra, 16*0($val) | ||
| 905 | movdqu $Rb, 16*1($val) | ||
| 906 | movdqu $Rc, 16*2($val) | ||
| 907 | movdqu $Rd, 16*3($val) | ||
| 908 | ___ | ||
| 909 | $code.=<<___ if ($win64); | ||
| 910 | movaps (%rsp), %xmm6 | ||
| 911 | movaps 0x10(%rsp), %xmm7 | ||
| 912 | movaps 0x20(%rsp), %xmm8 | ||
| 913 | movaps 0x30(%rsp), %xmm9 | ||
| 914 | movaps 0x40(%rsp), %xmm10 | ||
| 915 | movaps 0x50(%rsp), %xmm11 | ||
| 916 | movaps 0x60(%rsp), %xmm12 | ||
| 917 | movaps 0x70(%rsp), %xmm13 | ||
| 918 | movaps 0x80(%rsp), %xmm14 | ||
| 919 | movaps 0x90(%rsp), %xmm15 | ||
| 920 | lea 0xa8(%rsp), %rsp | ||
| 921 | .LSEH_end_ecp_nistz256_select_w7: | ||
| 922 | ___ | ||
| 923 | $code.=<<___; | ||
| 924 | ret | ||
| 925 | .size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7 | ||
| 926 | ___ | ||
| 927 | } | ||
| 928 | {{{ | ||
| 929 | ######################################################################## | ||
| 930 | # This block implements higher level point_double, point_add and | ||
| 931 | # point_add_affine. The key to performance in this case is to allow | ||
| 932 | # out-of-order execution logic to overlap computations from the next step | ||
| 933 | # with tail processing from the current step. By using a tailored calling | ||
| 934 | # sequence we minimize inter-step overhead, giving the processor a better | ||
| 935 | # shot at overlapping operations... | ||
| 936 | # | ||
| 937 | # You will notice that input data is copied to the stack. The trouble is | ||
| 938 | # that there are no registers to spare for holding the original pointers, | ||
| 939 | # and reloading those pointers would create undesired dependencies on the | ||
| 940 | # effective-address calculation paths. In other words, the copying is done | ||
| 941 | # deliberately, to favour out-of-order execution logic. | ||
| 942 | # <appro@openssl.org> | ||
| 943 | |||
| 944 | my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx"); | ||
| 945 | my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15)); | ||
| 946 | my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rbp","%rcx",$acc4,$acc4); | ||
| 947 | my ($poly1,$poly3)=($acc6,$acc7); | ||
| 948 | |||
| 949 | sub load_for_mul () { | ||
| 950 | my ($a,$b,$src0) = @_; | ||
| 951 | my $bias = $src0 eq "%rax" ? 0 : -128; | ||
| 952 | |||
| 953 | " mov $b, $src0 | ||
| 954 | lea $b, $b_ptr | ||
| 955 | mov 8*0+$a, $acc1 | ||
| 956 | mov 8*1+$a, $acc2 | ||
| 957 | lea $bias+$a, $a_ptr | ||
| 958 | mov 8*2+$a, $acc3 | ||
| 959 | mov 8*3+$a, $acc4" | ||
| 960 | } | ||
| 961 | |||
| 962 | sub load_for_sqr () { | ||
| 963 | my ($a,$src0) = @_; | ||
| 964 | my $bias = $src0 eq "%rax" ? 0 : -128; | ||
| 965 | |||
| 966 | " mov 8*0+$a, $src0 | ||
| 967 | mov 8*1+$a, $acc6 | ||
| 968 | lea $bias+$a, $a_ptr | ||
| 969 | mov 8*2+$a, $acc7 | ||
| 970 | mov 8*3+$a, $acc0" | ||
| 971 | } | ||
| 972 | |||
| 973 | { | ||
| 974 | ######################################################################## | ||
| 975 | # operate in 4-5-0-1 "name space" that matches multiplication output | ||
| 976 | # | ||
| 977 | my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); | ||
| 978 | |||
| 979 | $code.=<<___; | ||
| 980 | .type __ecp_nistz256_add_toq,\@abi-omnipotent | ||
| 981 | .align 32 | ||
| 982 | __ecp_nistz256_add_toq: | ||
| 983 | add 8*0($b_ptr), $a0 | ||
| 984 | adc 8*1($b_ptr), $a1 | ||
| 985 | mov $a0, $t0 | ||
| 986 | adc 8*2($b_ptr), $a2 | ||
| 987 | adc 8*3($b_ptr), $a3 | ||
| 988 | mov $a1, $t1 | ||
| 989 | sbb $t4, $t4 | ||
| 990 | |||
| 991 | sub \$-1, $a0 | ||
| 992 | mov $a2, $t2 | ||
| 993 | sbb $poly1, $a1 | ||
| 994 | sbb \$0, $a2 | ||
| 995 | mov $a3, $t3 | ||
| 996 | sbb $poly3, $a3 | ||
| 997 | test $t4, $t4 | ||
| 998 | |||
| 999 | cmovz $t0, $a0 | ||
| 1000 | cmovz $t1, $a1 | ||
| 1001 | mov $a0, 8*0($r_ptr) | ||
| 1002 | cmovz $t2, $a2 | ||
| 1003 | mov $a1, 8*1($r_ptr) | ||
| 1004 | cmovz $t3, $a3 | ||
| 1005 | mov $a2, 8*2($r_ptr) | ||
| 1006 | mov $a3, 8*3($r_ptr) | ||
| 1007 | |||
| 1008 | ret | ||
| 1009 | .size __ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq | ||
| 1010 | |||
| 1011 | .type __ecp_nistz256_sub_fromq,\@abi-omnipotent | ||
| 1012 | .align 32 | ||
| 1013 | __ecp_nistz256_sub_fromq: | ||
| 1014 | sub 8*0($b_ptr), $a0 | ||
| 1015 | sbb 8*1($b_ptr), $a1 | ||
| 1016 | mov $a0, $t0 | ||
| 1017 | sbb 8*2($b_ptr), $a2 | ||
| 1018 | sbb 8*3($b_ptr), $a3 | ||
| 1019 | mov $a1, $t1 | ||
| 1020 | sbb $t4, $t4 | ||
| 1021 | |||
| 1022 | add \$-1, $a0 | ||
| 1023 | mov $a2, $t2 | ||
| 1024 | adc $poly1, $a1 | ||
| 1025 | adc \$0, $a2 | ||
| 1026 | mov $a3, $t3 | ||
| 1027 | adc $poly3, $a3 | ||
| 1028 | test $t4, $t4 | ||
| 1029 | |||
| 1030 | cmovz $t0, $a0 | ||
| 1031 | cmovz $t1, $a1 | ||
| 1032 | mov $a0, 8*0($r_ptr) | ||
| 1033 | cmovz $t2, $a2 | ||
| 1034 | mov $a1, 8*1($r_ptr) | ||
| 1035 | cmovz $t3, $a3 | ||
| 1036 | mov $a2, 8*2($r_ptr) | ||
| 1037 | mov $a3, 8*3($r_ptr) | ||
| 1038 | |||
| 1039 | ret | ||
| 1040 | .size __ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq | ||
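__ecp_nistz256_sub_fromq is the mirror image: subtract, then add p back when the subtraction borrowed ($t4 again holds the borrow mask and cmovz keeps the selection branch-free). __ecp_nistz256_subq below is the same subtraction with reversed operand roles; it leaves its result in $a0..$a3 without storing it, which is why its call sites follow up with explicit stores. Continuing the sketch above:

    # Companion to mod_add_p256; same assumptions and caveats.
    sub mod_sub_p256 {
        my ($a, $b) = @_;
        my $r = $a->copy->bsub($b);   # may go negative (the sbb borrow)
        $r->badd($p) if $r->is_neg;   # conditional addition of p
        return $r;
    }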
| 1041 | |||
| 1042 | .type __ecp_nistz256_subq,\@abi-omnipotent | ||
| 1043 | .align 32 | ||
| 1044 | __ecp_nistz256_subq: | ||
| 1045 | sub $a0, $t0 | ||
| 1046 | sbb $a1, $t1 | ||
| 1047 | mov $t0, $a0 | ||
| 1048 | sbb $a2, $t2 | ||
| 1049 | sbb $a3, $t3 | ||
| 1050 | mov $t1, $a1 | ||
| 1051 | sbb $t4, $t4 | ||
| 1052 | |||
| 1053 | add \$-1, $t0 | ||
| 1054 | mov $t2, $a2 | ||
| 1055 | adc $poly1, $t1 | ||
| 1056 | adc \$0, $t2 | ||
| 1057 | mov $t3, $a3 | ||
| 1058 | adc $poly3, $t3 | ||
| 1059 | test $t4, $t4 | ||
| 1060 | |||
| 1061 | cmovnz $t0, $a0 | ||
| 1062 | cmovnz $t1, $a1 | ||
| 1063 | cmovnz $t2, $a2 | ||
| 1064 | cmovnz $t3, $a3 | ||
| 1065 | |||
| 1066 | ret | ||
| 1067 | .size __ecp_nistz256_subq,.-__ecp_nistz256_subq | ||
| 1068 | |||
| 1069 | .type __ecp_nistz256_mul_by_2q,\@abi-omnipotent | ||
| 1070 | .align 32 | ||
| 1071 | __ecp_nistz256_mul_by_2q: | ||
| 1072 | add $a0, $a0 # a0:a3+a0:a3 | ||
| 1073 | adc $a1, $a1 | ||
| 1074 | mov $a0, $t0 | ||
| 1075 | adc $a2, $a2 | ||
| 1076 | adc $a3, $a3 | ||
| 1077 | mov $a1, $t1 | ||
| 1078 | sbb $t4, $t4 | ||
| 1079 | |||
| 1080 | sub \$-1, $a0 | ||
| 1081 | mov $a2, $t2 | ||
| 1082 | sbb $poly1, $a1 | ||
| 1083 | sbb \$0, $a2 | ||
| 1084 | mov $a3, $t3 | ||
| 1085 | sbb $poly3, $a3 | ||
| 1086 | test $t4, $t4 | ||
| 1087 | |||
| 1088 | cmovz $t0, $a0 | ||
| 1089 | cmovz $t1, $a1 | ||
| 1090 | mov $a0, 8*0($r_ptr) | ||
| 1091 | cmovz $t2, $a2 | ||
| 1092 | mov $a1, 8*1($r_ptr) | ||
| 1093 | cmovz $t3, $a3 | ||
| 1094 | mov $a2, 8*2($r_ptr) | ||
| 1095 | mov $a3, 8*3($r_ptr) | ||
| 1096 | |||
| 1097 | ret | ||
| 1098 | .size __ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q | ||
| 1099 | ___ | ||
| 1100 | } | ||
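__ecp_nistz256_mul_by_2q is the addition routine specialized to a doubled operand (add $a0, $a0 and so on), with the same carry-keyed lazy reduction. In terms of the hypothetical helpers above:

    # Doubling mod p, expressed via the earlier sketch.
    sub mod_dbl_p256 { my ($a) = @_; return mod_add_p256($a, $a); }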
| 1101 | sub gen_double () { | ||
| 1102 | my $x = shift; | ||
| 1103 | my ($src0,$sfx,$bias); | ||
| 1104 | my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4)); | ||
| 1105 | |||
| 1106 | if ($x ne "x") { | ||
| 1107 | $src0 = "%rax"; | ||
| 1108 | $sfx = ""; | ||
| 1109 | $bias = 0; | ||
| 1110 | |||
| 1111 | $code.=<<___; | ||
| 1112 | .globl ecp_nistz256_point_double | ||
| 1113 | .type ecp_nistz256_point_double,\@function,2 | ||
| 1114 | .align 32 | ||
| 1115 | ecp_nistz256_point_double: | ||
| 1116 | ___ | ||
| 1117 | } else { | ||
| 1118 | $src0 = "%rdx"; | ||
| 1119 | $sfx = "x"; | ||
| 1120 | $bias = 128; | ||
| 1121 | |||
| 1122 | $code.=<<___; | ||
| 1123 | .type ecp_nistz256_point_doublex,\@function,2 | ||
| 1124 | .align 32 | ||
| 1125 | ecp_nistz256_point_doublex: | ||
| 1126 | .Lpoint_doublex: | ||
| 1127 | ___ | ||
| 1128 | } | ||
| 1129 | $code.=<<___; | ||
| 1130 | push %rbp | ||
| 1131 | push %rbx | ||
| 1132 | push %r12 | ||
| 1133 | push %r13 | ||
| 1134 | push %r14 | ||
| 1135 | push %r15 | ||
| 1136 | sub \$32*5+8, %rsp | ||
| 1137 | |||
| 1138 | .Lpoint_double_shortcut$x: | ||
| 1139 | movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr.x | ||
| 1140 | mov $a_ptr, $b_ptr # backup copy | ||
| 1141 | movdqu 0x10($a_ptr), %xmm1 | ||
| 1142 | mov 0x20+8*0($a_ptr), $acc4 # load in_y in "5-4-0-1" order | ||
| 1143 | mov 0x20+8*1($a_ptr), $acc5 | ||
| 1144 | mov 0x20+8*2($a_ptr), $acc0 | ||
| 1145 | mov 0x20+8*3($a_ptr), $acc1 | ||
| 1146 | mov .Lpoly+8*1(%rip), $poly1 | ||
| 1147 | mov .Lpoly+8*3(%rip), $poly3 | ||
| 1148 | movdqa %xmm0, $in_x(%rsp) | ||
| 1149 | movdqa %xmm1, $in_x+0x10(%rsp) | ||
| 1150 | lea 0x20($r_ptr), $acc2 | ||
| 1151 | lea 0x40($r_ptr), $acc3 | ||
| 1152 | movq $r_ptr, %xmm0 | ||
| 1153 | movq $acc2, %xmm1 | ||
| 1154 | movq $acc3, %xmm2 | ||
| 1155 | |||
| 1156 | lea $S(%rsp), $r_ptr | ||
| 1157 | call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(S, in_y); | ||
| 1158 | |||
| 1159 | mov 0x40+8*0($a_ptr), $src0 | ||
| 1160 | mov 0x40+8*1($a_ptr), $acc6 | ||
| 1161 | mov 0x40+8*2($a_ptr), $acc7 | ||
| 1162 | mov 0x40+8*3($a_ptr), $acc0 | ||
| 1163 | lea 0x40-$bias($a_ptr), $a_ptr | ||
| 1164 | lea $Zsqr(%rsp), $r_ptr | ||
| 1165 | call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Zsqr, in_z); | ||
| 1166 | |||
| 1167 | `&load_for_sqr("$S(%rsp)", "$src0")` | ||
| 1168 | lea $S(%rsp), $r_ptr | ||
| 1169 | call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(S, S); | ||
| 1170 | |||
| 1171 | mov 0x20($b_ptr), $src0 # $b_ptr is still valid | ||
| 1172 | mov 0x40+8*0($b_ptr), $acc1 | ||
| 1173 | mov 0x40+8*1($b_ptr), $acc2 | ||
| 1174 | mov 0x40+8*2($b_ptr), $acc3 | ||
| 1175 | mov 0x40+8*3($b_ptr), $acc4 | ||
| 1176 | lea 0x40-$bias($b_ptr), $a_ptr | ||
| 1177 | lea 0x20($b_ptr), $b_ptr | ||
| 1178 | movq %xmm2, $r_ptr | ||
| 1179 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, in_z, in_y); | ||
| 1180 | call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(res_z, res_z); | ||
| 1181 | |||
| 1182 | mov $in_x+8*0(%rsp), $acc4 # "5-4-0-1" order | ||
| 1183 | mov $in_x+8*1(%rsp), $acc5 | ||
| 1184 | lea $Zsqr(%rsp), $b_ptr | ||
| 1185 | mov $in_x+8*2(%rsp), $acc0 | ||
| 1186 | mov $in_x+8*3(%rsp), $acc1 | ||
| 1187 | lea $M(%rsp), $r_ptr | ||
| 1188 | call __ecp_nistz256_add_to$x # p256_add(M, in_x, Zsqr); | ||
| 1189 | |||
| 1190 | mov $in_x+8*0(%rsp), $acc4 # "5-4-0-1" order | ||
| 1191 | mov $in_x+8*1(%rsp), $acc5 | ||
| 1192 | lea $Zsqr(%rsp), $b_ptr | ||
| 1193 | mov $in_x+8*2(%rsp), $acc0 | ||
| 1194 | mov $in_x+8*3(%rsp), $acc1 | ||
| 1195 | lea $Zsqr(%rsp), $r_ptr | ||
| 1196 | call __ecp_nistz256_sub_from$x # p256_sub(Zsqr, in_x, Zsqr); | ||
| 1197 | |||
| 1198 | `&load_for_sqr("$S(%rsp)", "$src0")` | ||
| 1199 | movq %xmm1, $r_ptr | ||
| 1200 | call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_y, S); | ||
| 1201 | ___ | ||
| 1202 | { | ||
| 1203 | ######## ecp_nistz256_div_by_2(res_y, res_y); ########################## | ||
| 1204 | # operate in 4-5-6-7 "name space" that matches squaring output | ||
| 1205 | # | ||
| 1206 | my ($poly1,$poly3)=($a_ptr,$t1); | ||
| 1207 | my ($a0,$a1,$a2,$a3,$t3,$t4,$t1)=($acc4,$acc5,$acc6,$acc7,$acc0,$acc1,$acc2); | ||
| 1208 | |||
| 1209 | $code.=<<___; | ||
| 1210 | xor $t4, $t4 | ||
| 1211 | mov $a0, $t0 | ||
| 1212 | add \$-1, $a0 | ||
| 1213 | mov $a1, $t1 | ||
| 1214 | adc $poly1, $a1 | ||
| 1215 | mov $a2, $t2 | ||
| 1216 | adc \$0, $a2 | ||
| 1217 | mov $a3, $t3 | ||
| 1218 | adc $poly3, $a3 | ||
| 1219 | adc \$0, $t4 | ||
| 1220 | xor $a_ptr, $a_ptr # borrow $a_ptr | ||
| 1221 | test \$1, $t0 | ||
| 1222 | |||
| 1223 | cmovz $t0, $a0 | ||
| 1224 | cmovz $t1, $a1 | ||
| 1225 | cmovz $t2, $a2 | ||
| 1226 | cmovz $t3, $a3 | ||
| 1227 | cmovz $a_ptr, $t4 | ||
| 1228 | |||
| 1229 | mov $a1, $t0 # a0:a3>>1 | ||
| 1230 | shr \$1, $a0 | ||
| 1231 | shl \$63, $t0 | ||
| 1232 | mov $a2, $t1 | ||
| 1233 | shr \$1, $a1 | ||
| 1234 | or $t0, $a0 | ||
| 1235 | shl \$63, $t1 | ||
| 1236 | mov $a3, $t2 | ||
| 1237 | shr \$1, $a2 | ||
| 1238 | or $t1, $a1 | ||
| 1239 | shl \$63, $t2 | ||
| 1240 | mov $a0, 8*0($r_ptr) | ||
| 1241 | shr \$1, $a3 | ||
| 1242 | mov $a1, 8*1($r_ptr) | ||
| 1243 | shl \$63, $t4 | ||
| 1244 | or $t2, $a2 | ||
| 1245 | or $t4, $a3 | ||
| 1246 | mov $a2, 8*2($r_ptr) | ||
| 1247 | mov $a3, 8*3($r_ptr) | ||
| 1248 | ___ | ||
| 1249 | } | ||
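The block just inlined is ecp_nistz256_div_by_2(res_y, res_y). Since p is odd, exactly one of a and a + p is even, so the code adds p unconditionally (keeping the ninth carry bit in $t4), uses cmovz to fall back to the original value when it was already even, and then shifts the five-word result right by one with the shr/shl cascade, feeding $t4 into the new top bit. The same halving as a sketch, under the assumptions stated earlier:

    # (a + (a odd ? p : 0)) >> 1, matching the inline code above.
    sub mod_div2_p256 {
        my ($a) = @_;
        my $r = $a->copy;
        $r->badd($p) if $r->is_odd;   # force the low bit to zero
        return $r->brsft(1);          # the add's carry becomes the top bit
    }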
| 1250 | $code.=<<___; | ||
| 1251 | `&load_for_mul("$M(%rsp)", "$Zsqr(%rsp)", "$src0")` | ||
| 1252 | lea $M(%rsp), $r_ptr | ||
| 1253 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(M, M, Zsqr); | ||
| 1254 | |||
| 1255 | lea $tmp0(%rsp), $r_ptr | ||
| 1256 | call __ecp_nistz256_mul_by_2$x | ||
| 1257 | |||
| 1258 | lea $M(%rsp), $b_ptr | ||
| 1259 | lea $M(%rsp), $r_ptr | ||
| 1260 | call __ecp_nistz256_add_to$x # p256_mul_by_3(M, M); | ||
| 1261 | |||
| 1262 | `&load_for_mul("$S(%rsp)", "$in_x(%rsp)", "$src0")` | ||
| 1263 | lea $S(%rsp), $r_ptr | ||
| 1264 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, in_x); | ||
| 1265 | |||
| 1266 | lea $tmp0(%rsp), $r_ptr | ||
| 1267 | call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(tmp0, S); | ||
| 1268 | |||
| 1269 | `&load_for_sqr("$M(%rsp)", "$src0")` | ||
| 1270 | movq %xmm0, $r_ptr | ||
| 1271 | call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_x, M); | ||
| 1272 | |||
| 1273 | lea $tmp0(%rsp), $b_ptr | ||
| 1274 | mov $acc6, $acc0 # harmonize sqr output and sub input | ||
| 1275 | mov $acc7, $acc1 | ||
| 1276 | mov $a_ptr, $poly1 | ||
| 1277 | mov $t1, $poly3 | ||
| 1278 | call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, tmp0); | ||
| 1279 | |||
| 1280 | mov $S+8*0(%rsp), $t0 | ||
| 1281 | mov $S+8*1(%rsp), $t1 | ||
| 1282 | mov $S+8*2(%rsp), $t2 | ||
| 1283 | mov $S+8*3(%rsp), $acc2 # "4-5-0-1" order | ||
| 1284 | lea $S(%rsp), $r_ptr | ||
| 1285 | call __ecp_nistz256_sub$x # p256_sub(S, S, res_x); | ||
| 1286 | |||
| 1287 | mov $M(%rsp), $src0 | ||
| 1288 | lea $M(%rsp), $b_ptr | ||
| 1289 | mov $acc4, $acc6 # harmonize sub output and mul input | ||
| 1290 | xor %ecx, %ecx | ||
| 1291 | mov $acc4, $S+8*0(%rsp) # have to save:-( | ||
| 1292 | mov $acc5, $acc2 | ||
| 1293 | mov $acc5, $S+8*1(%rsp) | ||
| 1294 | cmovz $acc0, $acc3 | ||
| 1295 | mov $acc0, $S+8*2(%rsp) | ||
| 1296 | lea $S-$bias(%rsp), $a_ptr | ||
| 1297 | cmovz $acc1, $acc4 | ||
| 1298 | mov $acc1, $S+8*3(%rsp) | ||
| 1299 | mov $acc6, $acc1 | ||
| 1300 | lea $S(%rsp), $r_ptr | ||
| 1301 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, M); | ||
| 1302 | |||
| 1303 | movq %xmm1, $b_ptr | ||
| 1304 | movq %xmm1, $r_ptr | ||
| 1305 | call __ecp_nistz256_sub_from$x # p256_sub(res_y, S, res_y); | ||
| 1306 | |||
| 1307 | add \$32*5+8, %rsp | ||
| 1308 | pop %r15 | ||
| 1309 | pop %r14 | ||
| 1310 | pop %r13 | ||
| 1311 | pop %r12 | ||
| 1312 | pop %rbx | ||
| 1313 | pop %rbp | ||
| 1314 | ret | ||
| 1315 | .size ecp_nistz256_point_double$sfx,.-ecp_nistz256_point_double$sfx | ||
| 1316 | ___ | ||
| 1317 | } | ||
| 1318 | &gen_double("q"); | ||
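Pulling the inline comments together, gen_double emits the textbook Jacobian doubling of P = (X, Y, Z), with every product reduced through the Montgomery routines (a summary reconstructed from the comments, not text from the module itself):

    M  = 3*(X - Z^2)*(X + Z^2)
    S  = 4*X*Y^2
    X3 = M^2 - 2*S
    Y3 = M*(S - X3) - 8*Y^4
    Z3 = 2*Y*Z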
| 1319 | |||
| 1320 | sub gen_add () { | ||
| 1321 | my $x = shift; | ||
| 1322 | my ($src0,$sfx,$bias); | ||
| 1323 | my ($H,$Hsqr,$R,$Rsqr,$Hcub, | ||
| 1324 | $U1,$U2,$S1,$S2, | ||
| 1325 | $res_x,$res_y,$res_z, | ||
| 1326 | $in1_x,$in1_y,$in1_z, | ||
| 1327 | $in2_x,$in2_y,$in2_z)=map(32*$_,(0..17)); | ||
| 1328 | my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); | ||
| 1329 | |||
| 1330 | if ($x ne "x") { | ||
| 1331 | $src0 = "%rax"; | ||
| 1332 | $sfx = ""; | ||
| 1333 | $bias = 0; | ||
| 1334 | |||
| 1335 | $code.=<<___; | ||
| 1336 | .globl ecp_nistz256_point_add | ||
| 1337 | .type ecp_nistz256_point_add,\@function,3 | ||
| 1338 | .align 32 | ||
| 1339 | ecp_nistz256_point_add: | ||
| 1340 | ___ | ||
| 1341 | } else { | ||
| 1342 | $src0 = "%rdx"; | ||
| 1343 | $sfx = "x"; | ||
| 1344 | $bias = 128; | ||
| 1345 | } | ||
| 1346 | $code.=<<___; | ||
| 1347 | push %rbp | ||
| 1348 | push %rbx | ||
| 1349 | push %r12 | ||
| 1350 | push %r13 | ||
| 1351 | push %r14 | ||
| 1352 | push %r15 | ||
| 1353 | sub \$32*18+8, %rsp | ||
| 1354 | |||
| 1355 | movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr | ||
| 1356 | movdqu 0x10($a_ptr), %xmm1 | ||
| 1357 | movdqu 0x20($a_ptr), %xmm2 | ||
| 1358 | movdqu 0x30($a_ptr), %xmm3 | ||
| 1359 | movdqu 0x40($a_ptr), %xmm4 | ||
| 1360 | movdqu 0x50($a_ptr), %xmm5 | ||
| 1361 | mov $a_ptr, $b_ptr # reassign | ||
| 1362 | mov $b_org, $a_ptr # reassign | ||
| 1363 | movdqa %xmm0, $in1_x(%rsp) | ||
| 1364 | movdqa %xmm1, $in1_x+0x10(%rsp) | ||
| 1365 | por %xmm0, %xmm1 | ||
| 1366 | movdqa %xmm2, $in1_y(%rsp) | ||
| 1367 | movdqa %xmm3, $in1_y+0x10(%rsp) | ||
| 1368 | por %xmm2, %xmm3 | ||
| 1369 | movdqa %xmm4, $in1_z(%rsp) | ||
| 1370 | movdqa %xmm5, $in1_z+0x10(%rsp) | ||
| 1371 | por %xmm1, %xmm3 | ||
| 1372 | |||
| 1373 | movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$b_ptr | ||
| 1374 | pshufd \$0xb1, %xmm3, %xmm5 | ||
| 1375 | movdqu 0x10($a_ptr), %xmm1 | ||
| 1376 | movdqu 0x20($a_ptr), %xmm2 | ||
| 1377 | por %xmm3, %xmm5 | ||
| 1378 | movdqu 0x30($a_ptr), %xmm3 | ||
| 1379 | mov 0x40+8*0($a_ptr), $src0 # load original in2_z | ||
| 1380 | mov 0x40+8*1($a_ptr), $acc6 | ||
| 1381 | mov 0x40+8*2($a_ptr), $acc7 | ||
| 1382 | mov 0x40+8*3($a_ptr), $acc0 | ||
| 1383 | movdqa %xmm0, $in2_x(%rsp) | ||
| 1384 | pshufd \$0x1e, %xmm5, %xmm4 | ||
| 1385 | movdqa %xmm1, $in2_x+0x10(%rsp) | ||
| 1386 | por %xmm0, %xmm1 | ||
| 1387 | movq $r_ptr, %xmm0 # save $r_ptr | ||
| 1388 | movdqa %xmm2, $in2_y(%rsp) | ||
| 1389 | movdqa %xmm3, $in2_y+0x10(%rsp) | ||
| 1390 | por %xmm2, %xmm3 | ||
| 1391 | por %xmm4, %xmm5 | ||
| 1392 | pxor %xmm4, %xmm4 | ||
| 1393 | por %xmm1, %xmm3 | ||
| 1394 | |||
| 1395 | lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid | ||
| 1396 | mov $src0, $in2_z+8*0(%rsp) # make in2_z copy | ||
| 1397 | mov $acc6, $in2_z+8*1(%rsp) | ||
| 1398 | mov $acc7, $in2_z+8*2(%rsp) | ||
| 1399 | mov $acc0, $in2_z+8*3(%rsp) | ||
| 1400 | lea $Z2sqr(%rsp), $r_ptr # Z2^2 | ||
| 1401 | call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z2sqr, in2_z); | ||
| 1402 | |||
| 1403 | pcmpeqd %xmm4, %xmm5 | ||
| 1404 | pshufd \$0xb1, %xmm3, %xmm4 | ||
| 1405 | por %xmm3, %xmm4 | ||
| 1406 | pshufd \$0, %xmm5, %xmm5 # in1infty | ||
| 1407 | pshufd \$0x1e, %xmm4, %xmm3 | ||
| 1408 | por %xmm3, %xmm4 | ||
| 1409 | pxor %xmm3, %xmm3 | ||
| 1410 | pcmpeqd %xmm3, %xmm4 | ||
| 1411 | pshufd \$0, %xmm4, %xmm4 # in2infty | ||
| 1412 | mov 0x40+8*0($b_ptr), $src0 # load original in1_z | ||
| 1413 | mov 0x40+8*1($b_ptr), $acc6 | ||
| 1414 | mov 0x40+8*2($b_ptr), $acc7 | ||
| 1415 | mov 0x40+8*3($b_ptr), $acc0 | ||
| 1416 | movq $b_ptr, %xmm1 | ||
| 1417 | |||
| 1418 | lea 0x40-$bias($b_ptr), $a_ptr | ||
| 1419 | lea $Z1sqr(%rsp), $r_ptr # Z1^2 | ||
| 1420 | call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z); | ||
| 1421 | |||
| 1422 | `&load_for_mul("$Z2sqr(%rsp)", "$in2_z(%rsp)", "$src0")` | ||
| 1423 | lea $S1(%rsp), $r_ptr # S1 = Z2^3 | ||
| 1424 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, Z2sqr, in2_z); | ||
| 1425 | |||
| 1426 | `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")` | ||
| 1427 | lea $S2(%rsp), $r_ptr # S2 = Z1^3 | ||
| 1428 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z); | ||
| 1429 | |||
| 1430 | `&load_for_mul("$S1(%rsp)", "$in1_y(%rsp)", "$src0")` | ||
| 1431 | lea $S1(%rsp), $r_ptr # S1 = Y1*Z2^3 | ||
| 1432 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, S1, in1_y); | ||
| 1433 | |||
| 1434 | `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")` | ||
| 1435 | lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3 | ||
| 1436 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y); | ||
| 1437 | |||
| 1438 | lea $S1(%rsp), $b_ptr | ||
| 1439 | lea $R(%rsp), $r_ptr # R = S2 - S1 | ||
| 1440 | call __ecp_nistz256_sub_from$x # p256_sub(R, S2, S1); | ||
| 1441 | |||
| 1442 | or $acc5, $acc4 # see if result is zero | ||
| 1443 | movdqa %xmm4, %xmm2 | ||
| 1444 | or $acc0, $acc4 | ||
| 1445 | or $acc1, $acc4 | ||
| 1446 | por %xmm5, %xmm2 # in1infty || in2infty | ||
| 1447 | movq $acc4, %xmm3 | ||
| 1448 | |||
| 1449 | `&load_for_mul("$Z2sqr(%rsp)", "$in1_x(%rsp)", "$src0")` | ||
| 1450 | lea $U1(%rsp), $r_ptr # U1 = X1*Z2^2 | ||
| 1451 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(U1, in1_x, Z2sqr); | ||
| 1452 | |||
| 1453 | `&load_for_mul("$Z1sqr(%rsp)", "$in2_x(%rsp)", "$src0")` | ||
| 1454 | lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2 | ||
| 1455 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in2_x, Z1sqr); | ||
| 1456 | |||
| 1457 | lea $U1(%rsp), $b_ptr | ||
| 1458 | lea $H(%rsp), $r_ptr # H = U2 - U1 | ||
| 1459 | call __ecp_nistz256_sub_from$x # p256_sub(H, U2, U1); | ||
| 1460 | |||
| 1461 | or $acc5, $acc4 # see if result is zero | ||
| 1462 | or $acc0, $acc4 | ||
| 1463 | or $acc1, $acc4 | ||
| 1464 | |||
| 1465 | .byte 0x3e # predict taken | ||
| 1466 | jnz .Ladd_proceed$x # is_equal(U1,U2)? | ||
| 1467 | movq %xmm2, $acc0 | ||
| 1468 | movq %xmm3, $acc1 | ||
| 1469 | test $acc0, $acc0 | ||
| 1470 | jnz .Ladd_proceed$x # (in1infty || in2infty)? | ||
| 1471 | test $acc1, $acc1 | ||
| 1472 | jz .Ladd_double$x # is_equal(S1,S2)? | ||
| 1473 | |||
| 1474 | movq %xmm0, $r_ptr # restore $r_ptr | ||
| 1475 | pxor %xmm0, %xmm0 | ||
| 1476 | movdqu %xmm0, 0x00($r_ptr) | ||
| 1477 | movdqu %xmm0, 0x10($r_ptr) | ||
| 1478 | movdqu %xmm0, 0x20($r_ptr) | ||
| 1479 | movdqu %xmm0, 0x30($r_ptr) | ||
| 1480 | movdqu %xmm0, 0x40($r_ptr) | ||
| 1481 | movdqu %xmm0, 0x50($r_ptr) | ||
| 1482 | jmp .Ladd_done$x | ||
| 1483 | |||
| 1484 | .align 32 | ||
| 1485 | .Ladd_double$x: | ||
| 1486 | movq %xmm1, $a_ptr # restore $a_ptr | ||
| 1487 | movq %xmm0, $r_ptr # restore $r_ptr | ||
| 1488 | add \$`32*(18-5)`, %rsp # difference in frame sizes | ||
| 1489 | jmp .Lpoint_double_shortcut$x | ||
| 1490 | |||
| 1491 | .align 32 | ||
| 1492 | .Ladd_proceed$x: | ||
| 1493 | `&load_for_sqr("$R(%rsp)", "$src0")` | ||
| 1494 | lea $Rsqr(%rsp), $r_ptr # R^2 | ||
| 1495 | call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R); | ||
| 1496 | |||
| 1497 | `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")` | ||
| 1498 | lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2 | ||
| 1499 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z); | ||
| 1500 | |||
| 1501 | `&load_for_sqr("$H(%rsp)", "$src0")` | ||
| 1502 | lea $Hsqr(%rsp), $r_ptr # H^2 | ||
| 1503 | call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H); | ||
| 1504 | |||
| 1505 | `&load_for_mul("$res_z(%rsp)", "$in2_z(%rsp)", "$src0")` | ||
| 1506 | lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2 | ||
| 1507 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, res_z, in2_z); | ||
| 1508 | |||
| 1509 | `&load_for_mul("$Hsqr(%rsp)", "$H(%rsp)", "$src0")` | ||
| 1510 | lea $Hcub(%rsp), $r_ptr # H^3 | ||
| 1511 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H); | ||
| 1512 | |||
| 1513 | `&load_for_mul("$Hsqr(%rsp)", "$U1(%rsp)", "$src0")` | ||
| 1514 | lea $U2(%rsp), $r_ptr # U1*H^2 | ||
| 1515 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, U1, Hsqr); | ||
| 1516 | ___ | ||
| 1517 | { | ||
| 1518 | ####################################################################### | ||
| 1519 | # operate in 4-5-0-1 "name space" that matches multiplication output | ||
| 1520 | # | ||
| 1521 | my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); | ||
| 1522 | my ($poly1, $poly3)=($acc6,$acc7); | ||
| 1523 | |||
| 1524 | $code.=<<___; | ||
| 1525 | #lea $U2(%rsp), $a_ptr | ||
| 1526 | #lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2 | ||
| 1527 | #call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2); | ||
| 1528 | |||
| 1529 | add $acc0, $acc0 # a0:a3+a0:a3 | ||
| 1530 | lea $Rsqr(%rsp), $a_ptr | ||
| 1531 | adc $acc1, $acc1 | ||
| 1532 | mov $acc0, $t0 | ||
| 1533 | adc $acc2, $acc2 | ||
| 1534 | adc $acc3, $acc3 | ||
| 1535 | mov $acc1, $t1 | ||
| 1536 | sbb $t4, $t4 | ||
| 1537 | |||
| 1538 | sub \$-1, $acc0 | ||
| 1539 | mov $acc2, $t2 | ||
| 1540 | sbb $poly1, $acc1 | ||
| 1541 | sbb \$0, $acc2 | ||
| 1542 | mov $acc3, $t3 | ||
| 1543 | sbb $poly3, $acc3 | ||
| 1544 | test $t4, $t4 | ||
| 1545 | |||
| 1546 | cmovz $t0, $acc0 | ||
| 1547 | mov 8*0($a_ptr), $t0 | ||
| 1548 | cmovz $t1, $acc1 | ||
| 1549 | mov 8*1($a_ptr), $t1 | ||
| 1550 | cmovz $t2, $acc2 | ||
| 1551 | mov 8*2($a_ptr), $t2 | ||
| 1552 | cmovz $t3, $acc3 | ||
| 1553 | mov 8*3($a_ptr), $t3 | ||
| 1554 | |||
| 1555 | call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr); | ||
| 1556 | |||
| 1557 | lea $Hcub(%rsp), $b_ptr | ||
| 1558 | lea $res_x(%rsp), $r_ptr | ||
| 1559 | call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub); | ||
| 1560 | |||
| 1561 | mov $U2+8*0(%rsp), $t0 | ||
| 1562 | mov $U2+8*1(%rsp), $t1 | ||
| 1563 | mov $U2+8*2(%rsp), $t2 | ||
| 1564 | mov $U2+8*3(%rsp), $t3 | ||
| 1565 | lea $res_y(%rsp), $r_ptr | ||
| 1566 | |||
| 1567 | call __ecp_nistz256_sub$x # p256_sub(res_y, U2, res_x); | ||
| 1568 | |||
| 1569 | mov $acc0, 8*0($r_ptr) # save the result, as | ||
| 1570 | mov $acc1, 8*1($r_ptr) # __ecp_nistz256_sub doesn't store it | ||
| 1571 | mov $acc2, 8*2($r_ptr) | ||
| 1572 | mov $acc3, 8*3($r_ptr) | ||
| 1573 | ___ | ||
| 1574 | } | ||
| 1575 | $code.=<<___; | ||
| 1576 | `&load_for_mul("$S1(%rsp)", "$Hcub(%rsp)", "$src0")` | ||
| 1577 | lea $S2(%rsp), $r_ptr | ||
| 1578 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S1, Hcub); | ||
| 1579 | |||
| 1580 | `&load_for_mul("$R(%rsp)", "$res_y(%rsp)", "$src0")` | ||
| 1581 | lea $res_y(%rsp), $r_ptr | ||
| 1582 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_y, R, res_y); | ||
| 1583 | |||
| 1584 | lea $S2(%rsp), $b_ptr | ||
| 1585 | lea $res_y(%rsp), $r_ptr | ||
| 1586 | call __ecp_nistz256_sub_from$x # p256_sub(res_y, res_y, S2); | ||
| 1587 | |||
| 1588 | movq %xmm0, $r_ptr # restore $r_ptr | ||
| 1589 | |||
| 1590 | movdqa %xmm5, %xmm0 # copy_conditional(res_z, in2_z, in1infty); | ||
| 1591 | movdqa %xmm5, %xmm1 | ||
| 1592 | pandn $res_z(%rsp), %xmm0 | ||
| 1593 | movdqa %xmm5, %xmm2 | ||
| 1594 | pandn $res_z+0x10(%rsp), %xmm1 | ||
| 1595 | movdqa %xmm5, %xmm3 | ||
| 1596 | pand $in2_z(%rsp), %xmm2 | ||
| 1597 | pand $in2_z+0x10(%rsp), %xmm3 | ||
| 1598 | por %xmm0, %xmm2 | ||
| 1599 | por %xmm1, %xmm3 | ||
| 1600 | |||
| 1601 | movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty); | ||
| 1602 | movdqa %xmm4, %xmm1 | ||
| 1603 | pandn %xmm2, %xmm0 | ||
| 1604 | movdqa %xmm4, %xmm2 | ||
| 1605 | pandn %xmm3, %xmm1 | ||
| 1606 | movdqa %xmm4, %xmm3 | ||
| 1607 | pand $in1_z(%rsp), %xmm2 | ||
| 1608 | pand $in1_z+0x10(%rsp), %xmm3 | ||
| 1609 | por %xmm0, %xmm2 | ||
| 1610 | por %xmm1, %xmm3 | ||
| 1611 | movdqu %xmm2, 0x40($r_ptr) | ||
| 1612 | movdqu %xmm3, 0x50($r_ptr) | ||
| 1613 | |||
| 1614 | movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty); | ||
| 1615 | movdqa %xmm5, %xmm1 | ||
| 1616 | pandn $res_x(%rsp), %xmm0 | ||
| 1617 | movdqa %xmm5, %xmm2 | ||
| 1618 | pandn $res_x+0x10(%rsp), %xmm1 | ||
| 1619 | movdqa %xmm5, %xmm3 | ||
| 1620 | pand $in2_x(%rsp), %xmm2 | ||
| 1621 | pand $in2_x+0x10(%rsp), %xmm3 | ||
| 1622 | por %xmm0, %xmm2 | ||
| 1623 | por %xmm1, %xmm3 | ||
| 1624 | |||
| 1625 | movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty); | ||
| 1626 | movdqa %xmm4, %xmm1 | ||
| 1627 | pandn %xmm2, %xmm0 | ||
| 1628 | movdqa %xmm4, %xmm2 | ||
| 1629 | pandn %xmm3, %xmm1 | ||
| 1630 | movdqa %xmm4, %xmm3 | ||
| 1631 | pand $in1_x(%rsp), %xmm2 | ||
| 1632 | pand $in1_x+0x10(%rsp), %xmm3 | ||
| 1633 | por %xmm0, %xmm2 | ||
| 1634 | por %xmm1, %xmm3 | ||
| 1635 | movdqu %xmm2, 0x00($r_ptr) | ||
| 1636 | movdqu %xmm3, 0x10($r_ptr) | ||
| 1637 | |||
| 1638 | movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty); | ||
| 1639 | movdqa %xmm5, %xmm1 | ||
| 1640 | pandn $res_y(%rsp), %xmm0 | ||
| 1641 | movdqa %xmm5, %xmm2 | ||
| 1642 | pandn $res_y+0x10(%rsp), %xmm1 | ||
| 1643 | movdqa %xmm5, %xmm3 | ||
| 1644 | pand $in2_y(%rsp), %xmm2 | ||
| 1645 | pand $in2_y+0x10(%rsp), %xmm3 | ||
| 1646 | por %xmm0, %xmm2 | ||
| 1647 | por %xmm1, %xmm3 | ||
| 1648 | |||
| 1649 | movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty); | ||
| 1650 | movdqa %xmm4, %xmm1 | ||
| 1651 | pandn %xmm2, %xmm0 | ||
| 1652 | movdqa %xmm4, %xmm2 | ||
| 1653 | pandn %xmm3, %xmm1 | ||
| 1654 | movdqa %xmm4, %xmm3 | ||
| 1655 | pand $in1_y(%rsp), %xmm2 | ||
| 1656 | pand $in1_y+0x10(%rsp), %xmm3 | ||
| 1657 | por %xmm0, %xmm2 | ||
| 1658 | por %xmm1, %xmm3 | ||
| 1659 | movdqu %xmm2, 0x20($r_ptr) | ||
| 1660 | movdqu %xmm3, 0x30($r_ptr) | ||
| 1661 | |||
| 1662 | .Ladd_done$x: | ||
| 1663 | add \$32*18+8, %rsp | ||
| 1664 | pop %r15 | ||
| 1665 | pop %r14 | ||
| 1666 | pop %r13 | ||
| 1667 | pop %r12 | ||
| 1668 | pop %rbx | ||
| 1669 | pop %rbp | ||
| 1670 | ret | ||
| 1671 | .size ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx | ||
| 1672 | ___ | ||
| 1673 | } | ||
| 1674 | &gen_add("q"); | ||
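Likewise, gen_add follows the standard Jacobian point addition, where H and R double as the detectors that drive the special-case branches above (H = 0 with R = 0 means equal inputs, hence the fall-through into .Lpoint_double_shortcut; H = 0 with R != 0 means P = -Q and the result is zeroed). Again reconstructed from the inline comments:

    U1 = X1*Z2^2,  U2 = X2*Z1^2
    S1 = Y1*Z2^3,  S2 = Y2*Z1^3
    H  = U2 - U1,  R  = S2 - S1
    X3 = R^2 - H^3 - 2*U1*H^2
    Y3 = R*(U1*H^2 - X3) - S1*H^3
    Z3 = Z1*Z2*H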
| 1675 | |||
| 1676 | sub gen_add_affine () { | ||
| 1677 | my $x = shift; | ||
| 1678 | my ($src0,$sfx,$bias); | ||
| 1679 | my ($U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr, | ||
| 1680 | $res_x,$res_y,$res_z, | ||
| 1681 | $in1_x,$in1_y,$in1_z, | ||
| 1682 | $in2_x,$in2_y)=map(32*$_,(0..14)); | ||
| 1683 | my $Z1sqr = $S2; | ||
| 1684 | |||
| 1685 | if ($x ne "x") { | ||
| 1686 | $src0 = "%rax"; | ||
| 1687 | $sfx = ""; | ||
| 1688 | $bias = 0; | ||
| 1689 | |||
| 1690 | $code.=<<___; | ||
| 1691 | .globl ecp_nistz256_point_add_affine | ||
| 1692 | .type ecp_nistz256_point_add_affine,\@function,3 | ||
| 1693 | .align 32 | ||
| 1694 | ecp_nistz256_point_add_affine: | ||
| 1695 | ___ | ||
| 1696 | } else { | ||
| 1697 | $src0 = "%rdx"; | ||
| 1698 | $sfx = "x"; | ||
| 1699 | $bias = 128; | ||
| 1700 | } | ||
| 1701 | $code.=<<___; | ||
| 1702 | push %rbp | ||
| 1703 | push %rbx | ||
| 1704 | push %r12 | ||
| 1705 | push %r13 | ||
| 1706 | push %r14 | ||
| 1707 | push %r15 | ||
| 1708 | sub \$32*15+8, %rsp | ||
| 1709 | |||
| 1710 | movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr | ||
| 1711 | mov $b_org, $b_ptr # reassign | ||
| 1712 | movdqu 0x10($a_ptr), %xmm1 | ||
| 1713 | movdqu 0x20($a_ptr), %xmm2 | ||
| 1714 | movdqu 0x30($a_ptr), %xmm3 | ||
| 1715 | movdqu 0x40($a_ptr), %xmm4 | ||
| 1716 | movdqu 0x50($a_ptr), %xmm5 | ||
| 1717 | mov 0x40+8*0($a_ptr), $src0 # load original in1_z | ||
| 1718 | mov 0x40+8*1($a_ptr), $acc6 | ||
| 1719 | mov 0x40+8*2($a_ptr), $acc7 | ||
| 1720 | mov 0x40+8*3($a_ptr), $acc0 | ||
| 1721 | movdqa %xmm0, $in1_x(%rsp) | ||
| 1722 | movdqa %xmm1, $in1_x+0x10(%rsp) | ||
| 1723 | por %xmm0, %xmm1 | ||
| 1724 | movdqa %xmm2, $in1_y(%rsp) | ||
| 1725 | movdqa %xmm3, $in1_y+0x10(%rsp) | ||
| 1726 | por %xmm2, %xmm3 | ||
| 1727 | movdqa %xmm4, $in1_z(%rsp) | ||
| 1728 | movdqa %xmm5, $in1_z+0x10(%rsp) | ||
| 1729 | por %xmm1, %xmm3 | ||
| 1730 | |||
| 1731 | movdqu 0x00($b_ptr), %xmm0 # copy *(P256_POINT_AFFINE *)$b_ptr | ||
| 1732 | pshufd \$0xb1, %xmm3, %xmm5 | ||
| 1733 | movdqu 0x10($b_ptr), %xmm1 | ||
| 1734 | movdqu 0x20($b_ptr), %xmm2 | ||
| 1735 | por %xmm3, %xmm5 | ||
| 1736 | movdqu 0x30($b_ptr), %xmm3 | ||
| 1737 | movdqa %xmm0, $in2_x(%rsp) | ||
| 1738 | pshufd \$0x1e, %xmm5, %xmm4 | ||
| 1739 | movdqa %xmm1, $in2_x+0x10(%rsp) | ||
| 1740 | por %xmm0, %xmm1 | ||
| 1741 | movq $r_ptr, %xmm0 # save $r_ptr | ||
| 1742 | movdqa %xmm2, $in2_y(%rsp) | ||
| 1743 | movdqa %xmm3, $in2_y+0x10(%rsp) | ||
| 1744 | por %xmm2, %xmm3 | ||
| 1745 | por %xmm4, %xmm5 | ||
| 1746 | pxor %xmm4, %xmm4 | ||
| 1747 | por %xmm1, %xmm3 | ||
| 1748 | |||
| 1749 | lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid | ||
| 1750 | lea $Z1sqr(%rsp), $r_ptr # Z1^2 | ||
| 1751 | call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z); | ||
| 1752 | |||
| 1753 | pcmpeqd %xmm4, %xmm5 | ||
| 1754 | pshufd \$0xb1, %xmm3, %xmm4 | ||
| 1755 | mov 0x00($b_ptr), $src0 # $b_ptr is still valid | ||
| 1756 | #lea 0x00($b_ptr), $b_ptr | ||
| 1757 | mov $acc4, $acc1 # harmonize sqr output and mul input | ||
| 1758 | por %xmm3, %xmm4 | ||
| 1759 | pshufd \$0, %xmm5, %xmm5 # in1infty | ||
| 1760 | pshufd \$0x1e, %xmm4, %xmm3 | ||
| 1761 | mov $acc5, $acc2 | ||
| 1762 | por %xmm3, %xmm4 | ||
| 1763 | pxor %xmm3, %xmm3 | ||
| 1764 | mov $acc6, $acc3 | ||
| 1765 | pcmpeqd %xmm3, %xmm4 | ||
| 1766 | pshufd \$0, %xmm4, %xmm4 # in2infty | ||
| 1767 | |||
| 1768 | lea $Z1sqr-$bias(%rsp), $a_ptr | ||
| 1769 | mov $acc7, $acc4 | ||
| 1770 | lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2 | ||
| 1771 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, Z1sqr, in2_x); | ||
| 1772 | |||
| 1773 | lea $in1_x(%rsp), $b_ptr | ||
| 1774 | lea $H(%rsp), $r_ptr # H = U2 - U1 | ||
| 1775 | call __ecp_nistz256_sub_from$x # p256_sub(H, U2, in1_x); | ||
| 1776 | |||
| 1777 | `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")` | ||
| 1778 | lea $S2(%rsp), $r_ptr # S2 = Z1^3 | ||
| 1779 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z); | ||
| 1780 | |||
| 1781 | `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")` | ||
| 1782 | lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2 | ||
| 1783 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z); | ||
| 1784 | |||
| 1785 | `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")` | ||
| 1786 | lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3 | ||
| 1787 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y); | ||
| 1788 | |||
| 1789 | lea $in1_y(%rsp), $b_ptr | ||
| 1790 | lea $R(%rsp), $r_ptr # R = S2 - S1 | ||
| 1791 | call __ecp_nistz256_sub_from$x # p256_sub(R, S2, in1_y); | ||
| 1792 | |||
| 1793 | `&load_for_sqr("$H(%rsp)", "$src0")` | ||
| 1794 | lea $Hsqr(%rsp), $r_ptr # H^2 | ||
| 1795 | call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H); | ||
| 1796 | |||
| 1797 | `&load_for_sqr("$R(%rsp)", "$src0")` | ||
| 1798 | lea $Rsqr(%rsp), $r_ptr # R^2 | ||
| 1799 | call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R); | ||
| 1800 | |||
| 1801 | `&load_for_mul("$H(%rsp)", "$Hsqr(%rsp)", "$src0")` | ||
| 1802 | lea $Hcub(%rsp), $r_ptr # H^3 | ||
| 1803 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H); | ||
| 1804 | |||
| 1805 | `&load_for_mul("$Hsqr(%rsp)", "$in1_x(%rsp)", "$src0")` | ||
| 1806 | lea $U2(%rsp), $r_ptr # U1*H^2 | ||
| 1807 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in1_x, Hsqr); | ||
| 1808 | ___ | ||
| 1809 | { | ||
| 1810 | ####################################################################### | ||
| 1811 | # operate in 4-5-0-1 "name space" that matches multiplication output | ||
| 1812 | # | ||
| 1813 | my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); | ||
| 1814 | my ($poly1, $poly3)=($acc6,$acc7); | ||
| 1815 | |||
| 1816 | $code.=<<___; | ||
| 1817 | #lea $U2(%rsp), $a_ptr | ||
| 1818 | #lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2 | ||
| 1819 | #call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2); | ||
| 1820 | |||
| 1821 | add $acc0, $acc0 # a0:a3+a0:a3 | ||
| 1822 | lea $Rsqr(%rsp), $a_ptr | ||
| 1823 | adc $acc1, $acc1 | ||
| 1824 | mov $acc0, $t0 | ||
| 1825 | adc $acc2, $acc2 | ||
| 1826 | adc $acc3, $acc3 | ||
| 1827 | mov $acc1, $t1 | ||
| 1828 | sbb $t4, $t4 | ||
| 1829 | |||
| 1830 | sub \$-1, $acc0 | ||
| 1831 | mov $acc2, $t2 | ||
| 1832 | sbb $poly1, $acc1 | ||
| 1833 | sbb \$0, $acc2 | ||
| 1834 | mov $acc3, $t3 | ||
| 1835 | sbb $poly3, $acc3 | ||
| 1836 | test $t4, $t4 | ||
| 1837 | |||
| 1838 | cmovz $t0, $acc0 | ||
| 1839 | mov 8*0($a_ptr), $t0 | ||
| 1840 | cmovz $t1, $acc1 | ||
| 1841 | mov 8*1($a_ptr), $t1 | ||
| 1842 | cmovz $t2, $acc2 | ||
| 1843 | mov 8*2($a_ptr), $t2 | ||
| 1844 | cmovz $t3, $acc3 | ||
| 1845 | mov 8*3($a_ptr), $t3 | ||
| 1846 | |||
| 1847 | call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr); | ||
| 1848 | |||
| 1849 | lea $Hcub(%rsp), $b_ptr | ||
| 1850 | lea $res_x(%rsp), $r_ptr | ||
| 1851 | call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub); | ||
| 1852 | |||
| 1853 | mov $U2+8*0(%rsp), $t0 | ||
| 1854 | mov $U2+8*1(%rsp), $t1 | ||
| 1855 | mov $U2+8*2(%rsp), $t2 | ||
| 1856 | mov $U2+8*3(%rsp), $t3 | ||
| 1857 | lea $H(%rsp), $r_ptr | ||
| 1858 | |||
| 1859 | call __ecp_nistz256_sub$x # p256_sub(H, U2, res_x); | ||
| 1860 | |||
| 1861 | mov $acc0, 8*0($r_ptr) # save the result, as | ||
| 1862 | mov $acc1, 8*1($r_ptr) # __ecp_nistz256_sub doesn't store it | ||
| 1863 | mov $acc2, 8*2($r_ptr) | ||
| 1864 | mov $acc3, 8*3($r_ptr) | ||
| 1865 | ___ | ||
| 1866 | } | ||
| 1867 | $code.=<<___; | ||
| 1868 | `&load_for_mul("$Hcub(%rsp)", "$in1_y(%rsp)", "$src0")` | ||
| 1869 | lea $S2(%rsp), $r_ptr | ||
| 1870 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Hcub, in1_y); | ||
| 1871 | |||
| 1872 | `&load_for_mul("$H(%rsp)", "$R(%rsp)", "$src0")` | ||
| 1873 | lea $H(%rsp), $r_ptr | ||
| 1874 | call __ecp_nistz256_mul_mont$x # p256_mul_mont(H, H, R); | ||
| 1875 | |||
| 1876 | lea $S2(%rsp), $b_ptr | ||
| 1877 | lea $res_y(%rsp), $r_ptr | ||
| 1878 | call __ecp_nistz256_sub_from$x # p256_sub(res_y, H, S2); | ||
| 1879 | |||
| 1880 | movq %xmm0, $r_ptr # restore $r_ptr | ||
| 1881 | |||
| 1882 | movdqa %xmm5, %xmm0 # copy_conditional(res_z, ONE, in1infty); | ||
| 1883 | movdqa %xmm5, %xmm1 | ||
| 1884 | pandn $res_z(%rsp), %xmm0 | ||
| 1885 | movdqa %xmm5, %xmm2 | ||
| 1886 | pandn $res_z+0x10(%rsp), %xmm1 | ||
| 1887 | movdqa %xmm5, %xmm3 | ||
| 1888 | pand .LONE_mont(%rip), %xmm2 | ||
| 1889 | pand .LONE_mont+0x10(%rip), %xmm3 | ||
| 1890 | por %xmm0, %xmm2 | ||
| 1891 | por %xmm1, %xmm3 | ||
| 1892 | |||
| 1893 | movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty); | ||
| 1894 | movdqa %xmm4, %xmm1 | ||
| 1895 | pandn %xmm2, %xmm0 | ||
| 1896 | movdqa %xmm4, %xmm2 | ||
| 1897 | pandn %xmm3, %xmm1 | ||
| 1898 | movdqa %xmm4, %xmm3 | ||
| 1899 | pand $in1_z(%rsp), %xmm2 | ||
| 1900 | pand $in1_z+0x10(%rsp), %xmm3 | ||
| 1901 | por %xmm0, %xmm2 | ||
| 1902 | por %xmm1, %xmm3 | ||
| 1903 | movdqu %xmm2, 0x40($r_ptr) | ||
| 1904 | movdqu %xmm3, 0x50($r_ptr) | ||
| 1905 | |||
| 1906 | movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty); | ||
| 1907 | movdqa %xmm5, %xmm1 | ||
| 1908 | pandn $res_x(%rsp), %xmm0 | ||
| 1909 | movdqa %xmm5, %xmm2 | ||
| 1910 | pandn $res_x+0x10(%rsp), %xmm1 | ||
| 1911 | movdqa %xmm5, %xmm3 | ||
| 1912 | pand $in2_x(%rsp), %xmm2 | ||
| 1913 | pand $in2_x+0x10(%rsp), %xmm3 | ||
| 1914 | por %xmm0, %xmm2 | ||
| 1915 | por %xmm1, %xmm3 | ||
| 1916 | |||
| 1917 | movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty); | ||
| 1918 | movdqa %xmm4, %xmm1 | ||
| 1919 | pandn %xmm2, %xmm0 | ||
| 1920 | movdqa %xmm4, %xmm2 | ||
| 1921 | pandn %xmm3, %xmm1 | ||
| 1922 | movdqa %xmm4, %xmm3 | ||
| 1923 | pand $in1_x(%rsp), %xmm2 | ||
| 1924 | pand $in1_x+0x10(%rsp), %xmm3 | ||
| 1925 | por %xmm0, %xmm2 | ||
| 1926 | por %xmm1, %xmm3 | ||
| 1927 | movdqu %xmm2, 0x00($r_ptr) | ||
| 1928 | movdqu %xmm3, 0x10($r_ptr) | ||
| 1929 | |||
| 1930 | movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty); | ||
| 1931 | movdqa %xmm5, %xmm1 | ||
| 1932 | pandn $res_y(%rsp), %xmm0 | ||
| 1933 | movdqa %xmm5, %xmm2 | ||
| 1934 | pandn $res_y+0x10(%rsp), %xmm1 | ||
| 1935 | movdqa %xmm5, %xmm3 | ||
| 1936 | pand $in2_y(%rsp), %xmm2 | ||
| 1937 | pand $in2_y+0x10(%rsp), %xmm3 | ||
| 1938 | por %xmm0, %xmm2 | ||
| 1939 | por %xmm1, %xmm3 | ||
| 1940 | |||
| 1941 | movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty); | ||
| 1942 | movdqa %xmm4, %xmm1 | ||
| 1943 | pandn %xmm2, %xmm0 | ||
| 1944 | movdqa %xmm4, %xmm2 | ||
| 1945 | pandn %xmm3, %xmm1 | ||
| 1946 | movdqa %xmm4, %xmm3 | ||
| 1947 | pand $in1_y(%rsp), %xmm2 | ||
| 1948 | pand $in1_y+0x10(%rsp), %xmm3 | ||
| 1949 | por %xmm0, %xmm2 | ||
| 1950 | por %xmm1, %xmm3 | ||
| 1951 | movdqu %xmm2, 0x20($r_ptr) | ||
| 1952 | movdqu %xmm3, 0x30($r_ptr) | ||
| 1953 | |||
| 1954 | add \$32*15+8, %rsp | ||
| 1955 | pop %r15 | ||
| 1956 | pop %r14 | ||
| 1957 | pop %r13 | ||
| 1958 | pop %r12 | ||
| 1959 | pop %rbx | ||
| 1960 | pop %rbp | ||
| 1961 | ret | ||
| 1962 | .size ecp_nistz256_point_add_affine$sfx,.-ecp_nistz256_point_add_affine$sfx | ||
| 1963 | ___ | ||
| 1964 | } | ||
| 1965 | &gen_add_affine("q"); | ||
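gen_add_affine is the Z2 = 1 specialization of the formulas above (so U1 = X1, S1 = Y1, and the .LONE_mont constant stands in for res_z when the first input is infinity). The pand/pandn tails of both point_add and point_add_affine are the same branch-free copy_conditional: res = (src & mask) | (res & ~mask), 128 bits at a time, where the masks come from the pcmpeqd/pshufd infinity detection (all-ones when the corresponding Z coordinate was zero). A word-level sketch of that select, assuming plain Perl on a 64-bit build and a hypothetical helper name:

    # mask is 0xFFFF_FFFF_FFFF_FFFF when the condition held, else 0;
    # the same instructions execute either way, so the choice doesn't leak.
    sub copy_conditional {
        my ($res, $src, $mask) = @_;    # array refs of little-endian words
        for my $i (0 .. $#$res) {
            $res->[$i] = ($src->[$i] & $mask) | ($res->[$i] & ~$mask);
        }
    }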
| 1966 | |||
| 1967 | }}} | ||
| 1968 | |||
| 1969 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | ||
| 1970 | print $code; | ||
| 1971 | close STDOUT; | ||
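The closing substitution is what makes the inline backtick spans work: each `...` in the heredocs (the &load_for_mul / &load_for_sqr lines above) is evaluated as Perl via s///e and replaced by the string it returns before the assembly is printed. A tiny self-contained illustration of the idiom (not from the module):

    my $code = 'lea `2+3`(%rsp), %rax';
    $code =~ s/\`([^\`]*)\`/eval $1/gem;
    print $code, "\n";    # prints: lea 5(%rsp), %rax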
