path: root/src/lib/libcrypto/ec/asm
author     jsing <>  2023-01-14 15:45:44 +0000
committer  jsing <>  2023-01-14 15:45:44 +0000
commit     2caf68e3ec46ff4ba172978eb728e2aa23948684 (patch)
tree       2beaa725f80865b4e4c7d3384563e3ed4940579e /src/lib/libcrypto/ec/asm
parent     e182204c3487929ef9f6791554e79586f4d30335 (diff)
download   openbsd-2caf68e3ec46ff4ba172978eb728e2aa23948684.tar.gz
           openbsd-2caf68e3ec46ff4ba172978eb728e2aa23948684.tar.bz2
           openbsd-2caf68e3ec46ff4ba172978eb728e2aa23948684.zip
Remove unused Elliptic Curve code.
For various reasons, the ecp_nistp* and ecp_nistz* code is unused. While ecp_nistp* is compiled, it is disabled because OPENSSL_NO_EC_NISTP_64_GCC_128 is defined; ecp_nistz*, on the other hand, is not even built. We will bring in new or alternative versions of such code if we end up enabling it in the future. For now it only adds complexity (and grep noise) while trying to improve the EC code.

Discussed with tb@
Diffstat (limited to 'src/lib/libcrypto/ec/asm')
-rw-r--r--  src/lib/libcrypto/ec/asm/ecp_nistz256-armv4.pl    | 1733 deletions
-rw-r--r--  src/lib/libcrypto/ec/asm/ecp_nistz256-sparcv9.pl  | 2890 deletions
-rw-r--r--  src/lib/libcrypto/ec/asm/ecp_nistz256-x86.pl      | 1740 deletions
-rw-r--r--  src/lib/libcrypto/ec/asm/ecp_nistz256-x86_64.pl   | 1971 deletions
4 files changed, 0 insertions, 8334 deletions
diff --git a/src/lib/libcrypto/ec/asm/ecp_nistz256-armv4.pl b/src/lib/libcrypto/ec/asm/ecp_nistz256-armv4.pl
deleted file mode 100644
index 9e6c65905f..0000000000
--- a/src/lib/libcrypto/ec/asm/ecp_nistz256-armv4.pl
+++ /dev/null
@@ -1,1733 +0,0 @@
1#! /usr/bin/env perl
2# $OpenBSD: ecp_nistz256-armv4.pl,v 1.2 2022/12/26 07:18:51 jmc Exp $
3#
4# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
5#
6# Licensed under the OpenSSL license (the "License"). You may not use
7# this file except in compliance with the License. You can obtain a copy
8# in the file LICENSE in the source distribution or at
9# https://www.openssl.org/source/license.html
10
11
12# ====================================================================
13# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
14# project. The module is, however, dual licensed under OpenSSL and
15# CRYPTOGAMS licenses depending on where you obtain it. For further
16# details see http://www.openssl.org/~appro/cryptogams/.
17# ====================================================================
18#
19# ECP_NISTZ256 module for ARMv4.
20#
21# October 2014.
22#
23# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
24# http://eprint.iacr.org/2013/816. In the process of adaptation
25# original .c module was made 32-bit savvy in order to make this
26# implementation possible.
27#
28# with/without -DECP_NISTZ256_ASM
29# Cortex-A8 +53-170%
30# Cortex-A9 +76-205%
31# Cortex-A15 +100-316%
32# Snapdragon S4 +66-187%
33#
34# Ranges denote minimum and maximum improvement coefficients depending
35# on benchmark. Lower coefficients are for ECDSA sign, server-side
36# operation. Keep in mind that +200% means 3x improvement.
37
38$flavour = shift;
39if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
40else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
41
42if ($flavour && $flavour ne "void") {
43 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
44 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
45 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
46 die "can't locate arm-xlate.pl";
47
48 open STDOUT,"| \"$^X\" $xlate $flavour $output";
49} else {
50 open STDOUT,">$output";
51}
52
53$code.=<<___;
54#include "arm_arch.h"
55
56.text
57#if defined(__thumb2__)
58.syntax unified
59.thumb
60#else
61.code 32
62#endif
63___
64
65$code.=<<___;
66.Lone:
67.long 1,0,0,0,0,0,0,0
68.align 6
69___
70
71########################################################################
72# common register layout, note that $t2 is link register, so that if
73# internal subroutine uses $t2, then it has to offload lr...
74
75($r_ptr,$a_ptr,$b_ptr,$ff,$a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7,$t1,$t2)=
76 map("r$_",(0..12,14));
77($t0,$t3)=($ff,$a_ptr);
78
79$code.=<<___;
80@ void ecp_nistz256_from_mont(BN_ULONG r0[8],const BN_ULONG r1[8]);
81.globl ecp_nistz256_from_mont
82.type ecp_nistz256_from_mont,%function
83ecp_nistz256_from_mont:
84 adr $b_ptr,.Lone
85 b .Lecp_nistz256_mul_mont
86.size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
87
88@ void ecp_nistz256_mul_by_2(BN_ULONG r0[8],const BN_ULONG r1[8]);
89.globl ecp_nistz256_mul_by_2
90.type ecp_nistz256_mul_by_2,%function
91.align 4
92ecp_nistz256_mul_by_2:
93 stmdb sp!,{r4-r12,lr}
94 bl __ecp_nistz256_mul_by_2
95#if __ARM_ARCH__>=5 || !defined(__thumb__)
96 ldmia sp!,{r4-r12,pc}
97#else
98 ldmia sp!,{r4-r12,lr}
99 bx lr @ interoperable with Thumb ISA:-)
100#endif
101.size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
102
103.type __ecp_nistz256_mul_by_2,%function
104.align 4
105__ecp_nistz256_mul_by_2:
106 ldr $a0,[$a_ptr,#0]
107 ldr $a1,[$a_ptr,#4]
108 ldr $a2,[$a_ptr,#8]
109 adds $a0,$a0,$a0 @ a[0:7]+=a[0:7], i.e. add with itself
110 ldr $a3,[$a_ptr,#12]
111 adcs $a1,$a1,$a1
112 ldr $a4,[$a_ptr,#16]
113 adcs $a2,$a2,$a2
114 ldr $a5,[$a_ptr,#20]
115 adcs $a3,$a3,$a3
116 ldr $a6,[$a_ptr,#24]
117 adcs $a4,$a4,$a4
118 ldr $a7,[$a_ptr,#28]
119 adcs $a5,$a5,$a5
120 adcs $a6,$a6,$a6
121 mov $ff,#0
122 adcs $a7,$a7,$a7
123 adc $ff,$ff,#0
124
125 b .Lreduce_by_sub
126.size __ecp_nistz256_mul_by_2,.-__ecp_nistz256_mul_by_2
127
128@ void ecp_nistz256_add(BN_ULONG r0[8],const BN_ULONG r1[8],
129@ const BN_ULONG r2[8]);
130.globl ecp_nistz256_add
131.type ecp_nistz256_add,%function
132.align 4
133ecp_nistz256_add:
134 stmdb sp!,{r4-r12,lr}
135 bl __ecp_nistz256_add
136#if __ARM_ARCH__>=5 || !defined(__thumb__)
137 ldmia sp!,{r4-r12,pc}
138#else
139 ldmia sp!,{r4-r12,lr}
140 bx lr @ interoperable with Thumb ISA:-)
141#endif
142.size ecp_nistz256_add,.-ecp_nistz256_add
143
144.type __ecp_nistz256_add,%function
145.align 4
146__ecp_nistz256_add:
147 str lr,[sp,#-4]! @ push lr
148
149 ldr $a0,[$a_ptr,#0]
150 ldr $a1,[$a_ptr,#4]
151 ldr $a2,[$a_ptr,#8]
152 ldr $a3,[$a_ptr,#12]
153 ldr $a4,[$a_ptr,#16]
154 ldr $t0,[$b_ptr,#0]
155 ldr $a5,[$a_ptr,#20]
156 ldr $t1,[$b_ptr,#4]
157 ldr $a6,[$a_ptr,#24]
158 ldr $t2,[$b_ptr,#8]
159 ldr $a7,[$a_ptr,#28]
160 ldr $t3,[$b_ptr,#12]
161 adds $a0,$a0,$t0
162 ldr $t0,[$b_ptr,#16]
163 adcs $a1,$a1,$t1
164 ldr $t1,[$b_ptr,#20]
165 adcs $a2,$a2,$t2
166 ldr $t2,[$b_ptr,#24]
167 adcs $a3,$a3,$t3
168 ldr $t3,[$b_ptr,#28]
169 adcs $a4,$a4,$t0
170 adcs $a5,$a5,$t1
171 adcs $a6,$a6,$t2
172 mov $ff,#0
173 adcs $a7,$a7,$t3
174 adc $ff,$ff,#0
175 ldr lr,[sp],#4 @ pop lr
176
177.Lreduce_by_sub:
178
179 @ if a+b >= modulus, subtract modulus.
180 @
181 @ But since comparison implies subtraction, we subtract
182 @ modulus and then add it back if subtraction borrowed.
183
184 subs $a0,$a0,#-1
185 sbcs $a1,$a1,#-1
186 sbcs $a2,$a2,#-1
187 sbcs $a3,$a3,#0
188 sbcs $a4,$a4,#0
189 sbcs $a5,$a5,#0
190 sbcs $a6,$a6,#1
191 sbcs $a7,$a7,#-1
192 sbc $ff,$ff,#0
193
194 @ Note that because mod has special form, i.e. consists of
195 @ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
196 @ using value of borrow as a whole or extracting single bit.
197 @ Follow $ff register...
198
199 adds $a0,$a0,$ff @ add synthesized modulus
200 adcs $a1,$a1,$ff
201 str $a0,[$r_ptr,#0]
202 adcs $a2,$a2,$ff
203 str $a1,[$r_ptr,#4]
204 adcs $a3,$a3,#0
205 str $a2,[$r_ptr,#8]
206 adcs $a4,$a4,#0
207 str $a3,[$r_ptr,#12]
208 adcs $a5,$a5,#0
209 str $a4,[$r_ptr,#16]
210 adcs $a6,$a6,$ff,lsr#31
211 str $a5,[$r_ptr,#20]
212 adcs $a7,$a7,$ff
213 str $a6,[$r_ptr,#24]
214 str $a7,[$r_ptr,#28]
215
216 mov pc,lr
217.size __ecp_nistz256_add,.-__ecp_nistz256_add
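[Editor's sketch, not part of the deleted file.] The .Lreduce_by_sub tail above is the conditional reduction the comments describe: subtract the modulus unconditionally, then add it back masked by the borrow, so there is no data-dependent branch. A rough C equivalent, assuming inputs below 2*p; the word order and the p256[] values follow the assembly, everything else (names, helper) is illustrative:

    #include <stdint.h>

    /* P-256 modulus, least significant 32-bit word first. */
    static const uint32_t p256[8] = {
            0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
            0x00000000, 0x00000000, 0x00000001, 0xffffffff
    };

    /*
     * r = (hi*2^256 + a) mod p, for values known to be < 2*p.
     * Mirrors .Lreduce_by_sub: subtract p, then add it back if the
     * subtraction borrowed overall.
     */
    static void
    p256_reduce_once(uint32_t r[8], const uint32_t a[8], uint32_t hi)
    {
            uint64_t v, c;
            uint32_t borrow = 0, mask;
            int i;

            for (i = 0; i < 8; i++) {
                    v = (uint64_t)a[i] - p256[i] - borrow;
                    r[i] = (uint32_t)v;
                    borrow = (uint32_t)(v >> 63);   /* 1 if this word borrowed */
            }
            mask = 0U - (uint32_t)(borrow > hi);    /* all ones iff hi:a < p */

            for (c = 0, i = 0; i < 8; i++) {
                    c += (uint64_t)r[i] + (p256[i] & mask);
                    r[i] = (uint32_t)c;
                    c >>= 32;
            }
    }

The assembly gets the same effect from the sbc $ff,$ff,#0 / adds ...,$ff pair, with $ff,lsr#31 supplying the single 1-valued word of the modulus.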
218
219@ void ecp_nistz256_mul_by_3(BN_ULONG r0[8],const BN_ULONG r1[8]);
220.globl ecp_nistz256_mul_by_3
221.type ecp_nistz256_mul_by_3,%function
222.align 4
223ecp_nistz256_mul_by_3:
224 stmdb sp!,{r4-r12,lr}
225 bl __ecp_nistz256_mul_by_3
226#if __ARM_ARCH__>=5 || !defined(__thumb__)
227 ldmia sp!,{r4-r12,pc}
228#else
229 ldmia sp!,{r4-r12,lr}
230 bx lr @ interoperable with Thumb ISA:-)
231#endif
232.size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
233
234.type __ecp_nistz256_mul_by_3,%function
235.align 4
236__ecp_nistz256_mul_by_3:
237 str lr,[sp,#-4]! @ push lr
238
239 @ As multiplication by 3 is performed as 2*n+n, below are inline
240 @ copies of __ecp_nistz256_mul_by_2 and __ecp_nistz256_add, see
241 @ corresponding subroutines for details.
242
243 ldr $a0,[$a_ptr,#0]
244 ldr $a1,[$a_ptr,#4]
245 ldr $a2,[$a_ptr,#8]
246 adds $a0,$a0,$a0 @ a[0:7]+=a[0:7]
247 ldr $a3,[$a_ptr,#12]
248 adcs $a1,$a1,$a1
249 ldr $a4,[$a_ptr,#16]
250 adcs $a2,$a2,$a2
251 ldr $a5,[$a_ptr,#20]
252 adcs $a3,$a3,$a3
253 ldr $a6,[$a_ptr,#24]
254 adcs $a4,$a4,$a4
255 ldr $a7,[$a_ptr,#28]
256 adcs $a5,$a5,$a5
257 adcs $a6,$a6,$a6
258 mov $ff,#0
259 adcs $a7,$a7,$a7
260 adc $ff,$ff,#0
261
262 subs $a0,$a0,#-1 @ .Lreduce_by_sub but without stores
263 sbcs $a1,$a1,#-1
264 sbcs $a2,$a2,#-1
265 sbcs $a3,$a3,#0
266 sbcs $a4,$a4,#0
267 sbcs $a5,$a5,#0
268 sbcs $a6,$a6,#1
269 sbcs $a7,$a7,#-1
270 sbc $ff,$ff,#0
271
272 adds $a0,$a0,$ff @ add synthesized modulus
273 adcs $a1,$a1,$ff
274 adcs $a2,$a2,$ff
275 adcs $a3,$a3,#0
276 adcs $a4,$a4,#0
277 ldr $b_ptr,[$a_ptr,#0]
278 adcs $a5,$a5,#0
279 ldr $t1,[$a_ptr,#4]
280 adcs $a6,$a6,$ff,lsr#31
281 ldr $t2,[$a_ptr,#8]
282 adc $a7,$a7,$ff
283
284 ldr $t0,[$a_ptr,#12]
285 adds $a0,$a0,$b_ptr @ 2*a[0:7]+=a[0:7]
286 ldr $b_ptr,[$a_ptr,#16]
287 adcs $a1,$a1,$t1
288 ldr $t1,[$a_ptr,#20]
289 adcs $a2,$a2,$t2
290 ldr $t2,[$a_ptr,#24]
291 adcs $a3,$a3,$t0
292 ldr $t3,[$a_ptr,#28]
293 adcs $a4,$a4,$b_ptr
294 adcs $a5,$a5,$t1
295 adcs $a6,$a6,$t2
296 mov $ff,#0
297 adcs $a7,$a7,$t3
298 adc $ff,$ff,#0
299 ldr lr,[sp],#4 @ pop lr
300
301 b .Lreduce_by_sub
302.size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
303
304@ void ecp_nistz256_div_by_2(BN_ULONG r0[8],const BN_ULONG r1[8]);
305.globl ecp_nistz256_div_by_2
306.type ecp_nistz256_div_by_2,%function
307.align 4
308ecp_nistz256_div_by_2:
309 stmdb sp!,{r4-r12,lr}
310 bl __ecp_nistz256_div_by_2
311#if __ARM_ARCH__>=5 || !defined(__thumb__)
312 ldmia sp!,{r4-r12,pc}
313#else
314 ldmia sp!,{r4-r12,lr}
315 bx lr @ interoperable with Thumb ISA:-)
316#endif
317.size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
318
319.type __ecp_nistz256_div_by_2,%function
320.align 4
321__ecp_nistz256_div_by_2:
322 @ ret = (a is odd ? a+mod : a) >> 1
323
324 ldr $a0,[$a_ptr,#0]
325 ldr $a1,[$a_ptr,#4]
326 ldr $a2,[$a_ptr,#8]
327 mov $ff,$a0,lsl#31 @ place least significant bit to most
328 @ significant position, now arithmetic
329 @ right shift by 31 will produce -1 or
330 @ 0, while logical right shift 1 or 0,
331 @ this is how modulus is conditionally
332 @ synthesized in this case...
333 ldr $a3,[$a_ptr,#12]
334 adds $a0,$a0,$ff,asr#31
335 ldr $a4,[$a_ptr,#16]
336 adcs $a1,$a1,$ff,asr#31
337 ldr $a5,[$a_ptr,#20]
338 adcs $a2,$a2,$ff,asr#31
339 ldr $a6,[$a_ptr,#24]
340 adcs $a3,$a3,#0
341 ldr $a7,[$a_ptr,#28]
342 adcs $a4,$a4,#0
343 mov $a0,$a0,lsr#1 @ a[0:7]>>=1, we can start early
344 @ because it doesn't affect flags
345 adcs $a5,$a5,#0
346 orr $a0,$a0,$a1,lsl#31
347 adcs $a6,$a6,$ff,lsr#31
348 mov $b_ptr,#0
349 adcs $a7,$a7,$ff,asr#31
350 mov $a1,$a1,lsr#1
351 adc $b_ptr,$b_ptr,#0 @ top-most carry bit from addition
352
353 orr $a1,$a1,$a2,lsl#31
354 mov $a2,$a2,lsr#1
355 str $a0,[$r_ptr,#0]
356 orr $a2,$a2,$a3,lsl#31
357 mov $a3,$a3,lsr#1
358 str $a1,[$r_ptr,#4]
359 orr $a3,$a3,$a4,lsl#31
360 mov $a4,$a4,lsr#1
361 str $a2,[$r_ptr,#8]
362 orr $a4,$a4,$a5,lsl#31
363 mov $a5,$a5,lsr#1
364 str $a3,[$r_ptr,#12]
365 orr $a5,$a5,$a6,lsl#31
366 mov $a6,$a6,lsr#1
367 str $a4,[$r_ptr,#16]
368 orr $a6,$a6,$a7,lsl#31
369 mov $a7,$a7,lsr#1
370 str $a5,[$r_ptr,#20]
371 orr $a7,$a7,$b_ptr,lsl#31 @ don't forget the top-most carry bit
372 str $a6,[$r_ptr,#24]
373 str $a7,[$r_ptr,#28]
374
375 mov pc,lr
376.size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
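[Editor's sketch, not part of the deleted file.] The lsl#31/asr#31 dance above conditionally synthesizes the modulus from the low bit of a[0], exactly as the comment says. A branch-free C rendering of the same "(a odd ? a+p : a) >> 1" computation, reusing the p256[] table from the sketch after ecp_nistz256_add; names are illustrative:

    #include <stdint.h>

    /*
     * r = a/2 mod p: make a even by adding p when a is odd, then shift
     * right by one, folding the carry out of the addition back into the
     * top word.  Assumes a is fully reduced and p256[] as defined in the
     * earlier sketch.
     */
    static void
    p256_div_by_2(uint32_t r[8], const uint32_t a[8])
    {
            uint32_t mask = 0U - (a[0] & 1);        /* all ones if a is odd */
            uint32_t t[8];
            uint64_t c = 0;
            int i;

            for (i = 0; i < 8; i++) {
                    c += (uint64_t)a[i] + (p256[i] & mask);
                    t[i] = (uint32_t)c;
                    c >>= 32;
            }
            for (i = 0; i < 7; i++)
                    r[i] = (t[i] >> 1) | (t[i + 1] << 31);
            r[7] = (t[7] >> 1) | ((uint32_t)c << 31); /* c = top-most carry bit */
    }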
377
378@ void ecp_nistz256_sub(BN_ULONG r0[8],const BN_ULONG r1[8],
379@ const BN_ULONG r2[8]);
380.globl ecp_nistz256_sub
381.type ecp_nistz256_sub,%function
382.align 4
383ecp_nistz256_sub:
384 stmdb sp!,{r4-r12,lr}
385 bl __ecp_nistz256_sub
386#if __ARM_ARCH__>=5 || !defined(__thumb__)
387 ldmia sp!,{r4-r12,pc}
388#else
389 ldmia sp!,{r4-r12,lr}
390 bx lr @ interoperable with Thumb ISA:-)
391#endif
392.size ecp_nistz256_sub,.-ecp_nistz256_sub
393
394.type __ecp_nistz256_sub,%function
395.align 4
396__ecp_nistz256_sub:
397 str lr,[sp,#-4]! @ push lr
398
399 ldr $a0,[$a_ptr,#0]
400 ldr $a1,[$a_ptr,#4]
401 ldr $a2,[$a_ptr,#8]
402 ldr $a3,[$a_ptr,#12]
403 ldr $a4,[$a_ptr,#16]
404 ldr $t0,[$b_ptr,#0]
405 ldr $a5,[$a_ptr,#20]
406 ldr $t1,[$b_ptr,#4]
407 ldr $a6,[$a_ptr,#24]
408 ldr $t2,[$b_ptr,#8]
409 ldr $a7,[$a_ptr,#28]
410 ldr $t3,[$b_ptr,#12]
411 subs $a0,$a0,$t0
412 ldr $t0,[$b_ptr,#16]
413 sbcs $a1,$a1,$t1
414 ldr $t1,[$b_ptr,#20]
415 sbcs $a2,$a2,$t2
416 ldr $t2,[$b_ptr,#24]
417 sbcs $a3,$a3,$t3
418 ldr $t3,[$b_ptr,#28]
419 sbcs $a4,$a4,$t0
420 sbcs $a5,$a5,$t1
421 sbcs $a6,$a6,$t2
422 sbcs $a7,$a7,$t3
423 sbc $ff,$ff,$ff @ broadcast borrow bit
424 ldr lr,[sp],#4 @ pop lr
425
426.Lreduce_by_add:
427
428 @ if a-b borrows, add modulus.
429 @
430 @ Note that because mod has special form, i.e. consists of
431 @ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
432 @ broadcasting borrow bit to a register, $ff, and using it as
433 @ a whole or extracting single bit.
434
435 adds $a0,$a0,$ff @ add synthesized modulus
436 adcs $a1,$a1,$ff
437 str $a0,[$r_ptr,#0]
438 adcs $a2,$a2,$ff
439 str $a1,[$r_ptr,#4]
440 adcs $a3,$a3,#0
441 str $a2,[$r_ptr,#8]
442 adcs $a4,$a4,#0
443 str $a3,[$r_ptr,#12]
444 adcs $a5,$a5,#0
445 str $a4,[$r_ptr,#16]
446 adcs $a6,$a6,$ff,lsr#31
447 str $a5,[$r_ptr,#20]
448 adcs $a7,$a7,$ff
449 str $a6,[$r_ptr,#24]
450 str $a7,[$r_ptr,#28]
451
452 mov pc,lr
453.size __ecp_nistz256_sub,.-__ecp_nistz256_sub
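[Editor's sketch, not part of the deleted file.] .Lreduce_by_add is the mirror image of the earlier reduction: if a-b borrowed, add the modulus, again synthesized from the broadcast borrow. In C terms, with the same caveats and p256[] table as above:

    #include <stdint.h>

    /* r = (a - b) mod p, for a, b < p; p256[] as in the earlier sketch. */
    static void
    p256_sub_mod(uint32_t r[8], const uint32_t a[8], const uint32_t b[8])
    {
            uint64_t v, c;
            uint32_t borrow = 0, mask;
            int i;

            for (i = 0; i < 8; i++) {
                    v = (uint64_t)a[i] - b[i] - borrow;
                    r[i] = (uint32_t)v;
                    borrow = (uint32_t)(v >> 63);
            }
            mask = 0U - borrow;                     /* all ones iff a < b */
            for (c = 0, i = 0; i < 8; i++) {
                    c += (uint64_t)r[i] + (p256[i] & mask);
                    r[i] = (uint32_t)c;
                    c >>= 32;
            }
    }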
454
455@ void ecp_nistz256_neg(BN_ULONG r0[8],const BN_ULONG r1[8]);
456.globl ecp_nistz256_neg
457.type ecp_nistz256_neg,%function
458.align 4
459ecp_nistz256_neg:
460 stmdb sp!,{r4-r12,lr}
461 bl __ecp_nistz256_neg
462#if __ARM_ARCH__>=5 || !defined(__thumb__)
463 ldmia sp!,{r4-r12,pc}
464#else
465 ldmia sp!,{r4-r12,lr}
466 bx lr @ interoperable with Thumb ISA:-)
467#endif
468.size ecp_nistz256_neg,.-ecp_nistz256_neg
469
470.type __ecp_nistz256_neg,%function
471.align 4
472__ecp_nistz256_neg:
473 ldr $a0,[$a_ptr,#0]
474 eor $ff,$ff,$ff
475 ldr $a1,[$a_ptr,#4]
476 ldr $a2,[$a_ptr,#8]
477 subs $a0,$ff,$a0
478 ldr $a3,[$a_ptr,#12]
479 sbcs $a1,$ff,$a1
480 ldr $a4,[$a_ptr,#16]
481 sbcs $a2,$ff,$a2
482 ldr $a5,[$a_ptr,#20]
483 sbcs $a3,$ff,$a3
484 ldr $a6,[$a_ptr,#24]
485 sbcs $a4,$ff,$a4
486 ldr $a7,[$a_ptr,#28]
487 sbcs $a5,$ff,$a5
488 sbcs $a6,$ff,$a6
489 sbcs $a7,$ff,$a7
490 sbc $ff,$ff,$ff
491
492 b .Lreduce_by_add
493.size __ecp_nistz256_neg,.-__ecp_nistz256_neg
494___
495{
496my @acc=map("r$_",(3..11));
497my ($t0,$t1,$bj,$t2,$t3)=map("r$_",(0,1,2,12,14));
498
499$code.=<<___;
500@ void ecp_nistz256_sqr_mont(BN_ULONG r0[8],const BN_ULONG r1[8]);
501.globl ecp_nistz256_sqr_mont
502.type ecp_nistz256_sqr_mont,%function
503.align 4
504ecp_nistz256_sqr_mont:
505 mov $b_ptr,$a_ptr
506 b .Lecp_nistz256_mul_mont
507.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
508
509@ void ecp_nistz256_mul_mont(BN_ULONG r0[8],const BN_ULONG r1[8],
510@ const BN_ULONG r2[8]);
511.globl ecp_nistz256_mul_mont
512.type ecp_nistz256_mul_mont,%function
513.align 4
514ecp_nistz256_mul_mont:
515.Lecp_nistz256_mul_mont:
516 stmdb sp!,{r4-r12,lr}
517 bl __ecp_nistz256_mul_mont
518#if __ARM_ARCH__>=5 || !defined(__thumb__)
519 ldmia sp!,{r4-r12,pc}
520#else
521 ldmia sp!,{r4-r12,lr}
522 bx lr @ interoperable with Thumb ISA:-)
523#endif
524.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
525
526.type __ecp_nistz256_mul_mont,%function
527.align 4
528__ecp_nistz256_mul_mont:
529 stmdb sp!,{r0-r2,lr} @ make a copy of arguments too
530
531 ldr $bj,[$b_ptr,#0] @ b[0]
532 ldmia $a_ptr,{@acc[1]-@acc[8]}
533
534 umull @acc[0],$t3,@acc[1],$bj @ r[0]=a[0]*b[0]
535 stmdb sp!,{$acc[1]-@acc[8]} @ copy a[0-7] to stack, so
536 @ that it can be addressed
537 @ without spending register
538 @ on address
539 umull @acc[1],$t0,@acc[2],$bj @ r[1]=a[1]*b[0]
540 umull @acc[2],$t1,@acc[3],$bj
541 adds @acc[1],@acc[1],$t3 @ accumulate high part of mult
542 umull @acc[3],$t2,@acc[4],$bj
543 adcs @acc[2],@acc[2],$t0
544 umull @acc[4],$t3,@acc[5],$bj
545 adcs @acc[3],@acc[3],$t1
546 umull @acc[5],$t0,@acc[6],$bj
547 adcs @acc[4],@acc[4],$t2
548 umull @acc[6],$t1,@acc[7],$bj
549 adcs @acc[5],@acc[5],$t3
550 umull @acc[7],$t2,@acc[8],$bj
551 adcs @acc[6],@acc[6],$t0
552 adcs @acc[7],@acc[7],$t1
553 eor $t3,$t3,$t3 @ first overflow bit is zero
554 adc @acc[8],$t2,#0
555___
556for(my $i=1;$i<8;$i++) {
557my $t4=@acc[0];
558
559 # Reduction iteration is normally performed by accumulating
560 # result of multiplication of modulus by "magic" digit [and
561 # omitting least significant word, which is guaranteed to
562 # be 0], but thanks to special form of modulus and "magic"
563 # digit being equal to least significant word, it can be
564 # performed with additions and subtractions alone. Indeed:
565 #
566 # ffff.0001.0000.0000.0000.ffff.ffff.ffff
567 # * abcd
568 # + xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
569 #
570 # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
571 # rewrite above as:
572 #
573 # xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
574 # + abcd.0000.abcd.0000.0000.abcd.0000.0000.0000
575 # - abcd.0000.0000.0000.0000.0000.0000.abcd
576 #
577 # or marking redundant operations:
578 #
579 # xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.----
580 # + abcd.0000.abcd.0000.0000.abcd.----.----.----
581 # - abcd.----.----.----.----.----.----.----
582
583$code.=<<___;
584 @ multiplication-less reduction $i
585 adds @acc[3],@acc[3],@acc[0] @ r[3]+=r[0]
586 ldr $bj,[sp,#40] @ restore b_ptr
587 adcs @acc[4],@acc[4],#0 @ r[4]+=0
588 adcs @acc[5],@acc[5],#0 @ r[5]+=0
589 adcs @acc[6],@acc[6],@acc[0] @ r[6]+=r[0]
590 ldr $t1,[sp,#0] @ load a[0]
591 adcs @acc[7],@acc[7],#0 @ r[7]+=0
592 ldr $bj,[$bj,#4*$i] @ load b[i]
593 adcs @acc[8],@acc[8],@acc[0] @ r[8]+=r[0]
594 eor $t0,$t0,$t0
595 adc $t3,$t3,#0 @ overflow bit
596 subs @acc[7],@acc[7],@acc[0] @ r[7]-=r[0]
597 ldr $t2,[sp,#4] @ a[1]
598 sbcs @acc[8],@acc[8],#0 @ r[8]-=0
599 umlal @acc[1],$t0,$t1,$bj @ "r[0]"+=a[0]*b[i]
600 eor $t1,$t1,$t1
601 sbc @acc[0],$t3,#0 @ overflow bit, keep in mind
602 @ that netto result is
603 @ addition of a value which
604 @ makes underflow impossible
605
606 ldr $t3,[sp,#8] @ a[2]
607 umlal @acc[2],$t1,$t2,$bj @ "r[1]"+=a[1]*b[i]
608 str @acc[0],[sp,#36] @ temporarily offload overflow
609 eor $t2,$t2,$t2
610 ldr $t4,[sp,#12] @ a[3], $t4 is alias @acc[0]
611 umlal @acc[3],$t2,$t3,$bj @ "r[2]"+=a[2]*b[i]
612 eor $t3,$t3,$t3
613 adds @acc[2],@acc[2],$t0 @ accumulate high part of mult
614 ldr $t0,[sp,#16] @ a[4]
615 umlal @acc[4],$t3,$t4,$bj @ "r[3]"+=a[3]*b[i]
616 eor $t4,$t4,$t4
617 adcs @acc[3],@acc[3],$t1
618 ldr $t1,[sp,#20] @ a[5]
619 umlal @acc[5],$t4,$t0,$bj @ "r[4]"+=a[4]*b[i]
620 eor $t0,$t0,$t0
621 adcs @acc[4],@acc[4],$t2
622 ldr $t2,[sp,#24] @ a[6]
623 umlal @acc[6],$t0,$t1,$bj @ "r[5]"+=a[5]*b[i]
624 eor $t1,$t1,$t1
625 adcs @acc[5],@acc[5],$t3
626 ldr $t3,[sp,#28] @ a[7]
627 umlal @acc[7],$t1,$t2,$bj @ "r[6]"+=a[6]*b[i]
628 eor $t2,$t2,$t2
629 adcs @acc[6],@acc[6],$t4
630 ldr @acc[0],[sp,#36] @ restore overflow bit
631 umlal @acc[8],$t2,$t3,$bj @ "r[7]"+=a[7]*b[i]
632 eor $t3,$t3,$t3
633 adcs @acc[7],@acc[7],$t0
634 adcs @acc[8],@acc[8],$t1
635 adcs @acc[0],$acc[0],$t2
636 adc $t3,$t3,#0 @ new overflow bit
637___
638 push(@acc,shift(@acc)); # rotate registers, so that
639 # "r[i]" becomes r[i]
640}
641$code.=<<___;
642 @ last multiplication-less reduction
643 adds @acc[3],@acc[3],@acc[0]
644 ldr $r_ptr,[sp,#32] @ restore r_ptr
645 adcs @acc[4],@acc[4],#0
646 adcs @acc[5],@acc[5],#0
647 adcs @acc[6],@acc[6],@acc[0]
648 adcs @acc[7],@acc[7],#0
649 adcs @acc[8],@acc[8],@acc[0]
650 adc $t3,$t3,#0
651 subs @acc[7],@acc[7],@acc[0]
652 sbcs @acc[8],@acc[8],#0
653 sbc @acc[0],$t3,#0 @ overflow bit
654
655 @ Final step is "if result > mod, subtract mod", but we do it
656 @ "other way around", namely subtract modulus from result
657 @ and if it borrowed, add modulus back.
658
659 adds @acc[1],@acc[1],#1 @ subs @acc[1],@acc[1],#-1
660 adcs @acc[2],@acc[2],#0 @ sbcs @acc[2],@acc[2],#-1
661 adcs @acc[3],@acc[3],#0 @ sbcs @acc[3],@acc[3],#-1
662 sbcs @acc[4],@acc[4],#0
663 sbcs @acc[5],@acc[5],#0
664 sbcs @acc[6],@acc[6],#0
665 sbcs @acc[7],@acc[7],#1
666 adcs @acc[8],@acc[8],#0 @ sbcs @acc[8],@acc[8],#-1
667 ldr lr,[sp,#44] @ restore lr
668 sbc @acc[0],@acc[0],#0 @ broadcast borrow bit
669 add sp,sp,#48
670
671 @ Note that because mod has special form, i.e. consists of
672 @ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
673 @ broadcasting borrow bit to a register, @acc[0], and using it as
674 @ a whole or extracting single bit.
675
676 adds @acc[1],@acc[1],@acc[0] @ add modulus or zero
677 adcs @acc[2],@acc[2],@acc[0]
678 str @acc[1],[$r_ptr,#0]
679 adcs @acc[3],@acc[3],@acc[0]
680 str @acc[2],[$r_ptr,#4]
681 adcs @acc[4],@acc[4],#0
682 str @acc[3],[$r_ptr,#8]
683 adcs @acc[5],@acc[5],#0
684 str @acc[4],[$r_ptr,#12]
685 adcs @acc[6],@acc[6],#0
686 str @acc[5],[$r_ptr,#16]
687 adcs @acc[7],@acc[7],@acc[0],lsr#31
688 str @acc[6],[$r_ptr,#20]
689 adc @acc[8],@acc[8],@acc[0]
690 str @acc[7],[$r_ptr,#24]
691 str @acc[8],[$r_ptr,#28]
692
693 mov pc,lr
694.size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont
695___
696}
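[Editor's sketch, not part of the deleted file.] The "multiplication-less reduction" commented inside the loop above works because p = 2^256 - 2^224 + 2^192 + 2^96 - 1 and the Montgomery digit equals the least significant accumulator word t, so adding t*p collapses into a few adds and subs. A hypothetical C rendering of one such step; the real code also folds in a[0..7]*b[i] and rotates the accumulator, which is omitted here:

    #include <stdint.h>

    /*
     * One reduction step: add t*p to the 9-word accumulator acc[], where
     * t = acc[0].  Since t*p = t*2^256 - t*2^224 + t*2^192 + t*2^96 - t,
     * the -t term simply clears acc[0], which the caller then drops
     * (the register rotation in the Perl loop).  *top collects the
     * carry/borrow out of acc[8].
     */
    static void
    p256_reduce_step(uint32_t acc[9], uint32_t *top)
    {
            uint32_t t = acc[0];
            uint64_t c, v;
            uint32_t borrow;

            /* acc += t*2^96 + t*2^192 + t*2^256 */
            c  = (uint64_t)acc[3] + t; acc[3] = (uint32_t)c; c >>= 32;
            c += acc[4];               acc[4] = (uint32_t)c; c >>= 32;
            c += acc[5];               acc[5] = (uint32_t)c; c >>= 32;
            c += (uint64_t)acc[6] + t; acc[6] = (uint32_t)c; c >>= 32;
            c += acc[7];               acc[7] = (uint32_t)c; c >>= 32;
            c += (uint64_t)acc[8] + t; acc[8] = (uint32_t)c;
            *top += (uint32_t)(c >> 32);

            /* acc -= t*2^224 */
            v = (uint64_t)acc[7] - t;      acc[7] = (uint32_t)v;
            borrow = (uint32_t)(v >> 63);
            v = (uint64_t)acc[8] - borrow; acc[8] = (uint32_t)v;
            *top -= (uint32_t)(v >> 63);

            /* acc -= t clears the low word exactly, since t == acc[0] */
            acc[0] = 0;
    }

As the original comment notes, the net effect of a step is the addition of a non-negative quantity, so the top word never underflows.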
697
698{
699my ($out,$inp,$index,$mask)=map("r$_",(0..3));
700$code.=<<___;
701@ void ecp_nistz256_select_w5(P256_POINT *r0,const void *r1,
702@ int r2);
703.globl ecp_nistz256_select_w5
704.type ecp_nistz256_select_w5,%function
705.align 5
706ecp_nistz256_select_w5:
707 stmdb sp!,{r4-r11}
708
709 cmp $index,#0
710 mov $mask,#0
711#ifdef __thumb2__
712 itt ne
713#endif
714 subne $index,$index,#1
715 movne $mask,#-1
716 add $inp,$inp,$index,lsl#2
717
718 ldr r4,[$inp,#64*0]
719 ldr r5,[$inp,#64*1]
720 ldr r6,[$inp,#64*2]
721 and r4,r4,$mask
722 ldr r7,[$inp,#64*3]
723 and r5,r5,$mask
724 ldr r8,[$inp,#64*4]
725 and r6,r6,$mask
726 ldr r9,[$inp,#64*5]
727 and r7,r7,$mask
728 ldr r10,[$inp,#64*6]
729 and r8,r8,$mask
730 ldr r11,[$inp,#64*7]
731 add $inp,$inp,#64*8
732 and r9,r9,$mask
733 and r10,r10,$mask
734 and r11,r11,$mask
735 stmia $out!,{r4-r11} @ X
736
737 ldr r4,[$inp,#64*0]
738 ldr r5,[$inp,#64*1]
739 ldr r6,[$inp,#64*2]
740 and r4,r4,$mask
741 ldr r7,[$inp,#64*3]
742 and r5,r5,$mask
743 ldr r8,[$inp,#64*4]
744 and r6,r6,$mask
745 ldr r9,[$inp,#64*5]
746 and r7,r7,$mask
747 ldr r10,[$inp,#64*6]
748 and r8,r8,$mask
749 ldr r11,[$inp,#64*7]
750 add $inp,$inp,#64*8
751 and r9,r9,$mask
752 and r10,r10,$mask
753 and r11,r11,$mask
754 stmia $out!,{r4-r11} @ Y
755
756 ldr r4,[$inp,#64*0]
757 ldr r5,[$inp,#64*1]
758 ldr r6,[$inp,#64*2]
759 and r4,r4,$mask
760 ldr r7,[$inp,#64*3]
761 and r5,r5,$mask
762 ldr r8,[$inp,#64*4]
763 and r6,r6,$mask
764 ldr r9,[$inp,#64*5]
765 and r7,r7,$mask
766 ldr r10,[$inp,#64*6]
767 and r8,r8,$mask
768 ldr r11,[$inp,#64*7]
769 and r9,r9,$mask
770 and r10,r10,$mask
771 and r11,r11,$mask
772 stmia $out,{r4-r11} @ Z
773
774 ldmia sp!,{r4-r11}
775#if __ARM_ARCH__>=5 || defined(__thumb__)
776 bx lr
777#else
778 mov pc,lr
779#endif
780.size ecp_nistz256_select_w5,.-ecp_nistz256_select_w5
781
782@ void ecp_nistz256_select_w7(P256_POINT_AFFINE *r0,const void *r1,
783@ int r2);
784.globl ecp_nistz256_select_w7
785.type ecp_nistz256_select_w7,%function
786.align 5
787ecp_nistz256_select_w7:
788 stmdb sp!,{r4-r7}
789
790 cmp $index,#0
791 mov $mask,#0
792#ifdef __thumb2__
793 itt ne
794#endif
795 subne $index,$index,#1
796 movne $mask,#-1
797 add $inp,$inp,$index
798 mov $index,#64/4
799 nop
800.Loop_select_w7:
801 ldrb r4,[$inp,#64*0]
802 subs $index,$index,#1
803 ldrb r5,[$inp,#64*1]
804 ldrb r6,[$inp,#64*2]
805 ldrb r7,[$inp,#64*3]
806 add $inp,$inp,#64*4
807 orr r4,r4,r5,lsl#8
808 orr r4,r4,r6,lsl#16
809 orr r4,r4,r7,lsl#24
810 and r4,r4,$mask
811 str r4,[$out],#4
812 bne .Loop_select_w7
813
814 ldmia sp!,{r4-r7}
815#if __ARM_ARCH__>=5 || defined(__thumb__)
816 bx lr
817#else
818 mov pc,lr
819#endif
820.size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
821___
822}
823if (0) {
824# In comparison to integer-only equivalent of below subroutine:
825#
826# Cortex-A8 +10%
827# Cortex-A9 -10%
828# Snapdragon S4 +5%
829#
830# As not all time is spent in multiplication, overall impact is deemed
831# too low to care about.
832
833my ($A0,$A1,$A2,$A3,$Bi,$zero,$temp)=map("d$_",(0..7));
834my $mask="q4";
835my $mult="q5";
836my @AxB=map("q$_",(8..15));
837
838my ($rptr,$aptr,$bptr,$toutptr)=map("r$_",(0..3));
839
840$code.=<<___;
841#if __ARM_ARCH__>=7
842.fpu neon
843
844.globl ecp_nistz256_mul_mont_neon
845.type ecp_nistz256_mul_mont_neon,%function
846.align 5
847ecp_nistz256_mul_mont_neon:
848 mov ip,sp
849 stmdb sp!,{r4-r9}
850 vstmdb sp!,{q4-q5} @ ABI specification says so
851
852 sub $toutptr,sp,#40
853 vld1.32 {${Bi}[0]},[$bptr,:32]!
854 veor $zero,$zero,$zero
855 vld1.32 {$A0-$A3}, [$aptr] @ can't specify :32 :-(
856 vzip.16 $Bi,$zero
857 mov sp,$toutptr @ alloca
858 vmov.i64 $mask,#0xffff
859
860 vmull.u32 @AxB[0],$Bi,${A0}[0]
861 vmull.u32 @AxB[1],$Bi,${A0}[1]
862 vmull.u32 @AxB[2],$Bi,${A1}[0]
863 vmull.u32 @AxB[3],$Bi,${A1}[1]
864 vshr.u64 $temp,@AxB[0]#lo,#16
865 vmull.u32 @AxB[4],$Bi,${A2}[0]
866 vadd.u64 @AxB[0]#hi,@AxB[0]#hi,$temp
867 vmull.u32 @AxB[5],$Bi,${A2}[1]
868 vshr.u64 $temp,@AxB[0]#hi,#16 @ upper 32 bits of a[0]*b[0]
869 vmull.u32 @AxB[6],$Bi,${A3}[0]
870 vand.u64 @AxB[0],@AxB[0],$mask @ lower 32 bits of a[0]*b[0]
871 vmull.u32 @AxB[7],$Bi,${A3}[1]
872___
873for($i=1;$i<8;$i++) {
874$code.=<<___;
875 vld1.32 {${Bi}[0]},[$bptr,:32]!
876 veor $zero,$zero,$zero
877 vadd.u64 @AxB[1]#lo,@AxB[1]#lo,$temp @ reduction
878 vshl.u64 $mult,@AxB[0],#32
879 vadd.u64 @AxB[3],@AxB[3],@AxB[0]
880 vsub.u64 $mult,$mult,@AxB[0]
881 vzip.16 $Bi,$zero
882 vadd.u64 @AxB[6],@AxB[6],@AxB[0]
883 vadd.u64 @AxB[7],@AxB[7],$mult
884___
885 push(@AxB,shift(@AxB));
886$code.=<<___;
887 vmlal.u32 @AxB[0],$Bi,${A0}[0]
888 vmlal.u32 @AxB[1],$Bi,${A0}[1]
889 vmlal.u32 @AxB[2],$Bi,${A1}[0]
890 vmlal.u32 @AxB[3],$Bi,${A1}[1]
891 vshr.u64 $temp,@AxB[0]#lo,#16
892 vmlal.u32 @AxB[4],$Bi,${A2}[0]
893 vadd.u64 @AxB[0]#hi,@AxB[0]#hi,$temp
894 vmlal.u32 @AxB[5],$Bi,${A2}[1]
895 vshr.u64 $temp,@AxB[0]#hi,#16 @ upper 33 bits of a[0]*b[i]+t[0]
896 vmlal.u32 @AxB[6],$Bi,${A3}[0]
897 vand.u64 @AxB[0],@AxB[0],$mask @ lower 32 bits of a[0]*b[0]
898 vmull.u32 @AxB[7],$Bi,${A3}[1]
899___
900}
901$code.=<<___;
902 vadd.u64 @AxB[1]#lo,@AxB[1]#lo,$temp @ last reduction
903 vshl.u64 $mult,@AxB[0],#32
904 vadd.u64 @AxB[3],@AxB[3],@AxB[0]
905 vsub.u64 $mult,$mult,@AxB[0]
906 vadd.u64 @AxB[6],@AxB[6],@AxB[0]
907 vadd.u64 @AxB[7],@AxB[7],$mult
908
909 vshr.u64 $temp,@AxB[1]#lo,#16 @ convert
910 vadd.u64 @AxB[1]#hi,@AxB[1]#hi,$temp
911 vshr.u64 $temp,@AxB[1]#hi,#16
912 vzip.16 @AxB[1]#lo,@AxB[1]#hi
913___
914foreach (2..7) {
915$code.=<<___;
916 vadd.u64 @AxB[$_]#lo,@AxB[$_]#lo,$temp
917 vst1.32 {@AxB[$_-1]#lo[0]},[$toutptr,:32]!
918 vshr.u64 $temp,@AxB[$_]#lo,#16
919 vadd.u64 @AxB[$_]#hi,@AxB[$_]#hi,$temp
920 vshr.u64 $temp,@AxB[$_]#hi,#16
921 vzip.16 @AxB[$_]#lo,@AxB[$_]#hi
922___
923}
924$code.=<<___;
925 vst1.32 {@AxB[7]#lo[0]},[$toutptr,:32]!
926 vst1.32 {$temp},[$toutptr] @ upper 33 bits
927
928 ldr r1,[sp,#0]
929 ldr r2,[sp,#4]
930 ldr r3,[sp,#8]
931 subs r1,r1,#-1
932 ldr r4,[sp,#12]
933 sbcs r2,r2,#-1
934 ldr r5,[sp,#16]
935 sbcs r3,r3,#-1
936 ldr r6,[sp,#20]
937 sbcs r4,r4,#0
938 ldr r7,[sp,#24]
939 sbcs r5,r5,#0
940 ldr r8,[sp,#28]
941 sbcs r6,r6,#0
942 ldr r9,[sp,#32] @ top-most bit
943 sbcs r7,r7,#1
944 sub sp,ip,#40+16
945 sbcs r8,r8,#-1
946 sbc r9,r9,#0
947 vldmia sp!,{q4-q5}
948
949 adds r1,r1,r9
950 adcs r2,r2,r9
951 str r1,[$rptr,#0]
952 adcs r3,r3,r9
953 str r2,[$rptr,#4]
954 adcs r4,r4,#0
955 str r3,[$rptr,#8]
956 adcs r5,r5,#0
957 str r4,[$rptr,#12]
958 adcs r6,r6,#0
959 str r5,[$rptr,#16]
960 adcs r7,r7,r9,lsr#31
961 str r6,[$rptr,#20]
962 adcs r8,r8,r9
963 str r7,[$rptr,#24]
964 str r8,[$rptr,#28]
965
966 ldmia sp!,{r4-r9}
967 bx lr
968.size ecp_nistz256_mul_mont_neon,.-ecp_nistz256_mul_mont_neon
969#endif
970___
971}
972
973{{{
974########################################################################
975# Below $aN assignment matches order in which 256-bit result appears in
976# register bank at return from __ecp_nistz256_mul_mont, so that we can
977# skip over reloading it from memory. This means that below functions
978# use custom calling sequence accepting 256-bit input in registers,
979# output pointer in r0, $r_ptr, and optional pointer in r2, $b_ptr.
980#
981# See their "normal" counterparts for insights on calculations.
982
983my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7,
984 $t0,$t1,$t2,$t3)=map("r$_",(11,3..10,12,14,1));
985my $ff=$b_ptr;
986
987$code.=<<___;
988.type __ecp_nistz256_sub_from,%function
989.align 5
990__ecp_nistz256_sub_from:
991 str lr,[sp,#-4]! @ push lr
992
993 ldr $t0,[$b_ptr,#0]
994 ldr $t1,[$b_ptr,#4]
995 ldr $t2,[$b_ptr,#8]
996 ldr $t3,[$b_ptr,#12]
997 subs $a0,$a0,$t0
998 ldr $t0,[$b_ptr,#16]
999 sbcs $a1,$a1,$t1
1000 ldr $t1,[$b_ptr,#20]
1001 sbcs $a2,$a2,$t2
1002 ldr $t2,[$b_ptr,#24]
1003 sbcs $a3,$a3,$t3
1004 ldr $t3,[$b_ptr,#28]
1005 sbcs $a4,$a4,$t0
1006 sbcs $a5,$a5,$t1
1007 sbcs $a6,$a6,$t2
1008 sbcs $a7,$a7,$t3
1009 sbc $ff,$ff,$ff @ broadcast borrow bit
1010 ldr lr,[sp],#4 @ pop lr
1011
1012 adds $a0,$a0,$ff @ add synthesized modulus
1013 adcs $a1,$a1,$ff
1014 str $a0,[$r_ptr,#0]
1015 adcs $a2,$a2,$ff
1016 str $a1,[$r_ptr,#4]
1017 adcs $a3,$a3,#0
1018 str $a2,[$r_ptr,#8]
1019 adcs $a4,$a4,#0
1020 str $a3,[$r_ptr,#12]
1021 adcs $a5,$a5,#0
1022 str $a4,[$r_ptr,#16]
1023 adcs $a6,$a6,$ff,lsr#31
1024 str $a5,[$r_ptr,#20]
1025 adcs $a7,$a7,$ff
1026 str $a6,[$r_ptr,#24]
1027 str $a7,[$r_ptr,#28]
1028
1029 mov pc,lr
1030.size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from
1031
1032.type __ecp_nistz256_sub_morf,%function
1033.align 5
1034__ecp_nistz256_sub_morf:
1035 str lr,[sp,#-4]! @ push lr
1036
1037 ldr $t0,[$b_ptr,#0]
1038 ldr $t1,[$b_ptr,#4]
1039 ldr $t2,[$b_ptr,#8]
1040 ldr $t3,[$b_ptr,#12]
1041 subs $a0,$t0,$a0
1042 ldr $t0,[$b_ptr,#16]
1043 sbcs $a1,$t1,$a1
1044 ldr $t1,[$b_ptr,#20]
1045 sbcs $a2,$t2,$a2
1046 ldr $t2,[$b_ptr,#24]
1047 sbcs $a3,$t3,$a3
1048 ldr $t3,[$b_ptr,#28]
1049 sbcs $a4,$t0,$a4
1050 sbcs $a5,$t1,$a5
1051 sbcs $a6,$t2,$a6
1052 sbcs $a7,$t3,$a7
1053 sbc $ff,$ff,$ff @ broadcast borrow bit
1054 ldr lr,[sp],#4 @ pop lr
1055
1056 adds $a0,$a0,$ff @ add synthesized modulus
1057 adcs $a1,$a1,$ff
1058 str $a0,[$r_ptr,#0]
1059 adcs $a2,$a2,$ff
1060 str $a1,[$r_ptr,#4]
1061 adcs $a3,$a3,#0
1062 str $a2,[$r_ptr,#8]
1063 adcs $a4,$a4,#0
1064 str $a3,[$r_ptr,#12]
1065 adcs $a5,$a5,#0
1066 str $a4,[$r_ptr,#16]
1067 adcs $a6,$a6,$ff,lsr#31
1068 str $a5,[$r_ptr,#20]
1069 adcs $a7,$a7,$ff
1070 str $a6,[$r_ptr,#24]
1071 str $a7,[$r_ptr,#28]
1072
1073 mov pc,lr
1074.size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf
1075
1076.type __ecp_nistz256_add_self,%function
1077.align 4
1078__ecp_nistz256_add_self:
1079 adds $a0,$a0,$a0 @ a[0:7]+=a[0:7]
1080 adcs $a1,$a1,$a1
1081 adcs $a2,$a2,$a2
1082 adcs $a3,$a3,$a3
1083 adcs $a4,$a4,$a4
1084 adcs $a5,$a5,$a5
1085 adcs $a6,$a6,$a6
1086 mov $ff,#0
1087 adcs $a7,$a7,$a7
1088 adc $ff,$ff,#0
1089
1090 @ if a+b >= modulus, subtract modulus.
1091 @
1092 @ But since comparison implies subtraction, we subtract
1093 @ modulus and then add it back if subtraction borrowed.
1094
1095 subs $a0,$a0,#-1
1096 sbcs $a1,$a1,#-1
1097 sbcs $a2,$a2,#-1
1098 sbcs $a3,$a3,#0
1099 sbcs $a4,$a4,#0
1100 sbcs $a5,$a5,#0
1101 sbcs $a6,$a6,#1
1102 sbcs $a7,$a7,#-1
1103 sbc $ff,$ff,#0
1104
1105 @ Note that because mod has special form, i.e. consists of
1106 @ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
1107 @ using value of borrow as a whole or extracting single bit.
1108 @ Follow $ff register...
1109
1110 adds $a0,$a0,$ff @ add synthesized modulus
1111 adcs $a1,$a1,$ff
1112 str $a0,[$r_ptr,#0]
1113 adcs $a2,$a2,$ff
1114 str $a1,[$r_ptr,#4]
1115 adcs $a3,$a3,#0
1116 str $a2,[$r_ptr,#8]
1117 adcs $a4,$a4,#0
1118 str $a3,[$r_ptr,#12]
1119 adcs $a5,$a5,#0
1120 str $a4,[$r_ptr,#16]
1121 adcs $a6,$a6,$ff,lsr#31
1122 str $a5,[$r_ptr,#20]
1123 adcs $a7,$a7,$ff
1124 str $a6,[$r_ptr,#24]
1125 str $a7,[$r_ptr,#28]
1126
1127 mov pc,lr
1128.size __ecp_nistz256_add_self,.-__ecp_nistz256_add_self
1129
1130___
1131
1132########################################################################
1133# following subroutines are "literal" implementation of those found in
1134# ecp_nistz256.c
1135#
1136########################################################################
1137# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
1138#
1139{
1140my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));
1141# above map() describes stack layout with 5 temporary
1142# 256-bit vectors on top. Then note that we push
1143# starting from r0, which means that we have copy of
1144# input arguments just below these temporary vectors.
1145
1146$code.=<<___;
1147.globl ecp_nistz256_point_double
1148.type ecp_nistz256_point_double,%function
1149.align 5
1150ecp_nistz256_point_double:
1151 stmdb sp!,{r0-r12,lr} @ push from r0, unusual, but intentional
1152 sub sp,sp,#32*5
1153
1154.Lpoint_double_shortcut:
1155 add r3,sp,#$in_x
1156 ldmia $a_ptr!,{r4-r11} @ copy in_x
1157 stmia r3,{r4-r11}
1158
1159 add $r_ptr,sp,#$S
1160 bl __ecp_nistz256_mul_by_2 @ p256_mul_by_2(S, in_y);
1161
1162 add $b_ptr,$a_ptr,#32
1163 add $a_ptr,$a_ptr,#32
1164 add $r_ptr,sp,#$Zsqr
1165 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Zsqr, in_z);
1166
1167 add $a_ptr,sp,#$S
1168 add $b_ptr,sp,#$S
1169 add $r_ptr,sp,#$S
1170 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(S, S);
1171
1172 ldr $b_ptr,[sp,#32*5+4]
1173 add $a_ptr,$b_ptr,#32
1174 add $b_ptr,$b_ptr,#64
1175 add $r_ptr,sp,#$tmp0
1176 bl __ecp_nistz256_mul_mont @ p256_mul_mont(tmp0, in_z, in_y);
1177
1178 ldr $r_ptr,[sp,#32*5]
1179 add $r_ptr,$r_ptr,#64
1180 bl __ecp_nistz256_add_self @ p256_mul_by_2(res_z, tmp0);
1181
1182 add $a_ptr,sp,#$in_x
1183 add $b_ptr,sp,#$Zsqr
1184 add $r_ptr,sp,#$M
1185 bl __ecp_nistz256_add @ p256_add(M, in_x, Zsqr);
1186
1187 add $a_ptr,sp,#$in_x
1188 add $b_ptr,sp,#$Zsqr
1189 add $r_ptr,sp,#$Zsqr
1190 bl __ecp_nistz256_sub @ p256_sub(Zsqr, in_x, Zsqr);
1191
1192 add $a_ptr,sp,#$S
1193 add $b_ptr,sp,#$S
1194 add $r_ptr,sp,#$tmp0
1195 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(tmp0, S);
1196
1197 add $a_ptr,sp,#$Zsqr
1198 add $b_ptr,sp,#$M
1199 add $r_ptr,sp,#$M
1200 bl __ecp_nistz256_mul_mont @ p256_mul_mont(M, M, Zsqr);
1201
1202 ldr $r_ptr,[sp,#32*5]
1203 add $a_ptr,sp,#$tmp0
1204 add $r_ptr,$r_ptr,#32
1205 bl __ecp_nistz256_div_by_2 @ p256_div_by_2(res_y, tmp0);
1206
1207 add $a_ptr,sp,#$M
1208 add $r_ptr,sp,#$M
1209 bl __ecp_nistz256_mul_by_3 @ p256_mul_by_3(M, M);
1210
1211 add $a_ptr,sp,#$in_x
1212 add $b_ptr,sp,#$S
1213 add $r_ptr,sp,#$S
1214 bl __ecp_nistz256_mul_mont @ p256_mul_mont(S, S, in_x);
1215
1216 add $r_ptr,sp,#$tmp0
1217 bl __ecp_nistz256_add_self @ p256_mul_by_2(tmp0, S);
1218
1219 ldr $r_ptr,[sp,#32*5]
1220 add $a_ptr,sp,#$M
1221 add $b_ptr,sp,#$M
1222 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(res_x, M);
1223
1224 add $b_ptr,sp,#$tmp0
1225 bl __ecp_nistz256_sub_from @ p256_sub(res_x, res_x, tmp0);
1226
1227 add $b_ptr,sp,#$S
1228 add $r_ptr,sp,#$S
1229 bl __ecp_nistz256_sub_morf @ p256_sub(S, S, res_x);
1230
1231 add $a_ptr,sp,#$M
1232 add $b_ptr,sp,#$S
1233 bl __ecp_nistz256_mul_mont @ p256_mul_mont(S, S, M);
1234
1235 ldr $r_ptr,[sp,#32*5]
1236 add $b_ptr,$r_ptr,#32
1237 add $r_ptr,$r_ptr,#32
1238 bl __ecp_nistz256_sub_from @ p256_sub(res_y, S, res_y);
1239
1240 add sp,sp,#32*5+16 @ +16 means "skip even over saved r0-r3"
1241#if __ARM_ARCH__>=5 || !defined(__thumb__)
1242 ldmia sp!,{r4-r12,pc}
1243#else
1244 ldmia sp!,{r4-r12,lr}
1245 bx lr @ interoperable with Thumb ISA:-)
1246#endif
1247.size ecp_nistz256_point_double,.-ecp_nistz256_point_double
1248___
1249}
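[Editor's sketch, not part of the deleted file.] Collecting the inline comments, the doubling routine above performs the following sequence of field operations. The p256_* names are the ones used in those comments; the prototypes are illustrative stand-ins for the Montgomery-domain helpers they refer to, not code taken from ecp_nistz256.c:

    #include <stdint.h>

    typedef uint32_t felem[8];

    extern void p256_mul_mont(felem r, const felem a, const felem b);
    extern void p256_sqr_mont(felem r, const felem a);
    extern void p256_add(felem r, const felem a, const felem b);
    extern void p256_sub(felem r, const felem a, const felem b);
    extern void p256_mul_by_2(felem r, const felem a);
    extern void p256_mul_by_3(felem r, const felem a);
    extern void p256_div_by_2(felem r, const felem a);

    /* Jacobian point doubling, step for step as in the assembly above. */
    static void
    point_double_sketch(felem res_x, felem res_y, felem res_z,
        const felem in_x, const felem in_y, const felem in_z)
    {
            felem S, M, Zsqr, tmp0;

            p256_mul_by_2(S, in_y);
            p256_sqr_mont(Zsqr, in_z);
            p256_sqr_mont(S, S);
            p256_mul_mont(tmp0, in_z, in_y);
            p256_mul_by_2(res_z, tmp0);
            p256_add(M, in_x, Zsqr);
            p256_sub(Zsqr, in_x, Zsqr);
            p256_sqr_mont(tmp0, S);
            p256_mul_mont(M, M, Zsqr);
            p256_div_by_2(res_y, tmp0);
            p256_mul_by_3(M, M);
            p256_mul_mont(S, S, in_x);
            p256_mul_by_2(tmp0, S);
            p256_sqr_mont(res_x, M);
            p256_sub(res_x, res_x, tmp0);
            p256_sub(S, S, res_x);
            p256_mul_mont(S, S, M);
            p256_sub(res_y, S, res_y);
    }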
1250
1251########################################################################
1252# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
1253# const P256_POINT *in2);
1254{
1255my ($res_x,$res_y,$res_z,
1256 $in1_x,$in1_y,$in1_z,
1257 $in2_x,$in2_y,$in2_z,
1258 $H,$Hsqr,$R,$Rsqr,$Hcub,
1259 $U1,$U2,$S1,$S2)=map(32*$_,(0..17));
1260my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
1261# above map() describes stack layout with 18 temporary
1262# 256-bit vectors on top. Then note that we push
1263# starting from r0, which means that we have copy of
1264# input arguments just below these temporary vectors.
1265# We use three of them for !in1infty, !in2intfy and
1266# result of check for zero.
1267
1268$code.=<<___;
1269.globl ecp_nistz256_point_add
1270.type ecp_nistz256_point_add,%function
1271.align 5
1272ecp_nistz256_point_add:
1273 stmdb sp!,{r0-r12,lr} @ push from r0, unusual, but intentional
1274 sub sp,sp,#32*18+16
1275
1276 ldmia $b_ptr!,{r4-r11} @ copy in2_x
1277 add r3,sp,#$in2_x
1278 stmia r3!,{r4-r11}
1279 ldmia $b_ptr!,{r4-r11} @ copy in2_y
1280 stmia r3!,{r4-r11}
1281 ldmia $b_ptr,{r4-r11} @ copy in2_z
1282 orr r12,r4,r5
1283 orr r12,r12,r6
1284 orr r12,r12,r7
1285 orr r12,r12,r8
1286 orr r12,r12,r9
1287 orr r12,r12,r10
1288 orr r12,r12,r11
1289 cmp r12,#0
1290#ifdef __thumb2__
1291 it ne
1292#endif
1293 movne r12,#-1
1294 stmia r3,{r4-r11}
1295 str r12,[sp,#32*18+8] @ !in2infty
1296
1297 ldmia $a_ptr!,{r4-r11} @ copy in1_x
1298 add r3,sp,#$in1_x
1299 stmia r3!,{r4-r11}
1300 ldmia $a_ptr!,{r4-r11} @ copy in1_y
1301 stmia r3!,{r4-r11}
1302 ldmia $a_ptr,{r4-r11} @ copy in1_z
1303 orr r12,r4,r5
1304 orr r12,r12,r6
1305 orr r12,r12,r7
1306 orr r12,r12,r8
1307 orr r12,r12,r9
1308 orr r12,r12,r10
1309 orr r12,r12,r11
1310 cmp r12,#0
1311#ifdef __thumb2__
1312 it ne
1313#endif
1314 movne r12,#-1
1315 stmia r3,{r4-r11}
1316 str r12,[sp,#32*18+4] @ !in1infty
1317
1318 add $a_ptr,sp,#$in2_z
1319 add $b_ptr,sp,#$in2_z
1320 add $r_ptr,sp,#$Z2sqr
1321 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Z2sqr, in2_z);
1322
1323 add $a_ptr,sp,#$in1_z
1324 add $b_ptr,sp,#$in1_z
1325 add $r_ptr,sp,#$Z1sqr
1326 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Z1sqr, in1_z);
1327
1328 add $a_ptr,sp,#$in2_z
1329 add $b_ptr,sp,#$Z2sqr
1330 add $r_ptr,sp,#$S1
1331 bl __ecp_nistz256_mul_mont @ p256_mul_mont(S1, Z2sqr, in2_z);
1332
1333 add $a_ptr,sp,#$in1_z
1334 add $b_ptr,sp,#$Z1sqr
1335 add $r_ptr,sp,#$S2
1336 bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, Z1sqr, in1_z);
1337
1338 add $a_ptr,sp,#$in1_y
1339 add $b_ptr,sp,#$S1
1340 add $r_ptr,sp,#$S1
1341 bl __ecp_nistz256_mul_mont @ p256_mul_mont(S1, S1, in1_y);
1342
1343 add $a_ptr,sp,#$in2_y
1344 add $b_ptr,sp,#$S2
1345 add $r_ptr,sp,#$S2
1346 bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, S2, in2_y);
1347
1348 add $b_ptr,sp,#$S1
1349 add $r_ptr,sp,#$R
1350 bl __ecp_nistz256_sub_from @ p256_sub(R, S2, S1);
1351
1352 orr $a0,$a0,$a1 @ see if result is zero
1353 orr $a2,$a2,$a3
1354 orr $a4,$a4,$a5
1355 orr $a0,$a0,$a2
1356 orr $a4,$a4,$a6
1357 orr $a0,$a0,$a7
1358 add $a_ptr,sp,#$in1_x
1359 orr $a0,$a0,$a4
1360 add $b_ptr,sp,#$Z2sqr
1361 str $a0,[sp,#32*18+12]
1362
1363 add $r_ptr,sp,#$U1
1364 bl __ecp_nistz256_mul_mont @ p256_mul_mont(U1, in1_x, Z2sqr);
1365
1366 add $a_ptr,sp,#$in2_x
1367 add $b_ptr,sp,#$Z1sqr
1368 add $r_ptr,sp,#$U2
1369 bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, in2_x, Z1sqr);
1370
1371 add $b_ptr,sp,#$U1
1372 add $r_ptr,sp,#$H
1373 bl __ecp_nistz256_sub_from @ p256_sub(H, U2, U1);
1374
1375 orr $a0,$a0,$a1 @ see if result is zero
1376 orr $a2,$a2,$a3
1377 orr $a4,$a4,$a5
1378 orr $a0,$a0,$a2
1379 orr $a4,$a4,$a6
1380 orr $a0,$a0,$a7
1381 orrs $a0,$a0,$a4
1382
1383 bne .Ladd_proceed @ is_equal(U1,U2)?
1384
1385 ldr $t0,[sp,#32*18+4]
1386 ldr $t1,[sp,#32*18+8]
1387 ldr $t2,[sp,#32*18+12]
1388 tst $t0,$t1
1389 beq .Ladd_proceed @ (in1infty || in2infty)?
1390 tst $t2,$t2
1391 beq .Ladd_double @ is_equal(S1,S2)?
1392
1393 ldr $r_ptr,[sp,#32*18+16]
1394 eor r4,r4,r4
1395 eor r5,r5,r5
1396 eor r6,r6,r6
1397 eor r7,r7,r7
1398 eor r8,r8,r8
1399 eor r9,r9,r9
1400 eor r10,r10,r10
1401 eor r11,r11,r11
1402 stmia $r_ptr!,{r4-r11}
1403 stmia $r_ptr!,{r4-r11}
1404 stmia $r_ptr!,{r4-r11}
1405 b .Ladd_done
1406
1407.align 4
1408.Ladd_double:
1409 ldr $a_ptr,[sp,#32*18+20]
1410 add sp,sp,#32*(18-5)+16 @ difference in frame sizes
1411 b .Lpoint_double_shortcut
1412
1413.align 4
1414.Ladd_proceed:
1415 add $a_ptr,sp,#$R
1416 add $b_ptr,sp,#$R
1417 add $r_ptr,sp,#$Rsqr
1418 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Rsqr, R);
1419
1420 add $a_ptr,sp,#$H
1421 add $b_ptr,sp,#$in1_z
1422 add $r_ptr,sp,#$res_z
1423 bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_z, H, in1_z);
1424
1425 add $a_ptr,sp,#$H
1426 add $b_ptr,sp,#$H
1427 add $r_ptr,sp,#$Hsqr
1428 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Hsqr, H);
1429
1430 add $a_ptr,sp,#$in2_z
1431 add $b_ptr,sp,#$res_z
1432 add $r_ptr,sp,#$res_z
1433 bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_z, res_z, in2_z);
1434
1435 add $a_ptr,sp,#$H
1436 add $b_ptr,sp,#$Hsqr
1437 add $r_ptr,sp,#$Hcub
1438 bl __ecp_nistz256_mul_mont @ p256_mul_mont(Hcub, Hsqr, H);
1439
1440 add $a_ptr,sp,#$Hsqr
1441 add $b_ptr,sp,#$U1
1442 add $r_ptr,sp,#$U2
1443 bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, U1, Hsqr);
1444
1445 add $r_ptr,sp,#$Hsqr
1446 bl __ecp_nistz256_add_self @ p256_mul_by_2(Hsqr, U2);
1447
1448 add $b_ptr,sp,#$Rsqr
1449 add $r_ptr,sp,#$res_x
1450 bl __ecp_nistz256_sub_morf @ p256_sub(res_x, Rsqr, Hsqr);
1451
1452 add $b_ptr,sp,#$Hcub
1453 bl __ecp_nistz256_sub_from @ p256_sub(res_x, res_x, Hcub);
1454
1455 add $b_ptr,sp,#$U2
1456 add $r_ptr,sp,#$res_y
1457 bl __ecp_nistz256_sub_morf @ p256_sub(res_y, U2, res_x);
1458
1459 add $a_ptr,sp,#$Hcub
1460 add $b_ptr,sp,#$S1
1461 add $r_ptr,sp,#$S2
1462 bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, S1, Hcub);
1463
1464 add $a_ptr,sp,#$R
1465 add $b_ptr,sp,#$res_y
1466 add $r_ptr,sp,#$res_y
1467 bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_y, res_y, R);
1468
1469 add $b_ptr,sp,#$S2
1470 bl __ecp_nistz256_sub_from @ p256_sub(res_y, res_y, S2);
1471
1472 ldr r11,[sp,#32*18+4] @ !in1intfy
1473 ldr r12,[sp,#32*18+8] @ !in2intfy
1474 add r1,sp,#$res_x
1475 add r2,sp,#$in2_x
1476 and r10,r11,r12
1477 mvn r11,r11
1478 add r3,sp,#$in1_x
1479 and r11,r11,r12
1480 mvn r12,r12
1481 ldr $r_ptr,[sp,#32*18+16]
1482___
1483for($i=0;$i<96;$i+=8) { # conditional moves
1484$code.=<<___;
1485 ldmia r1!,{r4-r5} @ res_x
1486 ldmia r2!,{r6-r7} @ in2_x
1487 ldmia r3!,{r8-r9} @ in1_x
1488 and r4,r4,r10
1489 and r5,r5,r10
1490 and r6,r6,r11
1491 and r7,r7,r11
1492 and r8,r8,r12
1493 and r9,r9,r12
1494 orr r4,r4,r6
1495 orr r5,r5,r7
1496 orr r4,r4,r8
1497 orr r5,r5,r9
1498 stmia $r_ptr!,{r4-r5}
1499___
1500}
1501$code.=<<___;
1502.Ladd_done:
1503 add sp,sp,#32*18+16+16 @ +16 means "skip even over saved r0-r3"
1504#if __ARM_ARCH__>=5 || defined(__thumb__)
1505 ldmia sp!,{r4-r12,pc}
1506#else
1507 ldmia sp!,{r4-r12,lr}
1508 bx lr @ interoperable with Thumb ISA:-)
1509#endif
1510.size ecp_nistz256_point_add,.-ecp_nistz256_point_add
1511___
1512}
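[Editor's sketch, not part of the deleted file.] The conditional-move tail of point_add selects, word by word and without branches, between the computed sum, in2 (when in1 is the point at infinity) and in1 (when in2 is), using the all-ones/all-zero !in1infty and !in2infty flags stored earlier. A minimal C sketch of that masked three-way select; names are hypothetical:

    #include <stdint.h>

    /*
     * out = res  if neither input was infinity,
     *       in2  if only in1 was infinity,
     *       in1  otherwise.
     * notinf1/notinf2 are 0 or 0xffffffff, as in the assembly.
     */
    static void
    select_result(uint32_t *out, const uint32_t *res, const uint32_t *in1,
        const uint32_t *in2, uint32_t notinf1, uint32_t notinf2, int nwords)
    {
            uint32_t take_res = notinf1 & notinf2;
            uint32_t take_in2 = ~notinf1 & notinf2;
            uint32_t take_in1 = ~notinf2;
            int i;

            for (i = 0; i < nwords; i++)
                    out[i] = (res[i] & take_res) | (in2[i] & take_in2) |
                        (in1[i] & take_in1);
    }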
1513
1514########################################################################
1515# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
1516# const P256_POINT_AFFINE *in2);
1517{
1518my ($res_x,$res_y,$res_z,
1519 $in1_x,$in1_y,$in1_z,
1520 $in2_x,$in2_y,
1521 $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..14));
1522my $Z1sqr = $S2;
1523# above map() describes stack layout with 18 temporary
1524# 256-bit vectors on top. Then note that we push
1525# starting from r0, which means that we have copy of
1526# input arguments just below these temporary vectors.
1527# We use two of them for !in1infty, !in2intfy.
1528
1529my @ONE_mont=(1,0,0,-1,-1,-1,-2,0);
1530
1531$code.=<<___;
1532.globl ecp_nistz256_point_add_affine
1533.type ecp_nistz256_point_add_affine,%function
1534.align 5
1535ecp_nistz256_point_add_affine:
1536 stmdb sp!,{r0-r12,lr} @ push from r0, unusual, but intentional
1537 sub sp,sp,#32*15
1538
1539 ldmia $a_ptr!,{r4-r11} @ copy in1_x
1540 add r3,sp,#$in1_x
1541 stmia r3!,{r4-r11}
1542 ldmia $a_ptr!,{r4-r11} @ copy in1_y
1543 stmia r3!,{r4-r11}
1544 ldmia $a_ptr,{r4-r11} @ copy in1_z
1545 orr r12,r4,r5
1546 orr r12,r12,r6
1547 orr r12,r12,r7
1548 orr r12,r12,r8
1549 orr r12,r12,r9
1550 orr r12,r12,r10
1551 orr r12,r12,r11
1552 cmp r12,#0
1553#ifdef __thumb2__
1554 it ne
1555#endif
1556 movne r12,#-1
1557 stmia r3,{r4-r11}
1558 str r12,[sp,#32*15+4] @ !in1infty
1559
1560 ldmia $b_ptr!,{r4-r11} @ copy in2_x
1561 add r3,sp,#$in2_x
1562 orr r12,r4,r5
1563 orr r12,r12,r6
1564 orr r12,r12,r7
1565 orr r12,r12,r8
1566 orr r12,r12,r9
1567 orr r12,r12,r10
1568 orr r12,r12,r11
1569 stmia r3!,{r4-r11}
1570 ldmia $b_ptr!,{r4-r11} @ copy in2_y
1571 orr r12,r12,r4
1572 orr r12,r12,r5
1573 orr r12,r12,r6
1574 orr r12,r12,r7
1575 orr r12,r12,r8
1576 orr r12,r12,r9
1577 orr r12,r12,r10
1578 orr r12,r12,r11
1579 stmia r3!,{r4-r11}
1580 cmp r12,#0
1581#ifdef __thumb2__
1582 it ne
1583#endif
1584 movne r12,#-1
1585 str r12,[sp,#32*15+8] @ !in2infty
1586
1587 add $a_ptr,sp,#$in1_z
1588 add $b_ptr,sp,#$in1_z
1589 add $r_ptr,sp,#$Z1sqr
1590 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Z1sqr, in1_z);
1591
1592 add $a_ptr,sp,#$Z1sqr
1593 add $b_ptr,sp,#$in2_x
1594 add $r_ptr,sp,#$U2
1595 bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, Z1sqr, in2_x);
1596
1597 add $b_ptr,sp,#$in1_x
1598 add $r_ptr,sp,#$H
1599 bl __ecp_nistz256_sub_from @ p256_sub(H, U2, in1_x);
1600
1601 add $a_ptr,sp,#$Z1sqr
1602 add $b_ptr,sp,#$in1_z
1603 add $r_ptr,sp,#$S2
1604 bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, Z1sqr, in1_z);
1605
1606 add $a_ptr,sp,#$H
1607 add $b_ptr,sp,#$in1_z
1608 add $r_ptr,sp,#$res_z
1609 bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_z, H, in1_z);
1610
1611 add $a_ptr,sp,#$in2_y
1612 add $b_ptr,sp,#$S2
1613 add $r_ptr,sp,#$S2
1614 bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, S2, in2_y);
1615
1616 add $b_ptr,sp,#$in1_y
1617 add $r_ptr,sp,#$R
1618 bl __ecp_nistz256_sub_from @ p256_sub(R, S2, in1_y);
1619
1620 add $a_ptr,sp,#$H
1621 add $b_ptr,sp,#$H
1622 add $r_ptr,sp,#$Hsqr
1623 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Hsqr, H);
1624
1625 add $a_ptr,sp,#$R
1626 add $b_ptr,sp,#$R
1627 add $r_ptr,sp,#$Rsqr
1628 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Rsqr, R);
1629
1630 add $a_ptr,sp,#$H
1631 add $b_ptr,sp,#$Hsqr
1632 add $r_ptr,sp,#$Hcub
1633 bl __ecp_nistz256_mul_mont @ p256_mul_mont(Hcub, Hsqr, H);
1634
1635 add $a_ptr,sp,#$Hsqr
1636 add $b_ptr,sp,#$in1_x
1637 add $r_ptr,sp,#$U2
1638 bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, in1_x, Hsqr);
1639
1640 add $r_ptr,sp,#$Hsqr
1641 bl __ecp_nistz256_add_self @ p256_mul_by_2(Hsqr, U2);
1642
1643 add $b_ptr,sp,#$Rsqr
1644 add $r_ptr,sp,#$res_x
1645 bl __ecp_nistz256_sub_morf @ p256_sub(res_x, Rsqr, Hsqr);
1646
1647 add $b_ptr,sp,#$Hcub
1648 bl __ecp_nistz256_sub_from @ p256_sub(res_x, res_x, Hcub);
1649
1650 add $b_ptr,sp,#$U2
1651 add $r_ptr,sp,#$res_y
1652 bl __ecp_nistz256_sub_morf @ p256_sub(res_y, U2, res_x);
1653
1654 add $a_ptr,sp,#$Hcub
1655 add $b_ptr,sp,#$in1_y
1656 add $r_ptr,sp,#$S2
1657 bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, in1_y, Hcub);
1658
1659 add $a_ptr,sp,#$R
1660 add $b_ptr,sp,#$res_y
1661 add $r_ptr,sp,#$res_y
1662 bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_y, res_y, R);
1663
1664 add $b_ptr,sp,#$S2
1665 bl __ecp_nistz256_sub_from @ p256_sub(res_y, res_y, S2);
1666
1667 ldr r11,[sp,#32*15+4] @ !in1intfy
1668 ldr r12,[sp,#32*15+8] @ !in2intfy
1669 add r1,sp,#$res_x
1670 add r2,sp,#$in2_x
1671 and r10,r11,r12
1672 mvn r11,r11
1673 add r3,sp,#$in1_x
1674 and r11,r11,r12
1675 mvn r12,r12
1676 ldr $r_ptr,[sp,#32*15]
1677___
1678for($i=0;$i<64;$i+=8) { # conditional moves
1679$code.=<<___;
1680 ldmia r1!,{r4-r5} @ res_x
1681 ldmia r2!,{r6-r7} @ in2_x
1682 ldmia r3!,{r8-r9} @ in1_x
1683 and r4,r4,r10
1684 and r5,r5,r10
1685 and r6,r6,r11
1686 and r7,r7,r11
1687 and r8,r8,r12
1688 and r9,r9,r12
1689 orr r4,r4,r6
1690 orr r5,r5,r7
1691 orr r4,r4,r8
1692 orr r5,r5,r9
1693 stmia $r_ptr!,{r4-r5}
1694___
1695}
1696for(;$i<96;$i+=8) {
1697my $j=($i-64)/4;
1698$code.=<<___;
1699 ldmia r1!,{r4-r5} @ res_z
1700 ldmia r3!,{r8-r9} @ in1_z
1701 and r4,r4,r10
1702 and r5,r5,r10
1703 and r6,r11,#@ONE_mont[$j]
1704 and r7,r11,#@ONE_mont[$j+1]
1705 and r8,r8,r12
1706 and r9,r9,r12
1707 orr r4,r4,r6
1708 orr r5,r5,r7
1709 orr r4,r4,r8
1710 orr r5,r5,r9
1711 stmia $r_ptr!,{r4-r5}
1712___
1713}
1714$code.=<<___;
1715 add sp,sp,#32*15+16 @ +16 means "skip even over saved r0-r3"
1716#if __ARM_ARCH__>=5 || !defined(__thumb__)
1717 ldmia sp!,{r4-r12,pc}
1718#else
1719 ldmia sp!,{r4-r12,lr}
1720 bx lr @ interoperable with Thumb ISA:-)
1721#endif
1722.size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
1723___
1724} }}}
1725
1726foreach (split("\n",$code)) {
1727 s/\`([^\`]*)\`/eval $1/geo;
1728
1729 s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;
1730
1731 print $_,"\n";
1732}
1733close STDOUT; # enforce flush
diff --git a/src/lib/libcrypto/ec/asm/ecp_nistz256-sparcv9.pl b/src/lib/libcrypto/ec/asm/ecp_nistz256-sparcv9.pl
deleted file mode 100644
index 49460fefdc..0000000000
--- a/src/lib/libcrypto/ec/asm/ecp_nistz256-sparcv9.pl
+++ /dev/null
@@ -1,2890 +0,0 @@
1#! /usr/bin/env perl
2# $OpenBSD: ecp_nistz256-sparcv9.pl,v 1.2 2022/12/26 07:18:51 jmc Exp $
3#
4# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
5#
6# Licensed under the OpenSSL license (the "License"). You may not use
7# this file except in compliance with the License. You can obtain a copy
8# in the file LICENSE in the source distribution or at
9# https://www.openssl.org/source/license.html
10
11
12# ====================================================================
13# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
14# project. The module is, however, dual licensed under OpenSSL and
15# CRYPTOGAMS licenses depending on where you obtain it. For further
16# details see http://www.openssl.org/~appro/cryptogams/.
17# ====================================================================
18#
19# ECP_NISTZ256 module for SPARCv9.
20#
21# February 2015.
22#
23# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
24# http://eprint.iacr.org/2013/816. In the process of adaptation
25# original .c module was made 32-bit savvy in order to make this
26# implementation possible.
27#
28# with/without -DECP_NISTZ256_ASM
29# UltraSPARC III +12-18%
30# SPARC T4 +99-550% (+66-150% on 32-bit Solaris)
31#
32# Ranges denote minimum and maximum improvement coefficients depending
33# on benchmark. Lower coefficients are for ECDSA sign, server-side
34# operation. Keep in mind that +200% means 3x improvement.
35
36# Uncomment when all sparcv9 assembly generators are updated to take the output
37# file as last argument...
38# $output = pop;
39# open STDOUT,">$output";
40
41$code.=<<___;
42#define STACK_FRAME 192
43#define STACK_BIAS 2047
44
45#define LOCALS (STACK_BIAS+STACK_FRAME)
46.register %g2,#scratch
47.register %g3,#scratch
48# define STACK64_FRAME STACK_FRAME
49# define LOCALS64 LOCALS
50
51.section ".text",#alloc,#execinstr
52___
53
54{{{
55my ($rp,$ap,$bp)=map("%i$_",(0..2));
56my @acc=map("%l$_",(0..7));
57my ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7)=(map("%o$_",(0..5)),"%g4","%g5");
58my ($bi,$a0,$mask,$carry)=(map("%i$_",(3..5)),"%g1");
59my ($rp_real,$ap_real)=("%g2","%g3");
60
61$code.=<<___;
62.align 64
63.Lone:
64.long 1,0,0,0,0,0,0,0
65
66! void ecp_nistz256_from_mont(BN_ULONG %i0[8],const BN_ULONG %i1[8]);
67.globl ecp_nistz256_from_mont
68.align 32
69ecp_nistz256_from_mont:
70 save %sp,-STACK_FRAME,%sp
71 nop
721: call .+8
73 add %o7,.Lone-1b,$bp
74 call __ecp_nistz256_mul_mont
75 nop
76 ret
77 restore
78.type ecp_nistz256_from_mont,#function
79.size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
80
81! void ecp_nistz256_mul_mont(BN_ULONG %i0[8],const BN_ULONG %i1[8],
82! const BN_ULONG %i2[8]);
83.globl ecp_nistz256_mul_mont
84.align 32
85ecp_nistz256_mul_mont:
86 save %sp,-STACK_FRAME,%sp
87 nop
88 call __ecp_nistz256_mul_mont
89 nop
90 ret
91 restore
92.type ecp_nistz256_mul_mont,#function
93.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
94
95! void ecp_nistz256_sqr_mont(BN_ULONG %i0[8],const BN_ULONG %i2[8]);
96.globl ecp_nistz256_sqr_mont
97.align 32
98ecp_nistz256_sqr_mont:
99 save %sp,-STACK_FRAME,%sp
100 mov $ap,$bp
101 call __ecp_nistz256_mul_mont
102 nop
103 ret
104 restore
105.type ecp_nistz256_sqr_mont,#function
106.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
107___
108
109########################################################################
110# Special thing to keep in mind is that $t0-$t7 hold 64-bit values,
111# while all others are meant to keep 32. "Meant to" means that additions
112# to @acc[0-7] do "contaminate" upper bits, but they are cleared before
113# they can affect outcome (follow 'and' with $mask). Also keep in mind
114# that addition with carry is addition with 32-bit carry, even though
115# CPU is 64-bit. [Addition with 64-bit carry was introduced in T3, see
116# below for VIS3 code paths.]
117
118$code.=<<___;
119.align 32
120__ecp_nistz256_mul_mont:
121 ld [$bp+0],$bi ! b[0]
122 mov -1,$mask
123 ld [$ap+0],$a0
124 srl $mask,0,$mask ! 0xffffffff
125 ld [$ap+4],$t1
126 ld [$ap+8],$t2
127 ld [$ap+12],$t3
128 ld [$ap+16],$t4
129 ld [$ap+20],$t5
130 ld [$ap+24],$t6
131 ld [$ap+28],$t7
132 mulx $a0,$bi,$t0 ! a[0-7]*b[0], 64-bit results
133 mulx $t1,$bi,$t1
134 mulx $t2,$bi,$t2
135 mulx $t3,$bi,$t3
136 mulx $t4,$bi,$t4
137 mulx $t5,$bi,$t5
138 mulx $t6,$bi,$t6
139 mulx $t7,$bi,$t7
140 srlx $t0,32,@acc[1] ! extract high parts
141 srlx $t1,32,@acc[2]
142 srlx $t2,32,@acc[3]
143 srlx $t3,32,@acc[4]
144 srlx $t4,32,@acc[5]
145 srlx $t5,32,@acc[6]
146 srlx $t6,32,@acc[7]
147 srlx $t7,32,@acc[0] ! "@acc[8]"
148 mov 0,$carry
149___
150for($i=1;$i<8;$i++) {
151$code.=<<___;
152 addcc @acc[1],$t1,@acc[1] ! accumulate high parts
153 ld [$bp+4*$i],$bi ! b[$i]
154 ld [$ap+4],$t1 ! re-load a[1-7]
155 addccc @acc[2],$t2,@acc[2]
156 addccc @acc[3],$t3,@acc[3]
157 ld [$ap+8],$t2
158 ld [$ap+12],$t3
159 addccc @acc[4],$t4,@acc[4]
160 addccc @acc[5],$t5,@acc[5]
161 ld [$ap+16],$t4
162 ld [$ap+20],$t5
163 addccc @acc[6],$t6,@acc[6]
164 addccc @acc[7],$t7,@acc[7]
165 ld [$ap+24],$t6
166 ld [$ap+28],$t7
167 addccc @acc[0],$carry,@acc[0] ! "@acc[8]"
168 addc %g0,%g0,$carry
169___
170 # Reduction iteration is normally performed by accumulating
171 # result of multiplication of modulus by "magic" digit [and
172 # omitting least significant word, which is guaranteed to
173 # be 0], but thanks to special form of modulus and "magic"
174 # digit being equal to least significant word, it can be
175 # performed with additions and subtractions alone. Indeed:
176 #
177 # ffff.0001.0000.0000.0000.ffff.ffff.ffff
178 # * abcd
179 # + xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
180 #
181 # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
182 # rewrite above as:
183 #
184 # xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
185 # + abcd.0000.abcd.0000.0000.abcd.0000.0000.0000
186 # - abcd.0000.0000.0000.0000.0000.0000.abcd
187 #
188 # or marking redundant operations:
189 #
190 # xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.----
191 # + abcd.0000.abcd.0000.0000.abcd.----.----.----
192 # - abcd.----.----.----.----.----.----.----
193
194$code.=<<___;
195 ! multiplication-less reduction
196 addcc @acc[3],$t0,@acc[3] ! r[3]+=r[0]
197 addccc @acc[4],%g0,@acc[4] ! r[4]+=0
198 and @acc[1],$mask,@acc[1]
199 and @acc[2],$mask,@acc[2]
200 addccc @acc[5],%g0,@acc[5] ! r[5]+=0
201 addccc @acc[6],$t0,@acc[6] ! r[6]+=r[0]
202 and @acc[3],$mask,@acc[3]
203 and @acc[4],$mask,@acc[4]
204 addccc @acc[7],%g0,@acc[7] ! r[7]+=0
205 addccc @acc[0],$t0,@acc[0] ! r[8]+=r[0] "@acc[8]"
206 and @acc[5],$mask,@acc[5]
207 and @acc[6],$mask,@acc[6]
208 addc $carry,%g0,$carry ! top-most carry
209 subcc @acc[7],$t0,@acc[7] ! r[7]-=r[0]
210 subccc @acc[0],%g0,@acc[0] ! r[8]-=0 "@acc[8]"
211 subc $carry,%g0,$carry ! top-most carry
212 and @acc[7],$mask,@acc[7]
213 and @acc[0],$mask,@acc[0] ! "@acc[8]"
214___
215 push(@acc,shift(@acc)); # rotate registers to "omit" acc[0]
216$code.=<<___;
217 mulx $a0,$bi,$t0 ! a[0-7]*b[$i], 64-bit results
218 mulx $t1,$bi,$t1
219 mulx $t2,$bi,$t2
220 mulx $t3,$bi,$t3
221 mulx $t4,$bi,$t4
222 mulx $t5,$bi,$t5
223 mulx $t6,$bi,$t6
224 mulx $t7,$bi,$t7
225 add @acc[0],$t0,$t0 ! accumulate low parts, can't overflow
226 add @acc[1],$t1,$t1
227 srlx $t0,32,@acc[1] ! extract high parts
228 add @acc[2],$t2,$t2
229 srlx $t1,32,@acc[2]
230 add @acc[3],$t3,$t3
231 srlx $t2,32,@acc[3]
232 add @acc[4],$t4,$t4
233 srlx $t3,32,@acc[4]
234 add @acc[5],$t5,$t5
235 srlx $t4,32,@acc[5]
236 add @acc[6],$t6,$t6
237 srlx $t5,32,@acc[6]
238 add @acc[7],$t7,$t7
239 srlx $t6,32,@acc[7]
240 srlx $t7,32,@acc[0] ! "@acc[8]"
241___
242}
243$code.=<<___;
244 addcc @acc[1],$t1,@acc[1] ! accumulate high parts
245 addccc @acc[2],$t2,@acc[2]
246 addccc @acc[3],$t3,@acc[3]
247 addccc @acc[4],$t4,@acc[4]
248 addccc @acc[5],$t5,@acc[5]
249 addccc @acc[6],$t6,@acc[6]
250 addccc @acc[7],$t7,@acc[7]
251 addccc @acc[0],$carry,@acc[0] ! "@acc[8]"
252 addc %g0,%g0,$carry
253
254 addcc @acc[3],$t0,@acc[3] ! multiplication-less reduction
255 addccc @acc[4],%g0,@acc[4]
256 addccc @acc[5],%g0,@acc[5]
257 addccc @acc[6],$t0,@acc[6]
258 addccc @acc[7],%g0,@acc[7]
259 addccc @acc[0],$t0,@acc[0] ! "@acc[8]"
260 addc $carry,%g0,$carry
261 subcc @acc[7],$t0,@acc[7]
262 subccc @acc[0],%g0,@acc[0] ! "@acc[8]"
263 subc $carry,%g0,$carry ! top-most carry
264___
265 push(@acc,shift(@acc)); # rotate registers to omit acc[0]
266$code.=<<___;
267 ! Final step is "if result > mod, subtract mod", but we do it
268 ! "other way around", namely subtract modulus from result
269 ! and if it borrowed, add modulus back.
270
271 subcc @acc[0],-1,@acc[0] ! subtract modulus
272 subccc @acc[1],-1,@acc[1]
273 subccc @acc[2],-1,@acc[2]
274 subccc @acc[3],0,@acc[3]
275 subccc @acc[4],0,@acc[4]
276 subccc @acc[5],0,@acc[5]
277 subccc @acc[6],1,@acc[6]
278 subccc @acc[7],-1,@acc[7]
279 subc $carry,0,$carry ! broadcast borrow bit
280
281 ! Note that because mod has special form, i.e. consists of
282 ! 0xffffffff, 1 and 0s, we can conditionally synthesize it by
283 ! using value of broadcasted borrow and the borrow bit itself.
284 ! To minimize dependency chain we first broadcast and then
285 ! extract the bit by negating (follow $bi).
286
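	! For illustration (not in the original commentary): after the
	! broadcast above $carry is all ones if the carry-extended
	! subtraction borrowed and zero otherwise, and $bi, computed below
	! as -$carry, is 1 or 0.  The words added back are therefore
	! ($carry,$carry,$carry,0,0,0,$bi,$carry), least significant word
	! first, i.e. exactly the modulus ffffffff.ffffffff.ffffffff.0.0.0.
	! 1.ffffffff when a borrow occurred, or all zeros when it did not.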
287 addcc @acc[0],$carry,@acc[0] ! add modulus or zero
288 addccc @acc[1],$carry,@acc[1]
289 neg $carry,$bi
290 st @acc[0],[$rp]
291 addccc @acc[2],$carry,@acc[2]
292 st @acc[1],[$rp+4]
293 addccc @acc[3],0,@acc[3]
294 st @acc[2],[$rp+8]
295 addccc @acc[4],0,@acc[4]
296 st @acc[3],[$rp+12]
297 addccc @acc[5],0,@acc[5]
298 st @acc[4],[$rp+16]
299 addccc @acc[6],$bi,@acc[6]
300 st @acc[5],[$rp+20]
301 addc @acc[7],$carry,@acc[7]
302 st @acc[6],[$rp+24]
303 retl
304 st @acc[7],[$rp+28]
305.type __ecp_nistz256_mul_mont,#function
306.size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont
307
308! void ecp_nistz256_add(BN_ULONG %i0[8],const BN_ULONG %i1[8],
309! const BN_ULONG %i2[8]);
310.globl ecp_nistz256_add
311.align 32
312ecp_nistz256_add:
313 save %sp,-STACK_FRAME,%sp
314 ld [$ap],@acc[0]
315 ld [$ap+4],@acc[1]
316 ld [$ap+8],@acc[2]
317 ld [$ap+12],@acc[3]
318 ld [$ap+16],@acc[4]
319 ld [$ap+20],@acc[5]
320 ld [$ap+24],@acc[6]
321 call __ecp_nistz256_add
322 ld [$ap+28],@acc[7]
323 ret
324 restore
325.type ecp_nistz256_add,#function
326.size ecp_nistz256_add,.-ecp_nistz256_add
327
328.align 32
329__ecp_nistz256_add:
330 ld [$bp+0],$t0 ! b[0]
331 ld [$bp+4],$t1
332 ld [$bp+8],$t2
333 ld [$bp+12],$t3
334 addcc @acc[0],$t0,@acc[0]
335 ld [$bp+16],$t4
336 ld [$bp+20],$t5
337 addccc @acc[1],$t1,@acc[1]
338 ld [$bp+24],$t6
339 ld [$bp+28],$t7
340 addccc @acc[2],$t2,@acc[2]
341 addccc @acc[3],$t3,@acc[3]
342 addccc @acc[4],$t4,@acc[4]
343 addccc @acc[5],$t5,@acc[5]
344 addccc @acc[6],$t6,@acc[6]
345 addccc @acc[7],$t7,@acc[7]
346 addc %g0,%g0,$carry
347
348.Lreduce_by_sub:
349
350 ! if a+b >= modulus, subtract modulus.
351 !
352 ! But since comparison implies subtraction, we subtract
353 ! modulus and then add it back if subtraction borrowed.
354
355 subcc @acc[0],-1,@acc[0]
356 subccc @acc[1],-1,@acc[1]
357 subccc @acc[2],-1,@acc[2]
358 subccc @acc[3], 0,@acc[3]
359 subccc @acc[4], 0,@acc[4]
360 subccc @acc[5], 0,@acc[5]
361 subccc @acc[6], 1,@acc[6]
362 subccc @acc[7],-1,@acc[7]
363 subc $carry,0,$carry
364
365 ! Note that because mod has special form, i.e. consists of
366 ! 0xffffffff, 1 and 0s, we can conditionally synthesize it by
367 ! using value of borrow and its negative.
368
369 addcc @acc[0],$carry,@acc[0] ! add synthesized modulus
370 addccc @acc[1],$carry,@acc[1]
371 neg $carry,$bi
372 st @acc[0],[$rp]
373 addccc @acc[2],$carry,@acc[2]
374 st @acc[1],[$rp+4]
375 addccc @acc[3],0,@acc[3]
376 st @acc[2],[$rp+8]
377 addccc @acc[4],0,@acc[4]
378 st @acc[3],[$rp+12]
379 addccc @acc[5],0,@acc[5]
380 st @acc[4],[$rp+16]
381 addccc @acc[6],$bi,@acc[6]
382 st @acc[5],[$rp+20]
383 addc @acc[7],$carry,@acc[7]
384 st @acc[6],[$rp+24]
385 retl
386 st @acc[7],[$rp+28]
387.type __ecp_nistz256_add,#function
388.size __ecp_nistz256_add,.-__ecp_nistz256_add
389
390! void ecp_nistz256_mul_by_2(BN_ULONG %i0[8],const BN_ULONG %i1[8]);
391.globl ecp_nistz256_mul_by_2
392.align 32
393ecp_nistz256_mul_by_2:
394 save %sp,-STACK_FRAME,%sp
395 ld [$ap],@acc[0]
396 ld [$ap+4],@acc[1]
397 ld [$ap+8],@acc[2]
398 ld [$ap+12],@acc[3]
399 ld [$ap+16],@acc[4]
400 ld [$ap+20],@acc[5]
401 ld [$ap+24],@acc[6]
402 call __ecp_nistz256_mul_by_2
403 ld [$ap+28],@acc[7]
404 ret
405 restore
406.type ecp_nistz256_mul_by_2,#function
407.size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
408
409.align 32
410__ecp_nistz256_mul_by_2:
411 addcc @acc[0],@acc[0],@acc[0] ! a+a=2*a
412 addccc @acc[1],@acc[1],@acc[1]
413 addccc @acc[2],@acc[2],@acc[2]
414 addccc @acc[3],@acc[3],@acc[3]
415 addccc @acc[4],@acc[4],@acc[4]
416 addccc @acc[5],@acc[5],@acc[5]
417 addccc @acc[6],@acc[6],@acc[6]
418 addccc @acc[7],@acc[7],@acc[7]
419 b .Lreduce_by_sub
420 addc %g0,%g0,$carry
421.type __ecp_nistz256_mul_by_2,#function
422.size __ecp_nistz256_mul_by_2,.-__ecp_nistz256_mul_by_2
423
424! void ecp_nistz256_mul_by_3(BN_ULONG %i0[8],const BN_ULONG %i1[8]);
425.globl ecp_nistz256_mul_by_3
426.align 32
427ecp_nistz256_mul_by_3:
428 save %sp,-STACK_FRAME,%sp
429 ld [$ap],@acc[0]
430 ld [$ap+4],@acc[1]
431 ld [$ap+8],@acc[2]
432 ld [$ap+12],@acc[3]
433 ld [$ap+16],@acc[4]
434 ld [$ap+20],@acc[5]
435 ld [$ap+24],@acc[6]
436 call __ecp_nistz256_mul_by_3
437 ld [$ap+28],@acc[7]
438 ret
439 restore
440.type ecp_nistz256_mul_by_3,#function
441.size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
442
443.align 32
444__ecp_nistz256_mul_by_3:
445 addcc @acc[0],@acc[0],$t0 ! a+a=2*a
446 addccc @acc[1],@acc[1],$t1
447 addccc @acc[2],@acc[2],$t2
448 addccc @acc[3],@acc[3],$t3
449 addccc @acc[4],@acc[4],$t4
450 addccc @acc[5],@acc[5],$t5
451 addccc @acc[6],@acc[6],$t6
452 addccc @acc[7],@acc[7],$t7
453 addc %g0,%g0,$carry
454
455 subcc $t0,-1,$t0 ! .Lreduce_by_sub but without stores
456 subccc $t1,-1,$t1
457 subccc $t2,-1,$t2
458 subccc $t3, 0,$t3
459 subccc $t4, 0,$t4
460 subccc $t5, 0,$t5
461 subccc $t6, 1,$t6
462 subccc $t7,-1,$t7
463 subc $carry,0,$carry
464
465 addcc $t0,$carry,$t0 ! add synthesized modulus
466 addccc $t1,$carry,$t1
467 neg $carry,$bi
468 addccc $t2,$carry,$t2
469 addccc $t3,0,$t3
470 addccc $t4,0,$t4
471 addccc $t5,0,$t5
472 addccc $t6,$bi,$t6
473 addc $t7,$carry,$t7
474
475 addcc $t0,@acc[0],@acc[0] ! 2*a+a=3*a
476 addccc $t1,@acc[1],@acc[1]
477 addccc $t2,@acc[2],@acc[2]
478 addccc $t3,@acc[3],@acc[3]
479 addccc $t4,@acc[4],@acc[4]
480 addccc $t5,@acc[5],@acc[5]
481 addccc $t6,@acc[6],@acc[6]
482 addccc $t7,@acc[7],@acc[7]
483 b .Lreduce_by_sub
484 addc %g0,%g0,$carry
485.type __ecp_nistz256_mul_by_3,#function
486.size __ecp_nistz256_mul_by_3,.-__ecp_nistz256_mul_by_3
487
488! void ecp_nistz256_neg(BN_ULONG %i0[8],const BN_ULONG %i1[8]);
489.globl ecp_nistz256_neg
490.align 32
491ecp_nistz256_neg:
492 save %sp,-STACK_FRAME,%sp
493 mov $ap,$bp
494 mov 0,@acc[0]
495 mov 0,@acc[1]
496 mov 0,@acc[2]
497 mov 0,@acc[3]
498 mov 0,@acc[4]
499 mov 0,@acc[5]
500 mov 0,@acc[6]
501 call __ecp_nistz256_sub_from
502 mov 0,@acc[7]
503 ret
504 restore
505.type ecp_nistz256_neg,#function
506.size ecp_nistz256_neg,.-ecp_nistz256_neg
507
508.align 32
509__ecp_nistz256_sub_from:
510 ld [$bp+0],$t0 ! b[0]
511 ld [$bp+4],$t1
512 ld [$bp+8],$t2
513 ld [$bp+12],$t3
514 subcc @acc[0],$t0,@acc[0]
515 ld [$bp+16],$t4
516 ld [$bp+20],$t5
517 subccc @acc[1],$t1,@acc[1]
518 subccc @acc[2],$t2,@acc[2]
519 ld [$bp+24],$t6
520 ld [$bp+28],$t7
521 subccc @acc[3],$t3,@acc[3]
522 subccc @acc[4],$t4,@acc[4]
523 subccc @acc[5],$t5,@acc[5]
524 subccc @acc[6],$t6,@acc[6]
525 subccc @acc[7],$t7,@acc[7]
526 subc %g0,%g0,$carry ! broadcast borrow bit
527
528.Lreduce_by_add:
529
530 ! if a-b borrows, add modulus.
531 !
532 ! Note that because mod has special form, i.e. consists of
533 ! 0xffffffff, 1 and 0s, we can conditionally synthesize it by
534 ! using value of broadcasted borrow and the borrow bit itself.
535 ! To minimize dependency chain we first broadcast and then
536 ! extract the bit by negating (follow $bi).
537
538 addcc @acc[0],$carry,@acc[0] ! add synthesized modulus
539 addccc @acc[1],$carry,@acc[1]
540 neg $carry,$bi
541 st @acc[0],[$rp]
542 addccc @acc[2],$carry,@acc[2]
543 st @acc[1],[$rp+4]
544 addccc @acc[3],0,@acc[3]
545 st @acc[2],[$rp+8]
546 addccc @acc[4],0,@acc[4]
547 st @acc[3],[$rp+12]
548 addccc @acc[5],0,@acc[5]
549 st @acc[4],[$rp+16]
550 addccc @acc[6],$bi,@acc[6]
551 st @acc[5],[$rp+20]
552 addc @acc[7],$carry,@acc[7]
553 st @acc[6],[$rp+24]
554 retl
555 st @acc[7],[$rp+28]
556.type __ecp_nistz256_sub_from,#function
557.size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from
558
559.align 32
560__ecp_nistz256_sub_morf:
561 ld [$bp+0],$t0 ! b[0]
562 ld [$bp+4],$t1
563 ld [$bp+8],$t2
564 ld [$bp+12],$t3
565 subcc $t0,@acc[0],@acc[0]
566 ld [$bp+16],$t4
567 ld [$bp+20],$t5
568 subccc $t1,@acc[1],@acc[1]
569 subccc $t2,@acc[2],@acc[2]
570 ld [$bp+24],$t6
571 ld [$bp+28],$t7
572 subccc $t3,@acc[3],@acc[3]
573 subccc $t4,@acc[4],@acc[4]
574 subccc $t5,@acc[5],@acc[5]
575 subccc $t6,@acc[6],@acc[6]
576 subccc $t7,@acc[7],@acc[7]
577 b .Lreduce_by_add
578 subc %g0,%g0,$carry ! broadcast borrow bit
579.type __ecp_nistz256_sub_morf,#function
580.size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf
581
582! void ecp_nistz256_div_by_2(BN_ULONG %i0[8],const BN_ULONG %i1[8]);
583.globl ecp_nistz256_div_by_2
584.align 32
585ecp_nistz256_div_by_2:
586 save %sp,-STACK_FRAME,%sp
587 ld [$ap],@acc[0]
588 ld [$ap+4],@acc[1]
589 ld [$ap+8],@acc[2]
590 ld [$ap+12],@acc[3]
591 ld [$ap+16],@acc[4]
592 ld [$ap+20],@acc[5]
593 ld [$ap+24],@acc[6]
594 call __ecp_nistz256_div_by_2
595 ld [$ap+28],@acc[7]
596 ret
597 restore
598.type ecp_nistz256_div_by_2,#function
599.size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
600
601.align 32
602__ecp_nistz256_div_by_2:
603 ! ret = (a is odd ? a+mod : a) >> 1
604
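	! For illustration (not in the original commentary): the modulus is
	! odd, so when a is odd a+mod is even and (a+mod)>>1 is the modular
	! halving of a; when a is even a>>1 already is.  $bi = a&1 and
	! $carry = -$bi synthesize either the modulus or zero below, and the
	! carry out of the addition becomes bit 255 of the result via the
	! final sll $carry,31 / or.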
605 and @acc[0],1,$bi
606 neg $bi,$carry
607 addcc @acc[0],$carry,@acc[0]
608 addccc @acc[1],$carry,@acc[1]
609 addccc @acc[2],$carry,@acc[2]
610 addccc @acc[3],0,@acc[3]
611 addccc @acc[4],0,@acc[4]
612 addccc @acc[5],0,@acc[5]
613 addccc @acc[6],$bi,@acc[6]
614 addccc @acc[7],$carry,@acc[7]
615 addc %g0,%g0,$carry
616
617 ! ret >>= 1
618
619 srl @acc[0],1,@acc[0]
620 sll @acc[1],31,$t0
621 srl @acc[1],1,@acc[1]
622 or @acc[0],$t0,@acc[0]
623 sll @acc[2],31,$t1
624 srl @acc[2],1,@acc[2]
625 or @acc[1],$t1,@acc[1]
626 sll @acc[3],31,$t2
627 st @acc[0],[$rp]
628 srl @acc[3],1,@acc[3]
629 or @acc[2],$t2,@acc[2]
630 sll @acc[4],31,$t3
631 st @acc[1],[$rp+4]
632 srl @acc[4],1,@acc[4]
633 or @acc[3],$t3,@acc[3]
634 sll @acc[5],31,$t4
635 st @acc[2],[$rp+8]
636 srl @acc[5],1,@acc[5]
637 or @acc[4],$t4,@acc[4]
638 sll @acc[6],31,$t5
639 st @acc[3],[$rp+12]
640 srl @acc[6],1,@acc[6]
641 or @acc[5],$t5,@acc[5]
642 sll @acc[7],31,$t6
643 st @acc[4],[$rp+16]
644 srl @acc[7],1,@acc[7]
645 or @acc[6],$t6,@acc[6]
646 sll $carry,31,$t7
647 st @acc[5],[$rp+20]
648 or @acc[7],$t7,@acc[7]
649 st @acc[6],[$rp+24]
650 retl
651 st @acc[7],[$rp+28]
652.type __ecp_nistz256_div_by_2,#function
653.size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
654___
655
656########################################################################
657# The following subroutines are "literal" implementations of those found
658# in ecp_nistz256.c
659#
660########################################################################
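# Note on the calling convention used below (added for illustration, not
# part of the original commentary): __ecp_nistz256_mul_mont reads its
# operands from [$ap] and [$bp], while the add/sub/mul_by_*/div_by_2
# helpers operate on a value already held in @acc (plus, where needed, a
# second operand at [$bp]).  Every helper stores its result to [$rp],
# which each call site sets with the instruction in the call's branch
# delay slot.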
661# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
662#
663{
664my ($S,$M,$Zsqr,$tmp0)=map(32*$_,(0..3));
665# above map() describes stack layout with 4 temporary
666# 256-bit vectors on top.
667
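# For illustration (not in the original commentary), the call sequence
# below is the textbook Jacobian doubling
#
#	T  = X1*(2*Y1)^2,  M = 3*(X1 + Z1^2)*(X1 - Z1^2)
#	X3 = M^2 - 2*T,    Y3 = M*(T - X3) - 8*Y1^4,  Z3 = 2*Y1*Z1
#
# with each p256_* comment mapping to one __ecp_nistz256_* call.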
668$code.=<<___;
669#if 0
670#ifdef __PIC__
671SPARC_PIC_THUNK(%g1)
672#endif
673#endif
674
675.globl ecp_nistz256_point_double
676.align 32
677ecp_nistz256_point_double:
678#if 0
679 SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
680 ld [%g1],%g1 ! OPENSSL_sparcv9cap_P[0]
681 and %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK),%g1
682 cmp %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK)
683 be ecp_nistz256_point_double_vis3
684 nop
685#endif
686
687 save %sp,-STACK_FRAME-32*4,%sp
688
689 mov $rp,$rp_real
690 mov $ap,$ap_real
691
692.Lpoint_double_shortcut:
693 ld [$ap+32],@acc[0]
694 ld [$ap+32+4],@acc[1]
695 ld [$ap+32+8],@acc[2]
696 ld [$ap+32+12],@acc[3]
697 ld [$ap+32+16],@acc[4]
698 ld [$ap+32+20],@acc[5]
699 ld [$ap+32+24],@acc[6]
700 ld [$ap+32+28],@acc[7]
701 call __ecp_nistz256_mul_by_2 ! p256_mul_by_2(S, in_y);
702 add %sp,LOCALS+$S,$rp
703
704 add $ap_real,64,$bp
705 add $ap_real,64,$ap
706 call __ecp_nistz256_mul_mont ! p256_sqr_mont(Zsqr, in_z);
707 add %sp,LOCALS+$Zsqr,$rp
708
709 add $ap_real,0,$bp
710 call __ecp_nistz256_add ! p256_add(M, Zsqr, in_x);
711 add %sp,LOCALS+$M,$rp
712
713 add %sp,LOCALS+$S,$bp
714 add %sp,LOCALS+$S,$ap
715 call __ecp_nistz256_mul_mont ! p256_sqr_mont(S, S);
716 add %sp,LOCALS+$S,$rp
717
718 ld [$ap_real],@acc[0]
719 add %sp,LOCALS+$Zsqr,$bp
720 ld [$ap_real+4],@acc[1]
721 ld [$ap_real+8],@acc[2]
722 ld [$ap_real+12],@acc[3]
723 ld [$ap_real+16],@acc[4]
724 ld [$ap_real+20],@acc[5]
725 ld [$ap_real+24],@acc[6]
726 ld [$ap_real+28],@acc[7]
727 call __ecp_nistz256_sub_from ! p256_sub(Zsqr, in_x, Zsqr);
728 add %sp,LOCALS+$Zsqr,$rp
729
730 add $ap_real,32,$bp
731 add $ap_real,64,$ap
732 call __ecp_nistz256_mul_mont ! p256_mul_mont(tmp0, in_z, in_y);
733 add %sp,LOCALS+$tmp0,$rp
734
735 call __ecp_nistz256_mul_by_2 ! p256_mul_by_2(res_z, tmp0);
736 add $rp_real,64,$rp
737
738 add %sp,LOCALS+$Zsqr,$bp
739 add %sp,LOCALS+$M,$ap
740 call __ecp_nistz256_mul_mont ! p256_mul_mont(M, M, Zsqr);
741 add %sp,LOCALS+$M,$rp
742
743 call __ecp_nistz256_mul_by_3 ! p256_mul_by_3(M, M);
744 add %sp,LOCALS+$M,$rp
745
746 add %sp,LOCALS+$S,$bp
747 add %sp,LOCALS+$S,$ap
748 call __ecp_nistz256_mul_mont ! p256_sqr_mont(tmp0, S);
749 add %sp,LOCALS+$tmp0,$rp
750
751 call __ecp_nistz256_div_by_2 ! p256_div_by_2(res_y, tmp0);
752 add $rp_real,32,$rp
753
754 add $ap_real,0,$bp
755 add %sp,LOCALS+$S,$ap
756 call __ecp_nistz256_mul_mont ! p256_mul_mont(S, S, in_x);
757 add %sp,LOCALS+$S,$rp
758
759 call __ecp_nistz256_mul_by_2 ! p256_mul_by_2(tmp0, S);
760 add %sp,LOCALS+$tmp0,$rp
761
762 add %sp,LOCALS+$M,$bp
763 add %sp,LOCALS+$M,$ap
764 call __ecp_nistz256_mul_mont ! p256_sqr_mont(res_x, M);
765 add $rp_real,0,$rp
766
767 add %sp,LOCALS+$tmp0,$bp
768 call __ecp_nistz256_sub_from ! p256_sub(res_x, res_x, tmp0);
769 add $rp_real,0,$rp
770
771 add %sp,LOCALS+$S,$bp
772 call __ecp_nistz256_sub_morf ! p256_sub(S, S, res_x);
773 add %sp,LOCALS+$S,$rp
774
775 add %sp,LOCALS+$M,$bp
776 add %sp,LOCALS+$S,$ap
777 call __ecp_nistz256_mul_mont ! p256_mul_mont(S, S, M);
778 add %sp,LOCALS+$S,$rp
779
780 add $rp_real,32,$bp
781 call __ecp_nistz256_sub_from ! p256_sub(res_y, S, res_y);
782 add $rp_real,32,$rp
783
784 ret
785 restore
786.type ecp_nistz256_point_double,#function
787.size ecp_nistz256_point_double,.-ecp_nistz256_point_double
788___
789}
790
791########################################################################
792# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
793# const P256_POINT *in2);
794{
795my ($res_x,$res_y,$res_z,
796 $H,$Hsqr,$R,$Rsqr,$Hcub,
797 $U1,$U2,$S1,$S2)=map(32*$_,(0..11));
798my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
799
800# above map() describes stack layout with 12 temporary
801# 256-bit vectors on top. Then we reserve some space for
802# !in1infty, !in2infty, result of check for zero and return pointer.
803
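# For illustration (not in the original commentary), the call sequence
# below is the textbook Jacobian addition
#
#	U1 = X1*Z2^2, U2 = X2*Z1^2, S1 = Y1*Z2^3, S2 = Y2*Z1^3
#	H  = U2 - U1, R  = S2 - S1
#	X3 = R^2 - H^3 - 2*U1*H^2
#	Y3 = R*(U1*H^2 - X3) - S1*H^3, Z3 = H*Z1*Z2
#
# with the degenerate cases (equal inputs branch to the doubling code,
# opposite inputs produce the zero point, an infinite input is handled by
# the conditional moves at the end) treated separately.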
804my $bp_real=$rp_real;
805
806$code.=<<___;
807.globl ecp_nistz256_point_add
808.align 32
809ecp_nistz256_point_add:
810#if 0
811 SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
812 ld [%g1],%g1 ! OPENSSL_sparcv9cap_P[0]
813 and %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK),%g1
814 cmp %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK)
815 be ecp_nistz256_point_add_vis3
816 nop
817#endif
818
819 save %sp,-STACK_FRAME-32*12-32,%sp
820
821 stx $rp,[%fp+STACK_BIAS-8] ! off-load $rp
822 mov $ap,$ap_real
823 mov $bp,$bp_real
824
825 ld [$bp+64],$t0 ! in2_z
826 ld [$bp+64+4],$t1
827 ld [$bp+64+8],$t2
828 ld [$bp+64+12],$t3
829 ld [$bp+64+16],$t4
830 ld [$bp+64+20],$t5
831 ld [$bp+64+24],$t6
832 ld [$bp+64+28],$t7
833 or $t1,$t0,$t0
834 or $t3,$t2,$t2
835 or $t5,$t4,$t4
836 or $t7,$t6,$t6
837 or $t2,$t0,$t0
838 or $t6,$t4,$t4
839 or $t4,$t0,$t0 ! !in2infty
840 movrnz $t0,-1,$t0
841 st $t0,[%fp+STACK_BIAS-12]
842
843 ld [$ap+64],$t0 ! in1_z
844 ld [$ap+64+4],$t1
845 ld [$ap+64+8],$t2
846 ld [$ap+64+12],$t3
847 ld [$ap+64+16],$t4
848 ld [$ap+64+20],$t5
849 ld [$ap+64+24],$t6
850 ld [$ap+64+28],$t7
851 or $t1,$t0,$t0
852 or $t3,$t2,$t2
853 or $t5,$t4,$t4
854 or $t7,$t6,$t6
855 or $t2,$t0,$t0
856 or $t6,$t4,$t4
857 or $t4,$t0,$t0 ! !in1infty
858 movrnz $t0,-1,$t0
859 st $t0,[%fp+STACK_BIAS-16]
860
861 add $bp_real,64,$bp
862 add $bp_real,64,$ap
863 call __ecp_nistz256_mul_mont ! p256_sqr_mont(Z2sqr, in2_z);
864 add %sp,LOCALS+$Z2sqr,$rp
865
866 add $ap_real,64,$bp
867 add $ap_real,64,$ap
868 call __ecp_nistz256_mul_mont ! p256_sqr_mont(Z1sqr, in1_z);
869 add %sp,LOCALS+$Z1sqr,$rp
870
871 add $bp_real,64,$bp
872 add %sp,LOCALS+$Z2sqr,$ap
873 call __ecp_nistz256_mul_mont ! p256_mul_mont(S1, Z2sqr, in2_z);
874 add %sp,LOCALS+$S1,$rp
875
876 add $ap_real,64,$bp
877 add %sp,LOCALS+$Z1sqr,$ap
878 call __ecp_nistz256_mul_mont ! p256_mul_mont(S2, Z1sqr, in1_z);
879 add %sp,LOCALS+$S2,$rp
880
881 add $ap_real,32,$bp
882 add %sp,LOCALS+$S1,$ap
883 call __ecp_nistz256_mul_mont ! p256_mul_mont(S1, S1, in1_y);
884 add %sp,LOCALS+$S1,$rp
885
886 add $bp_real,32,$bp
887 add %sp,LOCALS+$S2,$ap
888 call __ecp_nistz256_mul_mont ! p256_mul_mont(S2, S2, in2_y);
889 add %sp,LOCALS+$S2,$rp
890
891 add %sp,LOCALS+$S1,$bp
892 call __ecp_nistz256_sub_from ! p256_sub(R, S2, S1);
893 add %sp,LOCALS+$R,$rp
894
895 or @acc[1],@acc[0],@acc[0] ! see if result is zero
896 or @acc[3],@acc[2],@acc[2]
897 or @acc[5],@acc[4],@acc[4]
898 or @acc[7],@acc[6],@acc[6]
899 or @acc[2],@acc[0],@acc[0]
900 or @acc[6],@acc[4],@acc[4]
901 or @acc[4],@acc[0],@acc[0]
902 st @acc[0],[%fp+STACK_BIAS-20]
903
904 add $ap_real,0,$bp
905 add %sp,LOCALS+$Z2sqr,$ap
906 call __ecp_nistz256_mul_mont ! p256_mul_mont(U1, in1_x, Z2sqr);
907 add %sp,LOCALS+$U1,$rp
908
909 add $bp_real,0,$bp
910 add %sp,LOCALS+$Z1sqr,$ap
911 call __ecp_nistz256_mul_mont ! p256_mul_mont(U2, in2_x, Z1sqr);
912 add %sp,LOCALS+$U2,$rp
913
914 add %sp,LOCALS+$U1,$bp
915 call __ecp_nistz256_sub_from ! p256_sub(H, U2, U1);
916 add %sp,LOCALS+$H,$rp
917
918 or @acc[1],@acc[0],@acc[0] ! see if result is zero
919 or @acc[3],@acc[2],@acc[2]
920 or @acc[5],@acc[4],@acc[4]
921 or @acc[7],@acc[6],@acc[6]
922 or @acc[2],@acc[0],@acc[0]
923 or @acc[6],@acc[4],@acc[4]
924 orcc @acc[4],@acc[0],@acc[0]
925
926 bne,pt %icc,.Ladd_proceed ! is_equal(U1,U2)?
927 nop
928
929 ld [%fp+STACK_BIAS-12],$t0
930 ld [%fp+STACK_BIAS-16],$t1
931 ld [%fp+STACK_BIAS-20],$t2
932 andcc $t0,$t1,%g0
933 be,pt %icc,.Ladd_proceed ! (in1infty || in2infty)?
934 nop
935 andcc $t2,$t2,%g0
936 be,pt %icc,.Ladd_double ! is_equal(S1,S2)?
937 nop
938
939 ldx [%fp+STACK_BIAS-8],$rp
940 st %g0,[$rp]
941 st %g0,[$rp+4]
942 st %g0,[$rp+8]
943 st %g0,[$rp+12]
944 st %g0,[$rp+16]
945 st %g0,[$rp+20]
946 st %g0,[$rp+24]
947 st %g0,[$rp+28]
948 st %g0,[$rp+32]
949 st %g0,[$rp+32+4]
950 st %g0,[$rp+32+8]
951 st %g0,[$rp+32+12]
952 st %g0,[$rp+32+16]
953 st %g0,[$rp+32+20]
954 st %g0,[$rp+32+24]
955 st %g0,[$rp+32+28]
956 st %g0,[$rp+64]
957 st %g0,[$rp+64+4]
958 st %g0,[$rp+64+8]
959 st %g0,[$rp+64+12]
960 st %g0,[$rp+64+16]
961 st %g0,[$rp+64+20]
962 st %g0,[$rp+64+24]
963 st %g0,[$rp+64+28]
964 b .Ladd_done
965 nop
966
967.align 16
968.Ladd_double:
969 ldx [%fp+STACK_BIAS-8],$rp_real
970 mov $ap_real,$ap
971 b .Lpoint_double_shortcut
972 add %sp,32*(12-4)+32,%sp ! difference in frame sizes
973
974.align 16
975.Ladd_proceed:
976 add %sp,LOCALS+$R,$bp
977 add %sp,LOCALS+$R,$ap
978 call __ecp_nistz256_mul_mont ! p256_sqr_mont(Rsqr, R);
979 add %sp,LOCALS+$Rsqr,$rp
980
981 add $ap_real,64,$bp
982 add %sp,LOCALS+$H,$ap
983 call __ecp_nistz256_mul_mont ! p256_mul_mont(res_z, H, in1_z);
984 add %sp,LOCALS+$res_z,$rp
985
986 add %sp,LOCALS+$H,$bp
987 add %sp,LOCALS+$H,$ap
988 call __ecp_nistz256_mul_mont ! p256_sqr_mont(Hsqr, H);
989 add %sp,LOCALS+$Hsqr,$rp
990
991 add $bp_real,64,$bp
992 add %sp,LOCALS+$res_z,$ap
993 call __ecp_nistz256_mul_mont ! p256_mul_mont(res_z, res_z, in2_z);
994 add %sp,LOCALS+$res_z,$rp
995
996 add %sp,LOCALS+$H,$bp
997 add %sp,LOCALS+$Hsqr,$ap
998 call __ecp_nistz256_mul_mont ! p256_mul_mont(Hcub, Hsqr, H);
999 add %sp,LOCALS+$Hcub,$rp
1000
1001 add %sp,LOCALS+$U1,$bp
1002 add %sp,LOCALS+$Hsqr,$ap
1003 call __ecp_nistz256_mul_mont ! p256_mul_mont(U2, U1, Hsqr);
1004 add %sp,LOCALS+$U2,$rp
1005
1006 call __ecp_nistz256_mul_by_2 ! p256_mul_by_2(Hsqr, U2);
1007 add %sp,LOCALS+$Hsqr,$rp
1008
1009 add %sp,LOCALS+$Rsqr,$bp
1010 call __ecp_nistz256_sub_morf ! p256_sub(res_x, Rsqr, Hsqr);
1011 add %sp,LOCALS+$res_x,$rp
1012
1013 add %sp,LOCALS+$Hcub,$bp
1014 call __ecp_nistz256_sub_from ! p256_sub(res_x, res_x, Hcub);
1015 add %sp,LOCALS+$res_x,$rp
1016
1017 add %sp,LOCALS+$U2,$bp
1018 call __ecp_nistz256_sub_morf ! p256_sub(res_y, U2, res_x);
1019 add %sp,LOCALS+$res_y,$rp
1020
1021 add %sp,LOCALS+$Hcub,$bp
1022 add %sp,LOCALS+$S1,$ap
1023 call __ecp_nistz256_mul_mont ! p256_mul_mont(S2, S1, Hcub);
1024 add %sp,LOCALS+$S2,$rp
1025
1026 add %sp,LOCALS+$R,$bp
1027 add %sp,LOCALS+$res_y,$ap
1028 call __ecp_nistz256_mul_mont ! p256_mul_mont(res_y, res_y, R);
1029 add %sp,LOCALS+$res_y,$rp
1030
1031 add %sp,LOCALS+$S2,$bp
1032 call __ecp_nistz256_sub_from ! p256_sub(res_y, res_y, S2);
1033 add %sp,LOCALS+$res_y,$rp
1034
1035 ld [%fp+STACK_BIAS-16],$t1 ! !in1infty
1036 ld [%fp+STACK_BIAS-12],$t2 ! !in2infty
1037 ldx [%fp+STACK_BIAS-8],$rp
1038___
1039for($i=0;$i<96;$i+=8) { # conditional moves
1040$code.=<<___;
1041 ld [%sp+LOCALS+$i],@acc[0] ! res
1042 ld [%sp+LOCALS+$i+4],@acc[1]
1043 ld [$bp_real+$i],@acc[2] ! in2
1044 ld [$bp_real+$i+4],@acc[3]
1045 ld [$ap_real+$i],@acc[4] ! in1
1046 ld [$ap_real+$i+4],@acc[5]
1047 movrz $t1,@acc[2],@acc[0]
1048 movrz $t1,@acc[3],@acc[1]
1049 movrz $t2,@acc[4],@acc[0]
1050 movrz $t2,@acc[5],@acc[1]
1051 st @acc[0],[$rp+$i]
1052 st @acc[1],[$rp+$i+4]
1053___
1054}
1055$code.=<<___;
1056.Ladd_done:
1057 ret
1058 restore
1059.type ecp_nistz256_point_add,#function
1060.size ecp_nistz256_point_add,.-ecp_nistz256_point_add
1061___
1062}
1063
1064########################################################################
1065# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
1066# const P256_POINT_AFFINE *in2);
1067{
1068my ($res_x,$res_y,$res_z,
1069 $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..9));
1070my $Z1sqr = $S2;
1071# above map() describes stack layout with 10 temporary
1072# 256-bit vectors on top. Then we reserve some space for
1073# !in1infty, !in2infty, result of check for zero and return pointer.
1074
1075my @ONE_mont=(1,0,0,-1,-1,-1,-2,0);
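# For illustration (not in the original commentary): @ONE_mont is 1 in
# Montgomery form, i.e. 2^256 mod p as little-endian 32-bit words
# (1,0,0,0xffffffff,0xffffffff,0xffffffff,0xfffffffe,0); it supplies the
# implicit Z coordinate of the affine second input when that input is
# copied to the output by the conditional moves at the end.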
1076my $bp_real=$rp_real;
1077
1078$code.=<<___;
1079.globl ecp_nistz256_point_add_affine
1080.align 32
1081ecp_nistz256_point_add_affine:
1082#if 0
1083 SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
1084 ld [%g1],%g1 ! OPENSSL_sparcv9cap_P[0]
1085 and %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK),%g1
1086 cmp %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK)
1087 be ecp_nistz256_point_add_affine_vis3
1088 nop
1089#endif
1090
1091 save %sp,-STACK_FRAME-32*10-32,%sp
1092
1093 stx $rp,[%fp+STACK_BIAS-8] ! off-load $rp
1094 mov $ap,$ap_real
1095 mov $bp,$bp_real
1096
1097 ld [$ap+64],$t0 ! in1_z
1098 ld [$ap+64+4],$t1
1099 ld [$ap+64+8],$t2
1100 ld [$ap+64+12],$t3
1101 ld [$ap+64+16],$t4
1102 ld [$ap+64+20],$t5
1103 ld [$ap+64+24],$t6
1104 ld [$ap+64+28],$t7
1105 or $t1,$t0,$t0
1106 or $t3,$t2,$t2
1107 or $t5,$t4,$t4
1108 or $t7,$t6,$t6
1109 or $t2,$t0,$t0
1110 or $t6,$t4,$t4
1111 or $t4,$t0,$t0 ! !in1infty
1112 movrnz $t0,-1,$t0
1113 st $t0,[%fp+STACK_BIAS-16]
1114
1115 ld [$bp],@acc[0] ! in2_x
1116 ld [$bp+4],@acc[1]
1117 ld [$bp+8],@acc[2]
1118 ld [$bp+12],@acc[3]
1119 ld [$bp+16],@acc[4]
1120 ld [$bp+20],@acc[5]
1121 ld [$bp+24],@acc[6]
1122 ld [$bp+28],@acc[7]
1123 ld [$bp+32],$t0 ! in2_y
1124 ld [$bp+32+4],$t1
1125 ld [$bp+32+8],$t2
1126 ld [$bp+32+12],$t3
1127 ld [$bp+32+16],$t4
1128 ld [$bp+32+20],$t5
1129 ld [$bp+32+24],$t6
1130 ld [$bp+32+28],$t7
1131 or @acc[1],@acc[0],@acc[0]
1132 or @acc[3],@acc[2],@acc[2]
1133 or @acc[5],@acc[4],@acc[4]
1134 or @acc[7],@acc[6],@acc[6]
1135 or @acc[2],@acc[0],@acc[0]
1136 or @acc[6],@acc[4],@acc[4]
1137 or @acc[4],@acc[0],@acc[0]
1138 or $t1,$t0,$t0
1139 or $t3,$t2,$t2
1140 or $t5,$t4,$t4
1141 or $t7,$t6,$t6
1142 or $t2,$t0,$t0
1143 or $t6,$t4,$t4
1144 or $t4,$t0,$t0
1145 or @acc[0],$t0,$t0 ! !in2infty
1146 movrnz $t0,-1,$t0
1147 st $t0,[%fp+STACK_BIAS-12]
1148
1149 add $ap_real,64,$bp
1150 add $ap_real,64,$ap
1151 call __ecp_nistz256_mul_mont ! p256_sqr_mont(Z1sqr, in1_z);
1152 add %sp,LOCALS+$Z1sqr,$rp
1153
1154 add $bp_real,0,$bp
1155 add %sp,LOCALS+$Z1sqr,$ap
1156 call __ecp_nistz256_mul_mont ! p256_mul_mont(U2, Z1sqr, in2_x);
1157 add %sp,LOCALS+$U2,$rp
1158
1159 add $ap_real,0,$bp
1160 call __ecp_nistz256_sub_from ! p256_sub(H, U2, in1_x);
1161 add %sp,LOCALS+$H,$rp
1162
1163 add $ap_real,64,$bp
1164 add %sp,LOCALS+$Z1sqr,$ap
1165 call __ecp_nistz256_mul_mont ! p256_mul_mont(S2, Z1sqr, in1_z);
1166 add %sp,LOCALS+$S2,$rp
1167
1168 add $ap_real,64,$bp
1169 add %sp,LOCALS+$H,$ap
1170 call __ecp_nistz256_mul_mont ! p256_mul_mont(res_z, H, in1_z);
1171 add %sp,LOCALS+$res_z,$rp
1172
1173 add $bp_real,32,$bp
1174 add %sp,LOCALS+$S2,$ap
1175 call __ecp_nistz256_mul_mont ! p256_mul_mont(S2, S2, in2_y);
1176 add %sp,LOCALS+$S2,$rp
1177
1178 add $ap_real,32,$bp
1179 call __ecp_nistz256_sub_from ! p256_sub(R, S2, in1_y);
1180 add %sp,LOCALS+$R,$rp
1181
1182 add %sp,LOCALS+$H,$bp
1183 add %sp,LOCALS+$H,$ap
1184 call __ecp_nistz256_mul_mont ! p256_sqr_mont(Hsqr, H);
1185 add %sp,LOCALS+$Hsqr,$rp
1186
1187 add %sp,LOCALS+$R,$bp
1188 add %sp,LOCALS+$R,$ap
1189 call __ecp_nistz256_mul_mont ! p256_sqr_mont(Rsqr, R);
1190 add %sp,LOCALS+$Rsqr,$rp
1191
1192 add %sp,LOCALS+$H,$bp
1193 add %sp,LOCALS+$Hsqr,$ap
1194 call __ecp_nistz256_mul_mont ! p256_mul_mont(Hcub, Hsqr, H);
1195 add %sp,LOCALS+$Hcub,$rp
1196
1197 add $ap_real,0,$bp
1198 add %sp,LOCALS+$Hsqr,$ap
1199 call __ecp_nistz256_mul_mont ! p256_mul_mont(U2, in1_x, Hsqr);
1200 add %sp,LOCALS+$U2,$rp
1201
1202 call __ecp_nistz256_mul_by_2 ! p256_mul_by_2(Hsqr, U2);
1203 add %sp,LOCALS+$Hsqr,$rp
1204
1205 add %sp,LOCALS+$Rsqr,$bp
1206 call __ecp_nistz256_sub_morf ! p256_sub(res_x, Rsqr, Hsqr);
1207 add %sp,LOCALS+$res_x,$rp
1208
1209 add %sp,LOCALS+$Hcub,$bp
1210 call __ecp_nistz256_sub_from ! p256_sub(res_x, res_x, Hcub);
1211 add %sp,LOCALS+$res_x,$rp
1212
1213 add %sp,LOCALS+$U2,$bp
1214 call __ecp_nistz256_sub_morf ! p256_sub(res_y, U2, res_x);
1215 add %sp,LOCALS+$res_y,$rp
1216
1217 add $ap_real,32,$bp
1218 add %sp,LOCALS+$Hcub,$ap
1219 call __ecp_nistz256_mul_mont ! p256_mul_mont(S2, in1_y, Hcub);
1220 add %sp,LOCALS+$S2,$rp
1221
1222 add %sp,LOCALS+$R,$bp
1223 add %sp,LOCALS+$res_y,$ap
1224 call __ecp_nistz256_mul_mont ! p256_mul_mont(res_y, res_y, R);
1225 add %sp,LOCALS+$res_y,$rp
1226
1227 add %sp,LOCALS+$S2,$bp
1228 call __ecp_nistz256_sub_from ! p256_sub(res_y, res_y, S2);
1229 add %sp,LOCALS+$res_y,$rp
1230
1231 ld [%fp+STACK_BIAS-16],$t1 ! !in1infty
1232 ld [%fp+STACK_BIAS-12],$t2 ! !in2infty
1233 ldx [%fp+STACK_BIAS-8],$rp
1234___
1235for($i=0;$i<64;$i+=8) { # conditional moves
1236$code.=<<___;
1237 ld [%sp+LOCALS+$i],@acc[0] ! res
1238 ld [%sp+LOCALS+$i+4],@acc[1]
1239 ld [$bp_real+$i],@acc[2] ! in2
1240 ld [$bp_real+$i+4],@acc[3]
1241 ld [$ap_real+$i],@acc[4] ! in1
1242 ld [$ap_real+$i+4],@acc[5]
1243 movrz $t1,@acc[2],@acc[0]
1244 movrz $t1,@acc[3],@acc[1]
1245 movrz $t2,@acc[4],@acc[0]
1246 movrz $t2,@acc[5],@acc[1]
1247 st @acc[0],[$rp+$i]
1248 st @acc[1],[$rp+$i+4]
1249___
1250}
1251for(;$i<96;$i+=8) {
1252my $j=($i-64)/4;
1253$code.=<<___;
1254 ld [%sp+LOCALS+$i],@acc[0] ! res
1255 ld [%sp+LOCALS+$i+4],@acc[1]
1256 ld [$ap_real+$i],@acc[4] ! in1
1257 ld [$ap_real+$i+4],@acc[5]
1258 movrz $t1,@ONE_mont[$j],@acc[0]
1259 movrz $t1,@ONE_mont[$j+1],@acc[1]
1260 movrz $t2,@acc[4],@acc[0]
1261 movrz $t2,@acc[5],@acc[1]
1262 st @acc[0],[$rp+$i]
1263 st @acc[1],[$rp+$i+4]
1264___
1265}
1266$code.=<<___;
1267 ret
1268 restore
1269.type ecp_nistz256_point_add_affine,#function
1270.size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
1271___
1272} }}}
1273{{{
1274my ($out,$inp,$index)=map("%i$_",(0..2));
1275my $mask="%o0";
1276
1277$code.=<<___;
1278! void ecp_nistz256_select_w5(P256_POINT *%i0,const void *%i1,
1279! int %i2);
1280.globl ecp_nistz256_select_w5
1281.align 32
1282ecp_nistz256_select_w5:
1283 save %sp,-STACK_FRAME,%sp
1284
1285 neg $index,$mask
1286 srax $mask,63,$mask
1287
1288 add $index,$mask,$index
1289 sll $index,2,$index
1290 add $inp,$index,$inp
1291
1292 ld [$inp+64*0],%l0
1293 ld [$inp+64*1],%l1
1294 ld [$inp+64*2],%l2
1295 ld [$inp+64*3],%l3
1296 ld [$inp+64*4],%l4
1297 ld [$inp+64*5],%l5
1298 ld [$inp+64*6],%l6
1299 ld [$inp+64*7],%l7
1300 add $inp,64*8,$inp
1301 and %l0,$mask,%l0
1302 and %l1,$mask,%l1
1303 st %l0,[$out] ! X
1304 and %l2,$mask,%l2
1305 st %l1,[$out+4]
1306 and %l3,$mask,%l3
1307 st %l2,[$out+8]
1308 and %l4,$mask,%l4
1309 st %l3,[$out+12]
1310 and %l5,$mask,%l5
1311 st %l4,[$out+16]
1312 and %l6,$mask,%l6
1313 st %l5,[$out+20]
1314 and %l7,$mask,%l7
1315 st %l6,[$out+24]
1316 st %l7,[$out+28]
1317 add $out,32,$out
1318
1319 ld [$inp+64*0],%l0
1320 ld [$inp+64*1],%l1
1321 ld [$inp+64*2],%l2
1322 ld [$inp+64*3],%l3
1323 ld [$inp+64*4],%l4
1324 ld [$inp+64*5],%l5
1325 ld [$inp+64*6],%l6
1326 ld [$inp+64*7],%l7
1327 add $inp,64*8,$inp
1328 and %l0,$mask,%l0
1329 and %l1,$mask,%l1
1330 st %l0,[$out] ! Y
1331 and %l2,$mask,%l2
1332 st %l1,[$out+4]
1333 and %l3,$mask,%l3
1334 st %l2,[$out+8]
1335 and %l4,$mask,%l4
1336 st %l3,[$out+12]
1337 and %l5,$mask,%l5
1338 st %l4,[$out+16]
1339 and %l6,$mask,%l6
1340 st %l5,[$out+20]
1341 and %l7,$mask,%l7
1342 st %l6,[$out+24]
1343 st %l7,[$out+28]
1344 add $out,32,$out
1345
1346 ld [$inp+64*0],%l0
1347 ld [$inp+64*1],%l1
1348 ld [$inp+64*2],%l2
1349 ld [$inp+64*3],%l3
1350 ld [$inp+64*4],%l4
1351 ld [$inp+64*5],%l5
1352 ld [$inp+64*6],%l6
1353 ld [$inp+64*7],%l7
1354 and %l0,$mask,%l0
1355 and %l1,$mask,%l1
1356 st %l0,[$out] ! Z
1357 and %l2,$mask,%l2
1358 st %l1,[$out+4]
1359 and %l3,$mask,%l3
1360 st %l2,[$out+8]
1361 and %l4,$mask,%l4
1362 st %l3,[$out+12]
1363 and %l5,$mask,%l5
1364 st %l4,[$out+16]
1365 and %l6,$mask,%l6
1366 st %l5,[$out+20]
1367 and %l7,$mask,%l7
1368 st %l6,[$out+24]
1369 st %l7,[$out+28]
1370
1371 ret
1372 restore
1373.type ecp_nistz256_select_w5,#function
1374.size ecp_nistz256_select_w5,.-ecp_nistz256_select_w5
1375
1376! void ecp_nistz256_select_w7(P256_POINT_AFFINE *%i0,const void *%i1,
1377! int %i2);
1378.globl ecp_nistz256_select_w7
1379.align 32
1380ecp_nistz256_select_w7:
1381 save %sp,-STACK_FRAME,%sp
1382
1383 neg $index,$mask
1384 srax $mask,63,$mask
1385
1386 add $index,$mask,$index
1387 add $inp,$index,$inp
1388 mov 64/4,$index
1389
1390.Loop_select_w7:
1391 ldub [$inp+64*0],%l0
1392 prefetch [$inp+3840+64*0],1
1393 subcc $index,1,$index
1394 ldub [$inp+64*1],%l1
1395 prefetch [$inp+3840+64*1],1
1396 ldub [$inp+64*2],%l2
1397 prefetch [$inp+3840+64*2],1
1398 ldub [$inp+64*3],%l3
1399 prefetch [$inp+3840+64*3],1
1400 add $inp,64*4,$inp
1401 sll %l1,8,%l1
1402 sll %l2,16,%l2
1403 or %l0,%l1,%l0
1404 sll %l3,24,%l3
1405 or %l0,%l2,%l0
1406 or %l0,%l3,%l0
1407 and %l0,$mask,%l0
1408 st %l0,[$out]
1409 bne .Loop_select_w7
1410 add $out,4,$out
1411
1412 ret
1413 restore
1414.type ecp_nistz256_select_w7,#function
1415.size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
1416___
1417}}}
1418{{{
1419########################################################################
1420# The following subroutines are VIS3 counterparts of those above that
1421# implement the ones found in ecp_nistz256.c. The key difference is that
1422# they use 128-bit multiplication and addition with 64-bit carry, and in
1423# order to do that they convert from uint32_t[8] to uint64_t[4] on entry
1424# and back on return.
1425#
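# For illustration (not in the original commentary): viewed as four
# 64-bit words, most significant first, the modulus is
#
#	ffffffff00000001.0000000000000000.00000000ffffffff.ffffffffffffffff
#
# The code keeps $poly1 = 0xFFFFFFFF00000000 and $poly3 =
# 0x00000000FFFFFFFE (the second and fourth least significant 64-bit
# words of the negated modulus) in registers; their bitwise complements,
# taken with not where needed, give the corresponding words of the
# modulus itself.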
1426my ($rp,$ap,$bp)=map("%i$_",(0..2));
1427my ($t0,$t1,$t2,$t3,$a0,$a1,$a2,$a3)=map("%l$_",(0..7));
1428my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5)=map("%o$_",(0..5));
1429my ($bi,$poly1,$poly3,$minus1)=(map("%i$_",(3..5)),"%g1");
1430my ($rp_real,$ap_real)=("%g2","%g3");
1431my ($acc6,$acc7)=($bp,$bi); # used in squaring
1432
1433$code.=<<___;
1434#if 0
1435.align 32
1436__ecp_nistz256_mul_by_2_vis3:
1437 addcc $acc0,$acc0,$acc0
1438 addxccc $acc1,$acc1,$acc1
1439 addxccc $acc2,$acc2,$acc2
1440 addxccc $acc3,$acc3,$acc3
1441 b .Lreduce_by_sub_vis3
1442 addxc %g0,%g0,$acc4 ! did it carry?
1443.type __ecp_nistz256_mul_by_2_vis3,#function
1444.size __ecp_nistz256_mul_by_2_vis3,.-__ecp_nistz256_mul_by_2_vis3
1445
1446.align 32
1447__ecp_nistz256_add_vis3:
1448 ldx [$bp+0],$t0
1449 ldx [$bp+8],$t1
1450 ldx [$bp+16],$t2
1451 ldx [$bp+24],$t3
1452
1453__ecp_nistz256_add_noload_vis3:
1454
1455 addcc $t0,$acc0,$acc0
1456 addxccc $t1,$acc1,$acc1
1457 addxccc $t2,$acc2,$acc2
1458 addxccc $t3,$acc3,$acc3
1459 addxc %g0,%g0,$acc4 ! did it carry?
1460
1461.Lreduce_by_sub_vis3:
1462
1463 addcc $acc0,1,$t0 ! add -modulus, i.e. subtract
1464 addxccc $acc1,$poly1,$t1
1465 addxccc $acc2,$minus1,$t2
1466 addxccc $acc3,$poly3,$t3
1467 addxc $acc4,$minus1,$acc4
1468
1469 movrz $acc4,$t0,$acc0 ! ret = borrow ? ret : ret-modulus
1470 movrz $acc4,$t1,$acc1
1471 stx $acc0,[$rp]
1472 movrz $acc4,$t2,$acc2
1473 stx $acc1,[$rp+8]
1474 movrz $acc4,$t3,$acc3
1475 stx $acc2,[$rp+16]
1476 retl
1477 stx $acc3,[$rp+24]
1478.type __ecp_nistz256_add_vis3,#function
1479.size __ecp_nistz256_add_vis3,.-__ecp_nistz256_add_vis3
1480
1481! The trouble with subtraction is that there is no subtraction with
1482! 64-bit borrow, only with a 32-bit one. For this reason we "decompose"
1483! the 64-bit $acc0-$acc3 to 32-bit values and pick b[4] up in 32-bit
1484! pieces. But recall that SPARC is big-endian, which is why you'll
1485! observe that b[4] is accessed as 4-0-12-8-20-16-28-24. And prior to
1486! reduction we "collect" the result back to 64-bit $acc0-$acc3.
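! For illustration (not in the original commentary): each 64-bit word is
! stored big-endian, so its low 32 bits sit at the higher address; loading
! b in the order 4-0-12-8-... therefore yields the 32-bit limbs least
! significant half first, matching the srlx-decomposed $acc registers.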
1487.align 32
1488__ecp_nistz256_sub_from_vis3:
1489 ld [$bp+4],$t0
1490 ld [$bp+0],$t1
1491 ld [$bp+12],$t2
1492 ld [$bp+8],$t3
1493
1494 srlx $acc0,32,$acc4
1495 not $poly1,$poly1
1496 srlx $acc1,32,$acc5
1497 subcc $acc0,$t0,$acc0
1498 ld [$bp+20],$t0
1499 subccc $acc4,$t1,$acc4
1500 ld [$bp+16],$t1
1501 subccc $acc1,$t2,$acc1
1502 ld [$bp+28],$t2
1503 and $acc0,$poly1,$acc0
1504 subccc $acc5,$t3,$acc5
1505 ld [$bp+24],$t3
1506 sllx $acc4,32,$acc4
1507 and $acc1,$poly1,$acc1
1508 sllx $acc5,32,$acc5
1509 or $acc0,$acc4,$acc0
1510 srlx $acc2,32,$acc4
1511 or $acc1,$acc5,$acc1
1512 srlx $acc3,32,$acc5
1513 subccc $acc2,$t0,$acc2
1514 subccc $acc4,$t1,$acc4
1515 subccc $acc3,$t2,$acc3
1516 and $acc2,$poly1,$acc2
1517 subccc $acc5,$t3,$acc5
1518 sllx $acc4,32,$acc4
1519 and $acc3,$poly1,$acc3
1520 sllx $acc5,32,$acc5
1521 or $acc2,$acc4,$acc2
1522 subc %g0,%g0,$acc4 ! did it borrow?
1523 b .Lreduce_by_add_vis3
1524 or $acc3,$acc5,$acc3
1525.type __ecp_nistz256_sub_from_vis3,#function
1526.size __ecp_nistz256_sub_from_vis3,.-__ecp_nistz256_sub_from_vis3
1527
1528.align 32
1529__ecp_nistz256_sub_morf_vis3:
1530 ld [$bp+4],$t0
1531 ld [$bp+0],$t1
1532 ld [$bp+12],$t2
1533 ld [$bp+8],$t3
1534
1535 srlx $acc0,32,$acc4
1536 not $poly1,$poly1
1537 srlx $acc1,32,$acc5
1538 subcc $t0,$acc0,$acc0
1539 ld [$bp+20],$t0
1540 subccc $t1,$acc4,$acc4
1541 ld [$bp+16],$t1
1542 subccc $t2,$acc1,$acc1
1543 ld [$bp+28],$t2
1544 and $acc0,$poly1,$acc0
1545 subccc $t3,$acc5,$acc5
1546 ld [$bp+24],$t3
1547 sllx $acc4,32,$acc4
1548 and $acc1,$poly1,$acc1
1549 sllx $acc5,32,$acc5
1550 or $acc0,$acc4,$acc0
1551 srlx $acc2,32,$acc4
1552 or $acc1,$acc5,$acc1
1553 srlx $acc3,32,$acc5
1554 subccc $t0,$acc2,$acc2
1555 subccc $t1,$acc4,$acc4
1556 subccc $t2,$acc3,$acc3
1557 and $acc2,$poly1,$acc2
1558 subccc $t3,$acc5,$acc5
1559 sllx $acc4,32,$acc4
1560 and $acc3,$poly1,$acc3
1561 sllx $acc5,32,$acc5
1562 or $acc2,$acc4,$acc2
1563 subc %g0,%g0,$acc4 ! did it borrow?
1564 or $acc3,$acc5,$acc3
1565
1566.Lreduce_by_add_vis3:
1567
1568 addcc $acc0,-1,$t0 ! add modulus
1569 not $poly3,$t3
1570 addxccc $acc1,$poly1,$t1
1571 not $poly1,$poly1 ! restore $poly1
1572 addxccc $acc2,%g0,$t2
1573 addxc $acc3,$t3,$t3
1574
1575 movrnz $acc4,$t0,$acc0 ! if a-b borrowed, ret = ret+mod
1576 movrnz $acc4,$t1,$acc1
1577 stx $acc0,[$rp]
1578 movrnz $acc4,$t2,$acc2
1579 stx $acc1,[$rp+8]
1580 movrnz $acc4,$t3,$acc3
1581 stx $acc2,[$rp+16]
1582 retl
1583 stx $acc3,[$rp+24]
1584.type __ecp_nistz256_sub_morf_vis3,#function
1585.size __ecp_nistz256_sub_morf_vis3,.-__ecp_nistz256_sub_morf_vis3
1586
1587.align 32
1588__ecp_nistz256_div_by_2_vis3:
1589 ! ret = (a is odd ? a+mod : a) >> 1
1590
1591 not $poly1,$t1
1592 not $poly3,$t3
1593 and $acc0,1,$acc5
1594 addcc $acc0,-1,$t0 ! add modulus
1595 addxccc $acc1,$t1,$t1
1596 addxccc $acc2,%g0,$t2
1597 addxccc $acc3,$t3,$t3
1598 addxc %g0,%g0,$acc4 ! carry bit
1599
1600 movrnz $acc5,$t0,$acc0
1601 movrnz $acc5,$t1,$acc1
1602 movrnz $acc5,$t2,$acc2
1603 movrnz $acc5,$t3,$acc3
1604 movrz $acc5,%g0,$acc4
1605
1606 ! ret >>= 1
1607
1608 srlx $acc0,1,$acc0
1609 sllx $acc1,63,$t0
1610 srlx $acc1,1,$acc1
1611 or $acc0,$t0,$acc0
1612 sllx $acc2,63,$t1
1613 srlx $acc2,1,$acc2
1614 or $acc1,$t1,$acc1
1615 sllx $acc3,63,$t2
1616 stx $acc0,[$rp]
1617 srlx $acc3,1,$acc3
1618 or $acc2,$t2,$acc2
1619 sllx $acc4,63,$t3 ! don't forget carry bit
1620 stx $acc1,[$rp+8]
1621 or $acc3,$t3,$acc3
1622 stx $acc2,[$rp+16]
1623 retl
1624 stx $acc3,[$rp+24]
1625.type __ecp_nistz256_div_by_2_vis3,#function
1626.size __ecp_nistz256_div_by_2_vis3,.-__ecp_nistz256_div_by_2_vis3
1627
1628! compared to __ecp_nistz256_mul_mont it's almost 4x smaller and
1629! 4x faster [on T4]...
1630.align 32
1631__ecp_nistz256_mul_mont_vis3:
1632 mulx $a0,$bi,$acc0
1633 not $poly3,$poly3 ! 0xFFFFFFFF00000001
1634 umulxhi $a0,$bi,$t0
1635 mulx $a1,$bi,$acc1
1636 umulxhi $a1,$bi,$t1
1637 mulx $a2,$bi,$acc2
1638 umulxhi $a2,$bi,$t2
1639 mulx $a3,$bi,$acc3
1640 umulxhi $a3,$bi,$t3
1641 ldx [$bp+8],$bi ! b[1]
1642
1643 addcc $acc1,$t0,$acc1 ! accumulate high parts of multiplication
1644 sllx $acc0,32,$t0
1645 addxccc $acc2,$t1,$acc2
1646 srlx $acc0,32,$t1
1647 addxccc $acc3,$t2,$acc3
1648 addxc %g0,$t3,$acc4
1649 mov 0,$acc5
1650___
1651for($i=1;$i<4;$i++) {
1652 # Reduction iteration is normally performed by accumulating
1653 # result of multiplication of modulus by "magic" digit [and
1654 # omitting least significant word, which is guaranteed to
1655 # be 0], but thanks to special form of modulus and "magic"
1656 # digit being equal to least significant word, it can be
1657 # performed with additions and subtractions alone. Indeed:
1658 #
1659 # ffff0001.00000000.0000ffff.ffffffff
1660 # * abcdefgh
1661 # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
1662 #
1663 # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
1664 # rewrite above as:
1665 #
1666 # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
1667 # + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000
1668 # - 0000abcd.efgh0000.00000000.00000000.abcdefgh
1669 #
1670 # or marking redundant operations:
1671 #
1672 # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.--------
1673 # + abcdefgh.abcdefgh.0000abcd.efgh0000.--------
1674 # - 0000abcd.efgh0000.--------.--------.--------
1675 # ^^^^^^^^ but this word is calculated with umulxhi, because
1676 # there is no subtract with 64-bit borrow:-(
1677
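	# For illustration (not in the original commentary): in 64-bit terms
	# the step exploits modulus + 1 = 2^96 + 0xFFFFFFFF00000001 * 2^192,
	# so adding acc[0]*(modulus+1) and dropping the now-zero least
	# significant word costs one shifted add of acc[0]<<96 (the
	# sllx/srlx pair) plus a single 64x64->128 multiplication by
	# 0xFFFFFFFF00000001 (the sub/umulxhi pair).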
1678$code.=<<___;
1679 sub $acc0,$t0,$t2 ! acc0*0xFFFFFFFF00000001, low part
1680 umulxhi $acc0,$poly3,$t3 ! acc0*0xFFFFFFFF00000001, high part
1681 addcc $acc1,$t0,$acc0 ! +=acc[0]<<96 and omit acc[0]
1682 mulx $a0,$bi,$t0
1683 addxccc $acc2,$t1,$acc1
1684 mulx $a1,$bi,$t1
1685 addxccc $acc3,$t2,$acc2 ! +=acc[0]*0xFFFFFFFF00000001
1686 mulx $a2,$bi,$t2
1687 addxccc $acc4,$t3,$acc3
1688 mulx $a3,$bi,$t3
1689 addxc $acc5,%g0,$acc4
1690
1691 addcc $acc0,$t0,$acc0 ! accumulate low parts of multiplication
1692 umulxhi $a0,$bi,$t0
1693 addxccc $acc1,$t1,$acc1
1694 umulxhi $a1,$bi,$t1
1695 addxccc $acc2,$t2,$acc2
1696 umulxhi $a2,$bi,$t2
1697 addxccc $acc3,$t3,$acc3
1698 umulxhi $a3,$bi,$t3
1699 addxc $acc4,%g0,$acc4
1700___
1701$code.=<<___ if ($i<3);
1702 ldx [$bp+8*($i+1)],$bi ! bp[$i+1]
1703___
1704$code.=<<___;
1705 addcc $acc1,$t0,$acc1 ! accumulate high parts of multiplication
1706 sllx $acc0,32,$t0
1707 addxccc $acc2,$t1,$acc2
1708 srlx $acc0,32,$t1
1709 addxccc $acc3,$t2,$acc3
1710 addxccc $acc4,$t3,$acc4
1711 addxc %g0,%g0,$acc5
1712___
1713}
1714$code.=<<___;
1715 sub $acc0,$t0,$t2 ! acc0*0xFFFFFFFF00000001, low part
1716 umulxhi $acc0,$poly3,$t3 ! acc0*0xFFFFFFFF00000001, high part
1717 addcc $acc1,$t0,$acc0 ! +=acc[0]<<96 and omit acc[0]
1718 addxccc $acc2,$t1,$acc1
1719 addxccc $acc3,$t2,$acc2 ! +=acc[0]*0xFFFFFFFF00000001
1720 addxccc $acc4,$t3,$acc3
1721 b .Lmul_final_vis3 ! see below
1722 addxc $acc5,%g0,$acc4
1723.type __ecp_nistz256_mul_mont_vis3,#function
1724.size __ecp_nistz256_mul_mont_vis3,.-__ecp_nistz256_mul_mont_vis3
1725
1726! compared to __ecp_nistz256_mul_mont_vis3 above this uses 21% fewer
1727! instructions, but is only 14% faster [on T4]...
1728.align 32
1729__ecp_nistz256_sqr_mont_vis3:
1730 ! | | | | | |a1*a0| |
1731 ! | | | | |a2*a0| | |
1732 ! | |a3*a2|a3*a0| | | |
1733 ! | | | |a2*a1| | | |
1734 ! | | |a3*a1| | | | |
1735 ! *| | | | | | | | 2|
1736 ! +|a3*a3|a2*a2|a1*a1|a0*a0|
1737 ! |--+--+--+--+--+--+--+--|
1738 ! |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
1739 !
1740	! the "can't overflow" marks below denote carries into the high
1741	! part of a multiplication result, which cannot overflow because
1742	! that part can never be all ones.
1743
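	! For illustration (not in the original commentary): the diagram is
	!
	!   a^2 = sum(a[i]^2 * 2^(128*i)) + 2*sum(a[i]*a[j] * 2^(64*(i+j))), i<j
	!
	! the cross products are accumulated first, doubled by the
	! addcc/addxccc chain, and the squares a[i]*a[i] are folded in last.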
1744 mulx $a1,$a0,$acc1 ! a[1]*a[0]
1745 umulxhi $a1,$a0,$t1
1746 mulx $a2,$a0,$acc2 ! a[2]*a[0]
1747 umulxhi $a2,$a0,$t2
1748 mulx $a3,$a0,$acc3 ! a[3]*a[0]
1749 umulxhi $a3,$a0,$acc4
1750
1751 addcc $acc2,$t1,$acc2 ! accumulate high parts of multiplication
1752 mulx $a2,$a1,$t0 ! a[2]*a[1]
1753 umulxhi $a2,$a1,$t1
1754 addxccc $acc3,$t2,$acc3
1755 mulx $a3,$a1,$t2 ! a[3]*a[1]
1756 umulxhi $a3,$a1,$t3
1757 addxc $acc4,%g0,$acc4 ! can't overflow
1758
1759 mulx $a3,$a2,$acc5 ! a[3]*a[2]
1760 not $poly3,$poly3 ! 0xFFFFFFFF00000001
1761 umulxhi $a3,$a2,$acc6
1762
1763 addcc $t2,$t1,$t1 ! accumulate high parts of multiplication
1764 mulx $a0,$a0,$acc0 ! a[0]*a[0]
1765 addxc $t3,%g0,$t2 ! can't overflow
1766
1767 addcc $acc3,$t0,$acc3 ! accumulate low parts of multiplication
1768 umulxhi $a0,$a0,$a0
1769 addxccc $acc4,$t1,$acc4
1770 mulx $a1,$a1,$t1 ! a[1]*a[1]
1771 addxccc $acc5,$t2,$acc5
1772 umulxhi $a1,$a1,$a1
1773 addxc $acc6,%g0,$acc6 ! can't overflow
1774
1775 addcc $acc1,$acc1,$acc1 ! acc[1-6]*=2
1776 mulx $a2,$a2,$t2 ! a[2]*a[2]
1777 addxccc $acc2,$acc2,$acc2
1778 umulxhi $a2,$a2,$a2
1779 addxccc $acc3,$acc3,$acc3
1780 mulx $a3,$a3,$t3 ! a[3]*a[3]
1781 addxccc $acc4,$acc4,$acc4
1782 umulxhi $a3,$a3,$a3
1783 addxccc $acc5,$acc5,$acc5
1784 addxccc $acc6,$acc6,$acc6
1785 addxc %g0,%g0,$acc7
1786
1787 addcc $acc1,$a0,$acc1 ! +a[i]*a[i]
1788 addxccc $acc2,$t1,$acc2
1789 addxccc $acc3,$a1,$acc3
1790 addxccc $acc4,$t2,$acc4
1791 sllx $acc0,32,$t0
1792 addxccc $acc5,$a2,$acc5
1793 srlx $acc0,32,$t1
1794 addxccc $acc6,$t3,$acc6
1795 sub $acc0,$t0,$t2 ! acc0*0xFFFFFFFF00000001, low part
1796 addxc $acc7,$a3,$acc7
1797___
1798for($i=0;$i<3;$i++) { # reductions, see commentary
1799 # in multiplication for details
1800$code.=<<___;
1801 umulxhi $acc0,$poly3,$t3 ! acc0*0xFFFFFFFF00000001, high part
1802 addcc $acc1,$t0,$acc0 ! +=acc[0]<<96 and omit acc[0]
1803 sllx $acc0,32,$t0
1804 addxccc $acc2,$t1,$acc1
1805 srlx $acc0,32,$t1
1806 addxccc $acc3,$t2,$acc2 ! +=acc[0]*0xFFFFFFFF00000001
1807 sub $acc0,$t0,$t2 ! acc0*0xFFFFFFFF00000001, low part
1808 addxc %g0,$t3,$acc3 ! can't overflow
1809___
1810}
1811$code.=<<___;
1812 umulxhi $acc0,$poly3,$t3 ! acc0*0xFFFFFFFF00000001, high part
1813 addcc $acc1,$t0,$acc0 ! +=acc[0]<<96 and omit acc[0]
1814 addxccc $acc2,$t1,$acc1
1815 addxccc $acc3,$t2,$acc2 ! +=acc[0]*0xFFFFFFFF00000001
1816 addxc %g0,$t3,$acc3 ! can't overflow
1817
1818 addcc $acc0,$acc4,$acc0 ! accumulate upper half
1819 addxccc $acc1,$acc5,$acc1
1820 addxccc $acc2,$acc6,$acc2
1821 addxccc $acc3,$acc7,$acc3
1822 addxc %g0,%g0,$acc4
1823
1824.Lmul_final_vis3:
1825
1826 ! Final step is "if result > mod, subtract mod", but as comparison
1827 ! means subtraction, we do the subtraction and then copy outcome
1828	! if it didn't borrow. But note that since we [have to] replace
1829	! the subtraction with an addition of the negated modulus, the
1830	! carry/borrow logic is inverted.
1831
1832 addcc $acc0,1,$t0 ! add -modulus, i.e. subtract
1833 not $poly3,$poly3 ! restore 0x00000000FFFFFFFE
1834 addxccc $acc1,$poly1,$t1
1835 addxccc $acc2,$minus1,$t2
1836 addxccc $acc3,$poly3,$t3
1837 addxccc $acc4,$minus1,%g0 ! did it carry?
1838
1839 movcs %xcc,$t0,$acc0
1840 movcs %xcc,$t1,$acc1
1841 stx $acc0,[$rp]
1842 movcs %xcc,$t2,$acc2
1843 stx $acc1,[$rp+8]
1844 movcs %xcc,$t3,$acc3
1845 stx $acc2,[$rp+16]
1846 retl
1847 stx $acc3,[$rp+24]
1848.type __ecp_nistz256_sqr_mont_vis3,#function
1849.size __ecp_nistz256_sqr_mont_vis3,.-__ecp_nistz256_sqr_mont_vis3
1850___
1851
1852########################################################################
1853# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
1854#
1855{
1856my ($res_x,$res_y,$res_z,
1857 $in_x,$in_y,$in_z,
1858 $S,$M,$Zsqr,$tmp0)=map(32*$_,(0..9));
1859# above map() describes stack layout with 10 temporary
1860# 256-bit vectors on top.
1861
1862$code.=<<___;
1863.align 32
1864ecp_nistz256_point_double_vis3:
1865 save %sp,-STACK64_FRAME-32*10,%sp
1866
1867 mov $rp,$rp_real
1868.Ldouble_shortcut_vis3:
1869 mov -1,$minus1
1870 mov -2,$poly3
1871 sllx $minus1,32,$poly1 ! 0xFFFFFFFF00000000
1872 srl $poly3,0,$poly3 ! 0x00000000FFFFFFFE
1873
1874 ! convert input to uint64_t[4]
1875 ld [$ap],$a0 ! in_x
1876 ld [$ap+4],$t0
1877 ld [$ap+8],$a1
1878 ld [$ap+12],$t1
1879 ld [$ap+16],$a2
1880 ld [$ap+20],$t2
1881 ld [$ap+24],$a3
1882 ld [$ap+28],$t3
1883 sllx $t0,32,$t0
1884 sllx $t1,32,$t1
1885 ld [$ap+32],$acc0 ! in_y
1886 or $a0,$t0,$a0
1887 ld [$ap+32+4],$t0
1888 sllx $t2,32,$t2
1889 ld [$ap+32+8],$acc1
1890 or $a1,$t1,$a1
1891 ld [$ap+32+12],$t1
1892 sllx $t3,32,$t3
1893 ld [$ap+32+16],$acc2
1894 or $a2,$t2,$a2
1895 ld [$ap+32+20],$t2
1896 or $a3,$t3,$a3
1897 ld [$ap+32+24],$acc3
1898 sllx $t0,32,$t0
1899 ld [$ap+32+28],$t3
1900 sllx $t1,32,$t1
1901 stx $a0,[%sp+LOCALS64+$in_x]
1902 sllx $t2,32,$t2
1903 stx $a1,[%sp+LOCALS64+$in_x+8]
1904 sllx $t3,32,$t3
1905 stx $a2,[%sp+LOCALS64+$in_x+16]
1906 or $acc0,$t0,$acc0
1907 stx $a3,[%sp+LOCALS64+$in_x+24]
1908 or $acc1,$t1,$acc1
1909 stx $acc0,[%sp+LOCALS64+$in_y]
1910 or $acc2,$t2,$acc2
1911 stx $acc1,[%sp+LOCALS64+$in_y+8]
1912 or $acc3,$t3,$acc3
1913 stx $acc2,[%sp+LOCALS64+$in_y+16]
1914 stx $acc3,[%sp+LOCALS64+$in_y+24]
1915
1916 ld [$ap+64],$a0 ! in_z
1917 ld [$ap+64+4],$t0
1918 ld [$ap+64+8],$a1
1919 ld [$ap+64+12],$t1
1920 ld [$ap+64+16],$a2
1921 ld [$ap+64+20],$t2
1922 ld [$ap+64+24],$a3
1923 ld [$ap+64+28],$t3
1924 sllx $t0,32,$t0
1925 sllx $t1,32,$t1
1926 or $a0,$t0,$a0
1927 sllx $t2,32,$t2
1928 or $a1,$t1,$a1
1929 sllx $t3,32,$t3
1930 or $a2,$t2,$a2
1931 or $a3,$t3,$a3
1932 sllx $t0,32,$t0
1933 sllx $t1,32,$t1
1934 stx $a0,[%sp+LOCALS64+$in_z]
1935 sllx $t2,32,$t2
1936 stx $a1,[%sp+LOCALS64+$in_z+8]
1937 sllx $t3,32,$t3
1938 stx $a2,[%sp+LOCALS64+$in_z+16]
1939 stx $a3,[%sp+LOCALS64+$in_z+24]
1940
1941 ! in_y is still in $acc0-$acc3
1942 call __ecp_nistz256_mul_by_2_vis3 ! p256_mul_by_2(S, in_y);
1943 add %sp,LOCALS64+$S,$rp
1944
1945 ! in_z is still in $a0-$a3
1946 call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Zsqr, in_z);
1947 add %sp,LOCALS64+$Zsqr,$rp
1948
1949 mov $acc0,$a0 ! put Zsqr aside
1950 mov $acc1,$a1
1951 mov $acc2,$a2
1952 mov $acc3,$a3
1953
1954 add %sp,LOCALS64+$in_x,$bp
1955 call __ecp_nistz256_add_vis3 ! p256_add(M, Zsqr, in_x);
1956 add %sp,LOCALS64+$M,$rp
1957
1958 mov $a0,$acc0 ! restore Zsqr
1959 ldx [%sp+LOCALS64+$S],$a0 ! forward load
1960 mov $a1,$acc1
1961 ldx [%sp+LOCALS64+$S+8],$a1
1962 mov $a2,$acc2
1963 ldx [%sp+LOCALS64+$S+16],$a2
1964 mov $a3,$acc3
1965 ldx [%sp+LOCALS64+$S+24],$a3
1966
1967 add %sp,LOCALS64+$in_x,$bp
1968 call __ecp_nistz256_sub_morf_vis3 ! p256_sub(Zsqr, in_x, Zsqr);
1969 add %sp,LOCALS64+$Zsqr,$rp
1970
1971 call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(S, S);
1972 add %sp,LOCALS64+$S,$rp
1973
1974 ldx [%sp+LOCALS64+$in_z],$bi
1975 ldx [%sp+LOCALS64+$in_y],$a0
1976 ldx [%sp+LOCALS64+$in_y+8],$a1
1977 ldx [%sp+LOCALS64+$in_y+16],$a2
1978 ldx [%sp+LOCALS64+$in_y+24],$a3
1979 add %sp,LOCALS64+$in_z,$bp
1980 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(tmp0, in_z, in_y);
1981 add %sp,LOCALS64+$tmp0,$rp
1982
1983 ldx [%sp+LOCALS64+$M],$bi ! forward load
1984 ldx [%sp+LOCALS64+$Zsqr],$a0
1985 ldx [%sp+LOCALS64+$Zsqr+8],$a1
1986 ldx [%sp+LOCALS64+$Zsqr+16],$a2
1987 ldx [%sp+LOCALS64+$Zsqr+24],$a3
1988
1989 call __ecp_nistz256_mul_by_2_vis3 ! p256_mul_by_2(res_z, tmp0);
1990 add %sp,LOCALS64+$res_z,$rp
1991
1992 add %sp,LOCALS64+$M,$bp
1993 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(M, M, Zsqr);
1994 add %sp,LOCALS64+$M,$rp
1995
1996 mov $acc0,$a0 ! put aside M
1997 mov $acc1,$a1
1998 mov $acc2,$a2
1999 mov $acc3,$a3
2000 call __ecp_nistz256_mul_by_2_vis3
2001 add %sp,LOCALS64+$M,$rp
2002 mov $a0,$t0 ! copy M
2003 ldx [%sp+LOCALS64+$S],$a0 ! forward load
2004 mov $a1,$t1
2005 ldx [%sp+LOCALS64+$S+8],$a1
2006 mov $a2,$t2
2007 ldx [%sp+LOCALS64+$S+16],$a2
2008 mov $a3,$t3
2009 ldx [%sp+LOCALS64+$S+24],$a3
2010 call __ecp_nistz256_add_noload_vis3 ! p256_mul_by_3(M, M);
2011 add %sp,LOCALS64+$M,$rp
2012
2013 call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(tmp0, S);
2014 add %sp,LOCALS64+$tmp0,$rp
2015
2016 ldx [%sp+LOCALS64+$S],$bi ! forward load
2017 ldx [%sp+LOCALS64+$in_x],$a0
2018 ldx [%sp+LOCALS64+$in_x+8],$a1
2019 ldx [%sp+LOCALS64+$in_x+16],$a2
2020 ldx [%sp+LOCALS64+$in_x+24],$a3
2021
2022 call __ecp_nistz256_div_by_2_vis3 ! p256_div_by_2(res_y, tmp0);
2023 add %sp,LOCALS64+$res_y,$rp
2024
2025 add %sp,LOCALS64+$S,$bp
2026 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S, S, in_x);
2027 add %sp,LOCALS64+$S,$rp
2028
2029 ldx [%sp+LOCALS64+$M],$a0 ! forward load
2030 ldx [%sp+LOCALS64+$M+8],$a1
2031 ldx [%sp+LOCALS64+$M+16],$a2
2032 ldx [%sp+LOCALS64+$M+24],$a3
2033
2034 call __ecp_nistz256_mul_by_2_vis3 ! p256_mul_by_2(tmp0, S);
2035 add %sp,LOCALS64+$tmp0,$rp
2036
2037 call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(res_x, M);
2038 add %sp,LOCALS64+$res_x,$rp
2039
2040 add %sp,LOCALS64+$tmp0,$bp
2041 call __ecp_nistz256_sub_from_vis3 ! p256_sub(res_x, res_x, tmp0);
2042 add %sp,LOCALS64+$res_x,$rp
2043
2044 ldx [%sp+LOCALS64+$M],$a0 ! forward load
2045 ldx [%sp+LOCALS64+$M+8],$a1
2046 ldx [%sp+LOCALS64+$M+16],$a2
2047 ldx [%sp+LOCALS64+$M+24],$a3
2048
2049 add %sp,LOCALS64+$S,$bp
2050 call __ecp_nistz256_sub_morf_vis3 ! p256_sub(S, S, res_x);
2051 add %sp,LOCALS64+$S,$rp
2052
2053 mov $acc0,$bi
2054 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S, S, M);
2055 add %sp,LOCALS64+$S,$rp
2056
2057 ldx [%sp+LOCALS64+$res_x],$a0 ! forward load
2058 ldx [%sp+LOCALS64+$res_x+8],$a1
2059 ldx [%sp+LOCALS64+$res_x+16],$a2
2060 ldx [%sp+LOCALS64+$res_x+24],$a3
2061
2062 add %sp,LOCALS64+$res_y,$bp
2063 call __ecp_nistz256_sub_from_vis3 ! p256_sub(res_y, S, res_y);
2064	 add	%sp,LOCALS64+$res_y,$rp
2065
2066	 ! convert output to uint32_t[8]
2067 srlx $a0,32,$t0
2068 srlx $a1,32,$t1
2069 st $a0,[$rp_real] ! res_x
2070 srlx $a2,32,$t2
2071 st $t0,[$rp_real+4]
2072 srlx $a3,32,$t3
2073 st $a1,[$rp_real+8]
2074 st $t1,[$rp_real+12]
2075 st $a2,[$rp_real+16]
2076 st $t2,[$rp_real+20]
2077 st $a3,[$rp_real+24]
2078 st $t3,[$rp_real+28]
2079
2080 ldx [%sp+LOCALS64+$res_z],$a0 ! forward load
2081 srlx $acc0,32,$t0
2082 ldx [%sp+LOCALS64+$res_z+8],$a1
2083 srlx $acc1,32,$t1
2084 ldx [%sp+LOCALS64+$res_z+16],$a2
2085 srlx $acc2,32,$t2
2086 ldx [%sp+LOCALS64+$res_z+24],$a3
2087 srlx $acc3,32,$t3
2088 st $acc0,[$rp_real+32] ! res_y
2089 st $t0, [$rp_real+32+4]
2090 st $acc1,[$rp_real+32+8]
2091 st $t1, [$rp_real+32+12]
2092 st $acc2,[$rp_real+32+16]
2093 st $t2, [$rp_real+32+20]
2094 st $acc3,[$rp_real+32+24]
2095 st $t3, [$rp_real+32+28]
2096
2097 srlx $a0,32,$t0
2098 srlx $a1,32,$t1
2099 st $a0,[$rp_real+64] ! res_z
2100 srlx $a2,32,$t2
2101 st $t0,[$rp_real+64+4]
2102 srlx $a3,32,$t3
2103 st $a1,[$rp_real+64+8]
2104 st $t1,[$rp_real+64+12]
2105 st $a2,[$rp_real+64+16]
2106 st $t2,[$rp_real+64+20]
2107 st $a3,[$rp_real+64+24]
2108 st $t3,[$rp_real+64+28]
2109
2110 ret
2111 restore
2112.type ecp_nistz256_point_double_vis3,#function
2113.size ecp_nistz256_point_double_vis3,.-ecp_nistz256_point_double_vis3
2114___
2115}
2116########################################################################
2117# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
2118# const P256_POINT *in2);
2119{
2120my ($res_x,$res_y,$res_z,
2121 $in1_x,$in1_y,$in1_z,
2122 $in2_x,$in2_y,$in2_z,
2123 $H,$Hsqr,$R,$Rsqr,$Hcub,
2124 $U1,$U2,$S1,$S2)=map(32*$_,(0..17));
2125my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
2126
2127# above map() describes stack layout with 18 temporary
2128# 256-bit vectors on top. Then we reserve some space for
2129# !in1infty, !in2infty and result of check for zero.
2130
2131$code.=<<___;
2132.globl ecp_nistz256_point_add_vis3
2133.align 32
2134ecp_nistz256_point_add_vis3:
2135 save %sp,-STACK64_FRAME-32*18-32,%sp
2136
2137 mov $rp,$rp_real
2138 mov -1,$minus1
2139 mov -2,$poly3
2140 sllx $minus1,32,$poly1 ! 0xFFFFFFFF00000000
2141 srl $poly3,0,$poly3 ! 0x00000000FFFFFFFE
2142
2143 ! convert input to uint64_t[4]
2144 ld [$bp],$a0 ! in2_x
2145 ld [$bp+4],$t0
2146 ld [$bp+8],$a1
2147 ld [$bp+12],$t1
2148 ld [$bp+16],$a2
2149 ld [$bp+20],$t2
2150 ld [$bp+24],$a3
2151 ld [$bp+28],$t3
2152 sllx $t0,32,$t0
2153 sllx $t1,32,$t1
2154 ld [$bp+32],$acc0 ! in2_y
2155 or $a0,$t0,$a0
2156 ld [$bp+32+4],$t0
2157 sllx $t2,32,$t2
2158 ld [$bp+32+8],$acc1
2159 or $a1,$t1,$a1
2160 ld [$bp+32+12],$t1
2161 sllx $t3,32,$t3
2162 ld [$bp+32+16],$acc2
2163 or $a2,$t2,$a2
2164 ld [$bp+32+20],$t2
2165 or $a3,$t3,$a3
2166 ld [$bp+32+24],$acc3
2167 sllx $t0,32,$t0
2168 ld [$bp+32+28],$t3
2169 sllx $t1,32,$t1
2170 stx $a0,[%sp+LOCALS64+$in2_x]
2171 sllx $t2,32,$t2
2172 stx $a1,[%sp+LOCALS64+$in2_x+8]
2173 sllx $t3,32,$t3
2174 stx $a2,[%sp+LOCALS64+$in2_x+16]
2175 or $acc0,$t0,$acc0
2176 stx $a3,[%sp+LOCALS64+$in2_x+24]
2177 or $acc1,$t1,$acc1
2178 stx $acc0,[%sp+LOCALS64+$in2_y]
2179 or $acc2,$t2,$acc2
2180 stx $acc1,[%sp+LOCALS64+$in2_y+8]
2181 or $acc3,$t3,$acc3
2182 stx $acc2,[%sp+LOCALS64+$in2_y+16]
2183 stx $acc3,[%sp+LOCALS64+$in2_y+24]
2184
2185 ld [$bp+64],$acc0 ! in2_z
2186 ld [$bp+64+4],$t0
2187 ld [$bp+64+8],$acc1
2188 ld [$bp+64+12],$t1
2189 ld [$bp+64+16],$acc2
2190 ld [$bp+64+20],$t2
2191 ld [$bp+64+24],$acc3
2192 ld [$bp+64+28],$t3
2193 sllx $t0,32,$t0
2194 sllx $t1,32,$t1
2195 ld [$ap],$a0 ! in1_x
2196 or $acc0,$t0,$acc0
2197 ld [$ap+4],$t0
2198 sllx $t2,32,$t2
2199 ld [$ap+8],$a1
2200 or $acc1,$t1,$acc1
2201 ld [$ap+12],$t1
2202 sllx $t3,32,$t3
2203 ld [$ap+16],$a2
2204 or $acc2,$t2,$acc2
2205 ld [$ap+20],$t2
2206 or $acc3,$t3,$acc3
2207 ld [$ap+24],$a3
2208 sllx $t0,32,$t0
2209 ld [$ap+28],$t3
2210 sllx $t1,32,$t1
2211 stx $acc0,[%sp+LOCALS64+$in2_z]
2212 sllx $t2,32,$t2
2213 stx $acc1,[%sp+LOCALS64+$in2_z+8]
2214 sllx $t3,32,$t3
2215 stx $acc2,[%sp+LOCALS64+$in2_z+16]
2216 stx $acc3,[%sp+LOCALS64+$in2_z+24]
2217
2218 or $acc1,$acc0,$acc0
2219 or $acc3,$acc2,$acc2
2220 or $acc2,$acc0,$acc0
2221 movrnz $acc0,-1,$acc0 ! !in2infty
2222 stx $acc0,[%fp+STACK_BIAS-8]
2223
2224 or $a0,$t0,$a0
2225 ld [$ap+32],$acc0 ! in1_y
2226 or $a1,$t1,$a1
2227 ld [$ap+32+4],$t0
2228 or $a2,$t2,$a2
2229 ld [$ap+32+8],$acc1
2230 or $a3,$t3,$a3
2231 ld [$ap+32+12],$t1
2232 ld [$ap+32+16],$acc2
2233 ld [$ap+32+20],$t2
2234 ld [$ap+32+24],$acc3
2235 sllx $t0,32,$t0
2236 ld [$ap+32+28],$t3
2237 sllx $t1,32,$t1
2238 stx $a0,[%sp+LOCALS64+$in1_x]
2239 sllx $t2,32,$t2
2240 stx $a1,[%sp+LOCALS64+$in1_x+8]
2241 sllx $t3,32,$t3
2242 stx $a2,[%sp+LOCALS64+$in1_x+16]
2243 or $acc0,$t0,$acc0
2244 stx $a3,[%sp+LOCALS64+$in1_x+24]
2245 or $acc1,$t1,$acc1
2246 stx $acc0,[%sp+LOCALS64+$in1_y]
2247 or $acc2,$t2,$acc2
2248 stx $acc1,[%sp+LOCALS64+$in1_y+8]
2249 or $acc3,$t3,$acc3
2250 stx $acc2,[%sp+LOCALS64+$in1_y+16]
2251 stx $acc3,[%sp+LOCALS64+$in1_y+24]
2252
2253 ldx [%sp+LOCALS64+$in2_z],$a0 ! forward load
2254 ldx [%sp+LOCALS64+$in2_z+8],$a1
2255 ldx [%sp+LOCALS64+$in2_z+16],$a2
2256 ldx [%sp+LOCALS64+$in2_z+24],$a3
2257
2258 ld [$ap+64],$acc0 ! in1_z
2259 ld [$ap+64+4],$t0
2260 ld [$ap+64+8],$acc1
2261 ld [$ap+64+12],$t1
2262 ld [$ap+64+16],$acc2
2263 ld [$ap+64+20],$t2
2264 ld [$ap+64+24],$acc3
2265 ld [$ap+64+28],$t3
2266 sllx $t0,32,$t0
2267 sllx $t1,32,$t1
2268 or $acc0,$t0,$acc0
2269 sllx $t2,32,$t2
2270 or $acc1,$t1,$acc1
2271 sllx $t3,32,$t3
2272 stx $acc0,[%sp+LOCALS64+$in1_z]
2273 or $acc2,$t2,$acc2
2274 stx $acc1,[%sp+LOCALS64+$in1_z+8]
2275 or $acc3,$t3,$acc3
2276 stx $acc2,[%sp+LOCALS64+$in1_z+16]
2277 stx $acc3,[%sp+LOCALS64+$in1_z+24]
2278
2279 or $acc1,$acc0,$acc0
2280 or $acc3,$acc2,$acc2
2281 or $acc2,$acc0,$acc0
2282 movrnz $acc0,-1,$acc0 ! !in1infty
2283 stx $acc0,[%fp+STACK_BIAS-16]
2284
2285 call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Z2sqr, in2_z);
2286 add %sp,LOCALS64+$Z2sqr,$rp
2287
2288 ldx [%sp+LOCALS64+$in1_z],$a0
2289 ldx [%sp+LOCALS64+$in1_z+8],$a1
2290 ldx [%sp+LOCALS64+$in1_z+16],$a2
2291 ldx [%sp+LOCALS64+$in1_z+24],$a3
2292 call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Z1sqr, in1_z);
2293 add %sp,LOCALS64+$Z1sqr,$rp
2294
2295 ldx [%sp+LOCALS64+$Z2sqr],$bi
2296 ldx [%sp+LOCALS64+$in2_z],$a0
2297 ldx [%sp+LOCALS64+$in2_z+8],$a1
2298 ldx [%sp+LOCALS64+$in2_z+16],$a2
2299 ldx [%sp+LOCALS64+$in2_z+24],$a3
2300 add %sp,LOCALS64+$Z2sqr,$bp
2301 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S1, Z2sqr, in2_z);
2302 add %sp,LOCALS64+$S1,$rp
2303
2304 ldx [%sp+LOCALS64+$Z1sqr],$bi
2305 ldx [%sp+LOCALS64+$in1_z],$a0
2306 ldx [%sp+LOCALS64+$in1_z+8],$a1
2307 ldx [%sp+LOCALS64+$in1_z+16],$a2
2308 ldx [%sp+LOCALS64+$in1_z+24],$a3
2309 add %sp,LOCALS64+$Z1sqr,$bp
2310 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S2, Z1sqr, in1_z);
2311 add %sp,LOCALS64+$S2,$rp
2312
2313 ldx [%sp+LOCALS64+$S1],$bi
2314 ldx [%sp+LOCALS64+$in1_y],$a0
2315 ldx [%sp+LOCALS64+$in1_y+8],$a1
2316 ldx [%sp+LOCALS64+$in1_y+16],$a2
2317 ldx [%sp+LOCALS64+$in1_y+24],$a3
2318 add %sp,LOCALS64+$S1,$bp
2319 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S1, S1, in1_y);
2320 add %sp,LOCALS64+$S1,$rp
2321
2322 ldx [%sp+LOCALS64+$S2],$bi
2323 ldx [%sp+LOCALS64+$in2_y],$a0
2324 ldx [%sp+LOCALS64+$in2_y+8],$a1
2325 ldx [%sp+LOCALS64+$in2_y+16],$a2
2326 ldx [%sp+LOCALS64+$in2_y+24],$a3
2327 add %sp,LOCALS64+$S2,$bp
2328 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S2, S2, in2_y);
2329 add %sp,LOCALS64+$S2,$rp
2330
2331 ldx [%sp+LOCALS64+$Z2sqr],$bi ! forward load
2332 ldx [%sp+LOCALS64+$in1_x],$a0
2333 ldx [%sp+LOCALS64+$in1_x+8],$a1
2334 ldx [%sp+LOCALS64+$in1_x+16],$a2
2335 ldx [%sp+LOCALS64+$in1_x+24],$a3
2336
2337 add %sp,LOCALS64+$S1,$bp
2338 call __ecp_nistz256_sub_from_vis3 ! p256_sub(R, S2, S1);
2339 add %sp,LOCALS64+$R,$rp
2340
2341 or $acc1,$acc0,$acc0 ! see if result is zero
2342 or $acc3,$acc2,$acc2
2343 or $acc2,$acc0,$acc0
2344 stx $acc0,[%fp+STACK_BIAS-24]
2345
2346 add %sp,LOCALS64+$Z2sqr,$bp
2347 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(U1, in1_x, Z2sqr);
2348 add %sp,LOCALS64+$U1,$rp
2349
2350 ldx [%sp+LOCALS64+$Z1sqr],$bi
2351 ldx [%sp+LOCALS64+$in2_x],$a0
2352 ldx [%sp+LOCALS64+$in2_x+8],$a1
2353 ldx [%sp+LOCALS64+$in2_x+16],$a2
2354 ldx [%sp+LOCALS64+$in2_x+24],$a3
2355 add %sp,LOCALS64+$Z1sqr,$bp
2356 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(U2, in2_x, Z1sqr);
2357 add %sp,LOCALS64+$U2,$rp
2358
2359 ldx [%sp+LOCALS64+$R],$a0 ! forward load
2360 ldx [%sp+LOCALS64+$R+8],$a1
2361 ldx [%sp+LOCALS64+$R+16],$a2
2362 ldx [%sp+LOCALS64+$R+24],$a3
2363
2364 add %sp,LOCALS64+$U1,$bp
2365 call __ecp_nistz256_sub_from_vis3 ! p256_sub(H, U2, U1);
2366 add %sp,LOCALS64+$H,$rp
2367
2368 or $acc1,$acc0,$acc0 ! see if result is zero
2369 or $acc3,$acc2,$acc2
2370 orcc $acc2,$acc0,$acc0
2371
2372 bne,pt %xcc,.Ladd_proceed_vis3 ! is_equal(U1,U2)?
2373 nop
2374
2375 ldx [%fp+STACK_BIAS-8],$t0
2376 ldx [%fp+STACK_BIAS-16],$t1
2377 ldx [%fp+STACK_BIAS-24],$t2
2378 andcc $t0,$t1,%g0
2379 be,pt %xcc,.Ladd_proceed_vis3 ! (in1infty || in2infty)?
2380 nop
2381 andcc $t2,$t2,%g0
2382 be,a,pt %xcc,.Ldouble_shortcut_vis3 ! is_equal(S1,S2)?
2383 add %sp,32*(12-10)+32,%sp ! difference in frame sizes
2384
2385 st %g0,[$rp_real]
2386 st %g0,[$rp_real+4]
2387 st %g0,[$rp_real+8]
2388 st %g0,[$rp_real+12]
2389 st %g0,[$rp_real+16]
2390 st %g0,[$rp_real+20]
2391 st %g0,[$rp_real+24]
2392 st %g0,[$rp_real+28]
2393 st %g0,[$rp_real+32]
2394 st %g0,[$rp_real+32+4]
2395 st %g0,[$rp_real+32+8]
2396 st %g0,[$rp_real+32+12]
2397 st %g0,[$rp_real+32+16]
2398 st %g0,[$rp_real+32+20]
2399 st %g0,[$rp_real+32+24]
2400 st %g0,[$rp_real+32+28]
2401 st %g0,[$rp_real+64]
2402 st %g0,[$rp_real+64+4]
2403 st %g0,[$rp_real+64+8]
2404 st %g0,[$rp_real+64+12]
2405 st %g0,[$rp_real+64+16]
2406 st %g0,[$rp_real+64+20]
2407 st %g0,[$rp_real+64+24]
2408 st %g0,[$rp_real+64+28]
2409 b .Ladd_done_vis3
2410 nop
2411
2412.align 16
2413.Ladd_proceed_vis3:
2414 call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Rsqr, R);
2415 add %sp,LOCALS64+$Rsqr,$rp
2416
2417 ldx [%sp+LOCALS64+$H],$bi
2418 ldx [%sp+LOCALS64+$in1_z],$a0
2419 ldx [%sp+LOCALS64+$in1_z+8],$a1
2420 ldx [%sp+LOCALS64+$in1_z+16],$a2
2421 ldx [%sp+LOCALS64+$in1_z+24],$a3
2422 add %sp,LOCALS64+$H,$bp
2423 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(res_z, H, in1_z);
2424 add %sp,LOCALS64+$res_z,$rp
2425
2426 ldx [%sp+LOCALS64+$H],$a0
2427 ldx [%sp+LOCALS64+$H+8],$a1
2428 ldx [%sp+LOCALS64+$H+16],$a2
2429 ldx [%sp+LOCALS64+$H+24],$a3
2430 call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Hsqr, H);
2431 add %sp,LOCALS64+$Hsqr,$rp
2432
2433 ldx [%sp+LOCALS64+$res_z],$bi
2434 ldx [%sp+LOCALS64+$in2_z],$a0
2435 ldx [%sp+LOCALS64+$in2_z+8],$a1
2436 ldx [%sp+LOCALS64+$in2_z+16],$a2
2437 ldx [%sp+LOCALS64+$in2_z+24],$a3
2438 add %sp,LOCALS64+$res_z,$bp
2439 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(res_z, res_z, in2_z);
2440 add %sp,LOCALS64+$res_z,$rp
2441
2442 ldx [%sp+LOCALS64+$H],$bi
2443 ldx [%sp+LOCALS64+$Hsqr],$a0
2444 ldx [%sp+LOCALS64+$Hsqr+8],$a1
2445 ldx [%sp+LOCALS64+$Hsqr+16],$a2
2446 ldx [%sp+LOCALS64+$Hsqr+24],$a3
2447 add %sp,LOCALS64+$H,$bp
2448 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(Hcub, Hsqr, H);
2449 add %sp,LOCALS64+$Hcub,$rp
2450
2451 ldx [%sp+LOCALS64+$U1],$bi
2452 ldx [%sp+LOCALS64+$Hsqr],$a0
2453 ldx [%sp+LOCALS64+$Hsqr+8],$a1
2454 ldx [%sp+LOCALS64+$Hsqr+16],$a2
2455 ldx [%sp+LOCALS64+$Hsqr+24],$a3
2456 add %sp,LOCALS64+$U1,$bp
2457 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(U2, U1, Hsqr);
2458 add %sp,LOCALS64+$U2,$rp
2459
2460 call __ecp_nistz256_mul_by_2_vis3 ! p256_mul_by_2(Hsqr, U2);
2461 add %sp,LOCALS64+$Hsqr,$rp
2462
2463 add %sp,LOCALS64+$Rsqr,$bp
2464 call __ecp_nistz256_sub_morf_vis3 ! p256_sub(res_x, Rsqr, Hsqr);
2465 add %sp,LOCALS64+$res_x,$rp
2466
2467 add %sp,LOCALS64+$Hcub,$bp
2468 call __ecp_nistz256_sub_from_vis3 ! p256_sub(res_x, res_x, Hcub);
2469 add %sp,LOCALS64+$res_x,$rp
2470
2471 ldx [%sp+LOCALS64+$S1],$bi ! forward load
2472 ldx [%sp+LOCALS64+$Hcub],$a0
2473 ldx [%sp+LOCALS64+$Hcub+8],$a1
2474 ldx [%sp+LOCALS64+$Hcub+16],$a2
2475 ldx [%sp+LOCALS64+$Hcub+24],$a3
2476
2477 add %sp,LOCALS64+$U2,$bp
2478 call __ecp_nistz256_sub_morf_vis3 ! p256_sub(res_y, U2, res_x);
2479 add %sp,LOCALS64+$res_y,$rp
2480
2481 add %sp,LOCALS64+$S1,$bp
2482 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S2, S1, Hcub);
2483 add %sp,LOCALS64+$S2,$rp
2484
2485 ldx [%sp+LOCALS64+$R],$bi
2486 ldx [%sp+LOCALS64+$res_y],$a0
2487 ldx [%sp+LOCALS64+$res_y+8],$a1
2488 ldx [%sp+LOCALS64+$res_y+16],$a2
2489 ldx [%sp+LOCALS64+$res_y+24],$a3
2490 add %sp,LOCALS64+$R,$bp
2491 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(res_y, res_y, R);
2492 add %sp,LOCALS64+$res_y,$rp
2493
2494 add %sp,LOCALS64+$S2,$bp
2495 call __ecp_nistz256_sub_from_vis3 ! p256_sub(res_y, res_y, S2);
2496 add %sp,LOCALS64+$res_y,$rp
2497
2498 ldx [%fp+STACK_BIAS-16],$t1 ! !in1infty
2499 ldx [%fp+STACK_BIAS-8],$t2 ! !in2infty
2500___
2501for($i=0;$i<96;$i+=16) { # conditional moves
2502$code.=<<___;
2503 ldx [%sp+LOCALS64+$res_x+$i],$acc0 ! res
2504 ldx [%sp+LOCALS64+$res_x+$i+8],$acc1
2505 ldx [%sp+LOCALS64+$in2_x+$i],$acc2 ! in2
2506 ldx [%sp+LOCALS64+$in2_x+$i+8],$acc3
2507 ldx [%sp+LOCALS64+$in1_x+$i],$acc4 ! in1
2508 ldx [%sp+LOCALS64+$in1_x+$i+8],$acc5
2509 movrz $t1,$acc2,$acc0
2510 movrz $t1,$acc3,$acc1
2511 movrz $t2,$acc4,$acc0
2512 movrz $t2,$acc5,$acc1
2513 srlx $acc0,32,$acc2
2514 srlx $acc1,32,$acc3
2515 st $acc0,[$rp_real+$i]
2516 st $acc2,[$rp_real+$i+4]
2517 st $acc1,[$rp_real+$i+8]
2518 st $acc3,[$rp_real+$i+12]
2519___
2520}
2521$code.=<<___;
2522.Ladd_done_vis3:
2523 ret
2524 restore
2525.type ecp_nistz256_point_add_vis3,#function
2526.size ecp_nistz256_point_add_vis3,.-ecp_nistz256_point_add_vis3
2527___
2528}
2529########################################################################
2530# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
2531# const P256_POINT_AFFINE *in2);
2532{
2533my ($res_x,$res_y,$res_z,
2534 $in1_x,$in1_y,$in1_z,
2535 $in2_x,$in2_y,
2536 $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..14));
2537my $Z1sqr = $S2;
2538# above map() describes stack layout with 15 temporary
2539# 256-bit vectors on top. Then we reserve some space for
2540# !in1infty and !in2infty.
2541
2542$code.=<<___;
2543.align 32
2544ecp_nistz256_point_add_affine_vis3:
2545 save %sp,-STACK64_FRAME-32*15-32,%sp
2546
2547 mov $rp,$rp_real
2548 mov -1,$minus1
2549 mov -2,$poly3
2550 sllx $minus1,32,$poly1 ! 0xFFFFFFFF00000000
2551 srl $poly3,0,$poly3 ! 0x00000000FFFFFFFE
2552
2553 ! convert input to uint64_t[4]
2554 ld [$bp],$a0 ! in2_x
2555 ld [$bp+4],$t0
2556 ld [$bp+8],$a1
2557 ld [$bp+12],$t1
2558 ld [$bp+16],$a2
2559 ld [$bp+20],$t2
2560 ld [$bp+24],$a3
2561 ld [$bp+28],$t3
2562 sllx $t0,32,$t0
2563 sllx $t1,32,$t1
2564 ld [$bp+32],$acc0 ! in2_y
2565 or $a0,$t0,$a0
2566 ld [$bp+32+4],$t0
2567 sllx $t2,32,$t2
2568 ld [$bp+32+8],$acc1
2569 or $a1,$t1,$a1
2570 ld [$bp+32+12],$t1
2571 sllx $t3,32,$t3
2572 ld [$bp+32+16],$acc2
2573 or $a2,$t2,$a2
2574 ld [$bp+32+20],$t2
2575 or $a3,$t3,$a3
2576 ld [$bp+32+24],$acc3
2577 sllx $t0,32,$t0
2578 ld [$bp+32+28],$t3
2579 sllx $t1,32,$t1
2580 stx $a0,[%sp+LOCALS64+$in2_x]
2581 sllx $t2,32,$t2
2582 stx $a1,[%sp+LOCALS64+$in2_x+8]
2583 sllx $t3,32,$t3
2584 stx $a2,[%sp+LOCALS64+$in2_x+16]
2585 or $acc0,$t0,$acc0
2586 stx $a3,[%sp+LOCALS64+$in2_x+24]
2587 or $acc1,$t1,$acc1
2588 stx $acc0,[%sp+LOCALS64+$in2_y]
2589 or $acc2,$t2,$acc2
2590 stx $acc1,[%sp+LOCALS64+$in2_y+8]
2591 or $acc3,$t3,$acc3
2592 stx $acc2,[%sp+LOCALS64+$in2_y+16]
2593 stx $acc3,[%sp+LOCALS64+$in2_y+24]
2594
2595 or $a1,$a0,$a0
2596 or $a3,$a2,$a2
2597 or $acc1,$acc0,$acc0
2598 or $acc3,$acc2,$acc2
2599 or $a2,$a0,$a0
2600 or $acc2,$acc0,$acc0
2601 or $acc0,$a0,$a0
2602 movrnz $a0,-1,$a0 ! !in2infty
2603 stx $a0,[%fp+STACK_BIAS-8]
2604
2605 ld [$ap],$a0 ! in1_x
2606 ld [$ap+4],$t0
2607 ld [$ap+8],$a1
2608 ld [$ap+12],$t1
2609 ld [$ap+16],$a2
2610 ld [$ap+20],$t2
2611 ld [$ap+24],$a3
2612 ld [$ap+28],$t3
2613 sllx $t0,32,$t0
2614 sllx $t1,32,$t1
2615 ld [$ap+32],$acc0 ! in1_y
2616 or $a0,$t0,$a0
2617 ld [$ap+32+4],$t0
2618 sllx $t2,32,$t2
2619 ld [$ap+32+8],$acc1
2620 or $a1,$t1,$a1
2621 ld [$ap+32+12],$t1
2622 sllx $t3,32,$t3
2623 ld [$ap+32+16],$acc2
2624 or $a2,$t2,$a2
2625 ld [$ap+32+20],$t2
2626 or $a3,$t3,$a3
2627 ld [$ap+32+24],$acc3
2628 sllx $t0,32,$t0
2629 ld [$ap+32+28],$t3
2630 sllx $t1,32,$t1
2631 stx $a0,[%sp+LOCALS64+$in1_x]
2632 sllx $t2,32,$t2
2633 stx $a1,[%sp+LOCALS64+$in1_x+8]
2634 sllx $t3,32,$t3
2635 stx $a2,[%sp+LOCALS64+$in1_x+16]
2636 or $acc0,$t0,$acc0
2637 stx $a3,[%sp+LOCALS64+$in1_x+24]
2638 or $acc1,$t1,$acc1
2639 stx $acc0,[%sp+LOCALS64+$in1_y]
2640 or $acc2,$t2,$acc2
2641 stx $acc1,[%sp+LOCALS64+$in1_y+8]
2642 or $acc3,$t3,$acc3
2643 stx $acc2,[%sp+LOCALS64+$in1_y+16]
2644 stx $acc3,[%sp+LOCALS64+$in1_y+24]
2645
2646 ld [$ap+64],$a0 ! in1_z
2647 ld [$ap+64+4],$t0
2648 ld [$ap+64+8],$a1
2649 ld [$ap+64+12],$t1
2650 ld [$ap+64+16],$a2
2651 ld [$ap+64+20],$t2
2652 ld [$ap+64+24],$a3
2653 ld [$ap+64+28],$t3
2654 sllx $t0,32,$t0
2655 sllx $t1,32,$t1
2656 or $a0,$t0,$a0
2657 sllx $t2,32,$t2
2658 or $a1,$t1,$a1
2659 sllx $t3,32,$t3
2660 stx $a0,[%sp+LOCALS64+$in1_z]
2661 or $a2,$t2,$a2
2662 stx $a1,[%sp+LOCALS64+$in1_z+8]
2663 or $a3,$t3,$a3
2664 stx $a2,[%sp+LOCALS64+$in1_z+16]
2665 stx $a3,[%sp+LOCALS64+$in1_z+24]
2666
2667 or $a1,$a0,$t0
2668 or $a3,$a2,$t2
2669 or $t2,$t0,$t0
2670 movrnz $t0,-1,$t0 ! !in1infty
2671 stx $t0,[%fp+STACK_BIAS-16]
2672
2673 call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Z1sqr, in1_z);
2674 add %sp,LOCALS64+$Z1sqr,$rp
2675
2676 ldx [%sp+LOCALS64+$in2_x],$bi
2677 mov $acc0,$a0
2678 mov $acc1,$a1
2679 mov $acc2,$a2
2680 mov $acc3,$a3
2681 add %sp,LOCALS64+$in2_x,$bp
2682 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(U2, Z1sqr, in2_x);
2683 add %sp,LOCALS64+$U2,$rp
2684
2685 ldx [%sp+LOCALS64+$Z1sqr],$bi ! forward load
2686 ldx [%sp+LOCALS64+$in1_z],$a0
2687 ldx [%sp+LOCALS64+$in1_z+8],$a1
2688 ldx [%sp+LOCALS64+$in1_z+16],$a2
2689 ldx [%sp+LOCALS64+$in1_z+24],$a3
2690
2691 add %sp,LOCALS64+$in1_x,$bp
2692 call __ecp_nistz256_sub_from_vis3 ! p256_sub(H, U2, in1_x);
2693 add %sp,LOCALS64+$H,$rp
2694
2695 add %sp,LOCALS64+$Z1sqr,$bp
2696 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S2, Z1sqr, in1_z);
2697 add %sp,LOCALS64+$S2,$rp
2698
2699 ldx [%sp+LOCALS64+$H],$bi
2700 ldx [%sp+LOCALS64+$in1_z],$a0
2701 ldx [%sp+LOCALS64+$in1_z+8],$a1
2702 ldx [%sp+LOCALS64+$in1_z+16],$a2
2703 ldx [%sp+LOCALS64+$in1_z+24],$a3
2704 add %sp,LOCALS64+$H,$bp
2705 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(res_z, H, in1_z);
2706 add %sp,LOCALS64+$res_z,$rp
2707
2708 ldx [%sp+LOCALS64+$S2],$bi
2709 ldx [%sp+LOCALS64+$in2_y],$a0
2710 ldx [%sp+LOCALS64+$in2_y+8],$a1
2711 ldx [%sp+LOCALS64+$in2_y+16],$a2
2712 ldx [%sp+LOCALS64+$in2_y+24],$a3
2713 add %sp,LOCALS64+$S2,$bp
2714 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S2, S2, in2_y);
2715 add %sp,LOCALS64+$S2,$rp
2716
2717 ldx [%sp+LOCALS64+$H],$a0 ! forward load
2718 ldx [%sp+LOCALS64+$H+8],$a1
2719 ldx [%sp+LOCALS64+$H+16],$a2
2720 ldx [%sp+LOCALS64+$H+24],$a3
2721
2722 add %sp,LOCALS64+$in1_y,$bp
2723 call __ecp_nistz256_sub_from_vis3 ! p256_sub(R, S2, in1_y);
2724 add %sp,LOCALS64+$R,$rp
2725
2726 call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Hsqr, H);
2727 add %sp,LOCALS64+$Hsqr,$rp
2728
2729 ldx [%sp+LOCALS64+$R],$a0
2730 ldx [%sp+LOCALS64+$R+8],$a1
2731 ldx [%sp+LOCALS64+$R+16],$a2
2732 ldx [%sp+LOCALS64+$R+24],$a3
2733 call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Rsqr, R);
2734 add %sp,LOCALS64+$Rsqr,$rp
2735
2736 ldx [%sp+LOCALS64+$H],$bi
2737 ldx [%sp+LOCALS64+$Hsqr],$a0
2738 ldx [%sp+LOCALS64+$Hsqr+8],$a1
2739 ldx [%sp+LOCALS64+$Hsqr+16],$a2
2740 ldx [%sp+LOCALS64+$Hsqr+24],$a3
2741 add %sp,LOCALS64+$H,$bp
2742 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(Hcub, Hsqr, H);
2743 add %sp,LOCALS64+$Hcub,$rp
2744
2745 ldx [%sp+LOCALS64+$Hsqr],$bi
2746 ldx [%sp+LOCALS64+$in1_x],$a0
2747 ldx [%sp+LOCALS64+$in1_x+8],$a1
2748 ldx [%sp+LOCALS64+$in1_x+16],$a2
2749 ldx [%sp+LOCALS64+$in1_x+24],$a3
2750 add %sp,LOCALS64+$Hsqr,$bp
2751 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(U2, in1_x, Hsqr);
2752 add %sp,LOCALS64+$U2,$rp
2753
2754 call __ecp_nistz256_mul_by_2_vis3 ! p256_mul_by_2(Hsqr, U2);
2755 add %sp,LOCALS64+$Hsqr,$rp
2756
2757 add %sp,LOCALS64+$Rsqr,$bp
2758 call __ecp_nistz256_sub_morf_vis3 ! p256_sub(res_x, Rsqr, Hsqr);
2759 add %sp,LOCALS64+$res_x,$rp
2760
2761 add %sp,LOCALS64+$Hcub,$bp
2762 call __ecp_nistz256_sub_from_vis3 ! p256_sub(res_x, res_x, Hcub);
2763 add %sp,LOCALS64+$res_x,$rp
2764
2765 ldx [%sp+LOCALS64+$Hcub],$bi ! forward load
2766 ldx [%sp+LOCALS64+$in1_y],$a0
2767 ldx [%sp+LOCALS64+$in1_y+8],$a1
2768 ldx [%sp+LOCALS64+$in1_y+16],$a2
2769 ldx [%sp+LOCALS64+$in1_y+24],$a3
2770
2771 add %sp,LOCALS64+$U2,$bp
2772 call __ecp_nistz256_sub_morf_vis3 ! p256_sub(res_y, U2, res_x);
2773 add %sp,LOCALS64+$res_y,$rp
2774
2775 add %sp,LOCALS64+$Hcub,$bp
2776 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S2, in1_y, Hcub);
2777 add %sp,LOCALS64+$S2,$rp
2778
2779 ldx [%sp+LOCALS64+$R],$bi
2780 ldx [%sp+LOCALS64+$res_y],$a0
2781 ldx [%sp+LOCALS64+$res_y+8],$a1
2782 ldx [%sp+LOCALS64+$res_y+16],$a2
2783 ldx [%sp+LOCALS64+$res_y+24],$a3
2784 add %sp,LOCALS64+$R,$bp
2785 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(res_y, res_y, R);
2786 add %sp,LOCALS64+$res_y,$rp
2787
2788 add %sp,LOCALS64+$S2,$bp
2789 call __ecp_nistz256_sub_from_vis3 ! p256_sub(res_y, res_y, S2);
2790 add %sp,LOCALS64+$res_y,$rp
2791
2792 ldx [%fp+STACK_BIAS-16],$t1 ! !in1infty
2793 ldx [%fp+STACK_BIAS-8],$t2 ! !in2infty
27941: call .+8
2795 add %o7,.Lone_mont_vis3-1b,$bp
2796___
2797for($i=0;$i<64;$i+=16) { # conditional moves
2798$code.=<<___;
2799 ldx [%sp+LOCALS64+$res_x+$i],$acc0 ! res
2800 ldx [%sp+LOCALS64+$res_x+$i+8],$acc1
2801 ldx [%sp+LOCALS64+$in2_x+$i],$acc2 ! in2
2802 ldx [%sp+LOCALS64+$in2_x+$i+8],$acc3
2803 ldx [%sp+LOCALS64+$in1_x+$i],$acc4 ! in1
2804 ldx [%sp+LOCALS64+$in1_x+$i+8],$acc5
2805 movrz $t1,$acc2,$acc0
2806 movrz $t1,$acc3,$acc1
2807 movrz $t2,$acc4,$acc0
2808 movrz $t2,$acc5,$acc1
2809 srlx $acc0,32,$acc2
2810 srlx $acc1,32,$acc3
2811 st $acc0,[$rp_real+$i]
2812 st $acc2,[$rp_real+$i+4]
2813 st $acc1,[$rp_real+$i+8]
2814 st $acc3,[$rp_real+$i+12]
2815___
2816}
2817for(;$i<96;$i+=16) {
2818$code.=<<___;
2819 ldx [%sp+LOCALS64+$res_x+$i],$acc0 ! res
2820 ldx [%sp+LOCALS64+$res_x+$i+8],$acc1
2821 ldx [$bp+$i-64],$acc2 ! "in2"
2822 ldx [$bp+$i-64+8],$acc3
2823 ldx [%sp+LOCALS64+$in1_x+$i],$acc4 ! in1
2824 ldx [%sp+LOCALS64+$in1_x+$i+8],$acc5
2825 movrz $t1,$acc2,$acc0
2826 movrz $t1,$acc3,$acc1
2827 movrz $t2,$acc4,$acc0
2828 movrz $t2,$acc5,$acc1
2829 srlx $acc0,32,$acc2
2830 srlx $acc1,32,$acc3
2831 st $acc0,[$rp_real+$i]
2832 st $acc2,[$rp_real+$i+4]
2833 st $acc1,[$rp_real+$i+8]
2834 st $acc3,[$rp_real+$i+12]
2835___
2836}
2837$code.=<<___;
2838 ret
2839 restore
2840.type ecp_nistz256_point_add_affine_vis3,#function
2841.size ecp_nistz256_point_add_affine_vis3,.-ecp_nistz256_point_add_affine_vis3
2842.align 64
2843.Lone_mont_vis3:
2844.long 0x00000000,0x00000001, 0xffffffff,0x00000000
2845.long 0xffffffff,0xffffffff, 0x00000000,0xfffffffe
2846.align 64
2847#endif
2848___
2849} }}}
2850
2851# Purpose of these subroutines is to explicitly encode VIS instructions,
2852# so that one can compile the module without having to specify VIS
2853# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
2854# The idea is to keep the option of producing a "universal" binary and let
2855# the programmer detect at run-time whether the current CPU is VIS-capable.
2856sub unvis3 {
2857my ($mnemonic,$rs1,$rs2,$rd)=@_;
2858my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
2859my ($ref,$opf);
2860my %visopf = ( "addxc" => 0x011,
2861 "addxccc" => 0x013,
2862 "umulxhi" => 0x016 );
2863
2864 $ref = "$mnemonic\t$rs1,$rs2,$rd";
2865
2866 if ($opf=$visopf{$mnemonic}) {
2867 foreach ($rs1,$rs2,$rd) {
2868 return $ref if (!/%([goli])([0-9])/);
2869 $_=$bias{$1}+$2;
2870 }
2871
2872 return sprintf ".word\t0x%08x !%s",
2873 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
2874 $ref;
2875 } else {
2876 return $ref;
2877 }
2878}
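For reference, a minimal standalone sketch of the same encoding (it reuses the bias map, opcode table and .word formula from unvis3() above; the helper name, sample instruction and register choice are illustrative only):

    #!/usr/bin/env perl
    # Encode one VIS3 instruction as a raw .word, so that no VIS-aware
    # assembler is needed; the same trick unvis3() applies to the generated code.
    my %bias   = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
    my %visopf = ( "addxc" => 0x011, "addxccc" => 0x013, "umulxhi" => 0x016 );

    sub encode_vis3 {
        my ($mnemonic, @regs) = @_;     # e.g. ("umulxhi","%o1","%o2","%o3")
        my @n = map { /%([goli])([0-7])/ ? $bias{$1}+$2 : die "bad register" } @regs;
        my ($rs1, $rs2, $rd) = @n;
        return 0x81b00000 | $rd<<25 | $rs1<<14 | $visopf{$mnemonic}<<5 | $rs2;
    }

    printf ".word\t0x%08x\t! umulxhi\t%%o1,%%o2,%%o3\n",
        encode_vis3("umulxhi", "%o1", "%o2", "%o3");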
2879
2880foreach (split("\n",$code)) {
2881 s/\`([^\`]*)\`/eval $1/ge;
2882
2883 s/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
2884 &unvis3($1,$2,$3,$4)
2885 /ge;
2886
2887 print $_,"\n";
2888}
2889
2890close STDOUT;
diff --git a/src/lib/libcrypto/ec/asm/ecp_nistz256-x86.pl b/src/lib/libcrypto/ec/asm/ecp_nistz256-x86.pl
deleted file mode 100644
index 085d637e5d..0000000000
--- a/src/lib/libcrypto/ec/asm/ecp_nistz256-x86.pl
+++ /dev/null
@@ -1,1740 +0,0 @@
1#! /usr/bin/env perl
2# $OpenBSD: ecp_nistz256-x86.pl,v 1.1 2016/11/04 17:33:20 miod Exp $
3#
4# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
5#
6# Licensed under the OpenSSL license (the "License"). You may not use
7# this file except in compliance with the License. You can obtain a copy
8# in the file LICENSE in the source distribution or at
9# https://www.openssl.org/source/license.html
10
11
12# ====================================================================
13# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
14# project. The module is, however, dual licensed under OpenSSL and
15# CRYPTOGAMS licenses depending on where you obtain it. For further
16# details see http://www.openssl.org/~appro/cryptogams/.
17# ====================================================================
18#
19# ECP_NISTZ256 module for x86/SSE2.
20#
21# October 2014.
22#
23# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
24# http://eprint.iacr.org/2013/816. In the process of adaptation
25# original .c module was made 32-bit savvy in order to make this
26# implementation possible.
27#
28# with/without -DECP_NISTZ256_ASM
29# Pentium +66-163%
30# PIII +72-172%
31# P4 +65-132%
32# Core2 +90-215%
33# Sandy Bridge +105-265% (contemporary i[57]-* are all close to this)
34# Atom +65-155%
35# Opteron +54-110%
36# Bulldozer +99-240%
37# VIA Nano +93-290%
38#
39# Ranges denote minimum and maximum improvement coefficients depending
40# on benchmark. Lower coefficients are for ECDSA sign, server-side
41# operation. Keep in mind that +200% means 3x improvement.
42
43$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
44push(@INC,"${dir}","${dir}../../perlasm");
45require "x86asm.pl";
46
47# Uncomment when all i386 assembly generators are updated to take the output
48# file as last argument...
49# $output=pop;
50# open STDOUT,">$output";
51
52&asm_init($ARGV[0],"ecp_nistz256-x86.pl",$ARGV[$#ARGV] eq "386");
53
54$sse2=0;
55for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
56
57&external_label("OPENSSL_ia32cap_P") if ($sse2);
58
59
60########################################################################
61# Keep in mind that constants are stored least to most significant word
62&static_label("ONE");
63&set_label("ONE",64);
64&data_word(1,0,0,0,0,0,0,0);
65&align(64);
66
67########################################################################
68# void ecp_nistz256_mul_by_2(BN_ULONG edi[8],const BN_ULONG esi[8]);
69&function_begin("ecp_nistz256_mul_by_2");
70 &mov ("esi",&wparam(1));
71 &mov ("edi",&wparam(0));
72 &mov ("ebp","esi");
73########################################################################
74# common pattern for internal functions is that %edi is result pointer,
75# %esi and %ebp are input ones, %ebp being optional. %edi is preserved.
76 &call ("_ecp_nistz256_add");
77&function_end("ecp_nistz256_mul_by_2");
78
79########################################################################
80# void ecp_nistz256_div_by_2(BN_ULONG edi[8],const BN_ULONG esi[8]);
81&function_begin("ecp_nistz256_div_by_2");
82 &mov ("esi",&wparam(1));
83 &mov ("edi",&wparam(0));
84 &call ("_ecp_nistz256_div_by_2");
85&function_end("ecp_nistz256_div_by_2");
86
87&function_begin_B("_ecp_nistz256_div_by_2");
88 # tmp = a is odd ? a+mod : a
89 #
90 # note that because mod has special form, i.e. consists of
91 # 0xffffffff, 1 and 0s, we can conditionally synthesize it by
92 # assigning least significant bit of input to one register,
93 # %ebp, and its negative to another, %edx.
94
95 &mov ("ebp",&DWP(0,"esi"));
96 &xor ("edx","edx");
97 &mov ("ebx",&DWP(4,"esi"));
98 &mov ("eax","ebp");
99 &and ("ebp",1);
100 &mov ("ecx",&DWP(8,"esi"));
101 &sub ("edx","ebp");
102
103 &add ("eax","edx");
104 &adc ("ebx","edx");
105 &mov (&DWP(0,"edi"),"eax");
106 &adc ("ecx","edx");
107 &mov (&DWP(4,"edi"),"ebx");
108 &mov (&DWP(8,"edi"),"ecx");
109
110 &mov ("eax",&DWP(12,"esi"));
111 &mov ("ebx",&DWP(16,"esi"));
112 &adc ("eax",0);
113 &mov ("ecx",&DWP(20,"esi"));
114 &adc ("ebx",0);
115 &mov (&DWP(12,"edi"),"eax");
116 &adc ("ecx",0);
117 &mov (&DWP(16,"edi"),"ebx");
118 &mov (&DWP(20,"edi"),"ecx");
119
120 &mov ("eax",&DWP(24,"esi"));
121 &mov ("ebx",&DWP(28,"esi"));
122 &adc ("eax","ebp");
123 &adc ("ebx","edx");
124 &mov (&DWP(24,"edi"),"eax");
125 &sbb ("esi","esi"); # broadcast carry bit
126 &mov (&DWP(28,"edi"),"ebx");
127
128 # ret = tmp >> 1
129
130 &mov ("eax",&DWP(0,"edi"));
131 &mov ("ebx",&DWP(4,"edi"));
132 &mov ("ecx",&DWP(8,"edi"));
133 &mov ("edx",&DWP(12,"edi"));
134
135 &shr ("eax",1);
136 &mov ("ebp","ebx");
137 &shl ("ebx",31);
138 &or ("eax","ebx");
139
140 &shr ("ebp",1);
141 &mov ("ebx","ecx");
142 &shl ("ecx",31);
143 &mov (&DWP(0,"edi"),"eax");
144 &or ("ebp","ecx");
145 &mov ("eax",&DWP(16,"edi"));
146
147 &shr ("ebx",1);
148 &mov ("ecx","edx");
149 &shl ("edx",31);
150 &mov (&DWP(4,"edi"),"ebp");
151 &or ("ebx","edx");
152 &mov ("ebp",&DWP(20,"edi"));
153
154 &shr ("ecx",1);
155 &mov ("edx","eax");
156 &shl ("eax",31);
157 &mov (&DWP(8,"edi"),"ebx");
158 &or ("ecx","eax");
159 &mov ("ebx",&DWP(24,"edi"));
160
161 &shr ("edx",1);
162 &mov ("eax","ebp");
163 &shl ("ebp",31);
164 &mov (&DWP(12,"edi"),"ecx");
165 &or ("edx","ebp");
166 &mov ("ecx",&DWP(28,"edi"));
167
168 &shr ("eax",1);
169 &mov ("ebp","ebx");
170 &shl ("ebx",31);
171 &mov (&DWP(16,"edi"),"edx");
172 &or ("eax","ebx");
173
174 &shr ("ebp",1);
175 &mov ("ebx","ecx");
176 &shl ("ecx",31);
177 &mov (&DWP(20,"edi"),"eax");
178 &or ("ebp","ecx");
179
180 &shr ("ebx",1);
181 &shl ("esi",31);
182 &mov (&DWP(24,"edi"),"ebp");
183 &or ("ebx","esi"); # handle top-most carry bit
184 &mov (&DWP(28,"edi"),"ebx");
185
186 &ret ();
187&function_end_B("_ecp_nistz256_div_by_2");
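A minimal arbitrary-precision sketch of the halving strategy the comment above describes; Math::BigInt stands in for the eight 32-bit limbs and the explicit carry handling, and the helper name and test value are illustrative only:

    #!/usr/bin/env perl
    # If the input is odd, add the modulus first so the right shift is exact;
    # the result stays below p because (a+p)/2 < p whenever a < p.
    use strict;
    use warnings;
    use Math::BigInt;

    sub pow2 { Math::BigInt->new(2) ** $_[0] }
    my $p = pow2(256) - pow2(224) + pow2(192) + pow2(96) - 1;   # NIST P-256 prime

    sub div_by_2 {
        my $a   = shift;                                # 0 <= a < p
        my $tmp = $a->is_odd() ? $a + $p : $a->copy();  # conditionally add modulus
        return $tmp >> 1;
    }

    my $a = Math::BigInt->new("987654321987654321987654321");  # odd, < p
    my $h = div_by_2($a);
    print "ok\n" if ($h * 2) % $p == $a;                # doubling recovers the input mod p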
188
189########################################################################
190# void ecp_nistz256_add(BN_ULONG edi[8],const BN_ULONG esi[8],
191# const BN_ULONG ebp[8]);
192&function_begin("ecp_nistz256_add");
193 &mov ("esi",&wparam(1));
194 &mov ("ebp",&wparam(2));
195 &mov ("edi",&wparam(0));
196 &call ("_ecp_nistz256_add");
197&function_end("ecp_nistz256_add");
198
199&function_begin_B("_ecp_nistz256_add");
200 &mov ("eax",&DWP(0,"esi"));
201 &mov ("ebx",&DWP(4,"esi"));
202 &mov ("ecx",&DWP(8,"esi"));
203 &add ("eax",&DWP(0,"ebp"));
204 &mov ("edx",&DWP(12,"esi"));
205 &adc ("ebx",&DWP(4,"ebp"));
206 &mov (&DWP(0,"edi"),"eax");
207 &adc ("ecx",&DWP(8,"ebp"));
208 &mov (&DWP(4,"edi"),"ebx");
209 &adc ("edx",&DWP(12,"ebp"));
210 &mov (&DWP(8,"edi"),"ecx");
211 &mov (&DWP(12,"edi"),"edx");
212
213 &mov ("eax",&DWP(16,"esi"));
214 &mov ("ebx",&DWP(20,"esi"));
215 &mov ("ecx",&DWP(24,"esi"));
216 &adc ("eax",&DWP(16,"ebp"));
217 &mov ("edx",&DWP(28,"esi"));
218 &adc ("ebx",&DWP(20,"ebp"));
219 &mov (&DWP(16,"edi"),"eax");
220 &adc ("ecx",&DWP(24,"ebp"));
221 &mov (&DWP(20,"edi"),"ebx");
222 &mov ("esi",0);
223 &adc ("edx",&DWP(28,"ebp"));
224 &mov (&DWP(24,"edi"),"ecx");
225 &adc ("esi",0);
226 &mov (&DWP(28,"edi"),"edx");
227
228 # if a+b >= modulus, subtract modulus.
229 #
230 # But since comparison implies subtraction, we subtract modulus
231 # to see if it borrows, and then subtract it for real if
232 # subtraction didn't borrow.
233
234 &mov ("eax",&DWP(0,"edi"));
235 &mov ("ebx",&DWP(4,"edi"));
236 &mov ("ecx",&DWP(8,"edi"));
237 &sub ("eax",-1);
238 &mov ("edx",&DWP(12,"edi"));
239 &sbb ("ebx",-1);
240 &mov ("eax",&DWP(16,"edi"));
241 &sbb ("ecx",-1);
242 &mov ("ebx",&DWP(20,"edi"));
243 &sbb ("edx",0);
244 &mov ("ecx",&DWP(24,"edi"));
245 &sbb ("eax",0);
246 &mov ("edx",&DWP(28,"edi"));
247 &sbb ("ebx",0);
248 &sbb ("ecx",1);
249 &sbb ("edx",-1);
250 &sbb ("esi",0);
251
252 # Note that because mod has special form, i.e. consists of
253 # 0xffffffff, 1 and 0s, we can conditionally synthesize it by
254	# using borrow.
255
256 &not ("esi");
257 &mov ("eax",&DWP(0,"edi"));
258 &mov ("ebp","esi");
259 &mov ("ebx",&DWP(4,"edi"));
260 &shr ("ebp",31);
261 &mov ("ecx",&DWP(8,"edi"));
262 &sub ("eax","esi");
263 &mov ("edx",&DWP(12,"edi"));
264 &sbb ("ebx","esi");
265 &mov (&DWP(0,"edi"),"eax");
266 &sbb ("ecx","esi");
267 &mov (&DWP(4,"edi"),"ebx");
268 &sbb ("edx",0);
269 &mov (&DWP(8,"edi"),"ecx");
270 &mov (&DWP(12,"edi"),"edx");
271
272 &mov ("eax",&DWP(16,"edi"));
273 &mov ("ebx",&DWP(20,"edi"));
274 &mov ("ecx",&DWP(24,"edi"));
275 &sbb ("eax",0);
276 &mov ("edx",&DWP(28,"edi"));
277 &sbb ("ebx",0);
278 &mov (&DWP(16,"edi"),"eax");
279 &sbb ("ecx","ebp");
280 &mov (&DWP(20,"edi"),"ebx");
281 &sbb ("edx","esi");
282 &mov (&DWP(24,"edi"),"ecx");
283 &mov (&DWP(28,"edi"),"edx");
284
285 &ret ();
286&function_end_B("_ecp_nistz256_add");
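The conditional reduction that the two comments above describe (and that _ecp_nistz256_sub mirrors with an add-back on borrow) in a minimal arbitrary-precision sketch; Math::BigInt replaces the limb-by-limb borrow tracking and mask synthesis, and the helper name and test values are illustrative only:

    #!/usr/bin/env perl
    # Compute a+b, trial-subtract the modulus, and keep the trial result
    # unless the subtraction "borrowed" (went negative).
    use strict;
    use warnings;
    use Math::BigInt;

    sub pow2 { Math::BigInt->new(2) ** $_[0] }
    my $p = pow2(256) - pow2(224) + pow2(192) + pow2(96) - 1;

    sub mod_add {
        my ($a, $b) = @_;               # both already < p
        my $sum   = $a + $b;            # at most 2p-2
        my $trial = $sum - $p;
        return $trial->is_neg() ? $sum : $trial;
    }

    my $a = Math::BigInt->new("123456789123456789");
    my $b = $p - 1;
    print "ok\n" if mod_add($a, $b) == ($a + $b) % $p;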
287
288########################################################################
289# void ecp_nistz256_sub(BN_ULONG edi[8],const BN_ULONG esi[8],
290# const BN_ULONG ebp[8]);
291&function_begin("ecp_nistz256_sub");
292 &mov ("esi",&wparam(1));
293 &mov ("ebp",&wparam(2));
294 &mov ("edi",&wparam(0));
295 &call ("_ecp_nistz256_sub");
296&function_end("ecp_nistz256_sub");
297
298&function_begin_B("_ecp_nistz256_sub");
299 &mov ("eax",&DWP(0,"esi"));
300 &mov ("ebx",&DWP(4,"esi"));
301 &mov ("ecx",&DWP(8,"esi"));
302 &sub ("eax",&DWP(0,"ebp"));
303 &mov ("edx",&DWP(12,"esi"));
304 &sbb ("ebx",&DWP(4,"ebp"));
305 &mov (&DWP(0,"edi"),"eax");
306 &sbb ("ecx",&DWP(8,"ebp"));
307 &mov (&DWP(4,"edi"),"ebx");
308 &sbb ("edx",&DWP(12,"ebp"));
309 &mov (&DWP(8,"edi"),"ecx");
310 &mov (&DWP(12,"edi"),"edx");
311
312 &mov ("eax",&DWP(16,"esi"));
313 &mov ("ebx",&DWP(20,"esi"));
314 &mov ("ecx",&DWP(24,"esi"));
315 &sbb ("eax",&DWP(16,"ebp"));
316 &mov ("edx",&DWP(28,"esi"));
317 &sbb ("ebx",&DWP(20,"ebp"));
318 &sbb ("ecx",&DWP(24,"ebp"));
319 &mov (&DWP(16,"edi"),"eax");
320 &sbb ("edx",&DWP(28,"ebp"));
321 &mov (&DWP(20,"edi"),"ebx");
322 &sbb ("esi","esi"); # broadcast borrow bit
323 &mov (&DWP(24,"edi"),"ecx");
324 &mov (&DWP(28,"edi"),"edx");
325
326 # if a-b borrows, add modulus.
327 #
328 # Note that because mod has special form, i.e. consists of
329 # 0xffffffff, 1 and 0s, we can conditionally synthesize it by
330 # assigning borrow bit to one register, %ebp, and its negative
331 # to another, %esi. But we started by calculating %esi...
332
333 &mov ("eax",&DWP(0,"edi"));
334 &mov ("ebp","esi");
335 &mov ("ebx",&DWP(4,"edi"));
336 &shr ("ebp",31);
337 &mov ("ecx",&DWP(8,"edi"));
338 &add ("eax","esi");
339 &mov ("edx",&DWP(12,"edi"));
340 &adc ("ebx","esi");
341 &mov (&DWP(0,"edi"),"eax");
342 &adc ("ecx","esi");
343 &mov (&DWP(4,"edi"),"ebx");
344 &adc ("edx",0);
345 &mov (&DWP(8,"edi"),"ecx");
346 &mov (&DWP(12,"edi"),"edx");
347
348 &mov ("eax",&DWP(16,"edi"));
349 &mov ("ebx",&DWP(20,"edi"));
350 &mov ("ecx",&DWP(24,"edi"));
351 &adc ("eax",0);
352 &mov ("edx",&DWP(28,"edi"));
353 &adc ("ebx",0);
354 &mov (&DWP(16,"edi"),"eax");
355 &adc ("ecx","ebp");
356 &mov (&DWP(20,"edi"),"ebx");
357 &adc ("edx","esi");
358 &mov (&DWP(24,"edi"),"ecx");
359 &mov (&DWP(28,"edi"),"edx");
360
361 &ret ();
362&function_end_B("_ecp_nistz256_sub");
363
364########################################################################
365# void ecp_nistz256_neg(BN_ULONG edi[8],const BN_ULONG esi[8]);
366&function_begin("ecp_nistz256_neg");
367 &mov ("ebp",&wparam(1));
368 &mov ("edi",&wparam(0));
369
370 &xor ("eax","eax");
371 &stack_push(8);
372 &mov (&DWP(0,"esp"),"eax");
373 &mov ("esi","esp");
374 &mov (&DWP(4,"esp"),"eax");
375 &mov (&DWP(8,"esp"),"eax");
376 &mov (&DWP(12,"esp"),"eax");
377 &mov (&DWP(16,"esp"),"eax");
378 &mov (&DWP(20,"esp"),"eax");
379 &mov (&DWP(24,"esp"),"eax");
380 &mov (&DWP(28,"esp"),"eax");
381
382 &call ("_ecp_nistz256_sub");
383
384 &stack_pop(8);
385&function_end("ecp_nistz256_neg");
386
387&function_begin_B("_picup_eax");
388 &mov ("eax",&DWP(0,"esp"));
389 &ret ();
390&function_end_B("_picup_eax");
391
392########################################################################
393# void ecp_nistz256_from_mont(BN_ULONG edi[8],const BN_ULONG esi[8]);
394&function_begin("ecp_nistz256_from_mont");
395 &mov ("esi",&wparam(1));
396 &call ("_picup_eax");
397 &set_label("pic");
398 &lea ("ebp",&DWP(&label("ONE")."-".&label("pic"),"eax"));
399 if ($sse2) {
400 &picmeup("eax","OPENSSL_ia32cap_P","eax",&label("pic"));
401 &mov ("eax",&DWP(0,"eax")); }
402 &mov ("edi",&wparam(0));
403 &call ("_ecp_nistz256_mul_mont");
404&function_end("ecp_nistz256_from_mont");
405
406########################################################################
407# void ecp_nistz256_mul_mont(BN_ULONG edi[8],const BN_ULONG esi[8],
408# const BN_ULONG ebp[8]);
409&function_begin("ecp_nistz256_mul_mont");
410 &mov ("esi",&wparam(1));
411 &mov ("ebp",&wparam(2));
412 if ($sse2) {
413 &call ("_picup_eax");
414 &set_label("pic");
415 &picmeup("eax","OPENSSL_ia32cap_P","eax",&label("pic"));
416 &mov ("eax",&DWP(0,"eax")); }
417 &mov ("edi",&wparam(0));
418 &call ("_ecp_nistz256_mul_mont");
419&function_end("ecp_nistz256_mul_mont");
420
421########################################################################
422# void ecp_nistz256_sqr_mont(BN_ULONG edi[8],const BN_ULONG esi[8]);
423&function_begin("ecp_nistz256_sqr_mont");
424 &mov ("esi",&wparam(1));
425 if ($sse2) {
426 &call ("_picup_eax");
427 &set_label("pic");
428 &picmeup("eax","OPENSSL_ia32cap_P","eax",&label("pic"));
429 &mov ("eax",&DWP(0,"eax")); }
430 &mov ("edi",&wparam(0));
431 &mov ("ebp","esi");
432 &call ("_ecp_nistz256_mul_mont");
433&function_end("ecp_nistz256_sqr_mont");
434
435&function_begin_B("_ecp_nistz256_mul_mont");
436 if ($sse2) {
437 # see if XMM+SSE2 is on
438 &and ("eax","\$(IA32CAP_MASK0_FXSR | IA32CAP_MASK0_SSE2)");
439 &cmp ("eax","\$(IA32CAP_MASK0_FXSR | IA32CAP_MASK0_SSE2)");
440 &jne (&label("mul_mont_ialu"));
441
442 ########################################
443 # SSE2 code path featuring 32x16-bit
444 # multiplications is ~2x faster than
445 # IALU counterpart (except on Atom)...
446 ########################################
447 # stack layout:
448 # +------------------------------------+< %esp
449 # | 7 16-byte temporary XMM words, |
450 # | "sliding" toward lower address |
451 # . .
452 # +------------------------------------+
453 # | unused XMM word |
454 # +------------------------------------+< +128,%ebx
455 # | 8 16-byte XMM words holding copies |
456 # | of a[i]<<64|a[i] |
457 # . .
458 # . .
459 # +------------------------------------+< +256
460 &mov ("edx","esp");
461 &sub ("esp",0x100);
462
463 &movd ("xmm7",&DWP(0,"ebp")); # b[0] -> 0000.00xy
464 &lea ("ebp",&DWP(4,"ebp"));
465 &pcmpeqd("xmm6","xmm6");
466 &psrlq ("xmm6",48); # compose 0xffff<<64|0xffff
467
468 &pshuflw("xmm7","xmm7",0b11011100); # 0000.00xy -> 0000.0x0y
469 &and ("esp",-64);
470 &pshufd ("xmm7","xmm7",0b11011100); # 0000.0x0y -> 000x.000y
471 &lea ("ebx",&DWP(0x80,"esp"));
472
473 &movd ("xmm0",&DWP(4*0,"esi")); # a[0] -> 0000.00xy
474 &pshufd ("xmm0","xmm0",0b11001100); # 0000.00xy -> 00xy.00xy
475 &movd ("xmm1",&DWP(4*1,"esi")); # a[1] -> ...
476 &movdqa (&QWP(0x00,"ebx"),"xmm0"); # offload converted a[0]
477 &pmuludq("xmm0","xmm7"); # a[0]*b[0]
478
479 &movd ("xmm2",&DWP(4*2,"esi"));
480 &pshufd ("xmm1","xmm1",0b11001100);
481 &movdqa (&QWP(0x10,"ebx"),"xmm1");
482 &pmuludq("xmm1","xmm7"); # a[1]*b[0]
483
484 &movq ("xmm4","xmm0"); # clear upper 64 bits
485 &pslldq("xmm4",6);
486 &paddq ("xmm4","xmm0");
487 &movdqa("xmm5","xmm4");
488 &psrldq("xmm4",10); # upper 32 bits of a[0]*b[0]
489 &pand ("xmm5","xmm6"); # lower 32 bits of a[0]*b[0]
490
491 # Upper half of a[0]*b[i] is carried into next multiplication
492 # iteration, while lower one "participates" in actual reduction.
493 # Normally latter is done by accumulating result of multiplication
494 # of modulus by "magic" digit, but thanks to special form of modulus
495 # and "magic" digit it can be performed only with additions and
496 # subtractions (see note in IALU section below). Note that we are
497 # not bothered with carry bits, they are accumulated in "flatten"
498 # phase after all multiplications and reductions.
499
500 &movd ("xmm3",&DWP(4*3,"esi"));
501 &pshufd ("xmm2","xmm2",0b11001100);
502 &movdqa (&QWP(0x20,"ebx"),"xmm2");
503 &pmuludq("xmm2","xmm7"); # a[2]*b[0]
504 &paddq ("xmm1","xmm4"); # a[1]*b[0]+hw(a[0]*b[0]), carry
505 &movdqa (&QWP(0x00,"esp"),"xmm1"); # t[0]
506
507 &movd ("xmm0",&DWP(4*4,"esi"));
508 &pshufd ("xmm3","xmm3",0b11001100);
509 &movdqa (&QWP(0x30,"ebx"),"xmm3");
510 &pmuludq("xmm3","xmm7"); # a[3]*b[0]
511 &movdqa (&QWP(0x10,"esp"),"xmm2");
512
513 &movd ("xmm1",&DWP(4*5,"esi"));
514 &pshufd ("xmm0","xmm0",0b11001100);
515 &movdqa (&QWP(0x40,"ebx"),"xmm0");
516 &pmuludq("xmm0","xmm7"); # a[4]*b[0]
517 &paddq ("xmm3","xmm5"); # a[3]*b[0]+lw(a[0]*b[0]), reduction step
518 &movdqa (&QWP(0x20,"esp"),"xmm3");
519
520 &movd ("xmm2",&DWP(4*6,"esi"));
521 &pshufd ("xmm1","xmm1",0b11001100);
522 &movdqa (&QWP(0x50,"ebx"),"xmm1");
523 &pmuludq("xmm1","xmm7"); # a[5]*b[0]
524 &movdqa (&QWP(0x30,"esp"),"xmm0");
525 &pshufd("xmm4","xmm5",0b10110001); # xmm4 = xmm5<<32, reduction step
526
527 &movd ("xmm3",&DWP(4*7,"esi"));
528 &pshufd ("xmm2","xmm2",0b11001100);
529 &movdqa (&QWP(0x60,"ebx"),"xmm2");
530 &pmuludq("xmm2","xmm7"); # a[6]*b[0]
531 &movdqa (&QWP(0x40,"esp"),"xmm1");
532 &psubq ("xmm4","xmm5"); # xmm4 = xmm5*0xffffffff, reduction step
533
534 &movd ("xmm0",&DWP(0,"ebp")); # b[1] -> 0000.00xy
535 &pshufd ("xmm3","xmm3",0b11001100);
536 &movdqa (&QWP(0x70,"ebx"),"xmm3");
537 &pmuludq("xmm3","xmm7"); # a[7]*b[0]
538
539 &pshuflw("xmm7","xmm0",0b11011100); # 0000.00xy -> 0000.0x0y
540 &movdqa ("xmm0",&QWP(0x00,"ebx")); # pre-load converted a[0]
541 &pshufd ("xmm7","xmm7",0b11011100); # 0000.0x0y -> 000x.000y
542
543 &mov ("ecx",6);
544 &lea ("ebp",&DWP(4,"ebp"));
545 &jmp (&label("madd_sse2"));
546
547&set_label("madd_sse2",16);
548 &paddq ("xmm2","xmm5"); # a[6]*b[i-1]+lw(a[0]*b[i-1]), reduction step [modulo-scheduled]
549 &paddq ("xmm3","xmm4"); # a[7]*b[i-1]+lw(a[0]*b[i-1])*0xffffffff, reduction step [modulo-scheduled]
550 &movdqa ("xmm1",&QWP(0x10,"ebx"));
551 &pmuludq("xmm0","xmm7"); # a[0]*b[i]
552 &movdqa(&QWP(0x50,"esp"),"xmm2");
553
554 &movdqa ("xmm2",&QWP(0x20,"ebx"));
555 &pmuludq("xmm1","xmm7"); # a[1]*b[i]
556 &movdqa(&QWP(0x60,"esp"),"xmm3");
557 &paddq ("xmm0",&QWP(0x00,"esp"));
558
559 &movdqa ("xmm3",&QWP(0x30,"ebx"));
560 &pmuludq("xmm2","xmm7"); # a[2]*b[i]
561 &movq ("xmm4","xmm0"); # clear upper 64 bits
562 &pslldq("xmm4",6);
563 &paddq ("xmm1",&QWP(0x10,"esp"));
564 &paddq ("xmm4","xmm0");
565 &movdqa("xmm5","xmm4");
566 &psrldq("xmm4",10); # upper 33 bits of a[0]*b[i]+t[0]
567
568 &movdqa ("xmm0",&QWP(0x40,"ebx"));
569 &pmuludq("xmm3","xmm7"); # a[3]*b[i]
570 &paddq ("xmm1","xmm4"); # a[1]*b[i]+hw(a[0]*b[i]), carry
571 &paddq ("xmm2",&QWP(0x20,"esp"));
572 &movdqa (&QWP(0x00,"esp"),"xmm1");
573
574 &movdqa ("xmm1",&QWP(0x50,"ebx"));
575 &pmuludq("xmm0","xmm7"); # a[4]*b[i]
576 &paddq ("xmm3",&QWP(0x30,"esp"));
577 &movdqa (&QWP(0x10,"esp"),"xmm2");
578 &pand ("xmm5","xmm6"); # lower 32 bits of a[0]*b[i]
579
580 &movdqa ("xmm2",&QWP(0x60,"ebx"));
581 &pmuludq("xmm1","xmm7"); # a[5]*b[i]
582 &paddq ("xmm3","xmm5"); # a[3]*b[i]+lw(a[0]*b[i]), reduction step
583 &paddq ("xmm0",&QWP(0x40,"esp"));
584 &movdqa (&QWP(0x20,"esp"),"xmm3");
585 &pshufd("xmm4","xmm5",0b10110001); # xmm4 = xmm5<<32, reduction step
586
587 &movdqa ("xmm3","xmm7");
588 &pmuludq("xmm2","xmm7"); # a[6]*b[i]
589 &movd ("xmm7",&DWP(0,"ebp")); # b[i++] -> 0000.00xy
590 &lea ("ebp",&DWP(4,"ebp"));
591 &paddq ("xmm1",&QWP(0x50,"esp"));
592 &psubq ("xmm4","xmm5"); # xmm4 = xmm5*0xffffffff, reduction step
593 &movdqa (&QWP(0x30,"esp"),"xmm0");
594 &pshuflw("xmm7","xmm7",0b11011100); # 0000.00xy -> 0000.0x0y
595
596 &pmuludq("xmm3",&QWP(0x70,"ebx")); # a[7]*b[i]
597 &pshufd("xmm7","xmm7",0b11011100); # 0000.0x0y -> 000x.000y
598 &movdqa("xmm0",&QWP(0x00,"ebx")); # pre-load converted a[0]
599 &movdqa (&QWP(0x40,"esp"),"xmm1");
600 &paddq ("xmm2",&QWP(0x60,"esp"));
601
602 &dec ("ecx");
603 &jnz (&label("madd_sse2"));
604
605 &paddq ("xmm2","xmm5"); # a[6]*b[6]+lw(a[0]*b[6]), reduction step [modulo-scheduled]
606 &paddq ("xmm3","xmm4"); # a[7]*b[6]+lw(a[0]*b[6])*0xffffffff, reduction step [modulo-scheduled]
607 &movdqa ("xmm1",&QWP(0x10,"ebx"));
608 &pmuludq("xmm0","xmm7"); # a[0]*b[7]
609 &movdqa(&QWP(0x50,"esp"),"xmm2");
610
611 &movdqa ("xmm2",&QWP(0x20,"ebx"));
612 &pmuludq("xmm1","xmm7"); # a[1]*b[7]
613 &movdqa(&QWP(0x60,"esp"),"xmm3");
614 &paddq ("xmm0",&QWP(0x00,"esp"));
615
616 &movdqa ("xmm3",&QWP(0x30,"ebx"));
617 &pmuludq("xmm2","xmm7"); # a[2]*b[7]
618 &movq ("xmm4","xmm0"); # clear upper 64 bits
619 &pslldq("xmm4",6);
620 &paddq ("xmm1",&QWP(0x10,"esp"));
621 &paddq ("xmm4","xmm0");
622 &movdqa("xmm5","xmm4");
623 &psrldq("xmm4",10); # upper 33 bits of a[0]*b[i]+t[0]
624
625 &movdqa ("xmm0",&QWP(0x40,"ebx"));
626 &pmuludq("xmm3","xmm7"); # a[3]*b[7]
627 &paddq ("xmm1","xmm4"); # a[1]*b[7]+hw(a[0]*b[7]), carry
628 &paddq ("xmm2",&QWP(0x20,"esp"));
629 &movdqa (&QWP(0x00,"esp"),"xmm1");
630
631 &movdqa ("xmm1",&QWP(0x50,"ebx"));
632 &pmuludq("xmm0","xmm7"); # a[4]*b[7]
633 &paddq ("xmm3",&QWP(0x30,"esp"));
634 &movdqa (&QWP(0x10,"esp"),"xmm2");
635 &pand ("xmm5","xmm6"); # lower 32 bits of a[0]*b[i]
636
637 &movdqa ("xmm2",&QWP(0x60,"ebx"));
638 &pmuludq("xmm1","xmm7"); # a[5]*b[7]
639 &paddq ("xmm3","xmm5"); # reduction step
640 &paddq ("xmm0",&QWP(0x40,"esp"));
641 &movdqa (&QWP(0x20,"esp"),"xmm3");
642 &pshufd("xmm4","xmm5",0b10110001); # xmm4 = xmm5<<32, reduction step
643
644 &movdqa ("xmm3",&QWP(0x70,"ebx"));
645 &pmuludq("xmm2","xmm7"); # a[6]*b[7]
646 &paddq ("xmm1",&QWP(0x50,"esp"));
647 &psubq ("xmm4","xmm5"); # xmm4 = xmm5*0xffffffff, reduction step
648 &movdqa (&QWP(0x30,"esp"),"xmm0");
649
650 &pmuludq("xmm3","xmm7"); # a[7]*b[7]
651 &pcmpeqd("xmm7","xmm7");
652 &movdqa ("xmm0",&QWP(0x00,"esp"));
653 &pslldq ("xmm7",8);
654 &movdqa (&QWP(0x40,"esp"),"xmm1");
655 &paddq ("xmm2",&QWP(0x60,"esp"));
656
657 &paddq ("xmm2","xmm5"); # a[6]*b[7]+lw(a[0]*b[7]), reduction step
658	&paddq	("xmm3","xmm4");		# a[7]*b[7]+lw(a[0]*b[7])*0xffffffff, reduction step
659 &movdqa(&QWP(0x50,"esp"),"xmm2");
660 &movdqa(&QWP(0x60,"esp"),"xmm3");
661
662 &movdqa ("xmm1",&QWP(0x10,"esp"));
663 &movdqa ("xmm2",&QWP(0x20,"esp"));
664 &movdqa ("xmm3",&QWP(0x30,"esp"));
665
666 &movq ("xmm4","xmm0"); # "flatten"
667 &pand ("xmm0","xmm7");
668 &xor ("ebp","ebp");
669 &pslldq ("xmm4",6);
670 &movq ("xmm5","xmm1");
671 &paddq ("xmm0","xmm4");
672 &pand ("xmm1","xmm7");
673 &psrldq ("xmm0",6);
674 &movd ("eax","xmm0");
675 &psrldq ("xmm0",4);
676
677 &paddq ("xmm5","xmm0");
678 &movdqa ("xmm0",&QWP(0x40,"esp"));
679 &sub ("eax",-1); # start subtracting modulus,
680 # this is used to determine
681 # if result is larger/smaller
682 # than modulus (see below)
683 &pslldq ("xmm5",6);
684 &movq ("xmm4","xmm2");
685 &paddq ("xmm1","xmm5");
686 &pand ("xmm2","xmm7");
687 &psrldq ("xmm1",6);
688 &mov (&DWP(4*0,"edi"),"eax");
689 &movd ("eax","xmm1");
690 &psrldq ("xmm1",4);
691
692 &paddq ("xmm4","xmm1");
693 &movdqa ("xmm1",&QWP(0x50,"esp"));
694 &sbb ("eax",-1);
695 &pslldq ("xmm4",6);
696 &movq ("xmm5","xmm3");
697 &paddq ("xmm2","xmm4");
698 &pand ("xmm3","xmm7");
699 &psrldq ("xmm2",6);
700 &mov (&DWP(4*1,"edi"),"eax");
701 &movd ("eax","xmm2");
702 &psrldq ("xmm2",4);
703
704 &paddq ("xmm5","xmm2");
705 &movdqa ("xmm2",&QWP(0x60,"esp"));
706 &sbb ("eax",-1);
707 &pslldq ("xmm5",6);
708 &movq ("xmm4","xmm0");
709 &paddq ("xmm3","xmm5");
710 &pand ("xmm0","xmm7");
711 &psrldq ("xmm3",6);
712 &mov (&DWP(4*2,"edi"),"eax");
713 &movd ("eax","xmm3");
714 &psrldq ("xmm3",4);
715
716 &paddq ("xmm4","xmm3");
717 &sbb ("eax",0);
718 &pslldq ("xmm4",6);
719 &movq ("xmm5","xmm1");
720 &paddq ("xmm0","xmm4");
721 &pand ("xmm1","xmm7");
722 &psrldq ("xmm0",6);
723 &mov (&DWP(4*3,"edi"),"eax");
724 &movd ("eax","xmm0");
725 &psrldq ("xmm0",4);
726
727 &paddq ("xmm5","xmm0");
728 &sbb ("eax",0);
729 &pslldq ("xmm5",6);
730 &movq ("xmm4","xmm2");
731 &paddq ("xmm1","xmm5");
732 &pand ("xmm2","xmm7");
733 &psrldq ("xmm1",6);
734 &movd ("ebx","xmm1");
735 &psrldq ("xmm1",4);
736 &mov ("esp","edx");
737
738 &paddq ("xmm4","xmm1");
739 &pslldq ("xmm4",6);
740 &paddq ("xmm2","xmm4");
741 &psrldq ("xmm2",6);
742 &movd ("ecx","xmm2");
743 &psrldq ("xmm2",4);
744 &sbb ("ebx",0);
745 &movd ("edx","xmm2");
746 &pextrw ("esi","xmm2",2); # top-most overflow bit
747 &sbb ("ecx",1);
748 &sbb ("edx",-1);
749 &sbb ("esi",0); # borrow from subtraction
750
751 # Final step is "if result > mod, subtract mod", and at this point
752 # we have result - mod written to output buffer, as well as borrow
753 # bit from this subtraction, and if borrow bit is set, we add
754 # modulus back.
755 #
756 # Note that because mod has special form, i.e. consists of
757 # 0xffffffff, 1 and 0s, we can conditionally synthesize it by
758 # assigning borrow bit to one register, %ebp, and its negative
759 # to another, %esi. But we started by calculating %esi...
760
761 &sub ("ebp","esi");
762 &add (&DWP(4*0,"edi"),"esi"); # add modulus or zero
763 &adc (&DWP(4*1,"edi"),"esi");
764 &adc (&DWP(4*2,"edi"),"esi");
765 &adc (&DWP(4*3,"edi"),0);
766 &adc ("eax",0);
767 &adc ("ebx",0);
768 &mov (&DWP(4*4,"edi"),"eax");
769 &adc ("ecx","ebp");
770 &mov (&DWP(4*5,"edi"),"ebx");
771 &adc ("edx","esi");
772 &mov (&DWP(4*6,"edi"),"ecx");
773 &mov (&DWP(4*7,"edi"),"edx");
774
775 &ret ();
776
777&set_label("mul_mont_ialu",16); }
778
779 ########################################
780 # IALU code path suitable for all CPUs.
781 ########################################
782 # stack layout:
783 # +------------------------------------+< %esp
784 # | 8 32-bit temporary words, accessed |
785 # | as circular buffer |
786 # . .
787 # . .
788 # +------------------------------------+< +32
789 # | offloaded destination pointer |
790 # +------------------------------------+
791 # | unused |
792 # +------------------------------------+< +40
793 &sub ("esp",10*4);
794
795 &mov ("eax",&DWP(0*4,"esi")); # a[0]
796 &mov ("ebx",&DWP(0*4,"ebp")); # b[0]
797 &mov (&DWP(8*4,"esp"),"edi"); # off-load dst ptr
798
799 &mul ("ebx"); # a[0]*b[0]
800 &mov (&DWP(0*4,"esp"),"eax"); # t[0]
801 &mov ("eax",&DWP(1*4,"esi"));
802	&mov	("ecx","edx");
803
804 &mul ("ebx"); # a[1]*b[0]
805 &add ("ecx","eax");
806 &mov ("eax",&DWP(2*4,"esi"));
807 &adc ("edx",0);
808 &mov (&DWP(1*4,"esp"),"ecx"); # t[1]
809 &mov ("ecx","edx");
810
811 &mul ("ebx"); # a[2]*b[0]
812 &add ("ecx","eax");
813 &mov ("eax",&DWP(3*4,"esi"));
814 &adc ("edx",0);
815 &mov (&DWP(2*4,"esp"),"ecx"); # t[2]
816 &mov ("ecx","edx");
817
818 &mul ("ebx"); # a[3]*b[0]
819 &add ("ecx","eax");
820 &mov ("eax",&DWP(4*4,"esi"));
821 &adc ("edx",0);
822 &mov (&DWP(3*4,"esp"),"ecx"); # t[3]
823 &mov ("ecx","edx");
824
825 &mul ("ebx"); # a[4]*b[0]
826 &add ("ecx","eax");
827 &mov ("eax",&DWP(5*4,"esi"));
828 &adc ("edx",0);
829 &mov (&DWP(4*4,"esp"),"ecx"); # t[4]
830 &mov ("ecx","edx");
831
832 &mul ("ebx"); # a[5]*b[0]
833 &add ("ecx","eax");
834 &mov ("eax",&DWP(6*4,"esi"));
835 &adc ("edx",0);
836 &mov (&DWP(5*4,"esp"),"ecx"); # t[5]
837 &mov ("ecx","edx");
838
839 &mul ("ebx"); # a[6]*b[0]
840 &add ("ecx","eax");
841 &mov ("eax",&DWP(7*4,"esi"));
842 &adc ("edx",0);
843 &mov (&DWP(6*4,"esp"),"ecx"); # t[6]
844 &mov ("ecx","edx");
845
846 &xor ("edi","edi"); # initial top-most carry
847 &mul ("ebx"); # a[7]*b[0]
848 &add ("ecx","eax"); # t[7]
849 &mov ("eax",&DWP(0*4,"esp")); # t[0]
850 &adc ("edx",0); # t[8]
851
852for ($i=0;$i<7;$i++) {
853 my $j=$i+1;
854
855 # Reduction iteration is normally performed by accumulating
856 # result of multiplication of modulus by "magic" digit [and
857 # omitting least significant word, which is guaranteed to
858 # be 0], but thanks to special form of modulus and "magic"
859 # digit being equal to least significant word, it can be
860 # performed with additions and subtractions alone. Indeed:
861 #
862 # ffff.0001.0000.0000.0000.ffff.ffff.ffff
863 # * abcd
864 # + xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
865 #
866 # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
867 # rewrite above as:
868 #
869 # xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
870 # + abcd.0000.abcd.0000.0000.abcd.0000.0000.0000
871 # - abcd.0000.0000.0000.0000.0000.0000.abcd
872 #
873 # or marking redundant operations:
874 #
875 # xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.----
876 # + abcd.0000.abcd.0000.0000.abcd.----.----.----
877 # - abcd.----.----.----.----.----.----.----
878
879 &add (&DWP((($i+3)%8)*4,"esp"),"eax"); # t[3]+=t[0]
880 &adc (&DWP((($i+4)%8)*4,"esp"),0); # t[4]+=0
881 &adc (&DWP((($i+5)%8)*4,"esp"),0); # t[5]+=0
882 &adc (&DWP((($i+6)%8)*4,"esp"),"eax"); # t[6]+=t[0]
883 &adc ("ecx",0); # t[7]+=0
884 &adc ("edx","eax"); # t[8]+=t[0]
885 &adc ("edi",0); # top-most carry
886 &mov ("ebx",&DWP($j*4,"ebp")); # b[i]
887 &sub ("ecx","eax"); # t[7]-=t[0]
888 &mov ("eax",&DWP(0*4,"esi")); # a[0]
889 &sbb ("edx",0); # t[8]-=0
890 &mov (&DWP((($i+7)%8)*4,"esp"),"ecx");
891 &sbb ("edi",0); # top-most carry,
892 # keep in mind that
893					# net result is
894 # *addition* of value
895 # with (abcd<<32)-abcd
896 # on top, so that
897 # underflow is
898 # impossible, because
899 # (abcd<<32)-abcd
900 # doesn't underflow
901 &mov (&DWP((($i+8)%8)*4,"esp"),"edx");
902
903 &mul ("ebx"); # a[0]*b[i]
904 &add ("eax",&DWP((($j+0)%8)*4,"esp"));
905 &adc ("edx",0);
906 &mov (&DWP((($j+0)%8)*4,"esp"),"eax");
907 &mov ("eax",&DWP(1*4,"esi"));
908	&mov	("ecx","edx");
909
910 &mul ("ebx"); # a[1]*b[i]
911 &add ("ecx",&DWP((($j+1)%8)*4,"esp"));
912 &adc ("edx",0);
913 &add ("ecx","eax");
914 &adc ("edx",0);
915 &mov ("eax",&DWP(2*4,"esi"));
916 &mov (&DWP((($j+1)%8)*4,"esp"),"ecx");
917 &mov ("ecx","edx");
918
919 &mul ("ebx"); # a[2]*b[i]
920 &add ("ecx",&DWP((($j+2)%8)*4,"esp"));
921 &adc ("edx",0);
922 &add ("ecx","eax");
923 &adc ("edx",0);
924 &mov ("eax",&DWP(3*4,"esi"));
925 &mov (&DWP((($j+2)%8)*4,"esp"),"ecx");
926 &mov ("ecx","edx");
927
928 &mul ("ebx"); # a[3]*b[i]
929 &add ("ecx",&DWP((($j+3)%8)*4,"esp"));
930 &adc ("edx",0);
931 &add ("ecx","eax");
932 &adc ("edx",0);
933 &mov ("eax",&DWP(4*4,"esi"));
934 &mov (&DWP((($j+3)%8)*4,"esp"),"ecx");
935 &mov ("ecx","edx");
936
937 &mul ("ebx"); # a[4]*b[i]
938 &add ("ecx",&DWP((($j+4)%8)*4,"esp"));
939 &adc ("edx",0);
940 &add ("ecx","eax");
941 &adc ("edx",0);
942 &mov ("eax",&DWP(5*4,"esi"));
943 &mov (&DWP((($j+4)%8)*4,"esp"),"ecx");
944 &mov ("ecx","edx");
945
946 &mul ("ebx"); # a[5]*b[i]
947 &add ("ecx",&DWP((($j+5)%8)*4,"esp"));
948 &adc ("edx",0);
949 &add ("ecx","eax");
950 &adc ("edx",0);
951 &mov ("eax",&DWP(6*4,"esi"));
952 &mov (&DWP((($j+5)%8)*4,"esp"),"ecx");
953 &mov ("ecx","edx");
954
955 &mul ("ebx"); # a[6]*b[i]
956 &add ("ecx",&DWP((($j+6)%8)*4,"esp"));
957 &adc ("edx",0);
958 &add ("ecx","eax");
959 &adc ("edx",0);
960 &mov ("eax",&DWP(7*4,"esi"));
961 &mov (&DWP((($j+6)%8)*4,"esp"),"ecx");
962 &mov ("ecx","edx");
963
964 &mul ("ebx"); # a[7]*b[i]
965 &add ("ecx",&DWP((($j+7)%8)*4,"esp"));
966 &adc ("edx",0);
967 &add ("ecx","eax"); # t[7]
968 &mov ("eax",&DWP((($j+0)%8)*4,"esp")); # t[0]
969 &adc ("edx","edi"); # t[8]
970 &mov ("edi",0);
971 &adc ("edi",0); # top-most carry
972}
973 &mov ("ebp",&DWP(8*4,"esp")); # restore dst ptr
974 &xor ("esi","esi");
975 my $j=$i+1;
976
977 # last multiplication-less reduction
978 &add (&DWP((($i+3)%8)*4,"esp"),"eax"); # t[3]+=t[0]
979 &adc (&DWP((($i+4)%8)*4,"esp"),0); # t[4]+=0
980 &adc (&DWP((($i+5)%8)*4,"esp"),0); # t[5]+=0
981 &adc (&DWP((($i+6)%8)*4,"esp"),"eax"); # t[6]+=t[0]
982 &adc ("ecx",0); # t[7]+=0
983 &adc ("edx","eax"); # t[8]+=t[0]
984 &adc ("edi",0); # top-most carry
985 &mov ("ebx",&DWP((($j+1)%8)*4,"esp"));
986 &sub ("ecx","eax"); # t[7]-=t[0]
987 &mov ("eax",&DWP((($j+0)%8)*4,"esp"));
988 &sbb ("edx",0); # t[8]-=0
989 &mov (&DWP((($i+7)%8)*4,"esp"),"ecx");
990 &sbb ("edi",0); # top-most carry
991 &mov (&DWP((($i+8)%8)*4,"esp"),"edx");
992
993 # Final step is "if result > mod, subtract mod", but we do it
994 # "other way around", namely write result - mod to output buffer
995 # and if subtraction borrowed, add modulus back.
996
997 &mov ("ecx",&DWP((($j+2)%8)*4,"esp"));
998 &sub ("eax",-1);
999 &mov ("edx",&DWP((($j+3)%8)*4,"esp"));
1000 &sbb ("ebx",-1);
1001 &mov (&DWP(0*4,"ebp"),"eax");
1002 &sbb ("ecx",-1);
1003 &mov (&DWP(1*4,"ebp"),"ebx");
1004 &sbb ("edx",0);
1005 &mov (&DWP(2*4,"ebp"),"ecx");
1006 &mov (&DWP(3*4,"ebp"),"edx");
1007
1008 &mov ("eax",&DWP((($j+4)%8)*4,"esp"));
1009 &mov ("ebx",&DWP((($j+5)%8)*4,"esp"));
1010 &mov ("ecx",&DWP((($j+6)%8)*4,"esp"));
1011 &sbb ("eax",0);
1012 &mov ("edx",&DWP((($j+7)%8)*4,"esp"));
1013 &sbb ("ebx",0);
1014 &sbb ("ecx",1);
1015 &sbb ("edx",-1);
1016 &sbb ("edi",0);
1017
1018 # Note that because mod has special form, i.e. consists of
1019 # 0xffffffff, 1 and 0s, we can conditionally synthesize it by
1020 # assigning borrow bit to one register, %ebp, and its negative
1021 # to another, %esi. But we started by calculating %esi...
1022
1023 &sub ("esi","edi");
1024 &add (&DWP(0*4,"ebp"),"edi"); # add modulus or zero
1025 &adc (&DWP(1*4,"ebp"),"edi");
1026 &adc (&DWP(2*4,"ebp"),"edi");
1027 &adc (&DWP(3*4,"ebp"),0);
1028 &adc ("eax",0);
1029 &adc ("ebx",0);
1030 &mov (&DWP(4*4,"ebp"),"eax");
1031 &adc ("ecx","esi");
1032 &mov (&DWP(5*4,"ebp"),"ebx");
1033 &adc ("edx","edi");
1034 &mov (&DWP(6*4,"ebp"),"ecx");
1035 &mov ("edi","ebp"); # fulfill contract
1036 &mov (&DWP(7*4,"ebp"),"edx");
1037
1038 &add ("esp",10*4);
1039 &ret ();
1040&function_end_B("_ecp_nistz256_mul_mont");
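The reduction shortcut spelled out in the comment blocks above (both in the SSE2 path and in the IALU loop) can be checked independently: because the magic digit equals the least significant word, one reduction step is just a few word-aligned additions and subtractions of that word, and it always clears the low 32 bits. Below is a minimal Math::BigInt sketch of that identity, plus a word-serial reference model of the Montgomery product the routine computes; the helper names and test values are illustrative only:

    #!/usr/bin/env perl
    use strict;
    use warnings;
    use Math::BigInt;

    sub pow2 { Math::BigInt->new(2) ** $_[0] }
    my $p = pow2(256) - pow2(224) + pow2(192) + pow2(96) - 1;   # NIST P-256 prime

    # Identity behind the "additions and subtractions only" reduction step:
    # t*p == (t<<256) + (t<<192) + (t<<96) - (t<<224) - t.
    my $acc = Math::BigInt->from_hex("0x0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef");
    my $t   = $acc % pow2(32);                       # magic digit == low word
    my $tp  = $t*pow2(256) + $t*pow2(192) + $t*pow2(96) - $t*pow2(224) - $t;
    print "identity ok\n"   if $tp == $t * $p;
    print "low word zero\n" if ($acc + $tp) % pow2(32) == 0;

    # Word-serial reference model: eight 32-bit digits of b, one add/shift
    # reduction per digit, yielding a*b*2^-256 mod p.
    sub mul_mont_ref {
        my ($a, $b) = @_;
        my $r = Math::BigInt->new(0);
        for my $i (0 .. 7) {
            my $bi = ($b >> (32*$i)) % pow2(32);     # i-th word of b
            $r += $a * $bi;
            my $m = $r % pow2(32);                   # magic digit
            $r   = ($r + $m * $p) >> 32;             # low word cancels
        }
        $r -= $p if $r >= $p;                        # final conditional subtraction
        return $r;
    }

    my ($a, $b) = map { Math::BigInt->from_hex($_) }
        ("0xde0b6b3a7640000de0b6b3a7640000de0b6b3a7640000de0b6b3a76400001",
         "0x0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcd");
    my $rinv = pow2(256)->bmodinv($p);               # 2^-256 mod p
    print "mont ok\n" if mul_mont_ref($a, $b) == ($a * $b * $rinv) % $p;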
1041
1042########################################################################
1043# void ecp_nistz256_select_w5(P256_POINT *edi,const void *esi,
1044# int ebp);
1045&function_begin("ecp_nistz256_select_w5");
1046 &mov ("esi",&wparam(1));
1047 &mov ("ebp",&wparam(2));
1048
1049 &lea ("esi",&DWP(0,"esi","ebp",4));
1050 &neg ("ebp");
1051 &sar ("ebp",31);
1052 &mov ("edi",&wparam(0));
1053 &lea ("esi",&DWP(0,"esi","ebp",4));
1054
1055 for($i=0;$i<24;$i+=4) {
1056 &mov ("eax",&DWP(64*($i+0),"esi"));
1057 &mov ("ebx",&DWP(64*($i+1),"esi"));
1058 &mov ("ecx",&DWP(64*($i+2),"esi"));
1059 &mov ("edx",&DWP(64*($i+3),"esi"));
1060 &and ("eax","ebp");
1061 &and ("ebx","ebp");
1062 &and ("ecx","ebp");
1063 &and ("edx","ebp");
1064 &mov (&DWP(4*($i+0),"edi"),"eax");
1065 &mov (&DWP(4*($i+1),"edi"),"ebx");
1066 &mov (&DWP(4*($i+2),"edi"),"ecx");
1067 &mov (&DWP(4*($i+3),"edi"),"edx");
1068 }
1069&function_end("ecp_nistz256_select_w5");
1070
1071########################################################################
1072# void ecp_nistz256_select_w7(P256_POINT_AFFINE *edi,const void *esi,
1073# int ebp);
1074&function_begin("ecp_nistz256_select_w7");
1075 &mov ("esi",&wparam(1));
1076 &mov ("ebp",&wparam(2));
1077
1078 &add ("esi","ebp");
1079	&neg	("ebp");
1080 &sar ("ebp",31);
1081 &mov ("edi",&wparam(0));
1082 &lea ("esi",&DWP(0,"esi","ebp"));
1083
1084 for($i=0;$i<64;$i+=4) {
1085 &movz ("eax",&BP(64*($i+0),"esi"));
1086 &movz ("ebx",&BP(64*($i+1),"esi"));
1087 &movz ("ecx",&BP(64*($i+2),"esi"));
1088 &and ("eax","ebp");
1089 &movz ("edx",&BP(64*($i+3),"esi"));
1090 &and ("ebx","ebp");
1091 &mov (&BP($i+0,"edi"),"al");
1092 &and ("ecx","ebp");
1093 &mov (&BP($i+1,"edi"),"bl");
1094 &and ("edx","ebp");
1095 &mov (&BP($i+2,"edi"),"cl");
1096 &mov (&BP($i+3,"edi"),"dl");
1097 }
1098&function_end("ecp_nistz256_select_w7");
1099
1100########################################################################
1101# following subroutines are "literal" implementation of those found in
1102# ecp_nistz256.c
1103#
1104########################################################################
1105# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
1106#
1107&static_label("point_double_shortcut");
1108&function_begin("ecp_nistz256_point_double");
1109{ my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));
1110
1111 &mov ("esi",&wparam(1));
1112
1113 # above map() describes stack layout with 5 temporary
1114 # 256-bit vectors on top, then we take extra word for
1115 # OPENSSL_ia32cap_P copy.
1116 &stack_push(8*5+1);
1117 if ($sse2) {
1118 &call ("_picup_eax");
1119 &set_label("pic");
1120 &picmeup("edx","OPENSSL_ia32cap_P","eax",&label("pic"));
1121 &mov ("ebp",&DWP(0,"edx")); }
1122
1123&set_label("point_double_shortcut");
1124 &mov ("eax",&DWP(0,"esi")); # copy in_x
1125 &mov ("ebx",&DWP(4,"esi"));
1126 &mov ("ecx",&DWP(8,"esi"));
1127 &mov ("edx",&DWP(12,"esi"));
1128 &mov (&DWP($in_x+0,"esp"),"eax");
1129 &mov (&DWP($in_x+4,"esp"),"ebx");
1130 &mov (&DWP($in_x+8,"esp"),"ecx");
1131 &mov (&DWP($in_x+12,"esp"),"edx");
1132 &mov ("eax",&DWP(16,"esi"));
1133 &mov ("ebx",&DWP(20,"esi"));
1134 &mov ("ecx",&DWP(24,"esi"));
1135 &mov ("edx",&DWP(28,"esi"));
1136 &mov (&DWP($in_x+16,"esp"),"eax");
1137 &mov (&DWP($in_x+20,"esp"),"ebx");
1138 &mov (&DWP($in_x+24,"esp"),"ecx");
1139 &mov (&DWP($in_x+28,"esp"),"edx");
1140 &mov (&DWP(32*5,"esp"),"ebp"); # OPENSSL_ia32cap_P copy
1141
1142 &lea ("ebp",&DWP(32,"esi"));
1143 &lea ("esi",&DWP(32,"esi"));
1144 &lea ("edi",&DWP($S,"esp"));
1145 &call ("_ecp_nistz256_add"); # p256_mul_by_2(S, in_y);
1146
1147 &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy
1148 &mov ("esi",64);
1149 &add ("esi",&wparam(1));
1150 &lea ("edi",&DWP($Zsqr,"esp"));
1151 &mov ("ebp","esi");
1152 &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Zsqr, in_z);
1153
1154 &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy
1155 &lea ("esi",&DWP($S,"esp"));
1156 &lea ("ebp",&DWP($S,"esp"));
1157 &lea ("edi",&DWP($S,"esp"));
1158 &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(S, S);
1159
1160 &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy
1161 &mov ("ebp",&wparam(1));
1162 &lea ("esi",&DWP(32,"ebp"));
1163 &lea ("ebp",&DWP(64,"ebp"));
1164 &lea ("edi",&DWP($tmp0,"esp"));
1165 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(tmp0, in_z, in_y);
1166
1167 &lea ("esi",&DWP($in_x,"esp"));
1168 &lea ("ebp",&DWP($Zsqr,"esp"));
1169 &lea ("edi",&DWP($M,"esp"));
1170 &call ("_ecp_nistz256_add"); # p256_add(M, in_x, Zsqr);
1171
1172 &mov ("edi",64);
1173 &lea ("esi",&DWP($tmp0,"esp"));
1174 &lea ("ebp",&DWP($tmp0,"esp"));
1175 &add ("edi",&wparam(0));
1176 &call ("_ecp_nistz256_add"); # p256_mul_by_2(res_z, tmp0);
1177
1178 &lea ("esi",&DWP($in_x,"esp"));
1179 &lea ("ebp",&DWP($Zsqr,"esp"));
1180 &lea ("edi",&DWP($Zsqr,"esp"));
1181 &call ("_ecp_nistz256_sub"); # p256_sub(Zsqr, in_x, Zsqr);
1182
1183 &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy
1184 &lea ("esi",&DWP($S,"esp"));
1185 &lea ("ebp",&DWP($S,"esp"));
1186 &lea ("edi",&DWP($tmp0,"esp"));
1187 &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(tmp0, S);
1188
1189 &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy
1190 &lea ("esi",&DWP($M,"esp"));
1191 &lea ("ebp",&DWP($Zsqr,"esp"));
1192 &lea ("edi",&DWP($M,"esp"));
1193 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(M, M, Zsqr);
1194
1195 &mov ("edi",32);
1196 &lea ("esi",&DWP($tmp0,"esp"));
1197 &add ("edi",&wparam(0));
1198 &call ("_ecp_nistz256_div_by_2"); # p256_div_by_2(res_y, tmp0);
1199
1200 &lea ("esi",&DWP($M,"esp"));
1201 &lea ("ebp",&DWP($M,"esp"));
1202 &lea ("edi",&DWP($tmp0,"esp"));
1203 &call ("_ecp_nistz256_add"); # 1/2 p256_mul_by_3(M, M);
1204
1205 &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy
1206 &lea ("esi",&DWP($in_x,"esp"));
1207 &lea ("ebp",&DWP($S,"esp"));
1208 &lea ("edi",&DWP($S,"esp"));
1209 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S, S, in_x);
1210
1211 &lea ("esi",&DWP($tmp0,"esp"));
1212 &lea ("ebp",&DWP($M,"esp"));
1213 &lea ("edi",&DWP($M,"esp"));
1214 &call ("_ecp_nistz256_add"); # 2/2 p256_mul_by_3(M, M);
1215
1216 &lea ("esi",&DWP($S,"esp"));
1217 &lea ("ebp",&DWP($S,"esp"));
1218 &lea ("edi",&DWP($tmp0,"esp"));
1219 &call ("_ecp_nistz256_add"); # p256_mul_by_2(tmp0, S);
1220
1221 &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy
1222 &lea ("esi",&DWP($M,"esp"));
1223 &lea ("ebp",&DWP($M,"esp"));
1224 &mov ("edi",&wparam(0));
1225 &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(res_x, M);
1226
1227 &mov ("esi","edi"); # %edi is still res_x here
1228 &lea ("ebp",&DWP($tmp0,"esp"));
1229 &call ("_ecp_nistz256_sub"); # p256_sub(res_x, res_x, tmp0);
1230
1231 &lea ("esi",&DWP($S,"esp"));
1232 &mov ("ebp","edi"); # %edi is still res_x
1233 &lea ("edi",&DWP($S,"esp"));
1234 &call ("_ecp_nistz256_sub"); # p256_sub(S, S, res_x);
1235
1236 &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy
1237 &mov ("esi","edi"); # %edi is still &S
1238 &lea ("ebp",&DWP($M,"esp"));
1239 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S, S, M);
1240
1241 &mov ("ebp",32);
1242 &lea ("esi",&DWP($S,"esp"));
1243 &add ("ebp",&wparam(0));
1244 &mov ("edi","ebp");
1245 &call ("_ecp_nistz256_sub"); # p256_sub(res_y, S, res_y);
1246
1247 &stack_pop(8*5+1);
1248} &function_end("ecp_nistz256_point_double");
1249
1250########################################################################
1251# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
1252# const P256_POINT *in2);
1253&function_begin("ecp_nistz256_point_add");
1254{ my ($res_x,$res_y,$res_z,
1255 $in1_x,$in1_y,$in1_z,
1256 $in2_x,$in2_y,$in2_z,
1257 $H,$Hsqr,$R,$Rsqr,$Hcub,
1258 $U1,$U2,$S1,$S2)=map(32*$_,(0..17));
1259 my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
1260
1261 &mov ("esi",&wparam(2));
1262
1263 # above map() describes stack layout with 18 temporary
1264 # 256-bit vectors on top, then we take extra words for
1265 # !in1infty, !in2infty, result of check for zero and
1266 # OPENSSL_ia32cap_P copy. [one unused word for padding]
1267 &stack_push(8*18+5);
1268 if ($sse2) {
1269 &call ("_picup_eax");
1270 &set_label("pic");
1271 &picmeup("edx","OPENSSL_ia32cap_P","eax",&label("pic"));
1272 &mov ("ebp",&DWP(0,"edx")); }
1273
1274 &lea ("edi",&DWP($in2_x,"esp"));
1275 for($i=0;$i<96;$i+=16) {
1276 &mov ("eax",&DWP($i+0,"esi")); # copy in2
1277 &mov ("ebx",&DWP($i+4,"esi"));
1278 &mov ("ecx",&DWP($i+8,"esi"));
1279 &mov ("edx",&DWP($i+12,"esi"));
1280 &mov (&DWP($i+0,"edi"),"eax");
1281 &mov (&DWP(32*18+12,"esp"),"ebp") if ($i==0);
1282 &mov ("ebp","eax") if ($i==64);
1283 &or ("ebp","eax") if ($i>64);
1284 &mov (&DWP($i+4,"edi"),"ebx");
1285 &or ("ebp","ebx") if ($i>=64);
1286 &mov (&DWP($i+8,"edi"),"ecx");
1287 &or ("ebp","ecx") if ($i>=64);
1288 &mov (&DWP($i+12,"edi"),"edx");
1289 &or ("ebp","edx") if ($i>=64);
1290 }
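	# %ebp now holds the OR of all in2_z words; the next few instructions
	# turn "non-zero" into an all-ones mask (0 - x borrows iff x != 0),
	# giving the !in2infty flag stored below.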
1291 &xor ("eax","eax");
1292 &mov ("esi",&wparam(1));
1293 &sub ("eax","ebp");
1294 &or ("ebp","eax");
1295 &sar ("ebp",31);
1296 &mov (&DWP(32*18+4,"esp"),"ebp"); # !in2infty
1297
1298 &lea ("edi",&DWP($in1_x,"esp"));
1299 for($i=0;$i<96;$i+=16) {
1300 &mov ("eax",&DWP($i+0,"esi")); # copy in1
1301 &mov ("ebx",&DWP($i+4,"esi"));
1302 &mov ("ecx",&DWP($i+8,"esi"));
1303 &mov ("edx",&DWP($i+12,"esi"));
1304 &mov (&DWP($i+0,"edi"),"eax");
1305 &mov ("ebp","eax") if ($i==64);
1306 &or ("ebp","eax") if ($i>64);
1307 &mov (&DWP($i+4,"edi"),"ebx");
1308 &or ("ebp","ebx") if ($i>=64);
1309 &mov (&DWP($i+8,"edi"),"ecx");
1310 &or ("ebp","ecx") if ($i>=64);
1311 &mov (&DWP($i+12,"edi"),"edx");
1312 &or ("ebp","edx") if ($i>=64);
1313 }
1314 &xor ("eax","eax");
1315 &sub ("eax","ebp");
1316 &or ("ebp","eax");
1317 &sar ("ebp",31);
1318 &mov (&DWP(32*18+0,"esp"),"ebp"); # !in1infty
1319
1320 &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy
1321 &lea ("esi",&DWP($in2_z,"esp"));
1322 &lea ("ebp",&DWP($in2_z,"esp"));
1323 &lea ("edi",&DWP($Z2sqr,"esp"));
1324 &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Z2sqr, in2_z);
1325
1326 &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy
1327 &lea ("esi",&DWP($in1_z,"esp"));
1328 &lea ("ebp",&DWP($in1_z,"esp"));
1329 &lea ("edi",&DWP($Z1sqr,"esp"));
1330 &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Z1sqr, in1_z);
1331
1332 &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy
1333 &lea ("esi",&DWP($Z2sqr,"esp"));
1334 &lea ("ebp",&DWP($in2_z,"esp"));
1335 &lea ("edi",&DWP($S1,"esp"));
1336 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S1, Z2sqr, in2_z);
1337
1338 &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy
1339 &lea ("esi",&DWP($Z1sqr,"esp"));
1340 &lea ("ebp",&DWP($in1_z,"esp"));
1341 &lea ("edi",&DWP($S2,"esp"));
1342 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S2, Z1sqr, in1_z);
1343
1344 &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy
1345 &lea ("esi",&DWP($in1_y,"esp"));
1346 &lea ("ebp",&DWP($S1,"esp"));
1347 &lea ("edi",&DWP($S1,"esp"));
1348 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S1, S1, in1_y);
1349
1350 &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy
1351 &lea ("esi",&DWP($in2_y,"esp"));
1352 &lea ("ebp",&DWP($S2,"esp"));
1353 &lea ("edi",&DWP($S2,"esp"));
1354 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S2, S2, in2_y);
1355
1356 &lea ("esi",&DWP($S2,"esp"));
1357 &lea ("ebp",&DWP($S1,"esp"));
1358 &lea ("edi",&DWP($R,"esp"));
1359 &call ("_ecp_nistz256_sub"); # p256_sub(R, S2, S1);
1360
1361 &or ("ebx","eax"); # see if result is zero
1362 &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy
1363 &or ("ebx","ecx");
1364 &or ("ebx","edx");
1365 &or ("ebx",&DWP(0,"edi"));
1366 &or ("ebx",&DWP(4,"edi"));
1367 &lea ("esi",&DWP($in1_x,"esp"));
1368 &or ("ebx",&DWP(8,"edi"));
1369 &lea ("ebp",&DWP($Z2sqr,"esp"));
1370 &or ("ebx",&DWP(12,"edi"));
1371 &lea ("edi",&DWP($U1,"esp"));
1372 &mov (&DWP(32*18+8,"esp"),"ebx");
1373
1374 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(U1, in1_x, Z2sqr);
1375
1376 &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy
1377 &lea ("esi",&DWP($in2_x,"esp"));
1378 &lea ("ebp",&DWP($Z1sqr,"esp"));
1379 &lea ("edi",&DWP($U2,"esp"));
1380 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(U2, in2_x, Z1sqr);
1381
1382 &lea ("esi",&DWP($U2,"esp"));
1383 &lea ("ebp",&DWP($U1,"esp"));
1384 &lea ("edi",&DWP($H,"esp"));
1385 &call ("_ecp_nistz256_sub"); # p256_sub(H, U2, U1);
1386
1387 &or ("eax","ebx"); # see if result is zero
1388 &or ("eax","ecx");
1389 &or ("eax","edx");
1390 &or ("eax",&DWP(0,"edi"));
1391 &or ("eax",&DWP(4,"edi"));
1392 &or ("eax",&DWP(8,"edi"));
1393 &or ("eax",&DWP(12,"edi"));
1394
1395 &data_byte(0x3e); # predict taken
1396 &jnz (&label("add_proceed")); # is_equal(U1,U2)?
1397
1398 &mov ("eax",&DWP(32*18+0,"esp"));
1399 &and ("eax",&DWP(32*18+4,"esp"));
1400 &mov ("ebx",&DWP(32*18+8,"esp"));
1401 &jz (&label("add_proceed")); # (in1infty || in2infty)?
1402 &test ("ebx","ebx");
1403 &jz (&label("add_double")); # is_equal(S1,S2)?
1404
1405 &mov ("edi",&wparam(0));
1406 &xor ("eax","eax");
1407 &mov ("ecx",96/4);
1408	&data_byte(0xfc,0xf3,0xab);		# cld; rep stosd
1409 &jmp (&label("add_done"));
1410
1411&set_label("add_double",16);
1412 &mov ("esi",&wparam(1));
1413 &mov ("ebp",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy
1414 &add ("esp",4*((8*18+5)-(8*5+1))); # difference in frame sizes
1415 &jmp (&label("point_double_shortcut"));
1416
1417&set_label("add_proceed",16);
1418 &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy
1419 &lea ("esi",&DWP($R,"esp"));
1420 &lea ("ebp",&DWP($R,"esp"));
1421 &lea ("edi",&DWP($Rsqr,"esp"));
1422 &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Rsqr, R);
1423
1424 &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy
1425 &lea ("esi",&DWP($H,"esp"));
1426 &lea ("ebp",&DWP($in1_z,"esp"));
1427 &lea ("edi",&DWP($res_z,"esp"));
1428 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(res_z, H, in1_z);
1429
1430 &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy
1431 &lea ("esi",&DWP($H,"esp"));
1432 &lea ("ebp",&DWP($H,"esp"));
1433 &lea ("edi",&DWP($Hsqr,"esp"));
1434 &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Hsqr, H);
1435
1436 &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy
1437 &lea ("esi",&DWP($in2_z,"esp"));
1438 &lea ("ebp",&DWP($res_z,"esp"));
1439 &lea ("edi",&DWP($res_z,"esp"));
1440 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(res_z, res_z, in2_z);
1441
1442 &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy
1443 &lea ("esi",&DWP($Hsqr,"esp"));
1444 &lea ("ebp",&DWP($U1,"esp"));
1445 &lea ("edi",&DWP($U2,"esp"));
1446 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(U2, U1, Hsqr);
1447
1448 &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy
1449 &lea ("esi",&DWP($H,"esp"));
1450 &lea ("ebp",&DWP($Hsqr,"esp"));
1451 &lea ("edi",&DWP($Hcub,"esp"));
1452 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(Hcub, Hsqr, H);
1453
1454 &lea ("esi",&DWP($U2,"esp"));
1455 &lea ("ebp",&DWP($U2,"esp"));
1456 &lea ("edi",&DWP($Hsqr,"esp"));
1457 &call ("_ecp_nistz256_add"); # p256_mul_by_2(Hsqr, U2);
1458
1459 &lea ("esi",&DWP($Rsqr,"esp"));
1460 &lea ("ebp",&DWP($Hsqr,"esp"));
1461 &lea ("edi",&DWP($res_x,"esp"));
1462 &call ("_ecp_nistz256_sub"); # p256_sub(res_x, Rsqr, Hsqr);
1463
1464 &lea ("esi",&DWP($res_x,"esp"));
1465 &lea ("ebp",&DWP($Hcub,"esp"));
1466 &lea ("edi",&DWP($res_x,"esp"));
1467 &call ("_ecp_nistz256_sub"); # p256_sub(res_x, res_x, Hcub);
1468
1469 &lea ("esi",&DWP($U2,"esp"));
1470 &lea ("ebp",&DWP($res_x,"esp"));
1471 &lea ("edi",&DWP($res_y,"esp"));
1472 &call ("_ecp_nistz256_sub"); # p256_sub(res_y, U2, res_x);
1473
1474 &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy
1475 &lea ("esi",&DWP($Hcub,"esp"));
1476 &lea ("ebp",&DWP($S1,"esp"));
1477 &lea ("edi",&DWP($S2,"esp"));
1478 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S2, S1, Hcub);
1479
1480 &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy
1481 &lea ("esi",&DWP($R,"esp"));
1482 &lea ("ebp",&DWP($res_y,"esp"));
1483 &lea ("edi",&DWP($res_y,"esp"));
1484 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(res_y, R, res_y);
1485
1486 &lea ("esi",&DWP($res_y,"esp"));
1487 &lea ("ebp",&DWP($S2,"esp"));
1488 &lea ("edi",&DWP($res_y,"esp"));
1489 &call ("_ecp_nistz256_sub"); # p256_sub(res_y, res_y, S2);
1490
1491 &mov ("ebp",&DWP(32*18+0,"esp")); # !in1infty
1492 &mov ("esi",&DWP(32*18+4,"esp")); # !in2infty
1493 &mov ("edi",&wparam(0));
1494 &mov ("edx","ebp");
1495 &not ("ebp");
1496 &and ("edx","esi");
1497 &and ("ebp","esi");
1498 &not ("esi");
1499
1500 ########################################
1501 # conditional moves
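	# edx = !in1infty & !in2infty	-> take the computed result
	# ebp =  in1infty & !in2infty	-> copy in2
	# esi =  in2infty		-> copy in1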
1502 for($i=64;$i<96;$i+=4) {
1503 &mov ("eax","edx");
1504 &and ("eax",&DWP($res_x+$i,"esp"));
1505 &mov ("ebx","ebp");
1506 &and ("ebx",&DWP($in2_x+$i,"esp"));
1507 &mov ("ecx","esi");
1508 &and ("ecx",&DWP($in1_x+$i,"esp"));
1509 &or ("eax","ebx");
1510 &or ("eax","ecx");
1511 &mov (&DWP($i,"edi"),"eax");
1512 }
1513 for($i=0;$i<64;$i+=4) {
1514 &mov ("eax","edx");
1515 &and ("eax",&DWP($res_x+$i,"esp"));
1516 &mov ("ebx","ebp");
1517 &and ("ebx",&DWP($in2_x+$i,"esp"));
1518 &mov ("ecx","esi");
1519 &and ("ecx",&DWP($in1_x+$i,"esp"));
1520 &or ("eax","ebx");
1521 &or ("eax","ecx");
1522 &mov (&DWP($i,"edi"),"eax");
1523 }
1524 &set_label("add_done");
1525 &stack_pop(8*18+5);
1526} &function_end("ecp_nistz256_point_add");
1527
1528########################################################################
1529# void ecp_nistz256_point_add_affine(P256_POINT *out,
1530# const P256_POINT *in1,
1531# const P256_POINT_AFFINE *in2);
1532&function_begin("ecp_nistz256_point_add_affine");
1533{
1534 my ($res_x,$res_y,$res_z,
1535 $in1_x,$in1_y,$in1_z,
1536 $in2_x,$in2_y,
1537 $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..14));
1538 my $Z1sqr = $S2;
1539 my @ONE_mont=(1,0,0,-1,-1,-1,-2,0);
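  # @ONE_mont is 1 in Montgomery form (2^256 mod P-256), i.e. the implicit
  # Z coordinate of the affine input, used when the result must be in2.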
1540
1541 &mov ("esi",&wparam(1));
1542
1543 # above map() describes stack layout with 15 temporary
1544 # 256-bit vectors on top, then we take extra words for
1545 # !in1infty, !in2infty, and OPENSSL_ia32cap_P copy.
1546 &stack_push(8*15+3);
1547 if ($sse2) {
1548 &call ("_picup_eax");
1549 &set_label("pic");
1550 &picmeup("edx","OPENSSL_ia32cap_P","eax",&label("pic"));
1551 &mov ("ebp",&DWP(0,"edx")); }
1552
1553 &lea ("edi",&DWP($in1_x,"esp"));
1554 for($i=0;$i<96;$i+=16) {
1555 &mov ("eax",&DWP($i+0,"esi")); # copy in1
1556 &mov ("ebx",&DWP($i+4,"esi"));
1557 &mov ("ecx",&DWP($i+8,"esi"));
1558 &mov ("edx",&DWP($i+12,"esi"));
1559 &mov (&DWP($i+0,"edi"),"eax");
1560 &mov (&DWP(32*15+8,"esp"),"ebp") if ($i==0);
1561 &mov ("ebp","eax") if ($i==64);
1562 &or ("ebp","eax") if ($i>64);
1563 &mov (&DWP($i+4,"edi"),"ebx");
1564 &or ("ebp","ebx") if ($i>=64);
1565 &mov (&DWP($i+8,"edi"),"ecx");
1566 &or ("ebp","ecx") if ($i>=64);
1567 &mov (&DWP($i+12,"edi"),"edx");
1568 &or ("ebp","edx") if ($i>=64);
1569 }
1570 &xor ("eax","eax");
1571 &mov ("esi",&wparam(2));
1572 &sub ("eax","ebp");
1573 &or ("ebp","eax");
1574 &sar ("ebp",31);
1575 &mov (&DWP(32*15+0,"esp"),"ebp"); # !in1infty
1576
1577 &lea ("edi",&DWP($in2_x,"esp"));
1578 for($i=0;$i<64;$i+=16) {
1579 &mov ("eax",&DWP($i+0,"esi")); # copy in2
1580 &mov ("ebx",&DWP($i+4,"esi"));
1581 &mov ("ecx",&DWP($i+8,"esi"));
1582 &mov ("edx",&DWP($i+12,"esi"));
1583 &mov (&DWP($i+0,"edi"),"eax");
1584 &mov ("ebp","eax") if ($i==0);
1585 &or ("ebp","eax") if ($i!=0);
1586 &mov (&DWP($i+4,"edi"),"ebx");
1587 &or ("ebp","ebx");
1588 &mov (&DWP($i+8,"edi"),"ecx");
1589 &or ("ebp","ecx");
1590 &mov (&DWP($i+12,"edi"),"edx");
1591 &or ("ebp","edx");
1592 }
1593 &xor ("ebx","ebx");
1594 &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy
1595 &sub ("ebx","ebp");
1596 &lea ("esi",&DWP($in1_z,"esp"));
1597 &or ("ebx","ebp");
1598 &lea ("ebp",&DWP($in1_z,"esp"));
1599 &sar ("ebx",31);
1600 &lea ("edi",&DWP($Z1sqr,"esp"));
1601 &mov (&DWP(32*15+4,"esp"),"ebx"); # !in2infty
1602
1603 &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Z1sqr, in1_z);
1604
1605 &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy
1606 &lea ("esi",&DWP($in2_x,"esp"));
1607	&mov	("ebp","edi");			# %edi is still &Z1sqr
1608 &lea ("edi",&DWP($U2,"esp"));
1609 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(U2, Z1sqr, in2_x);
1610
1611 &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy
1612 &lea ("esi",&DWP($in1_z,"esp"));
1613 &lea ("ebp",&DWP($Z1sqr,"esp"));
1614 &lea ("edi",&DWP($S2,"esp"));
1615 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S2, Z1sqr, in1_z);
1616
1617 &lea ("esi",&DWP($U2,"esp"));
1618 &lea ("ebp",&DWP($in1_x,"esp"));
1619 &lea ("edi",&DWP($H,"esp"));
1620 &call ("_ecp_nistz256_sub"); # p256_sub(H, U2, in1_x);
1621
1622 &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy
1623 &lea ("esi",&DWP($in2_y,"esp"));
1624 &lea ("ebp",&DWP($S2,"esp"));
1625 &lea ("edi",&DWP($S2,"esp"));
1626 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S2, S2, in2_y);
1627
1628 &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy
1629 &lea ("esi",&DWP($in1_z,"esp"));
1630 &lea ("ebp",&DWP($H,"esp"));
1631 &lea ("edi",&DWP($res_z,"esp"));
1632 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(res_z, H, in1_z);
1633
1634 &lea ("esi",&DWP($S2,"esp"));
1635 &lea ("ebp",&DWP($in1_y,"esp"));
1636 &lea ("edi",&DWP($R,"esp"));
1637 &call ("_ecp_nistz256_sub"); # p256_sub(R, S2, in1_y);
1638
1639 &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy
1640 &lea ("esi",&DWP($H,"esp"));
1641 &lea ("ebp",&DWP($H,"esp"));
1642 &lea ("edi",&DWP($Hsqr,"esp"));
1643 &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Hsqr, H);
1644
1645 &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy
1646 &lea ("esi",&DWP($R,"esp"));
1647 &lea ("ebp",&DWP($R,"esp"));
1648 &lea ("edi",&DWP($Rsqr,"esp"));
1649 &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Rsqr, R);
1650
1651 &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy
1652 &lea ("esi",&DWP($in1_x,"esp"));
1653 &lea ("ebp",&DWP($Hsqr,"esp"));
1654 &lea ("edi",&DWP($U2,"esp"));
1655 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(U2, in1_x, Hsqr);
1656
1657 &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy
1658 &lea ("esi",&DWP($H,"esp"));
1659 &lea ("ebp",&DWP($Hsqr,"esp"));
1660 &lea ("edi",&DWP($Hcub,"esp"));
1661 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(Hcub, Hsqr, H);
1662
1663 &lea ("esi",&DWP($U2,"esp"));
1664 &lea ("ebp",&DWP($U2,"esp"));
1665 &lea ("edi",&DWP($Hsqr,"esp"));
1666 &call ("_ecp_nistz256_add"); # p256_mul_by_2(Hsqr, U2);
1667
1668 &lea ("esi",&DWP($Rsqr,"esp"));
1669 &lea ("ebp",&DWP($Hsqr,"esp"));
1670 &lea ("edi",&DWP($res_x,"esp"));
1671 &call ("_ecp_nistz256_sub"); # p256_sub(res_x, Rsqr, Hsqr);
1672
1673 &lea ("esi",&DWP($res_x,"esp"));
1674 &lea ("ebp",&DWP($Hcub,"esp"));
1675 &lea ("edi",&DWP($res_x,"esp"));
1676 &call ("_ecp_nistz256_sub"); # p256_sub(res_x, res_x, Hcub);
1677
1678 &lea ("esi",&DWP($U2,"esp"));
1679 &lea ("ebp",&DWP($res_x,"esp"));
1680 &lea ("edi",&DWP($res_y,"esp"));
1681 &call ("_ecp_nistz256_sub"); # p256_sub(res_y, U2, res_x);
1682
1683 &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy
1684 &lea ("esi",&DWP($Hcub,"esp"));
1685 &lea ("ebp",&DWP($in1_y,"esp"));
1686 &lea ("edi",&DWP($S2,"esp"));
1687 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S2, Hcub, in1_y);
1688
1689 &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy
1690 &lea ("esi",&DWP($R,"esp"));
1691 &lea ("ebp",&DWP($res_y,"esp"));
1692 &lea ("edi",&DWP($res_y,"esp"));
1693 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(res_y, res_y, R);
1694
1695 &lea ("esi",&DWP($res_y,"esp"));
1696 &lea ("ebp",&DWP($S2,"esp"));
1697 &lea ("edi",&DWP($res_y,"esp"));
1698 &call ("_ecp_nistz256_sub"); # p256_sub(res_y, res_y, S2);
1699
1700 &mov ("ebp",&DWP(32*15+0,"esp")); # !in1infty
1701 &mov ("esi",&DWP(32*15+4,"esp")); # !in2infty
1702 &mov ("edi",&wparam(0));
1703 &mov ("edx","ebp");
1704 &not ("ebp");
1705 &and ("edx","esi");
1706 &and ("ebp","esi");
1707 &not ("esi");
1708
1709 ########################################
1710 # conditional moves
1711 for($i=64;$i<96;$i+=4) {
1712 my $one=@ONE_mont[($i-64)/4];
1713
1714 &mov ("eax","edx");
1715 &and ("eax",&DWP($res_x+$i,"esp"));
1716 &mov ("ebx","ebp") if ($one && $one!=-1);
1717 &and ("ebx",$one) if ($one && $one!=-1);
1718 &mov ("ecx","esi");
1719 &and ("ecx",&DWP($in1_x+$i,"esp"));
1720 &or ("eax",$one==-1?"ebp":"ebx") if ($one);
1721 &or ("eax","ecx");
1722 &mov (&DWP($i,"edi"),"eax");
1723 }
1724 for($i=0;$i<64;$i+=4) {
1725 &mov ("eax","edx");
1726 &and ("eax",&DWP($res_x+$i,"esp"));
1727 &mov ("ebx","ebp");
1728 &and ("ebx",&DWP($in2_x+$i,"esp"));
1729 &mov ("ecx","esi");
1730 &and ("ecx",&DWP($in1_x+$i,"esp"));
1731 &or ("eax","ebx");
1732 &or ("eax","ecx");
1733 &mov (&DWP($i,"edi"),"eax");
1734 }
1735 &stack_pop(8*15+3);
1736} &function_end("ecp_nistz256_point_add_affine");
1737
1738&asm_finish();
1739
1740close STDOUT;
diff --git a/src/lib/libcrypto/ec/asm/ecp_nistz256-x86_64.pl b/src/lib/libcrypto/ec/asm/ecp_nistz256-x86_64.pl
deleted file mode 100644
index b772aae742..0000000000
--- a/src/lib/libcrypto/ec/asm/ecp_nistz256-x86_64.pl
+++ /dev/null
@@ -1,1971 +0,0 @@
1#!/usr/bin/env perl
2# $OpenBSD: ecp_nistz256-x86_64.pl,v 1.1 2016/11/04 17:33:20 miod Exp $
3#
4# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
5#
6# Licensed under the OpenSSL license (the "License"). You may not use
7# this file except in compliance with the License. You can obtain a copy
8# in the file LICENSE in the source distribution or at
9# https://www.openssl.org/source/license.html
10
11# Copyright (c) 2014, Intel Corporation.
12#
13# Permission to use, copy, modify, and/or distribute this software for any
14# purpose with or without fee is hereby granted, provided that the above
15# copyright notice and this permission notice appear in all copies.
16#
17# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
18# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
19# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
20# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
21# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
22# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
23# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
24
25# Developers and authors:
26# Shay Gueron (1, 2), and Vlad Krasnov (1)
27# (1) Intel Corporation, Israel Development Center
28# (2) University of Haifa
29
30# Reference:
31# S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
32# 256 Bit Primes"
33
34# Further optimization by <appro@openssl.org>:
35#
36# this/original with/without -DECP_NISTZ256_ASM(*)
37# Opteron +12-49% +110-150%
38# Bulldozer +14-45% +175-210%
39# P4 +18-46% n/a :-(
40# Westmere +12-34% +80-87%
41# Sandy Bridge +9-35% +110-120%
42# Ivy Bridge +9-35% +110-125%
43# Haswell +8-37% +140-160%
44# Broadwell +18-58% +145-210%
45# Atom +15-50% +130-180%
46# VIA Nano +43-160% +300-480%
47#
48# (*) "without -DECP_NISTZ256_ASM" refers to build with
49# "enable-ec_nistp_64_gcc_128";
50#
51# Ranges denote minimum and maximum improvement coefficients depending
52# on benchmark. Lower coefficients are for ECDSA sign, relatively fastest
53# server-side operation. Keep in mind that +100% means 2x improvement.
54
55$flavour = shift;
56$output = shift;
57if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
58
59$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
60
61$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
62( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
63( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
64die "can't locate x86_64-xlate.pl";
65
66open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
67*STDOUT=*OUT;
68
69$code.=<<___;
70.text
71
72# The polynomial
73.align 64
74.Lpoly:
75.quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001
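# (.Lpoly is the NIST P-256 prime p = 2^256 - 2^224 + 2^192 + 2^96 - 1,
# stored as little-endian 64-bit limbs.)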
76
77.LOne:
78.long 1,1,1,1,1,1,1,1
79.LTwo:
80.long 2,2,2,2,2,2,2,2
81.LThree:
82.long 3,3,3,3,3,3,3,3
83.LONE_mont:
84.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
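# (.LONE_mont is 1 in Montgomery form, i.e. 2^256 mod p.)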
85___
86
87{
88################################################################################
89# void ecp_nistz256_mul_by_2(uint64_t res[4], uint64_t a[4]);
90
91my ($a0,$a1,$a2,$a3)=map("%r$_",(8..11));
92my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rdx","%rcx","%r12","%r13");
93my ($r_ptr,$a_ptr,$b_ptr)=("%rdi","%rsi","%rdx");
94
95$code.=<<___;
96
97.globl ecp_nistz256_mul_by_2
98.type ecp_nistz256_mul_by_2,\@function,2
99.align 64
100ecp_nistz256_mul_by_2:
101 push %r12
102 push %r13
103
104 mov 8*0($a_ptr), $a0
105 mov 8*1($a_ptr), $a1
106 add $a0, $a0 # a0:a3+a0:a3
107 mov 8*2($a_ptr), $a2
108 adc $a1, $a1
109 mov 8*3($a_ptr), $a3
110 lea .Lpoly(%rip), $a_ptr
111 mov $a0, $t0
112 adc $a2, $a2
113 adc $a3, $a3
114 mov $a1, $t1
115 sbb $t4, $t4
116
117 sub 8*0($a_ptr), $a0
118 mov $a2, $t2
119 sbb 8*1($a_ptr), $a1
120 sbb 8*2($a_ptr), $a2
121 mov $a3, $t3
122 sbb 8*3($a_ptr), $a3
123 test $t4, $t4
124
125 cmovz $t0, $a0
126 cmovz $t1, $a1
127 mov $a0, 8*0($r_ptr)
128 cmovz $t2, $a2
129 mov $a1, 8*1($r_ptr)
130 cmovz $t3, $a3
131 mov $a2, 8*2($r_ptr)
132 mov $a3, 8*3($r_ptr)
133
134 pop %r13
135 pop %r12
136 ret
137.size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
138
139################################################################################
140# void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]);
141.globl ecp_nistz256_neg
142.type ecp_nistz256_neg,\@function,2
143.align 32
144ecp_nistz256_neg:
145 push %r12
146 push %r13
147
148 xor $a0, $a0
149 xor $a1, $a1
150 xor $a2, $a2
151 xor $a3, $a3
152 xor $t4, $t4
153
154 sub 8*0($a_ptr), $a0
155 sbb 8*1($a_ptr), $a1
156 sbb 8*2($a_ptr), $a2
157 mov $a0, $t0
158 sbb 8*3($a_ptr), $a3
159 lea .Lpoly(%rip), $a_ptr
160 mov $a1, $t1
161 sbb \$0, $t4
162
163 add 8*0($a_ptr), $a0
164 mov $a2, $t2
165 adc 8*1($a_ptr), $a1
166 adc 8*2($a_ptr), $a2
167 mov $a3, $t3
168 adc 8*3($a_ptr), $a3
169 test $t4, $t4
170
171 cmovz $t0, $a0
172 cmovz $t1, $a1
173 mov $a0, 8*0($r_ptr)
174 cmovz $t2, $a2
175 mov $a1, 8*1($r_ptr)
176 cmovz $t3, $a3
177 mov $a2, 8*2($r_ptr)
178 mov $a3, 8*3($r_ptr)
179
180 pop %r13
181 pop %r12
182 ret
183.size ecp_nistz256_neg,.-ecp_nistz256_neg
184___
185}
186{
187my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
188my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
189my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax");
190my ($poly1,$poly3)=($acc6,$acc7);
191
192$code.=<<___;
193################################################################################
194# void ecp_nistz256_mul_mont(
195# uint64_t res[4],
196# uint64_t a[4],
197# uint64_t b[4]);
198
199.globl ecp_nistz256_mul_mont
200.type ecp_nistz256_mul_mont,\@function,3
201.align 32
202ecp_nistz256_mul_mont:
203.Lmul_mont:
204 push %rbp
205 push %rbx
206 push %r12
207 push %r13
208 push %r14
209 push %r15
210
211 mov $b_org, $b_ptr
212 mov 8*0($b_org), %rax
213 mov 8*0($a_ptr), $acc1
214 mov 8*1($a_ptr), $acc2
215 mov 8*2($a_ptr), $acc3
216 mov 8*3($a_ptr), $acc4
217
218 call __ecp_nistz256_mul_montq
219
220 pop %r15
221 pop %r14
222 pop %r13
223 pop %r12
224 pop %rbx
225 pop %rbp
226 ret
227.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
228
229.type __ecp_nistz256_mul_montq,\@abi-omnipotent
230.align 32
231__ecp_nistz256_mul_montq:
232 ########################################################################
233 # Multiply a by b[0]
234 mov %rax, $t1
235 mulq $acc1
236 mov .Lpoly+8*1(%rip),$poly1
237 mov %rax, $acc0
238 mov $t1, %rax
239 mov %rdx, $acc1
240
241 mulq $acc2
242 mov .Lpoly+8*3(%rip),$poly3
243 add %rax, $acc1
244 mov $t1, %rax
245 adc \$0, %rdx
246 mov %rdx, $acc2
247
248 mulq $acc3
249 add %rax, $acc2
250 mov $t1, %rax
251 adc \$0, %rdx
252 mov %rdx, $acc3
253
254 mulq $acc4
255 add %rax, $acc3
256 mov $acc0, %rax
257 adc \$0, %rdx
258 xor $acc5, $acc5
259 mov %rdx, $acc4
260
261 ########################################################################
262 # First reduction step
263 # Basically now we want to multiply acc[0] by p256,
264 # and add the result to the acc.
265 # Due to the special form of p256 we do some optimizations
266 #
267 # acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0]
268 # then we add acc[0] and get acc[0] x 2^96
269
270 mov $acc0, $t1
271 shl \$32, $acc0
272 mulq $poly3
273 shr \$32, $t1
274 add $acc0, $acc1 # +=acc[0]<<96
275 adc $t1, $acc2
276 adc %rax, $acc3
277 mov 8*1($b_ptr), %rax
278 adc %rdx, $acc4
279 adc \$0, $acc5
280 xor $acc0, $acc0
281
282 ########################################################################
283 # Multiply by b[1]
284 mov %rax, $t1
285 mulq 8*0($a_ptr)
286 add %rax, $acc1
287 mov $t1, %rax
288 adc \$0, %rdx
289 mov %rdx, $t0
290
291 mulq 8*1($a_ptr)
292 add $t0, $acc2
293 adc \$0, %rdx
294 add %rax, $acc2
295 mov $t1, %rax
296 adc \$0, %rdx
297 mov %rdx, $t0
298
299 mulq 8*2($a_ptr)
300 add $t0, $acc3
301 adc \$0, %rdx
302 add %rax, $acc3
303 mov $t1, %rax
304 adc \$0, %rdx
305 mov %rdx, $t0
306
307 mulq 8*3($a_ptr)
308 add $t0, $acc4
309 adc \$0, %rdx
310 add %rax, $acc4
311 mov $acc1, %rax
312 adc %rdx, $acc5
313 adc \$0, $acc0
314
315 ########################################################################
316 # Second reduction step
317 mov $acc1, $t1
318 shl \$32, $acc1
319 mulq $poly3
320 shr \$32, $t1
321 add $acc1, $acc2
322 adc $t1, $acc3
323 adc %rax, $acc4
324 mov 8*2($b_ptr), %rax
325 adc %rdx, $acc5
326 adc \$0, $acc0
327 xor $acc1, $acc1
328
329 ########################################################################
330 # Multiply by b[2]
331 mov %rax, $t1
332 mulq 8*0($a_ptr)
333 add %rax, $acc2
334 mov $t1, %rax
335 adc \$0, %rdx
336 mov %rdx, $t0
337
338 mulq 8*1($a_ptr)
339 add $t0, $acc3
340 adc \$0, %rdx
341 add %rax, $acc3
342 mov $t1, %rax
343 adc \$0, %rdx
344 mov %rdx, $t0
345
346 mulq 8*2($a_ptr)
347 add $t0, $acc4
348 adc \$0, %rdx
349 add %rax, $acc4
350 mov $t1, %rax
351 adc \$0, %rdx
352 mov %rdx, $t0
353
354 mulq 8*3($a_ptr)
355 add $t0, $acc5
356 adc \$0, %rdx
357 add %rax, $acc5
358 mov $acc2, %rax
359 adc %rdx, $acc0
360 adc \$0, $acc1
361
362 ########################################################################
363 # Third reduction step
364 mov $acc2, $t1
365 shl \$32, $acc2
366 mulq $poly3
367 shr \$32, $t1
368 add $acc2, $acc3
369 adc $t1, $acc4
370 adc %rax, $acc5
371 mov 8*3($b_ptr), %rax
372 adc %rdx, $acc0
373 adc \$0, $acc1
374 xor $acc2, $acc2
375
376 ########################################################################
377 # Multiply by b[3]
378 mov %rax, $t1
379 mulq 8*0($a_ptr)
380 add %rax, $acc3
381 mov $t1, %rax
382 adc \$0, %rdx
383 mov %rdx, $t0
384
385 mulq 8*1($a_ptr)
386 add $t0, $acc4
387 adc \$0, %rdx
388 add %rax, $acc4
389 mov $t1, %rax
390 adc \$0, %rdx
391 mov %rdx, $t0
392
393 mulq 8*2($a_ptr)
394 add $t0, $acc5
395 adc \$0, %rdx
396 add %rax, $acc5
397 mov $t1, %rax
398 adc \$0, %rdx
399 mov %rdx, $t0
400
401 mulq 8*3($a_ptr)
402 add $t0, $acc0
403 adc \$0, %rdx
404 add %rax, $acc0
405 mov $acc3, %rax
406 adc %rdx, $acc1
407 adc \$0, $acc2
408
409 ########################################################################
410 # Final reduction step
411 mov $acc3, $t1
412 shl \$32, $acc3
413 mulq $poly3
414 shr \$32, $t1
415 add $acc3, $acc4
416 adc $t1, $acc5
417 mov $acc4, $t0
418 adc %rax, $acc0
419 adc %rdx, $acc1
420 mov $acc5, $t1
421 adc \$0, $acc2
422
423 ########################################################################
424 # Branch-less conditional subtraction of P
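	# Subtract p unconditionally; $acc2 holds the carry above the four
	# limbs, so a final borrow means the value was already < p and the
	# cmovc's below restore the pre-subtraction words.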
425 sub \$-1, $acc4 # .Lpoly[0]
426 mov $acc0, $t2
427 sbb $poly1, $acc5 # .Lpoly[1]
428 sbb \$0, $acc0 # .Lpoly[2]
429 mov $acc1, $t3
430 sbb $poly3, $acc1 # .Lpoly[3]
431 sbb \$0, $acc2
432
433 cmovc $t0, $acc4
434 cmovc $t1, $acc5
435 mov $acc4, 8*0($r_ptr)
436 cmovc $t2, $acc0
437 mov $acc5, 8*1($r_ptr)
438 cmovc $t3, $acc1
439 mov $acc0, 8*2($r_ptr)
440 mov $acc1, 8*3($r_ptr)
441
442 ret
443.size __ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq
444
445################################################################################
446# void ecp_nistz256_sqr_mont(
447# uint64_t res[4],
448# uint64_t a[4]);
449
450# we optimize the square according to S.Gueron and V.Krasnov,
451# "Speeding up Big-Number Squaring"
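# i.e. compute each off-diagonal product a[i]*a[j] once, double the whole
# partial sum, then add the squares a[i]^2 on the diagonal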
452.globl ecp_nistz256_sqr_mont
453.type ecp_nistz256_sqr_mont,\@function,2
454.align 32
455ecp_nistz256_sqr_mont:
456 push %rbp
457 push %rbx
458 push %r12
459 push %r13
460 push %r14
461 push %r15
462
463 mov 8*0($a_ptr), %rax
464 mov 8*1($a_ptr), $acc6
465 mov 8*2($a_ptr), $acc7
466 mov 8*3($a_ptr), $acc0
467
468 call __ecp_nistz256_sqr_montq
469
470 pop %r15
471 pop %r14
472 pop %r13
473 pop %r12
474 pop %rbx
475 pop %rbp
476 ret
477.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
478
479.type __ecp_nistz256_sqr_montq,\@abi-omnipotent
480.align 32
481__ecp_nistz256_sqr_montq:
482 mov %rax, $acc5
483 mulq $acc6 # a[1]*a[0]
484 mov %rax, $acc1
485 mov $acc7, %rax
486 mov %rdx, $acc2
487
488 mulq $acc5 # a[0]*a[2]
489 add %rax, $acc2
490 mov $acc0, %rax
491 adc \$0, %rdx
492 mov %rdx, $acc3
493
494 mulq $acc5 # a[0]*a[3]
495 add %rax, $acc3
496 mov $acc7, %rax
497 adc \$0, %rdx
498 mov %rdx, $acc4
499
500 #################################
501 mulq $acc6 # a[1]*a[2]
502 add %rax, $acc3
503 mov $acc0, %rax
504 adc \$0, %rdx
505 mov %rdx, $t1
506
507 mulq $acc6 # a[1]*a[3]
508 add %rax, $acc4
509 mov $acc0, %rax
510 adc \$0, %rdx
511 add $t1, $acc4
512 mov %rdx, $acc5
513 adc \$0, $acc5
514
515 #################################
516 mulq $acc7 # a[2]*a[3]
517 xor $acc7, $acc7
518 add %rax, $acc5
519 mov 8*0($a_ptr), %rax
520 mov %rdx, $acc6
521 adc \$0, $acc6
522
523 add $acc1, $acc1 # acc1:6<<1
524 adc $acc2, $acc2
525 adc $acc3, $acc3
526 adc $acc4, $acc4
527 adc $acc5, $acc5
528 adc $acc6, $acc6
529 adc \$0, $acc7
530
531 mulq %rax
532 mov %rax, $acc0
533 mov 8*1($a_ptr), %rax
534 mov %rdx, $t0
535
536 mulq %rax
537 add $t0, $acc1
538 adc %rax, $acc2
539 mov 8*2($a_ptr), %rax
540 adc \$0, %rdx
541 mov %rdx, $t0
542
543 mulq %rax
544 add $t0, $acc3
545 adc %rax, $acc4
546 mov 8*3($a_ptr), %rax
547 adc \$0, %rdx
548 mov %rdx, $t0
549
550 mulq %rax
551 add $t0, $acc5
552 adc %rax, $acc6
553 mov $acc0, %rax
554 adc %rdx, $acc7
555
556 mov .Lpoly+8*1(%rip), $a_ptr
557 mov .Lpoly+8*3(%rip), $t1
558
559 ##########################################
560 # Now the reduction
561 # First iteration
562 mov $acc0, $t0
563 shl \$32, $acc0
564 mulq $t1
565 shr \$32, $t0
566 add $acc0, $acc1 # +=acc[0]<<96
567 adc $t0, $acc2
568 adc %rax, $acc3
569 mov $acc1, %rax
570 adc \$0, %rdx
571
572 ##########################################
573 # Second iteration
574 mov $acc1, $t0
575 shl \$32, $acc1
576 mov %rdx, $acc0
577 mulq $t1
578 shr \$32, $t0
579 add $acc1, $acc2
580 adc $t0, $acc3
581 adc %rax, $acc0
582 mov $acc2, %rax
583 adc \$0, %rdx
584
585 ##########################################
586 # Third iteration
587 mov $acc2, $t0
588 shl \$32, $acc2
589 mov %rdx, $acc1
590 mulq $t1
591 shr \$32, $t0
592 add $acc2, $acc3
593 adc $t0, $acc0
594 adc %rax, $acc1
595 mov $acc3, %rax
596 adc \$0, %rdx
597
598 ###########################################
599 # Last iteration
600 mov $acc3, $t0
601 shl \$32, $acc3
602 mov %rdx, $acc2
603 mulq $t1
604 shr \$32, $t0
605 add $acc3, $acc0
606 adc $t0, $acc1
607 adc %rax, $acc2
608 adc \$0, %rdx
609 xor $acc3, $acc3
610
611 ############################################
612 # Add the rest of the acc
613 add $acc0, $acc4
614 adc $acc1, $acc5
615 mov $acc4, $acc0
616 adc $acc2, $acc6
617 adc %rdx, $acc7
618 mov $acc5, $acc1
619 adc \$0, $acc3
620
621 sub \$-1, $acc4 # .Lpoly[0]
622 mov $acc6, $acc2
623 sbb $a_ptr, $acc5 # .Lpoly[1]
624 sbb \$0, $acc6 # .Lpoly[2]
625 mov $acc7, $t0
626 sbb $t1, $acc7 # .Lpoly[3]
627 sbb \$0, $acc3
628
629 cmovc $acc0, $acc4
630 cmovc $acc1, $acc5
631 mov $acc4, 8*0($r_ptr)
632 cmovc $acc2, $acc6
633 mov $acc5, 8*1($r_ptr)
634 cmovc $t0, $acc7
635 mov $acc6, 8*2($r_ptr)
636 mov $acc7, 8*3($r_ptr)
637
638 ret
639.size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq
640___
641
642}
643{
644my ($r_ptr,$in_ptr)=("%rdi","%rsi");
645my ($acc0,$acc1,$acc2,$acc3)=map("%r$_",(8..11));
646my ($t0,$t1,$t2)=("%rcx","%r12","%r13");
647
648$code.=<<___;
649################################################################################
650# void ecp_nistz256_from_mont(
651# uint64_t res[4],
652# uint64_t in[4]);
653# This one performs Montgomery multiplication by 1, so we only need the reduction
654
655.globl ecp_nistz256_from_mont
656.type ecp_nistz256_from_mont,\@function,2
657.align 32
658ecp_nistz256_from_mont:
659 push %r12
660 push %r13
661
662 mov 8*0($in_ptr), %rax
663 mov .Lpoly+8*3(%rip), $t2
664 mov 8*1($in_ptr), $acc1
665 mov 8*2($in_ptr), $acc2
666 mov 8*3($in_ptr), $acc3
667 mov %rax, $acc0
668 mov .Lpoly+8*1(%rip), $t1
669
670 #########################################
671 # First iteration
672 mov %rax, $t0
673 shl \$32, $acc0
674 mulq $t2
675 shr \$32, $t0
676 add $acc0, $acc1
677 adc $t0, $acc2
678 adc %rax, $acc3
679 mov $acc1, %rax
680 adc \$0, %rdx
681
682 #########################################
683 # Second iteration
684 mov $acc1, $t0
685 shl \$32, $acc1
686 mov %rdx, $acc0
687 mulq $t2
688 shr \$32, $t0
689 add $acc1, $acc2
690 adc $t0, $acc3
691 adc %rax, $acc0
692 mov $acc2, %rax
693 adc \$0, %rdx
694
695 ##########################################
696 # Third iteration
697 mov $acc2, $t0
698 shl \$32, $acc2
699 mov %rdx, $acc1
700 mulq $t2
701 shr \$32, $t0
702 add $acc2, $acc3
703 adc $t0, $acc0
704 adc %rax, $acc1
705 mov $acc3, %rax
706 adc \$0, %rdx
707
708 ###########################################
709 # Last iteration
710 mov $acc3, $t0
711 shl \$32, $acc3
712 mov %rdx, $acc2
713 mulq $t2
714 shr \$32, $t0
715 add $acc3, $acc0
716 adc $t0, $acc1
717 mov $acc0, $t0
718 adc %rax, $acc2
719 mov $acc1, $in_ptr
720 adc \$0, %rdx
721
722 ###########################################
723 # Branch-less conditional subtraction
724 sub \$-1, $acc0
725 mov $acc2, %rax
726 sbb $t1, $acc1
727 sbb \$0, $acc2
728 mov %rdx, $acc3
729 sbb $t2, %rdx
730 sbb $t2, $t2
731
732 cmovnz $t0, $acc0
733 cmovnz $in_ptr, $acc1
734 mov $acc0, 8*0($r_ptr)
735 cmovnz %rax, $acc2
736 mov $acc1, 8*1($r_ptr)
737 cmovz %rdx, $acc3
738 mov $acc2, 8*2($r_ptr)
739 mov $acc3, 8*3($r_ptr)
740
741 pop %r13
742 pop %r12
743 ret
744.size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
745___
746}
747{
748my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
749my ($ONE,$INDEX,$Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("%xmm$_",(0..7));
750my ($M0,$T0a,$T0b,$T0c,$T0d,$T0e,$T0f,$TMP0)=map("%xmm$_",(8..15));
751my ($M1,$T2a,$T2b,$TMP2,$M2,$T2a,$T2b,$TMP2)=map("%xmm$_",(8..15));
752
753$code.=<<___;
754################################################################################
755# void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
756.globl ecp_nistz256_select_w5
757.type ecp_nistz256_select_w5,\@abi-omnipotent
758.align 32
759ecp_nistz256_select_w5:
760___
761$code.=<<___ if ($win64);
762 lea -0x88(%rsp), %rax
763.LSEH_begin_ecp_nistz256_select_w5:
764 .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
765 .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax)
766 .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax)
767 .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8, 0(%rax)
768 .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9, 0x10(%rax)
769 .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10, 0x20(%rax)
770 .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11, 0x30(%rax)
771 .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12, 0x40(%rax)
772 .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax)
773 .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax)
774 .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax)
775___
776$code.=<<___;
777 movdqa .LOne(%rip), $ONE
778 movd $index, $INDEX
779
780 pxor $Ra, $Ra
781 pxor $Rb, $Rb
782 pxor $Rc, $Rc
783 pxor $Rd, $Rd
784 pxor $Re, $Re
785 pxor $Rf, $Rf
786
787 movdqa $ONE, $M0
788 pshufd \$0, $INDEX, $INDEX
789
790 mov \$16, %rax
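	# Scan all 16 table entries: $M0 counts 1..16 and pcmpeqd yields an
	# all-ones mask only when it matches $INDEX, so exactly one entry
	# survives the pand/por accumulation.  Every entry is touched
	# regardless of the index, keeping the access pattern constant-time.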
791.Lselect_loop_sse_w5:
792
793 movdqa $M0, $TMP0
794 paddd $ONE, $M0
795 pcmpeqd $INDEX, $TMP0
796
797 movdqa 16*0($in_t), $T0a
798 movdqa 16*1($in_t), $T0b
799 movdqa 16*2($in_t), $T0c
800 movdqa 16*3($in_t), $T0d
801 movdqa 16*4($in_t), $T0e
802 movdqa 16*5($in_t), $T0f
803 lea 16*6($in_t), $in_t
804
805 pand $TMP0, $T0a
806 pand $TMP0, $T0b
807 por $T0a, $Ra
808 pand $TMP0, $T0c
809 por $T0b, $Rb
810 pand $TMP0, $T0d
811 por $T0c, $Rc
812 pand $TMP0, $T0e
813 por $T0d, $Rd
814 pand $TMP0, $T0f
815 por $T0e, $Re
816 por $T0f, $Rf
817
818 dec %rax
819 jnz .Lselect_loop_sse_w5
820
821 movdqu $Ra, 16*0($val)
822 movdqu $Rb, 16*1($val)
823 movdqu $Rc, 16*2($val)
824 movdqu $Rd, 16*3($val)
825 movdqu $Re, 16*4($val)
826 movdqu $Rf, 16*5($val)
827___
828$code.=<<___ if ($win64);
829 movaps (%rsp), %xmm6
830 movaps 0x10(%rsp), %xmm7
831 movaps 0x20(%rsp), %xmm8
832 movaps 0x30(%rsp), %xmm9
833 movaps 0x40(%rsp), %xmm10
834 movaps 0x50(%rsp), %xmm11
835 movaps 0x60(%rsp), %xmm12
836 movaps 0x70(%rsp), %xmm13
837 movaps 0x80(%rsp), %xmm14
838 movaps 0x90(%rsp), %xmm15
839 lea 0xa8(%rsp), %rsp
840.LSEH_end_ecp_nistz256_select_w5:
841___
842$code.=<<___;
843 ret
844.size ecp_nistz256_select_w5,.-ecp_nistz256_select_w5
845
846################################################################################
847# void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
848.globl ecp_nistz256_select_w7
849.type ecp_nistz256_select_w7,\@abi-omnipotent
850.align 32
851ecp_nistz256_select_w7:
852___
853$code.=<<___ if ($win64);
854 lea -0x88(%rsp), %rax
855.LSEH_begin_ecp_nistz256_select_w7:
856 .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
857 .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax)
858 .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax)
859 .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8, 0(%rax)
860 .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9, 0x10(%rax)
861 .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10, 0x20(%rax)
862 .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11, 0x30(%rax)
863 .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12, 0x40(%rax)
864 .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax)
865 .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax)
866 .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax)
867___
868$code.=<<___;
869 movdqa .LOne(%rip), $M0
870 movd $index, $INDEX
871
872 pxor $Ra, $Ra
873 pxor $Rb, $Rb
874 pxor $Rc, $Rc
875 pxor $Rd, $Rd
876
877 movdqa $M0, $ONE
878 pshufd \$0, $INDEX, $INDEX
879 mov \$64, %rax
880
881.Lselect_loop_sse_w7:
882 movdqa $M0, $TMP0
883 paddd $ONE, $M0
884 movdqa 16*0($in_t), $T0a
885 movdqa 16*1($in_t), $T0b
886 pcmpeqd $INDEX, $TMP0
887 movdqa 16*2($in_t), $T0c
888 movdqa 16*3($in_t), $T0d
889 lea 16*4($in_t), $in_t
890
891 pand $TMP0, $T0a
892 pand $TMP0, $T0b
893 por $T0a, $Ra
894 pand $TMP0, $T0c
895 por $T0b, $Rb
896 pand $TMP0, $T0d
897 por $T0c, $Rc
898 prefetcht0 255($in_t)
899 por $T0d, $Rd
900
901 dec %rax
902 jnz .Lselect_loop_sse_w7
903
904 movdqu $Ra, 16*0($val)
905 movdqu $Rb, 16*1($val)
906 movdqu $Rc, 16*2($val)
907 movdqu $Rd, 16*3($val)
908___
909$code.=<<___ if ($win64);
910 movaps (%rsp), %xmm6
911 movaps 0x10(%rsp), %xmm7
912 movaps 0x20(%rsp), %xmm8
913 movaps 0x30(%rsp), %xmm9
914 movaps 0x40(%rsp), %xmm10
915 movaps 0x50(%rsp), %xmm11
916 movaps 0x60(%rsp), %xmm12
917 movaps 0x70(%rsp), %xmm13
918 movaps 0x80(%rsp), %xmm14
919 movaps 0x90(%rsp), %xmm15
920 lea 0xa8(%rsp), %rsp
921.LSEH_end_ecp_nistz256_select_w7:
922___
923$code.=<<___;
924 ret
925.size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
926___
927}
928{{{
929########################################################################
930# This block implements higher level point_double, point_add and
931# point_add_affine. The key to performance in this case is to allow
932# out-of-order execution logic to overlap computations from the next step
933# with tail processing from the current step. By using a tailored calling
934# sequence we minimize inter-step overhead and give the processor a better
935# shot at overlapping operations...
936#
937# You will notice that input data is copied to the stack. The trouble is
938# that there are no registers to spare for holding the original pointers,
939# and reloading them would create undesired dependencies on the effective
940# address calculation paths. In other words, it is all done to favour
941# out-of-order execution logic.
942# <appro@openssl.org>
943
944my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
945my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
946my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rbp","%rcx",$acc4,$acc4);
947my ($poly1,$poly3)=($acc6,$acc7);
948
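# load_for_mul()/load_for_sqr() return assembly text that loads the operands
# into the registers expected by __ecp_nistz256_mul_mont$x/_sqr_mont$x.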
949sub load_for_mul () {
950my ($a,$b,$src0) = @_;
951my $bias = $src0 eq "%rax" ? 0 : -128;
952
953" mov $b, $src0
954 lea $b, $b_ptr
955 mov 8*0+$a, $acc1
956 mov 8*1+$a, $acc2
957 lea $bias+$a, $a_ptr
958 mov 8*2+$a, $acc3
959 mov 8*3+$a, $acc4"
960}
961
962sub load_for_sqr () {
963my ($a,$src0) = @_;
964my $bias = $src0 eq "%rax" ? 0 : -128;
965
966" mov 8*0+$a, $src0
967 mov 8*1+$a, $acc6
968 lea $bias+$a, $a_ptr
969 mov 8*2+$a, $acc7
970 mov 8*3+$a, $acc0"
971}
972
973 {
974########################################################################
975# operate in 4-5-0-1 "name space" that matches multiplication output
976#
977my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
978
979$code.=<<___;
980.type __ecp_nistz256_add_toq,\@abi-omnipotent
981.align 32
982__ecp_nistz256_add_toq:
983 add 8*0($b_ptr), $a0
984 adc 8*1($b_ptr), $a1
985 mov $a0, $t0
986 adc 8*2($b_ptr), $a2
987 adc 8*3($b_ptr), $a3
988 mov $a1, $t1
989 sbb $t4, $t4
990
991 sub \$-1, $a0
992 mov $a2, $t2
993 sbb $poly1, $a1
994 sbb \$0, $a2
995 mov $a3, $t3
996 sbb $poly3, $a3
997 test $t4, $t4
998
999 cmovz $t0, $a0
1000 cmovz $t1, $a1
1001 mov $a0, 8*0($r_ptr)
1002 cmovz $t2, $a2
1003 mov $a1, 8*1($r_ptr)
1004 cmovz $t3, $a3
1005 mov $a2, 8*2($r_ptr)
1006 mov $a3, 8*3($r_ptr)
1007
1008 ret
1009.size __ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq
1010
1011.type __ecp_nistz256_sub_fromq,\@abi-omnipotent
1012.align 32
1013__ecp_nistz256_sub_fromq:
1014 sub 8*0($b_ptr), $a0
1015 sbb 8*1($b_ptr), $a1
1016 mov $a0, $t0
1017 sbb 8*2($b_ptr), $a2
1018 sbb 8*3($b_ptr), $a3
1019 mov $a1, $t1
1020 sbb $t4, $t4
1021
1022 add \$-1, $a0
1023 mov $a2, $t2
1024 adc $poly1, $a1
1025 adc \$0, $a2
1026 mov $a3, $t3
1027 adc $poly3, $a3
1028 test $t4, $t4
1029
1030 cmovz $t0, $a0
1031 cmovz $t1, $a1
1032 mov $a0, 8*0($r_ptr)
1033 cmovz $t2, $a2
1034 mov $a1, 8*1($r_ptr)
1035 cmovz $t3, $a3
1036 mov $a2, 8*2($r_ptr)
1037 mov $a3, 8*3($r_ptr)
1038
1039 ret
1040.size __ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq
1041
1042.type __ecp_nistz256_subq,\@abi-omnipotent
1043.align 32
1044__ecp_nistz256_subq:
1045 sub $a0, $t0
1046 sbb $a1, $t1
1047 mov $t0, $a0
1048 sbb $a2, $t2
1049 sbb $a3, $t3
1050 mov $t1, $a1
1051 sbb $t4, $t4
1052
1053 add \$-1, $t0
1054 mov $t2, $a2
1055 adc $poly1, $t1
1056 adc \$0, $t2
1057 mov $t3, $a3
1058 adc $poly3, $t3
1059 test $t4, $t4
1060
1061 cmovnz $t0, $a0
1062 cmovnz $t1, $a1
1063 cmovnz $t2, $a2
1064 cmovnz $t3, $a3
1065
1066 ret
1067.size __ecp_nistz256_subq,.-__ecp_nistz256_subq
1068
1069.type __ecp_nistz256_mul_by_2q,\@abi-omnipotent
1070.align 32
1071__ecp_nistz256_mul_by_2q:
1072 add $a0, $a0 # a0:a3+a0:a3
1073 adc $a1, $a1
1074 mov $a0, $t0
1075 adc $a2, $a2
1076 adc $a3, $a3
1077 mov $a1, $t1
1078 sbb $t4, $t4
1079
1080 sub \$-1, $a0
1081 mov $a2, $t2
1082 sbb $poly1, $a1
1083 sbb \$0, $a2
1084 mov $a3, $t3
1085 sbb $poly3, $a3
1086 test $t4, $t4
1087
1088 cmovz $t0, $a0
1089 cmovz $t1, $a1
1090 mov $a0, 8*0($r_ptr)
1091 cmovz $t2, $a2
1092 mov $a1, 8*1($r_ptr)
1093 cmovz $t3, $a3
1094 mov $a2, 8*2($r_ptr)
1095 mov $a3, 8*3($r_ptr)
1096
1097 ret
1098.size __ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q
1099___
1100 }
1101sub gen_double () {
1102 my $x = shift;
1103 my ($src0,$sfx,$bias);
1104 my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));
1105
1106 if ($x ne "x") {
1107 $src0 = "%rax";
1108 $sfx = "";
1109 $bias = 0;
1110
1111$code.=<<___;
1112.globl ecp_nistz256_point_double
1113.type ecp_nistz256_point_double,\@function,2
1114.align 32
1115ecp_nistz256_point_double:
1116___
1117 } else {
1118 $src0 = "%rdx";
1119 $sfx = "x";
1120 $bias = 128;
1121
1122$code.=<<___;
1123.type ecp_nistz256_point_doublex,\@function,2
1124.align 32
1125ecp_nistz256_point_doublex:
1126.Lpoint_doublex:
1127___
1128 }
1129$code.=<<___;
1130 push %rbp
1131 push %rbx
1132 push %r12
1133 push %r13
1134 push %r14
1135 push %r15
1136 sub \$32*5+8, %rsp
1137
1138.Lpoint_double_shortcut$x:
1139 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr.x
1140 mov $a_ptr, $b_ptr # backup copy
1141 movdqu 0x10($a_ptr), %xmm1
1142 mov 0x20+8*0($a_ptr), $acc4 # load in_y in "5-4-0-1" order
1143 mov 0x20+8*1($a_ptr), $acc5
1144 mov 0x20+8*2($a_ptr), $acc0
1145 mov 0x20+8*3($a_ptr), $acc1
1146 mov .Lpoly+8*1(%rip), $poly1
1147 mov .Lpoly+8*3(%rip), $poly3
1148 movdqa %xmm0, $in_x(%rsp)
1149 movdqa %xmm1, $in_x+0x10(%rsp)
1150 lea 0x20($r_ptr), $acc2
1151 lea 0x40($r_ptr), $acc3
1152 movq $r_ptr, %xmm0
1153 movq $acc2, %xmm1
1154 movq $acc3, %xmm2
1155
1156 lea $S(%rsp), $r_ptr
1157 call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(S, in_y);
1158
1159 mov 0x40+8*0($a_ptr), $src0
1160 mov 0x40+8*1($a_ptr), $acc6
1161 mov 0x40+8*2($a_ptr), $acc7
1162 mov 0x40+8*3($a_ptr), $acc0
1163 lea 0x40-$bias($a_ptr), $a_ptr
1164 lea $Zsqr(%rsp), $r_ptr
1165 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Zsqr, in_z);
1166
1167 `&load_for_sqr("$S(%rsp)", "$src0")`
1168 lea $S(%rsp), $r_ptr
1169 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(S, S);
1170
1171 mov 0x20($b_ptr), $src0 # $b_ptr is still valid
1172 mov 0x40+8*0($b_ptr), $acc1
1173 mov 0x40+8*1($b_ptr), $acc2
1174 mov 0x40+8*2($b_ptr), $acc3
1175 mov 0x40+8*3($b_ptr), $acc4
1176 lea 0x40-$bias($b_ptr), $a_ptr
1177 lea 0x20($b_ptr), $b_ptr
1178 movq %xmm2, $r_ptr
1179 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, in_z, in_y);
1180 call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(res_z, res_z);
1181
1182 mov $in_x+8*0(%rsp), $acc4 # "5-4-0-1" order
1183 mov $in_x+8*1(%rsp), $acc5
1184 lea $Zsqr(%rsp), $b_ptr
1185 mov $in_x+8*2(%rsp), $acc0
1186 mov $in_x+8*3(%rsp), $acc1
1187 lea $M(%rsp), $r_ptr
1188 call __ecp_nistz256_add_to$x # p256_add(M, in_x, Zsqr);
1189
1190 mov $in_x+8*0(%rsp), $acc4 # "5-4-0-1" order
1191 mov $in_x+8*1(%rsp), $acc5
1192 lea $Zsqr(%rsp), $b_ptr
1193 mov $in_x+8*2(%rsp), $acc0
1194 mov $in_x+8*3(%rsp), $acc1
1195 lea $Zsqr(%rsp), $r_ptr
1196 call __ecp_nistz256_sub_from$x # p256_sub(Zsqr, in_x, Zsqr);
1197
1198 `&load_for_sqr("$S(%rsp)", "$src0")`
1199 movq %xmm1, $r_ptr
1200 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_y, S);
1201___
1202{
1203######## ecp_nistz256_div_by_2(res_y, res_y); ##########################
1204# operate in 4-5-6-7 "name space" that matches squaring output
1205#
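# Halving mod p: add p if the value is odd (making it even, since p is odd),
# then shift the four limbs right by one bit, folding in the final carry.
#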
1206my ($poly1,$poly3)=($a_ptr,$t1);
1207my ($a0,$a1,$a2,$a3,$t3,$t4,$t1)=($acc4,$acc5,$acc6,$acc7,$acc0,$acc1,$acc2);
1208
1209$code.=<<___;
1210 xor $t4, $t4
1211 mov $a0, $t0
1212 add \$-1, $a0
1213 mov $a1, $t1
1214 adc $poly1, $a1
1215 mov $a2, $t2
1216 adc \$0, $a2
1217 mov $a3, $t3
1218 adc $poly3, $a3
1219 adc \$0, $t4
1220 xor $a_ptr, $a_ptr # borrow $a_ptr
1221 test \$1, $t0
1222
1223 cmovz $t0, $a0
1224 cmovz $t1, $a1
1225 cmovz $t2, $a2
1226 cmovz $t3, $a3
1227 cmovz $a_ptr, $t4
1228
1229 mov $a1, $t0 # a0:a3>>1
1230 shr \$1, $a0
1231 shl \$63, $t0
1232 mov $a2, $t1
1233 shr \$1, $a1
1234 or $t0, $a0
1235 shl \$63, $t1
1236 mov $a3, $t2
1237 shr \$1, $a2
1238 or $t1, $a1
1239 shl \$63, $t2
1240 mov $a0, 8*0($r_ptr)
1241 shr \$1, $a3
1242 mov $a1, 8*1($r_ptr)
1243 shl \$63, $t4
1244 or $t2, $a2
1245 or $t4, $a3
1246 mov $a2, 8*2($r_ptr)
1247 mov $a3, 8*3($r_ptr)
1248___
1249}
1250$code.=<<___;
1251 `&load_for_mul("$M(%rsp)", "$Zsqr(%rsp)", "$src0")`
1252 lea $M(%rsp), $r_ptr
1253 call __ecp_nistz256_mul_mont$x # p256_mul_mont(M, M, Zsqr);
1254
1255 lea $tmp0(%rsp), $r_ptr
1256 call __ecp_nistz256_mul_by_2$x
1257
1258 lea $M(%rsp), $b_ptr
1259 lea $M(%rsp), $r_ptr
1260 call __ecp_nistz256_add_to$x # p256_mul_by_3(M, M);
1261
1262 `&load_for_mul("$S(%rsp)", "$in_x(%rsp)", "$src0")`
1263 lea $S(%rsp), $r_ptr
1264 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, in_x);
1265
1266 lea $tmp0(%rsp), $r_ptr
1267 call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(tmp0, S);
1268
1269 `&load_for_sqr("$M(%rsp)", "$src0")`
1270 movq %xmm0, $r_ptr
1271 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_x, M);
1272
1273 lea $tmp0(%rsp), $b_ptr
1274 mov $acc6, $acc0 # harmonize sqr output and sub input
1275 mov $acc7, $acc1
1276 mov $a_ptr, $poly1
1277 mov $t1, $poly3
1278 call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, tmp0);
1279
1280 mov $S+8*0(%rsp), $t0
1281 mov $S+8*1(%rsp), $t1
1282 mov $S+8*2(%rsp), $t2
1283 mov $S+8*3(%rsp), $acc2 # "4-5-0-1" order
1284 lea $S(%rsp), $r_ptr
1285 call __ecp_nistz256_sub$x # p256_sub(S, S, res_x);
1286
1287 mov $M(%rsp), $src0
1288 lea $M(%rsp), $b_ptr
1289 mov $acc4, $acc6 # harmonize sub output and mul input
1290 xor %ecx, %ecx
1291 mov $acc4, $S+8*0(%rsp) # have to save:-(
1292 mov $acc5, $acc2
1293 mov $acc5, $S+8*1(%rsp)
1294 cmovz $acc0, $acc3
1295 mov $acc0, $S+8*2(%rsp)
1296 lea $S-$bias(%rsp), $a_ptr
1297 cmovz $acc1, $acc4
1298 mov $acc1, $S+8*3(%rsp)
1299 mov $acc6, $acc1
1300 lea $S(%rsp), $r_ptr
1301 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, M);
1302
1303 movq %xmm1, $b_ptr
1304 movq %xmm1, $r_ptr
1305 call __ecp_nistz256_sub_from$x # p256_sub(res_y, S, res_y);
1306
1307 add \$32*5+8, %rsp
1308 pop %r15
1309 pop %r14
1310 pop %r13
1311 pop %r12
1312 pop %rbx
1313 pop %rbp
1314 ret
1315.size ecp_nistz256_point_double$sfx,.-ecp_nistz256_point_double$sfx
1316___
1317}
1318&gen_double("q");
1319
1320sub gen_add () {
1321 my $x = shift;
1322 my ($src0,$sfx,$bias);
1323 my ($H,$Hsqr,$R,$Rsqr,$Hcub,
1324 $U1,$U2,$S1,$S2,
1325 $res_x,$res_y,$res_z,
1326 $in1_x,$in1_y,$in1_z,
1327 $in2_x,$in2_y,$in2_z)=map(32*$_,(0..17));
1328 my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
1329
1330 if ($x ne "x") {
1331 $src0 = "%rax";
1332 $sfx = "";
1333 $bias = 0;
1334
1335$code.=<<___;
1336.globl ecp_nistz256_point_add
1337.type ecp_nistz256_point_add,\@function,3
1338.align 32
1339ecp_nistz256_point_add:
1340___
1341 } else {
1342 $src0 = "%rdx";
1343 $sfx = "x";
1344 $bias = 128;
1345 }
1346$code.=<<___;
1347 push %rbp
1348 push %rbx
1349 push %r12
1350 push %r13
1351 push %r14
1352 push %r15
1353 sub \$32*18+8, %rsp
1354
1355 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr
1356 movdqu 0x10($a_ptr), %xmm1
1357 movdqu 0x20($a_ptr), %xmm2
1358 movdqu 0x30($a_ptr), %xmm3
1359 movdqu 0x40($a_ptr), %xmm4
1360 movdqu 0x50($a_ptr), %xmm5
1361 mov $a_ptr, $b_ptr # reassign
1362 mov $b_org, $a_ptr # reassign
1363 movdqa %xmm0, $in1_x(%rsp)
1364 movdqa %xmm1, $in1_x+0x10(%rsp)
1365 por %xmm0, %xmm1
1366 movdqa %xmm2, $in1_y(%rsp)
1367 movdqa %xmm3, $in1_y+0x10(%rsp)
1368 por %xmm2, %xmm3
1369 movdqa %xmm4, $in1_z(%rsp)
1370 movdqa %xmm5, $in1_z+0x10(%rsp)
1371 por %xmm1, %xmm3
1372
1373 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$b_ptr
1374 pshufd \$0xb1, %xmm3, %xmm5
1375 movdqu 0x10($a_ptr), %xmm1
1376 movdqu 0x20($a_ptr), %xmm2
1377 por %xmm3, %xmm5
1378 movdqu 0x30($a_ptr), %xmm3
1379 mov 0x40+8*0($a_ptr), $src0 # load original in2_z
1380 mov 0x40+8*1($a_ptr), $acc6
1381 mov 0x40+8*2($a_ptr), $acc7
1382 mov 0x40+8*3($a_ptr), $acc0
1383 movdqa %xmm0, $in2_x(%rsp)
1384 pshufd \$0x1e, %xmm5, %xmm4
1385 movdqa %xmm1, $in2_x+0x10(%rsp)
1386 por %xmm0, %xmm1
1387 movq $r_ptr, %xmm0 # save $r_ptr
1388 movdqa %xmm2, $in2_y(%rsp)
1389 movdqa %xmm3, $in2_y+0x10(%rsp)
1390 por %xmm2, %xmm3
1391 por %xmm4, %xmm5
1392 pxor %xmm4, %xmm4
1393 por %xmm1, %xmm3
1394
1395 lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid
1396 mov $src0, $in2_z+8*0(%rsp) # make in2_z copy
1397 mov $acc6, $in2_z+8*1(%rsp)
1398 mov $acc7, $in2_z+8*2(%rsp)
1399 mov $acc0, $in2_z+8*3(%rsp)
1400 lea $Z2sqr(%rsp), $r_ptr # Z2^2
1401 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z2sqr, in2_z);
1402
1403 pcmpeqd %xmm4, %xmm5
1404 pshufd \$0xb1, %xmm3, %xmm4
1405 por %xmm3, %xmm4
1406 pshufd \$0, %xmm5, %xmm5 # in1infty
1407 pshufd \$0x1e, %xmm4, %xmm3
1408 por %xmm3, %xmm4
1409 pxor %xmm3, %xmm3
1410 pcmpeqd %xmm3, %xmm4
1411 pshufd \$0, %xmm4, %xmm4 # in2infty
1412 mov 0x40+8*0($b_ptr), $src0 # load original in1_z
1413 mov 0x40+8*1($b_ptr), $acc6
1414 mov 0x40+8*2($b_ptr), $acc7
1415 mov 0x40+8*3($b_ptr), $acc0
1416 movq $b_ptr, %xmm1
1417
1418 lea 0x40-$bias($b_ptr), $a_ptr
1419 lea $Z1sqr(%rsp), $r_ptr # Z1^2
1420 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z);
1421
1422 `&load_for_mul("$Z2sqr(%rsp)", "$in2_z(%rsp)", "$src0")`
1423 lea $S1(%rsp), $r_ptr # S1 = Z2^3
1424 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, Z2sqr, in2_z);
1425
1426 `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
1427 lea $S2(%rsp), $r_ptr # S2 = Z1^3
1428 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z);
1429
1430 `&load_for_mul("$S1(%rsp)", "$in1_y(%rsp)", "$src0")`
1431 lea $S1(%rsp), $r_ptr # S1 = Y1*Z2^3
1432 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, S1, in1_y);
1433
1434 `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
1435 lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3
1436 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y);
1437
1438 lea $S1(%rsp), $b_ptr
1439 lea $R(%rsp), $r_ptr # R = S2 - S1
1440 call __ecp_nistz256_sub_from$x # p256_sub(R, S2, S1);
1441
1442 or $acc5, $acc4 # see if result is zero
1443 movdqa %xmm4, %xmm2
1444 or $acc0, $acc4
1445 or $acc1, $acc4
1446 por %xmm5, %xmm2 # in1infty || in2infty
1447 movq $acc4, %xmm3
1448
1449 `&load_for_mul("$Z2sqr(%rsp)", "$in1_x(%rsp)", "$src0")`
1450 lea $U1(%rsp), $r_ptr # U1 = X1*Z2^2
1451 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U1, in1_x, Z2sqr);
1452
1453 `&load_for_mul("$Z1sqr(%rsp)", "$in2_x(%rsp)", "$src0")`
1454 lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2
1455 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in2_x, Z1sqr);
1456
1457 lea $U1(%rsp), $b_ptr
1458 lea $H(%rsp), $r_ptr # H = U2 - U1
1459 call __ecp_nistz256_sub_from$x # p256_sub(H, U2, U1);
1460
1461 or $acc5, $acc4 # see if result is zero
1462 or $acc0, $acc4
1463 or $acc1, $acc4
1464
1465 .byte 0x3e # predict taken
1466 jnz .Ladd_proceed$x # is_equal(U1,U2)?
1467 movq %xmm2, $acc0
1468 movq %xmm3, $acc1
1469 test $acc0, $acc0
1470 jnz .Ladd_proceed$x # (in1infty || in2infty)?
1471 test $acc1, $acc1
1472 jz .Ladd_double$x # is_equal(S1,S2)?
1473
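	# H == 0 (x1 == x2) but R != 0 (y1 != y2) and neither input is infinity:
	# P + (-P), so zero the output point (Z == 0 marks the point at infinity)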
1474 movq %xmm0, $r_ptr # restore $r_ptr
1475 pxor %xmm0, %xmm0
1476 movdqu %xmm0, 0x00($r_ptr)
1477 movdqu %xmm0, 0x10($r_ptr)
1478 movdqu %xmm0, 0x20($r_ptr)
1479 movdqu %xmm0, 0x30($r_ptr)
1480 movdqu %xmm0, 0x40($r_ptr)
1481 movdqu %xmm0, 0x50($r_ptr)
1482 jmp .Ladd_done$x
1483
1484.align 32
1485.Ladd_double$x:
1486 movq %xmm1, $a_ptr # restore $a_ptr
1487 movq %xmm0, $r_ptr # restore $r_ptr
1488	add	\$`32*(18-5)`, %rsp	# difference in frame sizes: 32*18+8 here vs 32*5+8 in point_double
1489 jmp .Lpoint_double_shortcut$x
1490
1491.align 32
1492.Ladd_proceed$x:
1493 `&load_for_sqr("$R(%rsp)", "$src0")`
1494 lea $Rsqr(%rsp), $r_ptr # R^2
1495 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R);
1496
1497 `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
1498 lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2
1499 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z);
1500
1501 `&load_for_sqr("$H(%rsp)", "$src0")`
1502 lea $Hsqr(%rsp), $r_ptr # H^2
1503 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H);
1504
1505 `&load_for_mul("$res_z(%rsp)", "$in2_z(%rsp)", "$src0")`
1506 lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2
1507 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, res_z, in2_z);
1508
1509 `&load_for_mul("$Hsqr(%rsp)", "$H(%rsp)", "$src0")`
1510 lea $Hcub(%rsp), $r_ptr # H^3
1511 call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H);
1512
1513 `&load_for_mul("$Hsqr(%rsp)", "$U1(%rsp)", "$src0")`
1514 lea $U2(%rsp), $r_ptr # U1*H^2
1515 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, U1, Hsqr);
1516___
1517{
1518#######################################################################
1519# operate in 4-5-0-1 "name space" that matches multiplication output
1520#
1521my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
1522my ($poly1, $poly3)=($acc6,$acc7);
1523
1524$code.=<<___;
1525 #lea $U2(%rsp), $a_ptr
1526 #lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2
1527 #call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2);
1528
1529 add $acc0, $acc0 # a0:a3+a0:a3
1530 lea $Rsqr(%rsp), $a_ptr
1531 adc $acc1, $acc1
1532 mov $acc0, $t0
1533 adc $acc2, $acc2
1534 adc $acc3, $acc3
1535 mov $acc1, $t1
1536 sbb $t4, $t4
1537
1538 sub \$-1, $acc0
1539 mov $acc2, $t2
1540 sbb $poly1, $acc1
1541 sbb \$0, $acc2
1542 mov $acc3, $t3
1543 sbb $poly3, $acc3
1544 test $t4, $t4
1545
1546 cmovz $t0, $acc0
1547 mov 8*0($a_ptr), $t0
1548 cmovz $t1, $acc1
1549 mov 8*1($a_ptr), $t1
1550 cmovz $t2, $acc2
1551 mov 8*2($a_ptr), $t2
1552 cmovz $t3, $acc3
1553 mov 8*3($a_ptr), $t3
1554
1555 call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr);
1556
1557 lea $Hcub(%rsp), $b_ptr
1558 lea $res_x(%rsp), $r_ptr
1559 call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub);
1560
1561 mov $U2+8*0(%rsp), $t0
1562 mov $U2+8*1(%rsp), $t1
1563 mov $U2+8*2(%rsp), $t2
1564 mov $U2+8*3(%rsp), $t3
1565 lea $res_y(%rsp), $r_ptr
1566
1567 call __ecp_nistz256_sub$x # p256_sub(res_y, U2, res_x);
1568
1569	mov	$acc0, 8*0($r_ptr)	# save the result, as
1570	mov	$acc1, 8*1($r_ptr)	# __ecp_nistz256_sub doesn't write it back
1571 mov $acc2, 8*2($r_ptr)
1572 mov $acc3, 8*3($r_ptr)
1573___
1574}
1575$code.=<<___;
1576 `&load_for_mul("$S1(%rsp)", "$Hcub(%rsp)", "$src0")`
1577 lea $S2(%rsp), $r_ptr
1578 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S1, Hcub);
1579
1580 `&load_for_mul("$R(%rsp)", "$res_y(%rsp)", "$src0")`
1581 lea $res_y(%rsp), $r_ptr
1582 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_y, R, res_y);
1583
1584 lea $S2(%rsp), $b_ptr
1585 lea $res_y(%rsp), $r_ptr
1586 call __ecp_nistz256_sub_from$x # p256_sub(res_y, res_y, S2);
1587
1588 movq %xmm0, $r_ptr # restore $r_ptr
1589
1590 movdqa %xmm5, %xmm0 # copy_conditional(res_z, in2_z, in1infty);
1591 movdqa %xmm5, %xmm1
1592 pandn $res_z(%rsp), %xmm0
1593 movdqa %xmm5, %xmm2
1594 pandn $res_z+0x10(%rsp), %xmm1
1595 movdqa %xmm5, %xmm3
1596 pand $in2_z(%rsp), %xmm2
1597 pand $in2_z+0x10(%rsp), %xmm3
1598 por %xmm0, %xmm2
1599 por %xmm1, %xmm3
1600
1601 movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty);
1602 movdqa %xmm4, %xmm1
1603 pandn %xmm2, %xmm0
1604 movdqa %xmm4, %xmm2
1605 pandn %xmm3, %xmm1
1606 movdqa %xmm4, %xmm3
1607 pand $in1_z(%rsp), %xmm2
1608 pand $in1_z+0x10(%rsp), %xmm3
1609 por %xmm0, %xmm2
1610 por %xmm1, %xmm3
1611 movdqu %xmm2, 0x40($r_ptr)
1612 movdqu %xmm3, 0x50($r_ptr)
1613
1614 movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty);
1615 movdqa %xmm5, %xmm1
1616 pandn $res_x(%rsp), %xmm0
1617 movdqa %xmm5, %xmm2
1618 pandn $res_x+0x10(%rsp), %xmm1
1619 movdqa %xmm5, %xmm3
1620 pand $in2_x(%rsp), %xmm2
1621 pand $in2_x+0x10(%rsp), %xmm3
1622 por %xmm0, %xmm2
1623 por %xmm1, %xmm3
1624
1625 movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty);
1626 movdqa %xmm4, %xmm1
1627 pandn %xmm2, %xmm0
1628 movdqa %xmm4, %xmm2
1629 pandn %xmm3, %xmm1
1630 movdqa %xmm4, %xmm3
1631 pand $in1_x(%rsp), %xmm2
1632 pand $in1_x+0x10(%rsp), %xmm3
1633 por %xmm0, %xmm2
1634 por %xmm1, %xmm3
1635 movdqu %xmm2, 0x00($r_ptr)
1636 movdqu %xmm3, 0x10($r_ptr)
1637
1638 movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty);
1639 movdqa %xmm5, %xmm1
1640 pandn $res_y(%rsp), %xmm0
1641 movdqa %xmm5, %xmm2
1642 pandn $res_y+0x10(%rsp), %xmm1
1643 movdqa %xmm5, %xmm3
1644 pand $in2_y(%rsp), %xmm2
1645 pand $in2_y+0x10(%rsp), %xmm3
1646 por %xmm0, %xmm2
1647 por %xmm1, %xmm3
1648
1649 movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty);
1650 movdqa %xmm4, %xmm1
1651 pandn %xmm2, %xmm0
1652 movdqa %xmm4, %xmm2
1653 pandn %xmm3, %xmm1
1654 movdqa %xmm4, %xmm3
1655 pand $in1_y(%rsp), %xmm2
1656 pand $in1_y+0x10(%rsp), %xmm3
1657 por %xmm0, %xmm2
1658 por %xmm1, %xmm3
1659 movdqu %xmm2, 0x20($r_ptr)
1660 movdqu %xmm3, 0x30($r_ptr)
1661
1662.Ladd_done$x:
1663 add \$32*18+8, %rsp
1664 pop %r15
1665 pop %r14
1666 pop %r13
1667 pop %r12
1668 pop %rbx
1669 pop %rbp
1670 ret
1671.size ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx
1672___
1673}
1674&gen_add("q");
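# For reference, the sequence of p256_* calls in ecp_nistz256_point_add above is the standard
# Jacobian addition: Z1sqr/Z2sqr, U1 = X1*Z2^2, U2 = X2*Z1^2, S1 = Y1*Z2^3, S2 = Y2*Z1^3,
# H = U2 - U1, R = S2 - S1, then res_x/res_y/res_z from H and R. A minimal plain-Python sketch
# of the same sequence, under the same caveats as the doubling sketch (ordinary integers mod p,
# no Montgomery form, no constant-time selects); it reuses P256 and point_double from that sketch,
# and the branch structure mirrors .Ladd_proceed/.Ladd_double/the zeroed-output path above.

def point_add(P1, P2, p=P256):
    """Jacobian addition following the variable names in the comments above."""
    X1, Y1, Z1 = P1
    X2, Y2, Z2 = P2
    if Z1 == 0:                    # in1infty: result is the second point
        return P2
    if Z2 == 0:                    # in2infty: result is the first point
        return P1
    Z1sqr = Z1 * Z1 % p            # Z1sqr = Z1^2
    Z2sqr = Z2 * Z2 % p            # Z2sqr = Z2^2
    U1 = X1 * Z2sqr % p            # U1 = X1*Z2^2
    U2 = X2 * Z1sqr % p            # U2 = X2*Z1^2
    S1 = Y1 * Z2 * Z2sqr % p       # S1 = Y1*Z2^3
    S2 = Y2 * Z1 * Z1sqr % p       # S2 = Y2*Z1^3
    H = (U2 - U1) % p              # H = U2 - U1
    R = (S2 - S1) % p              # R = S2 - S1
    if H == 0:
        if R == 0:                 # same point: the .Ladd_double shortcut
            return point_double(X1, Y1, Z1, p)
        return 0, 1, 0             # P + (-P): the asm zeroes the output
    Hsqr = H * H % p
    Hcub = Hsqr * H % p
    res_x = (R * R - Hcub - 2 * U1 * Hsqr) % p
    res_y = (R * (U1 * Hsqr - res_x) - S1 * Hcub) % p
    res_z = H * Z1 * Z2 % p        # Z3 = H*Z1*Z2
    return res_x, res_y, res_z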
1675
1676sub gen_add_affine () {
1677 my $x = shift;
1678 my ($src0,$sfx,$bias);
1679 my ($U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr,
1680 $res_x,$res_y,$res_z,
1681 $in1_x,$in1_y,$in1_z,
1682 $in2_x,$in2_y)=map(32*$_,(0..14));
1683 my $Z1sqr = $S2;
1684
1685 if ($x ne "x") {
1686 $src0 = "%rax";
1687 $sfx = "";
1688 $bias = 0;
1689
1690$code.=<<___;
1691.globl ecp_nistz256_point_add_affine
1692.type ecp_nistz256_point_add_affine,\@function,3
1693.align 32
1694ecp_nistz256_point_add_affine:
1695___
1696 } else {
1697 $src0 = "%rdx";
1698 $sfx = "x";
1699 $bias = 128;
1700 }
1701$code.=<<___;
1702 push %rbp
1703 push %rbx
1704 push %r12
1705 push %r13
1706 push %r14
1707 push %r15
1708 sub \$32*15+8, %rsp
1709
1710 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr
1711 mov $b_org, $b_ptr # reassign
1712 movdqu 0x10($a_ptr), %xmm1
1713 movdqu 0x20($a_ptr), %xmm2
1714 movdqu 0x30($a_ptr), %xmm3
1715 movdqu 0x40($a_ptr), %xmm4
1716 movdqu 0x50($a_ptr), %xmm5
1717 mov 0x40+8*0($a_ptr), $src0 # load original in1_z
1718 mov 0x40+8*1($a_ptr), $acc6
1719 mov 0x40+8*2($a_ptr), $acc7
1720 mov 0x40+8*3($a_ptr), $acc0
1721 movdqa %xmm0, $in1_x(%rsp)
1722 movdqa %xmm1, $in1_x+0x10(%rsp)
1723 por %xmm0, %xmm1
1724 movdqa %xmm2, $in1_y(%rsp)
1725 movdqa %xmm3, $in1_y+0x10(%rsp)
1726 por %xmm2, %xmm3
1727 movdqa %xmm4, $in1_z(%rsp)
1728 movdqa %xmm5, $in1_z+0x10(%rsp)
1729 por %xmm1, %xmm3
1730
1731 movdqu 0x00($b_ptr), %xmm0 # copy *(P256_POINT_AFFINE *)$b_ptr
1732 pshufd \$0xb1, %xmm3, %xmm5
1733 movdqu 0x10($b_ptr), %xmm1
1734 movdqu 0x20($b_ptr), %xmm2
1735 por %xmm3, %xmm5
1736 movdqu 0x30($b_ptr), %xmm3
1737 movdqa %xmm0, $in2_x(%rsp)
1738 pshufd \$0x1e, %xmm5, %xmm4
1739 movdqa %xmm1, $in2_x+0x10(%rsp)
1740 por %xmm0, %xmm1
1741 movq $r_ptr, %xmm0 # save $r_ptr
1742 movdqa %xmm2, $in2_y(%rsp)
1743 movdqa %xmm3, $in2_y+0x10(%rsp)
1744 por %xmm2, %xmm3
1745 por %xmm4, %xmm5
1746 pxor %xmm4, %xmm4
1747 por %xmm1, %xmm3
1748
1749 lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid
1750 lea $Z1sqr(%rsp), $r_ptr # Z1^2
1751 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z);
1752
1753 pcmpeqd %xmm4, %xmm5
1754 pshufd \$0xb1, %xmm3, %xmm4
1755 mov 0x00($b_ptr), $src0 # $b_ptr is still valid
1756 #lea 0x00($b_ptr), $b_ptr
1757 mov $acc4, $acc1 # harmonize sqr output and mul input
1758 por %xmm3, %xmm4
1759 pshufd \$0, %xmm5, %xmm5 # in1infty
1760 pshufd \$0x1e, %xmm4, %xmm3
1761 mov $acc5, $acc2
1762 por %xmm3, %xmm4
1763 pxor %xmm3, %xmm3
1764 mov $acc6, $acc3
1765 pcmpeqd %xmm3, %xmm4
1766 pshufd \$0, %xmm4, %xmm4 # in2infty
1767
1768 lea $Z1sqr-$bias(%rsp), $a_ptr
1769 mov $acc7, $acc4
1770 lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2
1771 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, Z1sqr, in2_x);
1772
1773 lea $in1_x(%rsp), $b_ptr
1774 lea $H(%rsp), $r_ptr # H = U2 - U1
1775 call __ecp_nistz256_sub_from$x # p256_sub(H, U2, in1_x);
1776
1777 `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
1778 lea $S2(%rsp), $r_ptr # S2 = Z1^3
1779 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z);
1780
1781 `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
1782 lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2
1783 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z);
1784
1785 `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
1786 lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3
1787 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y);
1788
1789 lea $in1_y(%rsp), $b_ptr
1790 lea $R(%rsp), $r_ptr # R = S2 - S1
1791 call __ecp_nistz256_sub_from$x # p256_sub(R, S2, in1_y);
1792
1793 `&load_for_sqr("$H(%rsp)", "$src0")`
1794 lea $Hsqr(%rsp), $r_ptr # H^2
1795 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H);
1796
1797 `&load_for_sqr("$R(%rsp)", "$src0")`
1798 lea $Rsqr(%rsp), $r_ptr # R^2
1799 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R);
1800
1801 `&load_for_mul("$H(%rsp)", "$Hsqr(%rsp)", "$src0")`
1802 lea $Hcub(%rsp), $r_ptr # H^3
1803 call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H);
1804
1805 `&load_for_mul("$Hsqr(%rsp)", "$in1_x(%rsp)", "$src0")`
1806 lea $U2(%rsp), $r_ptr # U1*H^2
1807 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in1_x, Hsqr);
1808___
1809{
1810#######################################################################
1811# operate in 4-5-0-1 "name space" that matches multiplication output
1812#
1813my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
1814my ($poly1, $poly3)=($acc6,$acc7);
1815
1816$code.=<<___;
1817 #lea $U2(%rsp), $a_ptr
1818 #lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2
1819 #call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2);
1820
1821 add $acc0, $acc0 # a0:a3+a0:a3
1822 lea $Rsqr(%rsp), $a_ptr
1823 adc $acc1, $acc1
1824 mov $acc0, $t0
1825 adc $acc2, $acc2
1826 adc $acc3, $acc3
1827 mov $acc1, $t1
1828 sbb $t4, $t4
1829
1830 sub \$-1, $acc0
1831 mov $acc2, $t2
1832 sbb $poly1, $acc1
1833 sbb \$0, $acc2
1834 mov $acc3, $t3
1835 sbb $poly3, $acc3
1836 test $t4, $t4
1837
1838 cmovz $t0, $acc0
1839 mov 8*0($a_ptr), $t0
1840 cmovz $t1, $acc1
1841 mov 8*1($a_ptr), $t1
1842 cmovz $t2, $acc2
1843 mov 8*2($a_ptr), $t2
1844 cmovz $t3, $acc3
1845 mov 8*3($a_ptr), $t3
1846
1847 call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr);
1848
1849 lea $Hcub(%rsp), $b_ptr
1850 lea $res_x(%rsp), $r_ptr
1851 call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub);
1852
1853 mov $U2+8*0(%rsp), $t0
1854 mov $U2+8*1(%rsp), $t1
1855 mov $U2+8*2(%rsp), $t2
1856 mov $U2+8*3(%rsp), $t3
1857 lea $H(%rsp), $r_ptr
1858
1859 call __ecp_nistz256_sub$x # p256_sub(H, U2, res_x);
1860
1861	mov	$acc0, 8*0($r_ptr)	# save the result, as
1862	mov	$acc1, 8*1($r_ptr)	# __ecp_nistz256_sub doesn't write it back
1863 mov $acc2, 8*2($r_ptr)
1864 mov $acc3, 8*3($r_ptr)
1865___
1866}
1867$code.=<<___;
1868 `&load_for_mul("$Hcub(%rsp)", "$in1_y(%rsp)", "$src0")`
1869 lea $S2(%rsp), $r_ptr
1870 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Hcub, in1_y);
1871
1872 `&load_for_mul("$H(%rsp)", "$R(%rsp)", "$src0")`
1873 lea $H(%rsp), $r_ptr
1874 call __ecp_nistz256_mul_mont$x # p256_mul_mont(H, H, R);
1875
1876 lea $S2(%rsp), $b_ptr
1877 lea $res_y(%rsp), $r_ptr
1878 call __ecp_nistz256_sub_from$x # p256_sub(res_y, H, S2);
1879
1880 movq %xmm0, $r_ptr # restore $r_ptr
1881
1882 movdqa %xmm5, %xmm0 # copy_conditional(res_z, ONE, in1infty);
1883 movdqa %xmm5, %xmm1
1884 pandn $res_z(%rsp), %xmm0
1885 movdqa %xmm5, %xmm2
1886 pandn $res_z+0x10(%rsp), %xmm1
1887 movdqa %xmm5, %xmm3
1888 pand .LONE_mont(%rip), %xmm2
1889 pand .LONE_mont+0x10(%rip), %xmm3
1890 por %xmm0, %xmm2
1891 por %xmm1, %xmm3
1892
1893 movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty);
1894 movdqa %xmm4, %xmm1
1895 pandn %xmm2, %xmm0
1896 movdqa %xmm4, %xmm2
1897 pandn %xmm3, %xmm1
1898 movdqa %xmm4, %xmm3
1899 pand $in1_z(%rsp), %xmm2
1900 pand $in1_z+0x10(%rsp), %xmm3
1901 por %xmm0, %xmm2
1902 por %xmm1, %xmm3
1903 movdqu %xmm2, 0x40($r_ptr)
1904 movdqu %xmm3, 0x50($r_ptr)
1905
1906 movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty);
1907 movdqa %xmm5, %xmm1
1908 pandn $res_x(%rsp), %xmm0
1909 movdqa %xmm5, %xmm2
1910 pandn $res_x+0x10(%rsp), %xmm1
1911 movdqa %xmm5, %xmm3
1912 pand $in2_x(%rsp), %xmm2
1913 pand $in2_x+0x10(%rsp), %xmm3
1914 por %xmm0, %xmm2
1915 por %xmm1, %xmm3
1916
1917 movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty);
1918 movdqa %xmm4, %xmm1
1919 pandn %xmm2, %xmm0
1920 movdqa %xmm4, %xmm2
1921 pandn %xmm3, %xmm1
1922 movdqa %xmm4, %xmm3
1923 pand $in1_x(%rsp), %xmm2
1924 pand $in1_x+0x10(%rsp), %xmm3
1925 por %xmm0, %xmm2
1926 por %xmm1, %xmm3
1927 movdqu %xmm2, 0x00($r_ptr)
1928 movdqu %xmm3, 0x10($r_ptr)
1929
1930 movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty);
1931 movdqa %xmm5, %xmm1
1932 pandn $res_y(%rsp), %xmm0
1933 movdqa %xmm5, %xmm2
1934 pandn $res_y+0x10(%rsp), %xmm1
1935 movdqa %xmm5, %xmm3
1936 pand $in2_y(%rsp), %xmm2
1937 pand $in2_y+0x10(%rsp), %xmm3
1938 por %xmm0, %xmm2
1939 por %xmm1, %xmm3
1940
1941 movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty);
1942 movdqa %xmm4, %xmm1
1943 pandn %xmm2, %xmm0
1944 movdqa %xmm4, %xmm2
1945 pandn %xmm3, %xmm1
1946 movdqa %xmm4, %xmm3
1947 pand $in1_y(%rsp), %xmm2
1948 pand $in1_y+0x10(%rsp), %xmm3
1949 por %xmm0, %xmm2
1950 por %xmm1, %xmm3
1951 movdqu %xmm2, 0x20($r_ptr)
1952 movdqu %xmm3, 0x30($r_ptr)
1953
1954 add \$32*15+8, %rsp
1955 pop %r15
1956 pop %r14
1957 pop %r13
1958 pop %r12
1959 pop %rbx
1960 pop %rbp
1961 ret
1962.size ecp_nistz256_point_add_affine$sfx,.-ecp_nistz256_point_add_affine$sfx
1963___
1964}
1965&gen_add_affine("q");
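# ecp_nistz256_point_add_affine above is the mixed-coordinate variant: the second point is affine,
# so Z2 = 1, U1 = X1, S1 = Y1 and Z3 = H*Z1, and the Z of the result falls back to .LONE_mont when
# the first input is infinity. A plain-Python sketch under the same caveats as the earlier ones;
# like the assembly routine, it has no branch for the H == 0, R == 0 (doubling) case, which the
# callers are assumed to avoid. P256 comes from the first sketch.

def point_add_affine(P1, Q, p=P256):
    """Mixed addition: Q = (x2, y2) is affine, so Z2 = 1, U1 = X1, S1 = Y1."""
    X1, Y1, Z1 = P1
    x2, y2 = Q
    if x2 == 0 and y2 == 0:        # in2infty encoding (all-zero x|y): keep the first point
        return P1
    if Z1 == 0:                    # in1infty: result is (x2, y2, 1)
        return x2, y2, 1
    Z1sqr = Z1 * Z1 % p            # Z1sqr = Z1^2
    U2 = x2 * Z1sqr % p            # U2 = X2*Z1^2
    S2 = y2 * Z1 * Z1sqr % p       # S2 = Y2*Z1^3
    H = (U2 - X1) % p              # H = U2 - U1,  U1 = X1
    R = (S2 - Y1) % p              # R = S2 - S1,  S1 = Y1
    Hsqr = H * H % p
    Hcub = Hsqr * H % p
    res_x = (R * R - Hcub - 2 * X1 * Hsqr) % p
    res_y = (R * (X1 * Hsqr - res_x) - Y1 * Hcub) % p
    res_z = H * Z1 % p             # Z3 = H*Z1
    return res_x, res_y, res_z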
1966
1967}}}
1968
1969$code =~ s/\`([^\`]*)\`/eval $1/gem;
1970print $code;
1971close STDOUT;