summaryrefslogtreecommitdiff
path: root/src/lib
diff options
context:
space:
mode:
authorjsing <>2023-01-21 16:29:52 +0000
committerjsing <>2023-01-21 16:29:52 +0000
commit5b5ea3c7a3c1f266185429ffdf6d0fc5e57f9200 (patch)
treeceb8109da126da6fd39ebed0aab54ccd862a304f /src/lib
parent07faf72834674263f38b4e1ea4b614430398918f (diff)
downloadopenbsd-5b5ea3c7a3c1f266185429ffdf6d0fc5e57f9200.tar.gz
openbsd-5b5ea3c7a3c1f266185429ffdf6d0fc5e57f9200.tar.bz2
openbsd-5b5ea3c7a3c1f266185429ffdf6d0fc5e57f9200.zip
Bring in s2n-bignum's bignum_sqr() for amd64.
ok tb@
Diffstat (limited to 'src/lib')
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_sqr.S185
1 files changed, 185 insertions, 0 deletions
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr.S
new file mode 100644
index 0000000000..97cbebea7d
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr.S
@@ -0,0 +1,185 @@
1// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// SPDX-License-Identifier: Apache-2.0 OR ISC
3
4// ----------------------------------------------------------------------------
5// Square z := x^2
6// Input x[n]; output z[k]
7//
8// extern void bignum_sqr
9// (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x);
10//
11// Does the "z := x^2" operation where x is n digits and the result z is k digits.
12// The result is truncated to its low k digits unless k >= 2 * n.
13//
14// Standard x86-64 ABI: RDI = k, RSI = z, RDX = n, RCX = x
15// Microsoft x64 ABI: RCX = k, RDX = z, R8 = n, R9 = x
16// ----------------------------------------------------------------------------
17
18#include "_internal_s2n_bignum.h"
19
20 .intel_syntax noprefix
21 S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr)
22 S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr)
23 .text
24
25// Argument registers. p (the output size, called "k" in the header comment),
// z and x stay in the registers they arrive in under the standard ABI
// (rdi, rsi, rcx). n arrives in rdx but is copied to r8 on entry, because
// rdx is needed for multiplication results.
26
27#define p rdi
28#define z rsi
29#define x rcx
30#define n r8
31
32// rax and rdx are scratch throughout: every mul leaves its 128-bit product
// in rdx:rax (high:low), so nothing long-lived can be kept in them.
33
34#define a rax
35#define d rdx
36
37// Other variables
//   i        index into x for the current term x[i] * x[k - i]
//   ll, hh   two-word carry handed from one output digit to the next
//   k        index of the output digit being computed (0 <= k < p)
//   y        moving pointer to the cofactor x[k - i]
//   htop     exclusive upper bound on i for the doubled part of the sum
//   c, h, l  three-word local sum for the current digit (c most significant)
// rbx, rbp and r12-r15 are callee-saved; the function pushes them on entry
// and pops them before returning.
38
39#define i rbx
40#define ll rbp
41#define hh r9
42#define k r10
43#define y r11
44#define htop r12
45#define l r13
46#define h r14
47#define c r15
48
49// 32-bit alias of ll, used only to zero it (a 32-bit register write clears
// the upper half of the 64-bit register as well).
50
51#define llshort ebp
52
// bignum_sqr(k, z, n, x): store the low p digits of x^2 into z[0..p-1],
// where x has n 64-bit digits. Here p is the header's "k"; inside the body
// k is the index of the output digit being produced. Each outer iteration
// computes one digit as the column sum of the products x[i] * x[k - i]
// plus the two-word carry (hh,ll) left over from the previous column.
53S2N_BN_SYMBOL(bignum_sqr):
54
// Microsoft x64 entry: rdi and rsi are callee-saved under that ABI, so save
// them, then move the four arguments into the standard-ABI registers so the
// rest of the function is shared between both ABIs.
55#if WINDOWS_ABI
56 push rdi
57 push rsi
58 mov rdi, rcx
59 mov rsi, rdx
60 mov rdx, r8
61 mov rcx, r9
62#endif
63
64// Save the callee-saved registers used as i, ll, htop, l, h and c.
// n is moved out of rdx because rax:rdx receive every multiplication result.
65
66 push rbx
67 push rbp
68 push r12
69 push r13
70 push r14
71 push r15
72 mov n, rdx
73
74// If p = 0 there are no output digits to write. The pushes above have
// already happened, so the shared epilogue at "end" still balances the stack.
75
76 test p, p
77 jz end
78
79// Initialize the inter-digit carry (hh,ll) = 0
80
81 xor llshort, llshort
82 xor hh, hh
83
84// Iterate outer loop from k = 0 ... k = p - 1 producing result digits
85
86 xor k, k
87
88outerloop:
89
90// First let bot = MAX 0 (k + 1 - n) and top = MIN (k + 1) n
91// We want to accumulate all x[i] * x[k - i] for bot <= i < top
92// For the optimization of squaring we avoid duplication and do
93// 2 * x[i] * x[k - i] for i < htop, where htop = MIN ((k+1)/2) n
94// Initialize i = bot; in fact just compute bot as i directly.
// c is zeroed first so it can double as the zero source for cmovc: when
// k + 1 - n borrows (k + 1 < n), i is replaced by 0. Likewise cmovnc
// clamps htop to n when (k + 1) / 2 >= n.
95
96 xor c, c
97 lea i, [k+1]
98 mov htop, i
99 shr htop, 1
100 sub i, n
101 cmovc i, c
102 cmp htop, n
103 cmovnc htop, n
104
105// Initialize the three-part local sum (c,h,l); c was already done above
106
107 xor l, l
108 xor h, h
109
110// If htop <= bot then main doubled part of the sum is empty
111
112 cmp i, htop
113 jnc nosumming
114
115// Use a moving pointer for [y] = x[k-i] for the cofactor
// (a is only a temporary holding k - i here.)
116
117 mov a, k
118 sub a, i
119 lea y, [x+8*a]
120
121// Do the main part of the sum x[i] * x[k - i] for 2 * i < k
// Each 128-bit product (d:a) is added into (c,h,l). y steps down one digit
// as i steps up, so [y] stays equal to x[k - i]. The loop runs while
// i < htop (unsigned).
122
123innerloop:
124 mov a, [x+8*i]
125 mul QWORD PTR [y]
126 add l, a
127 adc h, d
128 adc c, 0
129 sub y, 8
130 inc i
131 cmp i, htop
132 jc innerloop
133
134// Now double it: (c,h,l) := 2 * (c,h,l), carrying from word to word
135
136 add l, l
137 adc h, h
138 adc c, c
139
140// If k is even (which means 2 * i = k) and i < n add the extra x[i]^2 term
// Odd k has no middle term, and i >= n means x[i] lies beyond the input,
// so both cases skip straight to the accumulation step.
141
142nosumming:
143 test k, 1
144 jnz innerend
145 cmp i, n
146 jnc innerend
147
148 mov a, [x+8*i]
149 mul a
150 add l, a
151 adc h, d
152 adc c, 0
153
154// Now add the local sum into the global sum, store and shift
// l + ll is the finished digit z[k]; h + hh + carry becomes the new ll and
// c + carry becomes the new hh. The interleaved mov instructions leave the
// flags alone, so the carry chain add -> adc -> adc is preserved.
155
156innerend:
157 add l, ll
158 mov [z+8*k], l
159 adc h, hh
160 mov ll, h
161 adc c, 0
162 mov hh, c
163
// Advance to the next digit; keep looping while k < p (unsigned compare,
// tested through the carry flag).
164 inc k
165 cmp k, p
166 jc outerloop
167
168// Restore registers (in the reverse order of the pushes) and return
169
170end:
171 pop r15
172 pop r14
173 pop r13
174 pop r12
175 pop rbp
176 pop rbx
177#if WINDOWS_ABI
178 pop rsi
179 pop rdi
180#endif
181 ret
182
// On Linux ELF targets, emit an empty .note.GNU-stack section. This is the
// conventional marker telling the linker that this object does not need an
// executable stack.
183#if defined(__linux__) && defined(__ELF__)
184.section .note.GNU-stack,"",%progbits
185#endif