summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
author jsing <> 2025-08-12 10:14:24 +0000
committer jsing <> 2025-08-12 10:14:24 +0000
commit 9a1a39a85b5242df4ffd7c3a20eb012acdc716d7 (patch)
tree 4f0c05ec3aa4f55ab6c1a37055a9219d952f3982 /src
parent e5f4dc854c2c390526097888bf3654db21bdbb15 (diff)
downloadopenbsd-9a1a39a85b5242df4ffd7c3a20eb012acdc716d7.tar.gz
openbsd-9a1a39a85b5242df4ffd7c3a20eb012acdc716d7.tar.bz2
openbsd-9a1a39a85b5242df4ffd7c3a20eb012acdc716d7.zip
Bring in bignum_{mul,sqr}_{4_8,8_16}() from s2n-bignum.
These provide fast multiplication and squaring of inputs with 4 words or 8 words, producing an 8 or 16 word result. These versions require the CPU to support ADX instructions, while the _alt versions that have previously been imported do not.
Diffstat (limited to 'src')
-rw-r--r-- src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8.S 174
-rw-r--r-- src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16.S 260
-rw-r--r-- src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8.S 145
-rw-r--r-- src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16.S 298
4 files changed, 877 insertions, 0 deletions
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8.S
new file mode 100644
index 0000000000..5e9aebd685
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8.S
@@ -0,0 +1,174 @@
1// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0
3
4// ----------------------------------------------------------------------------
5// Multiply z := x * y
6// Inputs x[4], y[4]; output z[8]
7//
8// extern void bignum_mul_4_8(uint64_t z[static 8], const uint64_t x[static 4],
9// const uint64_t y[static 4]);
10//
11// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
12// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y
13// ----------------------------------------------------------------------------
14
15#include "_internal_s2n_bignum.h"
16
17 .intel_syntax noprefix
18 S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_4_8)
19 S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_4_8)
20 .text
21
22// Under the standard x86-64 ABI z arrives in rdi and x in rsi, so these
// two aliases simply name the incoming argument registers
23
24#define z rdi
25#define x rsi
26
27// y arrives in rdx but is copied into rcx by the function body, since
// rdx is the implicit multiplicand of every mulx
28
29#define y rcx
30
31// A zero register (rbp, cleared with "xor zeroe, zeroe" before use)
32
33#define zero rbp
34#define zeroe ebp
35
36// Add in x[i] * rdx to the (i,i+1) position with the register window
37// Would be nice to have conditional expressions reg[i], reg[i+1] ...
//
// mulpadd arg1, arg2: multiply rdx by the word at [x+8*arg2], giving the
// low half in rax and the high half in rbx. The low half is added on the
// CF chain (adcx) into the window register selected by (arg1+arg2) mod 4,
// taken in the order r8, r9, r10, r11; the high half is added on the OF
// chain (adox) into the following register, wrapping from r11 back to r8.
// As invoked from addrow, arg1 is the row number (index into y) and arg2
// is the index into x. Clobbers rax and rbx.
38
39.macro mulpadd arg1,arg2
40 mulx rbx, rax, [x+8*\arg2]
41.if ((\arg1 + \arg2) % 4 == 0)
42 adcx r8, rax
43 adox r9, rbx
44.elseif ((\arg1 + \arg2) % 4 == 1)
45 adcx r9, rax
46 adox r10, rbx
47.elseif ((\arg1 + \arg2) % 4 == 2)
48 adcx r10, rax
49 adox r11, rbx
50.elseif ((\arg1 + \arg2) % 4 == 3)
51 adcx r11, rax
52 adox r8, rbx
53.endif
54
55.endm
56
57
58// Add in the whole j'th row
//
// addrow arg1: accumulate y[arg1] * (x[0],...,x[3]) into the rotating
// four-register window r8..r11, store the finished word z[arg1], and
// leave the window holding the next four columns. Clobbers rax, rbx, rdx,
// the zero register and the flags.
59
60.macro addrow arg1
// rdx = y[arg1] is the implicit multiplicand of the mulx instructions;
// the xor zeroes rbp and clears both CF and OF, so the adcx and adox
// chains start without a carry-in
61 mov rdx, [y+8*\arg1]
62 xor zeroe, zeroe
63
64 mulpadd \arg1, 0
65
// The register for column arg1 gets no further additions in this row,
// so write it back to z[arg1] now
66.if (\arg1 % 4 == 0)
67 mov [z+8*\arg1],r8
68.elseif (\arg1 % 4 == 1)
69 mov [z+8*\arg1],r9
70.elseif (\arg1 % 4 == 2)
71 mov [z+8*\arg1],r10
72.elseif (\arg1 % 4 == 3)
73 mov [z+8*\arg1],r11
74.endif
75
76 mulpadd \arg1, 1
77 mulpadd \arg1, 2
78
// Final product with x[3]: the high half goes straight into the register
// that was just written back, making it the new top word of the window.
// The low half joins the CF chain, then the outstanding OF carry (adox)
// and CF carry (adcx) are both folded into that new top word.
79.if (\arg1 % 4 == 0)
80 mulx r8, rax, [x+24]
81 adcx r11, rax
82 adox r8, zero
83 adcx r8, zero
84.elseif (\arg1 % 4 == 1)
85 mulx r9, rax, [x+24]
86 adcx r8, rax
87 adox r9, zero
88 adcx r9, zero
89.elseif (\arg1 % 4 == 2)
90 mulx r10, rax, [x+24]
91 adcx r9, rax
92 adox r10, zero
93 adcx r10, zero
94.elseif (\arg1 % 4 == 3)
95 mulx r11, rax, [x+24]
96 adcx r10, rax
97 adox r11, zero
98 adcx r11, zero
99.endif
100
101.endm
102
103
104
// Entry point. Register usage inside the body:
//   rdi = z (output), rsi = x, rcx = y (copied from rdx)
//   rdx = current word of y, the implicit multiplicand of mulx
//   r8..r11 = rotating four-word accumulation window
//   rax, rbx = scratch for product halves, rbp = zero register
// rbp and rbx are saved and restored around the computation.
105S2N_BN_SYMBOL(bignum_mul_4_8):
106 _CET_ENDBR
107
// For the Microsoft x64 ABI, preserve rdi and rsi and move the three
// arguments into the registers the code below expects them in
108#if WINDOWS_ABI
109 push rdi
110 push rsi
111 mov rdi, rcx
112 mov rsi, rdx
113 mov rdx, r8
114#endif
115
116// Save more registers to play with
117
118 push rbp
119 push rbx
120
121// Copy y into a safe register to start with
122
123 mov y, rdx
124
125// Zero a register, which also makes sure we don't get a fake carry-in
126
127 xor zeroe, zeroe
128
129// Do the zeroth row, which is a bit different
130// Write back the zero-zero product and then accumulate
131// r8,r11,r10,r9 as y[0] * x from 1..4
//
// Only a single carry chain (CF, via adcx) is needed here: each mulx
// writes its high half into a fresh register, and the low half of the
// next product is then added onto it.
132
133 mov rdx, [y]
134
135 mulx r9, r8, [x]
136 mov [z], r8
137
138 mulx r10, rbx, [x+8]
139 adcx r9, rbx
140
141 mulx r11, rbx, [x+16]
142 adcx r10, rbx
143
144 mulx r8, rbx, [x+24]
145 adcx r11, rbx
146 adcx r8, zero
147
148// Now all the other rows in a uniform pattern
// (each addrow j stores z[j] and advances the window by one column)
149
150 addrow 1
151 addrow 2
152 addrow 3
153
154// Now write back the additional columns
// After row 3 the window has rotated back so that r8, r9, r10, r11 hold
// columns 4, 5, 6, 7 respectively
155
156 mov [z+32], r8
157 mov [z+40], r9
158 mov [z+48], r10
159 mov [z+56], r11
160
161// Restore registers and return
162
163 pop rbx
164 pop rbp
165
166#if WINDOWS_ABI
167 pop rsi
168 pop rdi
169#endif
170 ret
171
172#if defined(__linux__) && defined(__ELF__)
173.section .note.GNU-stack,"",%progbits
174#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16.S
new file mode 100644
index 0000000000..7bfae22a3f
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16.S
@@ -0,0 +1,260 @@
1// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0
3
4// ----------------------------------------------------------------------------
5// Multiply z := x * y
6// Inputs x[8], y[8]; output z[16]
7//
8// extern void bignum_mul_8_16(uint64_t z[static 16], const uint64_t x[static 8],
9// const uint64_t y[static 8]);
10//
11// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
12// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y
13// ----------------------------------------------------------------------------
14
15#include "_internal_s2n_bignum.h"
16
17 .intel_syntax noprefix
18 S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_8_16)
19 S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_8_16)
20 .text
21
22// Under the standard x86-64 ABI z arrives in rdi and x in rsi, so these
// two aliases simply name the incoming argument registers
23
24#define z rdi
25#define x rsi
26
27// y arrives in rdx but is copied into rcx by the function body, since
// rdx is the implicit multiplicand of every mulx
28
29#define y rcx
30
31// A zero register (rbp, cleared with "xor zeroe, zeroe" before use)
32
33#define zero rbp
34#define zeroe ebp
35
36// mulpadd i, j adds x[i] * rdx (now assumed = y[j]) into the window at i+j
//
// The product's low half lands in rax and its high half in rbx. The low
// half is added on the CF chain (adcx) into the window register selected
// by (i+j) mod 8, taken in the order r8, r9, ..., r15; the high half is
// added on the OF chain (adox) into the following register, wrapping from
// r15 back to r8. Clobbers rax and rbx.
37
38.macro mulpadd arg1,arg2
39 mulx rbx, rax, [x+8*\arg1]
40.if ((\arg1 + \arg2) % 8 == 0)
41 adcx r8, rax
42 adox r9, rbx
43.elseif ((\arg1 + \arg2) % 8 == 1)
44 adcx r9, rax
45 adox r10, rbx
46.elseif ((\arg1 + \arg2) % 8 == 2)
47 adcx r10, rax
48 adox r11, rbx
49.elseif ((\arg1 + \arg2) % 8 == 3)
50 adcx r11, rax
51 adox r12, rbx
52.elseif ((\arg1 + \arg2) % 8 == 4)
53 adcx r12, rax
54 adox r13, rbx
55.elseif ((\arg1 + \arg2) % 8 == 5)
56 adcx r13, rax
57 adox r14, rbx
58.elseif ((\arg1 + \arg2) % 8 == 6)
59 adcx r14, rax
60 adox r15, rbx
61.elseif ((\arg1 + \arg2) % 8 == 7)
62 adcx r15, rax
63 adox r8, rbx
64.endif
65
66.endm
67
68// mulpade i, j adds x[i] * rdx (now assumed = y[j]) into the window at i+j
69// but re-creates the top word assuming nothing to add there
//
// Unlike mulpadd, the high half of the product is written by mulx directly
// into the register following position (i+j) mod 8, overwriting whatever
// it held. The low half (rax) is added on the CF chain (adcx) into the
// register at position (i+j) mod 8, and the pending OF-chain carry is
// folded into the freshly created top word with "adox <top>, zero".
// Clobbers rax; expects the zero register to be zero.
70
71.macro mulpade arg1,arg2
72.if ((\arg1 + \arg2) % 8 == 0)
73 mulx r9, rax, [x+8*\arg1]
74 adcx r8, rax
75 adox r9, zero
76.elseif ((\arg1 + \arg2) % 8 == 1)
77 mulx r10, rax, [x+8*\arg1]
78 adcx r9, rax
79 adox r10, zero
80.elseif ((\arg1 + \arg2) % 8 == 2)
81 mulx r11, rax, [x+8*\arg1]
82 adcx r10, rax
83 adox r11, zero
84.elseif ((\arg1 + \arg2) % 8 == 3)
85 mulx r12, rax, [x+8*\arg1]
86 adcx r11, rax
87 adox r12, zero
88.elseif ((\arg1 + \arg2) % 8 == 4)
89 mulx r13, rax, [x+8*\arg1]
90 adcx r12, rax
91 adox r13, zero
92.elseif ((\arg1 + \arg2) % 8 == 5)
93 mulx r14, rax, [x+8*\arg1]
94 adcx r13, rax
95 adox r14, zero
96.elseif ((\arg1 + \arg2) % 8 == 6)
97 mulx r15, rax, [x+8*\arg1]
98 adcx r14, rax
99 adox r15, zero
100.elseif ((\arg1 + \arg2) % 8 == 7)
101 mulx r8, rax, [x+8*\arg1]
102 adcx r15, rax
103 adox r8, zero
104.endif
105
106.endm
107
108// Add in the whole j'th row
//
// addrow arg1: accumulate y[arg1] * (x[0],...,x[7]) into the rotating
// eight-register window r8..r15, store the finished word z[arg1], and
// leave the window holding the next eight columns. Clobbers rax, rbx,
// rdx, the zero register and the flags.
109
110.macro addrow arg1
// rdx = y[arg1] is the implicit multiplicand of the mulx instructions;
// the xor zeroes rbp and clears both CF and OF, so the adcx and adox
// chains start without a carry-in
111 mov rdx, [y+8*\arg1]
112 xor zeroe, zeroe
113
114 mulpadd 0, \arg1
115
// The register for column arg1 gets no further additions in this row,
// so write it back to z[arg1] now
116.if (\arg1 % 8 == 0)
117 mov [z+8*\arg1],r8
118.elseif (\arg1 % 8 == 1)
119 mov [z+8*\arg1],r9
120.elseif (\arg1 % 8 == 2)
121 mov [z+8*\arg1],r10
122.elseif (\arg1 % 8 == 3)
123 mov [z+8*\arg1],r11
124.elseif (\arg1 % 8 == 4)
125 mov [z+8*\arg1],r12
126.elseif (\arg1 % 8 == 5)
127 mov [z+8*\arg1],r13
128.elseif (\arg1 % 8 == 6)
129 mov [z+8*\arg1],r14
130.elseif (\arg1 % 8 == 7)
131 mov [z+8*\arg1],r15
132.endif
133
134 mulpadd 1, \arg1
135 mulpadd 2, \arg1
136 mulpadd 3, \arg1
137 mulpadd 4, \arg1
138 mulpadd 5, \arg1
139 mulpadd 6, \arg1
// The last product re-creates the register just written back as the new
// top word of the window and absorbs the OF-chain carry into it
140 mulpade 7, \arg1
141
// Fold the remaining CF-chain carry into that same new top word
142.if (\arg1 % 8 == 0)
143 adc r8, zero
144.elseif (\arg1 % 8 == 1)
145 adc r9, zero
146.elseif (\arg1 % 8 == 2)
147 adc r10, zero
148.elseif (\arg1 % 8 == 3)
149 adc r11, zero
150.elseif (\arg1 % 8 == 4)
151 adc r12, zero
152.elseif (\arg1 % 8 == 5)
153 adc r13, zero
154.elseif (\arg1 % 8 == 6)
155 adc r14, zero
156.elseif (\arg1 % 8 == 7)
157 adc r15, zero
158.endif
159
160.endm
161
162
// Entry point. Register usage inside the body:
//   rdi = z (output), rsi = x, rcx = y (copied from rdx)
//   rdx = current word of y, the implicit multiplicand of mulx
//   r8..r15 = rotating eight-word accumulation window
//   rax, rbx = scratch for product halves, rbp = zero register
// rbp, rbx and r12..r15 are saved and restored around the computation.
163S2N_BN_SYMBOL(bignum_mul_8_16):
164 _CET_ENDBR
165
// For the Microsoft x64 ABI, preserve rdi and rsi and move the three
// arguments into the registers the code below expects them in
166#if WINDOWS_ABI
167 push rdi
168 push rsi
169 mov rdi, rcx
170 mov rsi, rdx
171 mov rdx, r8
172#endif
173
174// Save more registers to play with
175
176 push rbp
177 push rbx
178 push r12
179 push r13
180 push r14
181 push r15
182
183// Copy y into a safe register to start with
184
185 mov y, rdx
186
187// Zero a register, which also makes sure we don't get a fake carry-in
188
189 xor zeroe, zeroe
190
191// Do the zeroth row, which is a bit different
192// Write back the zero-zero product and then accumulate
193// r8,r15,r14,r13,r12,r11,r10,r9 as y[0] * x from 1..8
//
// Only a single carry chain (plain adc on CF) is needed here: each mulx
// writes its high half into a fresh register, and the low half of the
// next product is then added onto it.
194
195 mov rdx, [y]
196
197 mulx r9, r8, [x]
198 mov [z], r8
199
200 mulx r10, rbx, [x+8]
201 adc r9, rbx
202
203 mulx r11, rbx, [x+16]
204 adc r10, rbx
205
206 mulx r12, rbx, [x+24]
207 adc r11, rbx
208
209 mulx r13, rbx, [x+32]
210 adc r12, rbx
211
212 mulx r14, rbx, [x+40]
213 adc r13, rbx
214
215 mulx r15, rbx, [x+48]
216 adc r14, rbx
217
218 mulx r8, rbx, [x+56]
219 adc r15, rbx
220 adc r8, zero
221
222// Now all the other rows in a uniform pattern
// (each addrow j stores z[j] and advances the window by one column)
223
224 addrow 1
225 addrow 2
226 addrow 3
227 addrow 4
228 addrow 5
229 addrow 6
230 addrow 7
231
232// Now write back the additional columns
// After row 7 the window has rotated back so that r8, r9, ..., r15 hold
// columns 8, 9, ..., 15 respectively
233
234 mov [z+64], r8
235 mov [z+72], r9
236 mov [z+80], r10
237 mov [z+88], r11
238 mov [z+96], r12
239 mov [z+104], r13
240 mov [z+112], r14
241 mov [z+120], r15
242
243// Real epilog
244
245 pop r15
246 pop r14
247 pop r13
248 pop r12
249 pop rbx
250 pop rbp
251
252#if WINDOWS_ABI
253 pop rsi
254 pop rdi
255#endif
256 ret
257
258#if defined(__linux__) && defined(__ELF__)
259.section .note.GNU-stack,"",%progbits
260#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8.S
new file mode 100644
index 0000000000..205455bd2c
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8.S
@@ -0,0 +1,145 @@
1// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0
3
4// ----------------------------------------------------------------------------
5// Square, z := x^2
6// Input x[4]; output z[8]
7//
8// extern void bignum_sqr_4_8(uint64_t z[static 8], const uint64_t x[static 4]);
9//
10// Standard x86-64 ABI: RDI = z, RSI = x
11// Microsoft x64 ABI: RCX = z, RDX = x
12// ----------------------------------------------------------------------------
13
14#include "_internal_s2n_bignum.h"
15
16 .intel_syntax noprefix
17 S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_4_8)
18 S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_4_8)
19 .text
20
21// Under the standard x86-64 ABI z arrives in rdi and x in rsi, so these
// two aliases simply name the incoming argument registers
22
23#define z rdi
24#define x rsi
25
26// A zero register (rbp, cleared with "xor zeroe, zeroe" before use)
27
28#define zero rbp
29#define zeroe ebp
30
31// Other registers
// d1..d6 accumulate the off-diagonal product terms for result words
// z[1]..z[6] before they are doubled and combined with the squares
32
33#define d1 r8
34#define d2 r9
35#define d3 r10
36#define d4 r11
37#define d5 r12
38#define d6 r13
39
40
41
// Entry point. Register usage inside the body:
//   rdi = z (output), rsi = x
//   rdx = implicit multiplicand of mulx (also reused for product halves)
//   d1..d6 (r8..r13) = off-diagonal sum for words 1..6 of the result
//   rax, rcx = scratch for product halves, rbp = zero register
// rbp, r12 and r13 are saved and restored around the computation.
// In the comments below a digit pair "ij" stands for the product
// x[i] * x[j].
42S2N_BN_SYMBOL(bignum_sqr_4_8):
43 _CET_ENDBR
44
// For the Microsoft x64 ABI, preserve rdi and rsi and move the two
// arguments into the registers the code below expects them in
45#if WINDOWS_ABI
46 push rdi
47 push rsi
48 mov rdi, rcx
49 mov rsi, rdx
50#endif
51
52// Save more registers to play with
53
54 push rbp
55 push r12
56 push r13
57
58// Set up an initial window [d6;...d1] = [23;03;01]
// These three products occupy disjoint word positions (d2:d1, d4:d3,
// d6:d5), so no additions are needed yet
59
60 mov rdx, [x]
61 mulx d2, d1, [x+8]
62 mulx d4, d3, [x+24]
63 mov rdx, [x+16]
64 mulx d6, d5, [x+24]
65
66// Clear our zero register, and also initialize the flags for the carry chain
67
68 xor zeroe, zeroe
69
70// Chain in the addition of 02 + 12 + 13 to that window (no carry-out possible)
71// This gives all the "heterogeneous" terms of the squaring ready to double
// Low halves (rax) ride the CF chain via adcx, high halves (rcx) the OF
// chain via adox; rdx still holds x[2] for the first two products
72
73 mulx rcx, rax, [x]
74 adcx d2, rax
75 adox d3, rcx
76 mulx rcx, rax, [x+8]
77 adcx d3, rax
78 adox d4, rcx
79 mov rdx, [x+24]
80 mulx rcx, rax, [x+8]
81 adcx d4, rax
82 adox d5, rcx
83 adcx d5, zero
84 adox d6, zero
85 adcx d6, zero
86
87// In principle this is otiose as CF and OF carries are absorbed at this point
88// However it seems helpful for the OOO engine to be told it's a fresh start
89
90 xor zeroe, zeroe
91
92// Double and add to the 00 + 11 + 22 + 33 terms
93//
94// We could use shift-double but this seems tidier and in larger squarings
95// it was actually more efficient. I haven't experimented with this small
96// case to see how much that matters. Note: the writeback here is sprinkled
97// into the sequence in such a way that things still work if z = x, i.e. if
98// the output overwrites the input buffer and beyond.
//
// Each "adcx dN, dN" doubles a word of the off-diagonal sum on the CF
// chain, and the following "adox dN, ..." adds the matching half of a
// square x[i]^2 on the OF chain. "mulx rdx, rax, rdx" squares rdx in
// place: low half to rax, high half back into rdx.
99
100 mov rdx, [x]
101 mulx rdx, rax, rdx
102 mov [z], rax
103 adcx d1, d1
104 adox d1, rdx
105 mov rdx, [x+8]
106 mov [z+8], d1
107 mulx rdx, rax, rdx
108 adcx d2, d2
109 adox d2, rax
110 adcx d3, d3
111 adox d3, rdx
112 mov rdx, [x+16]
113 mov [z+16], d2
114 mulx rdx, rax, rdx
115 adcx d4, d4
116 adox d4, rax
117 adcx d5, d5
118 adox d5, rdx
119 mov rdx, [x+24]
120 mov [z+24], d3
121 mulx rdx, rax, rdx
122 mov [z+32], d4
123 adcx d6, d6
124 mov [z+40], d5
125 adox d6, rax
126 mov [z+48], d6
// The top word is the high half of x[3]^2 plus the final carries of both
// chains
127 adcx rdx, zero
128 adox rdx, zero
129 mov [z+56], rdx
130
131// Restore saved registers and return
132
133 pop r13
134 pop r12
135 pop rbp
136
137#if WINDOWS_ABI
138 pop rsi
139 pop rdi
140#endif
141 ret
142
143#if defined(__linux__) && defined(__ELF__)
144.section .note.GNU-stack,"",%progbits
145#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16.S
new file mode 100644
index 0000000000..77741c603e
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16.S
@@ -0,0 +1,298 @@
1// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0
3
4// ----------------------------------------------------------------------------
5// Square, z := x^2
6// Input x[8]; output z[16]
7//
8// extern void bignum_sqr_8_16(uint64_t z[static 16], const uint64_t x[static 8]);
9//
10// Standard x86-64 ABI: RDI = z, RSI = x
11// Microsoft x64 ABI: RCX = z, RDX = x
12// ----------------------------------------------------------------------------
13
14#include "_internal_s2n_bignum.h"
15
16 .intel_syntax noprefix
17 S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_8_16)
18 S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_8_16)
19 .text
20
21// Under the standard x86-64 ABI z arrives in rdi and x in rsi, so these
// two aliases simply name the incoming argument registers
22
23#define z rdi
24#define x rsi
25
26// A zero register (rbp, cleared with "xor zeroe, zeroe" before use)
27
28#define zero rbp
29#define zeroe ebp
30
31// mulpadd i, j adds rdx * x[i] into the window at the i+j point
//
// The product's low half lands in rax and its high half in rcx. The low
// half is added on the CF chain (adcx) into the window register selected
// by (i+j) mod 8, taken in the order r8, r9, ..., r15; the high half is
// added on the OF chain (adox) into the following register, wrapping from
// r15 back to r8. Clobbers rax and rcx.
32
33.macro mulpadd arg1,arg2
34 mulx rcx, rax, [x+8*\arg1]
35.if ((\arg1 + \arg2) % 8 == 0)
36 adcx r8, rax
37 adox r9, rcx
38.elseif ((\arg1 + \arg2) % 8 == 1)
39 adcx r9, rax
40 adox r10, rcx
41.elseif ((\arg1 + \arg2) % 8 == 2)
42 adcx r10, rax
43 adox r11, rcx
44.elseif ((\arg1 + \arg2) % 8 == 3)
45 adcx r11, rax
46 adox r12, rcx
47.elseif ((\arg1 + \arg2) % 8 == 4)
48 adcx r12, rax
49 adox r13, rcx
50.elseif ((\arg1 + \arg2) % 8 == 5)
51 adcx r13, rax
52 adox r14, rcx
53.elseif ((\arg1 + \arg2) % 8 == 6)
54 adcx r14, rax
55 adox r15, rcx
56.elseif ((\arg1 + \arg2) % 8 == 7)
57 adcx r15, rax
58 adox r8, rcx
59.endif
60
61.endm
62
63// mulpade i, j adds rdx * x[i] into the window at i+j
64// but re-creates the top word assuming nothing to add there
//
// Unlike mulpadd, the high half of the product is written by mulx directly
// into the register following position (i+j) mod 8, overwriting whatever
// it held. The low half (rax) is added on the CF chain (adcx) into the
// register at position (i+j) mod 8, and the pending OF-chain carry is
// folded into the freshly created top word with "adox <top>, zero".
// Clobbers rax; expects the zero register to be zero.
65
66.macro mulpade arg1,arg2
67.if ((\arg1 + \arg2) % 8 == 0)
68 mulx r9, rax, [x+8*\arg1]
69 adcx r8, rax
70 adox r9, zero
71.elseif ((\arg1 + \arg2) % 8 == 1)
72 mulx r10, rax, [x+8*\arg1]
73 adcx r9, rax
74 adox r10, zero
75.elseif ((\arg1 + \arg2) % 8 == 2)
76 mulx r11, rax, [x+8*\arg1]
77 adcx r10, rax
78 adox r11, zero
79.elseif ((\arg1 + \arg2) % 8 == 3)
80 mulx r12, rax, [x+8*\arg1]
81 adcx r11, rax
82 adox r12, zero
83.elseif ((\arg1 + \arg2) % 8 == 4)
84 mulx r13, rax, [x+8*\arg1]
85 adcx r12, rax
86 adox r13, zero
87.elseif ((\arg1 + \arg2) % 8 == 5)
88 mulx r14, rax, [x+8*\arg1]
89 adcx r13, rax
90 adox r14, zero
91.elseif ((\arg1 + \arg2) % 8 == 6)
92 mulx r15, rax, [x+8*\arg1]
93 adcx r14, rax
94 adox r15, zero
95.elseif ((\arg1 + \arg2) % 8 == 7)
96 mulx r8, rax, [x+8*\arg1]
97 adcx r15, rax
98 adox r8, zero
99.endif
100
101.endm
102
// diagonals: the complete squaring computation. It first sums every
// off-diagonal product x[i] * x[j] with i > j, writing words 1..8 of that
// sum to z[1]..z[8] as they become final and keeping words 9..14 in
// r9..r14; it then doubles that sum, adds the squares x[i]^2 and stores
// all sixteen words of z. In the comments a digit pair "ij" stands for
// the product x[i] * x[j]. Clobbers rax, rcx, rdx, r8..r15, the zero
// register and the flags.
103.macro diagonals
104
105 xor zeroe, zeroe
106
107// Set initial window [r8..r10] + 2 wb = 10 + 20 + 30 + 40 + 50 + 60 + 70
// (presumably "2 wb" refers to the two words written back straight away,
// z[1] and z[2]; the remaining words of this row stay in r11..r15 and r8)
108
109 mov rdx, [x]
110 mulx rax, r9, [x+8]
111 mov [z+8], r9
112 mulx rcx, r10, [x+16]
113 adcx r10, rax
114 mov [z+16], r10
115 mulx rax, r11, [x+24]
116 adcx r11, rcx
117 mulx rcx, r12, [x+32]
118 adcx r12, rax
119 mulx rax, r13, [x+40]
120 adcx r13, rcx
121 mulx rcx, r14, [x+48]
122 adcx r14, rax
123 mulx r8, r15, [x+56]
124 adcx r15, rcx
125 adcx r8, zero
126
127// Add in the next diagonal = 21 + 31 + 41 + 51 + 61 + 71 + 54
// Words 3 and 4 become final part-way through and are written to z;
// the trailing adcx absorbs the last CF carry into the new top word
128
129 xor zeroe, zeroe
130 mov rdx, [x+8]
131 mulpadd 2, 1
132 mov [z+24], r11
133 mulpadd 3, 1
134 mov [z+32], r12
135 mulpadd 4, 1
136 mulpadd 5, 1
137 mulpadd 6, 1
138 mulpade 7, 1
139 mov rdx, [x+32]
140 mulpade 5, 4
141 adcx r10, zero
142
143// And the next one = 32 + 42 + 52 + 62 + 72 + 64 + 65
// Words 5 and 6 become final here and are written to z
144
145 xor zeroe, zeroe
146 mov rdx, [x+16]
147 mulpadd 3, 2
148 mov [z+40], r13
149 mulpadd 4, 2
150 mov [z+48], r14
151 mulpadd 5, 2
152 mulpadd 6, 2
153 mulpadd 7, 2
154 mov rdx, [x+48]
155 mulpade 4, 6
156 mulpade 5, 6
157 adcx r12, zero
158
159// And the final one = 43 + 53 + 63 + 73 + 74 + 75 + 76
// Words 7 and 8 become final here and are written to z
160
161 xor zeroe, zeroe
162 mov rdx, [x+24]
163 mulpadd 4, 3
164 mov [z+56], r15
165 mulpadd 5, 3
166 mov [z+64], r8
167 mulpadd 6, 3
168 mulpadd 7, 3
169 mov rdx, [x+56]
170 mulpadd 4, 7
171 mulpade 5, 7
172 mulpade 6, 7
173 adcx r14, zero
174
175// Double and add things; use z[1]..z[8] and thereafter the registers
176// r9..r15 which haven't been written back yet
//
// For each word, "adcx w, w" doubles the off-diagonal sum on the CF
// chain and the following adox adds the matching half of a square
// x[i]^2 on the OF chain. "mulx rcx, rdx, rdx" squares rdx, leaving the
// low half in rdx and the high half in rcx.
177
178 xor zeroe, zeroe
179 mov rdx, [x]
180 mulx rcx, rax, rdx
181 mov [z], rax
182 mov rax, [z+8]
183 adcx rax, rax
184 adox rax, rcx
185 mov [z+8], rax
186
187 mov rax, [z+16]
188 mov rdx, [x+8]
189 mulx rcx, rdx, rdx
190 adcx rax, rax
191 adox rax, rdx
192 mov [z+16], rax
193 mov rax, [z+24]
194 adcx rax, rax
195 adox rax, rcx
196 mov [z+24], rax
197
198 mov rax, [z+32]
199 mov rdx, [x+16]
200 mulx rcx, rdx, rdx
201 adcx rax, rax
202 adox rax, rdx
203 mov [z+32], rax
204 mov rax, [z+40]
205 adcx rax, rax
206 adox rax, rcx
207 mov [z+40], rax
208
209 mov rax, [z+48]
210 mov rdx, [x+24]
211 mulx rcx, rdx, rdx
212 adcx rax, rax
213 adox rax, rdx
214 mov [z+48], rax
215 mov rax, [z+56]
216 adcx rax, rax
217 adox rax, rcx
218 mov [z+56], rax
219
220 mov rax, [z+64]
221 mov rdx, [x+32]
222 mulx rcx, rdx, rdx
223 adcx rax, rax
224 adox rax, rdx
225 mov [z+64], rax
// From word 9 onward the off-diagonal sum is still held in registers
226 adcx r9, r9
227 adox r9, rcx
228 mov [z+72], r9
229
230 mov rdx, [x+40]
231 mulx rcx, rdx, rdx
232 adcx r10, r10
233 adox r10, rdx
234 mov [z+80], r10
235 adcx r11, r11
236 adox r11, rcx
237 mov [z+88], r11
238
239 mov rdx, [x+48]
240 mulx rcx, rdx, rdx
241 adcx r12, r12
242 adox r12, rdx
243 mov [z+96], r12
244 adcx r13, r13
245 adox r13, rcx
246 mov [z+104], r13
247
// The top word is the high half of x[7]^2 (placed in r15) plus the final
// carries of both chains
248 mov rdx, [x+56]
249 mulx r15, rdx, rdx
250 adcx r14, r14
251 adox r14, rdx
252 mov [z+112], r14
253 adcx r15, zero
254 adox r15, zero
255 mov [z+120], r15
256
257.endm
258
259
// Entry point: sets up the registers expected by the diagonals macro
// (rdi = z, rsi = x), saves rbp and r12..r15, expands the macro to do
// the whole computation, then restores the saved registers.
260S2N_BN_SYMBOL(bignum_sqr_8_16):
261 _CET_ENDBR
262
// For the Microsoft x64 ABI, preserve rdi and rsi and move the two
// arguments into the registers the code below expects them in
263#if WINDOWS_ABI
264 push rdi
265 push rsi
266 mov rdi, rcx
267 mov rsi, rdx
268#endif
269
270// Save more registers to play with
271
272 push rbp
273 push r12
274 push r13
275 push r14
276 push r15
277
278// Do the multiplication
279
280 diagonals
281
282// Real epilog
283
284 pop r15
285 pop r14
286 pop r13
287 pop r12
288 pop rbp
289
290#if WINDOWS_ABI
291 pop rsi
292 pop rdi
293#endif
294 ret
295
296#if defined(__linux__) && defined(__ELF__)
297.section .note.GNU-stack,"",%progbits
298#endif