summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorjsing <>2025-08-12 10:09:46 +0000
committerjsing <>2025-08-12 10:09:46 +0000
commite5f4dc854c2c390526097888bf3654db21bdbb15 (patch)
tree25b70579b17511a42354b46d22fd95e599a86b6c /src
parentf111c1e299493f0bdf21f9ff08f046d429764e75 (diff)
downloadopenbsd-e5f4dc854c2c390526097888bf3654db21bdbb15.tar.gz
openbsd-e5f4dc854c2c390526097888bf3654db21bdbb15.tar.bz2
openbsd-e5f4dc854c2c390526097888bf3654db21bdbb15.zip
Bring in bignum_{mul,sqr}_6_12{,_alt}() from s2n-bignum.
These provide fast multiplication and squaring of 6-word inputs, producing a 12-word result. The non-_alt versions require the CPU to support ADX instructions, while the _alt versions do not.
Diffstat (limited to 'src')
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12.S210
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12_alt.S186
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12.S214
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12_alt.S197
4 files changed, 807 insertions, 0 deletions
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12.S
new file mode 100644
index 0000000000..081b602cde
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12.S
@@ -0,0 +1,210 @@
1// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0
3
4// ----------------------------------------------------------------------------
5// Multiply z := x * y
6// Inputs x[6], y[6]; output z[12]
7//
8// extern void bignum_mul_6_12(uint64_t z[static 12], const uint64_t x[static 6],
9// const uint64_t y[static 6]);
10//
11// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
12// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y
13// ----------------------------------------------------------------------------
14
15#include "_internal_s2n_bignum.h"
16
17 .intel_syntax noprefix
18 S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_6_12)
19 S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_6_12)
20 .text
21
// Register aliases for bignum_mul_6_12.
// z and x are the registers that carry the first two arguments under the
// standard x86-64 ABI; under WINDOWS_ABI the entry code copies the
// arguments into them.
22// These are actually right
23
24#define z rdi
25#define x rsi
26
// y arrives in rdx, but rdx is the implicit source operand of mulx, so the
// function copies the pointer into rcx before the first multiplication.
27// Copied in or set up
28
29#define y rcx
30
// rbp is kept at zero so that "adcx reg, zero" / "adox reg, zero" can fold
// a pending carry into a register. zeroe is its 32-bit view, used in
// "xor zeroe, zeroe", which clears the whole register and the CF/OF flags.
31// A zero register
32
33#define zero rbp
34#define zeroe ebp
35
// mulpadd arg1, arg2
//   arg1 = index of the current row (the y word already loaded into rdx)
//   arg2 = index of the x word to multiply by
// Forms the 128-bit product rdx * x[arg2] (high word in rbx, low word in
// rax) and adds it into the six-register window r8..r13. The window is
// indexed cyclically: the low word goes to register (arg1+arg2) mod 6 and
// the high word to the following register, wrapping from r13 back to r8.
// The low word is added with adcx (carry in CF) and the high word with adox
// (carry in OF), so the two carry chains run side by side.
// Clobbers rax and rbx; mulx itself leaves the flags untouched.
36// Add in x[i] * rdx to the (i,i+1) position with the register window
37// Would be nice to have conditional expressions reg[i], reg[i+1] ...
38
39.macro mulpadd arg1,arg2
40 mulx rbx, rax, [x+8*\arg2]
41.if ((\arg1 + \arg2) % 6 == 0)
42 adcx r8, rax
43 adox r9, rbx
44.elseif ((\arg1 + \arg2) % 6 == 1)
45 adcx r9, rax
46 adox r10, rbx
47.elseif ((\arg1 + \arg2) % 6 == 2)
48 adcx r10, rax
49 adox r11, rbx
50.elseif ((\arg1 + \arg2) % 6 == 3)
51 adcx r11, rax
52 adox r12, rbx
53.elseif ((\arg1 + \arg2) % 6 == 4)
54 adcx r12, rax
55 adox r13, rbx
56.elseif ((\arg1 + \arg2) % 6 == 5)
57 adcx r13, rax
58 adox r8, rbx
59.endif
60
61.endm
62
63
// addrow arg1
// Adds the row y[arg1] * x[0..5] into the register window and retires one
// result word:
//   1. load y[arg1] into rdx and clear the zero register, CF and OF;
//   2. add y[arg1]*x[0]; the window register for column arg1 receives no
//      further additions, so it is written to z[arg1];
//   3. add y[arg1]*x[1..4] with mulpadd;
//   4. for y[arg1]*x[5], write the high word straight into the register
//      that was just stored (it becomes the new top word of the window),
//      add the low word with adcx, then fold the outstanding OF and CF
//      carries into the new top word by adding zero with adox and adcx.
// Clobbers rax, rbx, rdx and the flags.
64// Add in the whole j'th row
65
66.macro addrow arg1
67 mov rdx, [y+8*\arg1]
68 xor zeroe, zeroe
69
70 mulpadd \arg1, 0
71
// Column arg1 is complete: write it out and free its register.
72.if (\arg1 % 6 == 0)
73 mov [z+8*\arg1],r8
74.elseif (\arg1 % 6 == 1)
75 mov [z+8*\arg1],r9
76.elseif (\arg1 % 6 == 2)
77 mov [z+8*\arg1],r10
78.elseif (\arg1 % 6 == 3)
79 mov [z+8*\arg1],r11
80.elseif (\arg1 % 6 == 4)
81 mov [z+8*\arg1],r12
82.elseif (\arg1 % 6 == 5)
83 mov [z+8*\arg1],r13
84.endif
85
86 mulpadd \arg1, 1
87 mulpadd \arg1, 2
88 mulpadd \arg1, 3
89 mulpadd \arg1, 4
90
// Last product of the row: the high word lands in the freed register and
// both carry chains are closed off into it.
91.if (\arg1 % 6 == 0)
92 mulx r8, rax, [x+40]
93 adcx r13, rax
94 adox r8, zero
95 adcx r8, zero
96.elseif (\arg1 % 6 == 1)
97 mulx r9, rax, [x+40]
98 adcx r8, rax
99 adox r9, zero
100 adcx r9, zero
101.elseif (\arg1 % 6 == 2)
102 mulx r10, rax, [x+40]
103 adcx r9, rax
104 adox r10, zero
105 adcx r10, zero
106.elseif (\arg1 % 6 == 3)
107 mulx r11, rax, [x+40]
108 adcx r10, rax
109 adox r11, zero
110 adcx r11, zero
111.elseif (\arg1 % 6 == 4)
112 mulx r12, rax, [x+40]
113 adcx r11, rax
114 adox r12, zero
115 adcx r12, zero
116.elseif (\arg1 % 6 == 5)
117 mulx r13, rax, [x+40]
118 adcx r12, rax
119 adox r13, zero
120 adcx r13, zero
121.endif
122
123.endm
124
125
126
// bignum_mul_6_12: z[0..11] := x[0..5] * y[0..5], computed one row of
// partial products (one word of y) at a time.
// Uses mulx (BMI2) and adcx/adox (ADX).
// Register roles: rdi = z, rsi = x, rcx = y, rdx = current word of y,
// r8..r13 = cyclic six-word accumulation window, rbp = zero,
// rax/rbx = product scratch.
127S2N_BN_SYMBOL(bignum_mul_6_12):
128 _CET_ENDBR
129
// Microsoft x64 ABI: rdi and rsi are callee-saved there, so preserve them
// and move the arguments (rcx, rdx, r8) into the standard-ABI registers.
130#if WINDOWS_ABI
131 push rdi
132 push rsi
133 mov rdi, rcx
134 mov rsi, rdx
135 mov rdx, r8
136#endif
137
// rbp, rbx, r12 and r13 are callee-saved and are all used below.
138// Save more registers to play with
139
140 push rbp
141 push rbx
142 push r12
143 push r13
144
145// Copy y into a safe register to start with
146
147 mov y, rdx
148
149// Zero a register, which also makes sure we don't get a fake carry-in
150
151 xor zeroe, zeroe
152
// Row 0 only needs one carry chain (CF). Afterwards z[0] is stored,
// r9..r13 hold columns 1..5 and r8 holds column 6.
153// Do the zeroth row, which is a bit different
154// Write back the zero-zero product and then accumulate
155// r8,r13,r12,r11,r10,r9 as y[0] * x from 1..6
156
157 mov rdx, [y]
158
159 mulx r9, r8, [x]
160 mov [z], r8
161
162 mulx r10, rbx, [x+8]
163 adcx r9, rbx
164
165 mulx r11, rbx, [x+16]
166 adcx r10, rbx
167
168 mulx r12, rbx, [x+24]
169 adcx r11, rbx
170
171 mulx r13, rbx, [x+32]
172 adcx r12, rbx
173
174 mulx r8, rbx, [x+40]
175 adcx r13, rbx
176 adcx r8, zero
177
// Each addrow k stores z[k] and shifts the window up by one column.
178// Now all the other rows in a uniform pattern
179
180 addrow 1
181 addrow 2
182 addrow 3
183 addrow 4
184 addrow 5
185
// After row 5 the window holds columns 6..11 in r8..r13, in that order.
186// Now write back the additional columns
187
188 mov [z+48], r8
189 mov [z+56], r9
190 mov [z+64], r10
191 mov [z+72], r11
192 mov [z+80], r12
193 mov [z+88], r13
194
// Pops mirror the pushes above in reverse order.
195// Restore registers and return
196
197 pop r13
198 pop r12
199 pop rbx
200 pop rbp
201
202#if WINDOWS_ABI
203 pop rsi
204 pop rdi
205#endif
206 ret
207
208#if defined(__linux__) && defined(__ELF__)
209.section .note.GNU-stack,"",%progbits
210#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12_alt.S
new file mode 100644
index 0000000000..f4ea4ed68b
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12_alt.S
@@ -0,0 +1,186 @@
1// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0
3
4// ----------------------------------------------------------------------------
5// Multiply z := x * y
6// Inputs x[6], y[6]; output z[12]
7//
8// extern void bignum_mul_6_12_alt(uint64_t z[static 12],
9// const uint64_t x[static 6],
10// const uint64_t y[static 6]);
11//
12// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
13// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y
14// ----------------------------------------------------------------------------
15
16#include "_internal_s2n_bignum.h"
17
18 .intel_syntax noprefix
19 S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_6_12_alt)
20 S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_6_12_alt)
21 .text
22
// Register aliases for bignum_mul_6_12_alt.
// z and x are the registers that carry the first two arguments under the
// standard x86-64 ABI; under WINDOWS_ABI the entry code copies the
// arguments into them.
23// These are actually right
24
25#define z rdi
26#define x rsi
27
// The one-operand mul writes its 128-bit result to rdx:rax, so the y
// pointer cannot stay in rdx.
28// This is moved from rdx to free it for muls
29
30#define y rcx
31
// At any point the three registers play the roles (carry, high, low) for the
// result column being accumulated; the roles rotate by one register after
// each column is stored.
32// Other variables used as a rotating 3-word window to add terms to
33
34#define t0 r8
35#define t1 r9
36#define t2 r10
37
// combadd(c,h,l,numa,numb): (c,h,l) += numa * numb
//   numa, numb are memory operands; the 128-bit product is formed in
//   rdx:rax and added into l and h, and the carry out of h is added into c.
//   Clobbers rax, rdx and the flags.
38// Macro for the key "multiply and add to (c,h,l)" step
39
40#define combadd(c,h,l,numa,numb) \
41 mov rax, numa; \
42 mul QWORD PTR numb; \
43 add l, rax; \
44 adc h, rdx; \
45 adc c, 0
46
// combadz: same as combadd but requires c == 0 on entry; "adc c, c" then
// leaves c equal to the carry out of h.
47// A minutely shorter form for when c = 0 initially
48
49#define combadz(c,h,l,numa,numb) \
50 mov rax, numa; \
51 mul QWORD PTR numb; \
52 add l, rax; \
53 adc h, rdx; \
54 adc c, c
55
// combads(h,l,numa,numb): (h,l) += numa * numb, with any carry out of h
// discarded.
56// A short form where we don't expect a top carry
57
58#define combads(h,l,numa,numb) \
59 mov rax, numa; \
60 mul QWORD PTR numb; \
61 add l, rax; \
62 adc h, rdx
63
// bignum_mul_6_12_alt: z[0..11] := x[0..5] * y[0..5] using only the classic
// mul/add/adc instructions.
// The result is produced column by column: for result term k, every product
// x[i]*y[j] with i + j = k is added into a three-word window, the low word
// is stored to z[k], and the window rotates (the old high word becomes the
// new low word, the old carry word the new high word, and a freshly zeroed
// register the new carry word).
// Registers used: rax, rdx (mul), rcx (y), r8-r10 (window), rdi, rsi.
64S2N_BN_SYMBOL(bignum_mul_6_12_alt):
65 _CET_ENDBR
66
// Microsoft x64 ABI: rdi and rsi are callee-saved there, so preserve them
// and move the arguments (rcx, rdx, r8) into the standard-ABI registers.
67#if WINDOWS_ABI
68 push rdi
69 push rsi
70 mov rdi, rcx
71 mov rsi, rdx
72 mov rdx, r8
73#endif
74
75// Copy y into a safe register to start with
76
77 mov y, rdx
78
// Term 0 is the single product x[0]*y[0]; its high word seeds the window.
79// Result term 0
80
81 mov rax, [x]
82 mul QWORD PTR [y]
83
84 mov [z], rax
85 mov t0, rdx
86 xor t1, t1
87
// Window here is (c,h,l) = (t2,t1,t0). The first product uses combads,
// which does not propagate a carry out of t1; t2 is still zero when
// combadz runs.
88// Result term 1
89
90 xor t2, t2
91 combads(t1,t0,[x],[y+8])
92 combadz(t2,t1,t0,[x+8],[y])
93 mov [z+8], t0
94
// From here on each term zeroes the new carry word, uses combadz for the
// first product and combadd for the rest.
95// Result term 2
96
97 xor t0, t0
98 combadz(t0,t2,t1,[x],[y+16])
99 combadd(t0,t2,t1,[x+8],[y+8])
100 combadd(t0,t2,t1,[x+16],[y])
101 mov [z+16], t1
102
103// Result term 3
104
105 xor t1, t1
106 combadz(t1,t0,t2,[x],[y+24])
107 combadd(t1,t0,t2,[x+8],[y+16])
108 combadd(t1,t0,t2,[x+16],[y+8])
109 combadd(t1,t0,t2,[x+24],[y])
110 mov [z+24], t2
111
112// Result term 4
113
114 xor t2, t2
115 combadz(t2,t1,t0,[x],[y+32])
116 combadd(t2,t1,t0,[x+8],[y+24])
117 combadd(t2,t1,t0,[x+16],[y+16])
118 combadd(t2,t1,t0,[x+24],[y+8])
119 combadd(t2,t1,t0,[x+32],[y])
120 mov [z+32], t0
121
// Term 5 is the widest column: all six word pairs contribute.
122// Result term 5
123
124 xor t0, t0
125 combadz(t0,t2,t1,[x],[y+40])
126 combadd(t0,t2,t1,[x+8],[y+32])
127 combadd(t0,t2,t1,[x+16],[y+24])
128 combadd(t0,t2,t1,[x+24],[y+16])
129 combadd(t0,t2,t1,[x+32],[y+8])
130 combadd(t0,t2,t1,[x+40],[y])
131 mov [z+40], t1
132
// From term 6 on the columns shrink again: the lowest x index is k - 5.
133// Result term 6
134
135 xor t1, t1
136 combadz(t1,t0,t2,[x+8],[y+40])
137 combadd(t1,t0,t2,[x+16],[y+32])
138 combadd(t1,t0,t2,[x+24],[y+24])
139 combadd(t1,t0,t2,[x+32],[y+16])
140 combadd(t1,t0,t2,[x+40],[y+8])
141 mov [z+48], t2
142
143// Result term 7
144
145 xor t2, t2
146 combadz(t2,t1,t0,[x+16],[y+40])
147 combadd(t2,t1,t0,[x+24],[y+32])
148 combadd(t2,t1,t0,[x+32],[y+24])
149 combadd(t2,t1,t0,[x+40],[y+16])
150 mov [z+56], t0
151
152// Result term 8
153
154 xor t0, t0
155 combadz(t0,t2,t1,[x+24],[y+40])
156 combadd(t0,t2,t1,[x+32],[y+32])
157 combadd(t0,t2,t1,[x+40],[y+24])
158 mov [z+64], t1
159
160// Result term 9
161
162 xor t1, t1
163 combadz(t1,t0,t2,[x+32],[y+40])
164 combadd(t1,t0,t2,[x+40],[y+32])
165 mov [z+72], t2
166
// Only x[5]*y[5] remains; it is added into (t1,t0) with no carry word.
167// Result term 10
168
169 combads(t1,t0,[x+40],[y+40])
170 mov [z+80], t0
171
// The remaining high word is the top word of the result.
172// Result term 11
173
174 mov [z+88], t1
175
176// Return
177
178#if WINDOWS_ABI
179 pop rsi
180 pop rdi
181#endif
182 ret
183
184#if defined(__linux__) && defined(__ELF__)
185.section .note.GNU-stack,"",%progbits
186#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12.S
new file mode 100644
index 0000000000..07d840b47c
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12.S
@@ -0,0 +1,214 @@
1// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0
3
4// ----------------------------------------------------------------------------
5// Square, z := x^2
6// Input x[6]; output z[12]
7//
8// extern void bignum_sqr_6_12(uint64_t z[static 12], const uint64_t x[static 6]);
9//
10// Standard x86-64 ABI: RDI = z, RSI = x
11// Microsoft x64 ABI: RCX = z, RDX = x
12// ----------------------------------------------------------------------------
13
14#include "_internal_s2n_bignum.h"
15
16 .intel_syntax noprefix
17 S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_6_12)
18 S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_6_12)
19 .text
20
// Register aliases for bignum_sqr_6_12.
// z and x are the registers that carry the two arguments under the standard
// x86-64 ABI; under WINDOWS_ABI the entry code copies the arguments into
// them.
21// These are actually right
22
23#define z rdi
24#define x rsi
25
// rbp is used as a source of zero for adcx/adox until d10 takes it over;
// zeroe is its 32-bit view, used in "xor zeroe, zeroe".
26// A zero register
27
28#define zero rbp
29#define zeroe ebp
30
// d1..d10 accumulate result words 1..10; dk is eventually stored at
// z + 8*k.
31// Other registers
32
33#define d1 r8
34#define d2 r9
35#define d3 r10
36#define d4 r11
37#define d5 r12
38#define d6 r13
39#define d7 r14
40#define d8 r15
41#define d9 rbx
42
// d10 and zero are the same physical register (rbp): once d10 has been
// written, "zero" no longer reads as zero and must not be used.
43// Care is needed: re-using the zero register
44
45#define d10 rbp
46
47
// bignum_sqr_6_12: z[0..11] := x[0..5]^2.
// Uses mulx (BMI2) and adcx/adox (ADX).
// Strategy: first accumulate each off-diagonal product x[i]*x[j] (i < j)
// once into d1..d10, then double that sum while adding in the squares
// x[i]^2, storing result words as they become final.
// In the comments below a two-digit pair "ij" denotes the product x[i]*x[j].
48S2N_BN_SYMBOL(bignum_sqr_6_12):
49 _CET_ENDBR
50
// Microsoft x64 ABI: rdi and rsi are callee-saved there, so preserve them
// and move the arguments (rcx, rdx) into the standard-ABI registers.
51#if WINDOWS_ABI
52 push rdi
53 push rsi
54 mov rdi, rcx
55 mov rsi, rdx
56#endif
57
// rbp, rbx and r12-r15 are callee-saved; they are used as zero/d10, d9 and
// d5..d8 respectively.
58// Save more registers to play with
59
60 push rbp
61 push rbx
62 push r12
63 push r13
64 push r14
65 push r15
66
// Four non-overlapping products fill the window directly:
// d2:d1 = 01, d4:d3 = 03, d6:d5 = 05, d8:d7 = 34.
67// Set up an initial window [d8;...d1] = [34;05;03;01]
68
69 mov rdx, [x]
70 mulx d2, d1, [x+8]
71 mulx d4, d3, [x+24]
72 mulx d6, d5, [x+40]
73 mov rdx, [x+24]
74 mulx d8, d7, [x+32]
75
76// Clear our zero register, and also initialize the flags for the carry chain
77
78 xor zeroe, zeroe
79
// Low words are added with adcx (CF chain) and high words with adox
// (OF chain); both chains are closed off into d7/d8 by adding zero.
80// Chain in the addition of 02 + 12 + 13 + 14 + 15 to that window
81// (no carry-out possible since we add it to the top of a product)
82
83 mov rdx, [x+16]
84 mulx rcx, rax, [x]
85 adcx d2, rax
86 adox d3, rcx
87 mulx rcx, rax, [x+8]
88 adcx d3, rax
89 adox d4, rcx
90 mov rdx, [x+8]
91 mulx rcx, rax, [x+24]
92 adcx d4, rax
93 adox d5, rcx
94 mulx rcx, rax, [x+32]
95 adcx d5, rax
96 adox d6, rcx
97 mulx rcx, rax, [x+40]
98 adcx d6, rax
99 adox d7, rcx
100 adcx d7, zero
101 adox d8, zero
102 adcx d8, zero
103
104// Again zero out the flags. Actually they are already cleared but it may
105// help decouple these in the OOO engine not to wait for the chain above
106
107 xor zeroe, zeroe
108
// The products 35 and 45 write their high words straight into d9 and d10.
// "adox d9, zero" is issued before d10 (= rbp = zero) is overwritten; after
// that, rax is loaded with 0 by mov, which does not disturb CF or OF, and
// serves as the zero operand.
109// Now chain in the 04 + 23 + 24 + 25 + 35 + 45 terms
110// We are running out of registers and here our zero register is not zero!
111
112 mov rdx, [x+32]
113 mulx rcx, rax, [x]
114 adcx d4, rax
115 adox d5, rcx
116 mov rdx, [x+16]
117 mulx rcx, rax, [x+24]
118 adcx d5, rax
119 adox d6, rcx
120 mulx rcx, rax, [x+32]
121 adcx d6, rax
122 adox d7, rcx
123 mulx rcx, rax, [x+40]
124 adcx d7, rax
125 adox d8, rcx
126 mov rdx, [x+24]
127 mulx d9, rax, [x+40]
128 adcx d8, rax
129 adox d9, zero
130 mov rdx, [x+32]
131 mulx d10, rax, [x+40]
132 adcx d9, rax
133 mov eax, 0
134 adox d10, rax
135 adcx d10, rax
136
137// Again, just for a clear fresh start for the flags
138
139 xor eax, eax
140
// For each i: "mulx rdx, rax, rdx" squares x[i] (high word in rdx, low word
// in rax). "adcx dk, dk" doubles dk along the CF chain and "adox dk, ..."
// adds the square words along the OF chain. Each x[i] is loaded before
// z[i] is stored.
141// Double and add to the 00 + 11 + 22 + 33 + 44 + 55 terms
142//
143// We could use shift-double but this seems tidier and in larger squarings
144// it was actually more efficient. I haven't experimented with this small
145// case to see how much that matters. Note: the writeback here is sprinkled
146// into the sequence in such a way that things still work if z = x, i.e. if
147// the output overwrites the input buffer and beyond.
148
149 mov rdx, [x]
150 mulx rdx, rax, rdx
151 mov [z], rax
152 adcx d1, d1
153 adox d1, rdx
154 mov rdx, [x+8]
155 mov [z+8], d1
156 mulx rdx, rax, rdx
157 adcx d2, d2
158 adox d2, rax
159 adcx d3, d3
160 adox d3, rdx
161 mov rdx, [x+16]
162 mov [z+16], d2
163 mulx rdx, rax, rdx
164 adcx d4, d4
165 adox d4, rax
166 adcx d5, d5
167 adox d5, rdx
168 mov rdx, [x+24]
169 mov [z+24], d3
170 mulx rdx, rax, rdx
171 adcx d6, d6
172 adox d6, rax
173 adcx d7, d7
174 adox d7, rdx
175 mov rdx, [x+32]
176 mov [z+32], d4
177 mulx rdx, rax, rdx
178 adcx d8, d8
179 adox d8, rax
180 adcx d9, d9
181 adox d9, rdx
182 mov rdx, [x+40]
183 mov [z+40], d5
184 mulx rdx, rax, rdx
// The stores below are interleaved with the tail of the carry chains; mov
// leaves CF and OF intact. rdx (high word of x[5]^2) absorbs the final
// carries from both chains and becomes z[11].
185 mov [z+48], d6
186 adcx d10, d10
187 mov [z+56], d7
188 adox d10, rax
189 mov [z+64], d8
190 mov eax, 0
191 mov [z+72], d9
192 adcx rdx, rax
193 mov [z+80], d10
194 adox rdx, rax
195 mov [z+88], rdx
196
// Pops mirror the pushes above in reverse order.
197// Restore saved registers and return
198
199 pop r15
200 pop r14
201 pop r13
202 pop r12
203 pop rbx
204 pop rbp
205
206#if WINDOWS_ABI
207 pop rsi
208 pop rdi
209#endif
210 ret
211
212#if defined(__linux__) && defined(__ELF__)
213.section .note.GNU-stack,"",%progbits
214#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12_alt.S
new file mode 100644
index 0000000000..a517e5c654
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12_alt.S
@@ -0,0 +1,197 @@
1// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0
3
4// ----------------------------------------------------------------------------
5// Square, z := x^2
6// Input x[6]; output z[12]
7//
8// extern void bignum_sqr_6_12_alt(uint64_t z[static 12],
9// const uint64_t x[static 6]);
10//
11// Standard x86-64 ABI: RDI = z, RSI = x
12// Microsoft x64 ABI: RCX = z, RDX = x
13// ----------------------------------------------------------------------------
14
15#include "_internal_s2n_bignum.h"
16
17 .intel_syntax noprefix
18 S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_6_12_alt)
19 S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_6_12_alt)
20 .text
21
// Register aliases for bignum_sqr_6_12_alt.
// z and x are the registers that carry the two arguments under the standard
// x86-64 ABI; under WINDOWS_ABI the entry code copies the arguments into
// them.
22// Input arguments
23
24#define z rdi
25#define x rsi
26
// At any point the three registers play the roles (carry, high, low) for the
// result column being accumulated; the roles rotate by one register after
// each column is stored.
27// Other variables used as a rotating 3-word window to add terms to
28
29#define t0 r8
30#define t1 r9
31#define t2 r10
32
// (u1,u0) collect the off-diagonal products of one column so that they can
// be doubled together by a single doubladd.
33// Additional temporaries for local windows to share doublings
34
35#define u0 rcx
36#define u1 r11
37
// All macros in this group form products with the one-operand mul, so they
// clobber rax, rdx and the flags (doubladd, which does not multiply,
// clobbers only the flags).
//
// combadd(c,h,l,numa,numb): (c,h,l) += numa * numb, for memory operands
// numa and numb.
38// Macro for the key "multiply and add to (c,h,l)" step
39
40#define combadd(c,h,l,numa,numb) \
41 mov rax, numa; \
42 mul QWORD PTR numb; \
43 add l, rax; \
44 adc h, rdx; \
45 adc c, 0
46
// combaddz overwrites the window instead of adding to it: c := 0 and
// (h,l) := numa * numb.
47// Set up initial window (c,h,l) = numa * numb
48
49#define combaddz(c,h,l,numa,numb) \
50 mov rax, numa; \
51 mul QWORD PTR numb; \
52 xor c, c; \
53 mov l, rax; \
54 mov h, rdx
55
// doubladd doubles (c,hh,ll) in place (hh and ll are modified), then adds
// the doubled hh:ll into h:l with the carry going into c. On entry h:l hold
// the window words carried over from the previous column.
56// Doubling step (c,h,l) = 2 * (c,hh,ll) + (0,h,l)
57
58#define doubladd(c,h,l,hh,ll) \
59 add ll, ll; \
60 adc hh, hh; \
61 adc c, c; \
62 add l, ll; \
63 adc h, hh; \
64 adc c, 0
65
66// Square term incorporation (c,h,l) += numa^2
67
68#define combadd1(c,h,l,numa) \
69 mov rax, numa; \
70 mul rax; \
71 add l, rax; \
72 adc h, rdx; \
73 adc c, 0
74
// combads(h,l,numa): (h,l) += numa^2, with any carry out of h discarded.
75// A short form where we don't expect a top carry
76
77#define combads(h,l,numa) \
78 mov rax, numa; \
79 mul rax; \
80 add l, rax; \
81 adc h, rdx
82
// combadd2(c,h,l,numa,numb): (c,h,l) += 2 * numa * numb. The product is
// doubled in rdx:rax; the bit shifted out of rdx is added to c before the
// doubled product is added into the window.
83// A version doubling directly before adding, for single non-square terms
84
85#define combadd2(c,h,l,numa,numb) \
86 mov rax, numa; \
87 mul QWORD PTR numb; \
88 add rax, rax; \
89 adc rdx, rdx; \
90 adc c, 0; \
91 add l, rax; \
92 adc h, rdx; \
93 adc c, 0
94
// bignum_sqr_6_12_alt: z[0..11] := x[0..5]^2 using only the classic
// mul/add/adc instructions.
// The result is produced column by column. For result term k the products
// x[i]*x[j] with i + j = k and i < j are each computed once and doubled,
// and for even k the square x[k/2]^2 is added. The low word of the
// three-word window is stored to z[k] and the window then rotates.
// Registers used: rax, rdx (mul), rcx, r11 (u0, u1), r8-r10 (window),
// rdi, rsi.
95S2N_BN_SYMBOL(bignum_sqr_6_12_alt):
96 _CET_ENDBR
97
// Microsoft x64 ABI: rdi and rsi are callee-saved there, so preserve them
// and move the arguments (rcx, rdx) into the standard-ABI registers.
98#if WINDOWS_ABI
99 push rdi
100 push rsi
101 mov rdi, rcx
102 mov rsi, rdx
103#endif
104
// Term 0 is x[0]^2; its high word seeds the window.
105// Result term 0
106
107 mov rax, [x]
108 mul rax
109
110 mov [z], rax
111 mov t0, rdx
112 xor t1, t1
113
// Columns with a single off-diagonal product double it directly with
// combadd2.
114// Result term 1
115
116 xor t2, t2
117 combadd2(t2,t1,t0,[x],[x+8])
118 mov [z+8], t0
119
120// Result term 2
121
122 xor t0, t0
123 combadd1(t0,t2,t1,[x+8])
124 combadd2(t0,t2,t1,[x],[x+16])
125 mov [z+16], t1
126
// Columns 3..7 have two or three off-diagonal products: they are summed in
// the local window (carry word, u1, u0), where combaddz also zeroes the new
// carry word, and doubladd then doubles the sum and adds it to the main
// window.
127// Result term 3
128
129 combaddz(t1,u1,u0,[x],[x+24])
130 combadd(t1,u1,u0,[x+8],[x+16])
131 doubladd(t1,t0,t2,u1,u0)
132 mov [z+24], t2
133
134// Result term 4
135
136 combaddz(t2,u1,u0,[x],[x+32])
137 combadd(t2,u1,u0,[x+8],[x+24])
138 doubladd(t2,t1,t0,u1,u0)
139 combadd1(t2,t1,t0,[x+16])
140 mov [z+32], t0
141
142// Result term 5
143
144 combaddz(t0,u1,u0,[x],[x+40])
145 combadd(t0,u1,u0,[x+8],[x+32])
146 combadd(t0,u1,u0,[x+16],[x+24])
147 doubladd(t0,t2,t1,u1,u0)
148 mov [z+40], t1
149
150// Result term 6
151
152 combaddz(t1,u1,u0,[x+8],[x+40])
153 combadd(t1,u1,u0,[x+16],[x+32])
154 doubladd(t1,t0,t2,u1,u0)
155 combadd1(t1,t0,t2,[x+24])
156 mov [z+48], t2
157
158// Result term 7
159
160 combaddz(t2,u1,u0,[x+16],[x+40])
161 combadd(t2,u1,u0,[x+24],[x+32])
162 doubladd(t2,t1,t0,u1,u0)
163 mov [z+56], t0
164
// Back to a single off-diagonal product per column.
165// Result term 8
166
167 xor t0, t0
168 combadd2(t0,t2,t1,[x+24],[x+40])
169 combadd1(t0,t2,t1,[x+32])
170 mov [z+64], t1
171
172// Result term 9
173
174 xor t1, t1
175 combadd2(t1,t0,t2,[x+32],[x+40])
176 mov [z+72], t2
177
// Only x[5]^2 remains; it is added into (t1,t0) with no carry word.
178// Result term 10
179
180 combads(t1,t0,[x+40])
181 mov [z+80], t0
182
// The remaining high word is the top word of the result.
183// Result term 11
184
185 mov [z+88], t1
186
187// Return
188
189#if WINDOWS_ABI
190 pop rsi
191 pop rdi
192#endif
193 ret
194
195#if defined(__linux__) && defined(__ELF__)
196.section .note.GNU-stack,"",%progbits
197#endif