diff options
-rw-r--r-- | src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8.S | 174 | ||||
-rw-r--r-- | src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16.S | 260 | ||||
-rw-r--r-- | src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8.S | 145 | ||||
-rw-r--r-- | src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16.S | 298 |
4 files changed, 877 insertions, 0 deletions
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8.S new file mode 100644 index 0000000000..5e9aebd685 --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8.S | |||
@@ -0,0 +1,174 @@ | |||
1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
2 | // SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 | ||
3 | |||
4 | // ---------------------------------------------------------------------------- | ||
5 | // Multiply z := x * y | ||
6 | // Inputs x[4], y[4]; output z[8] | ||
7 | // | ||
8 | // extern void bignum_mul_4_8(uint64_t z[static 8], const uint64_t x[static 4], | ||
9 | // const uint64_t y[static 4]); | ||
10 | // | ||
11 | // Standard x86-64 ABI: RDI = z, RSI = x, RDX = y | ||
12 | // Microsoft x64 ABI: RCX = z, RDX = x, R8 = y | ||
13 | // ---------------------------------------------------------------------------- | ||
14 | |||
15 | #include "_internal_s2n_bignum.h" | ||
16 | |||
17 | .intel_syntax noprefix | ||
18 | S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_4_8) | ||
19 | S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_4_8) | ||
20 | .text | ||
21 | |||
22 | // z and x stay in the registers in which the standard ABI passes them (RDI, RSI) | ||
23 | |||
24 | #define z rdi | ||
25 | #define x rsi | ||
26 | |||
27 | // y arrives in RDX but is copied here, because RDX is reloaded with each y[j] as the multiplier | ||
28 | |||
29 | #define y rcx | ||
30 | |||
31 | // A register held at zero; it is cleared with xor, which also resets the carry flags | ||
32 | |||
33 | #define zero rbp | ||
34 | #define zeroe ebp | ||
35 | |||
36 | // mulpadd j, i: rbx:rax = x[i] * rdx; the low word is added into window slot (i+j) mod 4 with adcx (CF chain) and the high word into slot (i+j+1) mod 4 with adox (OF chain). Clobbers rax, rbx. | ||
37 | // Slots 0..3 are r8..r11; the .if chain picks them because the assembler has no indexed register names such as reg[i], reg[i+1] | ||
38 | |||
39 | .macro mulpadd arg1,arg2 | ||
40 | mulx rbx, rax, [x+8*\arg2] | ||
41 | .if ((\arg1 + \arg2) % 4 == 0) | ||
42 | adcx r8, rax | ||
43 | adox r9, rbx | ||
44 | .elseif ((\arg1 + \arg2) % 4 == 1) | ||
45 | adcx r9, rax | ||
46 | adox r10, rbx | ||
47 | .elseif ((\arg1 + \arg2) % 4 == 2) | ||
48 | adcx r10, rax | ||
49 | adox r11, rbx | ||
50 | .elseif ((\arg1 + \arg2) % 4 == 3) | ||
51 | adcx r11, rax | ||
52 | adox r8, rbx | ||
53 | .endif | ||
54 | |||
55 | .endm | ||
56 | |||
57 | |||
58 | // addrow j: add y[j] * x into the running product at columns j..j+4. Loads rdx = y[j], clears the zero register and both carry flags, stores the finished low word to z[j] after the first product, then re-creates the slot just vacated as the new top word, folding in the final OF and CF carries | ||
59 | |||
60 | .macro addrow arg1 | ||
61 | mov rdx, [y+8*\arg1] | ||
62 | xor zeroe, zeroe | ||
63 | |||
64 | mulpadd \arg1, 0 | ||
65 | |||
66 | .if (\arg1 % 4 == 0) | ||
67 | mov [z+8*\arg1],r8 | ||
68 | .elseif (\arg1 % 4 == 1) | ||
69 | mov [z+8*\arg1],r9 | ||
70 | .elseif (\arg1 % 4 == 2) | ||
71 | mov [z+8*\arg1],r10 | ||
72 | .elseif (\arg1 % 4 == 3) | ||
73 | mov [z+8*\arg1],r11 | ||
74 | .endif | ||
75 | |||
76 | mulpadd \arg1, 1 | ||
77 | mulpadd \arg1, 2 | ||
78 | |||
79 | .if (\arg1 % 4 == 0) | ||
80 | mulx r8, rax, [x+24] | ||
81 | adcx r11, rax | ||
82 | adox r8, zero | ||
83 | adcx r8, zero | ||
84 | .elseif (\arg1 % 4 == 1) | ||
85 | mulx r9, rax, [x+24] | ||
86 | adcx r8, rax | ||
87 | adox r9, zero | ||
88 | adcx r9, zero | ||
89 | .elseif (\arg1 % 4 == 2) | ||
90 | mulx r10, rax, [x+24] | ||
91 | adcx r9, rax | ||
92 | adox r10, zero | ||
93 | adcx r10, zero | ||
94 | .elseif (\arg1 % 4 == 3) | ||
95 | mulx r11, rax, [x+24] | ||
96 | adcx r10, rax | ||
97 | adox r11, zero | ||
98 | adcx r11, zero | ||
99 | .endif | ||
100 | |||
101 | .endm | ||
102 | |||
103 | |||
104 | |||
105 | S2N_BN_SYMBOL(bignum_mul_4_8): | ||
106 | _CET_ENDBR | ||
107 | |||
108 | #if WINDOWS_ABI | ||
109 | push rdi | ||
110 | push rsi | ||
111 | mov rdi, rcx | ||
112 | mov rsi, rdx | ||
113 | mov rdx, r8 | ||
114 | #endif | ||
115 | |||
116 | // Save the callee-saved registers used below: rbp (the zero register) and rbx (product scratch) | ||
117 | |||
118 | push rbp | ||
119 | push rbx | ||
120 | |||
121 | // Move the y pointer out of rdx, which is needed as the implicit multiplier of mulx | ||
122 | |||
123 | mov y, rdx | ||
124 | |||
125 | // Zero the zero register; the xor also clears CF so the first adcx below sees no stale carry-in | ||
126 | |||
127 | xor zeroe, zeroe | ||
128 | |||
129 | // Do the zeroth row, which is different because there is nothing yet to add into | ||
130 | // Write the low word of x[0] * y[0] to z[0], then chain the remaining partial products on CF alone, | ||
131 | // leaving columns 1..4 of y[0] * x in r9, r10, r11, r8 respectively | ||
132 | |||
133 | mov rdx, [y] | ||
134 | |||
135 | mulx r9, r8, [x] | ||
136 | mov [z], r8 | ||
137 | |||
138 | mulx r10, rbx, [x+8] | ||
139 | adcx r9, rbx | ||
140 | |||
141 | mulx r11, rbx, [x+16] | ||
142 | adcx r10, rbx | ||
143 | |||
144 | mulx r8, rbx, [x+24] | ||
145 | adcx r11, rbx | ||
146 | adcx r8, zero | ||
147 | |||
148 | // Rows 1..3 follow a uniform pattern; each addrow j also stores the finished word z[j] | ||
149 | |||
150 | addrow 1 | ||
151 | addrow 2 | ||
152 | addrow 3 | ||
153 | |||
154 | // The window now holds columns 4..7 in r8..r11; write them to z[4]..z[7] | ||
155 | |||
156 | mov [z+32], r8 | ||
157 | mov [z+40], r9 | ||
158 | mov [z+48], r10 | ||
159 | mov [z+56], r11 | ||
160 | |||
161 | // Restore the saved registers in reverse order of the pushes and return | ||
162 | |||
163 | pop rbx | ||
164 | pop rbp | ||
165 | |||
166 | #if WINDOWS_ABI | ||
167 | pop rsi | ||
168 | pop rdi | ||
169 | #endif | ||
170 | ret | ||
171 | |||
172 | #if defined(__linux__) && defined(__ELF__) | ||
173 | .section .note.GNU-stack,"",%progbits | ||
174 | #endif | ||
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16.S new file mode 100644 index 0000000000..7bfae22a3f --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16.S | |||
@@ -0,0 +1,260 @@ | |||
1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
2 | // SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 | ||
3 | |||
4 | // ---------------------------------------------------------------------------- | ||
5 | // Multiply z := x * y | ||
6 | // Inputs x[8], y[8]; output z[16] | ||
7 | // | ||
8 | // extern void bignum_mul_8_16(uint64_t z[static 16], const uint64_t x[static 8], | ||
9 | // const uint64_t y[static 8]); | ||
10 | // | ||
11 | // Standard x86-64 ABI: RDI = z, RSI = x, RDX = y | ||
12 | // Microsoft x64 ABI: RCX = z, RDX = x, R8 = y | ||
13 | // ---------------------------------------------------------------------------- | ||
14 | |||
15 | #include "_internal_s2n_bignum.h" | ||
16 | |||
17 | .intel_syntax noprefix | ||
18 | S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_8_16) | ||
19 | S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_8_16) | ||
20 | .text | ||
21 | |||
22 | // z and x stay in the registers in which the standard ABI passes them (RDI, RSI) | ||
23 | |||
24 | #define z rdi | ||
25 | #define x rsi | ||
26 | |||
27 | // y arrives in RDX but is copied here, because RDX is reloaded with each y[j] as the multiplier | ||
28 | |||
29 | #define y rcx | ||
30 | |||
31 | // A register held at zero; it is cleared with xor, which also resets the carry flags | ||
32 | |||
33 | #define zero rbp | ||
34 | #define zeroe ebp | ||
35 | |||
36 | // mulpadd i, j: rbx:rax = x[i] * rdx (rdx assumed to hold y[j]); the low word is added into window slot (i+j) mod 8 with adcx (CF chain) and the high word into slot (i+j+1) mod 8 with adox (OF chain). Slots 0..7 are r8..r15. Clobbers rax, rbx. | ||
37 | |||
38 | .macro mulpadd arg1,arg2 | ||
39 | mulx rbx, rax, [x+8*\arg1] | ||
40 | .if ((\arg1 + \arg2) % 8 == 0) | ||
41 | adcx r8, rax | ||
42 | adox r9, rbx | ||
43 | .elseif ((\arg1 + \arg2) % 8 == 1) | ||
44 | adcx r9, rax | ||
45 | adox r10, rbx | ||
46 | .elseif ((\arg1 + \arg2) % 8 == 2) | ||
47 | adcx r10, rax | ||
48 | adox r11, rbx | ||
49 | .elseif ((\arg1 + \arg2) % 8 == 3) | ||
50 | adcx r11, rax | ||
51 | adox r12, rbx | ||
52 | .elseif ((\arg1 + \arg2) % 8 == 4) | ||
53 | adcx r12, rax | ||
54 | adox r13, rbx | ||
55 | .elseif ((\arg1 + \arg2) % 8 == 5) | ||
56 | adcx r13, rax | ||
57 | adox r14, rbx | ||
58 | .elseif ((\arg1 + \arg2) % 8 == 6) | ||
59 | adcx r14, rax | ||
60 | adox r15, rbx | ||
61 | .elseif ((\arg1 + \arg2) % 8 == 7) | ||
62 | adcx r15, rax | ||
63 | adox r8, rbx | ||
64 | .endif | ||
65 | |||
66 | .endm | ||
67 | |||
68 | // mulpade i, j: adds the low word of x[i] * rdx (rdx assumed to hold y[j]) into window slot (i+j) mod 8 with adcx (CF chain), | ||
69 | // but mulx writes the high word straight over slot (i+j+1) mod 8, discarding its old value, and adox then adds only the pending OF carry. Clobbers rax. | ||
70 | |||
71 | .macro mulpade arg1,arg2 | ||
72 | .if ((\arg1 + \arg2) % 8 == 0) | ||
73 | mulx r9, rax, [x+8*\arg1] | ||
74 | adcx r8, rax | ||
75 | adox r9, zero | ||
76 | .elseif ((\arg1 + \arg2) % 8 == 1) | ||
77 | mulx r10, rax, [x+8*\arg1] | ||
78 | adcx r9, rax | ||
79 | adox r10, zero | ||
80 | .elseif ((\arg1 + \arg2) % 8 == 2) | ||
81 | mulx r11, rax, [x+8*\arg1] | ||
82 | adcx r10, rax | ||
83 | adox r11, zero | ||
84 | .elseif ((\arg1 + \arg2) % 8 == 3) | ||
85 | mulx r12, rax, [x+8*\arg1] | ||
86 | adcx r11, rax | ||
87 | adox r12, zero | ||
88 | .elseif ((\arg1 + \arg2) % 8 == 4) | ||
89 | mulx r13, rax, [x+8*\arg1] | ||
90 | adcx r12, rax | ||
91 | adox r13, zero | ||
92 | .elseif ((\arg1 + \arg2) % 8 == 5) | ||
93 | mulx r14, rax, [x+8*\arg1] | ||
94 | adcx r13, rax | ||
95 | adox r14, zero | ||
96 | .elseif ((\arg1 + \arg2) % 8 == 6) | ||
97 | mulx r15, rax, [x+8*\arg1] | ||
98 | adcx r14, rax | ||
99 | adox r15, zero | ||
100 | .elseif ((\arg1 + \arg2) % 8 == 7) | ||
101 | mulx r8, rax, [x+8*\arg1] | ||
102 | adcx r15, rax | ||
103 | adox r8, zero | ||
104 | .endif | ||
105 | |||
106 | .endm | ||
107 | |||
108 | // addrow j: add y[j] * x into the running product at columns j..j+8. Loads rdx = y[j], clears the zero register and both carry flags, stores the finished low word to z[j] after the first product, re-creates the vacated slot as the new top word (mulpade), and finally adds the leftover CF into that top word with adc | ||
109 | |||
110 | .macro addrow arg1 | ||
111 | mov rdx, [y+8*\arg1] | ||
112 | xor zeroe, zeroe | ||
113 | |||
114 | mulpadd 0, \arg1 | ||
115 | |||
116 | .if (\arg1 % 8 == 0) | ||
117 | mov [z+8*\arg1],r8 | ||
118 | .elseif (\arg1 % 8 == 1) | ||
119 | mov [z+8*\arg1],r9 | ||
120 | .elseif (\arg1 % 8 == 2) | ||
121 | mov [z+8*\arg1],r10 | ||
122 | .elseif (\arg1 % 8 == 3) | ||
123 | mov [z+8*\arg1],r11 | ||
124 | .elseif (\arg1 % 8 == 4) | ||
125 | mov [z+8*\arg1],r12 | ||
126 | .elseif (\arg1 % 8 == 5) | ||
127 | mov [z+8*\arg1],r13 | ||
128 | .elseif (\arg1 % 8 == 6) | ||
129 | mov [z+8*\arg1],r14 | ||
130 | .elseif (\arg1 % 8 == 7) | ||
131 | mov [z+8*\arg1],r15 | ||
132 | .endif | ||
133 | |||
134 | mulpadd 1, \arg1 | ||
135 | mulpadd 2, \arg1 | ||
136 | mulpadd 3, \arg1 | ||
137 | mulpadd 4, \arg1 | ||
138 | mulpadd 5, \arg1 | ||
139 | mulpadd 6, \arg1 | ||
140 | mulpade 7, \arg1 | ||
141 | |||
142 | .if (\arg1 % 8 == 0) | ||
143 | adc r8, zero | ||
144 | .elseif (\arg1 % 8 == 1) | ||
145 | adc r9, zero | ||
146 | .elseif (\arg1 % 8 == 2) | ||
147 | adc r10, zero | ||
148 | .elseif (\arg1 % 8 == 3) | ||
149 | adc r11, zero | ||
150 | .elseif (\arg1 % 8 == 4) | ||
151 | adc r12, zero | ||
152 | .elseif (\arg1 % 8 == 5) | ||
153 | adc r13, zero | ||
154 | .elseif (\arg1 % 8 == 6) | ||
155 | adc r14, zero | ||
156 | .elseif (\arg1 % 8 == 7) | ||
157 | adc r15, zero | ||
158 | .endif | ||
159 | |||
160 | .endm | ||
161 | |||
162 | |||
163 | S2N_BN_SYMBOL(bignum_mul_8_16): | ||
164 | _CET_ENDBR | ||
165 | |||
166 | #if WINDOWS_ABI | ||
167 | push rdi | ||
168 | push rsi | ||
169 | mov rdi, rcx | ||
170 | mov rsi, rdx | ||
171 | mov rdx, r8 | ||
172 | #endif | ||
173 | |||
174 | // Save the callee-saved registers used below: rbp (the zero register), rbx (product scratch) and r12..r15 (upper half of the window) | ||
175 | |||
176 | push rbp | ||
177 | push rbx | ||
178 | push r12 | ||
179 | push r13 | ||
180 | push r14 | ||
181 | push r15 | ||
182 | |||
183 | // Move the y pointer out of rdx, which is needed as the implicit multiplier of mulx | ||
184 | |||
185 | mov y, rdx | ||
186 | |||
187 | // Zero the zero register; the xor also clears CF so the first adc below sees no stale carry-in | ||
188 | |||
189 | xor zeroe, zeroe | ||
190 | |||
191 | // Do the zeroth row, which is different because there is nothing yet to add into | ||
192 | // Write the low word of x[0] * y[0] to z[0], then chain the remaining partial products with plain adc, | ||
193 | // leaving columns 1..8 of y[0] * x in r9, r10, r11, r12, r13, r14, r15, r8 respectively | ||
194 | |||
195 | mov rdx, [y] | ||
196 | |||
197 | mulx r9, r8, [x] | ||
198 | mov [z], r8 | ||
199 | |||
200 | mulx r10, rbx, [x+8] | ||
201 | adc r9, rbx | ||
202 | |||
203 | mulx r11, rbx, [x+16] | ||
204 | adc r10, rbx | ||
205 | |||
206 | mulx r12, rbx, [x+24] | ||
207 | adc r11, rbx | ||
208 | |||
209 | mulx r13, rbx, [x+32] | ||
210 | adc r12, rbx | ||
211 | |||
212 | mulx r14, rbx, [x+40] | ||
213 | adc r13, rbx | ||
214 | |||
215 | mulx r15, rbx, [x+48] | ||
216 | adc r14, rbx | ||
217 | |||
218 | mulx r8, rbx, [x+56] | ||
219 | adc r15, rbx | ||
220 | adc r8, zero | ||
221 | |||
222 | // Rows 1..7 follow a uniform pattern; each addrow j also stores the finished word z[j] | ||
223 | |||
224 | addrow 1 | ||
225 | addrow 2 | ||
226 | addrow 3 | ||
227 | addrow 4 | ||
228 | addrow 5 | ||
229 | addrow 6 | ||
230 | addrow 7 | ||
231 | |||
232 | // The window now holds columns 8..15 in r8..r15; write them to z[8]..z[15] | ||
233 | |||
234 | mov [z+64], r8 | ||
235 | mov [z+72], r9 | ||
236 | mov [z+80], r10 | ||
237 | mov [z+88], r11 | ||
238 | mov [z+96], r12 | ||
239 | mov [z+104], r13 | ||
240 | mov [z+112], r14 | ||
241 | mov [z+120], r15 | ||
242 | |||
243 | // Restore the saved registers in reverse order of the pushes and return | ||
244 | |||
245 | pop r15 | ||
246 | pop r14 | ||
247 | pop r13 | ||
248 | pop r12 | ||
249 | pop rbx | ||
250 | pop rbp | ||
251 | |||
252 | #if WINDOWS_ABI | ||
253 | pop rsi | ||
254 | pop rdi | ||
255 | #endif | ||
256 | ret | ||
257 | |||
258 | #if defined(__linux__) && defined(__ELF__) | ||
259 | .section .note.GNU-stack,"",%progbits | ||
260 | #endif | ||
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8.S new file mode 100644 index 0000000000..205455bd2c --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8.S | |||
@@ -0,0 +1,145 @@ | |||
1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
2 | // SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 | ||
3 | |||
4 | // ---------------------------------------------------------------------------- | ||
5 | // Square, z := x^2 | ||
6 | // Input x[4]; output z[8] | ||
7 | // | ||
8 | // extern void bignum_sqr_4_8(uint64_t z[static 8], const uint64_t x[static 4]); | ||
9 | // | ||
10 | // Standard x86-64 ABI: RDI = z, RSI = x | ||
11 | // Microsoft x64 ABI: RCX = z, RDX = x | ||
12 | // ---------------------------------------------------------------------------- | ||
13 | |||
14 | #include "_internal_s2n_bignum.h" | ||
15 | |||
16 | .intel_syntax noprefix | ||
17 | S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_4_8) | ||
18 | S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_4_8) | ||
19 | .text | ||
20 | |||
21 | // z and x stay in the registers in which the standard ABI passes them (RDI, RSI) | ||
22 | |||
23 | #define z rdi | ||
24 | #define x rsi | ||
25 | |||
26 | // A register held at zero; it is cleared with xor, which also resets the carry flags | ||
27 | |||
28 | #define zero rbp | ||
29 | #define zeroe ebp | ||
30 | |||
31 | // Accumulator words: once the cross products are summed, d1..d6 hold columns 1..6 of the sum of x[i] * x[j] over i < j | ||
32 | |||
33 | #define d1 r8 | ||
34 | #define d2 r9 | ||
35 | #define d3 r10 | ||
36 | #define d4 r11 | ||
37 | #define d5 r12 | ||
38 | #define d6 r13 | ||
39 | |||
40 | |||
41 | |||
42 | S2N_BN_SYMBOL(bignum_sqr_4_8): | ||
43 | _CET_ENDBR | ||
44 | |||
45 | #if WINDOWS_ABI | ||
46 | push rdi | ||
47 | push rsi | ||
48 | mov rdi, rcx | ||
49 | mov rsi, rdx | ||
50 | #endif | ||
51 | |||
52 | // Save the callee-saved registers used below: rbp (the zero register), r12 and r13 (d5 and d6) | ||
53 | |||
54 | push rbp | ||
55 | push r12 | ||
56 | push r13 | ||
57 | |||
58 | // Set up an initial window [d6;...;d1] = [23;03;01], where ij denotes the two-word product x[i] * x[j]: d2:d1 = x[0]*x[1], d4:d3 = x[0]*x[3], d6:d5 = x[2]*x[3] | ||
59 | |||
60 | mov rdx, [x] | ||
61 | mulx d2, d1, [x+8] | ||
62 | mulx d4, d3, [x+24] | ||
63 | mov rdx, [x+16] | ||
64 | mulx d6, d5, [x+24] | ||
65 | |||
66 | // Clear our zero register, and also clear CF and OF to start the two carry chains | ||
67 | |||
68 | xor zeroe, zeroe | ||
69 | |||
70 | // Chain in the addition of 02 + 12 + 13 to that window (no carry-out possible); rdx still holds x[2] for the first two products and is then reloaded with x[3] | ||
71 | // This gives all the off-diagonal ("heterogeneous") terms x[i] * x[j], i < j, of the squaring, ready to double | ||
72 | |||
73 | mulx rcx, rax, [x] | ||
74 | adcx d2, rax | ||
75 | adox d3, rcx | ||
76 | mulx rcx, rax, [x+8] | ||
77 | adcx d3, rax | ||
78 | adox d4, rcx | ||
79 | mov rdx, [x+24] | ||
80 | mulx rcx, rax, [x+8] | ||
81 | adcx d4, rax | ||
82 | adox d5, rcx | ||
83 | adcx d5, zero | ||
84 | adox d6, zero | ||
85 | adcx d6, zero | ||
86 | |||
87 | // In principle this is redundant, as the CF and OF carries are fully absorbed at this point | ||
88 | // However it seems helpful for the out-of-order engine to be told it's a fresh start | ||
89 | |||
90 | xor zeroe, zeroe | ||
91 | |||
92 | // Double the window and add the diagonal terms 00 + 11 + 22 + 33: adcx doubles each word (CF chain) and adox adds in the words of each square (OF chain) | ||
93 | // | ||
94 | // We could use shift-double but this seems tidier and in larger squarings | ||
95 | // it was actually more efficient. I haven't experimented with this small | ||
96 | // case to see how much that matters. Note: the writeback here is sprinkled | ||
97 | // into the sequence in such a way that things still work if z = x, i.e. if | ||
98 | // the output overwrites the input buffer and the words beyond it (each x[i] is loaded before z[i] is stored). | ||
99 | |||
100 | mov rdx, [x] | ||
101 | mulx rdx, rax, rdx | ||
102 | mov [z], rax | ||
103 | adcx d1, d1 | ||
104 | adox d1, rdx | ||
105 | mov rdx, [x+8] | ||
106 | mov [z+8], d1 | ||
107 | mulx rdx, rax, rdx | ||
108 | adcx d2, d2 | ||
109 | adox d2, rax | ||
110 | adcx d3, d3 | ||
111 | adox d3, rdx | ||
112 | mov rdx, [x+16] | ||
113 | mov [z+16], d2 | ||
114 | mulx rdx, rax, rdx | ||
115 | adcx d4, d4 | ||
116 | adox d4, rax | ||
117 | adcx d5, d5 | ||
118 | adox d5, rdx | ||
119 | mov rdx, [x+24] | ||
120 | mov [z+24], d3 | ||
121 | mulx rdx, rax, rdx | ||
122 | mov [z+32], d4 | ||
123 | adcx d6, d6 | ||
124 | mov [z+40], d5 | ||
125 | adox d6, rax | ||
126 | mov [z+48], d6 | ||
127 | adcx rdx, zero | ||
128 | adox rdx, zero | ||
129 | mov [z+56], rdx | ||
130 | |||
131 | // Restore the saved registers in reverse order of the pushes and return | ||
132 | |||
133 | pop r13 | ||
134 | pop r12 | ||
135 | pop rbp | ||
136 | |||
137 | #if WINDOWS_ABI | ||
138 | pop rsi | ||
139 | pop rdi | ||
140 | #endif | ||
141 | ret | ||
142 | |||
143 | #if defined(__linux__) && defined(__ELF__) | ||
144 | .section .note.GNU-stack,"",%progbits | ||
145 | #endif | ||
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16.S new file mode 100644 index 0000000000..77741c603e --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16.S | |||
@@ -0,0 +1,298 @@ | |||
1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
2 | // SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 | ||
3 | |||
4 | // ---------------------------------------------------------------------------- | ||
5 | // Square, z := x^2 | ||
6 | // Input x[8]; output z[16] | ||
7 | // | ||
8 | // extern void bignum_sqr_8_16(uint64_t z[static 16], const uint64_t x[static 8]); | ||
9 | // | ||
10 | // Standard x86-64 ABI: RDI = z, RSI = x | ||
11 | // Microsoft x64 ABI: RCX = z, RDX = x | ||
12 | // ---------------------------------------------------------------------------- | ||
13 | |||
14 | #include "_internal_s2n_bignum.h" | ||
15 | |||
16 | .intel_syntax noprefix | ||
17 | S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_8_16) | ||
18 | S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_8_16) | ||
19 | .text | ||
20 | |||
21 | // z and x stay in the registers in which the standard ABI passes them (RDI, RSI) | ||
22 | |||
23 | #define z rdi | ||
24 | #define x rsi | ||
25 | |||
26 | // A register held at zero; it is cleared with xor, which also resets the carry flags | ||
27 | |||
28 | #define zero rbp | ||
29 | #define zeroe ebp | ||
30 | |||
31 | // mulpadd i, j: rcx:rax = rdx * x[i] (rdx assumed to hold x[j]); the low word is added into window slot (i+j) mod 8 with adcx (CF chain) and the high word into slot (i+j+1) mod 8 with adox (OF chain). Slots 0..7 are r8..r15. Clobbers rax, rcx. | ||
32 | |||
33 | .macro mulpadd arg1,arg2 | ||
34 | mulx rcx, rax, [x+8*\arg1] | ||
35 | .if ((\arg1 + \arg2) % 8 == 0) | ||
36 | adcx r8, rax | ||
37 | adox r9, rcx | ||
38 | .elseif ((\arg1 + \arg2) % 8 == 1) | ||
39 | adcx r9, rax | ||
40 | adox r10, rcx | ||
41 | .elseif ((\arg1 + \arg2) % 8 == 2) | ||
42 | adcx r10, rax | ||
43 | adox r11, rcx | ||
44 | .elseif ((\arg1 + \arg2) % 8 == 3) | ||
45 | adcx r11, rax | ||
46 | adox r12, rcx | ||
47 | .elseif ((\arg1 + \arg2) % 8 == 4) | ||
48 | adcx r12, rax | ||
49 | adox r13, rcx | ||
50 | .elseif ((\arg1 + \arg2) % 8 == 5) | ||
51 | adcx r13, rax | ||
52 | adox r14, rcx | ||
53 | .elseif ((\arg1 + \arg2) % 8 == 6) | ||
54 | adcx r14, rax | ||
55 | adox r15, rcx | ||
56 | .elseif ((\arg1 + \arg2) % 8 == 7) | ||
57 | adcx r15, rax | ||
58 | adox r8, rcx | ||
59 | .endif | ||
60 | |||
61 | .endm | ||
62 | |||
63 | // mulpade i, j: adds the low word of rdx * x[i] (rdx assumed to hold x[j]) into window slot (i+j) mod 8 with adcx (CF chain), | ||
64 | // but mulx writes the high word straight over slot (i+j+1) mod 8, discarding its old value, and adox then adds only the pending OF carry. Clobbers rax. | ||
65 | |||
66 | .macro mulpade arg1,arg2 | ||
67 | .if ((\arg1 + \arg2) % 8 == 0) | ||
68 | mulx r9, rax, [x+8*\arg1] | ||
69 | adcx r8, rax | ||
70 | adox r9, zero | ||
71 | .elseif ((\arg1 + \arg2) % 8 == 1) | ||
72 | mulx r10, rax, [x+8*\arg1] | ||
73 | adcx r9, rax | ||
74 | adox r10, zero | ||
75 | .elseif ((\arg1 + \arg2) % 8 == 2) | ||
76 | mulx r11, rax, [x+8*\arg1] | ||
77 | adcx r10, rax | ||
78 | adox r11, zero | ||
79 | .elseif ((\arg1 + \arg2) % 8 == 3) | ||
80 | mulx r12, rax, [x+8*\arg1] | ||
81 | adcx r11, rax | ||
82 | adox r12, zero | ||
83 | .elseif ((\arg1 + \arg2) % 8 == 4) | ||
84 | mulx r13, rax, [x+8*\arg1] | ||
85 | adcx r12, rax | ||
86 | adox r13, zero | ||
87 | .elseif ((\arg1 + \arg2) % 8 == 5) | ||
88 | mulx r14, rax, [x+8*\arg1] | ||
89 | adcx r13, rax | ||
90 | adox r14, zero | ||
91 | .elseif ((\arg1 + \arg2) % 8 == 6) | ||
92 | mulx r15, rax, [x+8*\arg1] | ||
93 | adcx r14, rax | ||
94 | adox r15, zero | ||
95 | .elseif ((\arg1 + \arg2) % 8 == 7) | ||
96 | mulx r8, rax, [x+8*\arg1] | ||
97 | adcx r15, rax | ||
98 | adox r8, zero | ||
99 | .endif | ||
100 | |||
101 | .endm | ||
102 | |||
103 | .macro diagonals | ||
104 | |||
105 | xor zeroe, zeroe | ||
106 | |||
107 | // First diagonal = 10 + 20 + 30 + 40 + 50 + 60 + 70 (ij denotes x[i] * x[j]): columns 1 and 2 are written back to z[1], z[2] and columns 3..8 are left in r11, r12, r13, r14, r15, r8 | ||
108 | |||
109 | mov rdx, [x] | ||
110 | mulx rax, r9, [x+8] | ||
111 | mov [z+8], r9 | ||
112 | mulx rcx, r10, [x+16] | ||
113 | adcx r10, rax | ||
114 | mov [z+16], r10 | ||
115 | mulx rax, r11, [x+24] | ||
116 | adcx r11, rcx | ||
117 | mulx rcx, r12, [x+32] | ||
118 | adcx r12, rax | ||
119 | mulx rax, r13, [x+40] | ||
120 | adcx r13, rcx | ||
121 | mulx rcx, r14, [x+48] | ||
122 | adcx r14, rax | ||
123 | mulx r8, r15, [x+56] | ||
124 | adcx r15, rcx | ||
125 | adcx r8, zero | ||
126 | |||
127 | // Add in the next diagonal = 21 + 31 + 41 + 51 + 61 + 71 + 54; writes back z[3], z[4] and extends the window up to column 10 (r9, r10) | ||
128 | |||
129 | xor zeroe, zeroe | ||
130 | mov rdx, [x+8] | ||
131 | mulpadd 2, 1 | ||
132 | mov [z+24], r11 | ||
133 | mulpadd 3, 1 | ||
134 | mov [z+32], r12 | ||
135 | mulpadd 4, 1 | ||
136 | mulpadd 5, 1 | ||
137 | mulpadd 6, 1 | ||
138 | mulpade 7, 1 | ||
139 | mov rdx, [x+32] | ||
140 | mulpade 5, 4 | ||
141 | adcx r10, zero | ||
142 | |||
143 | // And the next one = 32 + 42 + 52 + 62 + 72 + 64 + 65; writes back z[5], z[6] and extends the window up to column 12 (r11, r12) | ||
144 | |||
145 | xor zeroe, zeroe | ||
146 | mov rdx, [x+16] | ||
147 | mulpadd 3, 2 | ||
148 | mov [z+40], r13 | ||
149 | mulpadd 4, 2 | ||
150 | mov [z+48], r14 | ||
151 | mulpadd 5, 2 | ||
152 | mulpadd 6, 2 | ||
153 | mulpadd 7, 2 | ||
154 | mov rdx, [x+48] | ||
155 | mulpade 4, 6 | ||
156 | mulpade 5, 6 | ||
157 | adcx r12, zero | ||
158 | |||
159 | // And the final one = 43 + 53 + 63 + 73 + 74 + 75 + 76; writes back z[7], z[8] and extends the window up to column 14 (r13, r14) | ||
160 | |||
161 | xor zeroe, zeroe | ||
162 | mov rdx, [x+24] | ||
163 | mulpadd 4, 3 | ||
164 | mov [z+56], r15 | ||
165 | mulpadd 5, 3 | ||
166 | mov [z+64], r8 | ||
167 | mulpadd 6, 3 | ||
168 | mulpadd 7, 3 | ||
169 | mov rdx, [x+56] | ||
170 | mulpadd 4, 7 | ||
171 | mulpade 5, 7 | ||
172 | mulpade 6, 7 | ||
173 | adcx r14, zero | ||
174 | |||
175 | // Double the off-diagonal sum and add the squares x[i]^2: adcx doubles each word (CF chain), adox adds the words of each square (OF chain). Columns 1..8 are read back from z[1]..z[8]; | ||
176 | // columns 9..14 are still in r9..r14, and r15 (already stored to z[7]) is reused for the high word of x[7]^2, which becomes column 15 | ||
177 | |||
178 | xor zeroe, zeroe | ||
179 | mov rdx, [x] | ||
180 | mulx rcx, rax, rdx | ||
181 | mov [z], rax | ||
182 | mov rax, [z+8] | ||
183 | adcx rax, rax | ||
184 | adox rax, rcx | ||
185 | mov [z+8], rax | ||
186 | |||
187 | mov rax, [z+16] | ||
188 | mov rdx, [x+8] | ||
189 | mulx rcx, rdx, rdx | ||
190 | adcx rax, rax | ||
191 | adox rax, rdx | ||
192 | mov [z+16], rax | ||
193 | mov rax, [z+24] | ||
194 | adcx rax, rax | ||
195 | adox rax, rcx | ||
196 | mov [z+24], rax | ||
197 | |||
198 | mov rax, [z+32] | ||
199 | mov rdx, [x+16] | ||
200 | mulx rcx, rdx, rdx | ||
201 | adcx rax, rax | ||
202 | adox rax, rdx | ||
203 | mov [z+32], rax | ||
204 | mov rax, [z+40] | ||
205 | adcx rax, rax | ||
206 | adox rax, rcx | ||
207 | mov [z+40], rax | ||
208 | |||
209 | mov rax, [z+48] | ||
210 | mov rdx, [x+24] | ||
211 | mulx rcx, rdx, rdx | ||
212 | adcx rax, rax | ||
213 | adox rax, rdx | ||
214 | mov [z+48], rax | ||
215 | mov rax, [z+56] | ||
216 | adcx rax, rax | ||
217 | adox rax, rcx | ||
218 | mov [z+56], rax | ||
219 | |||
220 | mov rax, [z+64] | ||
221 | mov rdx, [x+32] | ||
222 | mulx rcx, rdx, rdx | ||
223 | adcx rax, rax | ||
224 | adox rax, rdx | ||
225 | mov [z+64], rax | ||
226 | adcx r9, r9 | ||
227 | adox r9, rcx | ||
228 | mov [z+72], r9 | ||
229 | |||
230 | mov rdx, [x+40] | ||
231 | mulx rcx, rdx, rdx | ||
232 | adcx r10, r10 | ||
233 | adox r10, rdx | ||
234 | mov [z+80], r10 | ||
235 | adcx r11, r11 | ||
236 | adox r11, rcx | ||
237 | mov [z+88], r11 | ||
238 | |||
239 | mov rdx, [x+48] | ||
240 | mulx rcx, rdx, rdx | ||
241 | adcx r12, r12 | ||
242 | adox r12, rdx | ||
243 | mov [z+96], r12 | ||
244 | adcx r13, r13 | ||
245 | adox r13, rcx | ||
246 | mov [z+104], r13 | ||
247 | |||
248 | mov rdx, [x+56] | ||
249 | mulx r15, rdx, rdx | ||
250 | adcx r14, r14 | ||
251 | adox r14, rdx | ||
252 | mov [z+112], r14 | ||
253 | adcx r15, zero | ||
254 | adox r15, zero | ||
255 | mov [z+120], r15 | ||
256 | |||
257 | .endm | ||
258 | |||
259 | |||
260 | S2N_BN_SYMBOL(bignum_sqr_8_16): | ||
261 | _CET_ENDBR | ||
262 | |||
263 | #if WINDOWS_ABI | ||
264 | push rdi | ||
265 | push rsi | ||
266 | mov rdi, rcx | ||
267 | mov rsi, rdx | ||
268 | #endif | ||
269 | |||
270 | // Save the callee-saved registers used below: rbp (the zero register) and r12..r15 (upper half of the window) | ||
271 | |||
272 | push rbp | ||
273 | push r12 | ||
274 | push r13 | ||
275 | push r14 | ||
276 | push r15 | ||
277 | |||
278 | // Do the whole squaring; the diagonals macro is expanded inline here and writes z[0]..z[15] | ||
279 | |||
280 | diagonals | ||
281 | |||
282 | // Restore the saved registers in reverse order of the pushes and return | ||
283 | |||
284 | pop r15 | ||
285 | pop r14 | ||
286 | pop r13 | ||
287 | pop r12 | ||
288 | pop rbp | ||
289 | |||
290 | #if WINDOWS_ABI | ||
291 | pop rsi | ||
292 | pop rdi | ||
293 | #endif | ||
294 | ret | ||
295 | |||
296 | #if defined(__linux__) && defined(__ELF__) | ||
297 | .section .note.GNU-stack,"",%progbits | ||
298 | #endif | ||