diff options
author | jsing <> | 2025-08-12 10:09:46 +0000 |
---|---|---|
committer | jsing <> | 2025-08-12 10:09:46 +0000 |
commit | e5f4dc854c2c390526097888bf3654db21bdbb15 (patch) | |
tree | 25b70579b17511a42354b46d22fd95e599a86b6c /src | |
parent | f111c1e299493f0bdf21f9ff08f046d429764e75 (diff) | |
download | openbsd-e5f4dc854c2c390526097888bf3654db21bdbb15.tar.gz openbsd-e5f4dc854c2c390526097888bf3654db21bdbb15.tar.bz2 openbsd-e5f4dc854c2c390526097888bf3654db21bdbb15.zip |
Bring in bignum_{mul,sqr}_6_12{,_alt}() from s2n-bignum.
These provide fast multiplication and squaring of 6-word inputs,
producing a 12-word result. The non-_alt versions require the CPU to
support ADX instructions (and MULX from BMI2), while the _alt versions do not.
Diffstat (limited to 'src')
-rw-r--r-- | src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12.S | 210 | ||||
-rw-r--r-- | src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12_alt.S | 186 | ||||
-rw-r--r-- | src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12.S | 214 | ||||
-rw-r--r-- | src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12_alt.S | 197 |
4 files changed, 807 insertions, 0 deletions
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12.S new file mode 100644 index 0000000000..081b602cde --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12.S | |||
@@ -0,0 +1,210 @@ | |||
1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
2 | // SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 | ||
3 | |||
4 | // ---------------------------------------------------------------------------- | ||
5 | // Multiply z := x * y | ||
6 | // Inputs x[6], y[6]; output z[12] | ||
7 | // | ||
8 | // extern void bignum_mul_6_12(uint64_t z[static 12], const uint64_t x[static 6], | ||
9 | // const uint64_t y[static 6]); | ||
10 | // | ||
11 | // Standard x86-64 ABI: RDI = z, RSI = x, RDX = y | ||
12 | // Microsoft x64 ABI: RCX = z, RDX = x, R8 = y | ||
13 | // ---------------------------------------------------------------------------- | ||
14 | |||
15 | #include "_internal_s2n_bignum.h" | ||
16 | |||
17 | .intel_syntax noprefix | ||
18 | S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_6_12) | ||
19 | S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_6_12) | ||
20 | .text | ||
21 | |||
22 | // Under the standard ABI z and x already arrive in these registers (the WINDOWS_ABI prologue moves them here) | ||
23 | |||
24 | #define z rdi | ||
25 | #define x rsi | ||
26 | |||
27 | // y arrives in rdx and is copied to rcx at function entry, because mulx uses rdx as its implicit multiplicand | ||
28 | |||
29 | #define y rcx | ||
30 | |||
31 | // A register kept at zero; the 32-bit alias is used for the xor that zeroes it and clears the flags | ||
32 | |||
33 | #define zero rbp | ||
34 | #define zeroe ebp | ||
35 | |||
36 | // Add x[arg2] * rdx into the rotating window r8..r13: low word into slot (arg1+arg2) mod 6 via adcx (CF chain), high word into the next slot (wrapping r13 -> r8) via adox (OF chain) | ||
37 | // Clobbers rax and rbx. Would be nice to have conditional expressions reg[i], reg[i+1] ... instead of the .if ladder | ||
38 | |||
39 | .macro mulpadd arg1,arg2 | ||
40 | mulx rbx, rax, [x+8*\arg2] | ||
41 | .if ((\arg1 + \arg2) % 6 == 0) | ||
42 | adcx r8, rax | ||
43 | adox r9, rbx | ||
44 | .elseif ((\arg1 + \arg2) % 6 == 1) | ||
45 | adcx r9, rax | ||
46 | adox r10, rbx | ||
47 | .elseif ((\arg1 + \arg2) % 6 == 2) | ||
48 | adcx r10, rax | ||
49 | adox r11, rbx | ||
50 | .elseif ((\arg1 + \arg2) % 6 == 3) | ||
51 | adcx r11, rax | ||
52 | adox r12, rbx | ||
53 | .elseif ((\arg1 + \arg2) % 6 == 4) | ||
54 | adcx r12, rax | ||
55 | adox r13, rbx | ||
56 | .elseif ((\arg1 + \arg2) % 6 == 5) | ||
57 | adcx r13, rax | ||
58 | adox r8, rbx | ||
59 | .endif | ||
60 | |||
61 | .endm | ||
62 | |||
63 | |||
64 | // Add in the whole row arg1: load rdx = y[arg1], zero rbp and clear the flags, accumulate x[0..5] * rdx into the window, store the finished word z[arg1], and reuse that word's register as the new top word (absorbing both carry chains) | ||
65 | |||
66 | .macro addrow arg1 | ||
67 | mov rdx, [y+8*\arg1] | ||
68 | xor zeroe, zeroe | ||
69 | |||
70 | mulpadd \arg1, 0 | ||
71 | |||
72 | .if (\arg1 % 6 == 0) | ||
73 | mov [z+8*\arg1],r8 | ||
74 | .elseif (\arg1 % 6 == 1) | ||
75 | mov [z+8*\arg1],r9 | ||
76 | .elseif (\arg1 % 6 == 2) | ||
77 | mov [z+8*\arg1],r10 | ||
78 | .elseif (\arg1 % 6 == 3) | ||
79 | mov [z+8*\arg1],r11 | ||
80 | .elseif (\arg1 % 6 == 4) | ||
81 | mov [z+8*\arg1],r12 | ||
82 | .elseif (\arg1 % 6 == 5) | ||
83 | mov [z+8*\arg1],r13 | ||
84 | .endif | ||
85 | |||
86 | mulpadd \arg1, 1 | ||
87 | mulpadd \arg1, 2 | ||
88 | mulpadd \arg1, 3 | ||
89 | mulpadd \arg1, 4 | ||
90 | |||
91 | .if (\arg1 % 6 == 0) | ||
92 | mulx r8, rax, [x+40] | ||
93 | adcx r13, rax | ||
94 | adox r8, zero | ||
95 | adcx r8, zero | ||
96 | .elseif (\arg1 % 6 == 1) | ||
97 | mulx r9, rax, [x+40] | ||
98 | adcx r8, rax | ||
99 | adox r9, zero | ||
100 | adcx r9, zero | ||
101 | .elseif (\arg1 % 6 == 2) | ||
102 | mulx r10, rax, [x+40] | ||
103 | adcx r9, rax | ||
104 | adox r10, zero | ||
105 | adcx r10, zero | ||
106 | .elseif (\arg1 % 6 == 3) | ||
107 | mulx r11, rax, [x+40] | ||
108 | adcx r10, rax | ||
109 | adox r11, zero | ||
110 | adcx r11, zero | ||
111 | .elseif (\arg1 % 6 == 4) | ||
112 | mulx r12, rax, [x+40] | ||
113 | adcx r11, rax | ||
114 | adox r12, zero | ||
115 | adcx r12, zero | ||
116 | .elseif (\arg1 % 6 == 5) | ||
117 | mulx r13, rax, [x+40] | ||
118 | adcx r12, rax | ||
119 | adox r13, zero | ||
120 | adcx r13, zero | ||
121 | .endif | ||
122 | |||
123 | .endm | ||
124 | |||
125 | |||
126 | |||
127 | S2N_BN_SYMBOL(bignum_mul_6_12): | ||
128 | _CET_ENDBR | ||
129 | |||
130 | #if WINDOWS_ABI | ||
131 | push rdi | ||
132 | push rsi | ||
133 | mov rdi, rcx | ||
134 | mov rsi, rdx | ||
135 | mov rdx, r8 | ||
136 | #endif | ||
137 | |||
138 | // Save the callee-saved registers used below: rbp (zero), rbx (scratch), r12 and r13 (window) | ||
139 | |||
140 | push rbp | ||
141 | push rbx | ||
142 | push r12 | ||
143 | push r13 | ||
144 | |||
145 | // Copy y out of rdx, which is about to be reloaded with each y[j] as the implicit mulx operand | ||
146 | |||
147 | mov y, rdx | ||
148 | |||
149 | // Zero a register, which also makes sure we don't get a fake carry-in on the first adcx chain | ||
150 | |||
151 | xor zeroe, zeroe | ||
152 | |||
153 | // Do the zeroth row, which is a bit different: only the adcx (CF) chain is needed | ||
154 | // Write back the zero-zero product's low word and then accumulate | ||
155 | // [r8;r13;r12;r11;r10;r9] as words 6..1 of y[0] * x | ||
156 | |||
157 | mov rdx, [y] | ||
158 | |||
159 | mulx r9, r8, [x] | ||
160 | mov [z], r8 | ||
161 | |||
162 | mulx r10, rbx, [x+8] | ||
163 | adcx r9, rbx | ||
164 | |||
165 | mulx r11, rbx, [x+16] | ||
166 | adcx r10, rbx | ||
167 | |||
168 | mulx r12, rbx, [x+24] | ||
169 | adcx r11, rbx | ||
170 | |||
171 | mulx r13, rbx, [x+32] | ||
172 | adcx r12, rbx | ||
173 | |||
174 | mulx r8, rbx, [x+40] | ||
175 | adcx r13, rbx | ||
176 | adcx r8, zero | ||
177 | |||
178 | // Now all the other rows in a uniform pattern; each addrow j stores z[j] and rotates the window by one register | ||
179 | |||
180 | addrow 1 | ||
181 | addrow 2 | ||
182 | addrow 3 | ||
183 | addrow 4 | ||
184 | addrow 5 | ||
185 | |||
186 | // Now write back the additional columns: the six window registers become z[6..11] | ||
187 | |||
188 | mov [z+48], r8 | ||
189 | mov [z+56], r9 | ||
190 | mov [z+64], r10 | ||
191 | mov [z+72], r11 | ||
192 | mov [z+80], r12 | ||
193 | mov [z+88], r13 | ||
194 | |||
195 | // Restore registers (reverse order of the pushes above) and return | ||
196 | |||
197 | pop r13 | ||
198 | pop r12 | ||
199 | pop rbx | ||
200 | pop rbp | ||
201 | |||
202 | #if WINDOWS_ABI | ||
203 | pop rsi | ||
204 | pop rdi | ||
205 | #endif | ||
206 | ret | ||
207 | |||
208 | #if defined(__linux__) && defined(__ELF__) | ||
209 | .section .note.GNU-stack,"",%progbits | ||
210 | #endif | ||
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12_alt.S new file mode 100644 index 0000000000..f4ea4ed68b --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12_alt.S | |||
@@ -0,0 +1,186 @@ | |||
1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
2 | // SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 | ||
3 | |||
4 | // ---------------------------------------------------------------------------- | ||
5 | // Multiply z := x * y | ||
6 | // Inputs x[6], y[6]; output z[12] | ||
7 | // | ||
8 | // extern void bignum_mul_6_12_alt(uint64_t z[static 12], | ||
9 | // const uint64_t x[static 6], | ||
10 | // const uint64_t y[static 6]); | ||
11 | // | ||
12 | // Standard x86-64 ABI: RDI = z, RSI = x, RDX = y | ||
13 | // Microsoft x64 ABI: RCX = z, RDX = x, R8 = y | ||
14 | // ---------------------------------------------------------------------------- | ||
15 | |||
16 | #include "_internal_s2n_bignum.h" | ||
17 | |||
18 | .intel_syntax noprefix | ||
19 | S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_6_12_alt) | ||
20 | S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_6_12_alt) | ||
21 | .text | ||
22 | |||
23 | // Under the standard ABI z and x already arrive in these registers (the WINDOWS_ABI prologue moves them here) | ||
24 | |||
25 | #define z rdi | ||
26 | #define x rsi | ||
27 | |||
28 | // y is moved from rdx to rcx to free rdx for muls, which write their high result word there | ||
29 | |||
30 | #define y rcx | ||
31 | |||
32 | // Other variables used as a rotating 3-word window to add terms to | ||
33 | |||
34 | #define t0 r8 | ||
35 | #define t1 r9 | ||
36 | #define t2 r10 | ||
37 | |||
38 | // Macro for the key "multiply and add to (c,h,l)" step: (c,h,l) += numa * numb, clobbering rax and rdx | ||
39 | |||
40 | #define combadd(c,h,l,numa,numb) \ | ||
41 | mov rax, numa; \ | ||
42 | mul QWORD PTR numb; \ | ||
43 | add l, rax; \ | ||
44 | adc h, rdx; \ | ||
45 | adc c, 0 | ||
46 | |||
47 | // A minutely shorter form for when c = 0 initially, so that "adc c, c" just deposits the carry in c | ||
48 | |||
49 | #define combadz(c,h,l,numa,numb) \ | ||
50 | mov rax, numa; \ | ||
51 | mul QWORD PTR numb; \ | ||
52 | add l, rax; \ | ||
53 | adc h, rdx; \ | ||
54 | adc c, c | ||
55 | |||
56 | // A short form where we don't expect a top carry: (h,l) += numa * numb, clobbering rax and rdx | ||
57 | |||
58 | #define combads(h,l,numa,numb) \ | ||
59 | mov rax, numa; \ | ||
60 | mul QWORD PTR numb; \ | ||
61 | add l, rax; \ | ||
62 | adc h, rdx | ||
63 | |||
64 | S2N_BN_SYMBOL(bignum_mul_6_12_alt): | ||
65 | _CET_ENDBR | ||
66 | |||
67 | #if WINDOWS_ABI | ||
68 | push rdi | ||
69 | push rsi | ||
70 | mov rdi, rcx | ||
71 | mov rsi, rdx | ||
72 | mov rdx, r8 | ||
73 | #endif | ||
74 | |||
75 | // Copy y out of rdx, which every mul below overwrites with the high word of its product | ||
76 | |||
77 | mov y, rdx | ||
78 | |||
79 | // Result term 0: low word of x[0] * y[0]; the high word starts the window | ||
80 | |||
81 | mov rax, [x] | ||
82 | mul QWORD PTR [y] | ||
83 | |||
84 | mov [z], rax | ||
85 | mov t0, rdx | ||
86 | xor t1, t1 | ||
87 | |||
88 | // Result term 1: products x[i] * y[j] with i + j = 1 | ||
89 | |||
90 | xor t2, t2 | ||
91 | combads(t1,t0,[x],[y+8]) | ||
92 | combadz(t2,t1,t0,[x+8],[y]) | ||
93 | mov [z+8], t0 | ||
94 | |||
95 | // Result term 2: products x[i] * y[j] with i + j = 2 | ||
96 | |||
97 | xor t0, t0 | ||
98 | combadz(t0,t2,t1,[x],[y+16]) | ||
99 | combadd(t0,t2,t1,[x+8],[y+8]) | ||
100 | combadd(t0,t2,t1,[x+16],[y]) | ||
101 | mov [z+16], t1 | ||
102 | |||
103 | // Result term 3: products x[i] * y[j] with i + j = 3 | ||
104 | |||
105 | xor t1, t1 | ||
106 | combadz(t1,t0,t2,[x],[y+24]) | ||
107 | combadd(t1,t0,t2,[x+8],[y+16]) | ||
108 | combadd(t1,t0,t2,[x+16],[y+8]) | ||
109 | combadd(t1,t0,t2,[x+24],[y]) | ||
110 | mov [z+24], t2 | ||
111 | |||
112 | // Result term 4: products x[i] * y[j] with i + j = 4 | ||
113 | |||
114 | xor t2, t2 | ||
115 | combadz(t2,t1,t0,[x],[y+32]) | ||
116 | combadd(t2,t1,t0,[x+8],[y+24]) | ||
117 | combadd(t2,t1,t0,[x+16],[y+16]) | ||
118 | combadd(t2,t1,t0,[x+24],[y+8]) | ||
119 | combadd(t2,t1,t0,[x+32],[y]) | ||
120 | mov [z+32], t0 | ||
121 | |||
122 | // Result term 5: products x[i] * y[j] with i + j = 5 | ||
123 | |||
124 | xor t0, t0 | ||
125 | combadz(t0,t2,t1,[x],[y+40]) | ||
126 | combadd(t0,t2,t1,[x+8],[y+32]) | ||
127 | combadd(t0,t2,t1,[x+16],[y+24]) | ||
128 | combadd(t0,t2,t1,[x+24],[y+16]) | ||
129 | combadd(t0,t2,t1,[x+32],[y+8]) | ||
130 | combadd(t0,t2,t1,[x+40],[y]) | ||
131 | mov [z+40], t1 | ||
132 | |||
133 | // Result term 6: products x[i] * y[j] with i + j = 6 | ||
134 | |||
135 | xor t1, t1 | ||
136 | combadz(t1,t0,t2,[x+8],[y+40]) | ||
137 | combadd(t1,t0,t2,[x+16],[y+32]) | ||
138 | combadd(t1,t0,t2,[x+24],[y+24]) | ||
139 | combadd(t1,t0,t2,[x+32],[y+16]) | ||
140 | combadd(t1,t0,t2,[x+40],[y+8]) | ||
141 | mov [z+48], t2 | ||
142 | |||
143 | // Result term 7: products x[i] * y[j] with i + j = 7 | ||
144 | |||
145 | xor t2, t2 | ||
146 | combadz(t2,t1,t0,[x+16],[y+40]) | ||
147 | combadd(t2,t1,t0,[x+24],[y+32]) | ||
148 | combadd(t2,t1,t0,[x+32],[y+24]) | ||
149 | combadd(t2,t1,t0,[x+40],[y+16]) | ||
150 | mov [z+56], t0 | ||
151 | |||
152 | // Result term 8: products x[i] * y[j] with i + j = 8 | ||
153 | |||
154 | xor t0, t0 | ||
155 | combadz(t0,t2,t1,[x+24],[y+40]) | ||
156 | combadd(t0,t2,t1,[x+32],[y+32]) | ||
157 | combadd(t0,t2,t1,[x+40],[y+24]) | ||
158 | mov [z+64], t1 | ||
159 | |||
160 | // Result term 9: products x[i] * y[j] with i + j = 9 | ||
161 | |||
162 | xor t1, t1 | ||
163 | combadz(t1,t0,t2,[x+32],[y+40]) | ||
164 | combadd(t1,t0,t2,[x+40],[y+32]) | ||
165 | mov [z+72], t2 | ||
166 | |||
167 | // Result term 10: the single product x[5] * y[5], using the short form without a top carry word | ||
168 | |||
169 | combads(t1,t0,[x+40],[y+40]) | ||
170 | mov [z+80], t0 | ||
171 | |||
172 | // Result term 11: the remaining top word of the window | ||
173 | |||
174 | mov [z+88], t1 | ||
175 | |||
176 | // Return | ||
177 | |||
178 | #if WINDOWS_ABI | ||
179 | pop rsi | ||
180 | pop rdi | ||
181 | #endif | ||
182 | ret | ||
183 | |||
184 | #if defined(__linux__) && defined(__ELF__) | ||
185 | .section .note.GNU-stack,"",%progbits | ||
186 | #endif | ||
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12.S new file mode 100644 index 0000000000..07d840b47c --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12.S | |||
@@ -0,0 +1,214 @@ | |||
1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
2 | // SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 | ||
3 | |||
4 | // ---------------------------------------------------------------------------- | ||
5 | // Square, z := x^2 | ||
6 | // Input x[6]; output z[12] | ||
7 | // | ||
8 | // extern void bignum_sqr_6_12(uint64_t z[static 12], const uint64_t x[static 6]); | ||
9 | // | ||
10 | // Standard x86-64 ABI: RDI = z, RSI = x | ||
11 | // Microsoft x64 ABI: RCX = z, RDX = x | ||
12 | // ---------------------------------------------------------------------------- | ||
13 | |||
14 | #include "_internal_s2n_bignum.h" | ||
15 | |||
16 | .intel_syntax noprefix | ||
17 | S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_6_12) | ||
18 | S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_6_12) | ||
19 | .text | ||
20 | |||
21 | // Under the standard ABI z and x already arrive in these registers (the WINDOWS_ABI prologue moves them here) | ||
22 | |||
23 | #define z rdi | ||
24 | #define x rsi | ||
25 | |||
26 | // A zero register; the 32-bit alias is used for the xor that zeroes it and clears the flags | ||
27 | |||
28 | #define zero rbp | ||
29 | #define zeroe ebp | ||
30 | |||
31 | // Other registers: accumulator words that are eventually stored to z[1] onwards | ||
32 | |||
33 | #define d1 r8 | ||
34 | #define d2 r9 | ||
35 | #define d3 r10 | ||
36 | #define d4 r11 | ||
37 | #define d5 r12 | ||
38 | #define d6 r13 | ||
39 | #define d7 r14 | ||
40 | #define d8 r15 | ||
41 | #define d9 rbx | ||
42 | |||
43 | // Care is needed: d10 re-uses the zero register, so "zero" is no longer zero once d10 has been written | ||
44 | |||
45 | #define d10 rbp | ||
46 | |||
47 | |||
48 | S2N_BN_SYMBOL(bignum_sqr_6_12): | ||
49 | _CET_ENDBR | ||
50 | |||
51 | #if WINDOWS_ABI | ||
52 | push rdi | ||
53 | push rsi | ||
54 | mov rdi, rcx | ||
55 | mov rsi, rdx | ||
56 | #endif | ||
57 | |||
58 | // Save the callee-saved registers used below (rbp, rbx, r12-r15 all serve as zero or d registers) | ||
59 | |||
60 | push rbp | ||
61 | push rbx | ||
62 | push r12 | ||
63 | push r13 | ||
64 | push r14 | ||
65 | push r15 | ||
66 | |||
67 | // Set up an initial window [d8;...d1] = [34;05;03;01], where ij denotes the two-word product x[i] * x[j] | ||
68 | |||
69 | mov rdx, [x] | ||
70 | mulx d2, d1, [x+8] | ||
71 | mulx d4, d3, [x+24] | ||
72 | mulx d6, d5, [x+40] | ||
73 | mov rdx, [x+24] | ||
74 | mulx d8, d7, [x+32] | ||
75 | |||
76 | // Clear our zero register, and also initialize the flags for the carry chain | ||
77 | |||
78 | xor zeroe, zeroe | ||
79 | |||
80 | // Chain in the addition of 02 + 12 + 13 + 14 + 15 to that window | ||
81 | // (no carry-out possible since we add it to the top of a product) | ||
82 | |||
83 | mov rdx, [x+16] | ||
84 | mulx rcx, rax, [x] | ||
85 | adcx d2, rax | ||
86 | adox d3, rcx | ||
87 | mulx rcx, rax, [x+8] | ||
88 | adcx d3, rax | ||
89 | adox d4, rcx | ||
90 | mov rdx, [x+8] | ||
91 | mulx rcx, rax, [x+24] | ||
92 | adcx d4, rax | ||
93 | adox d5, rcx | ||
94 | mulx rcx, rax, [x+32] | ||
95 | adcx d5, rax | ||
96 | adox d6, rcx | ||
97 | mulx rcx, rax, [x+40] | ||
98 | adcx d6, rax | ||
99 | adox d7, rcx | ||
100 | adcx d7, zero | ||
101 | adox d8, zero | ||
102 | adcx d8, zero | ||
103 | |||
104 | // Again zero out the flags. Actually they are already cleared but it may | ||
105 | // help decouple these in the OOO engine not to wait for the chain above | ||
106 | |||
107 | xor zeroe, zeroe | ||
108 | |||
109 | // Now chain in the 04 + 23 + 24 + 25 + 35 + 45 terms | ||
110 | // We are running out of registers: d10 overwrites the zero register, so rax is zeroed with mov (which leaves the carry flags alone) to finish the chains | ||
111 | |||
112 | mov rdx, [x+32] | ||
113 | mulx rcx, rax, [x] | ||
114 | adcx d4, rax | ||
115 | adox d5, rcx | ||
116 | mov rdx, [x+16] | ||
117 | mulx rcx, rax, [x+24] | ||
118 | adcx d5, rax | ||
119 | adox d6, rcx | ||
120 | mulx rcx, rax, [x+32] | ||
121 | adcx d6, rax | ||
122 | adox d7, rcx | ||
123 | mulx rcx, rax, [x+40] | ||
124 | adcx d7, rax | ||
125 | adox d8, rcx | ||
126 | mov rdx, [x+24] | ||
127 | mulx d9, rax, [x+40] | ||
128 | adcx d8, rax | ||
129 | adox d9, zero | ||
130 | mov rdx, [x+32] | ||
131 | mulx d10, rax, [x+40] | ||
132 | adcx d9, rax | ||
133 | mov eax, 0 | ||
134 | adox d10, rax | ||
135 | adcx d10, rax | ||
136 | |||
137 | // Again, just for a clear fresh start for the flags | ||
138 | |||
139 | xor eax, eax | ||
140 | |||
141 | // Double the accumulated cross terms (adcx d,d) and add in the 00 + 11 + 22 + 33 + 44 + 55 squares (adox) | ||
142 | // | ||
143 | // We could use shift-double but this seems tidier and in larger squarings | ||
144 | // it was actually more efficient. I haven't experimented with this small | ||
145 | // case to see how much that matters. Note: the writeback here is sprinkled | ||
146 | // into the sequence in such a way that things still work if z = x, i.e. if | ||
147 | // the output overwrites the input buffer and beyond. | ||
148 | |||
149 | mov rdx, [x] | ||
150 | mulx rdx, rax, rdx | ||
151 | mov [z], rax | ||
152 | adcx d1, d1 | ||
153 | adox d1, rdx | ||
154 | mov rdx, [x+8] | ||
155 | mov [z+8], d1 | ||
156 | mulx rdx, rax, rdx | ||
157 | adcx d2, d2 | ||
158 | adox d2, rax | ||
159 | adcx d3, d3 | ||
160 | adox d3, rdx | ||
161 | mov rdx, [x+16] | ||
162 | mov [z+16], d2 | ||
163 | mulx rdx, rax, rdx | ||
164 | adcx d4, d4 | ||
165 | adox d4, rax | ||
166 | adcx d5, d5 | ||
167 | adox d5, rdx | ||
168 | mov rdx, [x+24] | ||
169 | mov [z+24], d3 | ||
170 | mulx rdx, rax, rdx | ||
171 | adcx d6, d6 | ||
172 | adox d6, rax | ||
173 | adcx d7, d7 | ||
174 | adox d7, rdx | ||
175 | mov rdx, [x+32] | ||
176 | mov [z+32], d4 | ||
177 | mulx rdx, rax, rdx | ||
178 | adcx d8, d8 | ||
179 | adox d8, rax | ||
180 | adcx d9, d9 | ||
181 | adox d9, rdx | ||
182 | mov rdx, [x+40] | ||
183 | mov [z+40], d5 | ||
184 | mulx rdx, rax, rdx | ||
185 | mov [z+48], d6 | ||
186 | adcx d10, d10 | ||
187 | mov [z+56], d7 | ||
188 | adox d10, rax | ||
189 | mov [z+64], d8 | ||
190 | mov eax, 0 | ||
191 | mov [z+72], d9 | ||
192 | adcx rdx, rax | ||
193 | mov [z+80], d10 | ||
194 | adox rdx, rax | ||
195 | mov [z+88], rdx | ||
196 | |||
197 | // Restore saved registers (reverse order of the pushes above) and return | ||
198 | |||
199 | pop r15 | ||
200 | pop r14 | ||
201 | pop r13 | ||
202 | pop r12 | ||
203 | pop rbx | ||
204 | pop rbp | ||
205 | |||
206 | #if WINDOWS_ABI | ||
207 | pop rsi | ||
208 | pop rdi | ||
209 | #endif | ||
210 | ret | ||
211 | |||
212 | #if defined(__linux__) && defined(__ELF__) | ||
213 | .section .note.GNU-stack,"",%progbits | ||
214 | #endif | ||
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12_alt.S new file mode 100644 index 0000000000..a517e5c654 --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12_alt.S | |||
@@ -0,0 +1,197 @@ | |||
1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
2 | // SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 | ||
3 | |||
4 | // ---------------------------------------------------------------------------- | ||
5 | // Square, z := x^2 | ||
6 | // Input x[6]; output z[12] | ||
7 | // | ||
8 | // extern void bignum_sqr_6_12_alt(uint64_t z[static 12], | ||
9 | // const uint64_t x[static 6]); | ||
10 | // | ||
11 | // Standard x86-64 ABI: RDI = z, RSI = x | ||
12 | // Microsoft x64 ABI: RCX = z, RDX = x | ||
13 | // ---------------------------------------------------------------------------- | ||
14 | |||
15 | #include "_internal_s2n_bignum.h" | ||
16 | |||
17 | .intel_syntax noprefix | ||
18 | S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_6_12_alt) | ||
19 | S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_6_12_alt) | ||
20 | .text | ||
21 | |||
22 | // Input arguments (already in these registers under the standard ABI; the WINDOWS_ABI prologue moves them here) | ||
23 | |||
24 | #define z rdi | ||
25 | #define x rsi | ||
26 | |||
27 | // Other variables used as a rotating 3-word window to add terms to | ||
28 | |||
29 | #define t0 r8 | ||
30 | #define t1 r9 | ||
31 | #define t2 r10 | ||
32 | |||
33 | // Additional temporaries for local windows that sum cross products so they can share a single doubling | ||
34 | |||
35 | #define u0 rcx | ||
36 | #define u1 r11 | ||
37 | |||
38 | // Macro for the key "multiply and add to (c,h,l)" step: (c,h,l) += numa * numb, clobbering rax and rdx | ||
39 | |||
40 | #define combadd(c,h,l,numa,numb) \ | ||
41 | mov rax, numa; \ | ||
42 | mul QWORD PTR numb; \ | ||
43 | add l, rax; \ | ||
44 | adc h, rdx; \ | ||
45 | adc c, 0 | ||
46 | |||
47 | // Set up initial window (c,h,l) = numa * numb, i.e. c = 0 and (h,l) = the two-word product; clobbers rax and rdx | ||
48 | |||
49 | #define combaddz(c,h,l,numa,numb) \ | ||
50 | mov rax, numa; \ | ||
51 | mul QWORD PTR numb; \ | ||
52 | xor c, c; \ | ||
53 | mov l, rax; \ | ||
54 | mov h, rdx | ||
55 | |||
56 | // Doubling step (c,h,l) = 2 * (c,hh,ll) + (0,h,l); hh and ll are doubled in place | ||
57 | |||
58 | #define doubladd(c,h,l,hh,ll) \ | ||
59 | add ll, ll; \ | ||
60 | adc hh, hh; \ | ||
61 | adc c, c; \ | ||
62 | add l, ll; \ | ||
63 | adc h, hh; \ | ||
64 | adc c, 0 | ||
65 | |||
66 | // Square term incorporation (c,h,l) += numa^2, clobbering rax and rdx | ||
67 | |||
68 | #define combadd1(c,h,l,numa) \ | ||
69 | mov rax, numa; \ | ||
70 | mul rax; \ | ||
71 | add l, rax; \ | ||
72 | adc h, rdx; \ | ||
73 | adc c, 0 | ||
74 | |||
75 | // A short form where we don't expect a top carry: (h,l) += numa^2, clobbering rax and rdx | ||
76 | |||
77 | #define combads(h,l,numa) \ | ||
78 | mov rax, numa; \ | ||
79 | mul rax; \ | ||
80 | add l, rax; \ | ||
81 | adc h, rdx | ||
82 | |||
83 | // A version doubling directly before adding, for single non-square terms: (c,h,l) += 2 * numa * numb, clobbering rax and rdx | ||
84 | |||
85 | #define combadd2(c,h,l,numa,numb) \ | ||
86 | mov rax, numa; \ | ||
87 | mul QWORD PTR numb; \ | ||
88 | add rax, rax; \ | ||
89 | adc rdx, rdx; \ | ||
90 | adc c, 0; \ | ||
91 | add l, rax; \ | ||
92 | adc h, rdx; \ | ||
93 | adc c, 0 | ||
94 | |||
95 | S2N_BN_SYMBOL(bignum_sqr_6_12_alt): | ||
96 | _CET_ENDBR | ||
97 | |||
98 | #if WINDOWS_ABI | ||
99 | push rdi | ||
100 | push rsi | ||
101 | mov rdi, rcx | ||
102 | mov rsi, rdx | ||
103 | #endif | ||
104 | |||
105 | // Result term 0: low word of x[0]^2; the high word starts the window | ||
106 | |||
107 | mov rax, [x] | ||
108 | mul rax | ||
109 | |||
110 | mov [z], rax | ||
111 | mov t0, rdx | ||
112 | xor t1, t1 | ||
113 | |||
114 | // Result term 1: 2 * x[0]*x[1] | ||
115 | |||
116 | xor t2, t2 | ||
117 | combadd2(t2,t1,t0,[x],[x+8]) | ||
118 | mov [z+8], t0 | ||
119 | |||
120 | // Result term 2: x[1]^2 + 2 * x[0]*x[2] | ||
121 | |||
122 | xor t0, t0 | ||
123 | combadd1(t0,t2,t1,[x+8]) | ||
124 | combadd2(t0,t2,t1,[x],[x+16]) | ||
125 | mov [z+16], t1 | ||
126 | |||
127 | // Result term 3: 2 * (x[0]*x[3] + x[1]*x[2]) | ||
128 | |||
129 | combaddz(t1,u1,u0,[x],[x+24]) | ||
130 | combadd(t1,u1,u0,[x+8],[x+16]) | ||
131 | doubladd(t1,t0,t2,u1,u0) | ||
132 | mov [z+24], t2 | ||
133 | |||
134 | // Result term 4: 2 * (x[0]*x[4] + x[1]*x[3]) + x[2]^2 | ||
135 | |||
136 | combaddz(t2,u1,u0,[x],[x+32]) | ||
137 | combadd(t2,u1,u0,[x+8],[x+24]) | ||
138 | doubladd(t2,t1,t0,u1,u0) | ||
139 | combadd1(t2,t1,t0,[x+16]) | ||
140 | mov [z+32], t0 | ||
141 | |||
142 | // Result term 5: 2 * (x[0]*x[5] + x[1]*x[4] + x[2]*x[3]) | ||
143 | |||
144 | combaddz(t0,u1,u0,[x],[x+40]) | ||
145 | combadd(t0,u1,u0,[x+8],[x+32]) | ||
146 | combadd(t0,u1,u0,[x+16],[x+24]) | ||
147 | doubladd(t0,t2,t1,u1,u0) | ||
148 | mov [z+40], t1 | ||
149 | |||
150 | // Result term 6: 2 * (x[1]*x[5] + x[2]*x[4]) + x[3]^2 | ||
151 | |||
152 | combaddz(t1,u1,u0,[x+8],[x+40]) | ||
153 | combadd(t1,u1,u0,[x+16],[x+32]) | ||
154 | doubladd(t1,t0,t2,u1,u0) | ||
155 | combadd1(t1,t0,t2,[x+24]) | ||
156 | mov [z+48], t2 | ||
157 | |||
158 | // Result term 7: 2 * (x[2]*x[5] + x[3]*x[4]) | ||
159 | |||
160 | combaddz(t2,u1,u0,[x+16],[x+40]) | ||
161 | combadd(t2,u1,u0,[x+24],[x+32]) | ||
162 | doubladd(t2,t1,t0,u1,u0) | ||
163 | mov [z+56], t0 | ||
164 | |||
165 | // Result term 8: 2 * x[3]*x[5] + x[4]^2 | ||
166 | |||
167 | xor t0, t0 | ||
168 | combadd2(t0,t2,t1,[x+24],[x+40]) | ||
169 | combadd1(t0,t2,t1,[x+32]) | ||
170 | mov [z+64], t1 | ||
171 | |||
172 | // Result term 9: 2 * x[4]*x[5] | ||
173 | |||
174 | xor t1, t1 | ||
175 | combadd2(t1,t0,t2,[x+32],[x+40]) | ||
176 | mov [z+72], t2 | ||
177 | |||
178 | // Result term 10: x[5]^2, using the short form without a top carry word | ||
179 | |||
180 | combads(t1,t0,[x+40]) | ||
181 | mov [z+80], t0 | ||
182 | |||
183 | // Result term 11: the remaining top word of the window | ||
184 | |||
185 | mov [z+88], t1 | ||
186 | |||
187 | // Return | ||
188 | |||
189 | #if WINDOWS_ABI | ||
190 | pop rsi | ||
191 | pop rdi | ||
192 | #endif | ||
193 | ret | ||
194 | |||
195 | #if defined(__linux__) && defined(__ELF__) | ||
196 | .section .note.GNU-stack,"",%progbits | ||
197 | #endif | ||