diff options
author | jsing <> | 2023-01-23 18:22:15 +0000 |
---|---|---|
committer | jsing <> | 2023-01-23 18:22:15 +0000 |
commit | 2538c963f64364e2264eed4f07b0fef71d0f361e (patch) | |
tree | 11495f3ee04f23a2b483b157959c7d21ed3a76c2 /src | |
parent | fa7433bffa7c26069e9ca9d343af05899fc9aba5 (diff) | |
download | openbsd-2538c963f64364e2264eed4f07b0fef71d0f361e.tar.gz openbsd-2538c963f64364e2264eed4f07b0fef71d0f361e.tar.bz2 openbsd-2538c963f64364e2264eed4f07b0fef71d0f361e.zip |
Bring in various s2n-bignum functions for amd64.
This brings in bignum_add(), bignum_cmadd(), bignum_cmul(), bignum_mul()
and bignum_sub(), along with bignum_{mul,sqr}_4_8_alt() and
bignum_{mul,sqr}_8_16_alt().
Discussed with tb@
Diffstat (limited to 'src')
-rw-r--r-- | src/lib/libcrypto/bn/arch/amd64/bignum_add.S | 153 | ||||
-rw-r--r-- | src/lib/libcrypto/bn/arch/amd64/bignum_cmadd.S | 143 | ||||
-rw-r--r-- | src/lib/libcrypto/bn/arch/amd64/bignum_cmul.S | 126 | ||||
-rw-r--r-- | src/lib/libcrypto/bn/arch/amd64/bignum_mul.S | 155 | ||||
-rw-r--r-- | src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8_alt.S | 145 | ||||
-rw-r--r-- | src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16_alt.S | 232 | ||||
-rw-r--r-- | src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8_alt.S | 133 | ||||
-rw-r--r-- | src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16_alt.S | 230 | ||||
-rw-r--r-- | src/lib/libcrypto/bn/arch/amd64/bignum_sub.S | 141 |
9 files changed, 1458 insertions, 0 deletions
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_add.S b/src/lib/libcrypto/bn/arch/amd64/bignum_add.S new file mode 100644 index 0000000000..33663916bb --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_add.S | |||
@@ -0,0 +1,153 @@ | |||
1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
2 | // SPDX-License-Identifier: Apache-2.0 OR ISC | ||
3 | |||
4 | // ---------------------------------------------------------------------------- | ||
5 | // Add, z := x + y | ||
6 | // Inputs x[m], y[n]; outputs function return (carry-out) and z[p] | ||
7 | // | ||
8 | // extern uint64_t bignum_add | ||
9 | // (uint64_t p, uint64_t *z, | ||
10 | // uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); | ||
11 | // | ||
12 | // Does the z := x + y operation, truncating modulo p words in general and | ||
13 | // returning a top carry (0 or 1) in the p'th place, only adding the input | ||
14 | // words below p (as well as m and n respectively) to get the sum and carry. | ||
15 | // | ||
16 | // Standard x86-64 ABI: RDI = p, RSI = z, RDX = m, RCX = x, R8 = n, R9 = y, returns RAX | ||
17 | // Microsoft x64 ABI: RCX = p, RDX = z, R8 = m, R9 = x, [RSP+40] = n, [RSP+48] = y, returns RAX | ||
18 | // ---------------------------------------------------------------------------- | ||
19 | |||
20 | #include "_internal_s2n_bignum.h" | ||
21 | |||
22 | .intel_syntax noprefix | ||
23 | S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_add) | ||
24 | S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_add) | ||
25 | .text | ||
26 | |||
27 | #define p rdi | ||
28 | #define z rsi | ||
29 | #define m rdx | ||
30 | #define x rcx | ||
31 | #define n r8 | ||
32 | #define y r9 | ||
33 | #define i r10 | ||
34 | #define a rax | ||
35 | |||
36 | #define ashort eax | ||
37 | |||
38 | |||
39 | |||
40 | S2N_BN_SYMBOL(bignum_add): | ||
41 | |||
42 | #if WINDOWS_ABI | ||
43 | push rdi | ||
44 | push rsi | ||
45 | mov rdi, rcx | ||
46 | mov rsi, rdx | ||
47 | mov rdx, r8 | ||
48 | mov rcx, r9 | ||
49 | mov r8, [rsp+56] | ||
50 | mov r9, [rsp+64] | ||
51 | #endif | ||
52 | |||
53 | // Zero the main index counter i, which both branches below use as the word offset | ||
54 | |||
55 | xor i, i | ||
56 | |||
57 | // First clamp the two input sizes m := min(p,m) and n := min(p,n) since | ||
58 | // we'll never need words past the p'th. Can now assume m <= p and n <= p. | ||
59 | // Then compare the modified m and n and branch accordingly | ||
60 | |||
61 | cmp p, m | ||
62 | cmovc m, p | ||
63 | cmp p, n | ||
64 | cmovc n, p | ||
65 | cmp m, n | ||
66 | jc ylonger | ||
67 | |||
68 | // The case where x is longer or of the same size (p >= m >= n): p becomes the tail length p - m and m becomes (m - n) + 1 because xtest decrements before testing; "test" leaves CF = 0 for the first adc, and inc/dec leave CF alone so the carry chains between iterations | ||
69 | |||
70 | sub p, m | ||
71 | sub m, n | ||
72 | inc m | ||
73 | test n, n | ||
74 | jz xtest | ||
75 | xmainloop: | ||
76 | mov a, [x+8*i] | ||
77 | adc a, [y+8*i] | ||
78 | mov [z+8*i],a | ||
79 | inc i | ||
80 | dec n | ||
81 | jnz xmainloop | ||
82 | jmp xtest | ||
83 | xtoploop: | ||
84 | mov a, [x+8*i] | ||
85 | adc a, 0 | ||
86 | mov [z+8*i],a | ||
87 | inc i | ||
88 | xtest: | ||
89 | dec m | ||
90 | jnz xtoploop | ||
91 | mov ashort, 0 | ||
92 | adc a, 0 | ||
93 | test p, p | ||
94 | jnz tails | ||
95 | #if WINDOWS_ABI | ||
96 | pop rsi | ||
97 | pop rdi | ||
98 | #endif | ||
99 | ret | ||
100 | |||
101 | // The case where y is longer (p >= n > m): p becomes the tail length p - n and n becomes n - m, the (nonzero) count of y-only words handled by ytoploop | ||
102 | |||
103 | ylonger: | ||
104 | |||
105 | sub p, n | ||
106 | sub n, m | ||
107 | test m, m | ||
108 | jz ytoploop | ||
109 | ymainloop: | ||
110 | mov a, [x+8*i] | ||
111 | adc a, [y+8*i] | ||
112 | mov [z+8*i],a | ||
113 | inc i | ||
114 | dec m | ||
115 | jnz ymainloop | ||
116 | ytoploop: | ||
117 | mov a, [y+8*i] | ||
118 | adc a, 0 | ||
119 | mov [z+8*i],a | ||
120 | inc i | ||
121 | dec n | ||
122 | jnz ytoploop | ||
123 | mov ashort, 0 | ||
124 | adc a, 0 | ||
125 | test p, p | ||
126 | jnz tails | ||
127 | #if WINDOWS_ABI | ||
128 | pop rsi | ||
129 | pop rdi | ||
130 | #endif | ||
131 | ret | ||
132 | |||
133 | // Adding a non-trivial tail, when p > max(m,n): store the carry word at z[i], then zero a so the remaining words are zero-filled and the function returns 0 | ||
134 | |||
135 | tails: | ||
136 | mov [z+8*i],a | ||
137 | xor a, a | ||
138 | jmp tail | ||
139 | tailloop: | ||
140 | mov [z+8*i],a | ||
141 | tail: | ||
142 | inc i | ||
143 | dec p | ||
144 | jnz tailloop | ||
145 | #if WINDOWS_ABI | ||
146 | pop rsi | ||
147 | pop rdi | ||
148 | #endif | ||
149 | ret | ||
150 | |||
151 | #if defined(__linux__) && defined(__ELF__) | ||
152 | .section .note.GNU-stack,"",%progbits | ||
153 | #endif | ||
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_cmadd.S b/src/lib/libcrypto/bn/arch/amd64/bignum_cmadd.S new file mode 100644 index 0000000000..33f9be2fa0 --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_cmadd.S | |||
@@ -0,0 +1,143 @@ | |||
1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
2 | // SPDX-License-Identifier: Apache-2.0 OR ISC | ||
3 | |||
4 | // ---------------------------------------------------------------------------- | ||
5 | // Multiply-add with single-word multiplier, z := z + c * y | ||
6 | // Inputs c, y[n]; outputs function return (carry-out) and z[k] | ||
7 | // | ||
8 | // extern uint64_t bignum_cmadd | ||
9 | // (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y); | ||
10 | // | ||
11 | // Does the "z := z + c * y" operation where y is n digits, result z is k digits | ||
12 | // (the code below calls this size p). Truncates the result in general. | ||
13 | // | ||
14 | // The return value is a high/carry word that is meaningful when k = n + 1, or | ||
15 | // more generally when n <= k and the result fits in k + 1 digits. In these | ||
16 | // cases it gives the top digit of the (k + 1)-digit result. | ||
17 | // | ||
18 | // Standard x86-64 ABI: RDI = k, RSI = z, RDX = c, RCX = n, R8 = y, returns RAX | ||
19 | // Microsoft x64 ABI: RCX = k, RDX = z, R8 = c, R9 = n, [RSP+40] = y, returns RAX | ||
20 | // ---------------------------------------------------------------------------- | ||
21 | |||
22 | #include "_internal_s2n_bignum.h" | ||
23 | |||
24 | .intel_syntax noprefix | ||
25 | S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmadd) | ||
26 | S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmadd) | ||
27 | .text | ||
28 | |||
29 | #define p rdi | ||
30 | #define z rsi | ||
31 | #define c r9 | ||
32 | #define n rcx | ||
33 | #define x r8 | ||
34 | |||
35 | #define i r10 | ||
36 | #define h r11 | ||
37 | |||
38 | #define r rbx | ||
39 | |||
40 | #define hshort r11d | ||
41 | #define ishort r10d | ||
42 | |||
43 | |||
44 | |||
45 | S2N_BN_SYMBOL(bignum_cmadd): | ||
46 | |||
47 | #if WINDOWS_ABI | ||
48 | push rdi | ||
49 | push rsi | ||
50 | mov rdi, rcx | ||
51 | mov rsi, rdx | ||
52 | mov rdx, r8 | ||
53 | mov rcx, r9 | ||
54 | mov r8, [rsp+56] | ||
55 | #endif | ||
56 | |||
57 | // Save rbx (callee-saved), which the main loop uses as the scratch register r | ||
58 | |||
59 | push rbx | ||
60 | |||
61 | // First clamp the input size n := min(p,n) since we can never need to read | ||
62 | // past the p'th term of the input to generate p-digit output. | ||
63 | // Subtract p := p - min(n,p) so it holds the size of the extra tail needed | ||
64 | |||
65 | cmp p, n | ||
66 | cmovc n, p | ||
67 | sub p, n | ||
68 | |||
69 | // Initialize high part h = 0; if n = 0 do nothing but return that zero | ||
70 | |||
71 | xor h, h | ||
72 | test n, n | ||
73 | jz end | ||
74 | |||
75 | // Move c into a safer register as multiplies overwrite rdx | ||
76 | |||
77 | mov c, rdx | ||
78 | |||
79 | // Initialization of the loop: 2^64 * CF + [h,z_0'] = z_0 + c * x_0 | ||
80 | |||
81 | mov rax, [x] | ||
82 | mul c | ||
83 | add [z], rax | ||
84 | mov h, rdx | ||
85 | mov ishort, 1 | ||
86 | dec n | ||
87 | jz hightail | ||
88 | |||
89 | // Main loop, where we always have CF + previous high part h to add in: adc folds CF and z[i] into h, sbb turns the resulting carry into r = 0 or -1, and "sub rdx, r" adds that carry to the new high word; inc/dec leave the CF from "add rax, h" intact for the next iteration | ||
90 | |||
91 | loop: | ||
92 | adc h, [z+8*i] | ||
93 | sbb r, r | ||
94 | mov rax, [x+8*i] | ||
95 | mul c | ||
96 | sub rdx, r | ||
97 | add rax, h | ||
98 | mov [z+8*i], rax | ||
99 | mov h, rdx | ||
100 | inc i | ||
101 | dec n | ||
102 | jnz loop | ||
103 | |||
104 | hightail: | ||
105 | adc h, 0 | ||
106 | |||
107 | // Propagate the carry all the way to the end with h as extra carry word; after the first add h is zeroed with mov (which preserves CF) so tloop only ripples the carry | ||
108 | |||
109 | tail: | ||
110 | test p, p | ||
111 | jz end | ||
112 | |||
113 | add [z+8*i], h | ||
114 | mov hshort, 0 | ||
115 | inc i | ||
116 | dec p | ||
117 | jz highend | ||
118 | |||
119 | tloop: | ||
120 | adc [z+8*i], h | ||
121 | inc i | ||
122 | dec p | ||
123 | jnz tloop | ||
124 | |||
125 | highend: | ||
126 | |||
127 | adc h, 0 | ||
128 | |||
129 | // Return the high/carry word | ||
130 | |||
131 | end: | ||
132 | mov rax, h | ||
133 | |||
134 | pop rbx | ||
135 | #if WINDOWS_ABI | ||
136 | pop rsi | ||
137 | pop rdi | ||
138 | #endif | ||
139 | ret | ||
140 | |||
141 | #if defined(__linux__) && defined(__ELF__) | ||
142 | .section .note.GNU-stack,"",%progbits | ||
143 | #endif | ||
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_cmul.S b/src/lib/libcrypto/bn/arch/amd64/bignum_cmul.S new file mode 100644 index 0000000000..6d184e3f39 --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_cmul.S | |||
@@ -0,0 +1,126 @@ | |||
1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
2 | // SPDX-License-Identifier: Apache-2.0 OR ISC | ||
3 | |||
4 | // ---------------------------------------------------------------------------- | ||
5 | // Multiply by a single word, z := c * y | ||
6 | // Inputs c, y[n]; outputs function return (carry-out) and z[k] | ||
7 | // | ||
8 | // extern uint64_t bignum_cmul | ||
9 | // (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y); | ||
10 | // | ||
11 | // Does the "z := c * y" operation where y is n digits, result z is k digits | ||
12 | // (the code below calls this size p). Truncates the result in general unless k >= n + 1. | ||
13 | // | ||
14 | // The return value is a high/carry word that is meaningful when k >= n as | ||
15 | // giving the high part of the result. Since this is always zero if k > n, | ||
16 | // it is mainly of interest in the special case k = n, i.e. where the source | ||
17 | // and destination have the same nominal size, when it gives the extra word | ||
18 | // of the full result. | ||
19 | // | ||
20 | // Standard x86-64 ABI: RDI = k, RSI = z, RDX = c, RCX = n, R8 = y, returns RAX | ||
21 | // Microsoft x64 ABI: RCX = k, RDX = z, R8 = c, R9 = n, [RSP+40] = y, returns RAX | ||
22 | // ---------------------------------------------------------------------------- | ||
23 | |||
24 | #include "_internal_s2n_bignum.h" | ||
25 | |||
26 | .intel_syntax noprefix | ||
27 | S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmul) | ||
28 | S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmul) | ||
29 | .text | ||
30 | |||
31 | #define p rdi | ||
32 | #define z rsi | ||
33 | #define c r9 | ||
34 | #define n rcx | ||
35 | #define x r8 | ||
36 | |||
37 | #define i r10 | ||
38 | #define h r11 | ||
39 | |||
40 | |||
41 | |||
42 | S2N_BN_SYMBOL(bignum_cmul): | ||
43 | |||
44 | #if WINDOWS_ABI | ||
45 | push rdi | ||
46 | push rsi | ||
47 | mov rdi, rcx | ||
48 | mov rsi, rdx | ||
49 | mov rdx, r8 | ||
50 | mov rcx, r9 | ||
51 | mov r8, [rsp+56] | ||
52 | #endif | ||
53 | |||
54 | // First clamp the input size n := min(p,n) since we can never need to read | ||
55 | // past the p'th term of the input to generate p-digit output. Now we can | ||
56 | // assume that n <= p | ||
57 | |||
58 | cmp p, n | ||
59 | cmovc n, p | ||
60 | |||
61 | // Initialize current input/output pointer offset i and high part h. | ||
62 | // But then if n = 0 skip the multiplication and go to the tail part | ||
63 | |||
64 | xor h, h | ||
65 | xor i, i | ||
66 | test n, n | ||
67 | jz tail | ||
68 | |||
69 | // Move c into a safer register as multiplies overwrite rdx | ||
70 | |||
71 | mov c, rdx | ||
72 | |||
73 | // Initialization of the loop: [h,l] = c * x_0 | ||
74 | |||
75 | mov rax, [x] | ||
76 | mul c | ||
77 | mov [z], rax | ||
78 | mov h, rdx | ||
79 | inc i | ||
80 | cmp i, n | ||
81 | jz tail | ||
82 | |||
83 | // Main loop doing the multiplications | ||
84 | |||
85 | loop: | ||
86 | mov rax, [x+8*i] | ||
87 | mul c | ||
88 | add rax, h | ||
89 | adc rdx, 0 | ||
90 | mov [z+8*i], rax | ||
91 | mov h, rdx | ||
92 | inc i | ||
93 | cmp i, n | ||
94 | jc loop | ||
95 | |||
96 | // Add a tail when the destination is longer: h goes to z[i], then h is zeroed so the remaining words are zero-filled and the return value is 0 | ||
97 | |||
98 | tail: | ||
99 | cmp i, p | ||
100 | jnc end | ||
101 | mov [z+8*i], h | ||
102 | xor h, h | ||
103 | inc i | ||
104 | cmp i, p | ||
105 | jnc end | ||
106 | |||
107 | tloop: | ||
108 | mov [z+8*i], h | ||
109 | inc i | ||
110 | cmp i, p | ||
111 | jc tloop | ||
112 | |||
113 | // Return the high/carry word | ||
114 | |||
115 | end: | ||
116 | mov rax, h | ||
117 | |||
118 | #if WINDOWS_ABI | ||
119 | pop rsi | ||
120 | pop rdi | ||
121 | #endif | ||
122 | ret | ||
123 | |||
124 | #if defined(__linux__) && defined(__ELF__) | ||
125 | .section .note.GNU-stack,"",%progbits | ||
126 | #endif | ||
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul.S new file mode 100644 index 0000000000..20c3f70202 --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul.S | |||
@@ -0,0 +1,155 @@ | |||
1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
2 | // SPDX-License-Identifier: Apache-2.0 OR ISC | ||
3 | |||
4 | // ---------------------------------------------------------------------------- | ||
5 | // Multiply z := x * y | ||
6 | // Inputs x[m], y[n]; output z[k] | ||
7 | // | ||
8 | // extern void bignum_mul | ||
9 | // (uint64_t k, uint64_t *z, | ||
10 | // uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); | ||
11 | // | ||
12 | // Does the "z := x * y" operation where x is m digits, y is n, result z is k. | ||
13 | // Truncates the result in general unless k >= m + n | ||
14 | // | ||
15 | // Standard x86-64 ABI: RDI = k, RSI = z, RDX = m, RCX = x, R8 = n, R9 = y | ||
16 | // Microsoft x64 ABI: RCX = k, RDX = z, R8 = m, R9 = x, [RSP+40] = n, [RSP+48] = y | ||
17 | // ---------------------------------------------------------------------------- | ||
18 | |||
19 | #include "_internal_s2n_bignum.h" | ||
20 | |||
21 | .intel_syntax noprefix | ||
22 | S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul) | ||
23 | S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul) | ||
24 | .text | ||
25 | |||
26 | // These names match the registers the arguments arrive in (standard ABI) | ||
27 | |||
28 | #define p rdi | ||
29 | #define z rsi | ||
30 | #define n r8 | ||
31 | |||
32 | // These are working registers, not the registers the arguments arrive in | ||
33 | |||
34 | #define c r15 | ||
35 | #define h r14 | ||
36 | #define l r13 | ||
37 | #define x r12 | ||
38 | #define y r11 | ||
39 | #define i rbx | ||
40 | #define k r10 | ||
41 | #define m rbp | ||
42 | |||
43 | // These are always local scratch since multiplier result is in these | ||
44 | |||
45 | #define a rax | ||
46 | #define d rdx | ||
47 | |||
48 | |||
49 | |||
50 | S2N_BN_SYMBOL(bignum_mul): | ||
51 | |||
52 | #if WINDOWS_ABI | ||
53 | push rdi | ||
54 | push rsi | ||
55 | mov rdi, rcx | ||
56 | mov rsi, rdx | ||
57 | mov rdx, r8 | ||
58 | mov rcx, r9 | ||
59 | mov r8, [rsp+56] | ||
60 | mov r9, [rsp+64] | ||
61 | #endif | ||
62 | |||
63 | // Save the callee-saved registers used as working variables, and move m out of rdx because mul overwrites rdx:rax | ||
64 | |||
65 | push rbx | ||
66 | push rbp | ||
67 | push r12 | ||
68 | push r13 | ||
69 | push r14 | ||
70 | push r15 | ||
71 | mov m, rdx | ||
72 | |||
73 | // If the result size is zero, do nothing | ||
74 | // Note that even if either or both inputs has size zero, we can't | ||
75 | // just give up because we at least need to zero the output array | ||
76 | // If we did a multiply-add variant, however, then we could | ||
77 | |||
78 | test p, p | ||
79 | jz end | ||
80 | |||
81 | // Set initial 2-part sum to zero (we zero c inside the body) | ||
82 | |||
83 | xor h,h | ||
84 | xor l,l | ||
85 | |||
86 | // Otherwise do outer loop k = 0 ... k = p - 1 | ||
87 | |||
88 | xor k, k | ||
89 | |||
90 | outerloop: | ||
91 | |||
92 | // Zero our carry term first; we eventually want it and a zero is useful now | ||
93 | // Set a = max 0 (k + 1 - n), i = min (k + 1) m | ||
94 | // This defines the range a <= j < i for the inner summation | ||
95 | // Note that since k < p < 2^64 we can assume k + 1 doesn't overflow | ||
96 | // And since we want to increment it anyway, we might as well do it now | ||
97 | |||
98 | xor c, c // c = 0 | ||
99 | inc k // k = k + 1 | ||
100 | |||
101 | mov a, k // a = k + 1 | ||
102 | sub a, n // a = k + 1 - n | ||
103 | cmovc a, c // a = max 0 (k + 1 - n) | ||
104 | |||
105 | mov i, m // i = m | ||
106 | cmp k, m // CF <=> k + 1 < m | ||
107 | cmovc i, k // i = min (k + 1) m | ||
108 | |||
109 | // Turn i into a loop count, and skip things if it's <= 0 | ||
110 | // Otherwise set up initial pointers x -> x0[a] and y -> y0[k - a] | ||
111 | // and then launch into the main inner loop, postdecrementing i | ||
112 | |||
113 | mov d, k | ||
114 | sub d, i | ||
115 | sub i, a | ||
116 | jbe innerend | ||
117 | lea x,[rcx+8*a] | ||
118 | lea y,[r9+8*d-8] | ||
119 | |||
120 | innerloop: | ||
121 | mov rax, [y+8*i] | ||
122 | mul QWORD PTR [x] | ||
123 | add x, 8 | ||
124 | add l, rax | ||
125 | adc h, rdx | ||
126 | adc c, 0 | ||
127 | dec i | ||
128 | jnz innerloop | ||
129 | |||
130 | innerend: | ||
131 | |||
132 | mov [z], l | ||
133 | mov l, h | ||
134 | mov h, c | ||
135 | add z, 8 | ||
136 | |||
137 | cmp k, p | ||
138 | jc outerloop | ||
139 | |||
140 | end: | ||
141 | pop r15 | ||
142 | pop r14 | ||
143 | pop r13 | ||
144 | pop r12 | ||
145 | pop rbp | ||
146 | pop rbx | ||
147 | #if WINDOWS_ABI | ||
148 | pop rsi | ||
149 | pop rdi | ||
150 | #endif | ||
151 | ret | ||
152 | |||
153 | #if defined(__linux__) && defined(__ELF__) | ||
154 | .section .note.GNU-stack,"",%progbits | ||
155 | #endif | ||
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8_alt.S new file mode 100644 index 0000000000..e70ed116da --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8_alt.S | |||
@@ -0,0 +1,145 @@ | |||
1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
2 | // SPDX-License-Identifier: Apache-2.0 OR ISC | ||
3 | |||
4 | // ---------------------------------------------------------------------------- | ||
5 | // Multiply z := x * y | ||
6 | // Inputs x[4], y[4]; output z[8] | ||
7 | // | ||
8 | // extern void bignum_mul_4_8_alt | ||
9 | // (uint64_t z[static 8], uint64_t x[static 4], uint64_t y[static 4]); | ||
10 | // | ||
11 | // Standard x86-64 ABI: RDI = z, RSI = x, RDX = y | ||
12 | // Microsoft x64 ABI: RCX = z, RDX = x, R8 = y | ||
13 | // ---------------------------------------------------------------------------- | ||
14 | |||
15 | #include "_internal_s2n_bignum.h" | ||
16 | |||
17 | .intel_syntax noprefix | ||
18 | S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_4_8_alt) | ||
19 | S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_4_8_alt) | ||
20 | .text | ||
21 | |||
22 | // These names match the registers the arguments arrive in (standard ABI) | ||
23 | |||
24 | #define z rdi | ||
25 | #define x rsi | ||
26 | |||
27 | // This is moved from rdx to free it for muls | ||
28 | |||
29 | #define y rcx | ||
30 | |||
31 | // Other variables used as a rotating 3-word window to add terms to | ||
32 | |||
33 | #define t0 r8 | ||
34 | #define t1 r9 | ||
35 | #define t2 r10 | ||
36 | |||
37 | // Macro for the key "multiply and add to (c,h,l)" step: the 128-bit product numa * numb is added into the 3-word accumulator, clobbering rax and rdx | ||
38 | |||
39 | #define combadd(c,h,l,numa,numb) \ | ||
40 | mov rax, numa; \ | ||
41 | mul QWORD PTR numb; \ | ||
42 | add l, rax; \ | ||
43 | adc h, rdx; \ | ||
44 | adc c, 0 | ||
45 | |||
46 | // A minutely shorter form for when c = 0 initially: with c = 0, "adc c, c" just captures the carry, the same result as "adc c, 0" | ||
47 | |||
48 | #define combadz(c,h,l,numa,numb) \ | ||
49 | mov rax, numa; \ | ||
50 | mul QWORD PTR numb; \ | ||
51 | add l, rax; \ | ||
52 | adc h, rdx; \ | ||
53 | adc c, c | ||
54 | |||
55 | // A short form where we don't expect a top carry, so only the two words (h,l) are updated | ||
56 | |||
57 | #define combads(h,l,numa,numb) \ | ||
58 | mov rax, numa; \ | ||
59 | mul QWORD PTR numb; \ | ||
60 | add l, rax; \ | ||
61 | adc h, rdx | ||
62 | |||
63 | S2N_BN_SYMBOL(bignum_mul_4_8_alt): | ||
64 | |||
65 | #if WINDOWS_ABI | ||
66 | push rdi | ||
67 | push rsi | ||
68 | mov rdi, rcx | ||
69 | mov rsi, rdx | ||
70 | mov rdx, r8 | ||
71 | #endif | ||
72 | |||
73 | // Copy y into a safe register to start with | ||
74 | |||
75 | mov y, rdx | ||
76 | |||
77 | // Result term 0 | ||
78 | |||
79 | mov rax, [x] | ||
80 | mul QWORD PTR [y] | ||
81 | |||
82 | mov [z], rax | ||
83 | mov t0, rdx | ||
84 | xor t1, t1 | ||
85 | |||
86 | // Result term 1 | ||
87 | |||
88 | xor t2, t2 | ||
89 | combads(t1,t0,[x],[y+8]) | ||
90 | combadz(t2,t1,t0,[x+8],[y]) | ||
91 | mov [z+8], t0 | ||
92 | |||
93 | // Result term 2 | ||
94 | |||
95 | xor t0, t0 | ||
96 | combadz(t0,t2,t1,[x],[y+16]) | ||
97 | combadd(t0,t2,t1,[x+8],[y+8]) | ||
98 | combadd(t0,t2,t1,[x+16],[y]) | ||
99 | mov [z+16], t1 | ||
100 | |||
101 | // Result term 3 | ||
102 | |||
103 | xor t1, t1 | ||
104 | combadz(t1,t0,t2,[x],[y+24]) | ||
105 | combadd(t1,t0,t2,[x+8],[y+16]) | ||
106 | combadd(t1,t0,t2,[x+16],[y+8]) | ||
107 | combadd(t1,t0,t2,[x+24],[y]) | ||
108 | mov [z+24], t2 | ||
109 | |||
110 | // Result term 4 | ||
111 | |||
112 | xor t2, t2 | ||
113 | combadz(t2,t1,t0,[x+8],[y+24]) | ||
114 | combadd(t2,t1,t0,[x+16],[y+16]) | ||
115 | combadd(t2,t1,t0,[x+24],[y+8]) | ||
116 | mov [z+32], t0 | ||
117 | |||
118 | // Result term 5 | ||
119 | |||
120 | xor t0, t0 | ||
121 | combadz(t0,t2,t1,[x+16],[y+24]) | ||
122 | combadd(t0,t2,t1,[x+24],[y+16]) | ||
123 | mov [z+40], t1 | ||
124 | |||
125 | // Result term 6 (the "xor t1, t1" below is redundant: t1 was stored above and is not read again) | ||
126 | |||
127 | xor t1, t1 | ||
128 | combads(t0,t2,[x+24],[y+24]) | ||
129 | mov [z+48], t2 | ||
130 | |||
131 | // Result term 7 | ||
132 | |||
133 | mov [z+56], t0 | ||
134 | |||
135 | // Return | ||
136 | |||
137 | #if WINDOWS_ABI | ||
138 | pop rsi | ||
139 | pop rdi | ||
140 | #endif | ||
141 | ret | ||
142 | |||
143 | #if defined(__linux__) && defined(__ELF__) | ||
144 | .section .note.GNU-stack,"",%progbits | ||
145 | #endif | ||
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16_alt.S new file mode 100644 index 0000000000..43c6c48667 --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16_alt.S | |||
@@ -0,0 +1,232 @@ | |||
1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
2 | // SPDX-License-Identifier: Apache-2.0 OR ISC | ||
3 | |||
4 | // ---------------------------------------------------------------------------- | ||
5 | // Multiply z := x * y | ||
6 | // Inputs x[8], y[8]; output z[16] | ||
7 | // | ||
8 | // extern void bignum_mul_8_16_alt | ||
9 | // (uint64_t z[static 16], uint64_t x[static 8], uint64_t y[static 8]); | ||
10 | // | ||
11 | // Standard x86-64 ABI: RDI = z, RSI = x, RDX = y | ||
12 | // Microsoft x64 ABI: RCX = z, RDX = x, R8 = y | ||
13 | // ---------------------------------------------------------------------------- | ||
14 | |||
15 | #include "_internal_s2n_bignum.h" | ||
16 | |||
17 | .intel_syntax noprefix | ||
18 | S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_8_16_alt) | ||
19 | S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_8_16_alt) | ||
20 | .text | ||
21 | |||
22 | // These names match the registers the arguments arrive in (standard ABI) | ||
23 | |||
24 | #define z rdi | ||
25 | #define x rsi | ||
26 | |||
27 | // This is moved from rdx to free it for muls | ||
28 | |||
29 | #define y rcx | ||
30 | |||
31 | // Other variables used as a rotating 3-word window to add terms to | ||
32 | |||
33 | #define t0 r8 | ||
34 | #define t1 r9 | ||
35 | #define t2 r10 | ||
36 | |||
37 | // Macro for the key "multiply and add to (c,h,l)" step: the 128-bit product numa * numb is added into the 3-word accumulator, clobbering rax and rdx | ||
38 | |||
39 | #define combadd(c,h,l,numa,numb) \ | ||
40 | mov rax, numa; \ | ||
41 | mul QWORD PTR numb; \ | ||
42 | add l, rax; \ | ||
43 | adc h, rdx; \ | ||
44 | adc c, 0 | ||
45 | |||
46 | // A minutely shorter form for when c = 0 initially: with c = 0, "adc c, c" just captures the carry, the same result as "adc c, 0" | ||
47 | |||
48 | #define combadz(c,h,l,numa,numb) \ | ||
49 | mov rax, numa; \ | ||
50 | mul QWORD PTR numb; \ | ||
51 | add l, rax; \ | ||
52 | adc h, rdx; \ | ||
53 | adc c, c | ||
54 | |||
55 | // A short form where we don't expect a top carry, so only the two words (h,l) are updated | ||
56 | |||
57 | #define combads(h,l,numa,numb) \ | ||
58 | mov rax, numa; \ | ||
59 | mul QWORD PTR numb; \ | ||
60 | add l, rax; \ | ||
61 | adc h, rdx | ||
62 | |||
63 | S2N_BN_SYMBOL(bignum_mul_8_16_alt): | ||
64 | |||
65 | #if WINDOWS_ABI | ||
66 | push rdi | ||
67 | push rsi | ||
68 | mov rdi, rcx | ||
69 | mov rsi, rdx | ||
70 | mov rdx, r8 | ||
71 | #endif | ||
72 | |||
73 | // Copy y into a safe register to start with | ||
74 | |||
75 | mov y, rdx | ||
76 | |||
77 | // Result term 0 | ||
78 | |||
79 | mov rax, [x] | ||
80 | mul QWORD PTR [y] | ||
81 | |||
82 | mov [z], rax | ||
83 | mov t0, rdx | ||
84 | xor t1, t1 | ||
85 | |||
86 | // Result term 1 | ||
87 | |||
88 | xor t2, t2 | ||
89 | combads(t1,t0,[x],[y+8]) | ||
90 | combadz(t2,t1,t0,[x+8],[y]) | ||
91 | mov [z+8], t0 | ||
92 | |||
93 | // Result term 2 | ||
94 | |||
95 | xor t0, t0 | ||
96 | combadz(t0,t2,t1,[x],[y+16]) | ||
97 | combadd(t0,t2,t1,[x+8],[y+8]) | ||
98 | combadd(t0,t2,t1,[x+16],[y]) | ||
99 | mov [z+16], t1 | ||
100 | |||
101 | // Result term 3 | ||
102 | |||
103 | xor t1, t1 | ||
104 | combadz(t1,t0,t2,[x],[y+24]) | ||
105 | combadd(t1,t0,t2,[x+8],[y+16]) | ||
106 | combadd(t1,t0,t2,[x+16],[y+8]) | ||
107 | combadd(t1,t0,t2,[x+24],[y]) | ||
108 | mov [z+24], t2 | ||
109 | |||
110 | // Result term 4 | ||
111 | |||
112 | xor t2, t2 | ||
113 | combadz(t2,t1,t0,[x],[y+32]) | ||
114 | combadd(t2,t1,t0,[x+8],[y+24]) | ||
115 | combadd(t2,t1,t0,[x+16],[y+16]) | ||
116 | combadd(t2,t1,t0,[x+24],[y+8]) | ||
117 | combadd(t2,t1,t0,[x+32],[y]) | ||
118 | mov [z+32], t0 | ||
119 | |||
120 | // Result term 5 | ||
121 | |||
122 | xor t0, t0 | ||
123 | combadz(t0,t2,t1,[x],[y+40]) | ||
124 | combadd(t0,t2,t1,[x+8],[y+32]) | ||
125 | combadd(t0,t2,t1,[x+16],[y+24]) | ||
126 | combadd(t0,t2,t1,[x+24],[y+16]) | ||
127 | combadd(t0,t2,t1,[x+32],[y+8]) | ||
128 | combadd(t0,t2,t1,[x+40],[y]) | ||
129 | mov [z+40], t1 | ||
130 | |||
131 | // Result term 6 | ||
132 | |||
133 | xor t1, t1 | ||
134 | combadz(t1,t0,t2,[x],[y+48]) | ||
135 | combadd(t1,t0,t2,[x+8],[y+40]) | ||
136 | combadd(t1,t0,t2,[x+16],[y+32]) | ||
137 | combadd(t1,t0,t2,[x+24],[y+24]) | ||
138 | combadd(t1,t0,t2,[x+32],[y+16]) | ||
139 | combadd(t1,t0,t2,[x+40],[y+8]) | ||
140 | combadd(t1,t0,t2,[x+48],[y]) | ||
141 | mov [z+48], t2 | ||
142 | |||
143 | // Result term 7 | ||
144 | |||
145 | xor t2, t2 | ||
146 | combadz(t2,t1,t0,[x],[y+56]) | ||
147 | combadd(t2,t1,t0,[x+8],[y+48]) | ||
148 | combadd(t2,t1,t0,[x+16],[y+40]) | ||
149 | combadd(t2,t1,t0,[x+24],[y+32]) | ||
150 | combadd(t2,t1,t0,[x+32],[y+24]) | ||
151 | combadd(t2,t1,t0,[x+40],[y+16]) | ||
152 | combadd(t2,t1,t0,[x+48],[y+8]) | ||
153 | combadd(t2,t1,t0,[x+56],[y]) | ||
154 | mov [z+56], t0 | ||
155 | |||
156 | // Result term 8 | ||
157 | |||
158 | xor t0, t0 | ||
159 | combadz(t0,t2,t1,[x+8],[y+56]) | ||
160 | combadd(t0,t2,t1,[x+16],[y+48]) | ||
161 | combadd(t0,t2,t1,[x+24],[y+40]) | ||
162 | combadd(t0,t2,t1,[x+32],[y+32]) | ||
163 | combadd(t0,t2,t1,[x+40],[y+24]) | ||
164 | combadd(t0,t2,t1,[x+48],[y+16]) | ||
165 | combadd(t0,t2,t1,[x+56],[y+8]) | ||
166 | mov [z+64], t1 | ||
167 | |||
168 | // Result term 9 | ||
169 | |||
170 | xor t1, t1 | ||
171 | combadz(t1,t0,t2,[x+16],[y+56]) | ||
172 | combadd(t1,t0,t2,[x+24],[y+48]) | ||
173 | combadd(t1,t0,t2,[x+32],[y+40]) | ||
174 | combadd(t1,t0,t2,[x+40],[y+32]) | ||
175 | combadd(t1,t0,t2,[x+48],[y+24]) | ||
176 | combadd(t1,t0,t2,[x+56],[y+16]) | ||
177 | mov [z+72], t2 | ||
178 | |||
179 | // Result term 10 | ||
180 | |||
181 | xor t2, t2 | ||
182 | combadz(t2,t1,t0,[x+24],[y+56]) | ||
183 | combadd(t2,t1,t0,[x+32],[y+48]) | ||
184 | combadd(t2,t1,t0,[x+40],[y+40]) | ||
185 | combadd(t2,t1,t0,[x+48],[y+32]) | ||
186 | combadd(t2,t1,t0,[x+56],[y+24]) | ||
187 | mov [z+80], t0 | ||
188 | |||
189 | // Result term 11 | ||
190 | |||
191 | xor t0, t0 | ||
192 | combadz(t0,t2,t1,[x+32],[y+56]) | ||
193 | combadd(t0,t2,t1,[x+40],[y+48]) | ||
194 | combadd(t0,t2,t1,[x+48],[y+40]) | ||
195 | combadd(t0,t2,t1,[x+56],[y+32]) | ||
196 | mov [z+88], t1 | ||
197 | |||
198 | // Result term 12 | ||
199 | |||
200 | xor t1, t1 | ||
201 | combadz(t1,t0,t2,[x+40],[y+56]) | ||
202 | combadd(t1,t0,t2,[x+48],[y+48]) | ||
203 | combadd(t1,t0,t2,[x+56],[y+40]) | ||
204 | mov [z+96], t2 | ||
205 | |||
206 | // Result term 13 | ||
207 | |||
208 | xor t2, t2 | ||
209 | combadz(t2,t1,t0,[x+48],[y+56]) | ||
210 | combadd(t2,t1,t0,[x+56],[y+48]) | ||
211 | mov [z+104], t0 | ||
212 | |||
213 | // Result term 14 | ||
214 | |||
215 | combads(t2,t1,[x+56],[y+56]) | ||
216 | mov [z+112], t1 | ||
217 | |||
218 | // Result term 15 | ||
219 | |||
220 | mov [z+120], t2 | ||
221 | |||
222 | // Return | ||
223 | |||
224 | #if WINDOWS_ABI | ||
225 | pop rsi | ||
226 | pop rdi | ||
227 | #endif | ||
228 | ret | ||
229 | |||
230 | #if defined(__linux__) && defined(__ELF__) | ||
231 | .section .note.GNU-stack,"",%progbits | ||
232 | #endif | ||
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8_alt.S new file mode 100644 index 0000000000..db483a5723 --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8_alt.S | |||
@@ -0,0 +1,133 @@ | |||
1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
2 | // SPDX-License-Identifier: Apache-2.0 OR ISC | ||
3 | |||
4 | // ---------------------------------------------------------------------------- | ||
5 | // Square, z := x^2 | ||
6 | // Input x[4]; output z[8] | ||
7 | // | ||
8 | // extern void bignum_sqr_4_8_alt | ||
9 | // (uint64_t z[static 8], uint64_t x[static 4]); | ||
10 | // | ||
11 | // Standard x86-64 ABI: RDI = z, RSI = x | ||
12 | // Microsoft x64 ABI: RCX = z, RDX = x | ||
13 | // ---------------------------------------------------------------------------- | ||
14 | |||
15 | #include "_internal_s2n_bignum.h" | ||
16 | |||
17 | .intel_syntax noprefix | ||
18 | S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_4_8_alt) | ||
19 | S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_4_8_alt) | ||
20 | .text | ||
21 | |||
22 | // Input arguments | ||
23 | |||
24 | #define z rdi | ||
25 | #define x rsi | ||
26 | |||
27 | // Other variables used as a rotating 3-word window to add terms to | ||
28 | |||
29 | #define t0 rcx | ||
30 | #define t1 r8 | ||
31 | #define t2 r9 | ||
32 | |||
33 | // Macro for the key "multiply and add to (c,h,l)" step, for square term: (c,h,l) += numa^2. Overwrites rax and rdx. | ||
34 | |||
35 | #define combadd1(c,h,l,numa) \ | ||
36 | mov rax, numa; \ | ||
37 | mul rax; \ | ||
38 | add l, rax; \ | ||
39 | adc h, rdx; \ | ||
40 | adc c, 0 | ||
41 | |||
42 | // A short form where we don't expect a top carry: (h,l) += numa^2, with no third word to receive a carry out of h. Overwrites rax and rdx. | ||
43 | |||
44 | #define combads(h,l,numa) \ | ||
45 | mov rax, numa; \ | ||
46 | mul rax; \ | ||
47 | add l, rax; \ | ||
48 | adc h, rdx | ||
49 | |||
50 | // A version doubling before adding, for non-square terms: (c,h,l) += 2 * numa * numb. The bit carried out of rdx by the doubling is added into c before the sum. Overwrites rax and rdx. | ||
51 | |||
52 | #define combadd2(c,h,l,numa,numb) \ | ||
53 | mov rax, numa; \ | ||
54 | mul QWORD PTR numb; \ | ||
55 | add rax, rax; \ | ||
56 | adc rdx, rdx; \ | ||
57 | adc c, 0; \ | ||
58 | add l, rax; \ | ||
59 | adc h, rdx; \ | ||
60 | adc c, 0 | ||
61 | |||
62 | S2N_BN_SYMBOL(bignum_sqr_4_8_alt): | ||
63 | |||
64 | #if WINDOWS_ABI | ||
65 | push rdi | ||
66 | push rsi | ||
67 | mov rdi, rcx | ||
68 | mov rsi, rdx | ||
69 | #endif | ||
70 | |||
71 | // Result term 0: z[0] = low word of x[0]^2; the high word starts the window in t0 | ||
72 | |||
73 | mov rax, [x] | ||
74 | mul rax | ||
75 | |||
76 | mov [z], rax | ||
77 | mov t0, rdx | ||
78 | xor t1, t1 | ||
79 | |||
80 | // Result term 1: add 2 * x[0] * x[1], then store the low window word | ||
81 | |||
82 | xor t2, t2 | ||
83 | combadd2(t2,t1,t0,[x],[x+8]) | ||
84 | mov [z+8], t0 | ||
85 | |||
86 | // Result term 2: add x[1]^2 + 2 * x[0] * x[2] | ||
87 | |||
88 | xor t0, t0 | ||
89 | combadd1(t0,t2,t1,[x+8]) | ||
90 | combadd2(t0,t2,t1,[x],[x+16]) | ||
91 | mov [z+16], t1 | ||
92 | |||
93 | // Result term 3: add 2 * x[0] * x[3] + 2 * x[1] * x[2] | ||
94 | |||
95 | xor t1, t1 | ||
96 | combadd2(t1,t0,t2,[x],[x+24]) | ||
97 | combadd2(t1,t0,t2,[x+8],[x+16]) | ||
98 | mov [z+24], t2 | ||
99 | |||
100 | // Result term 4: add 2 * x[1] * x[3] + x[2]^2 | ||
101 | |||
102 | xor t2, t2 | ||
103 | combadd2(t2,t1,t0,[x+8],[x+24]) | ||
104 | combadd1(t2,t1,t0,[x+16]) | ||
105 | mov [z+32], t0 | ||
106 | |||
107 | // Result term 5: add 2 * x[2] * x[3] | ||
108 | |||
109 | xor t0, t0 | ||
110 | combadd2(t0,t2,t1,[x+16],[x+24]) | ||
111 | mov [z+40], t1 | ||
112 | |||
113 | // Result term 6: add x[3]^2 using the two-word short form; t1 is zeroed here but is not read again before the return | ||
114 | |||
115 | xor t1, t1 | ||
116 | combads(t0,t2,[x+24]) | ||
117 | mov [z+48], t2 | ||
118 | |||
119 | // Result term 7: the remaining top word of the window | ||
120 | |||
121 | mov [z+56], t0 | ||
122 | |||
123 | // Return, first restoring the rdi and rsi pushed on entry when built with WINDOWS_ABI | ||
124 | |||
125 | #if WINDOWS_ABI | ||
126 | pop rsi | ||
127 | pop rdi | ||
128 | #endif | ||
129 | ret | ||
130 | |||
131 | #if defined(__linux__) && defined(__ELF__) | ||
132 | .section .note.GNU-stack,"",%progbits | ||
133 | #endif | ||
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16_alt.S new file mode 100644 index 0000000000..dcf3c92e5b --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16_alt.S | |||
@@ -0,0 +1,230 @@ | |||
1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
2 | // SPDX-License-Identifier: Apache-2.0 OR ISC | ||
3 | |||
4 | // ---------------------------------------------------------------------------- | ||
5 | // Square, z := x^2 | ||
6 | // Input x[8]; output z[16] | ||
7 | // | ||
8 | // extern void bignum_sqr_8_16_alt (uint64_t z[static 16], uint64_t x[static 8]); | ||
9 | // | ||
10 | // Standard x86-64 ABI: RDI = z, RSI = x | ||
11 | // Microsoft x64 ABI: RCX = z, RDX = x | ||
12 | // ---------------------------------------------------------------------------- | ||
13 | |||
14 | #include "_internal_s2n_bignum.h" | ||
15 | |||
16 | .intel_syntax noprefix | ||
17 | S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_8_16_alt) | ||
18 | S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_8_16_alt) | ||
19 | .text | ||
20 | |||
21 | // Input arguments | ||
22 | |||
23 | #define z rdi | ||
24 | #define x rsi | ||
25 | |||
26 | // Other variables used as a rotating 3-word window to add terms to | ||
27 | |||
28 | #define t0 r8 | ||
29 | #define t1 r9 | ||
30 | #define t2 r10 | ||
31 | |||
32 | // Additional temporaries for local windows to share doublings | ||
33 | |||
34 | #define u0 rcx | ||
35 | #define u1 r11 | ||
36 | |||
37 | // Macro for the key "multiply and add to (c,h,l)" step: (c,h,l) += numa * numb. Overwrites rax and rdx. | ||
38 | |||
39 | #define combadd(c,h,l,numa,numb) \ | ||
40 | mov rax, numa; \ | ||
41 | mul QWORD PTR numb; \ | ||
42 | add l, rax; \ | ||
43 | adc h, rdx; \ | ||
44 | adc c, 0 | ||
45 | |||
46 | // Set up initial window (c,h,l) = numa * numb: c is zeroed and (h,l) receive the two product words. Overwrites rax and rdx. | ||
47 | |||
48 | #define combaddz(c,h,l,numa,numb) \ | ||
49 | mov rax, numa; \ | ||
50 | mul QWORD PTR numb; \ | ||
51 | xor c, c; \ | ||
52 | mov l, rax; \ | ||
53 | mov h, rdx | ||
54 | |||
55 | // Doubling step (c,h,l) = 2 * (c,hh,ll) + (0,h,l); hh and ll are doubled in place as a side effect | ||
56 | |||
57 | #define doubladd(c,h,l,hh,ll) \ | ||
58 | add ll, ll; \ | ||
59 | adc hh, hh; \ | ||
60 | adc c, c; \ | ||
61 | add l, ll; \ | ||
62 | adc h, hh; \ | ||
63 | adc c, 0 | ||
64 | |||
65 | // Square term incorporation (c,h,l) += numa^2. Overwrites rax and rdx. | ||
66 | |||
67 | #define combadd1(c,h,l,numa) \ | ||
68 | mov rax, numa; \ | ||
69 | mul rax; \ | ||
70 | add l, rax; \ | ||
71 | adc h, rdx; \ | ||
72 | adc c, 0 | ||
73 | |||
74 | // A short form where we don't expect a top carry: (h,l) += numa^2, with no third word to receive a carry out of h. Overwrites rax and rdx. | ||
75 | |||
76 | #define combads(h,l,numa) \ | ||
77 | mov rax, numa; \ | ||
78 | mul rax; \ | ||
79 | add l, rax; \ | ||
80 | adc h, rdx | ||
81 | |||
82 | // A version doubling directly before adding, for single non-square terms: (c,h,l) += 2 * numa * numb. The bit carried out of rdx by the doubling is added into c. Overwrites rax and rdx. | ||
83 | |||
84 | #define combadd2(c,h,l,numa,numb) \ | ||
85 | mov rax, numa; \ | ||
86 | mul QWORD PTR numb; \ | ||
87 | add rax, rax; \ | ||
88 | adc rdx, rdx; \ | ||
89 | adc c, 0; \ | ||
90 | add l, rax; \ | ||
91 | adc h, rdx; \ | ||
92 | adc c, 0 | ||
93 | |||
94 | S2N_BN_SYMBOL(bignum_sqr_8_16_alt): | ||
95 | |||
96 | #if WINDOWS_ABI | ||
97 | push rdi | ||
98 | push rsi | ||
99 | mov rdi, rcx | ||
100 | mov rsi, rdx | ||
101 | #endif | ||
102 | |||
103 | // Result term 0: z[0] = low word of x[0]^2; the high word starts the window in t0 | ||
104 | |||
105 | mov rax, [x] | ||
106 | mul rax | ||
107 | |||
108 | mov [z], rax | ||
109 | mov t0, rdx | ||
110 | xor t1, t1 | ||
111 | |||
112 | // Result term 1: add 2 * x[0] * x[1] | ||
113 | |||
114 | xor t2, t2 | ||
115 | combadd2(t2,t1,t0,[x],[x+8]) | ||
116 | mov [z+8], t0 | ||
117 | |||
118 | // Result term 2: add x[1]^2 + 2 * x[0] * x[2] | ||
119 | |||
120 | xor t0, t0 | ||
121 | combadd1(t0,t2,t1,[x+8]) | ||
122 | combadd2(t0,t2,t1,[x],[x+16]) | ||
123 | mov [z+16], t1 | ||
124 | |||
125 | // Result term 3: sum x[0]*x[3] + x[1]*x[2] in the local window (t1,u1,u0), then double it once and add into the main window | ||
126 | |||
127 | combaddz(t1,u1,u0,[x],[x+24]) | ||
128 | combadd(t1,u1,u0,[x+8],[x+16]) | ||
129 | doubladd(t1,t0,t2,u1,u0) | ||
130 | mov [z+24], t2 | ||
131 | |||
132 | // Result term 4: 2 * (x[0]*x[4] + x[1]*x[3]) + x[2]^2 | ||
133 | |||
134 | combaddz(t2,u1,u0,[x],[x+32]) | ||
135 | combadd(t2,u1,u0,[x+8],[x+24]) | ||
136 | doubladd(t2,t1,t0,u1,u0) | ||
137 | combadd1(t2,t1,t0,[x+16]) | ||
138 | mov [z+32], t0 | ||
139 | |||
140 | // Result term 5: 2 * (x[0]*x[5] + x[1]*x[4] + x[2]*x[3]) | ||
141 | |||
142 | combaddz(t0,u1,u0,[x],[x+40]) | ||
143 | combadd(t0,u1,u0,[x+8],[x+32]) | ||
144 | combadd(t0,u1,u0,[x+16],[x+24]) | ||
145 | doubladd(t0,t2,t1,u1,u0) | ||
146 | mov [z+40], t1 | ||
147 | |||
148 | // Result term 6: 2 * (x[0]*x[6] + x[1]*x[5] + x[2]*x[4]) + x[3]^2 | ||
149 | |||
150 | combaddz(t1,u1,u0,[x],[x+48]) | ||
151 | combadd(t1,u1,u0,[x+8],[x+40]) | ||
152 | combadd(t1,u1,u0,[x+16],[x+32]) | ||
153 | doubladd(t1,t0,t2,u1,u0) | ||
154 | combadd1(t1,t0,t2,[x+24]) | ||
155 | mov [z+48], t2 | ||
156 | |||
157 | // Result term 7: 2 * (x[0]*x[7] + x[1]*x[6] + x[2]*x[5] + x[3]*x[4]) | ||
158 | |||
159 | combaddz(t2,u1,u0,[x],[x+56]) | ||
160 | combadd(t2,u1,u0,[x+8],[x+48]) | ||
161 | combadd(t2,u1,u0,[x+16],[x+40]) | ||
162 | combadd(t2,u1,u0,[x+24],[x+32]) | ||
163 | doubladd(t2,t1,t0,u1,u0) | ||
164 | mov [z+56], t0 | ||
165 | |||
166 | // Result term 8: 2 * (x[1]*x[7] + x[2]*x[6] + x[3]*x[5]) + x[4]^2 | ||
167 | |||
168 | combaddz(t0,u1,u0,[x+8],[x+56]) | ||
169 | combadd(t0,u1,u0,[x+16],[x+48]) | ||
170 | combadd(t0,u1,u0,[x+24],[x+40]) | ||
171 | doubladd(t0,t2,t1,u1,u0) | ||
172 | combadd1(t0,t2,t1,[x+32]) | ||
173 | mov [z+64], t1 | ||
174 | |||
175 | // Result term 9: 2 * (x[2]*x[7] + x[3]*x[6] + x[4]*x[5]) | ||
176 | |||
177 | combaddz(t1,u1,u0,[x+16],[x+56]) | ||
178 | combadd(t1,u1,u0,[x+24],[x+48]) | ||
179 | combadd(t1,u1,u0,[x+32],[x+40]) | ||
180 | doubladd(t1,t0,t2,u1,u0) | ||
181 | mov [z+72], t2 | ||
182 | |||
183 | // Result term 10: 2 * (x[3]*x[7] + x[4]*x[6]) + x[5]^2 | ||
184 | |||
185 | combaddz(t2,u1,u0,[x+24],[x+56]) | ||
186 | combadd(t2,u1,u0,[x+32],[x+48]) | ||
187 | doubladd(t2,t1,t0,u1,u0) | ||
188 | combadd1(t2,t1,t0,[x+40]) | ||
189 | mov [z+80], t0 | ||
190 | |||
191 | // Result term 11: 2 * (x[4]*x[7] + x[5]*x[6]) | ||
192 | |||
193 | combaddz(t0,u1,u0,[x+32],[x+56]) | ||
194 | combadd(t0,u1,u0,[x+40],[x+48]) | ||
195 | doubladd(t0,t2,t1,u1,u0) | ||
196 | mov [z+88], t1 | ||
197 | |||
198 | // Result term 12: 2 * x[5]*x[7] + x[6]^2, doubling the single cross product directly | ||
199 | |||
200 | xor t1, t1 | ||
201 | combadd2(t1,t0,t2,[x+40],[x+56]) | ||
202 | combadd1(t1,t0,t2,[x+48]) | ||
203 | mov [z+96], t2 | ||
204 | |||
205 | // Result term 13: 2 * x[6]*x[7] | ||
206 | |||
207 | xor t2, t2 | ||
208 | combadd2(t2,t1,t0,[x+48],[x+56]) | ||
209 | mov [z+104], t0 | ||
210 | |||
211 | // Result term 14: x[7]^2, using the two-word short form | ||
212 | |||
213 | combads(t2,t1,[x+56]) | ||
214 | mov [z+112], t1 | ||
215 | |||
216 | // Result term 15: the remaining top word of the window | ||
217 | |||
218 | mov [z+120], t2 | ||
219 | |||
220 | // Return, first restoring the rdi and rsi pushed on entry when built with WINDOWS_ABI | ||
221 | |||
222 | #if WINDOWS_ABI | ||
223 | pop rsi | ||
224 | pop rdi | ||
225 | #endif | ||
226 | ret | ||
227 | |||
228 | #if defined(__linux__) && defined(__ELF__) | ||
229 | .section .note.GNU-stack,"",%progbits | ||
230 | #endif | ||
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sub.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sub.S new file mode 100644 index 0000000000..42eb6a82ef --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sub.S | |||
@@ -0,0 +1,141 @@ | |||
1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
2 | // SPDX-License-Identifier: Apache-2.0 OR ISC | ||
3 | |||
4 | // ---------------------------------------------------------------------------- | ||
5 | // Subtract, z := x - y | ||
6 | // Inputs x[m], y[n]; outputs function return (carry-out) and z[p] | ||
7 | // | ||
8 | // extern uint64_t bignum_sub | ||
9 | // (uint64_t p, uint64_t *z, | ||
10 | // uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); | ||
11 | // | ||
12 | // Does the z := x - y operation, truncating modulo p words in general and | ||
13 | // returning a top borrow (0 or 1) in the p'th place, only subtracting input | ||
14 | // words below p (as well as m and n respectively) to get the diff and borrow. | ||
15 | // | ||
16 | // Standard x86-64 ABI: RDI = p, RSI = z, RDX = m, RCX = x, R8 = n, R9 = y, returns RAX | ||
17 | // Microsoft x64 ABI: RCX = p, RDX = z, R8 = m, R9 = x, [RSP+40] = n, [RSP+48] = y, returns RAX | ||
18 | // ---------------------------------------------------------------------------- | ||
19 | |||
20 | #include "_internal_s2n_bignum.h" | ||
21 | |||
22 | .intel_syntax noprefix | ||
23 | S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sub) | ||
24 | S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sub) | ||
25 | .text | ||
26 | |||
27 | #define p rdi | ||
28 | #define z rsi | ||
29 | #define m rdx | ||
30 | #define x rcx | ||
31 | #define n r8 | ||
32 | #define y r9 | ||
33 | #define i r10 | ||
34 | #define a rax | ||
35 | |||
36 | #define ashort eax | ||
37 | |||
38 | |||
39 | |||
40 | S2N_BN_SYMBOL(bignum_sub): | ||
41 | |||
42 | #if WINDOWS_ABI | ||
43 | push rdi | ||
44 | push rsi | ||
45 | mov rdi, rcx | ||
46 | mov rsi, rdx | ||
47 | mov rdx, r8 | ||
48 | mov rcx, r9 | ||
49 | mov r8, [rsp+56] | ||
50 | mov r9, [rsp+64] | ||
51 | #endif | ||
52 | |||
53 | // Zero the main index counter for both branches (i counts words and is scaled by 8 in every address) | ||
54 | |||
55 | xor i, i | ||
56 | |||
57 | // First clamp the two input sizes m := min(p,m) and n := min(p,n) since | ||
58 | // we'll never need words past the p'th. Can now assume m <= p and n <= p. | ||
59 | // Then compare the modified m and n and branch accordingly: carry from cmp m, n means m < n (unsigned), so y is the longer input | ||
60 | |||
61 | cmp p, m | ||
62 | cmovc m, p | ||
63 | cmp p, n | ||
64 | cmovc n, p | ||
65 | cmp m, n | ||
66 | jc ylonger | ||
67 | |||
68 | // The case where x is longer or of the same size (p >= m >= n). p becomes the count of padding words (p - m) and m becomes (m - n) + 1 so that the dec at xtest runs before each xtoploop pass. The test instruction clears the carry flag so the borrow chain starts at zero, and inc/dec leave the carry flag alone between sbb steps. After the x words, sbb a, a turns the final borrow into an all-zeros or all-ones word that fills the top of z, and neg a converts it to the 0 or 1 return value. | ||
69 | |||
70 | sub p, m | ||
71 | sub m, n | ||
72 | inc m | ||
73 | test n, n | ||
74 | jz xtest | ||
75 | xmainloop: | ||
76 | mov a, [x+8*i] | ||
77 | sbb a, [y+8*i] | ||
78 | mov [z+8*i],a | ||
79 | inc i | ||
80 | dec n | ||
81 | jnz xmainloop | ||
82 | jmp xtest | ||
83 | xtoploop: | ||
84 | mov a, [x+8*i] | ||
85 | sbb a, 0 | ||
86 | mov [z+8*i],a | ||
87 | inc i | ||
88 | xtest: | ||
89 | dec m | ||
90 | jnz xtoploop | ||
91 | sbb a, a | ||
92 | test p, p | ||
93 | jz tailskip | ||
94 | tailloop: | ||
95 | mov [z+8*i],a | ||
96 | inc i | ||
97 | dec p | ||
98 | jnz tailloop | ||
99 | tailskip: | ||
100 | neg a | ||
101 | #if WINDOWS_ABI | ||
102 | pop rsi | ||
103 | pop rdi | ||
104 | #endif | ||
105 | ret | ||
106 | |||
107 | // The case where y is longer (p >= n > m). p becomes the count of padding words (p - n) and n becomes n - m, which is at least 1 here. The m common words are subtracted with borrow, then ytoploop computes 0 - y[i] - borrow for the remaining words (the mov into ashort does not disturb the carry flag). When padding words remain, control joins tailloop in the other branch, which also performs the final neg and return. | ||
108 | |||
109 | ylonger: | ||
110 | |||
111 | sub p, n | ||
112 | sub n, m | ||
113 | test m, m | ||
114 | jz ytoploop | ||
115 | ymainloop: | ||
116 | mov a, [x+8*i] | ||
117 | sbb a, [y+8*i] | ||
118 | mov [z+8*i],a | ||
119 | inc i | ||
120 | dec m | ||
121 | jnz ymainloop | ||
122 | ytoploop: | ||
123 | mov ashort, 0 | ||
124 | sbb a, [y+8*i] | ||
125 | mov [z+8*i],a | ||
126 | inc i | ||
127 | dec n | ||
128 | jnz ytoploop | ||
129 | sbb a, a | ||
130 | test p, p | ||
131 | jnz tailloop | ||
132 | neg a | ||
133 | #if WINDOWS_ABI | ||
134 | pop rsi | ||
135 | pop rdi | ||
136 | #endif | ||
137 | ret | ||
138 | |||
139 | #if defined(__linux__) && defined(__ELF__) | ||
140 | .section .note.GNU-stack,"",%progbits | ||
141 | #endif | ||