diff options
Diffstat (limited to 'src/lib/libcrypto/bn/arch/amd64')
23 files changed, 2231 insertions, 138 deletions
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_add.S b/src/lib/libcrypto/bn/arch/amd64/bignum_add.S index 5fe4aae7a1..1d4e6d08ef 100644 --- a/src/lib/libcrypto/bn/arch/amd64/bignum_add.S +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_add.S | |||
@@ -1,3 +1,5 @@ | |||
1 | // $OpenBSD: bignum_add.S,v 1.7 2025/08/11 14:13:56 jsing Exp $ | ||
2 | // | ||
1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | 3 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. |
2 | // | 4 | // |
3 | // Permission to use, copy, modify, and/or distribute this software for any | 5 | // Permission to use, copy, modify, and/or distribute this software for any |
@@ -16,9 +18,8 @@ | |||
16 | // Add, z := x + y | 18 | // Add, z := x + y |
17 | // Inputs x[m], y[n]; outputs function return (carry-out) and z[p] | 19 | // Inputs x[m], y[n]; outputs function return (carry-out) and z[p] |
18 | // | 20 | // |
19 | // extern uint64_t bignum_add | 21 | // extern uint64_t bignum_add(uint64_t p, uint64_t *z, uint64_t m, |
20 | // (uint64_t p, uint64_t *z, | 22 | // const uint64_t *x, uint64_t n, const uint64_t *y); |
21 | // uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); | ||
22 | // | 23 | // |
23 | // Does the z := x + y operation, truncating modulo p words in general and | 24 | // Does the z := x + y operation, truncating modulo p words in general and |
24 | // returning a top carry (0 or 1) in the p'th place, only adding the input | 25 | // returning a top carry (0 or 1) in the p'th place, only adding the input |
@@ -49,7 +50,7 @@ | |||
49 | 50 | ||
50 | 51 | ||
51 | S2N_BN_SYMBOL(bignum_add): | 52 | S2N_BN_SYMBOL(bignum_add): |
52 | _CET_ENDBR | 53 | _CET_ENDBR |
53 | 54 | ||
54 | #if WINDOWS_ABI | 55 | #if WINDOWS_ABI |
55 | push rdi | 56 | push rdi |
@@ -75,7 +76,7 @@ S2N_BN_SYMBOL(bignum_add): | |||
75 | cmp p, n | 76 | cmp p, n |
76 | cmovc n, p | 77 | cmovc n, p |
77 | cmp m, n | 78 | cmp m, n |
78 | jc ylonger | 79 | jc bignum_add_ylonger |
79 | 80 | ||
80 | // The case where x is longer or of the same size (p >= m >= n) | 81 | // The case where x is longer or of the same size (p >= m >= n) |
81 | 82 | ||
@@ -83,27 +84,27 @@ S2N_BN_SYMBOL(bignum_add): | |||
83 | sub m, n | 84 | sub m, n |
84 | inc m | 85 | inc m |
85 | test n, n | 86 | test n, n |
86 | jz xtest | 87 | jz bignum_add_xtest |
87 | xmainloop: | 88 | bignum_add_xmainloop: |
88 | mov a, [x+8*i] | 89 | mov a, [x+8*i] |
89 | adc a, [y+8*i] | 90 | adc a, [y+8*i] |
90 | mov [z+8*i],a | 91 | mov [z+8*i],a |
91 | inc i | 92 | inc i |
92 | dec n | 93 | dec n |
93 | jnz xmainloop | 94 | jnz bignum_add_xmainloop |
94 | jmp xtest | 95 | jmp bignum_add_xtest |
95 | xtoploop: | 96 | bignum_add_xtoploop: |
96 | mov a, [x+8*i] | 97 | mov a, [x+8*i] |
97 | adc a, 0 | 98 | adc a, 0 |
98 | mov [z+8*i],a | 99 | mov [z+8*i],a |
99 | inc i | 100 | inc i |
100 | xtest: | 101 | bignum_add_xtest: |
101 | dec m | 102 | dec m |
102 | jnz xtoploop | 103 | jnz bignum_add_xtoploop |
103 | mov ashort, 0 | 104 | mov ashort, 0 |
104 | adc a, 0 | 105 | adc a, 0 |
105 | test p, p | 106 | test p, p |
106 | jnz tails | 107 | jnz bignum_add_tails |
107 | #if WINDOWS_ABI | 108 | #if WINDOWS_ABI |
108 | pop rsi | 109 | pop rsi |
109 | pop rdi | 110 | pop rdi |
@@ -112,30 +113,30 @@ xtest: | |||
112 | 113 | ||
113 | // The case where y is longer (p >= n > m) | 114 | // The case where y is longer (p >= n > m) |
114 | 115 | ||
115 | ylonger: | 116 | bignum_add_ylonger: |
116 | 117 | ||
117 | sub p, n | 118 | sub p, n |
118 | sub n, m | 119 | sub n, m |
119 | test m, m | 120 | test m, m |
120 | jz ytoploop | 121 | jz bignum_add_ytoploop |
121 | ymainloop: | 122 | bignum_add_ymainloop: |
122 | mov a, [x+8*i] | 123 | mov a, [x+8*i] |
123 | adc a, [y+8*i] | 124 | adc a, [y+8*i] |
124 | mov [z+8*i],a | 125 | mov [z+8*i],a |
125 | inc i | 126 | inc i |
126 | dec m | 127 | dec m |
127 | jnz ymainloop | 128 | jnz bignum_add_ymainloop |
128 | ytoploop: | 129 | bignum_add_ytoploop: |
129 | mov a, [y+8*i] | 130 | mov a, [y+8*i] |
130 | adc a, 0 | 131 | adc a, 0 |
131 | mov [z+8*i],a | 132 | mov [z+8*i],a |
132 | inc i | 133 | inc i |
133 | dec n | 134 | dec n |
134 | jnz ytoploop | 135 | jnz bignum_add_ytoploop |
135 | mov ashort, 0 | 136 | mov ashort, 0 |
136 | adc a, 0 | 137 | adc a, 0 |
137 | test p, p | 138 | test p, p |
138 | jnz tails | 139 | jnz bignum_add_tails |
139 | #if WINDOWS_ABI | 140 | #if WINDOWS_ABI |
140 | pop rsi | 141 | pop rsi |
141 | pop rdi | 142 | pop rdi |
@@ -144,16 +145,16 @@ ytoploop: | |||
144 | 145 | ||
145 | // Adding a non-trivial tail, when p > max(m,n) | 146 | // Adding a non-trivial tail, when p > max(m,n) |
146 | 147 | ||
147 | tails: | 148 | bignum_add_tails: |
148 | mov [z+8*i],a | 149 | mov [z+8*i],a |
149 | xor a, a | 150 | xor a, a |
150 | jmp tail | 151 | jmp bignum_add_tail |
151 | tailloop: | 152 | bignum_add_tailloop: |
152 | mov [z+8*i],a | 153 | mov [z+8*i],a |
153 | tail: | 154 | bignum_add_tail: |
154 | inc i | 155 | inc i |
155 | dec p | 156 | dec p |
156 | jnz tailloop | 157 | jnz bignum_add_tailloop |
157 | #if WINDOWS_ABI | 158 | #if WINDOWS_ABI |
158 | pop rsi | 159 | pop rsi |
159 | pop rdi | 160 | pop rdi |
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_cmadd.S b/src/lib/libcrypto/bn/arch/amd64/bignum_cmadd.S index 25ba17bce2..a611919603 100644 --- a/src/lib/libcrypto/bn/arch/amd64/bignum_cmadd.S +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_cmadd.S | |||
@@ -1,3 +1,5 @@ | |||
1 | // $OpenBSD: bignum_cmadd.S,v 1.7 2025/08/11 14:13:56 jsing Exp $ | ||
2 | // | ||
1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | 3 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. |
2 | // | 4 | // |
3 | // Permission to use, copy, modify, and/or distribute this software for any | 5 | // Permission to use, copy, modify, and/or distribute this software for any |
@@ -16,8 +18,8 @@ | |||
16 | // Multiply-add with single-word multiplier, z := z + c * y | 18 | // Multiply-add with single-word multiplier, z := z + c * y |
17 | // Inputs c, y[n]; outputs function return (carry-out) and z[k] | 19 | // Inputs c, y[n]; outputs function return (carry-out) and z[k] |
18 | // | 20 | // |
19 | // extern uint64_t bignum_cmadd | 21 | // extern uint64_t bignum_cmadd(uint64_t k, uint64_t *z, uint64_t c, uint64_t n, |
20 | // (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y); | 22 | // const uint64_t *y); |
21 | // | 23 | // |
22 | // Does the "z := z + c * y" operation where y is n digits, result z is p. | 24 | // Does the "z := z + c * y" operation where y is n digits, result z is p. |
23 | // Truncates the result in general. | 25 | // Truncates the result in general. |
@@ -54,7 +56,7 @@ | |||
54 | 56 | ||
55 | 57 | ||
56 | S2N_BN_SYMBOL(bignum_cmadd): | 58 | S2N_BN_SYMBOL(bignum_cmadd): |
57 | _CET_ENDBR | 59 | _CET_ENDBR |
58 | 60 | ||
59 | #if WINDOWS_ABI | 61 | #if WINDOWS_ABI |
60 | push rdi | 62 | push rdi |
@@ -82,7 +84,7 @@ S2N_BN_SYMBOL(bignum_cmadd): | |||
82 | 84 | ||
83 | xor h, h | 85 | xor h, h |
84 | test n, n | 86 | test n, n |
85 | jz end | 87 | jz bignum_cmadd_end |
86 | 88 | ||
87 | // Move c into a safer register as multiplies overwrite rdx | 89 | // Move c into a safer register as multiplies overwrite rdx |
88 | 90 | ||
@@ -96,11 +98,11 @@ S2N_BN_SYMBOL(bignum_cmadd): | |||
96 | mov h, rdx | 98 | mov h, rdx |
97 | mov ishort, 1 | 99 | mov ishort, 1 |
98 | dec n | 100 | dec n |
99 | jz hightail | 101 | jz bignum_cmadd_hightail |
100 | 102 | ||
101 | // Main loop, where we always have CF + previous high part h to add in | 103 | // Main loop, where we always have CF + previous high part h to add in |
102 | 104 | ||
103 | loop: | 105 | bignum_cmadd_loop: |
104 | adc h, [z+8*i] | 106 | adc h, [z+8*i] |
105 | sbb r, r | 107 | sbb r, r |
106 | mov rax, [x+8*i] | 108 | mov rax, [x+8*i] |
@@ -111,36 +113,36 @@ loop: | |||
111 | mov h, rdx | 113 | mov h, rdx |
112 | inc i | 114 | inc i |
113 | dec n | 115 | dec n |
114 | jnz loop | 116 | jnz bignum_cmadd_loop |
115 | 117 | ||
116 | hightail: | 118 | bignum_cmadd_hightail: |
117 | adc h, 0 | 119 | adc h, 0 |
118 | 120 | ||
119 | // Propagate the carry all the way to the end with h as extra carry word | 121 | // Propagate the carry all the way to the end with h as extra carry word |
120 | 122 | ||
121 | tail: | 123 | bignum_cmadd_tail: |
122 | test p, p | 124 | test p, p |
123 | jz end | 125 | jz bignum_cmadd_end |
124 | 126 | ||
125 | add [z+8*i], h | 127 | add [z+8*i], h |
126 | mov hshort, 0 | 128 | mov hshort, 0 |
127 | inc i | 129 | inc i |
128 | dec p | 130 | dec p |
129 | jz highend | 131 | jz bignum_cmadd_highend |
130 | 132 | ||
131 | tloop: | 133 | bignum_cmadd_tloop: |
132 | adc [z+8*i], h | 134 | adc [z+8*i], h |
133 | inc i | 135 | inc i |
134 | dec p | 136 | dec p |
135 | jnz tloop | 137 | jnz bignum_cmadd_tloop |
136 | 138 | ||
137 | highend: | 139 | bignum_cmadd_highend: |
138 | 140 | ||
139 | adc h, 0 | 141 | adc h, 0 |
140 | 142 | ||
141 | // Return the high/carry word | 143 | // Return the high/carry word |
142 | 144 | ||
143 | end: | 145 | bignum_cmadd_end: |
144 | mov rax, h | 146 | mov rax, h |
145 | 147 | ||
146 | pop rbx | 148 | pop rbx |
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_cmul.S b/src/lib/libcrypto/bn/arch/amd64/bignum_cmul.S index 12f785d63a..eb71d9da44 100644 --- a/src/lib/libcrypto/bn/arch/amd64/bignum_cmul.S +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_cmul.S | |||
@@ -1,3 +1,5 @@ | |||
1 | // $OpenBSD: bignum_cmul.S,v 1.7 2025/08/11 14:13:56 jsing Exp $ | ||
2 | // | ||
1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | 3 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. |
2 | // | 4 | // |
3 | // Permission to use, copy, modify, and/or distribute this software for any | 5 | // Permission to use, copy, modify, and/or distribute this software for any |
@@ -16,8 +18,8 @@ | |||
16 | // Multiply by a single word, z := c * y | 18 | // Multiply by a single word, z := c * y |
17 | // Inputs c, y[n]; outputs function return (carry-out) and z[k] | 19 | // Inputs c, y[n]; outputs function return (carry-out) and z[k] |
18 | // | 20 | // |
19 | // extern uint64_t bignum_cmul | 21 | // extern uint64_t bignum_cmul(uint64_t k, uint64_t *z, uint64_t c, uint64_t n, |
20 | // (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y); | 22 | // const uint64_t *y); |
21 | // | 23 | // |
22 | // Does the "z := c * y" operation where y is n digits, result z is p. | 24 | // Does the "z := c * y" operation where y is n digits, result z is p. |
23 | // Truncates the result in general unless p >= n + 1. | 25 | // Truncates the result in general unless p >= n + 1. |
@@ -51,7 +53,7 @@ | |||
51 | 53 | ||
52 | 54 | ||
53 | S2N_BN_SYMBOL(bignum_cmul): | 55 | S2N_BN_SYMBOL(bignum_cmul): |
54 | _CET_ENDBR | 56 | _CET_ENDBR |
55 | 57 | ||
56 | #if WINDOWS_ABI | 58 | #if WINDOWS_ABI |
57 | push rdi | 59 | push rdi |
@@ -76,7 +78,7 @@ S2N_BN_SYMBOL(bignum_cmul): | |||
76 | xor h, h | 78 | xor h, h |
77 | xor i, i | 79 | xor i, i |
78 | test n, n | 80 | test n, n |
79 | jz tail | 81 | jz bignum_cmul_tail |
80 | 82 | ||
81 | // Move c into a safer register as multiplies overwrite rdx | 83 | // Move c into a safer register as multiplies overwrite rdx |
82 | 84 | ||
@@ -90,11 +92,11 @@ S2N_BN_SYMBOL(bignum_cmul): | |||
90 | mov h, rdx | 92 | mov h, rdx |
91 | inc i | 93 | inc i |
92 | cmp i, n | 94 | cmp i, n |
93 | jz tail | 95 | jz bignum_cmul_tail |
94 | 96 | ||
95 | // Main loop doing the multiplications | 97 | // Main loop doing the multiplications |
96 | 98 | ||
97 | loop: | 99 | bignum_cmul_loop: |
98 | mov rax, [x+8*i] | 100 | mov rax, [x+8*i] |
99 | mul c | 101 | mul c |
100 | add rax, h | 102 | add rax, h |
@@ -103,28 +105,28 @@ loop: | |||
103 | mov h, rdx | 105 | mov h, rdx |
104 | inc i | 106 | inc i |
105 | cmp i, n | 107 | cmp i, n |
106 | jc loop | 108 | jc bignum_cmul_loop |
107 | 109 | ||
108 | // Add a tail when the destination is longer | 110 | // Add a tail when the destination is longer |
109 | 111 | ||
110 | tail: | 112 | bignum_cmul_tail: |
111 | cmp i, p | 113 | cmp i, p |
112 | jnc end | 114 | jnc bignum_cmul_end |
113 | mov [z+8*i], h | 115 | mov [z+8*i], h |
114 | xor h, h | 116 | xor h, h |
115 | inc i | 117 | inc i |
116 | cmp i, p | 118 | cmp i, p |
117 | jnc end | 119 | jnc bignum_cmul_end |
118 | 120 | ||
119 | tloop: | 121 | bignum_cmul_tloop: |
120 | mov [z+8*i], h | 122 | mov [z+8*i], h |
121 | inc i | 123 | inc i |
122 | cmp i, p | 124 | cmp i, p |
123 | jc tloop | 125 | jc bignum_cmul_tloop |
124 | 126 | ||
125 | // Return the high/carry word | 127 | // Return the high/carry word |
126 | 128 | ||
127 | end: | 129 | bignum_cmul_end: |
128 | mov rax, h | 130 | mov rax, h |
129 | 131 | ||
130 | #if WINDOWS_ABI | 132 | #if WINDOWS_ABI |
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_modadd.S b/src/lib/libcrypto/bn/arch/amd64/bignum_modadd.S new file mode 100644 index 0000000000..baf27fdc7f --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_modadd.S | |||
@@ -0,0 +1,112 @@ | |||
1 | // $OpenBSD: bignum_modadd.S,v 1.4 2025/08/12 10:23:40 jsing Exp $ | ||
2 | // | ||
3 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
4 | // | ||
5 | // Permission to use, copy, modify, and/or distribute this software for any | ||
6 | // purpose with or without fee is hereby granted, provided that the above | ||
7 | // copyright notice and this permission notice appear in all copies. | ||
8 | // | ||
9 | // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES | ||
10 | // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF | ||
11 | // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR | ||
12 | // ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES | ||
13 | // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN | ||
14 | // ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF | ||
15 | // OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | ||
16 | |||
17 | // ---------------------------------------------------------------------------- | ||
18 | // Add modulo m, z := (x + y) mod m, assuming x and y reduced | ||
19 | // Inputs x[k], y[k], m[k]; output z[k] | ||
20 | // | ||
21 | // extern void bignum_modadd(uint64_t k, uint64_t *z, const uint64_t *x, | ||
22 | // const uint64_t *y, const uint64_t *m); | ||
23 | // | ||
24 | // Standard x86-64 ABI: RDI = k, RSI = z, RDX = x, RCX = y, R8 = m | ||
25 | // Microsoft x64 ABI: RCX = k, RDX = z, R8 = x, R9 = y, [RSP+40] = m | ||
26 | // ---------------------------------------------------------------------------- | ||
27 | |||
28 | #include "s2n_bignum_internal.h" | ||
29 | |||
30 | .intel_syntax noprefix | ||
31 | S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_modadd) | ||
32 | S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_modadd) | ||
33 | .text | ||
34 | |||
35 | #define k rdi | ||
36 | #define z rsi | ||
37 | #define x rdx | ||
38 | #define y rcx | ||
39 | #define m r8 | ||
40 | #define i r9 | ||
41 | #define j r10 | ||
42 | #define a rax | ||
43 | #define c r11 | ||
44 | |||
45 | S2N_BN_SYMBOL(bignum_modadd): | ||
46 | _CET_ENDBR | ||
47 | |||
48 | #if WINDOWS_ABI | ||
49 | push rdi | ||
50 | push rsi | ||
51 | mov rdi, rcx | ||
52 | mov rsi, rdx | ||
53 | mov rdx, r8 | ||
54 | mov rcx, r9 | ||
55 | mov r8, [rsp+56] | ||
56 | #endif | ||
57 | |||
58 | // If k = 0 there are no words to process, so return without touching z | ||
59 | |||
60 | test k, k | ||
61 | jz bignum_modadd_end | ||
62 | |||
63 | // First just add (c::z) := x + y. "xor i, i" clears CF for the first adc; inc and dec leave CF intact, so the carry chains across words and the final carry-out is collected in c | ||
64 | |||
65 | xor c, c | ||
66 | mov j, k | ||
67 | xor i, i | ||
68 | bignum_modadd_addloop: | ||
69 | mov a, [x+8*i] | ||
70 | adc a, [y+8*i] | ||
71 | mov [z+8*i], a | ||
72 | inc i | ||
73 | dec j | ||
74 | jnz bignum_modadd_addloop | ||
75 | adc c, 0 | ||
76 | |||
77 | // Now do a comparison subtraction (c::z) - m, recording mask for (c::z) >= m. The difference words are discarded, only the borrow is kept; for reduced x and y, c ends up all 1s when (c::z) >= m and 0 otherwise | ||
78 | |||
79 | mov j, k | ||
80 | xor i, i | ||
81 | bignum_modadd_cmploop: | ||
82 | mov a, [z+8*i] | ||
83 | sbb a, [m+8*i] | ||
84 | inc i | ||
85 | dec j | ||
86 | jnz bignum_modadd_cmploop | ||
87 | sbb c, 0 | ||
88 | not c | ||
89 | |||
90 | // Now do a masked subtraction z := z - [c] * m. j (0 on entry, as the loop above counted it down to zero) holds the borrow between words as 0 or -1 because cmp overwrites CF: "neg j" reloads CF, "sbb j, j" saves it | ||
91 | |||
92 | xor i, i | ||
93 | bignum_modadd_subloop: | ||
94 | mov a, [m+8*i] | ||
95 | and a, c | ||
96 | neg j | ||
97 | sbb [z+8*i], a | ||
98 | sbb j, j | ||
99 | inc i | ||
100 | cmp i, k | ||
101 | jc bignum_modadd_subloop | ||
102 | |||
103 | bignum_modadd_end: | ||
104 | #if WINDOWS_ABI | ||
105 | pop rsi | ||
106 | pop rdi | ||
107 | #endif | ||
108 | ret | ||
109 | |||
110 | #if defined(__linux__) && defined(__ELF__) | ||
111 | .section .note.GNU-stack,"",%progbits | ||
112 | #endif | ||
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_modsub.S b/src/lib/libcrypto/bn/arch/amd64/bignum_modsub.S new file mode 100644 index 0000000000..63b3230e35 --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_modsub.S | |||
@@ -0,0 +1,99 @@ | |||
1 | // $OpenBSD: bignum_modsub.S,v 1.4 2025/08/12 10:23:40 jsing Exp $ | ||
2 | // | ||
3 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
4 | // | ||
5 | // Permission to use, copy, modify, and/or distribute this software for any | ||
6 | // purpose with or without fee is hereby granted, provided that the above | ||
7 | // copyright notice and this permission notice appear in all copies. | ||
8 | // | ||
9 | // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES | ||
10 | // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF | ||
11 | // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR | ||
12 | // ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES | ||
13 | // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN | ||
14 | // ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF | ||
15 | // OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | ||
16 | |||
17 | // ---------------------------------------------------------------------------- | ||
18 | // Subtract modulo m, z := (x - y) mod m, assuming x and y reduced | ||
19 | // Inputs x[k], y[k], m[k]; output z[k] | ||
20 | // | ||
21 | // extern void bignum_modsub(uint64_t k, uint64_t *z, const uint64_t *x, | ||
22 | // const uint64_t *y, const uint64_t *m); | ||
23 | // | ||
24 | // Standard x86-64 ABI: RDI = k, RSI = z, RDX = x, RCX = y, R8 = m | ||
25 | // Microsoft x64 ABI: RCX = k, RDX = z, R8 = x, R9 = y, [RSP+40] = m | ||
26 | // ---------------------------------------------------------------------------- | ||
27 | |||
28 | #include "s2n_bignum_internal.h" | ||
29 | |||
30 | .intel_syntax noprefix | ||
31 | S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_modsub) | ||
32 | S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_modsub) | ||
33 | .text | ||
34 | |||
35 | #define k rdi | ||
36 | #define z rsi | ||
37 | #define x rdx | ||
38 | #define y rcx | ||
39 | #define m r8 | ||
40 | #define i r9 | ||
41 | #define j r10 | ||
42 | #define a rax | ||
43 | #define c r11 | ||
44 | |||
45 | S2N_BN_SYMBOL(bignum_modsub): | ||
46 | _CET_ENDBR | ||
47 | |||
48 | #if WINDOWS_ABI | ||
49 | push rdi | ||
50 | push rsi | ||
51 | mov rdi, rcx | ||
52 | mov rsi, rdx | ||
53 | mov rdx, r8 | ||
54 | mov rcx, r9 | ||
55 | mov r8, [rsp+56] | ||
56 | #endif | ||
57 | |||
58 | // If k = 0 there are no words to process, so return without touching z | ||
59 | |||
60 | test k, k | ||
61 | jz bignum_modsub_end | ||
62 | |||
63 | // Subtract z := x - y and record a mask for the carry x - y < 0. "xor i, i" clears CF for the first sbb; inc and dec leave CF intact, so the borrow chains across words; "sbb c, c" then makes c all 1s if a borrow came out and 0 otherwise | ||
64 | |||
65 | xor c, c | ||
66 | mov j, k | ||
67 | xor i, i | ||
68 | bignum_modsub_subloop: | ||
69 | mov a, [x+8*i] | ||
70 | sbb a, [y+8*i] | ||
71 | mov [z+8*i], a | ||
72 | inc i | ||
73 | dec j | ||
74 | jnz bignum_modsub_subloop | ||
75 | sbb c, c | ||
76 | |||
77 | // Now do a masked addition z := z + [c] * m. j (0 on entry, as the loop above counted it down to zero) holds the carry between words as 0 or -1 because cmp overwrites CF: "neg j" reloads CF, "sbb j, j" saves it | ||
78 | |||
79 | xor i, i | ||
80 | bignum_modsub_addloop: | ||
81 | mov a, [m+8*i] | ||
82 | and a, c | ||
83 | neg j | ||
84 | adc [z+8*i], a | ||
85 | sbb j, j | ||
86 | inc i | ||
87 | cmp i, k | ||
88 | jc bignum_modsub_addloop | ||
89 | |||
90 | bignum_modsub_end: | ||
91 | #if WINDOWS_ABI | ||
92 | pop rsi | ||
93 | pop rdi | ||
94 | #endif | ||
95 | ret | ||
96 | |||
97 | #if defined(__linux__) && defined(__ELF__) | ||
98 | .section .note.GNU-stack,"",%progbits | ||
99 | #endif | ||
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul.S index a3552679a2..538cce9af7 100644 --- a/src/lib/libcrypto/bn/arch/amd64/bignum_mul.S +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul.S | |||
@@ -1,3 +1,5 @@ | |||
1 | // $OpenBSD: bignum_mul.S,v 1.7 2025/08/11 14:13:56 jsing Exp $ | ||
2 | // | ||
1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | 3 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. |
2 | // | 4 | // |
3 | // Permission to use, copy, modify, and/or distribute this software for any | 5 | // Permission to use, copy, modify, and/or distribute this software for any |
@@ -16,9 +18,8 @@ | |||
16 | // Multiply z := x * y | 18 | // Multiply z := x * y |
17 | // Inputs x[m], y[n]; output z[k] | 19 | // Inputs x[m], y[n]; output z[k] |
18 | // | 20 | // |
19 | // extern void bignum_mul | 21 | // extern void bignum_mul(uint64_t k, uint64_t *z, uint64_t m, const uint64_t *x, |
20 | // (uint64_t k, uint64_t *z, | 22 | // uint64_t n, const uint64_t *y); |
21 | // uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); | ||
22 | // | 23 | // |
23 | // Does the "z := x * y" operation where x is m digits, y is n, result z is k. | 24 | // Does the "z := x * y" operation where x is m digits, y is n, result z is k. |
24 | // Truncates the result in general unless k >= m + n | 25 | // Truncates the result in general unless k >= m + n |
@@ -59,7 +60,7 @@ | |||
59 | 60 | ||
60 | 61 | ||
61 | S2N_BN_SYMBOL(bignum_mul): | 62 | S2N_BN_SYMBOL(bignum_mul): |
62 | _CET_ENDBR | 63 | _CET_ENDBR |
63 | 64 | ||
64 | #if WINDOWS_ABI | 65 | #if WINDOWS_ABI |
65 | push rdi | 66 | push rdi |
@@ -88,7 +89,7 @@ S2N_BN_SYMBOL(bignum_mul): | |||
88 | // If we did a multiply-add variant, however, then we could | 89 | // If we did a multiply-add variant, however, then we could |
89 | 90 | ||
90 | test p, p | 91 | test p, p |
91 | jz end | 92 | jz bignum_mul_end |
92 | 93 | ||
93 | // Set initial 2-part sum to zero (we zero c inside the body) | 94 | // Set initial 2-part sum to zero (we zero c inside the body) |
94 | 95 | ||
@@ -99,7 +100,7 @@ S2N_BN_SYMBOL(bignum_mul): | |||
99 | 100 | ||
100 | xor k, k | 101 | xor k, k |
101 | 102 | ||
102 | outerloop: | 103 | bignum_mul_outerloop: |
103 | 104 | ||
104 | // Zero our carry term first; we eventually want it and a zero is useful now | 105 | // Zero our carry term first; we eventually want it and a zero is useful now |
105 | // Set a = max 0 (k + 1 - n), i = min (k + 1) m | 106 | // Set a = max 0 (k + 1 - n), i = min (k + 1) m |
@@ -125,11 +126,11 @@ outerloop: | |||
125 | mov d, k | 126 | mov d, k |
126 | sub d, i | 127 | sub d, i |
127 | sub i, a | 128 | sub i, a |
128 | jbe innerend | 129 | jbe bignum_mul_innerend |
129 | lea x,[rcx+8*a] | 130 | lea x,[rcx+8*a] |
130 | lea y,[r9+8*d-8] | 131 | lea y,[r9+8*d-8] |
131 | 132 | ||
132 | innerloop: | 133 | bignum_mul_innerloop: |
133 | mov rax, [y+8*i] | 134 | mov rax, [y+8*i] |
134 | mul QWORD PTR [x] | 135 | mul QWORD PTR [x] |
135 | add x, 8 | 136 | add x, 8 |
@@ -137,9 +138,9 @@ innerloop: | |||
137 | adc h, rdx | 138 | adc h, rdx |
138 | adc c, 0 | 139 | adc c, 0 |
139 | dec i | 140 | dec i |
140 | jnz innerloop | 141 | jnz bignum_mul_innerloop |
141 | 142 | ||
142 | innerend: | 143 | bignum_mul_innerend: |
143 | 144 | ||
144 | mov [z], l | 145 | mov [z], l |
145 | mov l, h | 146 | mov l, h |
@@ -147,9 +148,9 @@ innerend: | |||
147 | add z, 8 | 148 | add z, 8 |
148 | 149 | ||
149 | cmp k, p | 150 | cmp k, p |
150 | jc outerloop | 151 | jc bignum_mul_outerloop |
151 | 152 | ||
152 | end: | 153 | bignum_mul_end: |
153 | pop r15 | 154 | pop r15 |
154 | pop r14 | 155 | pop r14 |
155 | pop r13 | 156 | pop r13 |
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8.S new file mode 100644 index 0000000000..d6ad514020 --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8.S | |||
@@ -0,0 +1,187 @@ | |||
1 | // $OpenBSD: bignum_mul_4_8.S,v 1.4 2025/08/12 10:23:40 jsing Exp $ | ||
2 | // | ||
3 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
4 | // | ||
5 | // Permission to use, copy, modify, and/or distribute this software for any | ||
6 | // purpose with or without fee is hereby granted, provided that the above | ||
7 | // copyright notice and this permission notice appear in all copies. | ||
8 | // | ||
9 | // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES | ||
10 | // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF | ||
11 | // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR | ||
12 | // ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES | ||
13 | // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN | ||
14 | // ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF | ||
15 | // OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | ||
16 | |||
17 | // ---------------------------------------------------------------------------- | ||
18 | // Multiply z := x * y | ||
19 | // Inputs x[4], y[4]; output z[8] | ||
20 | // | ||
21 | // extern void bignum_mul_4_8(uint64_t z[static 8], const uint64_t x[static 4], | ||
22 | // const uint64_t y[static 4]); | ||
23 | // | ||
24 | // Standard x86-64 ABI: RDI = z, RSI = x, RDX = y | ||
25 | // Microsoft x64 ABI: RCX = z, RDX = x, R8 = y | ||
26 | // ---------------------------------------------------------------------------- | ||
27 | |||
28 | #include "s2n_bignum_internal.h" | ||
29 | |||
30 | .intel_syntax noprefix | ||
31 | S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_4_8) | ||
32 | S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_4_8) | ||
33 | .text | ||
34 | |||
35 | // z and x stay in rdi and rsi, where the standard ABI passes them (the WINDOWS_ABI prologue moves them there) | ||
36 | |||
37 | #define z rdi | ||
38 | #define x rsi | ||
39 | |||
40 | // y is copied out of rdx at function entry, because rdx is then reloaded with each y[j] as the implicit mulx multiplicand | ||
41 | |||
42 | #define y rcx | ||
43 | |||
44 | // A zero register (zeroe is its 32-bit alias, used in "xor zeroe, zeroe" to clear it) | ||
45 | |||
46 | #define zero rbp | ||
47 | #define zeroe ebp | ||
48 | |||
49 | // Add x[arg2] * rdx into the r8..r11 register window: the low word goes via adcx (CF chain) into slot (arg1 + arg2) % 4 and the high word via adox (OF chain) into the next slot, wrapping from r11 to r8. Clobbers rax and rbx. | ||
50 | // The .if ladder only selects registers; it would be nice to have conditional expressions reg[i], reg[i+1] ... | ||
51 | |||
52 | .macro mulpadd arg1,arg2 | ||
53 | mulx rbx, rax, [x+8*\arg2] | ||
54 | .if ((\arg1 + \arg2) % 4 == 0) | ||
55 | adcx r8, rax | ||
56 | adox r9, rbx | ||
57 | .elseif ((\arg1 + \arg2) % 4 == 1) | ||
58 | adcx r9, rax | ||
59 | adox r10, rbx | ||
60 | .elseif ((\arg1 + \arg2) % 4 == 2) | ||
61 | adcx r10, rax | ||
62 | adox r11, rbx | ||
63 | .elseif ((\arg1 + \arg2) % 4 == 3) | ||
64 | adcx r11, rax | ||
65 | adox r8, rbx | ||
66 | .endif | ||
67 | |||
68 | .endm | ||
69 | |||
70 | |||
71 | // Add in the whole j'th row, j = arg1: load rdx := y[j], reset the zero register and both carry chains (CF and OF), store window slot j % 4 to z[j] after the first product, then reuse that register for the high word of x[3] * y[j]; the last two adds of zero fold the final OF and CF carries into it | ||
72 | |||
73 | .macro addrow arg1 | ||
74 | mov rdx, [y+8*\arg1] | ||
75 | xor zeroe, zeroe | ||
76 | |||
77 | mulpadd \arg1, 0 | ||
78 | |||
79 | .if (\arg1 % 4 == 0) | ||
80 | mov [z+8*\arg1],r8 | ||
81 | .elseif (\arg1 % 4 == 1) | ||
82 | mov [z+8*\arg1],r9 | ||
83 | .elseif (\arg1 % 4 == 2) | ||
84 | mov [z+8*\arg1],r10 | ||
85 | .elseif (\arg1 % 4 == 3) | ||
86 | mov [z+8*\arg1],r11 | ||
87 | .endif | ||
88 | |||
89 | mulpadd \arg1, 1 | ||
90 | mulpadd \arg1, 2 | ||
91 | |||
92 | .if (\arg1 % 4 == 0) | ||
93 | mulx r8, rax, [x+24] | ||
94 | adcx r11, rax | ||
95 | adox r8, zero | ||
96 | adcx r8, zero | ||
97 | .elseif (\arg1 % 4 == 1) | ||
98 | mulx r9, rax, [x+24] | ||
99 | adcx r8, rax | ||
100 | adox r9, zero | ||
101 | adcx r9, zero | ||
102 | .elseif (\arg1 % 4 == 2) | ||
103 | mulx r10, rax, [x+24] | ||
104 | adcx r9, rax | ||
105 | adox r10, zero | ||
106 | adcx r10, zero | ||
107 | .elseif (\arg1 % 4 == 3) | ||
108 | mulx r11, rax, [x+24] | ||
109 | adcx r10, rax | ||
110 | adox r11, zero | ||
111 | adcx r11, zero | ||
112 | .endif | ||
113 | |||
114 | .endm | ||
115 | |||
116 | |||
117 | |||
118 | S2N_BN_SYMBOL(bignum_mul_4_8): | ||
119 | _CET_ENDBR | ||
120 | |||
121 | #if WINDOWS_ABI | ||
122 | push rdi | ||
123 | push rsi | ||
124 | mov rdi, rcx | ||
125 | mov rsi, rdx | ||
126 | mov rdx, r8 | ||
127 | #endif | ||
128 | |||
129 | // Save rbp and rbx, which are used below as the zero register and as a scratch register for products | ||
130 | |||
131 | push rbp | ||
132 | push rbx | ||
133 | |||
134 | // Copy y into a safe register to start with (rdx is needed as the implicit mulx multiplicand) | ||
135 | |||
136 | mov y, rdx | ||
137 | |||
138 | // Zero a register, which also makes sure we don't get a fake carry-in (the xor clears CF) | ||
139 | |||
140 | xor zeroe, zeroe | ||
141 | |||
142 | // Do the zeroth row, which is a bit different | ||
143 | // Write back the low word of x[0] * y[0] to z[0] and then accumulate | ||
144 | // columns 1..4 of y[0] * x in r9, r10, r11, r8 respectively | ||
145 | |||
146 | mov rdx, [y] | ||
147 | |||
148 | mulx r9, r8, [x] | ||
149 | mov [z], r8 | ||
150 | |||
151 | mulx r10, rbx, [x+8] | ||
152 | adcx r9, rbx | ||
153 | |||
154 | mulx r11, rbx, [x+16] | ||
155 | adcx r10, rbx | ||
156 | |||
157 | mulx r8, rbx, [x+24] | ||
158 | adcx r11, rbx | ||
159 | adcx r8, zero | ||
160 | |||
161 | // Now all the other rows in a uniform pattern; each addrow j also stores column z[j] | ||
162 | |||
163 | addrow 1 | ||
164 | addrow 2 | ||
165 | addrow 3 | ||
166 | |||
167 | // Now write back the additional columns 4..7, left in r8, r9, r10, r11 after the last row | ||
168 | |||
169 | mov [z+32], r8 | ||
170 | mov [z+40], r9 | ||
171 | mov [z+48], r10 | ||
172 | mov [z+56], r11 | ||
173 | |||
174 | // Restore registers and return | ||
175 | |||
176 | pop rbx | ||
177 | pop rbp | ||
178 | |||
179 | #if WINDOWS_ABI | ||
180 | pop rsi | ||
181 | pop rdi | ||
182 | #endif | ||
183 | ret | ||
184 | |||
185 | #if defined(__linux__) && defined(__ELF__) | ||
186 | .section .note.GNU-stack,"",%progbits | ||
187 | #endif | ||
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8_alt.S index 70ff69e372..2592d1d658 100644 --- a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8_alt.S +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8_alt.S | |||
@@ -1,3 +1,5 @@ | |||
1 | // $OpenBSD: bignum_mul_4_8_alt.S,v 1.7 2025/08/11 14:13:56 jsing Exp $ | ||
2 | // | ||
1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | 3 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. |
2 | // | 4 | // |
3 | // Permission to use, copy, modify, and/or distribute this software for any | 5 | // Permission to use, copy, modify, and/or distribute this software for any |
@@ -16,8 +18,8 @@ | |||
16 | // Multiply z := x * y | 18 | // Multiply z := x * y |
17 | // Inputs x[4], y[4]; output z[8] | 19 | // Inputs x[4], y[4]; output z[8] |
18 | // | 20 | // |
19 | // extern void bignum_mul_4_8_alt | 21 | // extern void bignum_mul_4_8_alt(uint64_t z[static 8], const uint64_t x[static 4], |
20 | // (uint64_t z[static 8], uint64_t x[static 4], uint64_t y[static 4]); | 22 | // const uint64_t y[static 4]); |
21 | // | 23 | // |
22 | // Standard x86-64 ABI: RDI = z, RSI = x, RDX = y | 24 | // Standard x86-64 ABI: RDI = z, RSI = x, RDX = y |
23 | // Microsoft x64 ABI: RCX = z, RDX = x, R8 = y | 25 | // Microsoft x64 ABI: RCX = z, RDX = x, R8 = y |
@@ -72,7 +74,7 @@ | |||
72 | adc h, rdx | 74 | adc h, rdx |
73 | 75 | ||
74 | S2N_BN_SYMBOL(bignum_mul_4_8_alt): | 76 | S2N_BN_SYMBOL(bignum_mul_4_8_alt): |
75 | _CET_ENDBR | 77 | _CET_ENDBR |
76 | 78 | ||
77 | #if WINDOWS_ABI | 79 | #if WINDOWS_ABI |
78 | push rdi | 80 | push rdi |
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12.S new file mode 100644 index 0000000000..56cbdf06e0 --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12.S | |||
@@ -0,0 +1,223 @@ | |||
1 | // $OpenBSD: bignum_mul_6_12.S,v 1.4 2025/08/12 10:23:40 jsing Exp $ | ||
2 | // | ||
3 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
4 | // | ||
5 | // Permission to use, copy, modify, and/or distribute this software for any | ||
6 | // purpose with or without fee is hereby granted, provided that the above | ||
7 | // copyright notice and this permission notice appear in all copies. | ||
8 | // | ||
9 | // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES | ||
10 | // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF | ||
11 | // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR | ||
12 | // ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES | ||
13 | // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN | ||
14 | // ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF | ||
15 | // OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | ||
16 | |||
17 | // ---------------------------------------------------------------------------- | ||
18 | // Multiply z := x * y | ||
19 | // Inputs x[6], y[6]; output z[12] | ||
20 | // | ||
21 | // extern void bignum_mul_6_12(uint64_t z[static 12], const uint64_t x[static 6], | ||
22 | // const uint64_t y[static 6]); | ||
23 | // | ||
24 | // Standard x86-64 ABI: RDI = z, RSI = x, RDX = y | ||
25 | // Microsoft x64 ABI: RCX = z, RDX = x, R8 = y | ||
26 | // ---------------------------------------------------------------------------- | ||
27 | |||
28 | #include "s2n_bignum_internal.h" | ||
29 | |||
30 | .intel_syntax noprefix | ||
31 | S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_6_12) | ||
32 | S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_6_12) | ||
33 | .text | ||
34 | |||
35 | // z and x stay in their standard x86-64 ABI argument registers | ||
36 | |||
37 | #define z rdi | ||
38 | #define x rsi | ||
39 | |||
40 | // y is copied out of rdx, which mulx uses as its implicit multiplicand | ||
41 | |||
42 | #define y rcx | ||
43 | |||
44 | // A zero register | ||
45 | |||
46 | #define zero rbp | ||
47 | #define zeroe ebp | ||
48 | |||
49 | // mulpadd j, i: add x[i] * rdx into window slots (i+j) % 6 and (i+j+1) % 6, low word on the adcx (CF) chain and high word on the adox (OF) chain | ||
50 | // The .if ladder stands in for indexed register names reg[i], reg[i+1] ... | ||
51 | |||
52 | .macro mulpadd arg1,arg2 | ||
53 | mulx rbx, rax, [x+8*\arg2] | ||
54 | .if ((\arg1 + \arg2) % 6 == 0) | ||
55 | adcx r8, rax | ||
56 | adox r9, rbx | ||
57 | .elseif ((\arg1 + \arg2) % 6 == 1) | ||
58 | adcx r9, rax | ||
59 | adox r10, rbx | ||
60 | .elseif ((\arg1 + \arg2) % 6 == 2) | ||
61 | adcx r10, rax | ||
62 | adox r11, rbx | ||
63 | .elseif ((\arg1 + \arg2) % 6 == 3) | ||
64 | adcx r11, rax | ||
65 | adox r12, rbx | ||
66 | .elseif ((\arg1 + \arg2) % 6 == 4) | ||
67 | adcx r12, rax | ||
68 | adox r13, rbx | ||
69 | .elseif ((\arg1 + \arg2) % 6 == 5) | ||
70 | adcx r13, rax | ||
71 | adox r8, rbx | ||
72 | .endif | ||
73 | |||
74 | .endm | ||
75 | |||
76 | |||
77 | // addrow j: add y[j] * x into the window, storing the finished low word to z[j] | ||
78 | |||
79 | .macro addrow arg1 | ||
80 | mov rdx, [y+8*\arg1] | ||
81 | xor zeroe, zeroe | ||
82 | |||
83 | mulpadd \arg1, 0 | ||
84 | |||
85 | .if (\arg1 % 6 == 0) | ||
86 | mov [z+8*\arg1],r8 | ||
87 | .elseif (\arg1 % 6 == 1) | ||
88 | mov [z+8*\arg1],r9 | ||
89 | .elseif (\arg1 % 6 == 2) | ||
90 | mov [z+8*\arg1],r10 | ||
91 | .elseif (\arg1 % 6 == 3) | ||
92 | mov [z+8*\arg1],r11 | ||
93 | .elseif (\arg1 % 6 == 4) | ||
94 | mov [z+8*\arg1],r12 | ||
95 | .elseif (\arg1 % 6 == 5) | ||
96 | mov [z+8*\arg1],r13 | ||
97 | .endif | ||
98 | |||
99 | mulpadd \arg1, 1 | ||
100 | mulpadd \arg1, 2 | ||
101 | mulpadd \arg1, 3 | ||
102 | mulpadd \arg1, 4 | ||
103 | |||
104 | .if (\arg1 % 6 == 0) | ||
105 | mulx r8, rax, [x+40] | ||
106 | adcx r13, rax | ||
107 | adox r8, zero | ||
108 | adcx r8, zero | ||
109 | .elseif (\arg1 % 6 == 1) | ||
110 | mulx r9, rax, [x+40] | ||
111 | adcx r8, rax | ||
112 | adox r9, zero | ||
113 | adcx r9, zero | ||
114 | .elseif (\arg1 % 6 == 2) | ||
115 | mulx r10, rax, [x+40] | ||
116 | adcx r9, rax | ||
117 | adox r10, zero | ||
118 | adcx r10, zero | ||
119 | .elseif (\arg1 % 6 == 3) | ||
120 | mulx r11, rax, [x+40] | ||
121 | adcx r10, rax | ||
122 | adox r11, zero | ||
123 | adcx r11, zero | ||
124 | .elseif (\arg1 % 6 == 4) | ||
125 | mulx r12, rax, [x+40] | ||
126 | adcx r11, rax | ||
127 | adox r12, zero | ||
128 | adcx r12, zero | ||
129 | .elseif (\arg1 % 6 == 5) | ||
130 | mulx r13, rax, [x+40] | ||
131 | adcx r12, rax | ||
132 | adox r13, zero | ||
133 | adcx r13, zero | ||
134 | .endif | ||
135 | |||
136 | .endm | ||
137 | |||
138 | |||
139 | |||
140 | S2N_BN_SYMBOL(bignum_mul_6_12): | ||
141 | _CET_ENDBR | ||
142 | |||
143 | #if WINDOWS_ABI | ||
144 | push rdi | ||
145 | push rsi | ||
146 | mov rdi, rcx | ||
147 | mov rsi, rdx | ||
148 | mov rdx, r8 | ||
149 | #endif | ||
150 | |||
151 | // Save the callee-saved registers used as scratch and window | ||
152 | |||
153 | push rbp | ||
154 | push rbx | ||
155 | push r12 | ||
156 | push r13 | ||
157 | |||
158 | // Move y out of rdx, which mulx needs as its implicit multiplicand | ||
159 | |||
160 | mov y, rdx | ||
161 | |||
162 | // Zero a register; the xor also clears CF and OF so no stale carry enters | ||
163 | |||
164 | xor zeroe, zeroe | ||
165 | |||
166 | // Do the zeroth row, which is a bit different | ||
167 | // Write back the low word of x[0] * y[0] and then accumulate | ||
168 | // words 1..6 of y[0] * x in r9,r10,r11,r12,r13,r8 (low to high) | ||
169 | |||
170 | mov rdx, [y] | ||
171 | |||
172 | mulx r9, r8, [x] | ||
173 | mov [z], r8 | ||
174 | |||
175 | mulx r10, rbx, [x+8] | ||
176 | adcx r9, rbx | ||
177 | |||
178 | mulx r11, rbx, [x+16] | ||
179 | adcx r10, rbx | ||
180 | |||
181 | mulx r12, rbx, [x+24] | ||
182 | adcx r11, rbx | ||
183 | |||
184 | mulx r13, rbx, [x+32] | ||
185 | adcx r12, rbx | ||
186 | |||
187 | mulx r8, rbx, [x+40] | ||
188 | adcx r13, rbx | ||
189 | adcx r8, zero | ||
190 | |||
191 | // Now rows 1..5 in a uniform pattern, each storing one more low word of z | ||
192 | |||
193 | addrow 1 | ||
194 | addrow 2 | ||
195 | addrow 3 | ||
196 | addrow 4 | ||
197 | addrow 5 | ||
198 | |||
199 | // Now write back the upper six words z[6..11] from the window | ||
200 | |||
201 | mov [z+48], r8 | ||
202 | mov [z+56], r9 | ||
203 | mov [z+64], r10 | ||
204 | mov [z+72], r11 | ||
205 | mov [z+80], r12 | ||
206 | mov [z+88], r13 | ||
207 | |||
208 | // Restore registers and return | ||
209 | |||
210 | pop r13 | ||
211 | pop r12 | ||
212 | pop rbx | ||
213 | pop rbp | ||
214 | |||
215 | #if WINDOWS_ABI | ||
216 | pop rsi | ||
217 | pop rdi | ||
218 | #endif | ||
219 | ret | ||
221 | #if defined(__linux__) && defined(__ELF__) | ||
222 | .section .note.GNU-stack,"",%progbits | ||
223 | #endif | ||
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12_alt.S new file mode 100644 index 0000000000..077c52b38e --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12_alt.S | |||
@@ -0,0 +1,199 @@ | |||
1 | // $OpenBSD: bignum_mul_6_12_alt.S,v 1.4 2025/08/12 10:23:40 jsing Exp $ | ||
2 | // | ||
3 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
4 | // | ||
5 | // Permission to use, copy, modify, and/or distribute this software for any | ||
6 | // purpose with or without fee is hereby granted, provided that the above | ||
7 | // copyright notice and this permission notice appear in all copies. | ||
8 | // | ||
9 | // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES | ||
10 | // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF | ||
11 | // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR | ||
12 | // ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES | ||
13 | // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN | ||
14 | // ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF | ||
15 | // OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | ||
16 | |||
17 | // ---------------------------------------------------------------------------- | ||
18 | // Multiply z := x * y | ||
19 | // Inputs x[6], y[6]; output z[12] | ||
20 | // | ||
21 | // extern void bignum_mul_6_12_alt(uint64_t z[static 12], | ||
22 | // const uint64_t x[static 6], | ||
23 | // const uint64_t y[static 6]); | ||
24 | // | ||
25 | // Standard x86-64 ABI: RDI = z, RSI = x, RDX = y | ||
26 | // Microsoft x64 ABI: RCX = z, RDX = x, R8 = y | ||
27 | // ---------------------------------------------------------------------------- | ||
28 | |||
29 | #include "s2n_bignum_internal.h" | ||
30 | |||
31 | .intel_syntax noprefix | ||
32 | S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_6_12_alt) | ||
33 | S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_6_12_alt) | ||
34 | .text | ||
35 | |||
36 | // z and x stay in their standard x86-64 ABI argument registers | ||
37 | |||
38 | #define z rdi | ||
39 | #define x rsi | ||
40 | |||
41 | // This is moved from rdx to free it for muls | ||
42 | |||
43 | #define y rcx | ||
44 | |||
45 | // Other variables used as a rotating 3-word window to add terms to | ||
46 | |||
47 | #define t0 r8 | ||
48 | #define t1 r9 | ||
49 | #define t2 r10 | ||
50 | |||
51 | // (c,h,l) += numa * numb as a 3-word accumulator; clobbers rax and rdx | ||
52 | |||
53 | #define combadd(c,h,l,numa,numb) \ | ||
54 | mov rax, numa; \ | ||
55 | mul QWORD PTR numb; \ | ||
56 | add l, rax; \ | ||
57 | adc h, rdx; \ | ||
58 | adc c, 0 | ||
59 | |||
60 | // A minutely shorter form for when c = 0 initially, so adc c, c just captures the carry | ||
61 | |||
62 | #define combadz(c,h,l,numa,numb) \ | ||
63 | mov rax, numa; \ | ||
64 | mul QWORD PTR numb; \ | ||
65 | add l, rax; \ | ||
66 | adc h, rdx; \ | ||
67 | adc c, c | ||
68 | |||
69 | // A two-word form (h,l) += numa * numb for when we don't expect a carry out of h | ||
70 | |||
71 | #define combads(h,l,numa,numb) \ | ||
72 | mov rax, numa; \ | ||
73 | mul QWORD PTR numb; \ | ||
74 | add l, rax; \ | ||
75 | adc h, rdx | ||
76 | |||
77 | S2N_BN_SYMBOL(bignum_mul_6_12_alt): | ||
78 | _CET_ENDBR | ||
79 | |||
80 | #if WINDOWS_ABI | ||
81 | push rdi | ||
82 | push rsi | ||
83 | mov rdi, rcx | ||
84 | mov rsi, rdx | ||
85 | mov rdx, r8 | ||
86 | #endif | ||
87 | |||
88 | // Move y out of rdx, which mul overwrites with the high product word | ||
89 | |||
90 | mov y, rdx | ||
91 | |||
92 | // Result term 0: x[0] * y[0], low word to z[0], high word carried in t0 | ||
93 | |||
94 | mov rax, [x] | ||
95 | mul QWORD PTR [y] | ||
96 | |||
97 | mov [z], rax | ||
98 | mov t0, rdx | ||
99 | xor t1, t1 | ||
100 | |||
101 | // Result term 1: x[0] * y[1] + x[1] * y[0] | ||
102 | |||
103 | xor t2, t2 | ||
104 | combads(t1,t0,[x],[y+8]) | ||
105 | combadz(t2,t1,t0,[x+8],[y]) | ||
106 | mov [z+8], t0 | ||
107 | |||
108 | // Result term 2 | ||
109 | |||
110 | xor t0, t0 | ||
111 | combadz(t0,t2,t1,[x],[y+16]) | ||
112 | combadd(t0,t2,t1,[x+8],[y+8]) | ||
113 | combadd(t0,t2,t1,[x+16],[y]) | ||
114 | mov [z+16], t1 | ||
115 | |||
116 | // Result term 3 | ||
117 | |||
118 | xor t1, t1 | ||
119 | combadz(t1,t0,t2,[x],[y+24]) | ||
120 | combadd(t1,t0,t2,[x+8],[y+16]) | ||
121 | combadd(t1,t0,t2,[x+16],[y+8]) | ||
122 | combadd(t1,t0,t2,[x+24],[y]) | ||
123 | mov [z+24], t2 | ||
124 | |||
125 | // Result term 4 | ||
126 | |||
127 | xor t2, t2 | ||
128 | combadz(t2,t1,t0,[x],[y+32]) | ||
129 | combadd(t2,t1,t0,[x+8],[y+24]) | ||
130 | combadd(t2,t1,t0,[x+16],[y+16]) | ||
131 | combadd(t2,t1,t0,[x+24],[y+8]) | ||
132 | combadd(t2,t1,t0,[x+32],[y]) | ||
133 | mov [z+32], t0 | ||
134 | |||
135 | // Result term 5 | ||
136 | |||
137 | xor t0, t0 | ||
138 | combadz(t0,t2,t1,[x],[y+40]) | ||
139 | combadd(t0,t2,t1,[x+8],[y+32]) | ||
140 | combadd(t0,t2,t1,[x+16],[y+24]) | ||
141 | combadd(t0,t2,t1,[x+24],[y+16]) | ||
142 | combadd(t0,t2,t1,[x+32],[y+8]) | ||
143 | combadd(t0,t2,t1,[x+40],[y]) | ||
144 | mov [z+40], t1 | ||
145 | |||
146 | // Result term 6 | ||
147 | |||
148 | xor t1, t1 | ||
149 | combadz(t1,t0,t2,[x+8],[y+40]) | ||
150 | combadd(t1,t0,t2,[x+16],[y+32]) | ||
151 | combadd(t1,t0,t2,[x+24],[y+24]) | ||
152 | combadd(t1,t0,t2,[x+32],[y+16]) | ||
153 | combadd(t1,t0,t2,[x+40],[y+8]) | ||
154 | mov [z+48], t2 | ||
155 | |||
156 | // Result term 7 | ||
157 | |||
158 | xor t2, t2 | ||
159 | combadz(t2,t1,t0,[x+16],[y+40]) | ||
160 | combadd(t2,t1,t0,[x+24],[y+32]) | ||
161 | combadd(t2,t1,t0,[x+32],[y+24]) | ||
162 | combadd(t2,t1,t0,[x+40],[y+16]) | ||
163 | mov [z+56], t0 | ||
164 | |||
165 | // Result term 8 | ||
166 | |||
167 | xor t0, t0 | ||
168 | combadz(t0,t2,t1,[x+24],[y+40]) | ||
169 | combadd(t0,t2,t1,[x+32],[y+32]) | ||
170 | combadd(t0,t2,t1,[x+40],[y+24]) | ||
171 | mov [z+64], t1 | ||
172 | |||
173 | // Result term 9 | ||
174 | |||
175 | xor t1, t1 | ||
176 | combadz(t1,t0,t2,[x+32],[y+40]) | ||
177 | combadd(t1,t0,t2,[x+40],[y+32]) | ||
178 | mov [z+72], t2 | ||
179 | |||
180 | // Result term 10: only x[5] * y[5] remains to add | ||
181 | |||
182 | combads(t1,t0,[x+40],[y+40]) | ||
183 | mov [z+80], t0 | ||
184 | |||
185 | // Result term 11: the remaining top word of the accumulator | ||
186 | |||
187 | mov [z+88], t1 | ||
188 | |||
189 | // Return | ||
190 | |||
191 | #if WINDOWS_ABI | ||
192 | pop rsi | ||
193 | pop rdi | ||
194 | #endif | ||
195 | ret | ||
196 | |||
197 | #if defined(__linux__) && defined(__ELF__) | ||
198 | .section .note.GNU-stack,"",%progbits | ||
199 | #endif | ||
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16.S new file mode 100644 index 0000000000..faa0196d8e --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16.S | |||
@@ -0,0 +1,273 @@ | |||
1 | // $OpenBSD: bignum_mul_8_16.S,v 1.4 2025/08/12 10:23:40 jsing Exp $ | ||
2 | // | ||
3 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
4 | // | ||
5 | // Permission to use, copy, modify, and/or distribute this software for any | ||
6 | // purpose with or without fee is hereby granted, provided that the above | ||
7 | // copyright notice and this permission notice appear in all copies. | ||
8 | // | ||
9 | // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES | ||
10 | // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF | ||
11 | // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR | ||
12 | // ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES | ||
13 | // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN | ||
14 | // ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF | ||
15 | // OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | ||
16 | |||
17 | // ---------------------------------------------------------------------------- | ||
18 | // Multiply z := x * y | ||
19 | // Inputs x[8], y[8]; output z[16] | ||
20 | // | ||
21 | // extern void bignum_mul_8_16(uint64_t z[static 16], const uint64_t x[static 8], | ||
22 | // const uint64_t y[static 8]); | ||
23 | // | ||
24 | // Standard x86-64 ABI: RDI = z, RSI = x, RDX = y | ||
25 | // Microsoft x64 ABI: RCX = z, RDX = x, R8 = y | ||
26 | // ---------------------------------------------------------------------------- | ||
27 | |||
28 | #include "s2n_bignum_internal.h" | ||
29 | |||
30 | .intel_syntax noprefix | ||
31 | S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_8_16) | ||
32 | S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_8_16) | ||
33 | .text | ||
34 | |||
35 | // z and x stay in their standard x86-64 ABI argument registers | ||
36 | |||
37 | #define z rdi | ||
38 | #define x rsi | ||
39 | |||
40 | // y is copied out of rdx, which mulx uses as its implicit multiplicand | ||
41 | |||
42 | #define y rcx | ||
43 | |||
44 | // A zero register | ||
45 | |||
46 | #define zero rbp | ||
47 | #define zeroe ebp | ||
48 | |||
49 | // mulpadd i, j adds x[i] * rdx (now assumed = y[j]) into window slots (i+j) % 8 and (i+j+1) % 8: low word on the adcx (CF) chain, high word on the adox (OF) chain | ||
50 | |||
51 | .macro mulpadd arg1,arg2 | ||
52 | mulx rbx, rax, [x+8*\arg1] | ||
53 | .if ((\arg1 + \arg2) % 8 == 0) | ||
54 | adcx r8, rax | ||
55 | adox r9, rbx | ||
56 | .elseif ((\arg1 + \arg2) % 8 == 1) | ||
57 | adcx r9, rax | ||
58 | adox r10, rbx | ||
59 | .elseif ((\arg1 + \arg2) % 8 == 2) | ||
60 | adcx r10, rax | ||
61 | adox r11, rbx | ||
62 | .elseif ((\arg1 + \arg2) % 8 == 3) | ||
63 | adcx r11, rax | ||
64 | adox r12, rbx | ||
65 | .elseif ((\arg1 + \arg2) % 8 == 4) | ||
66 | adcx r12, rax | ||
67 | adox r13, rbx | ||
68 | .elseif ((\arg1 + \arg2) % 8 == 5) | ||
69 | adcx r13, rax | ||
70 | adox r14, rbx | ||
71 | .elseif ((\arg1 + \arg2) % 8 == 6) | ||
72 | adcx r14, rax | ||
73 | adox r15, rbx | ||
74 | .elseif ((\arg1 + \arg2) % 8 == 7) | ||
75 | adcx r15, rax | ||
76 | adox r8, rbx | ||
77 | .endif | ||
78 | |||
79 | .endm | ||
80 | |||
81 | // mulpade i, j adds x[i] * rdx (now assumed = y[j]) into the window at i+j | ||
82 | // but overwrites slot (i+j+1) % 8 with the high word, then folds the OF carry into it | ||
83 | |||
84 | .macro mulpade arg1,arg2 | ||
85 | .if ((\arg1 + \arg2) % 8 == 0) | ||
86 | mulx r9, rax, [x+8*\arg1] | ||
87 | adcx r8, rax | ||
88 | adox r9, zero | ||
89 | .elseif ((\arg1 + \arg2) % 8 == 1) | ||
90 | mulx r10, rax, [x+8*\arg1] | ||
91 | adcx r9, rax | ||
92 | adox r10, zero | ||
93 | .elseif ((\arg1 + \arg2) % 8 == 2) | ||
94 | mulx r11, rax, [x+8*\arg1] | ||
95 | adcx r10, rax | ||
96 | adox r11, zero | ||
97 | .elseif ((\arg1 + \arg2) % 8 == 3) | ||
98 | mulx r12, rax, [x+8*\arg1] | ||
99 | adcx r11, rax | ||
100 | adox r12, zero | ||
101 | .elseif ((\arg1 + \arg2) % 8 == 4) | ||
102 | mulx r13, rax, [x+8*\arg1] | ||
103 | adcx r12, rax | ||
104 | adox r13, zero | ||
105 | .elseif ((\arg1 + \arg2) % 8 == 5) | ||
106 | mulx r14, rax, [x+8*\arg1] | ||
107 | adcx r13, rax | ||
108 | adox r14, zero | ||
109 | .elseif ((\arg1 + \arg2) % 8 == 6) | ||
110 | mulx r15, rax, [x+8*\arg1] | ||
111 | adcx r14, rax | ||
112 | adox r15, zero | ||
113 | .elseif ((\arg1 + \arg2) % 8 == 7) | ||
114 | mulx r8, rax, [x+8*\arg1] | ||
115 | adcx r15, rax | ||
116 | adox r8, zero | ||
117 | .endif | ||
118 | |||
119 | .endm | ||
120 | |||
121 | // addrow j: add y[j] * x into the window, storing the finished low word to z[j] | ||
122 | |||
123 | .macro addrow arg1 | ||
124 | mov rdx, [y+8*\arg1] | ||
125 | xor zeroe, zeroe | ||
126 | |||
127 | mulpadd 0, \arg1 | ||
128 | |||
129 | .if (\arg1 % 8 == 0) | ||
130 | mov [z+8*\arg1],r8 | ||
131 | .elseif (\arg1 % 8 == 1) | ||
132 | mov [z+8*\arg1],r9 | ||
133 | .elseif (\arg1 % 8 == 2) | ||
134 | mov [z+8*\arg1],r10 | ||
135 | .elseif (\arg1 % 8 == 3) | ||
136 | mov [z+8*\arg1],r11 | ||
137 | .elseif (\arg1 % 8 == 4) | ||
138 | mov [z+8*\arg1],r12 | ||
139 | .elseif (\arg1 % 8 == 5) | ||
140 | mov [z+8*\arg1],r13 | ||
141 | .elseif (\arg1 % 8 == 6) | ||
142 | mov [z+8*\arg1],r14 | ||
143 | .elseif (\arg1 % 8 == 7) | ||
144 | mov [z+8*\arg1],r15 | ||
145 | .endif | ||
146 | |||
147 | mulpadd 1, \arg1 | ||
148 | mulpadd 2, \arg1 | ||
149 | mulpadd 3, \arg1 | ||
150 | mulpadd 4, \arg1 | ||
151 | mulpadd 5, \arg1 | ||
152 | mulpadd 6, \arg1 | ||
153 | mulpade 7, \arg1 | ||
154 | |||
155 | .if (\arg1 % 8 == 0) | ||
156 | adc r8, zero | ||
157 | .elseif (\arg1 % 8 == 1) | ||
158 | adc r9, zero | ||
159 | .elseif (\arg1 % 8 == 2) | ||
160 | adc r10, zero | ||
161 | .elseif (\arg1 % 8 == 3) | ||
162 | adc r11, zero | ||
163 | .elseif (\arg1 % 8 == 4) | ||
164 | adc r12, zero | ||
165 | .elseif (\arg1 % 8 == 5) | ||
166 | adc r13, zero | ||
167 | .elseif (\arg1 % 8 == 6) | ||
168 | adc r14, zero | ||
169 | .elseif (\arg1 % 8 == 7) | ||
170 | adc r15, zero | ||
171 | .endif | ||
172 | |||
173 | .endm | ||
174 | |||
175 | |||
176 | S2N_BN_SYMBOL(bignum_mul_8_16): | ||
177 | _CET_ENDBR | ||
178 | |||
179 | #if WINDOWS_ABI | ||
180 | push rdi | ||
181 | push rsi | ||
182 | mov rdi, rcx | ||
183 | mov rsi, rdx | ||
184 | mov rdx, r8 | ||
185 | #endif | ||
186 | |||
187 | // Save the callee-saved registers used as scratch and window | ||
188 | |||
189 | push rbp | ||
190 | push rbx | ||
191 | push r12 | ||
192 | push r13 | ||
193 | push r14 | ||
194 | push r15 | ||
195 | |||
196 | // Move y out of rdx, which mulx needs as its implicit multiplicand | ||
197 | |||
198 | mov y, rdx | ||
199 | |||
200 | // Zero a register; the xor also clears CF and OF so no stale carry enters | ||
201 | |||
202 | xor zeroe, zeroe | ||
203 | |||
204 | // Do the zeroth row, which is a bit different | ||
205 | // Write back the low word of x[0] * y[0] and then accumulate | ||
206 | // words 1..8 of y[0] * x in r9,r10,r11,r12,r13,r14,r15,r8 (low to high) | ||
207 | |||
208 | mov rdx, [y] | ||
209 | |||
210 | mulx r9, r8, [x] | ||
211 | mov [z], r8 | ||
212 | |||
213 | mulx r10, rbx, [x+8] | ||
214 | adc r9, rbx | ||
215 | |||
216 | mulx r11, rbx, [x+16] | ||
217 | adc r10, rbx | ||
218 | |||
219 | mulx r12, rbx, [x+24] | ||
220 | adc r11, rbx | ||
221 | |||
222 | mulx r13, rbx, [x+32] | ||
223 | adc r12, rbx | ||
224 | |||
225 | mulx r14, rbx, [x+40] | ||
226 | adc r13, rbx | ||
227 | |||
228 | mulx r15, rbx, [x+48] | ||
229 | adc r14, rbx | ||
230 | |||
231 | mulx r8, rbx, [x+56] | ||
232 | adc r15, rbx | ||
233 | adc r8, zero | ||
234 | |||
235 | // Now rows 1..7 in a uniform pattern, each storing one more low word of z | ||
236 | |||
237 | addrow 1 | ||
238 | addrow 2 | ||
239 | addrow 3 | ||
240 | addrow 4 | ||
241 | addrow 5 | ||
242 | addrow 6 | ||
243 | addrow 7 | ||
244 | |||
245 | // Now write back the upper eight words z[8..15] from the window | ||
246 | |||
247 | mov [z+64], r8 | ||
248 | mov [z+72], r9 | ||
249 | mov [z+80], r10 | ||
250 | mov [z+88], r11 | ||
251 | mov [z+96], r12 | ||
252 | mov [z+104], r13 | ||
253 | mov [z+112], r14 | ||
254 | mov [z+120], r15 | ||
255 | |||
256 | // Restore registers and return | ||
257 | |||
258 | pop r15 | ||
259 | pop r14 | ||
260 | pop r13 | ||
261 | pop r12 | ||
262 | pop rbx | ||
263 | pop rbp | ||
264 | |||
265 | #if WINDOWS_ABI | ||
266 | pop rsi | ||
267 | pop rdi | ||
268 | #endif | ||
269 | ret | ||
270 | |||
271 | #if defined(__linux__) && defined(__ELF__) | ||
272 | .section .note.GNU-stack,"",%progbits | ||
273 | #endif | ||
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16_alt.S index 066403b074..0e30b9170f 100644 --- a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16_alt.S +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16_alt.S | |||
@@ -1,3 +1,5 @@ | |||
1 | // $OpenBSD: bignum_mul_8_16_alt.S,v 1.7 2025/08/11 14:13:56 jsing Exp $ | ||
2 | // | ||
1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | 3 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. |
2 | // | 4 | // |
3 | // Permission to use, copy, modify, and/or distribute this software for any | 5 | // Permission to use, copy, modify, and/or distribute this software for any |
@@ -16,8 +18,9 @@ | |||
16 | // Multiply z := x * y | 18 | // Multiply z := x * y |
17 | // Inputs x[8], y[8]; output z[16] | 19 | // Inputs x[8], y[8]; output z[16] |
18 | // | 20 | // |
19 | // extern void bignum_mul_8_16_alt | 21 | // extern void bignum_mul_8_16_alt(uint64_t z[static 16], |
20 | // (uint64_t z[static 16], uint64_t x[static 8], uint64_t y[static 8]); | 22 | // const uint64_t x[static 8], |
23 | // const uint64_t y[static 8]); | ||
21 | // | 24 | // |
22 | // Standard x86-64 ABI: RDI = z, RSI = x, RDX = y | 25 | // Standard x86-64 ABI: RDI = z, RSI = x, RDX = y |
23 | // Microsoft x64 ABI: RCX = z, RDX = x, R8 = y | 26 | // Microsoft x64 ABI: RCX = z, RDX = x, R8 = y |
@@ -72,7 +75,7 @@ | |||
72 | adc h, rdx | 75 | adc h, rdx |
73 | 76 | ||
74 | S2N_BN_SYMBOL(bignum_mul_8_16_alt): | 77 | S2N_BN_SYMBOL(bignum_mul_8_16_alt): |
75 | _CET_ENDBR | 78 | _CET_ENDBR |
76 | 79 | ||
77 | #if WINDOWS_ABI | 80 | #if WINDOWS_ABI |
78 | push rdi | 81 | push rdi |
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr.S index 54e3f59442..86f1af2ac4 100644 --- a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr.S +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr.S | |||
@@ -1,3 +1,5 @@ | |||
1 | // $OpenBSD: bignum_sqr.S,v 1.7 2025/08/11 14:13:56 jsing Exp $ | ||
2 | // | ||
1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | 3 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. |
2 | // | 4 | // |
3 | // Permission to use, copy, modify, and/or distribute this software for any | 5 | // Permission to use, copy, modify, and/or distribute this software for any |
@@ -16,8 +18,7 @@ | |||
16 | // Square z := x^2 | 18 | // Square z := x^2 |
17 | // Input x[n]; output z[k] | 19 | // Input x[n]; output z[k] |
18 | // | 20 | // |
19 | // extern void bignum_sqr | 21 | // extern void bignum_sqr(uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x); |
20 | // (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x); | ||
21 | // | 22 | // |
22 | // Does the "z := x^2" operation where x is n digits and result z is k. | 23 | // Does the "z := x^2" operation where x is n digits and result z is k. |
23 | // Truncates the result in general unless k >= 2 * n | 24 | // Truncates the result in general unless k >= 2 * n |
@@ -62,7 +63,7 @@ | |||
62 | #define llshort ebp | 63 | #define llshort ebp |
63 | 64 | ||
64 | S2N_BN_SYMBOL(bignum_sqr): | 65 | S2N_BN_SYMBOL(bignum_sqr): |
65 | _CET_ENDBR | 66 | _CET_ENDBR |
66 | 67 | ||
67 | #if WINDOWS_ABI | 68 | #if WINDOWS_ABI |
68 | push rdi | 69 | push rdi |
@@ -86,7 +87,7 @@ S2N_BN_SYMBOL(bignum_sqr): | |||
86 | // If p = 0 the result is trivial and nothing needs doing | 87 | // If p = 0 the result is trivial and nothing needs doing |
87 | 88 | ||
88 | test p, p | 89 | test p, p |
89 | jz end | 90 | jz bignum_sqr_end |
90 | 91 | ||
91 | // initialize (hh,ll) = 0 | 92 | // initialize (hh,ll) = 0 |
92 | 93 | ||
@@ -97,7 +98,7 @@ S2N_BN_SYMBOL(bignum_sqr): | |||
97 | 98 | ||
98 | xor k, k | 99 | xor k, k |
99 | 100 | ||
100 | outerloop: | 101 | bignum_sqr_outerloop: |
101 | 102 | ||
102 | // First let bot = MAX 0 (k + 1 - n) and top = MIN (k + 1) n | 103 | // First let bot = MAX 0 (k + 1 - n) and top = MIN (k + 1) n |
103 | // We want to accumulate all x[i] * x[k - i] for bot <= i < top | 104 | // We want to accumulate all x[i] * x[k - i] for bot <= i < top |
@@ -122,7 +123,7 @@ outerloop: | |||
122 | // If htop <= bot then main doubled part of the sum is empty | 123 | // If htop <= bot then main doubled part of the sum is empty |
123 | 124 | ||
124 | cmp i, htop | 125 | cmp i, htop |
125 | jnc nosumming | 126 | jnc bignum_sqr_nosumming |
126 | 127 | ||
127 | // Use a moving pointer for [y] = x[k-i] for the cofactor | 128 | // Use a moving pointer for [y] = x[k-i] for the cofactor |
128 | 129 | ||
@@ -132,7 +133,7 @@ outerloop: | |||
132 | 133 | ||
133 | // Do the main part of the sum x[i] * x[k - i] for 2 * i < k | 134 | // Do the main part of the sum x[i] * x[k - i] for 2 * i < k |
134 | 135 | ||
135 | innerloop: | 136 | bignum_sqr_innerloop: |
136 | mov a, [x+8*i] | 137 | mov a, [x+8*i] |
137 | mul QWORD PTR [y] | 138 | mul QWORD PTR [y] |
138 | add l, a | 139 | add l, a |
@@ -141,7 +142,7 @@ innerloop: | |||
141 | sub y, 8 | 142 | sub y, 8 |
142 | inc i | 143 | inc i |
143 | cmp i, htop | 144 | cmp i, htop |
144 | jc innerloop | 145 | jc bignum_sqr_innerloop |
145 | 146 | ||
146 | // Now double it | 147 | // Now double it |
147 | 148 | ||
@@ -151,11 +152,11 @@ innerloop: | |||
151 | 152 | ||
152 | // If k is even (which means 2 * i = k) and i < n add the extra x[i]^2 term | 153 | // If k is even (which means 2 * i = k) and i < n add the extra x[i]^2 term |
153 | 154 | ||
154 | nosumming: | 155 | bignum_sqr_nosumming: |
155 | test k, 1 | 156 | test k, 1 |
156 | jnz innerend | 157 | jnz bignum_sqr_innerend |
157 | cmp i, n | 158 | cmp i, n |
158 | jnc innerend | 159 | jnc bignum_sqr_innerend |
159 | 160 | ||
160 | mov a, [x+8*i] | 161 | mov a, [x+8*i] |
161 | mul a | 162 | mul a |
@@ -165,7 +166,7 @@ nosumming: | |||
165 | 166 | ||
166 | // Now add the local sum into the global sum, store and shift | 167 | // Now add the local sum into the global sum, store and shift |
167 | 168 | ||
168 | innerend: | 169 | bignum_sqr_innerend: |
169 | add l, ll | 170 | add l, ll |
170 | mov [z+8*k], l | 171 | mov [z+8*k], l |
171 | adc h, hh | 172 | adc h, hh |
@@ -175,11 +176,11 @@ innerend: | |||
175 | 176 | ||
176 | inc k | 177 | inc k |
177 | cmp k, p | 178 | cmp k, p |
178 | jc outerloop | 179 | jc bignum_sqr_outerloop |
179 | 180 | ||
180 | // Restore registers and return | 181 | // Restore registers and return |
181 | 182 | ||
182 | end: | 183 | bignum_sqr_end: |
183 | pop r15 | 184 | pop r15 |
184 | pop r14 | 185 | pop r14 |
185 | pop r13 | 186 | pop r13 |
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8.S new file mode 100644 index 0000000000..25664782f7 --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8.S | |||
@@ -0,0 +1,158 @@ | |||
1 | // $OpenBSD: bignum_sqr_4_8.S,v 1.4 2025/08/12 10:23:40 jsing Exp $ | ||
2 | // | ||
3 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
4 | // | ||
5 | // Permission to use, copy, modify, and/or distribute this software for any | ||
6 | // purpose with or without fee is hereby granted, provided that the above | ||
7 | // copyright notice and this permission notice appear in all copies. | ||
8 | // | ||
9 | // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES | ||
10 | // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF | ||
11 | // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR | ||
12 | // ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES | ||
13 | // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN | ||
14 | // ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF | ||
15 | // OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | ||
16 | |||
17 | // ---------------------------------------------------------------------------- | ||
18 | // Square, z := x^2 | ||
19 | // Input x[4]; output z[8] | ||
20 | // | ||
21 | // extern void bignum_sqr_4_8(uint64_t z[static 8], const uint64_t x[static 4]); | ||
22 | // | ||
23 | // Standard x86-64 ABI: RDI = z, RSI = x | ||
24 | // Microsoft x64 ABI: RCX = z, RDX = x | ||
25 | // ---------------------------------------------------------------------------- | ||
26 | |||
27 | #include "s2n_bignum_internal.h" | ||
28 | |||
29 | .intel_syntax noprefix | ||
30 | S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_4_8) | ||
31 | S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_4_8) | ||
32 | .text | ||
33 | |||
34 | // z and x stay in their standard x86-64 ABI argument registers | ||
35 | |||
36 | #define z rdi | ||
37 | #define x rsi | ||
38 | |||
39 | // A zero register | ||
40 | |||
41 | #define zero rbp | ||
42 | #define zeroe ebp | ||
43 | |||
44 | // Other registers | ||
45 | |||
46 | #define d1 r8 | ||
47 | #define d2 r9 | ||
48 | #define d3 r10 | ||
49 | #define d4 r11 | ||
50 | #define d5 r12 | ||
51 | #define d6 r13 | ||
52 | |||
53 | |||
54 | |||
55 | S2N_BN_SYMBOL(bignum_sqr_4_8): | ||
56 | _CET_ENDBR | ||
57 | |||
58 | #if WINDOWS_ABI | ||
59 | push rdi | ||
60 | push rsi | ||
61 | mov rdi, rcx | ||
62 | mov rsi, rdx | ||
63 | #endif | ||
64 | |||
65 | // Save the callee-saved registers used below (rbp as zero, r12/r13 as d5/d6) | ||
66 | |||
67 | push rbp | ||
68 | push r12 | ||
69 | push r13 | ||
70 | |||
71 | // Set up an initial window [d6;...d1] = [23;03;01], where ij denotes x[i] * x[j] | ||
72 | |||
73 | mov rdx, [x] | ||
74 | mulx d2, d1, [x+8] | ||
75 | mulx d4, d3, [x+24] | ||
76 | mov rdx, [x+16] | ||
77 | mulx d6, d5, [x+24] | ||
78 | |||
79 | // Clear our zero register, and also initialize the flags for the carry chain | ||
80 | |||
81 | xor zeroe, zeroe | ||
82 | |||
83 | // Chain in the addition of 02 + 12 + 13 to that window (no carry-out possible) | ||
84 | // This gives all the "heterogeneous" terms of the squaring ready to double | ||
85 | |||
86 | mulx rcx, rax, [x] | ||
87 | adcx d2, rax | ||
88 | adox d3, rcx | ||
89 | mulx rcx, rax, [x+8] | ||
90 | adcx d3, rax | ||
91 | adox d4, rcx | ||
92 | mov rdx, [x+24] | ||
93 | mulx rcx, rax, [x+8] | ||
94 | adcx d4, rax | ||
95 | adox d5, rcx | ||
96 | adcx d5, zero | ||
97 | adox d6, zero | ||
98 | adcx d6, zero | ||
99 | |||
100 | // In principle this is redundant as CF and OF carries are absorbed at this point | ||
101 | // However it seems helpful for the OOO engine to be told it's a fresh start | ||
102 | |||
103 | xor zeroe, zeroe | ||
104 | |||
105 | // Double and add to the 00 + 11 + 22 + 33 terms | ||
106 | // | ||
107 | // We could use shift-double but this seems tidier and in larger squarings | ||
108 | // it was actually more efficient. I haven't experimented with this small | ||
109 | // case to see how much that matters. Note: the writeback here is sprinkled | ||
110 | // into the sequence in such a way that things still work if z = x, i.e. if | ||
111 | // the output overwrites the input buffer and beyond. | ||
112 | |||
113 | mov rdx, [x] | ||
114 | mulx rdx, rax, rdx | ||
115 | mov [z], rax | ||
116 | adcx d1, d1 | ||
117 | adox d1, rdx | ||
118 | mov rdx, [x+8] | ||
119 | mov [z+8], d1 | ||
120 | mulx rdx, rax, rdx | ||
121 | adcx d2, d2 | ||
122 | adox d2, rax | ||
123 | adcx d3, d3 | ||
124 | adox d3, rdx | ||
125 | mov rdx, [x+16] | ||
126 | mov [z+16], d2 | ||
127 | mulx rdx, rax, rdx | ||
128 | adcx d4, d4 | ||
129 | adox d4, rax | ||
130 | adcx d5, d5 | ||
131 | adox d5, rdx | ||
132 | mov rdx, [x+24] | ||
133 | mov [z+24], d3 | ||
134 | mulx rdx, rax, rdx | ||
135 | mov [z+32], d4 | ||
136 | adcx d6, d6 | ||
137 | mov [z+40], d5 | ||
138 | adox d6, rax | ||
139 | mov [z+48], d6 | ||
140 | adcx rdx, zero | ||
141 | adox rdx, zero | ||
142 | mov [z+56], rdx | ||
143 | |||
144 | // Restore saved registers and return | ||
145 | |||
146 | pop r13 | ||
147 | pop r12 | ||
148 | pop rbp | ||
149 | |||
150 | #if WINDOWS_ABI | ||
151 | pop rsi | ||
152 | pop rdi | ||
153 | #endif | ||
154 | ret | ||
155 | |||
156 | #if defined(__linux__) && defined(__ELF__) | ||
157 | .section .note.GNU-stack,"",%progbits | ||
158 | #endif | ||
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8_alt.S index 7c534ae907..7eafac3284 100644 --- a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8_alt.S +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8_alt.S | |||
@@ -1,3 +1,5 @@ | |||
1 | // $OpenBSD: bignum_sqr_4_8_alt.S,v 1.7 2025/08/11 14:13:56 jsing Exp $ | ||
2 | // | ||
1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | 3 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. |
2 | // | 4 | // |
3 | // Permission to use, copy, modify, and/or distribute this software for any | 5 | // Permission to use, copy, modify, and/or distribute this software for any |
@@ -16,8 +18,8 @@ | |||
16 | // Square, z := x^2 | 18 | // Square, z := x^2 |
17 | // Input x[4]; output z[8] | 19 | // Input x[4]; output z[8] |
18 | // | 20 | // |
19 | // extern void bignum_sqr_4_8_alt | 21 | // extern void bignum_sqr_4_8_alt(uint64_t z[static 8], |
20 | // (uint64_t z[static 8], uint64_t x[static 4]); | 22 | // const uint64_t x[static 4]); |
21 | // | 23 | // |
22 | // Standard x86-64 ABI: RDI = z, RSI = x | 24 | // Standard x86-64 ABI: RDI = z, RSI = x |
23 | // Microsoft x64 ABI: RCX = z, RDX = x | 25 | // Microsoft x64 ABI: RCX = z, RDX = x |
@@ -71,7 +73,7 @@ | |||
71 | adc c, 0 | 73 | adc c, 0 |
72 | 74 | ||
73 | S2N_BN_SYMBOL(bignum_sqr_4_8_alt): | 75 | S2N_BN_SYMBOL(bignum_sqr_4_8_alt): |
74 | _CET_ENDBR | 76 | _CET_ENDBR |
75 | 77 | ||
76 | #if WINDOWS_ABI | 78 | #if WINDOWS_ABI |
77 | push rdi | 79 | push rdi |
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12.S new file mode 100644 index 0000000000..3f055e8b75 --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12.S | |||
@@ -0,0 +1,227 @@ | |||
1 | // $OpenBSD: bignum_sqr_6_12.S,v 1.4 2025/08/12 10:23:40 jsing Exp $ | ||
2 | // | ||
3 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
4 | // | ||
5 | // Permission to use, copy, modify, and/or distribute this software for any | ||
6 | // purpose with or without fee is hereby granted, provided that the above | ||
7 | // copyright notice and this permission notice appear in all copies. | ||
8 | // | ||
9 | // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES | ||
10 | // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF | ||
11 | // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR | ||
12 | // ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES | ||
13 | // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN | ||
14 | // ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF | ||
15 | // OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | ||
16 | |||
17 | // ---------------------------------------------------------------------------- | ||
18 | // Square, z := x^2 | ||
19 | // Input x[6]; output z[12] | ||
20 | // | ||
21 | // extern void bignum_sqr_6_12(uint64_t z[static 12], const uint64_t x[static 6]); | ||
22 | // | ||
23 | // Standard x86-64 ABI: RDI = z, RSI = x | ||
24 | // Microsoft x64 ABI: RCX = z, RDX = x | ||
25 | // ---------------------------------------------------------------------------- | ||
26 | |||
27 | #include "s2n_bignum_internal.h" | ||
28 | |||
29 | .intel_syntax noprefix | ||
30 | S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_6_12) | ||
31 | S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_6_12) | ||
32 | .text | ||
33 | |||
34 | // Input arguments: z and x are used directly in their standard-ABI registers | ||
35 | |||
36 | #define z rdi | ||
37 | #define x rsi | ||
38 | |||
39 | // A zero register (zeroe is its 32-bit form, used with xor to clear it and the flags) | ||
40 | |||
41 | #define zero rbp | ||
42 | #define zeroe ebp | ||
43 | |||
44 | // Other registers: the words d1..d9 of the accumulation window | ||
45 | |||
46 | #define d1 r8 | ||
47 | #define d2 r9 | ||
48 | #define d3 r10 | ||
49 | #define d4 r11 | ||
50 | #define d5 r12 | ||
51 | #define d6 r13 | ||
52 | #define d7 r14 | ||
53 | #define d8 r15 | ||
54 | #define d9 rbx | ||
55 | |||
56 | // Care is needed: d10 re-uses the zero register, so rbp is no longer zero once d10 is written | ||
57 | |||
58 | #define d10 rbp | ||
59 | |||
60 | |||
61 | S2N_BN_SYMBOL(bignum_sqr_6_12): | ||
62 | _CET_ENDBR | ||
63 | |||
64 | #if WINDOWS_ABI | ||
65 | push rdi | ||
66 | push rsi | ||
67 | mov rdi, rcx | ||
68 | mov rsi, rdx | ||
69 | #endif | ||
70 | |||
71 | // Save rbp, rbx and r12-r15, which are used as working registers below and restored before returning | ||
72 | |||
73 | push rbp | ||
74 | push rbx | ||
75 | push r12 | ||
76 | push r13 | ||
77 | push r14 | ||
78 | push r15 | ||
79 | |||
80 | // Set up an initial window [d8;...d1] = [34;05;03;01] | ||
81 | |||
82 | mov rdx, [x] | ||
83 | mulx d2, d1, [x+8] | ||
84 | mulx d4, d3, [x+24] | ||
85 | mulx d6, d5, [x+40] | ||
86 | mov rdx, [x+24] | ||
87 | mulx d8, d7, [x+32] | ||
88 | |||
89 | // Clear our zero register, and also initialize the flags for the carry chain | ||
90 | |||
91 | xor zeroe, zeroe | ||
92 | |||
93 | // Chain in the addition of 02 + 12 + 13 + 14 + 15 to that window | ||
94 | // (no carry-out possible since we add it to the top of a product) | ||
95 | |||
96 | mov rdx, [x+16] | ||
97 | mulx rcx, rax, [x] | ||
98 | adcx d2, rax | ||
99 | adox d3, rcx | ||
100 | mulx rcx, rax, [x+8] | ||
101 | adcx d3, rax | ||
102 | adox d4, rcx | ||
103 | mov rdx, [x+8] | ||
104 | mulx rcx, rax, [x+24] | ||
105 | adcx d4, rax | ||
106 | adox d5, rcx | ||
107 | mulx rcx, rax, [x+32] | ||
108 | adcx d5, rax | ||
109 | adox d6, rcx | ||
110 | mulx rcx, rax, [x+40] | ||
111 | adcx d6, rax | ||
112 | adox d7, rcx | ||
113 | adcx d7, zero | ||
114 | adox d8, zero | ||
115 | adcx d8, zero | ||
116 | |||
117 | // Again zero out the flags. Actually they are already cleared but it may | ||
118 | // help decouple these in the OOO engine not to wait for the chain above | ||
119 | |||
120 | xor zeroe, zeroe | ||
121 | |||
122 | // Now chain in the 04 + 23 + 24 + 25 + 35 + 45 terms | ||
123 | // We are running out of registers: d10 aliases the zero register (rbp), so once d10 is written below a zeroed rax supplies the final carry-only additions | ||
124 | |||
125 | mov rdx, [x+32] | ||
126 | mulx rcx, rax, [x] | ||
127 | adcx d4, rax | ||
128 | adox d5, rcx | ||
129 | mov rdx, [x+16] | ||
130 | mulx rcx, rax, [x+24] | ||
131 | adcx d5, rax | ||
132 | adox d6, rcx | ||
133 | mulx rcx, rax, [x+32] | ||
134 | adcx d6, rax | ||
135 | adox d7, rcx | ||
136 | mulx rcx, rax, [x+40] | ||
137 | adcx d7, rax | ||
138 | adox d8, rcx | ||
139 | mov rdx, [x+24] | ||
140 | mulx d9, rax, [x+40] | ||
141 | adcx d8, rax | ||
142 | adox d9, zero | ||
143 | mov rdx, [x+32] | ||
144 | mulx d10, rax, [x+40] | ||
145 | adcx d9, rax | ||
146 | mov eax, 0 | ||
147 | adox d10, rax | ||
148 | adcx d10, rax | ||
149 | |||
150 | // Again, just for a clear fresh start for the flags (xor clears both the carry and overflow flags) | ||
151 | |||
152 | xor eax, eax | ||
153 | |||
154 | // Double and add to the 00 + 11 + 22 + 33 + 44 + 55 terms | ||
155 | // | ||
156 | // We could use shift-double but this seems tidier and in larger squarings | ||
157 | // it was actually more efficient. I haven't experimented with this small | ||
158 | // case to see how much that matters. Note: the writeback here is sprinkled | ||
159 | // into the sequence in such a way that things still work if z = x, i.e. if | ||
160 | // the output overwrites the input buffer and beyond. | ||
161 | |||
162 | mov rdx, [x] | ||
163 | mulx rdx, rax, rdx | ||
164 | mov [z], rax | ||
165 | adcx d1, d1 | ||
166 | adox d1, rdx | ||
167 | mov rdx, [x+8] | ||
168 | mov [z+8], d1 | ||
169 | mulx rdx, rax, rdx | ||
170 | adcx d2, d2 | ||
171 | adox d2, rax | ||
172 | adcx d3, d3 | ||
173 | adox d3, rdx | ||
174 | mov rdx, [x+16] | ||
175 | mov [z+16], d2 | ||
176 | mulx rdx, rax, rdx | ||
177 | adcx d4, d4 | ||
178 | adox d4, rax | ||
179 | adcx d5, d5 | ||
180 | adox d5, rdx | ||
181 | mov rdx, [x+24] | ||
182 | mov [z+24], d3 | ||
183 | mulx rdx, rax, rdx | ||
184 | adcx d6, d6 | ||
185 | adox d6, rax | ||
186 | adcx d7, d7 | ||
187 | adox d7, rdx | ||
188 | mov rdx, [x+32] | ||
189 | mov [z+32], d4 | ||
190 | mulx rdx, rax, rdx | ||
191 | adcx d8, d8 | ||
192 | adox d8, rax | ||
193 | adcx d9, d9 | ||
194 | adox d9, rdx | ||
195 | mov rdx, [x+40] | ||
196 | mov [z+40], d5 | ||
197 | mulx rdx, rax, rdx | ||
198 | mov [z+48], d6 | ||
199 | adcx d10, d10 | ||
200 | mov [z+56], d7 | ||
201 | adox d10, rax | ||
202 | mov [z+64], d8 | ||
203 | mov eax, 0 | ||
204 | mov [z+72], d9 | ||
205 | adcx rdx, rax | ||
206 | mov [z+80], d10 | ||
207 | adox rdx, rax | ||
208 | mov [z+88], rdx | ||
209 | |||
210 | // Restore saved registers (in reverse order of the pushes) and return | ||
211 | |||
212 | pop r15 | ||
213 | pop r14 | ||
214 | pop r13 | ||
215 | pop r12 | ||
216 | pop rbx | ||
217 | pop rbp | ||
218 | |||
219 | #if WINDOWS_ABI | ||
220 | pop rsi | ||
221 | pop rdi | ||
222 | #endif | ||
223 | ret | ||
224 | |||
225 | #if defined(__linux__) && defined(__ELF__) | ||
226 | .section .note.GNU-stack,"",%progbits | ||
227 | #endif | ||
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12_alt.S new file mode 100644 index 0000000000..eb43b0a15b --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12_alt.S | |||
@@ -0,0 +1,210 @@ | |||
1 | // $OpenBSD: bignum_sqr_6_12_alt.S,v 1.4 2025/08/12 10:23:40 jsing Exp $ | ||
2 | // | ||
3 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
4 | // | ||
5 | // Permission to use, copy, modify, and/or distribute this software for any | ||
6 | // purpose with or without fee is hereby granted, provided that the above | ||
7 | // copyright notice and this permission notice appear in all copies. | ||
8 | // | ||
9 | // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES | ||
10 | // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF | ||
11 | // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR | ||
12 | // ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES | ||
13 | // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN | ||
14 | // ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF | ||
15 | // OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | ||
16 | |||
17 | // ---------------------------------------------------------------------------- | ||
18 | // Square, z := x^2 | ||
19 | // Input x[6]; output z[12] | ||
20 | // | ||
21 | // extern void bignum_sqr_6_12_alt(uint64_t z[static 12], | ||
22 | // const uint64_t x[static 6]); | ||
23 | // | ||
24 | // Standard x86-64 ABI: RDI = z, RSI = x | ||
25 | // Microsoft x64 ABI: RCX = z, RDX = x | ||
26 | // ---------------------------------------------------------------------------- | ||
27 | |||
28 | #include "s2n_bignum_internal.h" | ||
29 | |||
30 | .intel_syntax noprefix | ||
31 | S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_6_12_alt) | ||
32 | S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_6_12_alt) | ||
33 | .text | ||
34 | |||
35 | // Input arguments (standard-ABI registers; the WINDOWS_ABI entry code moves rcx, rdx into them) | ||
36 | |||
37 | #define z rdi | ||
38 | #define x rsi | ||
39 | |||
40 | // Other variables used as a rotating 3-word window to add terms to | ||
41 | |||
42 | #define t0 r8 | ||
43 | #define t1 r9 | ||
44 | #define t2 r10 | ||
45 | |||
46 | // Additional temporaries: a local (u1,u0) window that collects cross terms so they can be doubled together | ||
47 | |||
48 | #define u0 rcx | ||
49 | #define u1 r11 | ||
50 | |||
51 | // Macro for the key "multiply and add" step: (c,h,l) += numa * numb, using rax and rdx as scratch | ||
52 | |||
53 | #define combadd(c,h,l,numa,numb) \ | ||
54 | mov rax, numa; \ | ||
55 | mul QWORD PTR numb; \ | ||
56 | add l, rax; \ | ||
57 | adc h, rdx; \ | ||
58 | adc c, 0 | ||
59 | |||
60 | // Set up initial window (c,h,l) = numa * numb, with the top word c cleared (rax and rdx are scratch) | ||
61 | |||
62 | #define combaddz(c,h,l,numa,numb) \ | ||
63 | mov rax, numa; \ | ||
64 | mul QWORD PTR numb; \ | ||
65 | xor c, c; \ | ||
66 | mov l, rax; \ | ||
67 | mov h, rdx | ||
68 | |||
69 | // Doubling step (c,h,l) = 2 * (c,hh,ll) + (0,h,l); hh and ll are doubled in place as a side effect | ||
70 | |||
71 | #define doubladd(c,h,l,hh,ll) \ | ||
72 | add ll, ll; \ | ||
73 | adc hh, hh; \ | ||
74 | adc c, c; \ | ||
75 | add l, ll; \ | ||
76 | adc h, hh; \ | ||
77 | adc c, 0 | ||
78 | |||
79 | // Square term incorporation (c,h,l) += numa^2 | ||
80 | |||
81 | #define combadd1(c,h,l,numa) \ | ||
82 | mov rax, numa; \ | ||
83 | mul rax; \ | ||
84 | add l, rax; \ | ||
85 | adc h, rdx; \ | ||
86 | adc c, 0 | ||
87 | |||
88 | // A short form of the square step where we don't expect a top carry: (h,l) += numa^2 | ||
89 | |||
90 | #define combads(h,l,numa) \ | ||
91 | mov rax, numa; \ | ||
92 | mul rax; \ | ||
93 | add l, rax; \ | ||
94 | adc h, rdx | ||
95 | |||
96 | // A version doubling directly before adding, for single non-square terms: (c,h,l) += 2 * numa * numb | ||
97 | |||
98 | #define combadd2(c,h,l,numa,numb) \ | ||
99 | mov rax, numa; \ | ||
100 | mul QWORD PTR numb; \ | ||
101 | add rax, rax; \ | ||
102 | adc rdx, rdx; \ | ||
103 | adc c, 0; \ | ||
104 | add l, rax; \ | ||
105 | adc h, rdx; \ | ||
106 | adc c, 0 | ||
107 | |||
108 | S2N_BN_SYMBOL(bignum_sqr_6_12_alt): | ||
109 | _CET_ENDBR | ||
110 | |||
111 | #if WINDOWS_ABI | ||
112 | push rdi | ||
113 | push rsi | ||
114 | mov rdi, rcx | ||
115 | mov rsi, rdx | ||
116 | #endif | ||
117 | |||
118 | // Result term 0: low word of x[0]^2; the high word starts the window | ||
119 | |||
120 | mov rax, [x] | ||
121 | mul rax | ||
122 | |||
123 | mov [z], rax | ||
124 | mov t0, rdx | ||
125 | xor t1, t1 | ||
126 | |||
127 | // Result term 1: 2 * x[0] * x[1] | ||
128 | |||
129 | xor t2, t2 | ||
130 | combadd2(t2,t1,t0,[x],[x+8]) | ||
131 | mov [z+8], t0 | ||
132 | |||
133 | // Result term 2: x[1]^2 + 2 * x[0] * x[2] | ||
134 | |||
135 | xor t0, t0 | ||
136 | combadd1(t0,t2,t1,[x+8]) | ||
137 | combadd2(t0,t2,t1,[x],[x+16]) | ||
138 | mov [z+16], t1 | ||
139 | |||
140 | // Result term 3: 2 * (x[0] * x[3] + x[1] * x[2]) | ||
141 | |||
142 | combaddz(t1,u1,u0,[x],[x+24]) | ||
143 | combadd(t1,u1,u0,[x+8],[x+16]) | ||
144 | doubladd(t1,t0,t2,u1,u0) | ||
145 | mov [z+24], t2 | ||
146 | |||
147 | // Result term 4: 2 * (x[0] * x[4] + x[1] * x[3]) + x[2]^2 | ||
148 | |||
149 | combaddz(t2,u1,u0,[x],[x+32]) | ||
150 | combadd(t2,u1,u0,[x+8],[x+24]) | ||
151 | doubladd(t2,t1,t0,u1,u0) | ||
152 | combadd1(t2,t1,t0,[x+16]) | ||
153 | mov [z+32], t0 | ||
154 | |||
155 | // Result term 5: 2 * (x[0] * x[5] + x[1] * x[4] + x[2] * x[3]) | ||
156 | |||
157 | combaddz(t0,u1,u0,[x],[x+40]) | ||
158 | combadd(t0,u1,u0,[x+8],[x+32]) | ||
159 | combadd(t0,u1,u0,[x+16],[x+24]) | ||
160 | doubladd(t0,t2,t1,u1,u0) | ||
161 | mov [z+40], t1 | ||
162 | |||
163 | // Result term 6: 2 * (x[1] * x[5] + x[2] * x[4]) + x[3]^2 | ||
164 | |||
165 | combaddz(t1,u1,u0,[x+8],[x+40]) | ||
166 | combadd(t1,u1,u0,[x+16],[x+32]) | ||
167 | doubladd(t1,t0,t2,u1,u0) | ||
168 | combadd1(t1,t0,t2,[x+24]) | ||
169 | mov [z+48], t2 | ||
170 | |||
171 | // Result term 7: 2 * (x[2] * x[5] + x[3] * x[4]) | ||
172 | |||
173 | combaddz(t2,u1,u0,[x+16],[x+40]) | ||
174 | combadd(t2,u1,u0,[x+24],[x+32]) | ||
175 | doubladd(t2,t1,t0,u1,u0) | ||
176 | mov [z+56], t0 | ||
177 | |||
178 | // Result term 8: 2 * x[3] * x[5] + x[4]^2 | ||
179 | |||
180 | xor t0, t0 | ||
181 | combadd2(t0,t2,t1,[x+24],[x+40]) | ||
182 | combadd1(t0,t2,t1,[x+32]) | ||
183 | mov [z+64], t1 | ||
184 | |||
185 | // Result term 9: 2 * x[4] * x[5] | ||
186 | |||
187 | xor t1, t1 | ||
188 | combadd2(t1,t0,t2,[x+32],[x+40]) | ||
189 | mov [z+72], t2 | ||
190 | |||
191 | // Result term 10: x[5]^2, added with the short form (no top carry expected) | ||
192 | |||
193 | combads(t1,t0,[x+40]) | ||
194 | mov [z+80], t0 | ||
195 | |||
196 | // Result term 11: the remaining top word of the window | ||
197 | |||
198 | mov [z+88], t1 | ||
199 | |||
200 | // Return | ||
201 | |||
202 | #if WINDOWS_ABI | ||
203 | pop rsi | ||
204 | pop rdi | ||
205 | #endif | ||
206 | ret | ||
207 | |||
208 | #if defined(__linux__) && defined(__ELF__) | ||
209 | .section .note.GNU-stack,"",%progbits | ||
210 | #endif | ||
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16.S new file mode 100644 index 0000000000..41277b5b6a --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16.S | |||
@@ -0,0 +1,311 @@ | |||
1 | // $OpenBSD: bignum_sqr_8_16.S,v 1.4 2025/08/12 10:23:40 jsing Exp $ | ||
2 | // | ||
3 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
4 | // | ||
5 | // Permission to use, copy, modify, and/or distribute this software for any | ||
6 | // purpose with or without fee is hereby granted, provided that the above | ||
7 | // copyright notice and this permission notice appear in all copies. | ||
8 | // | ||
9 | // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES | ||
10 | // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF | ||
11 | // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR | ||
12 | // ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES | ||
13 | // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN | ||
14 | // ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF | ||
15 | // OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | ||
16 | |||
17 | // ---------------------------------------------------------------------------- | ||
18 | // Square, z := x^2 | ||
19 | // Input x[8]; output z[16] | ||
20 | // | ||
21 | // extern void bignum_sqr_8_16(uint64_t z[static 16], const uint64_t x[static 8]); | ||
22 | // | ||
23 | // Standard x86-64 ABI: RDI = z, RSI = x | ||
24 | // Microsoft x64 ABI: RCX = z, RDX = x | ||
25 | // ---------------------------------------------------------------------------- | ||
26 | |||
27 | #include "s2n_bignum_internal.h" | ||
28 | |||
29 | .intel_syntax noprefix | ||
30 | S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_8_16) | ||
31 | S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_8_16) | ||
32 | .text | ||
33 | |||
34 | // Input arguments: z and x are used directly in their standard-ABI registers | ||
35 | |||
36 | #define z rdi | ||
37 | #define x rsi | ||
38 | |||
39 | // A zero register (zeroe is its 32-bit form, cleared with xor, which also resets the flags) | ||
40 | |||
41 | #define zero rbp | ||
42 | #define zeroe ebp | ||
43 | |||
44 | // mulpadd i, j adds rdx * x[i] into the window at the i+j point: the low word is added with adcx to register (i+j) mod 8 of r8..r15, the high word with adox to the next register (wrapping r15 -> r8); rax and rcx are scratch | ||
45 | |||
46 | .macro mulpadd arg1,arg2 | ||
47 | mulx rcx, rax, [x+8*\arg1] | ||
48 | .if ((\arg1 + \arg2) % 8 == 0) | ||
49 | adcx r8, rax | ||
50 | adox r9, rcx | ||
51 | .elseif ((\arg1 + \arg2) % 8 == 1) | ||
52 | adcx r9, rax | ||
53 | adox r10, rcx | ||
54 | .elseif ((\arg1 + \arg2) % 8 == 2) | ||
55 | adcx r10, rax | ||
56 | adox r11, rcx | ||
57 | .elseif ((\arg1 + \arg2) % 8 == 3) | ||
58 | adcx r11, rax | ||
59 | adox r12, rcx | ||
60 | .elseif ((\arg1 + \arg2) % 8 == 4) | ||
61 | adcx r12, rax | ||
62 | adox r13, rcx | ||
63 | .elseif ((\arg1 + \arg2) % 8 == 5) | ||
64 | adcx r13, rax | ||
65 | adox r14, rcx | ||
66 | .elseif ((\arg1 + \arg2) % 8 == 6) | ||
67 | adcx r14, rax | ||
68 | adox r15, rcx | ||
69 | .elseif ((\arg1 + \arg2) % 8 == 7) | ||
70 | adcx r15, rax | ||
71 | adox r8, rcx | ||
72 | .endif | ||
73 | |||
74 | .endm | ||
75 | |||
76 | // mulpade i, j adds rdx * x[i] into the window at i+j | ||
77 | // but re-creates the top word assuming nothing to add there: mulx writes the high word straight into the next window register and adox then adds only the pending carry (via the zero register) | ||
78 | |||
79 | .macro mulpade arg1,arg2 | ||
80 | .if ((\arg1 + \arg2) % 8 == 0) | ||
81 | mulx r9, rax, [x+8*\arg1] | ||
82 | adcx r8, rax | ||
83 | adox r9, zero | ||
84 | .elseif ((\arg1 + \arg2) % 8 == 1) | ||
85 | mulx r10, rax, [x+8*\arg1] | ||
86 | adcx r9, rax | ||
87 | adox r10, zero | ||
88 | .elseif ((\arg1 + \arg2) % 8 == 2) | ||
89 | mulx r11, rax, [x+8*\arg1] | ||
90 | adcx r10, rax | ||
91 | adox r11, zero | ||
92 | .elseif ((\arg1 + \arg2) % 8 == 3) | ||
93 | mulx r12, rax, [x+8*\arg1] | ||
94 | adcx r11, rax | ||
95 | adox r12, zero | ||
96 | .elseif ((\arg1 + \arg2) % 8 == 4) | ||
97 | mulx r13, rax, [x+8*\arg1] | ||
98 | adcx r12, rax | ||
99 | adox r13, zero | ||
100 | .elseif ((\arg1 + \arg2) % 8 == 5) | ||
101 | mulx r14, rax, [x+8*\arg1] | ||
102 | adcx r13, rax | ||
103 | adox r14, zero | ||
104 | .elseif ((\arg1 + \arg2) % 8 == 6) | ||
105 | mulx r15, rax, [x+8*\arg1] | ||
106 | adcx r14, rax | ||
107 | adox r15, zero | ||
108 | .elseif ((\arg1 + \arg2) % 8 == 7) | ||
109 | mulx r8, rax, [x+8*\arg1] | ||
110 | adcx r15, rax | ||
111 | adox r8, zero | ||
112 | .endif | ||
113 | |||
114 | .endm | ||
115 | |||
116 | .macro diagonals | ||
117 | |||
118 | xor zeroe, zeroe | ||
119 | |||
120 | // Set initial window = 10 + 20 + 30 + 40 + 50 + 60 + 70; its two lowest words are written back to z[1], z[2] and the remaining words stay in r11..r15, r8 | ||
121 | |||
122 | mov rdx, [x] | ||
123 | mulx rax, r9, [x+8] | ||
124 | mov [z+8], r9 | ||
125 | mulx rcx, r10, [x+16] | ||
126 | adcx r10, rax | ||
127 | mov [z+16], r10 | ||
128 | mulx rax, r11, [x+24] | ||
129 | adcx r11, rcx | ||
130 | mulx rcx, r12, [x+32] | ||
131 | adcx r12, rax | ||
132 | mulx rax, r13, [x+40] | ||
133 | adcx r13, rcx | ||
134 | mulx rcx, r14, [x+48] | ||
135 | adcx r14, rax | ||
136 | mulx r8, r15, [x+56] | ||
137 | adcx r15, rcx | ||
138 | adcx r8, zero | ||
139 | |||
140 | // Add in the next diagonal = 21 + 31 + 41 + 51 + 61 + 71 + 54 | ||
141 | |||
142 | xor zeroe, zeroe | ||
143 | mov rdx, [x+8] | ||
144 | mulpadd 2, 1 | ||
145 | mov [z+24], r11 | ||
146 | mulpadd 3, 1 | ||
147 | mov [z+32], r12 | ||
148 | mulpadd 4, 1 | ||
149 | mulpadd 5, 1 | ||
150 | mulpadd 6, 1 | ||
151 | mulpade 7, 1 | ||
152 | mov rdx, [x+32] | ||
153 | mulpade 5, 4 | ||
154 | adcx r10, zero | ||
155 | |||
156 | // And the next one = 32 + 42 + 52 + 62 + 72 + 64 + 65 | ||
157 | |||
158 | xor zeroe, zeroe | ||
159 | mov rdx, [x+16] | ||
160 | mulpadd 3, 2 | ||
161 | mov [z+40], r13 | ||
162 | mulpadd 4, 2 | ||
163 | mov [z+48], r14 | ||
164 | mulpadd 5, 2 | ||
165 | mulpadd 6, 2 | ||
166 | mulpadd 7, 2 | ||
167 | mov rdx, [x+48] | ||
168 | mulpade 4, 6 | ||
169 | mulpade 5, 6 | ||
170 | adcx r12, zero | ||
171 | |||
172 | // And the final one = 43 + 53 + 63 + 73 + 74 + 75 + 76 | ||
173 | |||
174 | xor zeroe, zeroe | ||
175 | mov rdx, [x+24] | ||
176 | mulpadd 4, 3 | ||
177 | mov [z+56], r15 | ||
178 | mulpadd 5, 3 | ||
179 | mov [z+64], r8 | ||
180 | mulpadd 6, 3 | ||
181 | mulpadd 7, 3 | ||
182 | mov rdx, [x+56] | ||
183 | mulpadd 4, 7 | ||
184 | mulpade 5, 7 | ||
185 | mulpade 6, 7 | ||
186 | adcx r14, zero | ||
187 | |||
188 | // Double and add things; use z[1]..z[8] and thereafter the registers | ||
189 | // r9..r14 which haven't been written back yet (r15 is reused for the top word of the last square) | ||
190 | |||
191 | xor zeroe, zeroe | ||
192 | mov rdx, [x] | ||
193 | mulx rcx, rax, rdx | ||
194 | mov [z], rax | ||
195 | mov rax, [z+8] | ||
196 | adcx rax, rax | ||
197 | adox rax, rcx | ||
198 | mov [z+8], rax | ||
199 | |||
200 | mov rax, [z+16] | ||
201 | mov rdx, [x+8] | ||
202 | mulx rcx, rdx, rdx | ||
203 | adcx rax, rax | ||
204 | adox rax, rdx | ||
205 | mov [z+16], rax | ||
206 | mov rax, [z+24] | ||
207 | adcx rax, rax | ||
208 | adox rax, rcx | ||
209 | mov [z+24], rax | ||
210 | |||
211 | mov rax, [z+32] | ||
212 | mov rdx, [x+16] | ||
213 | mulx rcx, rdx, rdx | ||
214 | adcx rax, rax | ||
215 | adox rax, rdx | ||
216 | mov [z+32], rax | ||
217 | mov rax, [z+40] | ||
218 | adcx rax, rax | ||
219 | adox rax, rcx | ||
220 | mov [z+40], rax | ||
221 | |||
222 | mov rax, [z+48] | ||
223 | mov rdx, [x+24] | ||
224 | mulx rcx, rdx, rdx | ||
225 | adcx rax, rax | ||
226 | adox rax, rdx | ||
227 | mov [z+48], rax | ||
228 | mov rax, [z+56] | ||
229 | adcx rax, rax | ||
230 | adox rax, rcx | ||
231 | mov [z+56], rax | ||
232 | |||
233 | mov rax, [z+64] | ||
234 | mov rdx, [x+32] | ||
235 | mulx rcx, rdx, rdx | ||
236 | adcx rax, rax | ||
237 | adox rax, rdx | ||
238 | mov [z+64], rax | ||
239 | adcx r9, r9 | ||
240 | adox r9, rcx | ||
241 | mov [z+72], r9 | ||
242 | |||
243 | mov rdx, [x+40] | ||
244 | mulx rcx, rdx, rdx | ||
245 | adcx r10, r10 | ||
246 | adox r10, rdx | ||
247 | mov [z+80], r10 | ||
248 | adcx r11, r11 | ||
249 | adox r11, rcx | ||
250 | mov [z+88], r11 | ||
251 | |||
252 | mov rdx, [x+48] | ||
253 | mulx rcx, rdx, rdx | ||
254 | adcx r12, r12 | ||
255 | adox r12, rdx | ||
256 | mov [z+96], r12 | ||
257 | adcx r13, r13 | ||
258 | adox r13, rcx | ||
259 | mov [z+104], r13 | ||
260 | |||
261 | mov rdx, [x+56] | ||
262 | mulx r15, rdx, rdx | ||
263 | adcx r14, r14 | ||
264 | adox r14, rdx | ||
265 | mov [z+112], r14 | ||
266 | adcx r15, zero | ||
267 | adox r15, zero | ||
268 | mov [z+120], r15 | ||
269 | |||
270 | .endm | ||
271 | |||
272 | |||
273 | S2N_BN_SYMBOL(bignum_sqr_8_16): | ||
274 | _CET_ENDBR | ||
275 | |||
276 | #if WINDOWS_ABI | ||
277 | push rdi | ||
278 | push rsi | ||
279 | mov rdi, rcx | ||
280 | mov rsi, rdx | ||
281 | #endif | ||
282 | |||
283 | // Save rbp and r12-r15, which the diagonals macro uses as working registers | ||
284 | |||
285 | push rbp | ||
286 | push r12 | ||
287 | push r13 | ||
288 | push r14 | ||
289 | push r15 | ||
290 | |||
291 | // Do the squaring: the diagonals macro reads x[0..7] and writes z[0..15] | ||
292 | |||
293 | diagonals | ||
294 | |||
295 | // Real epilog: restore the saved registers in reverse order of the pushes | ||
296 | |||
297 | pop r15 | ||
298 | pop r14 | ||
299 | pop r13 | ||
300 | pop r12 | ||
301 | pop rbp | ||
302 | |||
303 | #if WINDOWS_ABI | ||
304 | pop rsi | ||
305 | pop rdi | ||
306 | #endif | ||
307 | ret | ||
308 | |||
309 | #if defined(__linux__) && defined(__ELF__) | ||
310 | .section .note.GNU-stack,"",%progbits | ||
311 | #endif | ||
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16_alt.S index ac0b6f96c2..cb10ba2a12 100644 --- a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16_alt.S +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16_alt.S | |||
@@ -1,3 +1,5 @@ | |||
1 | // $OpenBSD: bignum_sqr_8_16_alt.S,v 1.7 2025/08/11 14:13:56 jsing Exp $ | ||
2 | // | ||
1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | 3 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. |
2 | // | 4 | // |
3 | // Permission to use, copy, modify, and/or distribute this software for any | 5 | // Permission to use, copy, modify, and/or distribute this software for any |
@@ -16,7 +18,8 @@ | |||
16 | // Square, z := x^2 | 18 | // Square, z := x^2 |
17 | // Input x[8]; output z[16] | 19 | // Input x[8]; output z[16] |
18 | // | 20 | // |
19 | // extern void bignum_sqr_8_16_alt (uint64_t z[static 16], uint64_t x[static 8]); | 21 | // extern void bignum_sqr_8_16_alt(uint64_t z[static 16], |
22 | // const uint64_t x[static 8]); | ||
20 | // | 23 | // |
21 | // Standard x86-64 ABI: RDI = z, RSI = x | 24 | // Standard x86-64 ABI: RDI = z, RSI = x |
22 | // Microsoft x64 ABI: RCX = z, RDX = x | 25 | // Microsoft x64 ABI: RCX = z, RDX = x |
@@ -103,7 +106,7 @@ | |||
103 | adc c, 0 | 106 | adc c, 0 |
104 | 107 | ||
105 | S2N_BN_SYMBOL(bignum_sqr_8_16_alt): | 108 | S2N_BN_SYMBOL(bignum_sqr_8_16_alt): |
106 | _CET_ENDBR | 109 | _CET_ENDBR |
107 | 110 | ||
108 | #if WINDOWS_ABI | 111 | #if WINDOWS_ABI |
109 | push rdi | 112 | push rdi |
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sub.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sub.S index 3ff8a30510..7324d3a71e 100644 --- a/src/lib/libcrypto/bn/arch/amd64/bignum_sub.S +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sub.S | |||
@@ -1,3 +1,5 @@ | |||
1 | // $OpenBSD: bignum_sub.S,v 1.7 2025/08/11 14:13:56 jsing Exp $ | ||
2 | // | ||
1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | 3 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. |
2 | // | 4 | // |
3 | // Permission to use, copy, modify, and/or distribute this software for any | 5 | // Permission to use, copy, modify, and/or distribute this software for any |
@@ -16,9 +18,8 @@ | |||
16 | // Subtract, z := x - y | 18 | // Subtract, z := x - y |
17 | // Inputs x[m], y[n]; outputs function return (carry-out) and z[p] | 19 | // Inputs x[m], y[n]; outputs function return (carry-out) and z[p] |
18 | // | 20 | // |
19 | // extern uint64_t bignum_sub | 21 | // extern uint64_t bignum_sub(uint64_t p, uint64_t *z, uint64_t m, |
20 | // (uint64_t p, uint64_t *z, | 22 | // const uint64_t *x, uint64_t n, const uint64_t *y); |
21 | // uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); | ||
22 | // | 23 | // |
23 | // Does the z := x - y operation, truncating modulo p words in general and | 24 | // Does the z := x - y operation, truncating modulo p words in general and |
24 | // returning a top borrow (0 or 1) in the p'th place, only subtracting input | 25 | // returning a top borrow (0 or 1) in the p'th place, only subtracting input |
@@ -49,7 +50,7 @@ | |||
49 | 50 | ||
50 | 51 | ||
51 | S2N_BN_SYMBOL(bignum_sub): | 52 | S2N_BN_SYMBOL(bignum_sub): |
52 | _CET_ENDBR | 53 | _CET_ENDBR |
53 | 54 | ||
54 | #if WINDOWS_ABI | 55 | #if WINDOWS_ABI |
55 | push rdi | 56 | push rdi |
@@ -75,7 +76,7 @@ S2N_BN_SYMBOL(bignum_sub): | |||
75 | cmp p, n | 76 | cmp p, n |
76 | cmovc n, p | 77 | cmovc n, p |
77 | cmp m, n | 78 | cmp m, n |
78 | jc ylonger | 79 | jc bignum_sub_ylonger |
79 | 80 | ||
80 | // The case where x is longer or of the same size (p >= m >= n) | 81 | // The case where x is longer or of the same size (p >= m >= n) |
81 | 82 | ||
@@ -83,32 +84,32 @@ S2N_BN_SYMBOL(bignum_sub): | |||
83 | sub m, n | 84 | sub m, n |
84 | inc m | 85 | inc m |
85 | test n, n | 86 | test n, n |
86 | jz xtest | 87 | jz bignum_sub_xtest |
87 | xmainloop: | 88 | bignum_sub_xmainloop: |
88 | mov a, [x+8*i] | 89 | mov a, [x+8*i] |
89 | sbb a, [y+8*i] | 90 | sbb a, [y+8*i] |
90 | mov [z+8*i],a | 91 | mov [z+8*i],a |
91 | inc i | 92 | inc i |
92 | dec n | 93 | dec n |
93 | jnz xmainloop | 94 | jnz bignum_sub_xmainloop |
94 | jmp xtest | 95 | jmp bignum_sub_xtest |
95 | xtoploop: | 96 | bignum_sub_xtoploop: |
96 | mov a, [x+8*i] | 97 | mov a, [x+8*i] |
97 | sbb a, 0 | 98 | sbb a, 0 |
98 | mov [z+8*i],a | 99 | mov [z+8*i],a |
99 | inc i | 100 | inc i |
100 | xtest: | 101 | bignum_sub_xtest: |
101 | dec m | 102 | dec m |
102 | jnz xtoploop | 103 | jnz bignum_sub_xtoploop |
103 | sbb a, a | 104 | sbb a, a |
104 | test p, p | 105 | test p, p |
105 | jz tailskip | 106 | jz bignum_sub_tailskip |
106 | tailloop: | 107 | bignum_sub_tailloop: |
107 | mov [z+8*i],a | 108 | mov [z+8*i],a |
108 | inc i | 109 | inc i |
109 | dec p | 110 | dec p |
110 | jnz tailloop | 111 | jnz bignum_sub_tailloop |
111 | tailskip: | 112 | bignum_sub_tailskip: |
112 | neg a | 113 | neg a |
113 | #if WINDOWS_ABI | 114 | #if WINDOWS_ABI |
114 | pop rsi | 115 | pop rsi |
@@ -118,29 +119,29 @@ tailskip: | |||
118 | 119 | ||
119 | // The case where y is longer (p >= n > m) | 120 | // The case where y is longer (p >= n > m) |
120 | 121 | ||
121 | ylonger: | 122 | bignum_sub_ylonger: |
122 | 123 | ||
123 | sub p, n | 124 | sub p, n |
124 | sub n, m | 125 | sub n, m |
125 | test m, m | 126 | test m, m |
126 | jz ytoploop | 127 | jz bignum_sub_ytoploop |
127 | ymainloop: | 128 | bignum_sub_ymainloop: |
128 | mov a, [x+8*i] | 129 | mov a, [x+8*i] |
129 | sbb a, [y+8*i] | 130 | sbb a, [y+8*i] |
130 | mov [z+8*i],a | 131 | mov [z+8*i],a |
131 | inc i | 132 | inc i |
132 | dec m | 133 | dec m |
133 | jnz ymainloop | 134 | jnz bignum_sub_ymainloop |
134 | ytoploop: | 135 | bignum_sub_ytoploop: |
135 | mov ashort, 0 | 136 | mov ashort, 0 |
136 | sbb a, [y+8*i] | 137 | sbb a, [y+8*i] |
137 | mov [z+8*i],a | 138 | mov [z+8*i],a |
138 | inc i | 139 | inc i |
139 | dec n | 140 | dec n |
140 | jnz ytoploop | 141 | jnz bignum_sub_ytoploop |
141 | sbb a, a | 142 | sbb a, a |
142 | test p, p | 143 | test p, p |
143 | jnz tailloop | 144 | jnz bignum_sub_tailloop |
144 | neg a | 145 | neg a |
145 | #if WINDOWS_ABI | 146 | #if WINDOWS_ABI |
146 | pop rsi | 147 | pop rsi |
diff --git a/src/lib/libcrypto/bn/arch/amd64/bn_arch.c b/src/lib/libcrypto/bn/arch/amd64/bn_arch.c index a377a05681..9ff8920ca2 100644 --- a/src/lib/libcrypto/bn/arch/amd64/bn_arch.c +++ b/src/lib/libcrypto/bn/arch/amd64/bn_arch.c | |||
@@ -1,4 +1,4 @@ | |||
1 | /* $OpenBSD: bn_arch.c,v 1.7 2023/06/24 16:01:44 jsing Exp $ */ | 1 | /* $OpenBSD: bn_arch.c,v 1.12 2025/08/14 15:29:17 jsing Exp $ */ |
2 | /* | 2 | /* |
3 | * Copyright (c) 2023 Joel Sing <jsing@openbsd.org> | 3 | * Copyright (c) 2023 Joel Sing <jsing@openbsd.org> |
4 | * | 4 | * |
@@ -19,6 +19,7 @@ | |||
19 | 19 | ||
20 | #include "bn_arch.h" | 20 | #include "bn_arch.h" |
21 | #include "bn_local.h" | 21 | #include "bn_local.h" |
22 | #include "crypto_arch.h" | ||
22 | #include "s2n_bignum.h" | 23 | #include "s2n_bignum.h" |
23 | 24 | ||
24 | #ifdef HAVE_BN_ADD | 25 | #ifdef HAVE_BN_ADD |
@@ -26,8 +27,8 @@ BN_ULONG | |||
26 | bn_add(BN_ULONG *r, int r_len, const BN_ULONG *a, int a_len, const BN_ULONG *b, | 27 | bn_add(BN_ULONG *r, int r_len, const BN_ULONG *a, int a_len, const BN_ULONG *b, |
27 | int b_len) | 28 | int b_len) |
28 | { | 29 | { |
29 | return bignum_add(r_len, (uint64_t *)r, a_len, (uint64_t *)a, | 30 | return bignum_add(r_len, (uint64_t *)r, a_len, (const uint64_t *)a, |
30 | b_len, (uint64_t *)b); | 31 | b_len, (const uint64_t *)b); |
31 | } | 32 | } |
32 | #endif | 33 | #endif |
33 | 34 | ||
@@ -36,8 +37,8 @@ bn_add(BN_ULONG *r, int r_len, const BN_ULONG *a, int a_len, const BN_ULONG *b, | |||
36 | BN_ULONG | 37 | BN_ULONG |
37 | bn_add_words(BN_ULONG *rd, const BN_ULONG *ad, const BN_ULONG *bd, int n) | 38 | bn_add_words(BN_ULONG *rd, const BN_ULONG *ad, const BN_ULONG *bd, int n) |
38 | { | 39 | { |
39 | return bignum_add(n, (uint64_t *)rd, n, (uint64_t *)ad, n, | 40 | return bignum_add(n, (uint64_t *)rd, n, (const uint64_t *)ad, n, |
40 | (uint64_t *)bd); | 41 | (const uint64_t *)bd); |
41 | } | 42 | } |
42 | #endif | 43 | #endif |
43 | 44 | ||
@@ -46,8 +47,8 @@ BN_ULONG | |||
46 | bn_sub(BN_ULONG *r, int r_len, const BN_ULONG *a, int a_len, const BN_ULONG *b, | 47 | bn_sub(BN_ULONG *r, int r_len, const BN_ULONG *a, int a_len, const BN_ULONG *b, |
47 | int b_len) | 48 | int b_len) |
48 | { | 49 | { |
49 | return bignum_sub(r_len, (uint64_t *)r, a_len, (uint64_t *)a, | 50 | return bignum_sub(r_len, (uint64_t *)r, a_len, (const uint64_t *)a, |
50 | b_len, (uint64_t *)b); | 51 | b_len, (const uint64_t *)b); |
51 | } | 52 | } |
52 | #endif | 53 | #endif |
53 | 54 | ||
@@ -55,8 +56,28 @@ bn_sub(BN_ULONG *r, int r_len, const BN_ULONG *a, int a_len, const BN_ULONG *b, | |||
55 | BN_ULONG | 56 | BN_ULONG |
56 | bn_sub_words(BN_ULONG *rd, const BN_ULONG *ad, const BN_ULONG *bd, int n) | 57 | bn_sub_words(BN_ULONG *rd, const BN_ULONG *ad, const BN_ULONG *bd, int n) |
57 | { | 58 | { |
58 | return bignum_sub(n, (uint64_t *)rd, n, (uint64_t *)ad, n, | 59 | return bignum_sub(n, (uint64_t *)rd, n, (const uint64_t *)ad, n, |
59 | (uint64_t *)bd); | 60 | (const uint64_t *)bd); |
61 | } | ||
62 | #endif | ||
63 | |||
64 | #ifdef HAVE_BN_MOD_ADD_WORDS | ||
65 | void | ||
66 | bn_mod_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, | ||
67 | const BN_ULONG *m, size_t n) | ||
68 | { | ||
69 | bignum_modadd(n, (uint64_t *)r, (const uint64_t *)a, | ||
70 | (const uint64_t *)b, (const uint64_t *)m); | ||
71 | } | ||
72 | #endif | ||
73 | |||
74 | #ifdef HAVE_BN_MOD_SUB_WORDS | ||
75 | void | ||
76 | bn_mod_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, | ||
77 | const BN_ULONG *m, size_t n) | ||
78 | { | ||
79 | bignum_modsub(n, (uint64_t *)r, (const uint64_t *)a, | ||
80 | (const uint64_t *)b, (const uint64_t *)m); | ||
60 | } | 81 | } |
61 | #endif | 82 | #endif |
62 | 83 | ||
@@ -64,7 +85,7 @@ bn_sub_words(BN_ULONG *rd, const BN_ULONG *ad, const BN_ULONG *bd, int n) | |||
64 | BN_ULONG | 85 | BN_ULONG |
65 | bn_mul_add_words(BN_ULONG *rd, const BN_ULONG *ad, int num, BN_ULONG w) | 86 | bn_mul_add_words(BN_ULONG *rd, const BN_ULONG *ad, int num, BN_ULONG w) |
66 | { | 87 | { |
67 | return bignum_cmadd(num, (uint64_t *)rd, w, num, (uint64_t *)ad); | 88 | return bignum_cmadd(num, (uint64_t *)rd, w, num, (const uint64_t *)ad); |
68 | } | 89 | } |
69 | #endif | 90 | #endif |
70 | 91 | ||
@@ -72,25 +93,52 @@ bn_mul_add_words(BN_ULONG *rd, const BN_ULONG *ad, int num, BN_ULONG w) | |||
72 | BN_ULONG | 93 | BN_ULONG |
73 | bn_mul_words(BN_ULONG *rd, const BN_ULONG *ad, int num, BN_ULONG w) | 94 | bn_mul_words(BN_ULONG *rd, const BN_ULONG *ad, int num, BN_ULONG w) |
74 | { | 95 | { |
75 | return bignum_cmul(num, (uint64_t *)rd, w, num, (uint64_t *)ad); | 96 | return bignum_cmul(num, (uint64_t *)rd, w, num, (const uint64_t *)ad); |
76 | } | 97 | } |
77 | #endif | 98 | #endif |
78 | 99 | ||
79 | #ifdef HAVE_BN_MUL_COMBA4 | 100 | #ifdef HAVE_BN_MUL_COMBA4 |
80 | void | 101 | void |
81 | bn_mul_comba4(BN_ULONG *rd, BN_ULONG *ad, BN_ULONG *bd) | 102 | bn_mul_comba4(BN_ULONG *rd, const BN_ULONG *ad, const BN_ULONG *bd) |
82 | { | 103 | { |
83 | /* XXX - consider using non-alt on CPUs that have the ADX extension. */ | 104 | if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_ADX) != 0) { |
84 | bignum_mul_4_8_alt((uint64_t *)rd, (uint64_t *)ad, (uint64_t *)bd); | 105 | bignum_mul_4_8((uint64_t *)rd, (const uint64_t *)ad, |
106 | (const uint64_t *)bd); | ||
107 | return; | ||
108 | } | ||
109 | |||
110 | bignum_mul_4_8_alt((uint64_t *)rd, (const uint64_t *)ad, | ||
111 | (const uint64_t *)bd); | ||
112 | } | ||
113 | #endif | ||
114 | |||
115 | #ifdef HAVE_BN_MUL_COMBA6 | ||
116 | void | ||
117 | bn_mul_comba6(BN_ULONG *rd, const BN_ULONG *ad, const BN_ULONG *bd) | ||
118 | { | ||
119 | if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_ADX) != 0) { | ||
120 | bignum_mul_6_12((uint64_t *)rd, (const uint64_t *)ad, | ||
121 | (const uint64_t *)bd); | ||
122 | return; | ||
123 | } | ||
124 | |||
125 | bignum_mul_6_12_alt((uint64_t *)rd, (const uint64_t *)ad, | ||
126 | (const uint64_t *)bd); | ||
85 | } | 127 | } |
86 | #endif | 128 | #endif |
87 | 129 | ||
88 | #ifdef HAVE_BN_MUL_COMBA8 | 130 | #ifdef HAVE_BN_MUL_COMBA8 |
89 | void | 131 | void |
90 | bn_mul_comba8(BN_ULONG *rd, BN_ULONG *ad, BN_ULONG *bd) | 132 | bn_mul_comba8(BN_ULONG *rd, const BN_ULONG *ad, const BN_ULONG *bd) |
91 | { | 133 | { |
92 | /* XXX - consider using non-alt on CPUs that have the ADX extension. */ | 134 | if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_ADX) != 0) { |
93 | bignum_mul_8_16_alt((uint64_t *)rd, (uint64_t *)ad, (uint64_t *)bd); | 135 | bignum_mul_8_16((uint64_t *)rd, (const uint64_t *)ad, |
136 | (const uint64_t *)bd); | ||
137 | return; | ||
138 | } | ||
139 | |||
140 | bignum_mul_8_16_alt((uint64_t *)rd, (const uint64_t *)ad, | ||
141 | (const uint64_t *)bd); | ||
94 | } | 142 | } |
95 | #endif | 143 | #endif |
96 | 144 | ||
@@ -98,7 +146,7 @@ bn_mul_comba8(BN_ULONG *rd, BN_ULONG *ad, BN_ULONG *bd) | |||
98 | int | 146 | int |
99 | bn_sqr(BIGNUM *r, const BIGNUM *a, int r_len, BN_CTX *ctx) | 147 | bn_sqr(BIGNUM *r, const BIGNUM *a, int r_len, BN_CTX *ctx) |
100 | { | 148 | { |
101 | bignum_sqr(r_len, (uint64_t *)r->d, a->top, (uint64_t *)a->d); | 149 | bignum_sqr(r_len, (uint64_t *)r->d, a->top, (const uint64_t *)a->d); |
102 | 150 | ||
103 | return 1; | 151 | return 1; |
104 | } | 152 | } |
@@ -108,8 +156,25 @@ bn_sqr(BIGNUM *r, const BIGNUM *a, int r_len, BN_CTX *ctx) | |||
108 | void | 156 | void |
109 | bn_sqr_comba4(BN_ULONG *rd, const BN_ULONG *ad) | 157 | bn_sqr_comba4(BN_ULONG *rd, const BN_ULONG *ad) |
110 | { | 158 | { |
111 | /* XXX - consider using non-alt on CPUs that have the ADX extension. */ | 159 | if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_ADX) != 0) { |
112 | bignum_sqr_4_8_alt((uint64_t *)rd, (uint64_t *)ad); | 160 | bignum_sqr_4_8((uint64_t *)rd, (const uint64_t *)ad); |
161 | return; | ||
162 | } | ||
163 | |||
164 | bignum_sqr_4_8_alt((uint64_t *)rd, (const uint64_t *)ad); | ||
165 | } | ||
166 | #endif | ||
167 | |||
168 | #ifdef HAVE_BN_SQR_COMBA6 | ||
169 | void | ||
170 | bn_sqr_comba6(BN_ULONG *rd, const BN_ULONG *ad) | ||
171 | { | ||
172 | if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_ADX) != 0) { | ||
173 | bignum_sqr_6_12((uint64_t *)rd, (const uint64_t *)ad); | ||
174 | return; | ||
175 | } | ||
176 | |||
177 | bignum_sqr_6_12_alt((uint64_t *)rd, (const uint64_t *)ad); | ||
113 | } | 178 | } |
114 | #endif | 179 | #endif |
115 | 180 | ||
@@ -117,8 +182,12 @@ bn_sqr_comba4(BN_ULONG *rd, const BN_ULONG *ad) | |||
117 | void | 182 | void |
118 | bn_sqr_comba8(BN_ULONG *rd, const BN_ULONG *ad) | 183 | bn_sqr_comba8(BN_ULONG *rd, const BN_ULONG *ad) |
119 | { | 184 | { |
120 | /* XXX - consider using non-alt on CPUs that have the ADX extension. */ | 185 | if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_ADX) != 0) { |
121 | bignum_sqr_8_16_alt((uint64_t *)rd, (uint64_t *)ad); | 186 | bignum_sqr_8_16((uint64_t *)rd, (const uint64_t *)ad); |
187 | return; | ||
188 | } | ||
189 | |||
190 | bignum_sqr_8_16_alt((uint64_t *)rd, (const uint64_t *)ad); | ||
122 | } | 191 | } |
123 | #endif | 192 | #endif |
124 | 193 | ||
diff --git a/src/lib/libcrypto/bn/arch/amd64/bn_arch.h b/src/lib/libcrypto/bn/arch/amd64/bn_arch.h index 927cd75208..7359f993a7 100644 --- a/src/lib/libcrypto/bn/arch/amd64/bn_arch.h +++ b/src/lib/libcrypto/bn/arch/amd64/bn_arch.h | |||
@@ -1,4 +1,4 @@ | |||
1 | /* $OpenBSD: bn_arch.h,v 1.14 2024/03/26 06:09:25 jsing Exp $ */ | 1 | /* $OpenBSD: bn_arch.h,v 1.16 2025/08/14 15:22:54 jsing Exp $ */ |
2 | /* | 2 | /* |
3 | * Copyright (c) 2023 Joel Sing <jsing@openbsd.org> | 3 | * Copyright (c) 2023 Joel Sing <jsing@openbsd.org> |
4 | * | 4 | * |
@@ -27,13 +27,18 @@ | |||
27 | 27 | ||
28 | #define HAVE_BN_DIV_WORDS | 28 | #define HAVE_BN_DIV_WORDS |
29 | 29 | ||
30 | #define HAVE_BN_MOD_ADD_WORDS | ||
31 | #define HAVE_BN_MOD_SUB_WORDS | ||
32 | |||
30 | #define HAVE_BN_MUL_ADD_WORDS | 33 | #define HAVE_BN_MUL_ADD_WORDS |
31 | #define HAVE_BN_MUL_COMBA4 | 34 | #define HAVE_BN_MUL_COMBA4 |
35 | #define HAVE_BN_MUL_COMBA6 | ||
32 | #define HAVE_BN_MUL_COMBA8 | 36 | #define HAVE_BN_MUL_COMBA8 |
33 | #define HAVE_BN_MUL_WORDS | 37 | #define HAVE_BN_MUL_WORDS |
34 | 38 | ||
35 | #define HAVE_BN_SQR | 39 | #define HAVE_BN_SQR |
36 | #define HAVE_BN_SQR_COMBA4 | 40 | #define HAVE_BN_SQR_COMBA4 |
41 | #define HAVE_BN_SQR_COMBA6 | ||
37 | #define HAVE_BN_SQR_COMBA8 | 42 | #define HAVE_BN_SQR_COMBA8 |
38 | 43 | ||
39 | #define HAVE_BN_SUB | 44 | #define HAVE_BN_SUB |
diff --git a/src/lib/libcrypto/bn/arch/amd64/word_clz.S b/src/lib/libcrypto/bn/arch/amd64/word_clz.S index 3926fcd4b0..705fbdbbda 100644 --- a/src/lib/libcrypto/bn/arch/amd64/word_clz.S +++ b/src/lib/libcrypto/bn/arch/amd64/word_clz.S | |||
@@ -1,3 +1,5 @@ | |||
1 | // $OpenBSD: word_clz.S,v 1.7 2025/08/11 14:13:56 jsing Exp $ | ||
2 | // | ||
1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | 3 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. |
2 | // | 4 | // |
3 | // Permission to use, copy, modify, and/or distribute this software for any | 5 | // Permission to use, copy, modify, and/or distribute this software for any |
@@ -16,7 +18,7 @@ | |||
16 | // Count leading zero bits in a single word | 18 | // Count leading zero bits in a single word |
17 | // Input a; output function return | 19 | // Input a; output function return |
18 | // | 20 | // |
19 | // extern uint64_t word_clz (uint64_t a); | 21 | // extern uint64_t word_clz(uint64_t a); |
20 | // | 22 | // |
21 | // Standard x86-64 ABI: RDI = a, returns RAX | 23 | // Standard x86-64 ABI: RDI = a, returns RAX |
22 | // Microsoft x64 ABI: RCX = a, returns RAX | 24 | // Microsoft x64 ABI: RCX = a, returns RAX |
@@ -30,7 +32,7 @@ | |||
30 | .text | 32 | .text |
31 | 33 | ||
32 | S2N_BN_SYMBOL(word_clz): | 34 | S2N_BN_SYMBOL(word_clz): |
33 | _CET_ENDBR | 35 | _CET_ENDBR |
34 | 36 | ||
35 | #if WINDOWS_ABI | 37 | #if WINDOWS_ABI |
36 | push rdi | 38 | push rdi |