summaryrefslogtreecommitdiff
path: root/src/lib/libcrypto/bn/arch/amd64
diff options
context:
space:
mode:
Diffstat (limited to 'src/lib/libcrypto/bn/arch/amd64')
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_add.S51
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_cmadd.S32
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_cmul.S28
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_modadd.S112
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_modsub.S99
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_mul.S25
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8.S187
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8_alt.S8
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12.S223
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12_alt.S199
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16.S273
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16_alt.S9
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_sqr.S29
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8.S158
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8_alt.S8
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12.S227
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12_alt.S210
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16.S311
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16_alt.S7
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_sub.S47
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bn_arch.c113
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bn_arch.h7
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/word_clz.S6
23 files changed, 2231 insertions, 138 deletions
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_add.S b/src/lib/libcrypto/bn/arch/amd64/bignum_add.S
index 5fe4aae7a1..1d4e6d08ef 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_add.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_add.S
@@ -1,3 +1,5 @@
1// $OpenBSD: bignum_add.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
2//
1// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// 4//
3// Permission to use, copy, modify, and/or distribute this software for any 5// Permission to use, copy, modify, and/or distribute this software for any
@@ -16,9 +18,8 @@
16// Add, z := x + y 18// Add, z := x + y
17// Inputs x[m], y[n]; outputs function return (carry-out) and z[p] 19// Inputs x[m], y[n]; outputs function return (carry-out) and z[p]
18// 20//
19// extern uint64_t bignum_add 21// extern uint64_t bignum_add(uint64_t p, uint64_t *z, uint64_t m,
20// (uint64_t p, uint64_t *z, 22// const uint64_t *x, uint64_t n, const uint64_t *y);
21// uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
22// 23//
23// Does the z := x + y operation, truncating modulo p words in general and 24// Does the z := x + y operation, truncating modulo p words in general and
24// returning a top carry (0 or 1) in the p'th place, only adding the input 25// returning a top carry (0 or 1) in the p'th place, only adding the input
@@ -49,7 +50,7 @@
49 50
50 51
51S2N_BN_SYMBOL(bignum_add): 52S2N_BN_SYMBOL(bignum_add):
52 _CET_ENDBR 53 _CET_ENDBR
53 54
54#if WINDOWS_ABI 55#if WINDOWS_ABI
55 push rdi 56 push rdi
@@ -75,7 +76,7 @@ S2N_BN_SYMBOL(bignum_add):
75 cmp p, n 76 cmp p, n
76 cmovc n, p 77 cmovc n, p
77 cmp m, n 78 cmp m, n
78 jc ylonger 79 jc bignum_add_ylonger
79 80
80// The case where x is longer or of the same size (p >= m >= n) 81// The case where x is longer or of the same size (p >= m >= n)
81 82
@@ -83,27 +84,27 @@ S2N_BN_SYMBOL(bignum_add):
83 sub m, n 84 sub m, n
84 inc m 85 inc m
85 test n, n 86 test n, n
86 jz xtest 87 jz bignum_add_xtest
87xmainloop: 88bignum_add_xmainloop:
88 mov a, [x+8*i] 89 mov a, [x+8*i]
89 adc a, [y+8*i] 90 adc a, [y+8*i]
90 mov [z+8*i],a 91 mov [z+8*i],a
91 inc i 92 inc i
92 dec n 93 dec n
93 jnz xmainloop 94 jnz bignum_add_xmainloop
94 jmp xtest 95 jmp bignum_add_xtest
95xtoploop: 96bignum_add_xtoploop:
96 mov a, [x+8*i] 97 mov a, [x+8*i]
97 adc a, 0 98 adc a, 0
98 mov [z+8*i],a 99 mov [z+8*i],a
99 inc i 100 inc i
100xtest: 101bignum_add_xtest:
101 dec m 102 dec m
102 jnz xtoploop 103 jnz bignum_add_xtoploop
103 mov ashort, 0 104 mov ashort, 0
104 adc a, 0 105 adc a, 0
105 test p, p 106 test p, p
106 jnz tails 107 jnz bignum_add_tails
107#if WINDOWS_ABI 108#if WINDOWS_ABI
108 pop rsi 109 pop rsi
109 pop rdi 110 pop rdi
@@ -112,30 +113,30 @@ xtest:
112 113
113// The case where y is longer (p >= n > m) 114// The case where y is longer (p >= n > m)
114 115
115ylonger: 116bignum_add_ylonger:
116 117
117 sub p, n 118 sub p, n
118 sub n, m 119 sub n, m
119 test m, m 120 test m, m
120 jz ytoploop 121 jz bignum_add_ytoploop
121ymainloop: 122bignum_add_ymainloop:
122 mov a, [x+8*i] 123 mov a, [x+8*i]
123 adc a, [y+8*i] 124 adc a, [y+8*i]
124 mov [z+8*i],a 125 mov [z+8*i],a
125 inc i 126 inc i
126 dec m 127 dec m
127 jnz ymainloop 128 jnz bignum_add_ymainloop
128ytoploop: 129bignum_add_ytoploop:
129 mov a, [y+8*i] 130 mov a, [y+8*i]
130 adc a, 0 131 adc a, 0
131 mov [z+8*i],a 132 mov [z+8*i],a
132 inc i 133 inc i
133 dec n 134 dec n
134 jnz ytoploop 135 jnz bignum_add_ytoploop
135 mov ashort, 0 136 mov ashort, 0
136 adc a, 0 137 adc a, 0
137 test p, p 138 test p, p
138 jnz tails 139 jnz bignum_add_tails
139#if WINDOWS_ABI 140#if WINDOWS_ABI
140 pop rsi 141 pop rsi
141 pop rdi 142 pop rdi
@@ -144,16 +145,16 @@ ytoploop:
144 145
145// Adding a non-trivial tail, when p > max(m,n) 146// Adding a non-trivial tail, when p > max(m,n)
146 147
147tails: 148bignum_add_tails:
148 mov [z+8*i],a 149 mov [z+8*i],a
149 xor a, a 150 xor a, a
150 jmp tail 151 jmp bignum_add_tail
151tailloop: 152bignum_add_tailloop:
152 mov [z+8*i],a 153 mov [z+8*i],a
153tail: 154bignum_add_tail:
154 inc i 155 inc i
155 dec p 156 dec p
156 jnz tailloop 157 jnz bignum_add_tailloop
157#if WINDOWS_ABI 158#if WINDOWS_ABI
158 pop rsi 159 pop rsi
159 pop rdi 160 pop rdi
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_cmadd.S b/src/lib/libcrypto/bn/arch/amd64/bignum_cmadd.S
index 25ba17bce2..a611919603 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_cmadd.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_cmadd.S
@@ -1,3 +1,5 @@
1// $OpenBSD: bignum_cmadd.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
2//
1// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// 4//
3// Permission to use, copy, modify, and/or distribute this software for any 5// Permission to use, copy, modify, and/or distribute this software for any
@@ -16,8 +18,8 @@
16// Multiply-add with single-word multiplier, z := z + c * y 18// Multiply-add with single-word multiplier, z := z + c * y
17// Inputs c, y[n]; outputs function return (carry-out) and z[k] 19// Inputs c, y[n]; outputs function return (carry-out) and z[k]
18// 20//
19// extern uint64_t bignum_cmadd 21// extern uint64_t bignum_cmadd(uint64_t k, uint64_t *z, uint64_t c, uint64_t n,
20// (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y); 22// const uint64_t *y);
21// 23//
22// Does the "z := z + c * y" operation where y is n digits, result z is p. 24// Does the "z := z + c * y" operation where y is n digits, result z is p.
23// Truncates the result in general. 25// Truncates the result in general.
@@ -54,7 +56,7 @@
54 56
55 57
56S2N_BN_SYMBOL(bignum_cmadd): 58S2N_BN_SYMBOL(bignum_cmadd):
57 _CET_ENDBR 59 _CET_ENDBR
58 60
59#if WINDOWS_ABI 61#if WINDOWS_ABI
60 push rdi 62 push rdi
@@ -82,7 +84,7 @@ S2N_BN_SYMBOL(bignum_cmadd):
82 84
83 xor h, h 85 xor h, h
84 test n, n 86 test n, n
85 jz end 87 jz bignum_cmadd_end
86 88
87// Move c into a safer register as multiplies overwrite rdx 89// Move c into a safer register as multiplies overwrite rdx
88 90
@@ -96,11 +98,11 @@ S2N_BN_SYMBOL(bignum_cmadd):
96 mov h, rdx 98 mov h, rdx
97 mov ishort, 1 99 mov ishort, 1
98 dec n 100 dec n
99 jz hightail 101 jz bignum_cmadd_hightail
100 102
101// Main loop, where we always have CF + previous high part h to add in 103// Main loop, where we always have CF + previous high part h to add in
102 104
103loop: 105bignum_cmadd_loop:
104 adc h, [z+8*i] 106 adc h, [z+8*i]
105 sbb r, r 107 sbb r, r
106 mov rax, [x+8*i] 108 mov rax, [x+8*i]
@@ -111,36 +113,36 @@ loop:
111 mov h, rdx 113 mov h, rdx
112 inc i 114 inc i
113 dec n 115 dec n
114 jnz loop 116 jnz bignum_cmadd_loop
115 117
116hightail: 118bignum_cmadd_hightail:
117 adc h, 0 119 adc h, 0
118 120
119// Propagate the carry all the way to the end with h as extra carry word 121// Propagate the carry all the way to the end with h as extra carry word
120 122
121tail: 123bignum_cmadd_tail:
122 test p, p 124 test p, p
123 jz end 125 jz bignum_cmadd_end
124 126
125 add [z+8*i], h 127 add [z+8*i], h
126 mov hshort, 0 128 mov hshort, 0
127 inc i 129 inc i
128 dec p 130 dec p
129 jz highend 131 jz bignum_cmadd_highend
130 132
131tloop: 133bignum_cmadd_tloop:
132 adc [z+8*i], h 134 adc [z+8*i], h
133 inc i 135 inc i
134 dec p 136 dec p
135 jnz tloop 137 jnz bignum_cmadd_tloop
136 138
137highend: 139bignum_cmadd_highend:
138 140
139 adc h, 0 141 adc h, 0
140 142
141// Return the high/carry word 143// Return the high/carry word
142 144
143end: 145bignum_cmadd_end:
144 mov rax, h 146 mov rax, h
145 147
146 pop rbx 148 pop rbx
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_cmul.S b/src/lib/libcrypto/bn/arch/amd64/bignum_cmul.S
index 12f785d63a..eb71d9da44 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_cmul.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_cmul.S
@@ -1,3 +1,5 @@
1// $OpenBSD: bignum_cmul.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
2//
1// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// 4//
3// Permission to use, copy, modify, and/or distribute this software for any 5// Permission to use, copy, modify, and/or distribute this software for any
@@ -16,8 +18,8 @@
16// Multiply by a single word, z := c * y 18// Multiply by a single word, z := c * y
17// Inputs c, y[n]; outputs function return (carry-out) and z[k] 19// Inputs c, y[n]; outputs function return (carry-out) and z[k]
18// 20//
19// extern uint64_t bignum_cmul 21// extern uint64_t bignum_cmul(uint64_t k, uint64_t *z, uint64_t c, uint64_t n,
20// (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y); 22// const uint64_t *y);
21// 23//
22// Does the "z := c * y" operation where y is n digits, result z is p. 24// Does the "z := c * y" operation where y is n digits, result z is p.
23// Truncates the result in general unless p >= n + 1. 25// Truncates the result in general unless p >= n + 1.
@@ -51,7 +53,7 @@
51 53
52 54
53S2N_BN_SYMBOL(bignum_cmul): 55S2N_BN_SYMBOL(bignum_cmul):
54 _CET_ENDBR 56 _CET_ENDBR
55 57
56#if WINDOWS_ABI 58#if WINDOWS_ABI
57 push rdi 59 push rdi
@@ -76,7 +78,7 @@ S2N_BN_SYMBOL(bignum_cmul):
76 xor h, h 78 xor h, h
77 xor i, i 79 xor i, i
78 test n, n 80 test n, n
79 jz tail 81 jz bignum_cmul_tail
80 82
81// Move c into a safer register as multiplies overwrite rdx 83// Move c into a safer register as multiplies overwrite rdx
82 84
@@ -90,11 +92,11 @@ S2N_BN_SYMBOL(bignum_cmul):
90 mov h, rdx 92 mov h, rdx
91 inc i 93 inc i
92 cmp i, n 94 cmp i, n
93 jz tail 95 jz bignum_cmul_tail
94 96
95// Main loop doing the multiplications 97// Main loop doing the multiplications
96 98
97loop: 99bignum_cmul_loop:
98 mov rax, [x+8*i] 100 mov rax, [x+8*i]
99 mul c 101 mul c
100 add rax, h 102 add rax, h
@@ -103,28 +105,28 @@ loop:
103 mov h, rdx 105 mov h, rdx
104 inc i 106 inc i
105 cmp i, n 107 cmp i, n
106 jc loop 108 jc bignum_cmul_loop
107 109
108// Add a tail when the destination is longer 110// Add a tail when the destination is longer
109 111
110tail: 112bignum_cmul_tail:
111 cmp i, p 113 cmp i, p
112 jnc end 114 jnc bignum_cmul_end
113 mov [z+8*i], h 115 mov [z+8*i], h
114 xor h, h 116 xor h, h
115 inc i 117 inc i
116 cmp i, p 118 cmp i, p
117 jnc end 119 jnc bignum_cmul_end
118 120
119tloop: 121bignum_cmul_tloop:
120 mov [z+8*i], h 122 mov [z+8*i], h
121 inc i 123 inc i
122 cmp i, p 124 cmp i, p
123 jc tloop 125 jc bignum_cmul_tloop
124 126
125// Return the high/carry word 127// Return the high/carry word
126 128
127end: 129bignum_cmul_end:
128 mov rax, h 130 mov rax, h
129 131
130#if WINDOWS_ABI 132#if WINDOWS_ABI
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_modadd.S b/src/lib/libcrypto/bn/arch/amd64/bignum_modadd.S
new file mode 100644
index 0000000000..baf27fdc7f
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_modadd.S
@@ -0,0 +1,112 @@
1// $OpenBSD: bignum_modadd.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
2//
3// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
4//
5// Permission to use, copy, modify, and/or distribute this software for any
6// purpose with or without fee is hereby granted, provided that the above
7// copyright notice and this permission notice appear in all copies.
8//
9// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16
17// ----------------------------------------------------------------------------
18// Add modulo m, z := (x + y) mod m, assuming x and y reduced
19// Inputs x[k], y[k], m[k]; output z[k]
20//
21// extern void bignum_modadd(uint64_t k, uint64_t *z, const uint64_t *x,
22// const uint64_t *y, const uint64_t *m);
23//
24// Standard x86-64 ABI: RDI = k, RSI = z, RDX = x, RCX = y, R8 = m
25// Microsoft x64 ABI: RCX = k, RDX = z, R8 = x, R9 = y, [RSP+40] = m
26// ----------------------------------------------------------------------------
27
28#include "s2n_bignum_internal.h"
29
30 .intel_syntax noprefix
31 S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_modadd)
32 S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_modadd)
33 .text
34
35#define k rdi
36#define z rsi
37#define x rdx
38#define y rcx
39#define m r8
40#define i r9
41#define j r10
42#define a rax
43#define c r11
44// Roles: k = word count; z, x, y, m = pointers as in the prototype; i = word index; j = loop countdown, later the saved borrow; a = scratch word; c = top carry, later the mask
45S2N_BN_SYMBOL(bignum_modadd):
46 _CET_ENDBR
47// Windows: rdi/rsi are callee-saved in that ABI, so save them and remap the arguments to the standard registers; the two pushes move the documented [RSP+40] slot for m to [rsp+56]
48#if WINDOWS_ABI
49 push rdi
50 push rsi
51 mov rdi, rcx
52 mov rsi, rdx
53 mov rdx, r8
54 mov rcx, r9
55 mov r8, [rsp+56]
56#endif
57
58// If k = 0 do nothing
59
60 test k, k
61 jz bignum_modadd_end
62
63// First just add (c::z) := x + y
64// The xor instructions leave CF clear for the first adc; inc/dec do not write CF, so the carry chains through the loop
65 xor c, c
66 mov j, k
67 xor i, i
68bignum_modadd_addloop:
69 mov a, [x+8*i]
70 adc a, [y+8*i]
71 mov [z+8*i], a
72 inc i
73 dec j
74 jnz bignum_modadd_addloop
75 adc c, 0
76
77// Now do a comparison subtraction (c::z) - m, recording mask for (c::z) >= m
78// Only the borrow chain matters here (a is never stored); xor i, i clears CF before the first sbb
79 mov j, k
80 xor i, i
81bignum_modadd_cmploop:
82 mov a, [z+8*i]
83 sbb a, [m+8*i]
84 inc i
85 dec j
86 jnz bignum_modadd_cmploop
87 sbb c, 0
88 not c
89// For reduced inputs c is now all-ones when (c::z) >= m and zero otherwise
90// Now do a masked subtraction z := z - [c] * m
91// j is 0 on entry (cmploop counted it down): neg j reloads CF from j, and sbb j, j parks the new borrow as 0/-1 because cmp overwrites CF
92 xor i, i
93bignum_modadd_subloop:
94 mov a, [m+8*i]
95 and a, c
96 neg j
97 sbb [z+8*i], a
98 sbb j, j
99 inc i
100 cmp i, k
101 jc bignum_modadd_subloop
102
103bignum_modadd_end:
104#if WINDOWS_ABI
105 pop rsi
106 pop rdi
107#endif
108 ret
109
110#if defined(__linux__) && defined(__ELF__)
111.section .note.GNU-stack,"",%progbits
112#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_modsub.S b/src/lib/libcrypto/bn/arch/amd64/bignum_modsub.S
new file mode 100644
index 0000000000..63b3230e35
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_modsub.S
@@ -0,0 +1,99 @@
1// $OpenBSD: bignum_modsub.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
2//
3// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
4//
5// Permission to use, copy, modify, and/or distribute this software for any
6// purpose with or without fee is hereby granted, provided that the above
7// copyright notice and this permission notice appear in all copies.
8//
9// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16
17// ----------------------------------------------------------------------------
18// Subtract modulo m, z := (x - y) mod m, assuming x and y reduced
19// Inputs x[k], y[k], m[k]; output z[k]
20//
21// extern void bignum_modsub(uint64_t k, uint64_t *z, const uint64_t *x,
22// const uint64_t *y, const uint64_t *m);
23//
24// Standard x86-64 ABI: RDI = k, RSI = z, RDX = x, RCX = y, R8 = m
25// Microsoft x64 ABI: RCX = k, RDX = z, R8 = x, R9 = y, [RSP+40] = m
26// ----------------------------------------------------------------------------
27
28#include "s2n_bignum_internal.h"
29
30 .intel_syntax noprefix
31 S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_modsub)
32 S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_modsub)
33 .text
34
35#define k rdi
36#define z rsi
37#define x rdx
38#define y rcx
39#define m r8
40#define i r9
41#define j r10
42#define a rax
43#define c r11
44// Roles: k = word count; z, x, y, m = pointers as in the prototype; i = word index; j = loop countdown, later the saved carry; a = scratch word; c = borrow mask
45S2N_BN_SYMBOL(bignum_modsub):
46 _CET_ENDBR
47// Windows: rdi/rsi are callee-saved in that ABI, so save them and remap the arguments to the standard registers; the two pushes move the documented [RSP+40] slot for m to [rsp+56]
48#if WINDOWS_ABI
49 push rdi
50 push rsi
51 mov rdi, rcx
52 mov rsi, rdx
53 mov rdx, r8
54 mov rcx, r9
55 mov r8, [rsp+56]
56#endif
57
58// If k = 0 do nothing
59
60 test k, k
61 jz bignum_modsub_end
62
63// Subtract z := x - y and record a mask for the carry x - y < 0
64// The xor instructions leave CF clear for the first sbb; inc/dec do not write CF, so the borrow chains through the loop; sbb c, c then yields all-ones on a final borrow and zero otherwise
65 xor c, c
66 mov j, k
67 xor i, i
68bignum_modsub_subloop:
69 mov a, [x+8*i]
70 sbb a, [y+8*i]
71 mov [z+8*i], a
72 inc i
73 dec j
74 jnz bignum_modsub_subloop
75 sbb c, c
76
77// Now do a masked addition z := z + [c] * m
78// j is 0 on entry (subloop counted it down): neg j reloads CF from j, and sbb j, j parks the new carry as 0/-1 because cmp overwrites CF
79 xor i, i
80bignum_modsub_addloop:
81 mov a, [m+8*i]
82 and a, c
83 neg j
84 adc [z+8*i], a
85 sbb j, j
86 inc i
87 cmp i, k
88 jc bignum_modsub_addloop
89
90bignum_modsub_end:
91#if WINDOWS_ABI
92 pop rsi
93 pop rdi
94#endif
95 ret
96
97#if defined(__linux__) && defined(__ELF__)
98.section .note.GNU-stack,"",%progbits
99#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul.S
index a3552679a2..538cce9af7 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_mul.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul.S
@@ -1,3 +1,5 @@
1// $OpenBSD: bignum_mul.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
2//
1// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// 4//
3// Permission to use, copy, modify, and/or distribute this software for any 5// Permission to use, copy, modify, and/or distribute this software for any
@@ -16,9 +18,8 @@
16// Multiply z := x * y 18// Multiply z := x * y
17// Inputs x[m], y[n]; output z[k] 19// Inputs x[m], y[n]; output z[k]
18// 20//
19// extern void bignum_mul 21// extern void bignum_mul(uint64_t k, uint64_t *z, uint64_t m, const uint64_t *x,
20// (uint64_t k, uint64_t *z, 22// uint64_t n, const uint64_t *y);
21// uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
22// 23//
23// Does the "z := x * y" operation where x is m digits, y is n, result z is k. 24// Does the "z := x * y" operation where x is m digits, y is n, result z is k.
24// Truncates the result in general unless k >= m + n 25// Truncates the result in general unless k >= m + n
@@ -59,7 +60,7 @@
59 60
60 61
61S2N_BN_SYMBOL(bignum_mul): 62S2N_BN_SYMBOL(bignum_mul):
62 _CET_ENDBR 63 _CET_ENDBR
63 64
64#if WINDOWS_ABI 65#if WINDOWS_ABI
65 push rdi 66 push rdi
@@ -88,7 +89,7 @@ S2N_BN_SYMBOL(bignum_mul):
88// If we did a multiply-add variant, however, then we could 89// If we did a multiply-add variant, however, then we could
89 90
90 test p, p 91 test p, p
91 jz end 92 jz bignum_mul_end
92 93
93// Set initial 2-part sum to zero (we zero c inside the body) 94// Set initial 2-part sum to zero (we zero c inside the body)
94 95
@@ -99,7 +100,7 @@ S2N_BN_SYMBOL(bignum_mul):
99 100
100 xor k, k 101 xor k, k
101 102
102outerloop: 103bignum_mul_outerloop:
103 104
104// Zero our carry term first; we eventually want it and a zero is useful now 105// Zero our carry term first; we eventually want it and a zero is useful now
105// Set a = max 0 (k + 1 - n), i = min (k + 1) m 106// Set a = max 0 (k + 1 - n), i = min (k + 1) m
@@ -125,11 +126,11 @@ outerloop:
125 mov d, k 126 mov d, k
126 sub d, i 127 sub d, i
127 sub i, a 128 sub i, a
128 jbe innerend 129 jbe bignum_mul_innerend
129 lea x,[rcx+8*a] 130 lea x,[rcx+8*a]
130 lea y,[r9+8*d-8] 131 lea y,[r9+8*d-8]
131 132
132innerloop: 133bignum_mul_innerloop:
133 mov rax, [y+8*i] 134 mov rax, [y+8*i]
134 mul QWORD PTR [x] 135 mul QWORD PTR [x]
135 add x, 8 136 add x, 8
@@ -137,9 +138,9 @@ innerloop:
137 adc h, rdx 138 adc h, rdx
138 adc c, 0 139 adc c, 0
139 dec i 140 dec i
140 jnz innerloop 141 jnz bignum_mul_innerloop
141 142
142innerend: 143bignum_mul_innerend:
143 144
144 mov [z], l 145 mov [z], l
145 mov l, h 146 mov l, h
@@ -147,9 +148,9 @@ innerend:
147 add z, 8 148 add z, 8
148 149
149 cmp k, p 150 cmp k, p
150 jc outerloop 151 jc bignum_mul_outerloop
151 152
152end: 153bignum_mul_end:
153 pop r15 154 pop r15
154 pop r14 155 pop r14
155 pop r13 156 pop r13
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8.S
new file mode 100644
index 0000000000..d6ad514020
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8.S
@@ -0,0 +1,187 @@
1// $OpenBSD: bignum_mul_4_8.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
2//
3// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
4//
5// Permission to use, copy, modify, and/or distribute this software for any
6// purpose with or without fee is hereby granted, provided that the above
7// copyright notice and this permission notice appear in all copies.
8//
9// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16
17// ----------------------------------------------------------------------------
18// Multiply z := x * y
19// Inputs x[4], y[4]; output z[8]
20//
21// extern void bignum_mul_4_8(uint64_t z[static 8], const uint64_t x[static 4],
22// const uint64_t y[static 4]);
23//
24// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
25// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y
26// ----------------------------------------------------------------------------
27
28#include "s2n_bignum_internal.h"
29
30 .intel_syntax noprefix
31 S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_4_8)
32 S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_4_8)
33 .text
34
35// z and x already arrive in these registers under the standard ABI (see the header), so they are used in place
36
37#define z rdi
38#define x rsi
39
40// Copied in or set up
41
42#define y rcx
43
44// A zero register
45// zeroe is the 32-bit name of the same register, used by the xor that clears it
46#define zero rbp
47#define zeroe ebp
48
49// Add in x[i] * rdx to the (i,i+1) position with the register window
50// Would be nice to have conditional expressions reg[i], reg[i+1] ...
51// arg1 = row (index into y), arg2 = index into x; the window register is picked by (arg1 + arg2) mod 4. mulx leaves flags alone, so the adcx (CF) and adox (OF) chains carry independently from one invocation to the next. Clobbers rax and rbx.
52.macro mulpadd arg1,arg2
53 mulx rbx, rax, [x+8*\arg2]
54.if ((\arg1 + \arg2) % 4 == 0)
55 adcx r8, rax
56 adox r9, rbx
57.elseif ((\arg1 + \arg2) % 4 == 1)
58 adcx r9, rax
59 adox r10, rbx
60.elseif ((\arg1 + \arg2) % 4 == 2)
61 adcx r10, rax
62 adox r11, rbx
63.elseif ((\arg1 + \arg2) % 4 == 3)
64 adcx r11, rax
65 adox r8, rbx
66.endif
67
68.endm
69
70
71// Add in the whole j'th row
72// arg1 = row index (invoked with 1..3 below); expects the earlier rows to be accumulated in the r8..r11 window already
73.macro addrow arg1
74 mov rdx, [y+8*\arg1]
75 xor zeroe, zeroe
76// The xor re-zeroes rbp and clears both CF and OF, so this row starts fresh adcx/adox chains
77 mulpadd \arg1, 0
78// Column arg1 is complete after its adcx: store it (mov leaves flags intact) and reuse that register as the top word of this row
79.if (\arg1 % 4 == 0)
80 mov [z+8*\arg1],r8
81.elseif (\arg1 % 4 == 1)
82 mov [z+8*\arg1],r9
83.elseif (\arg1 % 4 == 2)
84 mov [z+8*\arg1],r10
85.elseif (\arg1 % 4 == 3)
86 mov [z+8*\arg1],r11
87.endif
88
89 mulpadd \arg1, 1
90 mulpadd \arg1, 2
91// Final product of the row: the high half goes straight into the freed register, which then absorbs the pending OF (adox) and CF (adcx) carries
92.if (\arg1 % 4 == 0)
93 mulx r8, rax, [x+24]
94 adcx r11, rax
95 adox r8, zero
96 adcx r8, zero
97.elseif (\arg1 % 4 == 1)
98 mulx r9, rax, [x+24]
99 adcx r8, rax
100 adox r9, zero
101 adcx r9, zero
102.elseif (\arg1 % 4 == 2)
103 mulx r10, rax, [x+24]
104 adcx r9, rax
105 adox r10, zero
106 adcx r10, zero
107.elseif (\arg1 % 4 == 3)
108 mulx r11, rax, [x+24]
109 adcx r10, rax
110 adox r11, zero
111 adcx r11, zero
112.endif
113
114.endm
115
116
117
118S2N_BN_SYMBOL(bignum_mul_4_8):
119 _CET_ENDBR
120// Windows: preserve rdi/rsi (callee-saved in that ABI) and remap rcx, rdx, r8 onto the standard argument registers
121#if WINDOWS_ABI
122 push rdi
123 push rsi
124 mov rdi, rcx
125 mov rsi, rdx
126 mov rdx, r8
127#endif
128
129// Save more registers to play with
130// rbp becomes the zero register and rbx is scratch for the low product halves; both are callee-saved
131 push rbp
132 push rbx
133
134// Copy y into a safe register to start with
135// (rdx is the implicit multiplicand of mulx and is reloaded with y[j] for every row)
136 mov y, rdx
137
138// Zero a register, which also makes sure we don't get a fake carry-in
139
140 xor zeroe, zeroe
141
142// Do the zeroth row, which is a bit different
143// Write back the zero-zero product and then accumulate
144// r8,r11,r10,r9 as y[0] * x from 1..4
145// Each high half lands in a fresh register, so row 0 needs only the single CF chain started by the xor above; the last adcx folds the final carry into r8
146 mov rdx, [y]
147
148 mulx r9, r8, [x]
149 mov [z], r8
150
151 mulx r10, rbx, [x+8]
152 adcx r9, rbx
153
154 mulx r11, rbx, [x+16]
155 adcx r10, rbx
156
157 mulx r8, rbx, [x+24]
158 adcx r11, rbx
159 adcx r8, zero
160
161// Now all the other rows in a uniform pattern
162
163 addrow 1
164 addrow 2
165 addrow 3
166
167// Now write back the additional columns
168// After rows 1..3 the window r8, r9, r10, r11 holds result words 4..7 in that order
169 mov [z+32], r8
170 mov [z+40], r9
171 mov [z+48], r10
172 mov [z+56], r11
173
174// Restore registers and return
175
176 pop rbx
177 pop rbp
178
179#if WINDOWS_ABI
180 pop rsi
181 pop rdi
182#endif
183 ret
184
185#if defined(__linux__) && defined(__ELF__)
186.section .note.GNU-stack,"",%progbits
187#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8_alt.S
index 70ff69e372..2592d1d658 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8_alt.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8_alt.S
@@ -1,3 +1,5 @@
1// $OpenBSD: bignum_mul_4_8_alt.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
2//
1// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// 4//
3// Permission to use, copy, modify, and/or distribute this software for any 5// Permission to use, copy, modify, and/or distribute this software for any
@@ -16,8 +18,8 @@
16// Multiply z := x * y 18// Multiply z := x * y
17// Inputs x[4], y[4]; output z[8] 19// Inputs x[4], y[4]; output z[8]
18// 20//
19// extern void bignum_mul_4_8_alt 21// extern void bignum_mul_4_8_alt(uint64_t z[static 8], const uint64_t x[static 4],
20// (uint64_t z[static 8], uint64_t x[static 4], uint64_t y[static 4]); 22// const uint64_t y[static 4]);
21// 23//
22// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y 24// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
23// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y 25// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y
@@ -72,7 +74,7 @@
72 adc h, rdx 74 adc h, rdx
73 75
74S2N_BN_SYMBOL(bignum_mul_4_8_alt): 76S2N_BN_SYMBOL(bignum_mul_4_8_alt):
75 _CET_ENDBR 77 _CET_ENDBR
76 78
77#if WINDOWS_ABI 79#if WINDOWS_ABI
78 push rdi 80 push rdi
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12.S
new file mode 100644
index 0000000000..56cbdf06e0
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12.S
@@ -0,0 +1,223 @@
1// $OpenBSD: bignum_mul_6_12.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
2//
3// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
4//
5// Permission to use, copy, modify, and/or distribute this software for any
6// purpose with or without fee is hereby granted, provided that the above
7// copyright notice and this permission notice appear in all copies.
8//
9// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16
17// ----------------------------------------------------------------------------
18// Multiply z := x * y
19// Inputs x[6], y[6]; output z[12]
20//
21// extern void bignum_mul_6_12(uint64_t z[static 12], const uint64_t x[static 6],
22// const uint64_t y[static 6]);
23//
24// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
25// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y
26// ----------------------------------------------------------------------------
27
28#include "s2n_bignum_internal.h"
29
30 .intel_syntax noprefix
31 S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_6_12)
32 S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_6_12)
33 .text
34
35// z and x already arrive in these registers under the standard ABI (see the header), so they are used in place
36
37#define z rdi
38#define x rsi
39
40// Copied in or set up
41
42#define y rcx
43
44// A zero register
45// zeroe is the 32-bit name of the same register, used by the xor that clears it
46#define zero rbp
47#define zeroe ebp
48
49// Add in x[i] * rdx to the (i,i+1) position with the register window
50// Would be nice to have conditional expressions reg[i], reg[i+1] ...
51// arg1 = row (index into y), arg2 = index into x; the window register is picked by (arg1 + arg2) mod 6. mulx leaves flags alone, so the adcx (CF) and adox (OF) chains carry independently from one invocation to the next. Clobbers rax and rbx.
52.macro mulpadd arg1,arg2
53 mulx rbx, rax, [x+8*\arg2]
54.if ((\arg1 + \arg2) % 6 == 0)
55 adcx r8, rax
56 adox r9, rbx
57.elseif ((\arg1 + \arg2) % 6 == 1)
58 adcx r9, rax
59 adox r10, rbx
60.elseif ((\arg1 + \arg2) % 6 == 2)
61 adcx r10, rax
62 adox r11, rbx
63.elseif ((\arg1 + \arg2) % 6 == 3)
64 adcx r11, rax
65 adox r12, rbx
66.elseif ((\arg1 + \arg2) % 6 == 4)
67 adcx r12, rax
68 adox r13, rbx
69.elseif ((\arg1 + \arg2) % 6 == 5)
70 adcx r13, rax
71 adox r8, rbx
72.endif
73
74.endm
75
76
77// Add in the whole j'th row
78// arg1 = row index (invoked with 1..5 below); expects the earlier rows to be accumulated in the r8..r13 window already
79.macro addrow arg1
80 mov rdx, [y+8*\arg1]
81 xor zeroe, zeroe
82// The xor re-zeroes rbp and clears both CF and OF, so this row starts fresh adcx/adox chains
83 mulpadd \arg1, 0
84// Column arg1 is complete after its adcx: store it (mov leaves flags intact) and reuse that register as the top word of this row
85.if (\arg1 % 6 == 0)
86 mov [z+8*\arg1],r8
87.elseif (\arg1 % 6 == 1)
88 mov [z+8*\arg1],r9
89.elseif (\arg1 % 6 == 2)
90 mov [z+8*\arg1],r10
91.elseif (\arg1 % 6 == 3)
92 mov [z+8*\arg1],r11
93.elseif (\arg1 % 6 == 4)
94 mov [z+8*\arg1],r12
95.elseif (\arg1 % 6 == 5)
96 mov [z+8*\arg1],r13
97.endif
98
99 mulpadd \arg1, 1
100 mulpadd \arg1, 2
101 mulpadd \arg1, 3
102 mulpadd \arg1, 4
103// Final product of the row: the high half goes straight into the freed register, which then absorbs the pending OF (adox) and CF (adcx) carries
104.if (\arg1 % 6 == 0)
105 mulx r8, rax, [x+40]
106 adcx r13, rax
107 adox r8, zero
108 adcx r8, zero
109.elseif (\arg1 % 6 == 1)
110 mulx r9, rax, [x+40]
111 adcx r8, rax
112 adox r9, zero
113 adcx r9, zero
114.elseif (\arg1 % 6 == 2)
115 mulx r10, rax, [x+40]
116 adcx r9, rax
117 adox r10, zero
118 adcx r10, zero
119.elseif (\arg1 % 6 == 3)
120 mulx r11, rax, [x+40]
121 adcx r10, rax
122 adox r11, zero
123 adcx r11, zero
124.elseif (\arg1 % 6 == 4)
125 mulx r12, rax, [x+40]
126 adcx r11, rax
127 adox r12, zero
128 adcx r12, zero
129.elseif (\arg1 % 6 == 5)
130 mulx r13, rax, [x+40]
131 adcx r12, rax
132 adox r13, zero
133 adcx r13, zero
134.endif
135
136.endm
137
138
139
140S2N_BN_SYMBOL(bignum_mul_6_12):
141 _CET_ENDBR
142// Windows: preserve rdi/rsi (callee-saved in that ABI) and remap rcx, rdx, r8 onto the standard argument registers
143#if WINDOWS_ABI
144 push rdi
145 push rsi
146 mov rdi, rcx
147 mov rsi, rdx
148 mov rdx, r8
149#endif
150
151// Save more registers to play with
152// rbp becomes the zero register, rbx is scratch for the low product halves, r12/r13 extend the window; all four are callee-saved
153 push rbp
154 push rbx
155 push r12
156 push r13
157
158// Copy y into a safe register to start with
159// (rdx is the implicit multiplicand of mulx and is reloaded with y[j] for every row)
160 mov y, rdx
161
162// Zero a register, which also makes sure we don't get a fake carry-in
163
164 xor zeroe, zeroe
165
166// Do the zeroth row, which is a bit different
167// Write back the zero-zero product and then accumulate
168// r8,r13,r12,r11,r10,r9 as y[0] * x from 1..6
169// Each high half lands in a fresh register, so row 0 needs only the single CF chain started by the xor above; the last adcx folds the final carry into r8
170 mov rdx, [y]
171
172 mulx r9, r8, [x]
173 mov [z], r8
174
175 mulx r10, rbx, [x+8]
176 adcx r9, rbx
177
178 mulx r11, rbx, [x+16]
179 adcx r10, rbx
180
181 mulx r12, rbx, [x+24]
182 adcx r11, rbx
183
184 mulx r13, rbx, [x+32]
185 adcx r12, rbx
186
187 mulx r8, rbx, [x+40]
188 adcx r13, rbx
189 adcx r8, zero
190
191// Now all the other rows in a uniform pattern
192
193 addrow 1
194 addrow 2
195 addrow 3
196 addrow 4
197 addrow 5
198
199// Now write back the additional columns
200// After rows 1..5 the window r8, r9, r10, r11, r12, r13 holds result words 6..11 in that order
201 mov [z+48], r8
202 mov [z+56], r9
203 mov [z+64], r10
204 mov [z+72], r11
205 mov [z+80], r12
206 mov [z+88], r13
207
208// Restore registers and return
209
210 pop r13
211 pop r12
212 pop rbx
213 pop rbp
214
215#if WINDOWS_ABI
216 pop rsi
217 pop rdi
218#endif
219 ret
220
221#if defined(__linux__) && defined(__ELF__)
222.section .note.GNU-stack,"",%progbits
223#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12_alt.S
new file mode 100644
index 0000000000..077c52b38e
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12_alt.S
@@ -0,0 +1,199 @@
1// $OpenBSD: bignum_mul_6_12_alt.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
2//
3// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
4//
5// Permission to use, copy, modify, and/or distribute this software for any
6// purpose with or without fee is hereby granted, provided that the above
7// copyright notice and this permission notice appear in all copies.
8//
9// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16
17// ----------------------------------------------------------------------------
18// Multiply z := x * y
19// Inputs x[6], y[6]; output z[12]
20//
21// extern void bignum_mul_6_12_alt(uint64_t z[static 12],
22// const uint64_t x[static 6],
23// const uint64_t y[static 6]);
24//
25// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
26// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y
27// ----------------------------------------------------------------------------
28
29#include "s2n_bignum_internal.h"
30
31 .intel_syntax noprefix
32 S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_6_12_alt)
33 S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_6_12_alt)
34 .text
35
36// These are actually right: z and x stay in their standard-ABI argument registers
37
38#define z rdi
39#define x rsi
40
41// This is moved from rdx to free it for muls (mul writes the high half of its product to rdx)
42
43#define y rcx
44
45// Other variables used as a rotating 3-word window to add terms to
46// (which of t0, t1, t2 is the low word changes from one result term to the next)
47#define t0 r8
48#define t1 r9
49#define t2 r10
50
51// Macro for the key "multiply and add to (c,h,l)" step: (c,h,l) += numa * numb,
52// where c is the most significant word. Clobbers rax, rdx and the flags.
53#define combadd(c,h,l,numa,numb) \
54 mov rax, numa; \
55 mul QWORD PTR numb; \
56 add l, rax; \
57 adc h, rdx; \
58 adc c, 0
59
60// A minutely shorter form for when c = 0 initially: with c zero on entry,
61// "adc c, c" leaves just the carry in c, exactly as "adc c, 0" would.
62#define combadz(c,h,l,numa,numb) \
63 mov rax, numa; \
64 mul QWORD PTR numb; \
65 add l, rax; \
66 adc h, rdx; \
67 adc c, c
68
69// A short form where we don't expect a top carry: (h,l) += numa * numb,
70// and any carry out of h is not recorded. Clobbers rax, rdx and the flags.
71#define combads(h,l,numa,numb) \
72 mov rax, numa; \
73 mul QWORD PTR numb; \
74 add l, rax; \
75 adc h, rdx
76
77S2N_BN_SYMBOL(bignum_mul_6_12_alt):
78 _CET_ENDBR
79// Windows only: save rdi/rsi and move z, x, y from rcx, rdx, r8 into rdi, rsi, rdx
80#if WINDOWS_ABI
81 push rdi
82 push rsi
83 mov rdi, rcx
84 mov rsi, rdx
85 mov rdx, r8
86#endif
87
88// Copy y into a safe register to start with, since every mul below overwrites rdx
89
90 mov y, rdx
91
92// Result term 0: x[0] * y[0]; the low word is final, the high word seeds the window
93
94 mov rax, [x]
95 mul QWORD PTR [y]
96
97 mov [z], rax
98 mov t0, rdx
99 xor t1, t1
100
101// Result term 1: all products x[i] * y[j] with i + j = 1, window (t2,t1,t0)
102
103 xor t2, t2
104 combads(t1,t0,[x],[y+8]) // t1 starts at zero, so this sum cannot carry out of t1
105 combadz(t2,t1,t0,[x+8],[y])
106 mov [z+8], t0
107
108// Result term 2: window rotates to (t0,t2,t1); the word just stored becomes the new top
109
110 xor t0, t0
111 combadz(t0,t2,t1,[x],[y+16])
112 combadd(t0,t2,t1,[x+8],[y+8])
113 combadd(t0,t2,t1,[x+16],[y])
114 mov [z+16], t1
115
116// Result term 3: window (t1,t0,t2)
117
118 xor t1, t1
119 combadz(t1,t0,t2,[x],[y+24])
120 combadd(t1,t0,t2,[x+8],[y+16])
121 combadd(t1,t0,t2,[x+16],[y+8])
122 combadd(t1,t0,t2,[x+24],[y])
123 mov [z+24], t2
124
125// Result term 4: window (t2,t1,t0)
126
127 xor t2, t2
128 combadz(t2,t1,t0,[x],[y+32])
129 combadd(t2,t1,t0,[x+8],[y+24])
130 combadd(t2,t1,t0,[x+16],[y+16])
131 combadd(t2,t1,t0,[x+24],[y+8])
132 combadd(t2,t1,t0,[x+32],[y])
133 mov [z+32], t0
134
135// Result term 5: window (t0,t2,t1); the longest column, with all six products
136
137 xor t0, t0
138 combadz(t0,t2,t1,[x],[y+40])
139 combadd(t0,t2,t1,[x+8],[y+32])
140 combadd(t0,t2,t1,[x+16],[y+24])
141 combadd(t0,t2,t1,[x+24],[y+16])
142 combadd(t0,t2,t1,[x+32],[y+8])
143 combadd(t0,t2,t1,[x+40],[y])
144 mov [z+40], t1
145
146// Result term 6: window (t1,t0,t2); from here on the columns get shorter again
147
148 xor t1, t1
149 combadz(t1,t0,t2,[x+8],[y+40])
150 combadd(t1,t0,t2,[x+16],[y+32])
151 combadd(t1,t0,t2,[x+24],[y+24])
152 combadd(t1,t0,t2,[x+32],[y+16])
153 combadd(t1,t0,t2,[x+40],[y+8])
154 mov [z+48], t2
155
156// Result term 7: window (t2,t1,t0)
157
158 xor t2, t2
159 combadz(t2,t1,t0,[x+16],[y+40])
160 combadd(t2,t1,t0,[x+24],[y+32])
161 combadd(t2,t1,t0,[x+32],[y+24])
162 combadd(t2,t1,t0,[x+40],[y+16])
163 mov [z+56], t0
164
165// Result term 8: window (t0,t2,t1)
166
167 xor t0, t0
168 combadz(t0,t2,t1,[x+24],[y+40])
169 combadd(t0,t2,t1,[x+32],[y+32])
170 combadd(t0,t2,t1,[x+40],[y+24])
171 mov [z+64], t1
172
173// Result term 9: window (t1,t0,t2)
174
175 xor t1, t1
176 combadz(t1,t0,t2,[x+32],[y+40])
177 combadd(t1,t0,t2,[x+40],[y+32])
178 mov [z+72], t2
179
180// Result term 10: the last product; the full result fits in 12 words, so (t1,t0) cannot overflow
181
182 combads(t1,t0,[x+40],[y+40])
183 mov [z+80], t0
184
185// Result term 11: whatever is left in the top word
186
187 mov [z+88], t1
188
189// Return
190
191#if WINDOWS_ABI
192 pop rsi
193 pop rdi
194#endif
195 ret
196
197#if defined(__linux__) && defined(__ELF__)
198.section .note.GNU-stack,"",%progbits
199#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16.S
new file mode 100644
index 0000000000..faa0196d8e
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16.S
@@ -0,0 +1,273 @@
1// $OpenBSD: bignum_mul_8_16.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
2//
3// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
4//
5// Permission to use, copy, modify, and/or distribute this software for any
6// purpose with or without fee is hereby granted, provided that the above
7// copyright notice and this permission notice appear in all copies.
8//
9// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16
17// ----------------------------------------------------------------------------
18// Multiply z := x * y
19// Inputs x[8], y[8]; output z[16]
20//
21// extern void bignum_mul_8_16(uint64_t z[static 16], const uint64_t x[static 8],
22// const uint64_t y[static 8]);
23//
24// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
25// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y
26// ----------------------------------------------------------------------------
27
28#include "s2n_bignum_internal.h"
29
30 .intel_syntax noprefix
31 S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_8_16)
32 S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_8_16)
33 .text
34
35// These are actually right: z and x stay in their standard-ABI argument registers
36
37#define z rdi
38#define x rsi
39
40// Copied in or set up: y arrives in rdx, which mulx needs as its implicit operand
41
42#define y rcx
43
44// A zero register
45// (zeroe is its 32-bit name, used to clear it with xor)
46#define zero rbp
47#define zeroe ebp
48
49// mulpadd i, j adds x[i] * rdx (now assumed = y[j]) into the window at i+j:
50// low half via adcx into word (i+j) mod 8 of r8..r15, high half via adox into the next word up
51.macro mulpadd arg1,arg2
52 mulx rbx, rax, [x+8*\arg1] // rbx:rax = x[i] * rdx, flags untouched
53.if ((\arg1 + \arg2) % 8 == 0)
54 adcx r8, rax
55 adox r9, rbx
56.elseif ((\arg1 + \arg2) % 8 == 1)
57 adcx r9, rax
58 adox r10, rbx
59.elseif ((\arg1 + \arg2) % 8 == 2)
60 adcx r10, rax
61 adox r11, rbx
62.elseif ((\arg1 + \arg2) % 8 == 3)
63 adcx r11, rax
64 adox r12, rbx
65.elseif ((\arg1 + \arg2) % 8 == 4)
66 adcx r12, rax
67 adox r13, rbx
68.elseif ((\arg1 + \arg2) % 8 == 5)
69 adcx r13, rax
70 adox r14, rbx
71.elseif ((\arg1 + \arg2) % 8 == 6)
72 adcx r14, rax
73 adox r15, rbx
74.elseif ((\arg1 + \arg2) % 8 == 7)
75 adcx r15, rax
76 adox r8, rbx // the window wraps around: r8 follows r15
77.endif
78
79.endm
80
81// mulpade i, j adds x[i] * rdx (now assumed = y[j]) into the window at i+j
82// but re-creates the top word assuming nothing to add there
83// (mulx writes the high half straight into it, then adox with zero absorbs the OF carry)
84.macro mulpade arg1,arg2
85.if ((\arg1 + \arg2) % 8 == 0)
86 mulx r9, rax, [x+8*\arg1]
87 adcx r8, rax
88 adox r9, zero
89.elseif ((\arg1 + \arg2) % 8 == 1)
90 mulx r10, rax, [x+8*\arg1]
91 adcx r9, rax
92 adox r10, zero
93.elseif ((\arg1 + \arg2) % 8 == 2)
94 mulx r11, rax, [x+8*\arg1]
95 adcx r10, rax
96 adox r11, zero
97.elseif ((\arg1 + \arg2) % 8 == 3)
98 mulx r12, rax, [x+8*\arg1]
99 adcx r11, rax
100 adox r12, zero
101.elseif ((\arg1 + \arg2) % 8 == 4)
102 mulx r13, rax, [x+8*\arg1]
103 adcx r12, rax
104 adox r13, zero
105.elseif ((\arg1 + \arg2) % 8 == 5)
106 mulx r14, rax, [x+8*\arg1]
107 adcx r13, rax
108 adox r14, zero
109.elseif ((\arg1 + \arg2) % 8 == 6)
110 mulx r15, rax, [x+8*\arg1]
111 adcx r14, rax
112 adox r15, zero
113.elseif ((\arg1 + \arg2) % 8 == 7)
114 mulx r8, rax, [x+8*\arg1] // the window wraps around: r8 follows r15
115 adcx r15, rax
116 adox r8, zero
117.endif
118
119.endm
120
121// Add in the whole j'th row: the window r8..r15 += x * y[j], positioned at word j
122// Stores the finished word z[j] and recycles its register as the new top word
123.macro addrow arg1
124 mov rdx, [y+8*\arg1] // rdx = y[j], the implicit mulx operand
125 xor zeroe, zeroe // zero = 0, and CF = OF = 0 to start both carry chains
126
127 mulpadd 0, \arg1
128// Word j has now received its last addend, so write it out
129.if (\arg1 % 8 == 0)
130 mov [z+8*\arg1],r8
131.elseif (\arg1 % 8 == 1)
132 mov [z+8*\arg1],r9
133.elseif (\arg1 % 8 == 2)
134 mov [z+8*\arg1],r10
135.elseif (\arg1 % 8 == 3)
136 mov [z+8*\arg1],r11
137.elseif (\arg1 % 8 == 4)
138 mov [z+8*\arg1],r12
139.elseif (\arg1 % 8 == 5)
140 mov [z+8*\arg1],r13
141.elseif (\arg1 % 8 == 6)
142 mov [z+8*\arg1],r14
143.elseif (\arg1 % 8 == 7)
144 mov [z+8*\arg1],r15
145.endif
146
147 mulpadd 1, \arg1
148 mulpadd 2, \arg1
149 mulpadd 3, \arg1
150 mulpadd 4, \arg1
151 mulpadd 5, \arg1
152 mulpadd 6, \arg1
153 mulpade 7, \arg1 // overwrites the register just stored with the new top word
154// The OF chain ended inside mulpade; add the leftover CF into the new top word
155.if (\arg1 % 8 == 0)
156 adc r8, zero
157.elseif (\arg1 % 8 == 1)
158 adc r9, zero
159.elseif (\arg1 % 8 == 2)
160 adc r10, zero
161.elseif (\arg1 % 8 == 3)
162 adc r11, zero
163.elseif (\arg1 % 8 == 4)
164 adc r12, zero
165.elseif (\arg1 % 8 == 5)
166 adc r13, zero
167.elseif (\arg1 % 8 == 6)
168 adc r14, zero
169.elseif (\arg1 % 8 == 7)
170 adc r15, zero
171.endif
172
173.endm
174
175
176S2N_BN_SYMBOL(bignum_mul_8_16):
177 _CET_ENDBR
178// Windows only: save rdi/rsi and move z, x, y from rcx, rdx, r8 into rdi, rsi, rdx
179#if WINDOWS_ABI
180 push rdi
181 push rsi
182 mov rdi, rcx
183 mov rsi, rdx
184 mov rdx, r8
185#endif
186
187// Save more registers to play with: zero (rbp), scratch (rbx) and window words r12-r15
188
189 push rbp
190 push rbx
191 push r12
192 push r13
193 push r14
194 push r15
195
196// Copy y into a safe register to start with, since mulx needs rdx as its implicit operand
197
198 mov y, rdx
199
200// Zero a register, which also makes sure we don't get a fake carry-in
201
202 xor zeroe, zeroe
203
204// Do the zeroth row, which is a bit different since there is nothing to add to yet
205// Write back the low word of the zero-zero product and then accumulate
206// [r8,r15,r14,r13,r12,r11,r10,r9] (high to low) as words 1..8 of y[0] * x
207
208 mov rdx, [y] // rdx = y[0]
209
210 mulx r9, r8, [x] // r9:r8 = x[0] * y[0]
211 mov [z], r8 // word 0 of the result is already final
212
213 mulx r10, rbx, [x+8] // mulx leaves the flags alone, so CF threads through the adc chain
214 adc r9, rbx // only one carry chain in this row, so plain adc is enough
215
216 mulx r11, rbx, [x+16]
217 adc r10, rbx
218
219 mulx r12, rbx, [x+24]
220 adc r11, rbx
221
222 mulx r13, rbx, [x+32]
223 adc r12, rbx
224
225 mulx r14, rbx, [x+40]
226 adc r13, rbx
227
228 mulx r15, rbx, [x+48]
229 adc r14, rbx
230
231 mulx r8, rbx, [x+56] // r8 is recycled as the top word now that z[0] is stored
232 adc r15, rbx
233 adc r8, zero // pick up the final carry in the top word
234
235// Now all the other rows in a uniform pattern
236
237 addrow 1
238 addrow 2
239 addrow 3
240 addrow 4
241 addrow 5
242 addrow 6
243 addrow 7
244
245// Now write back the additional columns z[8..15], taken from r8..r15 in that order
246
247 mov [z+64], r8
248 mov [z+72], r9
249 mov [z+80], r10
250 mov [z+88], r11
251 mov [z+96], r12
252 mov [z+104], r13
253 mov [z+112], r14
254 mov [z+120], r15
255
256// Real epilog: restore the saved registers in reverse order of the pushes
257
258 pop r15
259 pop r14
260 pop r13
261 pop r12
262 pop rbx
263 pop rbp
264
265#if WINDOWS_ABI
266 pop rsi
267 pop rdi
268#endif
269 ret
270
271#if defined(__linux__) && defined(__ELF__)
272.section .note.GNU-stack,"",%progbits
273#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16_alt.S
index 066403b074..0e30b9170f 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16_alt.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16_alt.S
@@ -1,3 +1,5 @@
1// $OpenBSD: bignum_mul_8_16_alt.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
2//
1// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// 4//
3// Permission to use, copy, modify, and/or distribute this software for any 5// Permission to use, copy, modify, and/or distribute this software for any
@@ -16,8 +18,9 @@
16// Multiply z := x * y 18// Multiply z := x * y
17// Inputs x[8], y[8]; output z[16] 19// Inputs x[8], y[8]; output z[16]
18// 20//
19// extern void bignum_mul_8_16_alt 21// extern void bignum_mul_8_16_alt(uint64_t z[static 16],
20// (uint64_t z[static 16], uint64_t x[static 8], uint64_t y[static 8]); 22// const uint64_t x[static 8],
23// const uint64_t y[static 8]);
21// 24//
22// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y 25// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
23// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y 26// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y
@@ -72,7 +75,7 @@
72 adc h, rdx 75 adc h, rdx
73 76
74S2N_BN_SYMBOL(bignum_mul_8_16_alt): 77S2N_BN_SYMBOL(bignum_mul_8_16_alt):
75 _CET_ENDBR 78 _CET_ENDBR
76 79
77#if WINDOWS_ABI 80#if WINDOWS_ABI
78 push rdi 81 push rdi
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr.S
index 54e3f59442..86f1af2ac4 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr.S
@@ -1,3 +1,5 @@
1// $OpenBSD: bignum_sqr.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
2//
1// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// 4//
3// Permission to use, copy, modify, and/or distribute this software for any 5// Permission to use, copy, modify, and/or distribute this software for any
@@ -16,8 +18,7 @@
16// Square z := x^2 18// Square z := x^2
17// Input x[n]; output z[k] 19// Input x[n]; output z[k]
18// 20//
19// extern void bignum_sqr 21// extern void bignum_sqr(uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x);
20// (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x);
21// 22//
22// Does the "z := x^2" operation where x is n digits and result z is k. 23// Does the "z := x^2" operation where x is n digits and result z is k.
23// Truncates the result in general unless k >= 2 * n 24// Truncates the result in general unless k >= 2 * n
@@ -62,7 +63,7 @@
62#define llshort ebp 63#define llshort ebp
63 64
64S2N_BN_SYMBOL(bignum_sqr): 65S2N_BN_SYMBOL(bignum_sqr):
65 _CET_ENDBR 66 _CET_ENDBR
66 67
67#if WINDOWS_ABI 68#if WINDOWS_ABI
68 push rdi 69 push rdi
@@ -86,7 +87,7 @@ S2N_BN_SYMBOL(bignum_sqr):
86// If p = 0 the result is trivial and nothing needs doing 87// If p = 0 the result is trivial and nothing needs doing
87 88
88 test p, p 89 test p, p
89 jz end 90 jz bignum_sqr_end
90 91
91// initialize (hh,ll) = 0 92// initialize (hh,ll) = 0
92 93
@@ -97,7 +98,7 @@ S2N_BN_SYMBOL(bignum_sqr):
97 98
98 xor k, k 99 xor k, k
99 100
100outerloop: 101bignum_sqr_outerloop:
101 102
102// First let bot = MAX 0 (k + 1 - n) and top = MIN (k + 1) n 103// First let bot = MAX 0 (k + 1 - n) and top = MIN (k + 1) n
103// We want to accumulate all x[i] * x[k - i] for bot <= i < top 104// We want to accumulate all x[i] * x[k - i] for bot <= i < top
@@ -122,7 +123,7 @@ outerloop:
122// If htop <= bot then main doubled part of the sum is empty 123// If htop <= bot then main doubled part of the sum is empty
123 124
124 cmp i, htop 125 cmp i, htop
125 jnc nosumming 126 jnc bignum_sqr_nosumming
126 127
127// Use a moving pointer for [y] = x[k-i] for the cofactor 128// Use a moving pointer for [y] = x[k-i] for the cofactor
128 129
@@ -132,7 +133,7 @@ outerloop:
132 133
133// Do the main part of the sum x[i] * x[k - i] for 2 * i < k 134// Do the main part of the sum x[i] * x[k - i] for 2 * i < k
134 135
135innerloop: 136bignum_sqr_innerloop:
136 mov a, [x+8*i] 137 mov a, [x+8*i]
137 mul QWORD PTR [y] 138 mul QWORD PTR [y]
138 add l, a 139 add l, a
@@ -141,7 +142,7 @@ innerloop:
141 sub y, 8 142 sub y, 8
142 inc i 143 inc i
143 cmp i, htop 144 cmp i, htop
144 jc innerloop 145 jc bignum_sqr_innerloop
145 146
146// Now double it 147// Now double it
147 148
@@ -151,11 +152,11 @@ innerloop:
151 152
152// If k is even (which means 2 * i = k) and i < n add the extra x[i]^2 term 153// If k is even (which means 2 * i = k) and i < n add the extra x[i]^2 term
153 154
154nosumming: 155bignum_sqr_nosumming:
155 test k, 1 156 test k, 1
156 jnz innerend 157 jnz bignum_sqr_innerend
157 cmp i, n 158 cmp i, n
158 jnc innerend 159 jnc bignum_sqr_innerend
159 160
160 mov a, [x+8*i] 161 mov a, [x+8*i]
161 mul a 162 mul a
@@ -165,7 +166,7 @@ nosumming:
165 166
166// Now add the local sum into the global sum, store and shift 167// Now add the local sum into the global sum, store and shift
167 168
168innerend: 169bignum_sqr_innerend:
169 add l, ll 170 add l, ll
170 mov [z+8*k], l 171 mov [z+8*k], l
171 adc h, hh 172 adc h, hh
@@ -175,11 +176,11 @@ innerend:
175 176
176 inc k 177 inc k
177 cmp k, p 178 cmp k, p
178 jc outerloop 179 jc bignum_sqr_outerloop
179 180
180// Restore registers and return 181// Restore registers and return
181 182
182end: 183bignum_sqr_end:
183 pop r15 184 pop r15
184 pop r14 185 pop r14
185 pop r13 186 pop r13
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8.S
new file mode 100644
index 0000000000..25664782f7
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8.S
@@ -0,0 +1,158 @@
1// $OpenBSD: bignum_sqr_4_8.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
2//
3// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
4//
5// Permission to use, copy, modify, and/or distribute this software for any
6// purpose with or without fee is hereby granted, provided that the above
7// copyright notice and this permission notice appear in all copies.
8//
9// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16
17// ----------------------------------------------------------------------------
18// Square, z := x^2
19// Input x[4]; output z[8]
20//
21// extern void bignum_sqr_4_8(uint64_t z[static 8], const uint64_t x[static 4]);
22//
23// Standard x86-64 ABI: RDI = z, RSI = x
24// Microsoft x64 ABI: RCX = z, RDX = x
25// ----------------------------------------------------------------------------
26
27#include "s2n_bignum_internal.h"
28
29 .intel_syntax noprefix
30 S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_4_8)
31 S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_4_8)
32 .text
33
34// These are actually right: z and x stay in their standard-ABI argument registers
35
36#define z rdi
37#define x rsi
38
39// A zero register
40// (zeroe is its 32-bit name, used to clear it with xor)
41#define zero rbp
42#define zeroe ebp
43
44// Other registers: d1..d6 hold words 1..6 of the result while it is built up
45
46#define d1 r8
47#define d2 r9
48#define d3 r10
49#define d4 r11
50#define d5 r12
51#define d6 r13
52
53
54
55S2N_BN_SYMBOL(bignum_sqr_4_8):
56 _CET_ENDBR
57// Windows only: save rdi/rsi and move z, x from rcx, rdx into rdi, rsi
58#if WINDOWS_ABI
59 push rdi
60 push rsi
61 mov rdi, rcx
62 mov rsi, rdx
63#endif
64
65// Save more registers to play with: rbp is zero, r12 and r13 are d5 and d6
66
67 push rbp
68 push r12
69 push r13
70
71// Set up an initial window [d6;...d1] = [23;03;01], where ij means x[i] * x[j]
72
73 mov rdx, [x]
74 mulx d2, d1, [x+8] // d2:d1 = x[0] * x[1]
75 mulx d4, d3, [x+24] // d4:d3 = x[0] * x[3]
76 mov rdx, [x+16]
77 mulx d6, d5, [x+24] // d6:d5 = x[2] * x[3]
78
79// Clear our zero register, and also initialize the flags for the carry chain
80
81 xor zeroe, zeroe
82
83// Chain in the addition of 02 + 12 + 13 to that window (no carry-out possible)
84// This gives all the "heterogeneous" terms of the squaring ready to double
85
86 mulx rcx, rax, [x] // 02 (rdx still holds x[2])
87 adcx d2, rax // low halves ride the CF chain
88 adox d3, rcx // high halves ride the OF chain, one word higher
89 mulx rcx, rax, [x+8] // 12
90 adcx d3, rax
91 adox d4, rcx
92 mov rdx, [x+24]
93 mulx rcx, rax, [x+8] // 13
94 adcx d4, rax
95 adox d5, rcx
96 adcx d5, zero // flush both carry chains into the top of the window
97 adox d6, zero
98 adcx d6, zero
99
100// In principle this is otiose as CF and OF carries are absorbed at this point
101// However it seems helpful for the OOO engine to be told it's a fresh start
102
103 xor zeroe, zeroe
104
105// Double and add to the 00 + 11 + 22 + 33 terms
106//
107// We could use shift-double but this seems tidier and in larger squarings
108// it was actually more efficient. I haven't experimented with this small
109// case to see how much that matters. Note: the writeback here is sprinkled
110// into the sequence in such a way that things still work if z = x, i.e. if
111// the output overwrites the input buffer and beyond.
112
113 mov rdx, [x]
114 mulx rdx, rax, rdx // rdx:rax = x[0]^2
115 mov [z], rax
116 adcx d1, d1 // the CF chain doubles each window word...
117 adox d1, rdx // ...while the OF chain adds in the square terms
118 mov rdx, [x+8] // x[1] is read before z[1] is written, in case z = x
119 mov [z+8], d1
120 mulx rdx, rax, rdx // rdx:rax = x[1]^2
121 adcx d2, d2
122 adox d2, rax
123 adcx d3, d3
124 adox d3, rdx
125 mov rdx, [x+16] // likewise x[2] before z[2]
126 mov [z+16], d2
127 mulx rdx, rax, rdx // rdx:rax = x[2]^2
128 adcx d4, d4
129 adox d4, rax
130 adcx d5, d5
131 adox d5, rdx
132 mov rdx, [x+24] // likewise x[3] before z[3]; x is not read again after this
133 mov [z+24], d3
134 mulx rdx, rax, rdx // rdx:rax = x[3]^2
135 mov [z+32], d4
136 adcx d6, d6
137 mov [z+40], d5
138 adox d6, rax
139 mov [z+48], d6
140 adcx rdx, zero // top word = high half of x[3]^2 plus both final carries
141 adox rdx, zero
142 mov [z+56], rdx
143
144// Restore saved registers and return
145
146 pop r13
147 pop r12
148 pop rbp
149
150#if WINDOWS_ABI
151 pop rsi
152 pop rdi
153#endif
154 ret
155
156#if defined(__linux__) && defined(__ELF__)
157.section .note.GNU-stack,"",%progbits
158#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8_alt.S
index 7c534ae907..7eafac3284 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8_alt.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8_alt.S
@@ -1,3 +1,5 @@
1// $OpenBSD: bignum_sqr_4_8_alt.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
2//
1// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// 4//
3// Permission to use, copy, modify, and/or distribute this software for any 5// Permission to use, copy, modify, and/or distribute this software for any
@@ -16,8 +18,8 @@
16// Square, z := x^2 18// Square, z := x^2
17// Input x[4]; output z[8] 19// Input x[4]; output z[8]
18// 20//
19// extern void bignum_sqr_4_8_alt 21// extern void bignum_sqr_4_8_alt(uint64_t z[static 8],
20// (uint64_t z[static 8], uint64_t x[static 4]); 22// const uint64_t x[static 4]);
21// 23//
22// Standard x86-64 ABI: RDI = z, RSI = x 24// Standard x86-64 ABI: RDI = z, RSI = x
23// Microsoft x64 ABI: RCX = z, RDX = x 25// Microsoft x64 ABI: RCX = z, RDX = x
@@ -71,7 +73,7 @@
71 adc c, 0 73 adc c, 0
72 74
73S2N_BN_SYMBOL(bignum_sqr_4_8_alt): 75S2N_BN_SYMBOL(bignum_sqr_4_8_alt):
74 _CET_ENDBR 76 _CET_ENDBR
75 77
76#if WINDOWS_ABI 78#if WINDOWS_ABI
77 push rdi 79 push rdi
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12.S
new file mode 100644
index 0000000000..3f055e8b75
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12.S
@@ -0,0 +1,227 @@
1// $OpenBSD: bignum_sqr_6_12.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
2//
3// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
4//
5// Permission to use, copy, modify, and/or distribute this software for any
6// purpose with or without fee is hereby granted, provided that the above
7// copyright notice and this permission notice appear in all copies.
8//
9// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16
17// ----------------------------------------------------------------------------
18// Square, z := x^2
19// Input x[6]; output z[12]
20//
21// extern void bignum_sqr_6_12(uint64_t z[static 12], const uint64_t x[static 6]);
22//
23// Standard x86-64 ABI: RDI = z, RSI = x
24// Microsoft x64 ABI: RCX = z, RDX = x
25// ----------------------------------------------------------------------------
26
27#include "s2n_bignum_internal.h"
28
29 .intel_syntax noprefix
30 S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_6_12)
31 S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_6_12)
32 .text
33
34// These are actually right: z and x stay in their standard-ABI argument registers
35
36#define z rdi
37#define x rsi
38
39// A zero register
40// (zeroe is its 32-bit name, used to clear it with xor)
41#define zero rbp
42#define zeroe ebp
43
44// Other registers: d1..d10 hold words 1..10 of the result while it is built up
45
46#define d1 r8
47#define d2 r9
48#define d3 r10
49#define d4 r11
50#define d5 r12
51#define d6 r13
52#define d7 r14
53#define d8 r15
54#define d9 rbx
55
56// Care is needed: re-using the zero register
57// (d10 and zero are both rbp, so zero is only usable until d10 is first written)
58#define d10 rbp
59
60
61S2N_BN_SYMBOL(bignum_sqr_6_12):
62 _CET_ENDBR
63// Windows only: save rdi/rsi and move z, x from rcx, rdx into rdi, rsi
64#if WINDOWS_ABI
65 push rdi
66 push rsi
67 mov rdi, rcx
68 mov rsi, rdx
69#endif
70
71// Save more registers to play with: rbp is zero/d10, rbx is d9, r12-r15 are d5-d8
72
73 push rbp
74 push rbx
75 push r12
76 push r13
77 push r14
78 push r15
79
80// Set up an initial window [d8;...d1] = [34;05;03;01], where ij means x[i] * x[j]
81
82 mov rdx, [x]
83 mulx d2, d1, [x+8] // d2:d1 = x[0] * x[1]
84 mulx d4, d3, [x+24] // d4:d3 = x[0] * x[3]
85 mulx d6, d5, [x+40] // d6:d5 = x[0] * x[5]
86 mov rdx, [x+24]
87 mulx d8, d7, [x+32] // d8:d7 = x[3] * x[4]
88
89// Clear our zero register, and also initialize the flags for the carry chain
90
91 xor zeroe, zeroe
92
93// Chain in the addition of 02 + 12 + 13 + 14 + 15 to that window
94// (no carry-out possible since we add it to the top of a product)
95
96 mov rdx, [x+16]
97 mulx rcx, rax, [x] // 02
98 adcx d2, rax // low halves ride the CF chain
99 adox d3, rcx // high halves ride the OF chain, one word higher
100 mulx rcx, rax, [x+8] // 12
101 adcx d3, rax
102 adox d4, rcx
103 mov rdx, [x+8]
104 mulx rcx, rax, [x+24] // 13
105 adcx d4, rax
106 adox d5, rcx
107 mulx rcx, rax, [x+32] // 14
108 adcx d5, rax
109 adox d6, rcx
110 mulx rcx, rax, [x+40] // 15
111 adcx d6, rax
112 adox d7, rcx
113 adcx d7, zero // flush both carry chains into the top of the window
114 adox d8, zero
115 adcx d8, zero
116
117// Again zero out the flags. Actually they are already cleared but it may
118// help decouple these in the OOO engine not to wait for the chain above
119
120 xor zeroe, zeroe
121
122// Now chain in the 04 + 23 + 24 + 25 + 35 + 45 terms
123// We are running out of registers and here our zero register is not zero!
124
125 mov rdx, [x+32]
126 mulx rcx, rax, [x] // 04
127 adcx d4, rax
128 adox d5, rcx
129 mov rdx, [x+16]
130 mulx rcx, rax, [x+24] // 23
131 adcx d5, rax
132 adox d6, rcx
133 mulx rcx, rax, [x+32] // 24
134 adcx d6, rax
135 adox d7, rcx
136 mulx rcx, rax, [x+40] // 25
137 adcx d7, rax
138 adox d8, rcx
139 mov rdx, [x+24]
140 mulx d9, rax, [x+40] // 35: the high half creates d9 directly
141 adcx d8, rax
142 adox d9, zero // rbp is still zero here, so this adds only the OF carry
143 mov rdx, [x+32]
144 mulx d10, rax, [x+40] // 45: d10 is rbp, so the zero register is gone from here on
145 adcx d9, rax
146 mov eax, 0 // mov rather than xor: CF and OF must survive
147 adox d10, rax
148 adcx d10, rax
149
150// Again, just for a clear fresh start for the flags
151
152 xor eax, eax
153
154// Double and add to the 00 + 11 + 22 + 33 + 44 + 55 terms
155//
156// We could use shift-double but this seems tidier and in larger squarings
157// it was actually more efficient. I haven't experimented with this small
158// case to see how much that matters. Note: the writeback here is sprinkled
159// into the sequence in such a way that things still work if z = x, i.e. if
160// the output overwrites the input buffer and beyond.
161
162 mov rdx, [x]
163 mulx rdx, rax, rdx // rdx:rax = x[0]^2
164 mov [z], rax
165 adcx d1, d1 // the CF chain doubles each window word...
166 adox d1, rdx // ...while the OF chain adds in the square terms
167 mov rdx, [x+8] // x[1] is read before z[1] is written, in case z = x
168 mov [z+8], d1
169 mulx rdx, rax, rdx // rdx:rax = x[1]^2
170 adcx d2, d2
171 adox d2, rax
172 adcx d3, d3
173 adox d3, rdx
174 mov rdx, [x+16] // likewise x[2] before z[2]
175 mov [z+16], d2
176 mulx rdx, rax, rdx // rdx:rax = x[2]^2
177 adcx d4, d4
178 adox d4, rax
179 adcx d5, d5
180 adox d5, rdx
181 mov rdx, [x+24] // likewise x[3] before z[3]
182 mov [z+24], d3
183 mulx rdx, rax, rdx // rdx:rax = x[3]^2
184 adcx d6, d6
185 adox d6, rax
186 adcx d7, d7
187 adox d7, rdx
188 mov rdx, [x+32] // likewise x[4] before z[4]
189 mov [z+32], d4
190 mulx rdx, rax, rdx // rdx:rax = x[4]^2
191 adcx d8, d8
192 adox d8, rax
193 adcx d9, d9
194 adox d9, rdx
195 mov rdx, [x+40] // likewise x[5] before z[5]; x is not read again after this
196 mov [z+40], d5
197 mulx rdx, rax, rdx // rdx:rax = x[5]^2
198 mov [z+48], d6
199 adcx d10, d10
200 mov [z+56], d7
201 adox d10, rax
202 mov [z+64], d8
203 mov eax, 0 // flag-preserving zero again, as rbp now holds d10
204 mov [z+72], d9
205 adcx rdx, rax // top word = high half of x[5]^2 plus both final carries
206 mov [z+80], d10
207 adox rdx, rax
208 mov [z+88], rdx
209
210// Restore saved registers and return
211
212 pop r15
213 pop r14
214 pop r13
215 pop r12
216 pop rbx
217 pop rbp
218
219#if WINDOWS_ABI
220 pop rsi
221 pop rdi
222#endif
223 ret
225#if defined(__linux__) && defined(__ELF__)
226.section .note.GNU-stack,"",%progbits
227#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12_alt.S
new file mode 100644
index 0000000000..eb43b0a15b
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12_alt.S
@@ -0,0 +1,210 @@
1// $OpenBSD: bignum_sqr_6_12_alt.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
2//
3// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
4//
5// Permission to use, copy, modify, and/or distribute this software for any
6// purpose with or without fee is hereby granted, provided that the above
7// copyright notice and this permission notice appear in all copies.
8//
9// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16
17// ----------------------------------------------------------------------------
18// Square, z := x^2
19// Input x[6]; output z[12]
20//
21// extern void bignum_sqr_6_12_alt(uint64_t z[static 12],
22// const uint64_t x[static 6]);
23//
24// Standard x86-64 ABI: RDI = z, RSI = x
25// Microsoft x64 ABI: RCX = z, RDX = x
26// ----------------------------------------------------------------------------
27
28#include "s2n_bignum_internal.h"
29
30 .intel_syntax noprefix
31 S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_6_12_alt)
32 S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_6_12_alt)
33 .text
34
35// Input arguments, in their standard-ABI registers
36
37#define z rdi
38#define x rsi
39
40// Other variables used as a rotating 3-word window to add terms to
41// (which of t0, t1, t2 is the low word changes from one result term to the next)
42#define t0 r8
43#define t1 r9
44#define t2 r10
45
46// Additional temporaries for local windows to share doublings:
47// cross products are summed in (u1,u0) first so that the whole sum is doubled once
48#define u0 rcx
49#define u1 r11
50
51// Macro for the key "multiply and add to (c,h,l)" step: (c,h,l) += numa * numb,
52// where c is the most significant word. Clobbers rax, rdx and the flags.
53#define combadd(c,h,l,numa,numb) \
54 mov rax, numa; \
55 mul QWORD PTR numb; \
56 add l, rax; \
57 adc h, rdx; \
58 adc c, 0
59
60// Set up initial window (c,h,l) = numa * numb
61// (c is simply cleared, as the product fits in two words). Clobbers rax, rdx and the flags.
62#define combaddz(c,h,l,numa,numb) \
63 mov rax, numa; \
64 mul QWORD PTR numb; \
65 xor c, c; \
66 mov l, rax; \
67 mov h, rdx
68
69// Doubling step (c,h,l) = 2 * (c,hh,ll) + (0,h,l)
70// Note that hh and ll are doubled in place, so their old values are lost
71#define doubladd(c,h,l,hh,ll) \
72 add ll, ll; \
73 adc hh, hh; \
74 adc c, c; \
75 add l, ll; \
76 adc h, hh; \
77 adc c, 0
78
79// Square term incorporation (c,h,l) += numa^2
80// Clobbers rax, rdx and the flags.
81#define combadd1(c,h,l,numa) \
82 mov rax, numa; \
83 mul rax; \
84 add l, rax; \
85 adc h, rdx; \
86 adc c, 0
87
88// A short form where we don't expect a top carry: (h,l) += numa^2,
89// and any carry out of h is not recorded. Clobbers rax, rdx and the flags.
90#define combads(h,l,numa) \
91 mov rax, numa; \
92 mul rax; \
93 add l, rax; \
94 adc h, rdx
95
96// A version doubling directly before adding, for single non-square terms:
97// (c,h,l) += 2 * numa * numb, where the bit doubled out of rdx goes into c
98#define combadd2(c,h,l,numa,numb) \
99 mov rax, numa; \
100 mul QWORD PTR numb; \
101 add rax, rax; \
102 adc rdx, rdx; \
103 adc c, 0; \
104 add l, rax; \
105 adc h, rdx; \
106 adc c, 0
107
108S2N_BN_SYMBOL(bignum_sqr_6_12_alt):
109 _CET_ENDBR
110// Windows only: save rdi/rsi and move z, x from rcx, rdx into rdi, rsi
111#if WINDOWS_ABI
112 push rdi
113 push rsi
114 mov rdi, rcx
115 mov rsi, rdx
116#endif
117
118// Result term 0: x[0]^2; the low word is final, the high word seeds the window
119
120 mov rax, [x]
121 mul rax
122
123 mov [z], rax
124 mov t0, rdx
125 xor t1, t1
126
127// Result term 1: 2 * x[0]*x[1], window (t2,t1,t0)
128
129 xor t2, t2
130 combadd2(t2,t1,t0,[x],[x+8])
131 mov [z+8], t0
132
133// Result term 2: x[1]^2 + 2 * x[0]*x[2], window (t0,t2,t1)
134
135 xor t0, t0
136 combadd1(t0,t2,t1,[x+8])
137 combadd2(t0,t2,t1,[x],[x+16])
138 mov [z+16], t1
139
140// Result term 3: cross products are summed in (t1,u1,u0), then doubled into the window
141
142 combaddz(t1,u1,u0,[x],[x+24]) // (t1,u1,u0) = x[0]*x[3]
143 combadd(t1,u1,u0,[x+8],[x+16])
144 doubladd(t1,t0,t2,u1,u0) // (t1,t0,t2) = 2 * (t1,u1,u0) + (0,t0,t2)
145 mov [z+24], t2
146
147// Result term 4: doubled cross products, then x[2]^2 added undoubled
148
149 combaddz(t2,u1,u0,[x],[x+32])
150 combadd(t2,u1,u0,[x+8],[x+24])
151 doubladd(t2,t1,t0,u1,u0)
152 combadd1(t2,t1,t0,[x+16])
153 mov [z+32], t0
154
155// Result term 5: three cross products and no square term
156
157 combaddz(t0,u1,u0,[x],[x+40])
158 combadd(t0,u1,u0,[x+8],[x+32])
159 combadd(t0,u1,u0,[x+16],[x+24])
160 doubladd(t0,t2,t1,u1,u0)
161 mov [z+40], t1
162
163// Result term 6: doubled cross products, then x[3]^2
164
165 combaddz(t1,u1,u0,[x+8],[x+40])
166 combadd(t1,u1,u0,[x+16],[x+32])
167 doubladd(t1,t0,t2,u1,u0)
168 combadd1(t1,t0,t2,[x+24])
169 mov [z+48], t2
170
171// Result term 7: two cross products and no square term
172
173 combaddz(t2,u1,u0,[x+16],[x+40])
174 combadd(t2,u1,u0,[x+24],[x+32])
175 doubladd(t2,t1,t0,u1,u0)
176 mov [z+56], t0
177
178// Result term 8: a single cross product, so combadd2 doubles it directly; then x[4]^2
179
180 xor t0, t0
181 combadd2(t0,t2,t1,[x+24],[x+40])
182 combadd1(t0,t2,t1,[x+32])
183 mov [z+64], t1
184
185// Result term 9: 2 * x[4]*x[5]
186
187 xor t1, t1
188 combadd2(t1,t0,t2,[x+32],[x+40])
189 mov [z+72], t2
190
191// Result term 10: x[5]^2; the full square fits in 12 words, so (t1,t0) cannot overflow
192
193 combads(t1,t0,[x+40])
194 mov [z+80], t0
195
196// Result term 11: whatever is left in the top word
197
198 mov [z+88], t1
199
200// Return
201
202#if WINDOWS_ABI
203 pop rsi
204 pop rdi
205#endif
206 ret
207
208#if defined(__linux__) && defined(__ELF__)
209.section .note.GNU-stack,"",%progbits
210#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16.S
new file mode 100644
index 0000000000..41277b5b6a
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16.S
@@ -0,0 +1,311 @@
1// $OpenBSD: bignum_sqr_8_16.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
2//
3// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
4//
5// Permission to use, copy, modify, and/or distribute this software for any
6// purpose with or without fee is hereby granted, provided that the above
7// copyright notice and this permission notice appear in all copies.
8//
9// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16
17// ----------------------------------------------------------------------------
18// Square, z := x^2
19// Input x[8]; output z[16]
20//
21// extern void bignum_sqr_8_16(uint64_t z[static 16], const uint64_t x[static 8]);
22//
23// Standard x86-64 ABI: RDI = z, RSI = x
24// Microsoft x64 ABI: RCX = z, RDX = x
25// ----------------------------------------------------------------------------
26
27#include "s2n_bignum_internal.h"
28
29 .intel_syntax noprefix
30 S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_8_16)
31 S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_8_16)
32 .text
33
34// Argument registers: z and x are used directly in the standard ABI argument registers (rdi, rsi)
35
36#define z rdi
37#define x rsi
38
39// A zero register
40
41#define zero rbp
42#define zeroe ebp
43
44// mulpadd i, j adds rdx * x[i] into the window at the i+j point
//
// The window is the eight registers r8..r15 used circularly: position
// (i + j) % 8 selects r8 (0) through r15 (7), and the slot after r15
// wraps round to r8.
//
// In:       rdx = multiplier (set by the caller), x = input pointer
// Effect:   low half of the product is added to window[(i+j) % 8] on the
//           CF carry chain (adcx); high half is added to the following
//           window register on the OF carry chain (adox)
// Clobbers: rax, rcx; updates CF and OF
//
// The second argument is only used to select the window position.
45
46.macro mulpadd arg1,arg2
// rcx:rax = rdx * x[arg1] (high:low)
47 mulx rcx, rax, [x+8*\arg1]
48.if ((\arg1 + \arg2) % 8 == 0)
49 adcx r8, rax
50 adox r9, rcx
51.elseif ((\arg1 + \arg2) % 8 == 1)
52 adcx r9, rax
53 adox r10, rcx
54.elseif ((\arg1 + \arg2) % 8 == 2)
55 adcx r10, rax
56 adox r11, rcx
57.elseif ((\arg1 + \arg2) % 8 == 3)
58 adcx r11, rax
59 adox r12, rcx
60.elseif ((\arg1 + \arg2) % 8 == 4)
61 adcx r12, rax
62 adox r13, rcx
63.elseif ((\arg1 + \arg2) % 8 == 5)
64 adcx r13, rax
65 adox r14, rcx
66.elseif ((\arg1 + \arg2) % 8 == 6)
67 adcx r14, rax
68 adox r15, rcx
69.elseif ((\arg1 + \arg2) % 8 == 7)
// Position 7 wraps: the high half goes into r8
70 adcx r15, rax
71 adox r8, rcx
72.endif
73
74.endm
75
76// mulpade i, j adds rdx * x[i] into the window at i+j
77// but re-creates the top word assuming nothing to add there
//
// Unlike mulpadd, the high half of the product is written by mulx
// straight into the window register that follows position (i + j) % 8,
// overwriting whatever that register held before. The low half is then
// added to window[(i+j) % 8] on the CF chain (adcx), and the pending OF
// carry is folded into the freshly written top register by adding the
// zero register (adox).
//
// In:       rdx = multiplier (set by the caller), x = input pointer,
//           zero (rbp) must contain 0
// Clobbers: rax; updates CF and OF
78
79.macro mulpade arg1,arg2
80.if ((\arg1 + \arg2) % 8 == 0)
81 mulx r9, rax, [x+8*\arg1]
82 adcx r8, rax
83 adox r9, zero
84.elseif ((\arg1 + \arg2) % 8 == 1)
85 mulx r10, rax, [x+8*\arg1]
86 adcx r9, rax
87 adox r10, zero
88.elseif ((\arg1 + \arg2) % 8 == 2)
89 mulx r11, rax, [x+8*\arg1]
90 adcx r10, rax
91 adox r11, zero
92.elseif ((\arg1 + \arg2) % 8 == 3)
93 mulx r12, rax, [x+8*\arg1]
94 adcx r11, rax
95 adox r12, zero
96.elseif ((\arg1 + \arg2) % 8 == 4)
97 mulx r13, rax, [x+8*\arg1]
98 adcx r12, rax
99 adox r13, zero
100.elseif ((\arg1 + \arg2) % 8 == 5)
101 mulx r14, rax, [x+8*\arg1]
102 adcx r13, rax
103 adox r14, zero
104.elseif ((\arg1 + \arg2) % 8 == 6)
105 mulx r15, rax, [x+8*\arg1]
106 adcx r14, rax
107 adox r15, zero
108.elseif ((\arg1 + \arg2) % 8 == 7)
// Position 7 wraps: the new top word is created in r8
109 mulx r8, rax, [x+8*\arg1]
110 adcx r15, rax
111 adox r8, zero
112.endif
113
114.endm
115
// diagonals: body of the 8-word squaring, z[0..15] := x[0..7]^2.
//
// Phase 1 accumulates the off-diagonal products x[i] * x[j] (i != j, each
// pair once) in the circular window r8..r15, spilling finished low words
// to z[1]..z[8]. Phase 2 doubles that sum and adds the squares x[i]^2,
// writing all sixteen result words.
//
// In the section comments a two-digit label "ij" stands for the product
// x[i] * x[j].
//
// In:       z (rdi) = output pointer, x (rsi) = input pointer
// Clobbers: rax, rcx, rdx, rbp (used as the zero register), r8..r15, flags
116.macro diagonals
117
// rbp := 0; xor also clears CF and OF, starting clean carry chains
118 xor zeroe, zeroe
119
120// Set initial window [r8..r10] + 2 wb = 10 + 20 + 30 + 40 + 50 + 60 + 70
121
// rdx = x[0] is multiplied by x[1]..x[7]. The high halves alternate
// between rax and rcx and are added into the next low half on a single
// CF chain. The two lowest words are already final for phase 1 and are
// stored to z[1] and z[2].
122 mov rdx, [x]
123 mulx rax, r9, [x+8]
124 mov [z+8], r9
125 mulx rcx, r10, [x+16]
126 adcx r10, rax
127 mov [z+16], r10
128 mulx rax, r11, [x+24]
129 adcx r11, rcx
130 mulx rcx, r12, [x+32]
131 adcx r12, rax
132 mulx rax, r13, [x+40]
133 adcx r13, rcx
134 mulx rcx, r14, [x+48]
135 adcx r14, rax
136 mulx r8, r15, [x+56]
137 adcx r15, rcx
// Fold the last carry into the top word of the window
138 adcx r8, zero
139
140// Add in the next diagonal = 21 + 31 + 41 + 51 + 61 + 71 + 54
141
// Reset CF and OF for a fresh pair of carry chains (rbp stays 0)
142 xor zeroe, zeroe
143 mov rdx, [x+8]
144 mulpadd 2, 1
145 mov [z+24], r11
146 mulpadd 3, 1
147 mov [z+32], r12
148 mulpadd 4, 1
149 mulpadd 5, 1
150 mulpadd 6, 1
151 mulpade 7, 1
// Switch multiplier to x[4] for the 54 term
152 mov rdx, [x+32]
153 mulpade 5, 4
// Absorb the remaining CF into the newly created top word
154 adcx r10, zero
155
156// And the next one = 32 + 42 + 52 + 62 + 72 + 64 + 65
157
158 xor zeroe, zeroe
159 mov rdx, [x+16]
160 mulpadd 3, 2
161 mov [z+40], r13
162 mulpadd 4, 2
163 mov [z+48], r14
164 mulpadd 5, 2
165 mulpadd 6, 2
166 mulpadd 7, 2
// Switch multiplier to x[6] for the 64 and 65 terms
167 mov rdx, [x+48]
168 mulpade 4, 6
169 mulpade 5, 6
170 adcx r12, zero
171
172// And the final one = 43 + 53 + 63 + 73 + 74 + 75 + 76
173
174 xor zeroe, zeroe
175 mov rdx, [x+24]
176 mulpadd 4, 3
177 mov [z+56], r15
178 mulpadd 5, 3
179 mov [z+64], r8
180 mulpadd 6, 3
181 mulpadd 7, 3
// Switch multiplier to x[7] for the 74, 75 and 76 terms
182 mov rdx, [x+56]
183 mulpadd 4, 7
184 mulpade 5, 7
185 mulpade 6, 7
186 adcx r14, zero
187
188// Double and add things; use z[1]..z[8] and thereafter the registers
189// r9..r15 which haven't been written back yet
190
// Each word w of the off-diagonal sum is doubled with "adcx w, w" (the
// CF chain carries the bit shifted out), and the matching half of x[i]^2
// is added with adox (OF chain). mulx with rdx as both source and low
// destination yields rcx:rdx = x[i]^2 (for x[0], rcx:rax).
191 xor zeroe, zeroe
192 mov rdx, [x]
193 mulx rcx, rax, rdx
// z[0] is just the low half of x[0]^2
194 mov [z], rax
195 mov rax, [z+8]
196 adcx rax, rax
197 adox rax, rcx
198 mov [z+8], rax
199
200 mov rax, [z+16]
201 mov rdx, [x+8]
202 mulx rcx, rdx, rdx
203 adcx rax, rax
204 adox rax, rdx
205 mov [z+16], rax
206 mov rax, [z+24]
207 adcx rax, rax
208 adox rax, rcx
209 mov [z+24], rax
210
211 mov rax, [z+32]
212 mov rdx, [x+16]
213 mulx rcx, rdx, rdx
214 adcx rax, rax
215 adox rax, rdx
216 mov [z+32], rax
217 mov rax, [z+40]
218 adcx rax, rax
219 adox rax, rcx
220 mov [z+40], rax
221
222 mov rax, [z+48]
223 mov rdx, [x+24]
224 mulx rcx, rdx, rdx
225 adcx rax, rax
226 adox rax, rdx
227 mov [z+48], rax
228 mov rax, [z+56]
229 adcx rax, rax
230 adox rax, rcx
231 mov [z+56], rax
232
233 mov rax, [z+64]
234 mov rdx, [x+32]
235 mulx rcx, rdx, rdx
236 adcx rax, rax
237 adox rax, rdx
238 mov [z+64], rax
// From here on the off-diagonal words are still live in r9..r15
239 adcx r9, r9
240 adox r9, rcx
241 mov [z+72], r9
242
243 mov rdx, [x+40]
244 mulx rcx, rdx, rdx
245 adcx r10, r10
246 adox r10, rdx
247 mov [z+80], r10
248 adcx r11, r11
249 adox r11, rcx
250 mov [z+88], r11
251
252 mov rdx, [x+48]
253 mulx rcx, rdx, rdx
254 adcx r12, r12
255 adox r12, rdx
256 mov [z+96], r12
257 adcx r13, r13
258 adox r13, rcx
259 mov [z+104], r13
260
// Top word: high half of x[7]^2 (in r15) plus the final carry of each
// chain; there is no off-diagonal word left to double at this position
261 mov rdx, [x+56]
262 mulx r15, rdx, rdx
263 adcx r14, r14
264 adox r14, rdx
265 mov [z+112], r14
266 adcx r15, zero
267 adox r15, zero
268 mov [z+120], r15
269
270.endm
271
272
// Entry point: void bignum_sqr_8_16(uint64_t z[static 16],
//                                   const uint64_t x[static 8]);
// Saves the callee-saved registers used by the diagonals macro (rbp,
// r12..r15), expands the macro inline, restores them and returns. No
// calls are made and no stack locals are used.
// The body uses MULX/ADCX/ADOX; the C dispatcher in bn_arch.c only selects
// this routine when the ADX capability bit is set.
273S2N_BN_SYMBOL(bignum_sqr_8_16):
// NOTE(review): presumably expands to an endbr64 landing pad when CET is
// enabled - defined outside this file, confirm in the included header
274 _CET_ENDBR
275
// Microsoft x64 ABI: rdi/rsi are callee-saved there and the arguments
// arrive in rcx/rdx, so preserve rdi/rsi and move the arguments into the
// registers the rest of the code expects
276#if WINDOWS_ABI
277 push rdi
278 push rsi
279 mov rdi, rcx
280 mov rsi, rdx
281#endif
282
283// Save more registers to play with
284
285 push rbp
286 push r12
287 push r13
288 push r14
289 push r15
290
291// Do the multiplication
292
293 diagonals
294
295// Real epilog
296
// Restore in the reverse order of the pushes above
297 pop r15
298 pop r14
299 pop r13
300 pop r12
301 pop rbp
302
303#if WINDOWS_ABI
304 pop rsi
305 pop rdi
306#endif
307 ret
308
309#if defined(__linux__) && defined(__ELF__)
310.section .note.GNU-stack,"",%progbits
311#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16_alt.S
index ac0b6f96c2..cb10ba2a12 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16_alt.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16_alt.S
@@ -1,3 +1,5 @@
1// $OpenBSD: bignum_sqr_8_16_alt.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
2//
1// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// 4//
3// Permission to use, copy, modify, and/or distribute this software for any 5// Permission to use, copy, modify, and/or distribute this software for any
@@ -16,7 +18,8 @@
16// Square, z := x^2 18// Square, z := x^2
17// Input x[8]; output z[16] 19// Input x[8]; output z[16]
18// 20//
19// extern void bignum_sqr_8_16_alt (uint64_t z[static 16], uint64_t x[static 8]); 21// extern void bignum_sqr_8_16_alt(uint64_t z[static 16],
22// const uint64_t x[static 8]);
20// 23//
21// Standard x86-64 ABI: RDI = z, RSI = x 24// Standard x86-64 ABI: RDI = z, RSI = x
22// Microsoft x64 ABI: RCX = z, RDX = x 25// Microsoft x64 ABI: RCX = z, RDX = x
@@ -103,7 +106,7 @@
103 adc c, 0 106 adc c, 0
104 107
105S2N_BN_SYMBOL(bignum_sqr_8_16_alt): 108S2N_BN_SYMBOL(bignum_sqr_8_16_alt):
106 _CET_ENDBR 109 _CET_ENDBR
107 110
108#if WINDOWS_ABI 111#if WINDOWS_ABI
109 push rdi 112 push rdi
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sub.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sub.S
index 3ff8a30510..7324d3a71e 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_sub.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sub.S
@@ -1,3 +1,5 @@
1// $OpenBSD: bignum_sub.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
2//
1// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// 4//
3// Permission to use, copy, modify, and/or distribute this software for any 5// Permission to use, copy, modify, and/or distribute this software for any
@@ -16,9 +18,8 @@
16// Subtract, z := x - y 18// Subtract, z := x - y
17// Inputs x[m], y[n]; outputs function return (carry-out) and z[p] 19// Inputs x[m], y[n]; outputs function return (carry-out) and z[p]
18// 20//
19// extern uint64_t bignum_sub 21// extern uint64_t bignum_sub(uint64_t p, uint64_t *z, uint64_t m,
20// (uint64_t p, uint64_t *z, 22// const uint64_t *x, uint64_t n, const uint64_t *y);
21// uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
22// 23//
23// Does the z := x - y operation, truncating modulo p words in general and 24// Does the z := x - y operation, truncating modulo p words in general and
24// returning a top borrow (0 or 1) in the p'th place, only subtracting input 25// returning a top borrow (0 or 1) in the p'th place, only subtracting input
@@ -49,7 +50,7 @@
49 50
50 51
51S2N_BN_SYMBOL(bignum_sub): 52S2N_BN_SYMBOL(bignum_sub):
52 _CET_ENDBR 53 _CET_ENDBR
53 54
54#if WINDOWS_ABI 55#if WINDOWS_ABI
55 push rdi 56 push rdi
@@ -75,7 +76,7 @@ S2N_BN_SYMBOL(bignum_sub):
75 cmp p, n 76 cmp p, n
76 cmovc n, p 77 cmovc n, p
77 cmp m, n 78 cmp m, n
78 jc ylonger 79 jc bignum_sub_ylonger
79 80
80// The case where x is longer or of the same size (p >= m >= n) 81// The case where x is longer or of the same size (p >= m >= n)
81 82
@@ -83,32 +84,32 @@ S2N_BN_SYMBOL(bignum_sub):
83 sub m, n 84 sub m, n
84 inc m 85 inc m
85 test n, n 86 test n, n
86 jz xtest 87 jz bignum_sub_xtest
87xmainloop: 88bignum_sub_xmainloop:
88 mov a, [x+8*i] 89 mov a, [x+8*i]
89 sbb a, [y+8*i] 90 sbb a, [y+8*i]
90 mov [z+8*i],a 91 mov [z+8*i],a
91 inc i 92 inc i
92 dec n 93 dec n
93 jnz xmainloop 94 jnz bignum_sub_xmainloop
94 jmp xtest 95 jmp bignum_sub_xtest
95xtoploop: 96bignum_sub_xtoploop:
96 mov a, [x+8*i] 97 mov a, [x+8*i]
97 sbb a, 0 98 sbb a, 0
98 mov [z+8*i],a 99 mov [z+8*i],a
99 inc i 100 inc i
100xtest: 101bignum_sub_xtest:
101 dec m 102 dec m
102 jnz xtoploop 103 jnz bignum_sub_xtoploop
103 sbb a, a 104 sbb a, a
104 test p, p 105 test p, p
105 jz tailskip 106 jz bignum_sub_tailskip
106tailloop: 107bignum_sub_tailloop:
107 mov [z+8*i],a 108 mov [z+8*i],a
108 inc i 109 inc i
109 dec p 110 dec p
110 jnz tailloop 111 jnz bignum_sub_tailloop
111tailskip: 112bignum_sub_tailskip:
112 neg a 113 neg a
113#if WINDOWS_ABI 114#if WINDOWS_ABI
114 pop rsi 115 pop rsi
@@ -118,29 +119,29 @@ tailskip:
118 119
119// The case where y is longer (p >= n > m) 120// The case where y is longer (p >= n > m)
120 121
121ylonger: 122bignum_sub_ylonger:
122 123
123 sub p, n 124 sub p, n
124 sub n, m 125 sub n, m
125 test m, m 126 test m, m
126 jz ytoploop 127 jz bignum_sub_ytoploop
127ymainloop: 128bignum_sub_ymainloop:
128 mov a, [x+8*i] 129 mov a, [x+8*i]
129 sbb a, [y+8*i] 130 sbb a, [y+8*i]
130 mov [z+8*i],a 131 mov [z+8*i],a
131 inc i 132 inc i
132 dec m 133 dec m
133 jnz ymainloop 134 jnz bignum_sub_ymainloop
134ytoploop: 135bignum_sub_ytoploop:
135 mov ashort, 0 136 mov ashort, 0
136 sbb a, [y+8*i] 137 sbb a, [y+8*i]
137 mov [z+8*i],a 138 mov [z+8*i],a
138 inc i 139 inc i
139 dec n 140 dec n
140 jnz ytoploop 141 jnz bignum_sub_ytoploop
141 sbb a, a 142 sbb a, a
142 test p, p 143 test p, p
143 jnz tailloop 144 jnz bignum_sub_tailloop
144 neg a 145 neg a
145#if WINDOWS_ABI 146#if WINDOWS_ABI
146 pop rsi 147 pop rsi
diff --git a/src/lib/libcrypto/bn/arch/amd64/bn_arch.c b/src/lib/libcrypto/bn/arch/amd64/bn_arch.c
index a377a05681..9ff8920ca2 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bn_arch.c
+++ b/src/lib/libcrypto/bn/arch/amd64/bn_arch.c
@@ -1,4 +1,4 @@
1/* $OpenBSD: bn_arch.c,v 1.7 2023/06/24 16:01:44 jsing Exp $ */ 1/* $OpenBSD: bn_arch.c,v 1.12 2025/08/14 15:29:17 jsing Exp $ */
2/* 2/*
3 * Copyright (c) 2023 Joel Sing <jsing@openbsd.org> 3 * Copyright (c) 2023 Joel Sing <jsing@openbsd.org>
4 * 4 *
@@ -19,6 +19,7 @@
19 19
20#include "bn_arch.h" 20#include "bn_arch.h"
21#include "bn_local.h" 21#include "bn_local.h"
22#include "crypto_arch.h"
22#include "s2n_bignum.h" 23#include "s2n_bignum.h"
23 24
24#ifdef HAVE_BN_ADD 25#ifdef HAVE_BN_ADD
@@ -26,8 +27,8 @@ BN_ULONG
26bn_add(BN_ULONG *r, int r_len, const BN_ULONG *a, int a_len, const BN_ULONG *b, 27bn_add(BN_ULONG *r, int r_len, const BN_ULONG *a, int a_len, const BN_ULONG *b,
27 int b_len) 28 int b_len)
28{ 29{
29 return bignum_add(r_len, (uint64_t *)r, a_len, (uint64_t *)a, 30 return bignum_add(r_len, (uint64_t *)r, a_len, (const uint64_t *)a,
30 b_len, (uint64_t *)b); 31 b_len, (const uint64_t *)b);
31} 32}
32#endif 33#endif
33 34
@@ -36,8 +37,8 @@ bn_add(BN_ULONG *r, int r_len, const BN_ULONG *a, int a_len, const BN_ULONG *b,
36BN_ULONG 37BN_ULONG
37bn_add_words(BN_ULONG *rd, const BN_ULONG *ad, const BN_ULONG *bd, int n) 38bn_add_words(BN_ULONG *rd, const BN_ULONG *ad, const BN_ULONG *bd, int n)
38{ 39{
39 return bignum_add(n, (uint64_t *)rd, n, (uint64_t *)ad, n, 40 return bignum_add(n, (uint64_t *)rd, n, (const uint64_t *)ad, n,
40 (uint64_t *)bd); 41 (const uint64_t *)bd);
41} 42}
42#endif 43#endif
43 44
@@ -46,8 +47,8 @@ BN_ULONG
46bn_sub(BN_ULONG *r, int r_len, const BN_ULONG *a, int a_len, const BN_ULONG *b, 47bn_sub(BN_ULONG *r, int r_len, const BN_ULONG *a, int a_len, const BN_ULONG *b,
47 int b_len) 48 int b_len)
48{ 49{
49 return bignum_sub(r_len, (uint64_t *)r, a_len, (uint64_t *)a, 50 return bignum_sub(r_len, (uint64_t *)r, a_len, (const uint64_t *)a,
50 b_len, (uint64_t *)b); 51 b_len, (const uint64_t *)b);
51} 52}
52#endif 53#endif
53 54
@@ -55,8 +56,28 @@ bn_sub(BN_ULONG *r, int r_len, const BN_ULONG *a, int a_len, const BN_ULONG *b,
55BN_ULONG 56BN_ULONG
56bn_sub_words(BN_ULONG *rd, const BN_ULONG *ad, const BN_ULONG *bd, int n) 57bn_sub_words(BN_ULONG *rd, const BN_ULONG *ad, const BN_ULONG *bd, int n)
57{ 58{
58 return bignum_sub(n, (uint64_t *)rd, n, (uint64_t *)ad, n, 59 return bignum_sub(n, (uint64_t *)rd, n, (const uint64_t *)ad, n,
59 (uint64_t *)bd); 60 (const uint64_t *)bd);
61}
62#endif
63
64#ifdef HAVE_BN_MOD_ADD_WORDS
/*
 * Thin wrapper that forwards to the s2n-bignum routine bignum_modadd(),
 * passing n as the word count and casting the BN_ULONG pointers to
 * uint64_t pointers. Nothing is returned.
 * NOTE(review): presumably computes r = (a + b) mod m over n-word
 * operands - confirm against the bignum_modadd contract.
 */
65void
66bn_mod_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
67 const BN_ULONG *m, size_t n)
68{
69 bignum_modadd(n, (uint64_t *)r, (const uint64_t *)a,
70 (const uint64_t *)b, (const uint64_t *)m);
71}
72#endif
73
74#ifdef HAVE_BN_MOD_SUB_WORDS
75void
76bn_mod_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
77 const BN_ULONG *m, size_t n)
78{
79 bignum_modsub(n, (uint64_t *)r, (const uint64_t *)a,
80 (const uint64_t *)b, (const uint64_t *)m);
60} 81}
61#endif 82#endif
62 83
@@ -64,7 +85,7 @@ bn_sub_words(BN_ULONG *rd, const BN_ULONG *ad, const BN_ULONG *bd, int n)
64BN_ULONG 85BN_ULONG
65bn_mul_add_words(BN_ULONG *rd, const BN_ULONG *ad, int num, BN_ULONG w) 86bn_mul_add_words(BN_ULONG *rd, const BN_ULONG *ad, int num, BN_ULONG w)
66{ 87{
67 return bignum_cmadd(num, (uint64_t *)rd, w, num, (uint64_t *)ad); 88 return bignum_cmadd(num, (uint64_t *)rd, w, num, (const uint64_t *)ad);
68} 89}
69#endif 90#endif
70 91
@@ -72,25 +93,52 @@ bn_mul_add_words(BN_ULONG *rd, const BN_ULONG *ad, int num, BN_ULONG w)
72BN_ULONG 93BN_ULONG
73bn_mul_words(BN_ULONG *rd, const BN_ULONG *ad, int num, BN_ULONG w) 94bn_mul_words(BN_ULONG *rd, const BN_ULONG *ad, int num, BN_ULONG w)
74{ 95{
75 return bignum_cmul(num, (uint64_t *)rd, w, num, (uint64_t *)ad); 96 return bignum_cmul(num, (uint64_t *)rd, w, num, (const uint64_t *)ad);
76} 97}
77#endif 98#endif
78 99
79#ifdef HAVE_BN_MUL_COMBA4 100#ifdef HAVE_BN_MUL_COMBA4
80void 101void
81bn_mul_comba4(BN_ULONG *rd, BN_ULONG *ad, BN_ULONG *bd) 102bn_mul_comba4(BN_ULONG *rd, const BN_ULONG *ad, const BN_ULONG *bd)
82{ 103{
83 /* XXX - consider using non-alt on CPUs that have the ADX extension. */ 104 if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_ADX) != 0) {
84 bignum_mul_4_8_alt((uint64_t *)rd, (uint64_t *)ad, (uint64_t *)bd); 105 bignum_mul_4_8((uint64_t *)rd, (const uint64_t *)ad,
106 (const uint64_t *)bd);
107 return;
108 }
109
110 bignum_mul_4_8_alt((uint64_t *)rd, (const uint64_t *)ad,
111 (const uint64_t *)bd);
112}
113#endif
114
115#ifdef HAVE_BN_MUL_COMBA6
116void
117bn_mul_comba6(BN_ULONG *rd, const BN_ULONG *ad, const BN_ULONG *bd)
118{
119 if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_ADX) != 0) {
120 bignum_mul_6_12((uint64_t *)rd, (const uint64_t *)ad,
121 (const uint64_t *)bd);
122 return;
123 }
124
125 bignum_mul_6_12_alt((uint64_t *)rd, (const uint64_t *)ad,
126 (const uint64_t *)bd);
85} 127}
86#endif 128#endif
87 129
88#ifdef HAVE_BN_MUL_COMBA8 130#ifdef HAVE_BN_MUL_COMBA8
89void 131void
90bn_mul_comba8(BN_ULONG *rd, BN_ULONG *ad, BN_ULONG *bd) 132bn_mul_comba8(BN_ULONG *rd, const BN_ULONG *ad, const BN_ULONG *bd)
91{ 133{
92 /* XXX - consider using non-alt on CPUs that have the ADX extension. */ 134 if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_ADX) != 0) {
93 bignum_mul_8_16_alt((uint64_t *)rd, (uint64_t *)ad, (uint64_t *)bd); 135 bignum_mul_8_16((uint64_t *)rd, (const uint64_t *)ad,
136 (const uint64_t *)bd);
137 return;
138 }
139
140 bignum_mul_8_16_alt((uint64_t *)rd, (const uint64_t *)ad,
141 (const uint64_t *)bd);
94} 142}
95#endif 143#endif
96 144
@@ -98,7 +146,7 @@ bn_mul_comba8(BN_ULONG *rd, BN_ULONG *ad, BN_ULONG *bd)
98int 146int
99bn_sqr(BIGNUM *r, const BIGNUM *a, int r_len, BN_CTX *ctx) 147bn_sqr(BIGNUM *r, const BIGNUM *a, int r_len, BN_CTX *ctx)
100{ 148{
101 bignum_sqr(r_len, (uint64_t *)r->d, a->top, (uint64_t *)a->d); 149 bignum_sqr(r_len, (uint64_t *)r->d, a->top, (const uint64_t *)a->d);
102 150
103 return 1; 151 return 1;
104} 152}
@@ -108,8 +156,25 @@ bn_sqr(BIGNUM *r, const BIGNUM *a, int r_len, BN_CTX *ctx)
108void 156void
109bn_sqr_comba4(BN_ULONG *rd, const BN_ULONG *ad) 157bn_sqr_comba4(BN_ULONG *rd, const BN_ULONG *ad)
110{ 158{
111 /* XXX - consider using non-alt on CPUs that have the ADX extension. */ 159 if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_ADX) != 0) {
112 bignum_sqr_4_8_alt((uint64_t *)rd, (uint64_t *)ad); 160 bignum_sqr_4_8((uint64_t *)rd, (const uint64_t *)ad);
161 return;
162 }
163
164 bignum_sqr_4_8_alt((uint64_t *)rd, (const uint64_t *)ad);
165}
166#endif
167
168#ifdef HAVE_BN_SQR_COMBA6
169void
170bn_sqr_comba6(BN_ULONG *rd, const BN_ULONG *ad)
171{
172 if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_ADX) != 0) {
173 bignum_sqr_6_12((uint64_t *)rd, (const uint64_t *)ad);
174 return;
175 }
176
177 bignum_sqr_6_12_alt((uint64_t *)rd, (const uint64_t *)ad);
113} 178}
114#endif 179#endif
115 180
@@ -117,8 +182,12 @@ bn_sqr_comba4(BN_ULONG *rd, const BN_ULONG *ad)
117void 182void
118bn_sqr_comba8(BN_ULONG *rd, const BN_ULONG *ad) 183bn_sqr_comba8(BN_ULONG *rd, const BN_ULONG *ad)
119{ 184{
120 /* XXX - consider using non-alt on CPUs that have the ADX extension. */ 185 if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_ADX) != 0) {
121 bignum_sqr_8_16_alt((uint64_t *)rd, (uint64_t *)ad); 186 bignum_sqr_8_16((uint64_t *)rd, (const uint64_t *)ad);
187 return;
188 }
189
190 bignum_sqr_8_16_alt((uint64_t *)rd, (const uint64_t *)ad);
122} 191}
123#endif 192#endif
124 193
diff --git a/src/lib/libcrypto/bn/arch/amd64/bn_arch.h b/src/lib/libcrypto/bn/arch/amd64/bn_arch.h
index 927cd75208..7359f993a7 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bn_arch.h
+++ b/src/lib/libcrypto/bn/arch/amd64/bn_arch.h
@@ -1,4 +1,4 @@
1/* $OpenBSD: bn_arch.h,v 1.14 2024/03/26 06:09:25 jsing Exp $ */ 1/* $OpenBSD: bn_arch.h,v 1.16 2025/08/14 15:22:54 jsing Exp $ */
2/* 2/*
3 * Copyright (c) 2023 Joel Sing <jsing@openbsd.org> 3 * Copyright (c) 2023 Joel Sing <jsing@openbsd.org>
4 * 4 *
@@ -27,13 +27,18 @@
27 27
28#define HAVE_BN_DIV_WORDS 28#define HAVE_BN_DIV_WORDS
29 29
30#define HAVE_BN_MOD_ADD_WORDS
31#define HAVE_BN_MOD_SUB_WORDS
32
30#define HAVE_BN_MUL_ADD_WORDS 33#define HAVE_BN_MUL_ADD_WORDS
31#define HAVE_BN_MUL_COMBA4 34#define HAVE_BN_MUL_COMBA4
35#define HAVE_BN_MUL_COMBA6
32#define HAVE_BN_MUL_COMBA8 36#define HAVE_BN_MUL_COMBA8
33#define HAVE_BN_MUL_WORDS 37#define HAVE_BN_MUL_WORDS
34 38
35#define HAVE_BN_SQR 39#define HAVE_BN_SQR
36#define HAVE_BN_SQR_COMBA4 40#define HAVE_BN_SQR_COMBA4
41#define HAVE_BN_SQR_COMBA6
37#define HAVE_BN_SQR_COMBA8 42#define HAVE_BN_SQR_COMBA8
38 43
39#define HAVE_BN_SUB 44#define HAVE_BN_SUB
diff --git a/src/lib/libcrypto/bn/arch/amd64/word_clz.S b/src/lib/libcrypto/bn/arch/amd64/word_clz.S
index 3926fcd4b0..705fbdbbda 100644
--- a/src/lib/libcrypto/bn/arch/amd64/word_clz.S
+++ b/src/lib/libcrypto/bn/arch/amd64/word_clz.S
@@ -1,3 +1,5 @@
1// $OpenBSD: word_clz.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
2//
1// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// 4//
3// Permission to use, copy, modify, and/or distribute this software for any 5// Permission to use, copy, modify, and/or distribute this software for any
@@ -16,7 +18,7 @@
16// Count leading zero bits in a single word 18// Count leading zero bits in a single word
17// Input a; output function return 19// Input a; output function return
18// 20//
19// extern uint64_t word_clz (uint64_t a); 21// extern uint64_t word_clz(uint64_t a);
20// 22//
21// Standard x86-64 ABI: RDI = a, returns RAX 23// Standard x86-64 ABI: RDI = a, returns RAX
22// Microsoft x64 ABI: RCX = a, returns RAX 24// Microsoft x64 ABI: RCX = a, returns RAX
@@ -30,7 +32,7 @@
30 .text 32 .text
31 33
32S2N_BN_SYMBOL(word_clz): 34S2N_BN_SYMBOL(word_clz):
33 _CET_ENDBR 35 _CET_ENDBR
34 36
35#if WINDOWS_ABI 37#if WINDOWS_ABI
36 push rdi 38 push rdi