summaryrefslogtreecommitdiff
path: root/src/lib/libcrypto/bn
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_add.S51
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_cmadd.S32
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_cmul.S28
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_modadd.S112
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_modsub.S99
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_mul.S25
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8.S187
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8_alt.S8
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12.S223
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12_alt.S199
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16.S273
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16_alt.S9
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_sqr.S29
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8.S158
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8_alt.S8
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12.S227
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12_alt.S210
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16.S311
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16_alt.S7
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_sub.S47
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bn_arch.c113
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bn_arch.h7
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/word_clz.S6
-rw-r--r--src/lib/libcrypto/bn/bn_internal.h4
-rw-r--r--src/lib/libcrypto/bn/bn_local.h8
-rw-r--r--src/lib/libcrypto/bn/bn_mod_words.c40
-rw-r--r--src/lib/libcrypto/bn/bn_mont.c5
-rw-r--r--src/lib/libcrypto/bn/bn_mul.c73
-rw-r--r--src/lib/libcrypto/bn/bn_sqr.c49
-rw-r--r--src/lib/libcrypto/bn/s2n_bignum.h793
-rw-r--r--src/lib/libcrypto/bn/s2n_bignum_internal.h35
31 files changed, 2988 insertions, 388 deletions
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_add.S b/src/lib/libcrypto/bn/arch/amd64/bignum_add.S
index 5fe4aae7a1..1d4e6d08ef 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_add.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_add.S
@@ -1,3 +1,5 @@
1// $OpenBSD: bignum_add.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
2//
1// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// 4//
3// Permission to use, copy, modify, and/or distribute this software for any 5// Permission to use, copy, modify, and/or distribute this software for any
@@ -16,9 +18,8 @@
16// Add, z := x + y 18// Add, z := x + y
17// Inputs x[m], y[n]; outputs function return (carry-out) and z[p] 19// Inputs x[m], y[n]; outputs function return (carry-out) and z[p]
18// 20//
19// extern uint64_t bignum_add 21// extern uint64_t bignum_add(uint64_t p, uint64_t *z, uint64_t m,
20// (uint64_t p, uint64_t *z, 22// const uint64_t *x, uint64_t n, const uint64_t *y);
21// uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
22// 23//
23// Does the z := x + y operation, truncating modulo p words in general and 24// Does the z := x + y operation, truncating modulo p words in general and
24// returning a top carry (0 or 1) in the p'th place, only adding the input 25// returning a top carry (0 or 1) in the p'th place, only adding the input
@@ -49,7 +50,7 @@
49 50
50 51
51S2N_BN_SYMBOL(bignum_add): 52S2N_BN_SYMBOL(bignum_add):
52 _CET_ENDBR 53 _CET_ENDBR
53 54
54#if WINDOWS_ABI 55#if WINDOWS_ABI
55 push rdi 56 push rdi
@@ -75,7 +76,7 @@ S2N_BN_SYMBOL(bignum_add):
75 cmp p, n 76 cmp p, n
76 cmovc n, p 77 cmovc n, p
77 cmp m, n 78 cmp m, n
78 jc ylonger 79 jc bignum_add_ylonger
79 80
80// The case where x is longer or of the same size (p >= m >= n) 81// The case where x is longer or of the same size (p >= m >= n)
81 82
@@ -83,27 +84,27 @@ S2N_BN_SYMBOL(bignum_add):
83 sub m, n 84 sub m, n
84 inc m 85 inc m
85 test n, n 86 test n, n
86 jz xtest 87 jz bignum_add_xtest
87xmainloop: 88bignum_add_xmainloop:
88 mov a, [x+8*i] 89 mov a, [x+8*i]
89 adc a, [y+8*i] 90 adc a, [y+8*i]
90 mov [z+8*i],a 91 mov [z+8*i],a
91 inc i 92 inc i
92 dec n 93 dec n
93 jnz xmainloop 94 jnz bignum_add_xmainloop
94 jmp xtest 95 jmp bignum_add_xtest
95xtoploop: 96bignum_add_xtoploop:
96 mov a, [x+8*i] 97 mov a, [x+8*i]
97 adc a, 0 98 adc a, 0
98 mov [z+8*i],a 99 mov [z+8*i],a
99 inc i 100 inc i
100xtest: 101bignum_add_xtest:
101 dec m 102 dec m
102 jnz xtoploop 103 jnz bignum_add_xtoploop
103 mov ashort, 0 104 mov ashort, 0
104 adc a, 0 105 adc a, 0
105 test p, p 106 test p, p
106 jnz tails 107 jnz bignum_add_tails
107#if WINDOWS_ABI 108#if WINDOWS_ABI
108 pop rsi 109 pop rsi
109 pop rdi 110 pop rdi
@@ -112,30 +113,30 @@ xtest:
112 113
113// The case where y is longer (p >= n > m) 114// The case where y is longer (p >= n > m)
114 115
115ylonger: 116bignum_add_ylonger:
116 117
117 sub p, n 118 sub p, n
118 sub n, m 119 sub n, m
119 test m, m 120 test m, m
120 jz ytoploop 121 jz bignum_add_ytoploop
121ymainloop: 122bignum_add_ymainloop:
122 mov a, [x+8*i] 123 mov a, [x+8*i]
123 adc a, [y+8*i] 124 adc a, [y+8*i]
124 mov [z+8*i],a 125 mov [z+8*i],a
125 inc i 126 inc i
126 dec m 127 dec m
127 jnz ymainloop 128 jnz bignum_add_ymainloop
128ytoploop: 129bignum_add_ytoploop:
129 mov a, [y+8*i] 130 mov a, [y+8*i]
130 adc a, 0 131 adc a, 0
131 mov [z+8*i],a 132 mov [z+8*i],a
132 inc i 133 inc i
133 dec n 134 dec n
134 jnz ytoploop 135 jnz bignum_add_ytoploop
135 mov ashort, 0 136 mov ashort, 0
136 adc a, 0 137 adc a, 0
137 test p, p 138 test p, p
138 jnz tails 139 jnz bignum_add_tails
139#if WINDOWS_ABI 140#if WINDOWS_ABI
140 pop rsi 141 pop rsi
141 pop rdi 142 pop rdi
@@ -144,16 +145,16 @@ ytoploop:
144 145
145// Adding a non-trivial tail, when p > max(m,n) 146// Adding a non-trivial tail, when p > max(m,n)
146 147
147tails: 148bignum_add_tails:
148 mov [z+8*i],a 149 mov [z+8*i],a
149 xor a, a 150 xor a, a
150 jmp tail 151 jmp bignum_add_tail
151tailloop: 152bignum_add_tailloop:
152 mov [z+8*i],a 153 mov [z+8*i],a
153tail: 154bignum_add_tail:
154 inc i 155 inc i
155 dec p 156 dec p
156 jnz tailloop 157 jnz bignum_add_tailloop
157#if WINDOWS_ABI 158#if WINDOWS_ABI
158 pop rsi 159 pop rsi
159 pop rdi 160 pop rdi
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_cmadd.S b/src/lib/libcrypto/bn/arch/amd64/bignum_cmadd.S
index 25ba17bce2..a611919603 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_cmadd.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_cmadd.S
@@ -1,3 +1,5 @@
1// $OpenBSD: bignum_cmadd.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
2//
1// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// 4//
3// Permission to use, copy, modify, and/or distribute this software for any 5// Permission to use, copy, modify, and/or distribute this software for any
@@ -16,8 +18,8 @@
16// Multiply-add with single-word multiplier, z := z + c * y 18// Multiply-add with single-word multiplier, z := z + c * y
17// Inputs c, y[n]; outputs function return (carry-out) and z[k] 19// Inputs c, y[n]; outputs function return (carry-out) and z[k]
18// 20//
19// extern uint64_t bignum_cmadd 21// extern uint64_t bignum_cmadd(uint64_t k, uint64_t *z, uint64_t c, uint64_t n,
20// (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y); 22// const uint64_t *y);
21// 23//
22// Does the "z := z + c * y" operation where y is n digits, result z is p. 24// Does the "z := z + c * y" operation where y is n digits, result z is p.
23// Truncates the result in general. 25// Truncates the result in general.
@@ -54,7 +56,7 @@
54 56
55 57
56S2N_BN_SYMBOL(bignum_cmadd): 58S2N_BN_SYMBOL(bignum_cmadd):
57 _CET_ENDBR 59 _CET_ENDBR
58 60
59#if WINDOWS_ABI 61#if WINDOWS_ABI
60 push rdi 62 push rdi
@@ -82,7 +84,7 @@ S2N_BN_SYMBOL(bignum_cmadd):
82 84
83 xor h, h 85 xor h, h
84 test n, n 86 test n, n
85 jz end 87 jz bignum_cmadd_end
86 88
87// Move c into a safer register as multiplies overwrite rdx 89// Move c into a safer register as multiplies overwrite rdx
88 90
@@ -96,11 +98,11 @@ S2N_BN_SYMBOL(bignum_cmadd):
96 mov h, rdx 98 mov h, rdx
97 mov ishort, 1 99 mov ishort, 1
98 dec n 100 dec n
99 jz hightail 101 jz bignum_cmadd_hightail
100 102
101// Main loop, where we always have CF + previous high part h to add in 103// Main loop, where we always have CF + previous high part h to add in
102 104
103loop: 105bignum_cmadd_loop:
104 adc h, [z+8*i] 106 adc h, [z+8*i]
105 sbb r, r 107 sbb r, r
106 mov rax, [x+8*i] 108 mov rax, [x+8*i]
@@ -111,36 +113,36 @@ loop:
111 mov h, rdx 113 mov h, rdx
112 inc i 114 inc i
113 dec n 115 dec n
114 jnz loop 116 jnz bignum_cmadd_loop
115 117
116hightail: 118bignum_cmadd_hightail:
117 adc h, 0 119 adc h, 0
118 120
119// Propagate the carry all the way to the end with h as extra carry word 121// Propagate the carry all the way to the end with h as extra carry word
120 122
121tail: 123bignum_cmadd_tail:
122 test p, p 124 test p, p
123 jz end 125 jz bignum_cmadd_end
124 126
125 add [z+8*i], h 127 add [z+8*i], h
126 mov hshort, 0 128 mov hshort, 0
127 inc i 129 inc i
128 dec p 130 dec p
129 jz highend 131 jz bignum_cmadd_highend
130 132
131tloop: 133bignum_cmadd_tloop:
132 adc [z+8*i], h 134 adc [z+8*i], h
133 inc i 135 inc i
134 dec p 136 dec p
135 jnz tloop 137 jnz bignum_cmadd_tloop
136 138
137highend: 139bignum_cmadd_highend:
138 140
139 adc h, 0 141 adc h, 0
140 142
141// Return the high/carry word 143// Return the high/carry word
142 144
143end: 145bignum_cmadd_end:
144 mov rax, h 146 mov rax, h
145 147
146 pop rbx 148 pop rbx
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_cmul.S b/src/lib/libcrypto/bn/arch/amd64/bignum_cmul.S
index 12f785d63a..eb71d9da44 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_cmul.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_cmul.S
@@ -1,3 +1,5 @@
1// $OpenBSD: bignum_cmul.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
2//
1// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// 4//
3// Permission to use, copy, modify, and/or distribute this software for any 5// Permission to use, copy, modify, and/or distribute this software for any
@@ -16,8 +18,8 @@
16// Multiply by a single word, z := c * y 18// Multiply by a single word, z := c * y
17// Inputs c, y[n]; outputs function return (carry-out) and z[k] 19// Inputs c, y[n]; outputs function return (carry-out) and z[k]
18// 20//
19// extern uint64_t bignum_cmul 21// extern uint64_t bignum_cmul(uint64_t k, uint64_t *z, uint64_t c, uint64_t n,
20// (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y); 22// const uint64_t *y);
21// 23//
22// Does the "z := c * y" operation where y is n digits, result z is p. 24// Does the "z := c * y" operation where y is n digits, result z is p.
23// Truncates the result in general unless p >= n + 1. 25// Truncates the result in general unless p >= n + 1.
@@ -51,7 +53,7 @@
51 53
52 54
53S2N_BN_SYMBOL(bignum_cmul): 55S2N_BN_SYMBOL(bignum_cmul):
54 _CET_ENDBR 56 _CET_ENDBR
55 57
56#if WINDOWS_ABI 58#if WINDOWS_ABI
57 push rdi 59 push rdi
@@ -76,7 +78,7 @@ S2N_BN_SYMBOL(bignum_cmul):
76 xor h, h 78 xor h, h
77 xor i, i 79 xor i, i
78 test n, n 80 test n, n
79 jz tail 81 jz bignum_cmul_tail
80 82
81// Move c into a safer register as multiplies overwrite rdx 83// Move c into a safer register as multiplies overwrite rdx
82 84
@@ -90,11 +92,11 @@ S2N_BN_SYMBOL(bignum_cmul):
90 mov h, rdx 92 mov h, rdx
91 inc i 93 inc i
92 cmp i, n 94 cmp i, n
93 jz tail 95 jz bignum_cmul_tail
94 96
95// Main loop doing the multiplications 97// Main loop doing the multiplications
96 98
97loop: 99bignum_cmul_loop:
98 mov rax, [x+8*i] 100 mov rax, [x+8*i]
99 mul c 101 mul c
100 add rax, h 102 add rax, h
@@ -103,28 +105,28 @@ loop:
103 mov h, rdx 105 mov h, rdx
104 inc i 106 inc i
105 cmp i, n 107 cmp i, n
106 jc loop 108 jc bignum_cmul_loop
107 109
108// Add a tail when the destination is longer 110// Add a tail when the destination is longer
109 111
110tail: 112bignum_cmul_tail:
111 cmp i, p 113 cmp i, p
112 jnc end 114 jnc bignum_cmul_end
113 mov [z+8*i], h 115 mov [z+8*i], h
114 xor h, h 116 xor h, h
115 inc i 117 inc i
116 cmp i, p 118 cmp i, p
117 jnc end 119 jnc bignum_cmul_end
118 120
119tloop: 121bignum_cmul_tloop:
120 mov [z+8*i], h 122 mov [z+8*i], h
121 inc i 123 inc i
122 cmp i, p 124 cmp i, p
123 jc tloop 125 jc bignum_cmul_tloop
124 126
125// Return the high/carry word 127// Return the high/carry word
126 128
127end: 129bignum_cmul_end:
128 mov rax, h 130 mov rax, h
129 131
130#if WINDOWS_ABI 132#if WINDOWS_ABI
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_modadd.S b/src/lib/libcrypto/bn/arch/amd64/bignum_modadd.S
new file mode 100644
index 0000000000..baf27fdc7f
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_modadd.S
@@ -0,0 +1,112 @@
1// $OpenBSD: bignum_modadd.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
2//
3// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
4//
5// Permission to use, copy, modify, and/or distribute this software for any
6// purpose with or without fee is hereby granted, provided that the above
7// copyright notice and this permission notice appear in all copies.
8//
9// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16
17// ----------------------------------------------------------------------------
18// Add modulo m, z := (x + y) mod m, assuming x and y reduced
19// Inputs x[k], y[k], m[k]; output z[k]
20//
21// extern void bignum_modadd(uint64_t k, uint64_t *z, const uint64_t *x,
22// const uint64_t *y, const uint64_t *m);
23//
24// Standard x86-64 ABI: RDI = k, RSI = z, RDX = x, RCX = y, R8 = m
25// Microsoft x64 ABI: RCX = k, RDX = z, R8 = x, R9 = y, [RSP+40] = m
26// ----------------------------------------------------------------------------
27
28#include "s2n_bignum_internal.h"
29
30 .intel_syntax noprefix
31 S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_modadd)
32 S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_modadd)
33 .text
34
35#define k rdi
36#define z rsi
37#define x rdx
38#define y rcx
39#define m r8
40#define i r9
41#define j r10
42#define a rax
43#define c r11
44
// bignum_modadd entry point: z := (x + y) mod m on k-word operands.
// Works in three passes over the words: add, trial-subtract m to build a
// mask, then subtract m under that mask. Each pass runs exactly k
// iterations; the loop counts depend only on k, not on the operand values.
// Scratch registers: rax (a), r9 (i), r10 (j), r11 (c).
45S2N_BN_SYMBOL(bignum_modadd):
46 _CET_ENDBR
47
// Windows shim: rdi/rsi are callee-saved in the Microsoft ABI, so save them
// and move the arguments into the standard-ABI registers the body uses.
// The fifth argument m sits at [RSP+40] on entry; the two pushes move it
// to [rsp+56].
48#if WINDOWS_ABI
49 push rdi
50 push rsi
51 mov rdi, rcx
52 mov rsi, rdx
53 mov rdx, r8
54 mov rcx, r9
55 mov r8, [rsp+56]
56#endif
57
58// If k = 0 do nothing
59
60 test k, k
61 jz bignum_modadd_end
62
63// First just add (c::z) := x + y
// The xor instructions clear CF for the first adc; inc/dec do not modify
// CF, so the carry survives from one word to the next. j counts the k
// words down while i indexes them.
64
65 xor c, c
66 mov j, k
67 xor i, i
68bignum_modadd_addloop:
69 mov a, [x+8*i]
70 adc a, [y+8*i]
71 mov [z+8*i], a
72 inc i
73 dec j
74 jnz bignum_modadd_addloop
// c := carry-out of the k-word sum (0 or 1)
75 adc c, 0
76
77// Now do a comparison subtraction (c::z) - m, recording mask for (c::z) >= m
// Only the borrow chain matters here: each difference in a is discarded.
// "xor i, i" also clears CF so the sbb chain starts with no borrow.
// After "sbb c, 0", c is 0 when (c::z) >= m and all ones when (c::z) < m.
// This relies on x, y < m as the function contract assumes: a carry-out of
// 1 then always comes with a borrow. "not c" flips that into a mask which
// is all ones exactly when m has to be subtracted.
78
79 mov j, k
80 xor i, i
81bignum_modadd_cmploop:
82 mov a, [z+8*i]
83 sbb a, [m+8*i]
84 inc i
85 dec j
86 jnz bignum_modadd_cmploop
87 sbb c, 0
88 not c
89
90// Now do a masked subtraction z := z - [c] * m
// The loop test "cmp i, k" overwrites CF, so the running borrow is parked in
// j as 0 or all ones by "sbb j, j" and moved back into CF by "neg j" at the
// top of the next iteration (neg sets CF when its operand is nonzero).
// j is 0 on entry, left by the countdown above, so the first iteration
// starts with no borrow.
91
92 xor i, i
93bignum_modadd_subloop:
94 mov a, [m+8*i]
95 and a, c
96 neg j
97 sbb [z+8*i], a
98 sbb j, j
99 inc i
100 cmp i, k
101 jc bignum_modadd_subloop
102
103bignum_modadd_end:
104#if WINDOWS_ABI
105 pop rsi
106 pop rdi
107#endif
108 ret
109
110#if defined(__linux__) && defined(__ELF__)
111.section .note.GNU-stack,"",%progbits
112#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_modsub.S b/src/lib/libcrypto/bn/arch/amd64/bignum_modsub.S
new file mode 100644
index 0000000000..63b3230e35
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_modsub.S
@@ -0,0 +1,99 @@
1// $OpenBSD: bignum_modsub.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
2//
3// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
4//
5// Permission to use, copy, modify, and/or distribute this software for any
6// purpose with or without fee is hereby granted, provided that the above
7// copyright notice and this permission notice appear in all copies.
8//
9// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16
17// ----------------------------------------------------------------------------
18// Subtract modulo m, z := (x - y) mod m, assuming x and y reduced
19// Inputs x[k], y[k], m[k]; output z[k]
20//
21// extern void bignum_modsub(uint64_t k, uint64_t *z, const uint64_t *x,
22// const uint64_t *y, const uint64_t *m);
23//
24// Standard x86-64 ABI: RDI = k, RSI = z, RDX = x, RCX = y, R8 = m
25// Microsoft x64 ABI: RCX = k, RDX = z, R8 = x, R9 = y, [RSP+40] = m
26// ----------------------------------------------------------------------------
27
28#include "s2n_bignum_internal.h"
29
30 .intel_syntax noprefix
31 S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_modsub)
32 S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_modsub)
33 .text
34
35#define k rdi
36#define z rsi
37#define x rdx
38#define y rcx
39#define m r8
40#define i r9
41#define j r10
42#define a rax
43#define c r11
44
// bignum_modsub entry point: z := (x - y) mod m on k-word operands.
// Works in two passes over the words: subtract, then add m back under a
// mask derived from the final borrow. Each pass runs exactly k iterations;
// the loop counts depend only on k, not on the operand values.
// Scratch registers: rax (a), r9 (i), r10 (j), r11 (c).
45S2N_BN_SYMBOL(bignum_modsub):
46 _CET_ENDBR
47
// Windows shim: rdi/rsi are callee-saved in the Microsoft ABI, so save them
// and move the arguments into the standard-ABI registers the body uses.
// The fifth argument m sits at [RSP+40] on entry; the two pushes move it
// to [rsp+56].
48#if WINDOWS_ABI
49 push rdi
50 push rsi
51 mov rdi, rcx
52 mov rsi, rdx
53 mov rdx, r8
54 mov rcx, r9
55 mov r8, [rsp+56]
56#endif
57
58// If k = 0 do nothing
59
60 test k, k
61 jz bignum_modsub_end
62
63// Subtract z := x - y and record a mask for the carry x - y < 0
// The xor instructions clear CF for the first sbb; inc/dec do not modify
// CF, so the borrow survives from one word to the next. j counts the k
// words down while i indexes them.
64
65 xor c, c
66 mov j, k
67 xor i, i
68bignum_modsub_subloop:
69 mov a, [x+8*i]
70 sbb a, [y+8*i]
71 mov [z+8*i], a
72 inc i
73 dec j
74 jnz bignum_modsub_subloop
// c := all ones if the subtraction borrowed (x < y), otherwise 0
75 sbb c, c
76
77// Now do a masked addition z := z + [c] * m
// The loop test "cmp i, k" overwrites CF, so the running carry is parked in
// j as 0 or all ones by "sbb j, j" and moved back into CF by "neg j" at the
// top of the next iteration (neg sets CF when its operand is nonzero).
// j is 0 on entry, left by the countdown above, so the first iteration
// starts with no carry.
78
79 xor i, i
80bignum_modsub_addloop:
81 mov a, [m+8*i]
82 and a, c
83 neg j
84 adc [z+8*i], a
85 sbb j, j
86 inc i
87 cmp i, k
88 jc bignum_modsub_addloop
89
90bignum_modsub_end:
91#if WINDOWS_ABI
92 pop rsi
93 pop rdi
94#endif
95 ret
96
97#if defined(__linux__) && defined(__ELF__)
98.section .note.GNU-stack,"",%progbits
99#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul.S
index a3552679a2..538cce9af7 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_mul.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul.S
@@ -1,3 +1,5 @@
1// $OpenBSD: bignum_mul.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
2//
1// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// 4//
3// Permission to use, copy, modify, and/or distribute this software for any 5// Permission to use, copy, modify, and/or distribute this software for any
@@ -16,9 +18,8 @@
16// Multiply z := x * y 18// Multiply z := x * y
17// Inputs x[m], y[n]; output z[k] 19// Inputs x[m], y[n]; output z[k]
18// 20//
19// extern void bignum_mul 21// extern void bignum_mul(uint64_t k, uint64_t *z, uint64_t m, const uint64_t *x,
20// (uint64_t k, uint64_t *z, 22// uint64_t n, const uint64_t *y);
21// uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
22// 23//
23// Does the "z := x * y" operation where x is m digits, y is n, result z is k. 24// Does the "z := x * y" operation where x is m digits, y is n, result z is k.
24// Truncates the result in general unless k >= m + n 25// Truncates the result in general unless k >= m + n
@@ -59,7 +60,7 @@
59 60
60 61
61S2N_BN_SYMBOL(bignum_mul): 62S2N_BN_SYMBOL(bignum_mul):
62 _CET_ENDBR 63 _CET_ENDBR
63 64
64#if WINDOWS_ABI 65#if WINDOWS_ABI
65 push rdi 66 push rdi
@@ -88,7 +89,7 @@ S2N_BN_SYMBOL(bignum_mul):
88// If we did a multiply-add variant, however, then we could 89// If we did a multiply-add variant, however, then we could
89 90
90 test p, p 91 test p, p
91 jz end 92 jz bignum_mul_end
92 93
93// Set initial 2-part sum to zero (we zero c inside the body) 94// Set initial 2-part sum to zero (we zero c inside the body)
94 95
@@ -99,7 +100,7 @@ S2N_BN_SYMBOL(bignum_mul):
99 100
100 xor k, k 101 xor k, k
101 102
102outerloop: 103bignum_mul_outerloop:
103 104
104// Zero our carry term first; we eventually want it and a zero is useful now 105// Zero our carry term first; we eventually want it and a zero is useful now
105// Set a = max 0 (k + 1 - n), i = min (k + 1) m 106// Set a = max 0 (k + 1 - n), i = min (k + 1) m
@@ -125,11 +126,11 @@ outerloop:
125 mov d, k 126 mov d, k
126 sub d, i 127 sub d, i
127 sub i, a 128 sub i, a
128 jbe innerend 129 jbe bignum_mul_innerend
129 lea x,[rcx+8*a] 130 lea x,[rcx+8*a]
130 lea y,[r9+8*d-8] 131 lea y,[r9+8*d-8]
131 132
132innerloop: 133bignum_mul_innerloop:
133 mov rax, [y+8*i] 134 mov rax, [y+8*i]
134 mul QWORD PTR [x] 135 mul QWORD PTR [x]
135 add x, 8 136 add x, 8
@@ -137,9 +138,9 @@ innerloop:
137 adc h, rdx 138 adc h, rdx
138 adc c, 0 139 adc c, 0
139 dec i 140 dec i
140 jnz innerloop 141 jnz bignum_mul_innerloop
141 142
142innerend: 143bignum_mul_innerend:
143 144
144 mov [z], l 145 mov [z], l
145 mov l, h 146 mov l, h
@@ -147,9 +148,9 @@ innerend:
147 add z, 8 148 add z, 8
148 149
149 cmp k, p 150 cmp k, p
150 jc outerloop 151 jc bignum_mul_outerloop
151 152
152end: 153bignum_mul_end:
153 pop r15 154 pop r15
154 pop r14 155 pop r14
155 pop r13 156 pop r13
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8.S
new file mode 100644
index 0000000000..d6ad514020
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8.S
@@ -0,0 +1,187 @@
1// $OpenBSD: bignum_mul_4_8.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
2//
3// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
4//
5// Permission to use, copy, modify, and/or distribute this software for any
6// purpose with or without fee is hereby granted, provided that the above
7// copyright notice and this permission notice appear in all copies.
8//
9// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16
17// ----------------------------------------------------------------------------
18// Multiply z := x * y
19// Inputs x[4], y[4]; output z[8]
20//
21// extern void bignum_mul_4_8(uint64_t z[static 8], const uint64_t x[static 4],
22// const uint64_t y[static 4]);
23//
24// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
25// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y
26// ----------------------------------------------------------------------------
27
28#include "s2n_bignum_internal.h"
29
30 .intel_syntax noprefix
31 S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_4_8)
32 S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_4_8)
33 .text
34
35// z and x are already in these registers (first two standard-ABI arguments)
36
37#define z rdi
38#define x rsi
39
40// Copied in or set up
41
42#define y rcx
43
44// A zero register
45
46#define zero rbp
47#define zeroe ebp
48
49// Add in x[i] * rdx to the (i,i+1) position with the register window
50// Would be nice to have conditional expressions reg[i], reg[i+1] ...
//
// mulpadd arg1, arg2
//   arg1 = row index (which word of y is currently in rdx)
//   arg2 = index of the word of x to multiply by
// mulx forms rdx * x[arg2] with the high word in rbx and the low word in
// rax, without touching the flags. The low word is added on the CF chain
// (adcx) to window register number (arg1 + arg2) mod 4, and the high word
// is added on the OF chain (adox) to the next register in the cycle
// r8 -> r9 -> r10 -> r11 -> r8.
// Clobbers rax and rbx; updates CF and OF as running carries.
51
52.macro mulpadd arg1,arg2
53 mulx rbx, rax, [x+8*\arg2]
54.if ((\arg1 + \arg2) % 4 == 0)
55 adcx r8, rax
56 adox r9, rbx
57.elseif ((\arg1 + \arg2) % 4 == 1)
58 adcx r9, rax
59 adox r10, rbx
60.elseif ((\arg1 + \arg2) % 4 == 2)
61 adcx r10, rax
62 adox r11, rbx
63.elseif ((\arg1 + \arg2) % 4 == 3)
64 adcx r11, rax
65 adox r8, rbx
66.endif
67
68.endm
69
70
71// Add in the whole j'th row
//
// addrow arg1
//   Accumulates y[arg1] * x[0..3] into the 4-register window and stores
//   the finished result word z[arg1].
//   - rdx is loaded with y[arg1]; "xor zeroe, zeroe" zeroes rbp and clears
//     both CF and OF so the two carry chains start clean.
//   - After the x[0] term, window register number (arg1 mod 4) holds column
//     arg1. It is stored and is then free to become the new top word.
//   - The x[3] term writes its high word straight into that freed register;
//     the trailing adox/adcx with zero fold the outstanding OF and CF
//     carries into it.
//   Clobbers rax, rbx, rdx and rbp (left as zero).
72
73.macro addrow arg1
74 mov rdx, [y+8*\arg1]
75 xor zeroe, zeroe
76
77 mulpadd \arg1, 0
78
79.if (\arg1 % 4 == 0)
80 mov [z+8*\arg1],r8
81.elseif (\arg1 % 4 == 1)
82 mov [z+8*\arg1],r9
83.elseif (\arg1 % 4 == 2)
84 mov [z+8*\arg1],r10
85.elseif (\arg1 % 4 == 3)
86 mov [z+8*\arg1],r11
87.endif
88
89 mulpadd \arg1, 1
90 mulpadd \arg1, 2
91
92.if (\arg1 % 4 == 0)
93 mulx r8, rax, [x+24]
94 adcx r11, rax
95 adox r8, zero
96 adcx r8, zero
97.elseif (\arg1 % 4 == 1)
98 mulx r9, rax, [x+24]
99 adcx r8, rax
100 adox r9, zero
101 adcx r9, zero
102.elseif (\arg1 % 4 == 2)
103 mulx r10, rax, [x+24]
104 adcx r9, rax
105 adox r10, zero
106 adcx r10, zero
107.elseif (\arg1 % 4 == 3)
108 mulx r11, rax, [x+24]
109 adcx r10, rax
110 adox r11, zero
111 adcx r11, zero
112.endif
113
114.endm
115
116
117
// bignum_mul_4_8 entry point: z[0..7] := x[0..3] * y[0..3].
// Row-by-row schoolbook multiply: one row per word of y, accumulated in a
// rotating window r8..r11. Uses mulx (BMI2) and adcx/adox (ADX).
// rbp and rbx are callee-saved and are preserved on the stack.
118S2N_BN_SYMBOL(bignum_mul_4_8):
119 _CET_ENDBR
120
// Windows shim: rdi/rsi are callee-saved in the Microsoft ABI, so save them
// and move the three arguments into the standard-ABI registers.
121#if WINDOWS_ABI
122 push rdi
123 push rsi
124 mov rdi, rcx
125 mov rsi, rdx
126 mov rdx, r8
127#endif
128
129// Save more registers to play with
// rbp serves as the zero register and rbx as the temporary for mulx results.
130
131 push rbp
132 push rbx
133
134// Copy y into a safe register to start with
// mulx takes rdx as its implicit multiplicand, so the y pointer moves to rcx.
135
136 mov y, rdx
137
138// Zero a register, which also makes sure we don't get a fake carry-in
139
140 xor zeroe, zeroe
141
142// Do the zeroth row, which is a bit different
143// Write back the zero-zero product and then accumulate
144// r8,r11,r10,r9 as y[0] * x from 1..4
// In detail: the low word of x[0] * y[0] is stored to z[0]. Afterwards
// columns 1, 2, 3, 4 of y[0] * x sit in r9, r10, r11, r8 respectively.
// Only the CF chain (adcx) is needed for this row.
145
146 mov rdx, [y]
147
148 mulx r9, r8, [x]
149 mov [z], r8
150
151 mulx r10, rbx, [x+8]
152 adcx r9, rbx
153
154 mulx r11, rbx, [x+16]
155 adcx r10, rbx
156
157 mulx r8, rbx, [x+24]
158 adcx r11, rbx
159 adcx r8, zero
160
161// Now all the other rows in a uniform pattern
// Each addrow j stores z[j] and rotates the window by one register.
162
163 addrow 1
164 addrow 2
165 addrow 3
166
167// Now write back the additional columns
// After row 3 the window holds columns 4..7 in r8, r9, r10, r11.
168
169 mov [z+32], r8
170 mov [z+40], r9
171 mov [z+48], r10
172 mov [z+56], r11
173
174// Restore registers and return
175
176 pop rbx
177 pop rbp
178
179#if WINDOWS_ABI
180 pop rsi
181 pop rdi
182#endif
183 ret
184
185#if defined(__linux__) && defined(__ELF__)
186.section .note.GNU-stack,"",%progbits
187#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8_alt.S
index 70ff69e372..2592d1d658 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8_alt.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8_alt.S
@@ -1,3 +1,5 @@
1// $OpenBSD: bignum_mul_4_8_alt.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
2//
1// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// 4//
3// Permission to use, copy, modify, and/or distribute this software for any 5// Permission to use, copy, modify, and/or distribute this software for any
@@ -16,8 +18,8 @@
16// Multiply z := x * y 18// Multiply z := x * y
17// Inputs x[4], y[4]; output z[8] 19// Inputs x[4], y[4]; output z[8]
18// 20//
19// extern void bignum_mul_4_8_alt 21// extern void bignum_mul_4_8_alt(uint64_t z[static 8], const uint64_t x[static 4],
20// (uint64_t z[static 8], uint64_t x[static 4], uint64_t y[static 4]); 22// const uint64_t y[static 4]);
21// 23//
22// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y 24// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
23// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y 25// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y
@@ -72,7 +74,7 @@
72 adc h, rdx 74 adc h, rdx
73 75
74S2N_BN_SYMBOL(bignum_mul_4_8_alt): 76S2N_BN_SYMBOL(bignum_mul_4_8_alt):
75 _CET_ENDBR 77 _CET_ENDBR
76 78
77#if WINDOWS_ABI 79#if WINDOWS_ABI
78 push rdi 80 push rdi
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12.S
new file mode 100644
index 0000000000..56cbdf06e0
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12.S
@@ -0,0 +1,223 @@
1// $OpenBSD: bignum_mul_6_12.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
2//
3// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
4//
5// Permission to use, copy, modify, and/or distribute this software for any
6// purpose with or without fee is hereby granted, provided that the above
7// copyright notice and this permission notice appear in all copies.
8//
9// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16
17// ----------------------------------------------------------------------------
18// Multiply z := x * y
19// Inputs x[6], y[6]; output z[12]
20//
21// extern void bignum_mul_6_12(uint64_t z[static 12], const uint64_t x[static 6],
22// const uint64_t y[static 6]);
23//
24// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
25// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y
26// ----------------------------------------------------------------------------
27
28#include "s2n_bignum_internal.h"
29
30 .intel_syntax noprefix
31 S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_6_12)
32 S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_6_12)
33 .text
34
35// z and x are already in these registers (first two standard-ABI arguments)
36
37#define z rdi
38#define x rsi
39
40// Copied in or set up
41
42#define y rcx
43
44// A zero register
45
46#define zero rbp
47#define zeroe ebp
48
49// Add in x[i] * rdx to the (i,i+1) position with the register window
50// Would be nice to have conditional expressions reg[i], reg[i+1] ...
//
// mulpadd arg1, arg2
//   arg1 = row index (which word of y is currently in rdx)
//   arg2 = index of the word of x to multiply by
// mulx forms rdx * x[arg2] with the high word in rbx and the low word in
// rax, without touching the flags. The low word is added on the CF chain
// (adcx) to window register number (arg1 + arg2) mod 6, and the high word
// is added on the OF chain (adox) to the next register in the cycle
// r8 -> r9 -> r10 -> r11 -> r12 -> r13 -> r8.
// Clobbers rax and rbx; updates CF and OF as running carries.
51
52.macro mulpadd arg1,arg2
53 mulx rbx, rax, [x+8*\arg2]
54.if ((\arg1 + \arg2) % 6 == 0)
55 adcx r8, rax
56 adox r9, rbx
57.elseif ((\arg1 + \arg2) % 6 == 1)
58 adcx r9, rax
59 adox r10, rbx
60.elseif ((\arg1 + \arg2) % 6 == 2)
61 adcx r10, rax
62 adox r11, rbx
63.elseif ((\arg1 + \arg2) % 6 == 3)
64 adcx r11, rax
65 adox r12, rbx
66.elseif ((\arg1 + \arg2) % 6 == 4)
67 adcx r12, rax
68 adox r13, rbx
69.elseif ((\arg1 + \arg2) % 6 == 5)
70 adcx r13, rax
71 adox r8, rbx
72.endif
73
74.endm
75
76
77// Add in the whole j'th row
//
// addrow arg1
//   Accumulates y[arg1] * x[0..5] into the 6-register window and stores
//   the finished result word z[arg1].
//   - rdx is loaded with y[arg1]; "xor zeroe, zeroe" zeroes rbp and clears
//     both CF and OF so the two carry chains start clean.
//   - After the x[0] term, window register number (arg1 mod 6) holds column
//     arg1. It is stored and is then free to become the new top word.
//   - The x[5] term writes its high word straight into that freed register;
//     the trailing adox/adcx with zero fold the outstanding OF and CF
//     carries into it.
//   Clobbers rax, rbx, rdx and rbp (left as zero).
78
79.macro addrow arg1
80 mov rdx, [y+8*\arg1]
81 xor zeroe, zeroe
82
83 mulpadd \arg1, 0
84
85.if (\arg1 % 6 == 0)
86 mov [z+8*\arg1],r8
87.elseif (\arg1 % 6 == 1)
88 mov [z+8*\arg1],r9
89.elseif (\arg1 % 6 == 2)
90 mov [z+8*\arg1],r10
91.elseif (\arg1 % 6 == 3)
92 mov [z+8*\arg1],r11
93.elseif (\arg1 % 6 == 4)
94 mov [z+8*\arg1],r12
95.elseif (\arg1 % 6 == 5)
96 mov [z+8*\arg1],r13
97.endif
98
99 mulpadd \arg1, 1
100 mulpadd \arg1, 2
101 mulpadd \arg1, 3
102 mulpadd \arg1, 4
103
104.if (\arg1 % 6 == 0)
105 mulx r8, rax, [x+40]
106 adcx r13, rax
107 adox r8, zero
108 adcx r8, zero
109.elseif (\arg1 % 6 == 1)
110 mulx r9, rax, [x+40]
111 adcx r8, rax
112 adox r9, zero
113 adcx r9, zero
114.elseif (\arg1 % 6 == 2)
115 mulx r10, rax, [x+40]
116 adcx r9, rax
117 adox r10, zero
118 adcx r10, zero
119.elseif (\arg1 % 6 == 3)
120 mulx r11, rax, [x+40]
121 adcx r10, rax
122 adox r11, zero
123 adcx r11, zero
124.elseif (\arg1 % 6 == 4)
125 mulx r12, rax, [x+40]
126 adcx r11, rax
127 adox r12, zero
128 adcx r12, zero
129.elseif (\arg1 % 6 == 5)
130 mulx r13, rax, [x+40]
131 adcx r12, rax
132 adox r13, zero
133 adcx r13, zero
134.endif
135
136.endm
137
138
139
// bignum_mul_6_12 entry point: z[0..11] := x[0..5] * y[0..5].
// Row-by-row schoolbook multiply: one row per word of y, accumulated in a
// rotating window r8..r13. Uses mulx (BMI2) and adcx/adox (ADX).
// rbp, rbx, r12 and r13 are callee-saved and are preserved on the stack.
140S2N_BN_SYMBOL(bignum_mul_6_12):
141 _CET_ENDBR
142
// Windows shim: rdi/rsi are callee-saved in the Microsoft ABI, so save them
// and move the three arguments into the standard-ABI registers.
143#if WINDOWS_ABI
144 push rdi
145 push rsi
146 mov rdi, rcx
147 mov rsi, rdx
148 mov rdx, r8
149#endif
150
151// Save more registers to play with
// rbp serves as the zero register, rbx as the temporary for mulx results,
// and r12/r13 extend the accumulator window to six registers.
152
153 push rbp
154 push rbx
155 push r12
156 push r13
157
158// Copy y into a safe register to start with
// mulx takes rdx as its implicit multiplicand, so the y pointer moves to rcx.
159
160 mov y, rdx
161
162// Zero a register, which also makes sure we don't get a fake carry-in
163
164 xor zeroe, zeroe
165
166// Do the zeroth row, which is a bit different
167// Write back the zero-zero product and then accumulate
168// r8,r13,r12,r11,r10,r9 as y[0] * x from 1..6
// In detail: the low word of x[0] * y[0] is stored to z[0]. Afterwards
// columns 1..6 of y[0] * x sit in r9, r10, r11, r12, r13, r8 respectively.
// Only the CF chain (adcx) is needed for this row.
169
170 mov rdx, [y]
171
172 mulx r9, r8, [x]
173 mov [z], r8
174
175 mulx r10, rbx, [x+8]
176 adcx r9, rbx
177
178 mulx r11, rbx, [x+16]
179 adcx r10, rbx
180
181 mulx r12, rbx, [x+24]
182 adcx r11, rbx
183
184 mulx r13, rbx, [x+32]
185 adcx r12, rbx
186
187 mulx r8, rbx, [x+40]
188 adcx r13, rbx
189 adcx r8, zero
190
191// Now all the other rows in a uniform pattern
// Each addrow j stores z[j] and rotates the window by one register.
192
193 addrow 1
194 addrow 2
195 addrow 3
196 addrow 4
197 addrow 5
198
199// Now write back the additional columns
// After row 5 the window holds columns 6..11 in r8, r9, r10, r11, r12, r13.
200
201 mov [z+48], r8
202 mov [z+56], r9
203 mov [z+64], r10
204 mov [z+72], r11
205 mov [z+80], r12
206 mov [z+88], r13
207
208// Restore registers and return
209
210 pop r13
211 pop r12
212 pop rbx
213 pop rbp
214
215#if WINDOWS_ABI
216 pop rsi
217 pop rdi
218#endif
219 ret
220
221#if defined(__linux__) && defined(__ELF__)
222.section .note.GNU-stack,"",%progbits
223#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12_alt.S
new file mode 100644
index 0000000000..077c52b38e
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12_alt.S
@@ -0,0 +1,199 @@
1// $OpenBSD: bignum_mul_6_12_alt.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
2//
3// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
4//
5// Permission to use, copy, modify, and/or distribute this software for any
6// purpose with or without fee is hereby granted, provided that the above
7// copyright notice and this permission notice appear in all copies.
8//
9// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16
17// ----------------------------------------------------------------------------
18// Multiply z := x * y
19// Inputs x[6], y[6]; output z[12]
20//
21// extern void bignum_mul_6_12_alt(uint64_t z[static 12],
22// const uint64_t x[static 6],
23// const uint64_t y[static 6]);
24//
25// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
26// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y
27// ----------------------------------------------------------------------------
28
29#include "s2n_bignum_internal.h"
30
31 .intel_syntax noprefix
32 S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_6_12_alt)
33 S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_6_12_alt)
34 .text
35
36// z and x already arrive in these registers under the standard x86-64 ABI
37
38#define z rdi
39#define x rsi
40
41// This is moved from rdx to free it for muls
42
43#define y rcx
44
45// Other variables used as a rotating 3-word window to add terms to
46
47#define t0 r8
48#define t1 r9
49#define t2 r10
50
51// Macro for the key "multiply and add to (c,h,l)" step
52// i.e. (c,h,l) += numa * numb; rax and rdx are overwritten by the mul
53#define combadd(c,h,l,numa,numb) \
54 mov rax, numa; \
55 mul QWORD PTR numb; \
56 add l, rax; \
57 adc h, rdx; \
58 adc c, 0
59
60// A minutely shorter form for when c = 0 initially
61// (with c = 0 on entry, "adc c, c" leaves just the carry bit in c)
62#define combadz(c,h,l,numa,numb) \
63 mov rax, numa; \
64 mul QWORD PTR numb; \
65 add l, rax; \
66 adc h, rdx; \
67 adc c, c
68
69// A short form where we don't expect a top carry
70// (only the two words (h,l) are updated; any carry out of h is not recorded)
71#define combads(h,l,numa,numb) \
72 mov rax, numa; \
73 mul QWORD PTR numb; \
74 add l, rax; \
75 adc h, rdx
76
77S2N_BN_SYMBOL(bignum_mul_6_12_alt):
78 _CET_ENDBR
79// Windows entry: save rdi/rsi and shuffle rcx, rdx, r8 into rdi, rsi, rdx
80#if WINDOWS_ABI
81 push rdi
82 push rsi
83 mov rdi, rcx
84 mov rsi, rdx
85 mov rdx, r8
86#endif
87
88// Copy y into a safe register to start with
89// (every mul overwrites rdx with the high product word, so y is kept in rcx)
90 mov y, rdx
91
92// Result term 0
93// z[0] is the low word of x[0] * y[0]; the high word starts the window in t0
94 mov rax, [x]
95 mul QWORD PTR [y]
96
97 mov [z], rax
98 mov t0, rdx
99 xor t1, t1
100
101// Result term 1
102// Each term k adds every x[i] * y[j] with i + j = k into the window, then stores z[k]
103 xor t2, t2
104 combads(t1,t0,[x],[y+8])
105 combadz(t2,t1,t0,[x+8],[y])
106 mov [z+8], t0
107
108// Result term 2
109// The register just stored is cleared and becomes the new top (carry) word
110 xor t0, t0
111 combadz(t0,t2,t1,[x],[y+16])
112 combadd(t0,t2,t1,[x+8],[y+8])
113 combadd(t0,t2,t1,[x+16],[y])
114 mov [z+16], t1
115
116// Result term 3
117
118 xor t1, t1
119 combadz(t1,t0,t2,[x],[y+24])
120 combadd(t1,t0,t2,[x+8],[y+16])
121 combadd(t1,t0,t2,[x+16],[y+8])
122 combadd(t1,t0,t2,[x+24],[y])
123 mov [z+24], t2
124
125// Result term 4
126
127 xor t2, t2
128 combadz(t2,t1,t0,[x],[y+32])
129 combadd(t2,t1,t0,[x+8],[y+24])
130 combadd(t2,t1,t0,[x+16],[y+16])
131 combadd(t2,t1,t0,[x+24],[y+8])
132 combadd(t2,t1,t0,[x+32],[y])
133 mov [z+32], t0
134
135// Result term 5
136// The longest column: all six products x[i] * y[5-i]
137 xor t0, t0
138 combadz(t0,t2,t1,[x],[y+40])
139 combadd(t0,t2,t1,[x+8],[y+32])
140 combadd(t0,t2,t1,[x+16],[y+24])
141 combadd(t0,t2,t1,[x+24],[y+16])
142 combadd(t0,t2,t1,[x+32],[y+8])
143 combadd(t0,t2,t1,[x+40],[y])
144 mov [z+40], t1
145
146// Result term 6
147// From here on the columns shrink again, starting at x[1] * y[5]
148 xor t1, t1
149 combadz(t1,t0,t2,[x+8],[y+40])
150 combadd(t1,t0,t2,[x+16],[y+32])
151 combadd(t1,t0,t2,[x+24],[y+24])
152 combadd(t1,t0,t2,[x+32],[y+16])
153 combadd(t1,t0,t2,[x+40],[y+8])
154 mov [z+48], t2
155
156// Result term 7
157
158 xor t2, t2
159 combadz(t2,t1,t0,[x+16],[y+40])
160 combadd(t2,t1,t0,[x+24],[y+32])
161 combadd(t2,t1,t0,[x+32],[y+24])
162 combadd(t2,t1,t0,[x+40],[y+16])
163 mov [z+56], t0
164
165// Result term 8
166
167 xor t0, t0
168 combadz(t0,t2,t1,[x+24],[y+40])
169 combadd(t0,t2,t1,[x+32],[y+32])
170 combadd(t0,t2,t1,[x+40],[y+24])
171 mov [z+64], t1
172
173// Result term 9
174
175 xor t1, t1
176 combadz(t1,t0,t2,[x+32],[y+40])
177 combadd(t1,t0,t2,[x+40],[y+32])
178 mov [z+72], t2
179
180// Result term 10
181// Only x[5] * y[5] remains, added with the two-word form (no carry word tracked)
182 combads(t1,t0,[x+40],[y+40])
183 mov [z+80], t0
184
185// Result term 11
186// The remaining top word of the window
187 mov [z+88], t1
188
189// Return
190
191#if WINDOWS_ABI
192 pop rsi
193 pop rdi
194#endif
195 ret
196
197#if defined(__linux__) && defined(__ELF__)
198.section .note.GNU-stack,"",%progbits
199#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16.S
new file mode 100644
index 0000000000..faa0196d8e
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16.S
@@ -0,0 +1,273 @@
1// $OpenBSD: bignum_mul_8_16.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
2//
3// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
4//
5// Permission to use, copy, modify, and/or distribute this software for any
6// purpose with or without fee is hereby granted, provided that the above
7// copyright notice and this permission notice appear in all copies.
8//
9// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16
17// ----------------------------------------------------------------------------
18// Multiply z := x * y
19// Inputs x[8], y[8]; output z[16]
20//
21// extern void bignum_mul_8_16(uint64_t z[static 16], const uint64_t x[static 8],
22// const uint64_t y[static 8]);
23//
24// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
25// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y
26// ----------------------------------------------------------------------------
27
28#include "s2n_bignum_internal.h"
29
30 .intel_syntax noprefix
31 S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_8_16)
32 S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_8_16)
33 .text
34
35// z and x already arrive in these registers under the standard x86-64 ABI
36
37#define z rdi
38#define x rsi
39
40// Copied in or set up
41
42#define y rcx
43
44// A zero register
45
46#define zero rbp
47#define zeroe ebp
48
49// mulpadd i, j adds x[i] * rdx (now assumed = y[j]) into the window at i+j
50// Window slot (i+j) mod 8 selects r8..r15; low word added by adcx, high by adox; uses rax, rbx
51.macro mulpadd arg1,arg2
52 mulx rbx, rax, [x+8*\arg1]
53.if ((\arg1 + \arg2) % 8 == 0)
54 adcx r8, rax
55 adox r9, rbx
56.elseif ((\arg1 + \arg2) % 8 == 1)
57 adcx r9, rax
58 adox r10, rbx
59.elseif ((\arg1 + \arg2) % 8 == 2)
60 adcx r10, rax
61 adox r11, rbx
62.elseif ((\arg1 + \arg2) % 8 == 3)
63 adcx r11, rax
64 adox r12, rbx
65.elseif ((\arg1 + \arg2) % 8 == 4)
66 adcx r12, rax
67 adox r13, rbx
68.elseif ((\arg1 + \arg2) % 8 == 5)
69 adcx r13, rax
70 adox r14, rbx
71.elseif ((\arg1 + \arg2) % 8 == 6)
72 adcx r14, rax
73 adox r15, rbx
74.elseif ((\arg1 + \arg2) % 8 == 7)
75 adcx r15, rax
76 adox r8, rbx
77.endif
78
79.endm
80
81// mulpade i, j adds x[i] * rdx (now assumed = y[j]) into the window at i+j
82// but re-creates the top word assuming nothing to add there
83// (mulx writes the high word directly into the next slot; adox with zero then adds OF)
84.macro mulpade arg1,arg2
85.if ((\arg1 + \arg2) % 8 == 0)
86 mulx r9, rax, [x+8*\arg1]
87 adcx r8, rax
88 adox r9, zero
89.elseif ((\arg1 + \arg2) % 8 == 1)
90 mulx r10, rax, [x+8*\arg1]
91 adcx r9, rax
92 adox r10, zero
93.elseif ((\arg1 + \arg2) % 8 == 2)
94 mulx r11, rax, [x+8*\arg1]
95 adcx r10, rax
96 adox r11, zero
97.elseif ((\arg1 + \arg2) % 8 == 3)
98 mulx r12, rax, [x+8*\arg1]
99 adcx r11, rax
100 adox r12, zero
101.elseif ((\arg1 + \arg2) % 8 == 4)
102 mulx r13, rax, [x+8*\arg1]
103 adcx r12, rax
104 adox r13, zero
105.elseif ((\arg1 + \arg2) % 8 == 5)
106 mulx r14, rax, [x+8*\arg1]
107 adcx r13, rax
108 adox r14, zero
109.elseif ((\arg1 + \arg2) % 8 == 6)
110 mulx r15, rax, [x+8*\arg1]
111 adcx r14, rax
112 adox r15, zero
113.elseif ((\arg1 + \arg2) % 8 == 7)
114 mulx r8, rax, [x+8*\arg1]
115 adcx r15, rax
116 adox r8, zero
117.endif
118
119.endm
120
121// Add in the whole j'th row
122// Loads y[j] into rdx, clears CF/OF, stores the finished word z[j], then adds the last CF into the top word
123.macro addrow arg1
124 mov rdx, [y+8*\arg1]
125 xor zeroe, zeroe
126
127 mulpadd 0, \arg1
128
129.if (\arg1 % 8 == 0)
130 mov [z+8*\arg1],r8
131.elseif (\arg1 % 8 == 1)
132 mov [z+8*\arg1],r9
133.elseif (\arg1 % 8 == 2)
134 mov [z+8*\arg1],r10
135.elseif (\arg1 % 8 == 3)
136 mov [z+8*\arg1],r11
137.elseif (\arg1 % 8 == 4)
138 mov [z+8*\arg1],r12
139.elseif (\arg1 % 8 == 5)
140 mov [z+8*\arg1],r13
141.elseif (\arg1 % 8 == 6)
142 mov [z+8*\arg1],r14
143.elseif (\arg1 % 8 == 7)
144 mov [z+8*\arg1],r15
145.endif
146
147 mulpadd 1, \arg1
148 mulpadd 2, \arg1
149 mulpadd 3, \arg1
150 mulpadd 4, \arg1
151 mulpadd 5, \arg1
152 mulpadd 6, \arg1
153 mulpade 7, \arg1
154
155.if (\arg1 % 8 == 0)
156 adc r8, zero
157.elseif (\arg1 % 8 == 1)
158 adc r9, zero
159.elseif (\arg1 % 8 == 2)
160 adc r10, zero
161.elseif (\arg1 % 8 == 3)
162 adc r11, zero
163.elseif (\arg1 % 8 == 4)
164 adc r12, zero
165.elseif (\arg1 % 8 == 5)
166 adc r13, zero
167.elseif (\arg1 % 8 == 6)
168 adc r14, zero
169.elseif (\arg1 % 8 == 7)
170 adc r15, zero
171.endif
172
173.endm
174
175
176S2N_BN_SYMBOL(bignum_mul_8_16):
177 _CET_ENDBR
178// Windows entry: save rdi/rsi and shuffle rcx, rdx, r8 into rdi, rsi, rdx
179#if WINDOWS_ABI
180 push rdi
181 push rsi
182 mov rdi, rcx
183 mov rsi, rdx
184 mov rdx, r8
185#endif
186
187// Save the callee-saved registers that are used as scratch below
188// (rbp as the zero register, rbx, and r12..r15 as part of the 8-word window)
189 push rbp
190 push rbx
191 push r12
192 push r13
193 push r14
194 push r15
195
196// Copy the y pointer out of rdx into rcx, because rdx is reloaded
197// with each y[j] to serve as the implicit multiplicand of mulx
198 mov y, rdx
199
200// Zero a register, which also makes sure we don't get a fake carry-in
201// (the xor clears CF, which the adc chain below starts from)
202 xor zeroe, zeroe
203
204// Do the zeroth row, which is a bit different
205// Write back the zero-zero product and then accumulate
206// r8,r15,r14,r13,r12,r11,r10,r9 as y[0] * x from 1..8
207// (i.e. digits 1..8 of that product, listed from r8 highest down to r9 lowest)
208 mov rdx, [y]
209
210 mulx r9, r8, [x]
211 mov [z], r8
212
213 mulx r10, rbx, [x+8]
214 adc r9, rbx
215
216 mulx r11, rbx, [x+16]
217 adc r10, rbx
218
219 mulx r12, rbx, [x+24]
220 adc r11, rbx
221
222 mulx r13, rbx, [x+32]
223 adc r12, rbx
224
225 mulx r14, rbx, [x+40]
226 adc r13, rbx
227
228 mulx r15, rbx, [x+48]
229 adc r14, rbx
230
231 mulx r8, rbx, [x+56]
232 adc r15, rbx
233 adc r8, zero
234
235// Now all the other rows in a uniform pattern
236// (addrow j adds y[j] * x into the window and stores the finished word z[j])
237 addrow 1
238 addrow 2
239 addrow 3
240 addrow 4
241 addrow 5
242 addrow 6
243 addrow 7
244
245// Now write back the additional columns
246// (the eight words still held in r8..r15 go to z[8..15], byte offsets 64..120)
247 mov [z+64], r8
248 mov [z+72], r9
249 mov [z+80], r10
250 mov [z+88], r11
251 mov [z+96], r12
252 mov [z+104], r13
253 mov [z+112], r14
254 mov [z+120], r15
255
256// Restore the saved registers in reverse order of the pushes and return
257
258 pop r15
259 pop r14
260 pop r13
261 pop r12
262 pop rbx
263 pop rbp
264
265#if WINDOWS_ABI
266 pop rsi
267 pop rdi
268#endif
269 ret
270
271#if defined(__linux__) && defined(__ELF__)
272.section .note.GNU-stack,"",%progbits
273#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16_alt.S
index 066403b074..0e30b9170f 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16_alt.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16_alt.S
@@ -1,3 +1,5 @@
1// $OpenBSD: bignum_mul_8_16_alt.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
2//
1// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// 4//
3// Permission to use, copy, modify, and/or distribute this software for any 5// Permission to use, copy, modify, and/or distribute this software for any
@@ -16,8 +18,9 @@
16// Multiply z := x * y 18// Multiply z := x * y
17// Inputs x[8], y[8]; output z[16] 19// Inputs x[8], y[8]; output z[16]
18// 20//
19// extern void bignum_mul_8_16_alt 21// extern void bignum_mul_8_16_alt(uint64_t z[static 16],
20// (uint64_t z[static 16], uint64_t x[static 8], uint64_t y[static 8]); 22// const uint64_t x[static 8],
23// const uint64_t y[static 8]);
21// 24//
22// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y 25// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
23// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y 26// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y
@@ -72,7 +75,7 @@
72 adc h, rdx 75 adc h, rdx
73 76
74S2N_BN_SYMBOL(bignum_mul_8_16_alt): 77S2N_BN_SYMBOL(bignum_mul_8_16_alt):
75 _CET_ENDBR 78 _CET_ENDBR
76 79
77#if WINDOWS_ABI 80#if WINDOWS_ABI
78 push rdi 81 push rdi
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr.S
index 54e3f59442..86f1af2ac4 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr.S
@@ -1,3 +1,5 @@
1// $OpenBSD: bignum_sqr.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
2//
1// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// 4//
3// Permission to use, copy, modify, and/or distribute this software for any 5// Permission to use, copy, modify, and/or distribute this software for any
@@ -16,8 +18,7 @@
16// Square z := x^2 18// Square z := x^2
17// Input x[n]; output z[k] 19// Input x[n]; output z[k]
18// 20//
19// extern void bignum_sqr 21// extern void bignum_sqr(uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x);
20// (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x);
21// 22//
22// Does the "z := x^2" operation where x is n digits and result z is k. 23// Does the "z := x^2" operation where x is n digits and result z is k.
23// Truncates the result in general unless k >= 2 * n 24// Truncates the result in general unless k >= 2 * n
@@ -62,7 +63,7 @@
62#define llshort ebp 63#define llshort ebp
63 64
64S2N_BN_SYMBOL(bignum_sqr): 65S2N_BN_SYMBOL(bignum_sqr):
65 _CET_ENDBR 66 _CET_ENDBR
66 67
67#if WINDOWS_ABI 68#if WINDOWS_ABI
68 push rdi 69 push rdi
@@ -86,7 +87,7 @@ S2N_BN_SYMBOL(bignum_sqr):
86// If p = 0 the result is trivial and nothing needs doing 87// If p = 0 the result is trivial and nothing needs doing
87 88
88 test p, p 89 test p, p
89 jz end 90 jz bignum_sqr_end
90 91
91// initialize (hh,ll) = 0 92// initialize (hh,ll) = 0
92 93
@@ -97,7 +98,7 @@ S2N_BN_SYMBOL(bignum_sqr):
97 98
98 xor k, k 99 xor k, k
99 100
100outerloop: 101bignum_sqr_outerloop:
101 102
102// First let bot = MAX 0 (k + 1 - n) and top = MIN (k + 1) n 103// First let bot = MAX 0 (k + 1 - n) and top = MIN (k + 1) n
103// We want to accumulate all x[i] * x[k - i] for bot <= i < top 104// We want to accumulate all x[i] * x[k - i] for bot <= i < top
@@ -122,7 +123,7 @@ outerloop:
122// If htop <= bot then main doubled part of the sum is empty 123// If htop <= bot then main doubled part of the sum is empty
123 124
124 cmp i, htop 125 cmp i, htop
125 jnc nosumming 126 jnc bignum_sqr_nosumming
126 127
127// Use a moving pointer for [y] = x[k-i] for the cofactor 128// Use a moving pointer for [y] = x[k-i] for the cofactor
128 129
@@ -132,7 +133,7 @@ outerloop:
132 133
133// Do the main part of the sum x[i] * x[k - i] for 2 * i < k 134// Do the main part of the sum x[i] * x[k - i] for 2 * i < k
134 135
135innerloop: 136bignum_sqr_innerloop:
136 mov a, [x+8*i] 137 mov a, [x+8*i]
137 mul QWORD PTR [y] 138 mul QWORD PTR [y]
138 add l, a 139 add l, a
@@ -141,7 +142,7 @@ innerloop:
141 sub y, 8 142 sub y, 8
142 inc i 143 inc i
143 cmp i, htop 144 cmp i, htop
144 jc innerloop 145 jc bignum_sqr_innerloop
145 146
146// Now double it 147// Now double it
147 148
@@ -151,11 +152,11 @@ innerloop:
151 152
152// If k is even (which means 2 * i = k) and i < n add the extra x[i]^2 term 153// If k is even (which means 2 * i = k) and i < n add the extra x[i]^2 term
153 154
154nosumming: 155bignum_sqr_nosumming:
155 test k, 1 156 test k, 1
156 jnz innerend 157 jnz bignum_sqr_innerend
157 cmp i, n 158 cmp i, n
158 jnc innerend 159 jnc bignum_sqr_innerend
159 160
160 mov a, [x+8*i] 161 mov a, [x+8*i]
161 mul a 162 mul a
@@ -165,7 +166,7 @@ nosumming:
165 166
166// Now add the local sum into the global sum, store and shift 167// Now add the local sum into the global sum, store and shift
167 168
168innerend: 169bignum_sqr_innerend:
169 add l, ll 170 add l, ll
170 mov [z+8*k], l 171 mov [z+8*k], l
171 adc h, hh 172 adc h, hh
@@ -175,11 +176,11 @@ innerend:
175 176
176 inc k 177 inc k
177 cmp k, p 178 cmp k, p
178 jc outerloop 179 jc bignum_sqr_outerloop
179 180
180// Restore registers and return 181// Restore registers and return
181 182
182end: 183bignum_sqr_end:
183 pop r15 184 pop r15
184 pop r14 185 pop r14
185 pop r13 186 pop r13
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8.S
new file mode 100644
index 0000000000..25664782f7
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8.S
@@ -0,0 +1,158 @@
1// $OpenBSD: bignum_sqr_4_8.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
2//
3// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
4//
5// Permission to use, copy, modify, and/or distribute this software for any
6// purpose with or without fee is hereby granted, provided that the above
7// copyright notice and this permission notice appear in all copies.
8//
9// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16
17// ----------------------------------------------------------------------------
18// Square, z := x^2
19// Input x[4]; output z[8]
20//
21// extern void bignum_sqr_4_8(uint64_t z[static 8], const uint64_t x[static 4]);
22//
23// Standard x86-64 ABI: RDI = z, RSI = x
24// Microsoft x64 ABI: RCX = z, RDX = x
25// ----------------------------------------------------------------------------
26
27#include "s2n_bignum_internal.h"
28
29 .intel_syntax noprefix
30 S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_4_8)
31 S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_4_8)
32 .text
33
34// z and x already arrive in these registers under the standard x86-64 ABI
35
36#define z rdi
37#define x rsi
38
39// A zero register
40
41#define zero rbp
42#define zeroe ebp
43
44// Other registers
45
46#define d1 r8
47#define d2 r9
48#define d3 r10
49#define d4 r11
50#define d5 r12
51#define d6 r13
52
53
54
55S2N_BN_SYMBOL(bignum_sqr_4_8):
56 _CET_ENDBR
57// Windows entry: save rdi/rsi and move rcx, rdx into rdi, rsi
58#if WINDOWS_ABI
59 push rdi
60 push rsi
61 mov rdi, rcx
62 mov rsi, rdx
63#endif
64
65// Save the callee-saved registers that are used as scratch below
66// (rbp is the zero register, r12 and r13 are window words d5 and d6)
67 push rbp
68 push r12
69 push r13
70
71// Set up an initial window [d6;...d1] = [23;03;01]
72// (a two-digit label "ij" denotes the two-word product x[i] * x[j])
73 mov rdx, [x]
74 mulx d2, d1, [x+8]
75 mulx d4, d3, [x+24]
76 mov rdx, [x+16]
77 mulx d6, d5, [x+24]
78
79// Clear our zero register, and also initialize the flags for the carry chain
80
81 xor zeroe, zeroe
82
83// Chain in the addition of 02 + 12 + 13 to that window (no carry-out possible)
84// This gives all the "heterogeneous" terms of the squaring ready to double
85// (low product words ride the CF chain via adcx, high words the OF chain via adox)
86 mulx rcx, rax, [x]
87 adcx d2, rax
88 adox d3, rcx
89 mulx rcx, rax, [x+8]
90 adcx d3, rax
91 adox d4, rcx
92 mov rdx, [x+24]
93 mulx rcx, rax, [x+8]
94 adcx d4, rax
95 adox d5, rcx
96 adcx d5, zero
97 adox d6, zero
98 adcx d6, zero
99
100// In principle this is otiose as CF and OF carries are absorbed at this point
101// However it seems helpful for the OOO engine to be told it's a fresh start
102
103 xor zeroe, zeroe
104
105// Double and add to the 00 + 11 + 22 + 33 terms
106//
107// We could use shift-double but this seems tidier and in larger squarings
108// it was actually more efficient. I haven't experimented with this small
109// case to see how much that matters. Note: the writeback here is sprinkled
110// into the sequence in such a way that things still work if z = x, i.e. if
111// the output overwrites the input buffer and beyond.
112// Here adcx doubles each window word (CF chain) while adox adds the square words (OF chain).
113 mov rdx, [x]
114 mulx rdx, rax, rdx
115 mov [z], rax
116 adcx d1, d1
117 adox d1, rdx
118 mov rdx, [x+8]
119 mov [z+8], d1
120 mulx rdx, rax, rdx
121 adcx d2, d2
122 adox d2, rax
123 adcx d3, d3
124 adox d3, rdx
125 mov rdx, [x+16]
126 mov [z+16], d2
127 mulx rdx, rax, rdx
128 adcx d4, d4
129 adox d4, rax
130 adcx d5, d5
131 adox d5, rdx
132 mov rdx, [x+24]
133 mov [z+24], d3
134 mulx rdx, rax, rdx
135 mov [z+32], d4
136 adcx d6, d6
137 mov [z+40], d5
138 adox d6, rax
139 mov [z+48], d6
140 adcx rdx, zero
141 adox rdx, zero
142 mov [z+56], rdx
143
144// Restore saved registers and return
145
146 pop r13
147 pop r12
148 pop rbp
149
150#if WINDOWS_ABI
151 pop rsi
152 pop rdi
153#endif
154 ret
155
156#if defined(__linux__) && defined(__ELF__)
157.section .note.GNU-stack,"",%progbits
158#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8_alt.S
index 7c534ae907..7eafac3284 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8_alt.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8_alt.S
@@ -1,3 +1,5 @@
1// $OpenBSD: bignum_sqr_4_8_alt.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
2//
1// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// 4//
3// Permission to use, copy, modify, and/or distribute this software for any 5// Permission to use, copy, modify, and/or distribute this software for any
@@ -16,8 +18,8 @@
16// Square, z := x^2 18// Square, z := x^2
17// Input x[4]; output z[8] 19// Input x[4]; output z[8]
18// 20//
19// extern void bignum_sqr_4_8_alt 21// extern void bignum_sqr_4_8_alt(uint64_t z[static 8],
20// (uint64_t z[static 8], uint64_t x[static 4]); 22// const uint64_t x[static 4]);
21// 23//
22// Standard x86-64 ABI: RDI = z, RSI = x 24// Standard x86-64 ABI: RDI = z, RSI = x
23// Microsoft x64 ABI: RCX = z, RDX = x 25// Microsoft x64 ABI: RCX = z, RDX = x
@@ -71,7 +73,7 @@
71 adc c, 0 73 adc c, 0
72 74
73S2N_BN_SYMBOL(bignum_sqr_4_8_alt): 75S2N_BN_SYMBOL(bignum_sqr_4_8_alt):
74 _CET_ENDBR 76 _CET_ENDBR
75 77
76#if WINDOWS_ABI 78#if WINDOWS_ABI
77 push rdi 79 push rdi
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12.S
new file mode 100644
index 0000000000..3f055e8b75
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12.S
@@ -0,0 +1,227 @@
1// $OpenBSD: bignum_sqr_6_12.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
2//
3// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
4//
5// Permission to use, copy, modify, and/or distribute this software for any
6// purpose with or without fee is hereby granted, provided that the above
7// copyright notice and this permission notice appear in all copies.
8//
9// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16
17// ----------------------------------------------------------------------------
18// Square, z := x^2
19// Input x[6]; output z[12]
20//
21// extern void bignum_sqr_6_12(uint64_t z[static 12], const uint64_t x[static 6]);
22//
23// Standard x86-64 ABI: RDI = z, RSI = x
24// Microsoft x64 ABI: RCX = z, RDX = x
25// ----------------------------------------------------------------------------
26
27#include "s2n_bignum_internal.h"
28
29 .intel_syntax noprefix
30 S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_6_12)
31 S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_6_12)
32 .text
33
34// z and x already arrive in these registers under the standard x86-64 ABI
35
36#define z rdi
37#define x rsi
38
39// A zero register
40
41#define zero rbp
42#define zeroe ebp
43
44// Other registers
45
46#define d1 r8
47#define d2 r9
48#define d3 r10
49#define d4 r11
50#define d5 r12
51#define d6 r13
52#define d7 r14
53#define d8 r15
54#define d9 rbx
55
56// Care is needed: re-using the zero register
57
58#define d10 rbp
59
60
61S2N_BN_SYMBOL(bignum_sqr_6_12):
62 _CET_ENDBR
63// Windows entry: save rdi/rsi and move rcx, rdx into rdi, rsi
64#if WINDOWS_ABI
65 push rdi
66 push rsi
67 mov rdi, rcx
68 mov rsi, rdx
69#endif
70
71// Save the callee-saved registers that are used as scratch below
72// (rbp is zero and later d10, rbx is d9, r12..r15 are d5..d8)
73 push rbp
74 push rbx
75 push r12
76 push r13
77 push r14
78 push r15
79
80// Set up an initial window [d8;...d1] = [34;05;03;01]
81// (a two-digit label "ij" denotes the two-word product x[i] * x[j])
82 mov rdx, [x]
83 mulx d2, d1, [x+8]
84 mulx d4, d3, [x+24]
85 mulx d6, d5, [x+40]
86 mov rdx, [x+24]
87 mulx d8, d7, [x+32]
88
89// Clear our zero register, and also initialize the flags for the carry chain
90
91 xor zeroe, zeroe
92
93// Chain in the addition of 02 + 12 + 13 + 14 + 15 to that window
94// (no carry-out possible since we add it to the top of a product)
95
96 mov rdx, [x+16]
97 mulx rcx, rax, [x]
98 adcx d2, rax
99 adox d3, rcx
100 mulx rcx, rax, [x+8]
101 adcx d3, rax
102 adox d4, rcx
103 mov rdx, [x+8]
104 mulx rcx, rax, [x+24]
105 adcx d4, rax
106 adox d5, rcx
107 mulx rcx, rax, [x+32]
108 adcx d5, rax
109 adox d6, rcx
110 mulx rcx, rax, [x+40]
111 adcx d6, rax
112 adox d7, rcx
113 adcx d7, zero
114 adox d8, zero
115 adcx d8, zero
116
117// Again zero out the flags. Actually they are already cleared but it may
118// help decouple these in the OOO engine not to wait for the chain above
119
120 xor zeroe, zeroe
121
122// Now chain in the 04 + 23 + 24 + 25 + 35 + 45 terms
123// We are running out of registers and here our zero register is not zero!
124// (d10 is rbp: once mulx writes d10, a flag-preserving "mov eax, 0" supplies the zero instead)
125 mov rdx, [x+32]
126 mulx rcx, rax, [x]
127 adcx d4, rax
128 adox d5, rcx
129 mov rdx, [x+16]
130 mulx rcx, rax, [x+24]
131 adcx d5, rax
132 adox d6, rcx
133 mulx rcx, rax, [x+32]
134 adcx d6, rax
135 adox d7, rcx
136 mulx rcx, rax, [x+40]
137 adcx d7, rax
138 adox d8, rcx
139 mov rdx, [x+24]
140 mulx d9, rax, [x+40]
141 adcx d8, rax
142 adox d9, zero
143 mov rdx, [x+32]
144 mulx d10, rax, [x+40]
145 adcx d9, rax
146 mov eax, 0
147 adox d10, rax
148 adcx d10, rax
149
150// Again, just for a clear fresh start for the flags
151// (eax is cleared rather than zeroe because rbp now holds the live word d10)
152 xor eax, eax
153
154// Double and add to the 00 + 11 + 22 + 33 + 44 + 55 terms
155//
156// We could use shift-double but this seems tidier and in larger squarings
157// it was actually more efficient. I haven't experimented with this small
158// case to see how much that matters. Note: the writeback here is sprinkled
159// into the sequence in such a way that things still work if z = x, i.e. if
160// the output overwrites the input buffer and beyond.
161// The late "mov eax, 0" again yields a zero without disturbing the live CF and OF chains.
162 mov rdx, [x]
163 mulx rdx, rax, rdx
164 mov [z], rax
165 adcx d1, d1
166 adox d1, rdx
167 mov rdx, [x+8]
168 mov [z+8], d1
169 mulx rdx, rax, rdx
170 adcx d2, d2
171 adox d2, rax
172 adcx d3, d3
173 adox d3, rdx
174 mov rdx, [x+16]
175 mov [z+16], d2
176 mulx rdx, rax, rdx
177 adcx d4, d4
178 adox d4, rax
179 adcx d5, d5
180 adox d5, rdx
181 mov rdx, [x+24]
182 mov [z+24], d3
183 mulx rdx, rax, rdx
184 adcx d6, d6
185 adox d6, rax
186 adcx d7, d7
187 adox d7, rdx
188 mov rdx, [x+32]
189 mov [z+32], d4
190 mulx rdx, rax, rdx
191 adcx d8, d8
192 adox d8, rax
193 adcx d9, d9
194 adox d9, rdx
195 mov rdx, [x+40]
196 mov [z+40], d5
197 mulx rdx, rax, rdx
198 mov [z+48], d6
199 adcx d10, d10
200 mov [z+56], d7
201 adox d10, rax
202 mov [z+64], d8
203 mov eax, 0
204 mov [z+72], d9
205 adcx rdx, rax
206 mov [z+80], d10
207 adox rdx, rax
208 mov [z+88], rdx
209
210// Restore saved registers and return
211
212 pop r15
213 pop r14
214 pop r13
215 pop r12
216 pop rbx
217 pop rbp
218
219#if WINDOWS_ABI
220 pop rsi
221 pop rdi
222#endif
223 ret
224
225#if defined(__linux__) && defined(__ELF__)
226.section .note.GNU-stack,"",%progbits
227#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12_alt.S
new file mode 100644
index 0000000000..eb43b0a15b
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12_alt.S
@@ -0,0 +1,210 @@
1// $OpenBSD: bignum_sqr_6_12_alt.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
2//
3// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
4//
5// Permission to use, copy, modify, and/or distribute this software for any
6// purpose with or without fee is hereby granted, provided that the above
7// copyright notice and this permission notice appear in all copies.
8//
9// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16
17// ----------------------------------------------------------------------------
18// Square, z := x^2
19// Input x[6]; output z[12]
20//
21// extern void bignum_sqr_6_12_alt(uint64_t z[static 12],
22// const uint64_t x[static 6]);
23//
24// Standard x86-64 ABI: RDI = z, RSI = x
25// Microsoft x64 ABI: RCX = z, RDX = x
26// ----------------------------------------------------------------------------
27
28#include "s2n_bignum_internal.h"
29
30 .intel_syntax noprefix
31 S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_6_12_alt)
32 S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_6_12_alt)
33 .text
34
35// Input arguments
36
37#define z rdi
38#define x rsi
39
40// Other variables used as a rotating 3-word window to add terms to
41
42#define t0 r8
43#define t1 r9
44#define t2 r10
45
46// Additional temporaries for local windows to share doublings
47
48#define u0 rcx
49#define u1 r11
50
51// Macro for the key "multiply and add to (c,h,l)" step
52// i.e. (c,h,l) += numa * numb; rax and rdx are overwritten by the mul
53#define combadd(c,h,l,numa,numb) \
54 mov rax, numa; \
55 mul QWORD PTR numb; \
56 add l, rax; \
57 adc h, rdx; \
58 adc c, 0
59
60// Set up initial window (c,h,l) = numa * numb
61// (c is zeroed, so no prior contents of c, h or l are used)
62#define combaddz(c,h,l,numa,numb) \
63 mov rax, numa; \
64 mul QWORD PTR numb; \
65 xor c, c; \
66 mov l, rax; \
67 mov h, rdx
68
69// Doubling step (c,h,l) = 2 * (c,hh,ll) + (0,h,l)
70// (hh and ll are doubled in place, so their original values are lost)
71#define doubladd(c,h,l,hh,ll) \
72 add ll, ll; \
73 adc hh, hh; \
74 adc c, c; \
75 add l, ll; \
76 adc h, hh; \
77 adc c, 0
78
79// Square term incorporation (c,h,l) += numa^2
80
81#define combadd1(c,h,l,numa) \
82 mov rax, numa; \
83 mul rax; \
84 add l, rax; \
85 adc h, rdx; \
86 adc c, 0
87
88// A short form where we don't expect a top carry
89// i.e. (h,l) += numa^2, with any carry out of h not recorded
90#define combads(h,l,numa) \
91 mov rax, numa; \
92 mul rax; \
93 add l, rax; \
94 adc h, rdx
95
96// A version doubling directly before adding, for single non-square terms
97// i.e. (c,h,l) += 2 * numa * numb, with the doubling done on rdx:rax
98#define combadd2(c,h,l,numa,numb) \
99 mov rax, numa; \
100 mul QWORD PTR numb; \
101 add rax, rax; \
102 adc rdx, rdx; \
103 adc c, 0; \
104 add l, rax; \
105 adc h, rdx; \
106 adc c, 0
107
108S2N_BN_SYMBOL(bignum_sqr_6_12_alt):
109 _CET_ENDBR
110// Windows entry: save rdi/rsi and move rcx, rdx into rdi, rsi
111#if WINDOWS_ABI
112 push rdi
113 push rsi
114 mov rdi, rcx
115 mov rsi, rdx
116#endif
117
118// Result term 0
119// z[0] is the low word of x[0]^2; the high word starts the window in t0
120 mov rax, [x]
121 mul rax
122
123 mov [z], rax
124 mov t0, rdx
125 xor t1, t1
126
127// Result term 1
128// Adds 2 * x[0] * x[1], then stores z[1]
129 xor t2, t2
130 combadd2(t2,t1,t0,[x],[x+8])
131 mov [z+8], t0
132
133// Result term 2
134// Adds x[1]^2 + 2 * x[0] * x[2], then stores z[2]
135 xor t0, t0
136 combadd1(t0,t2,t1,[x+8])
137 combadd2(t0,t2,t1,[x],[x+16])
138 mov [z+16], t1
139
140// Result term 3
141// Sums x[0] * x[3] + x[1] * x[2] in (t1,u1,u0), doubles it into the window, stores z[3]
142 combaddz(t1,u1,u0,[x],[x+24])
143 combadd(t1,u1,u0,[x+8],[x+16])
144 doubladd(t1,t0,t2,u1,u0)
145 mov [z+24], t2
146
147// Result term 4
148// Adds 2 * (x[0] * x[4] + x[1] * x[3]) + x[2]^2, then stores z[4]
149 combaddz(t2,u1,u0,[x],[x+32])
150 combadd(t2,u1,u0,[x+8],[x+24])
151 doubladd(t2,t1,t0,u1,u0)
152 combadd1(t2,t1,t0,[x+16])
153 mov [z+32], t0
154
155// Result term 5
156// Adds 2 * (x[0] * x[5] + x[1] * x[4] + x[2] * x[3]), then stores z[5]
157 combaddz(t0,u1,u0,[x],[x+40])
158 combadd(t0,u1,u0,[x+8],[x+32])
159 combadd(t0,u1,u0,[x+16],[x+24])
160 doubladd(t0,t2,t1,u1,u0)
161 mov [z+40], t1
162
163// Result term 6
164// Adds 2 * (x[1] * x[5] + x[2] * x[4]) + x[3]^2, then stores z[6]
165 combaddz(t1,u1,u0,[x+8],[x+40])
166 combadd(t1,u1,u0,[x+16],[x+32])
167 doubladd(t1,t0,t2,u1,u0)
168 combadd1(t1,t0,t2,[x+24])
169 mov [z+48], t2
170
171// Result term 7
172// Adds 2 * (x[2] * x[5] + x[3] * x[4]), then stores z[7]
173 combaddz(t2,u1,u0,[x+16],[x+40])
174 combadd(t2,u1,u0,[x+24],[x+32])
175 doubladd(t2,t1,t0,u1,u0)
176 mov [z+56], t0
177
178// Result term 8
179// Adds 2 * x[3] * x[5] + x[4]^2, then stores z[8]
180 xor t0, t0
181 combadd2(t0,t2,t1,[x+24],[x+40])
182 combadd1(t0,t2,t1,[x+32])
183 mov [z+64], t1
184
185// Result term 9
186// Adds 2 * x[4] * x[5], then stores z[9]
187 xor t1, t1
188 combadd2(t1,t0,t2,[x+32],[x+40])
189 mov [z+72], t2
190
191// Result term 10
192// Adds x[5]^2 with the two-word form (no carry word tracked), then stores z[10]
193 combads(t1,t0,[x+40])
194 mov [z+80], t0
195
196// Result term 11
197// The remaining top word of the window
198 mov [z+88], t1
199
200// Return
201
202#if WINDOWS_ABI
203 pop rsi
204 pop rdi
205#endif
206 ret
207
208#if defined(__linux__) && defined(__ELF__)
209.section .note.GNU-stack,"",%progbits
210#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16.S
new file mode 100644
index 0000000000..41277b5b6a
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16.S
@@ -0,0 +1,311 @@
1// $OpenBSD: bignum_sqr_8_16.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
2//
3// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
4//
5// Permission to use, copy, modify, and/or distribute this software for any
6// purpose with or without fee is hereby granted, provided that the above
7// copyright notice and this permission notice appear in all copies.
8//
9// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16
17// ----------------------------------------------------------------------------
18// Square, z := x^2
19// Input x[8]; output z[16]
20//
21// extern void bignum_sqr_8_16(uint64_t z[static 16], const uint64_t x[static 8]);
22//
23// Standard x86-64 ABI: RDI = z, RSI = x
24// Microsoft x64 ABI: RCX = z, RDX = x
25// ----------------------------------------------------------------------------
26
27#include "s2n_bignum_internal.h"
28
29 .intel_syntax noprefix
30 S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_8_16)
31 S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_8_16)
32 .text
33
34// Input arguments: z and x live in rdi and rsi, the standard ABI argument registers
35
36#define z rdi
37#define x rsi
38
39// A zero register
40
41#define zero rbp
42#define zeroe ebp
43
44// mulpadd i, j adds rdx * x[i] into the window at the i+j point
// The caller loads rdx beforehand (with x[j] at every use in this file).
// The window is the eight registers r8..r15 used cyclically: (i+j) % 8
// selects the register that receives the low half of the product, and the
// next register (wrapping from r15 back to r8) receives the high half.
// The low half is added on the CF chain (adcx) and the high half on the
// OF chain (adox), so consecutive invocations keep two independent carry
// chains going. Clobbers rax and rcx.
45
46.macro mulpadd arg1,arg2
47 mulx rcx, rax, [x+8*\arg1] // rcx:rax = rdx * x[arg1] (high:low), flags untouched
48.if ((\arg1 + \arg2) % 8 == 0)
49 adcx r8, rax
50 adox r9, rcx
51.elseif ((\arg1 + \arg2) % 8 == 1)
52 adcx r9, rax
53 adox r10, rcx
54.elseif ((\arg1 + \arg2) % 8 == 2)
55 adcx r10, rax
56 adox r11, rcx
57.elseif ((\arg1 + \arg2) % 8 == 3)
58 adcx r11, rax
59 adox r12, rcx
60.elseif ((\arg1 + \arg2) % 8 == 4)
61 adcx r12, rax
62 adox r13, rcx
63.elseif ((\arg1 + \arg2) % 8 == 5)
64 adcx r13, rax
65 adox r14, rcx
66.elseif ((\arg1 + \arg2) % 8 == 6)
67 adcx r14, rax
68 adox r15, rcx
69.elseif ((\arg1 + \arg2) % 8 == 7)
70 adcx r15, rax
71 adox r8, rcx // window wraps: word after r15 is r8
72.endif
73
74.endm
75
76// mulpade i, j adds rdx * x[i] into the window at i+j
77// but re-creates the top word assuming nothing to add there
// Unlike mulpadd, mulx writes the high half straight into the next window
// register, discarding its previous contents, so no rcx temporary is used.
// The low half is added on the CF chain (adcx); the pending OF carry is
// then folded into the fresh high word by an adox with the zero register
// (rbp, which the caller must have cleared). Clobbers rax.
78
79.macro mulpade arg1,arg2
80.if ((\arg1 + \arg2) % 8 == 0)
81 mulx r9, rax, [x+8*\arg1] // r9 is overwritten with the high half
82 adcx r8, rax
83 adox r9, zero
84.elseif ((\arg1 + \arg2) % 8 == 1)
85 mulx r10, rax, [x+8*\arg1]
86 adcx r9, rax
87 adox r10, zero
88.elseif ((\arg1 + \arg2) % 8 == 2)
89 mulx r11, rax, [x+8*\arg1]
90 adcx r10, rax
91 adox r11, zero
92.elseif ((\arg1 + \arg2) % 8 == 3)
93 mulx r12, rax, [x+8*\arg1]
94 adcx r11, rax
95 adox r12, zero
96.elseif ((\arg1 + \arg2) % 8 == 4)
97 mulx r13, rax, [x+8*\arg1]
98 adcx r12, rax
99 adox r13, zero
100.elseif ((\arg1 + \arg2) % 8 == 5)
101 mulx r14, rax, [x+8*\arg1]
102 adcx r13, rax
103 adox r14, zero
104.elseif ((\arg1 + \arg2) % 8 == 6)
105 mulx r15, rax, [x+8*\arg1]
106 adcx r14, rax
107 adox r15, zero
108.elseif ((\arg1 + \arg2) % 8 == 7)
109 mulx r8, rax, [x+8*\arg1] // window wraps: word after r15 is r8
110 adcx r15, rax
111 adox r8, zero
112.endif
113
114.endm
115
116.macro diagonals
// Computes z[0..15] = x[0..7]^2 in two phases. First the 28 off-diagonal
// products x[i]*x[j] with i > j are summed once each (a two-digit tag "ij"
// in the comments below denotes x[i]*x[j]); the low words of that sum are
// parked in z[1]..z[8] and the upper words stay in r9..r14. Then the sum
// is doubled and the squares x[i]^2 are added in, producing the final z.
// Scratch: rax, rcx, rdx, r8..r15, and rbp (as the zero register).
117
118 xor zeroe, zeroe // zero = 0; also clears CF and OF
119
120// First row = 10 + 20 + 30 + 40 + 50 + 60 + 70. Words 1 and 2 are written
// back to z straight away; words 3..8 remain in r11..r15 and r8.
121
122 mov rdx, [x]
123 mulx rax, r9, [x+8]
124 mov [z+8], r9
125 mulx rcx, r10, [x+16]
126 adcx r10, rax
127 mov [z+16], r10
128 mulx rax, r11, [x+24]
129 adcx r11, rcx
130 mulx rcx, r12, [x+32]
131 adcx r12, rax
132 mulx rax, r13, [x+40]
133 adcx r13, rcx
134 mulx rcx, r14, [x+48]
135 adcx r14, rax
136 mulx r8, r15, [x+56]
137 adcx r15, rcx
138 adcx r8, zero // absorb the last carry into word 8
139
140// Add in the next diagonal = 21 + 31 + 41 + 51 + 61 + 71 + 54
141
142 xor zeroe, zeroe // restart both carry chains
143 mov rdx, [x+8]
144 mulpadd 2, 1
145 mov [z+24], r11 // word 3 of the off-diagonal sum is complete
146 mulpadd 3, 1
147 mov [z+32], r12 // word 4 is complete
148 mulpadd 4, 1
149 mulpadd 5, 1
150 mulpadd 6, 1
151 mulpade 7, 1 // creates word 9 in r9
152 mov rdx, [x+32]
153 mulpade 5, 4 // creates word 10 in r10
154 adcx r10, zero // absorb the remaining CF carry into the new top word
155
156// And the next one = 32 + 42 + 52 + 62 + 72 + 64 + 65
157
158 xor zeroe, zeroe
159 mov rdx, [x+16]
160 mulpadd 3, 2
161 mov [z+40], r13 // word 5 is complete
162 mulpadd 4, 2
163 mov [z+48], r14 // word 6 is complete
164 mulpadd 5, 2
165 mulpadd 6, 2
166 mulpadd 7, 2
167 mov rdx, [x+48]
168 mulpade 4, 6 // creates word 11 in r11
169 mulpade 5, 6 // creates word 12 in r12
170 adcx r12, zero
171
172// And the final one = 43 + 53 + 63 + 73 + 74 + 75 + 76
173
174 xor zeroe, zeroe
175 mov rdx, [x+24]
176 mulpadd 4, 3
177 mov [z+56], r15 // word 7 is complete
178 mulpadd 5, 3
179 mov [z+64], r8 // word 8 is complete
180 mulpadd 6, 3
181 mulpadd 7, 3
182 mov rdx, [x+56]
183 mulpadd 4, 7
184 mulpade 5, 7 // creates word 13 in r13
185 mulpade 6, 7 // creates word 14 in r14
186 adcx r14, zero
187
188// Double and add things; use z[1]..z[8] and thereafter the registers
189// r9..r15 which haven't been written back yet
// Each word of the off-diagonal sum is doubled with "adcx w, w" (the CF
// chain carries the bit shifted out) and the matching word of x[i]^2 is
// added with adox (OF chain). r15 is reloaded below as the high half of
// x[7]^2 rather than holding a word of the off-diagonal sum.
190
191 xor zeroe, zeroe
192 mov rdx, [x]
193 mulx rcx, rax, rdx // rcx:rax = x[0]^2
194 mov [z], rax
195 mov rax, [z+8]
196 adcx rax, rax
197 adox rax, rcx
198 mov [z+8], rax
199
200 mov rax, [z+16]
201 mov rdx, [x+8]
202 mulx rcx, rdx, rdx // rcx:rdx = x[1]^2
203 adcx rax, rax
204 adox rax, rdx
205 mov [z+16], rax
206 mov rax, [z+24]
207 adcx rax, rax
208 adox rax, rcx
209 mov [z+24], rax
210
211 mov rax, [z+32]
212 mov rdx, [x+16]
213 mulx rcx, rdx, rdx // rcx:rdx = x[2]^2
214 adcx rax, rax
215 adox rax, rdx
216 mov [z+32], rax
217 mov rax, [z+40]
218 adcx rax, rax
219 adox rax, rcx
220 mov [z+40], rax
221
222 mov rax, [z+48]
223 mov rdx, [x+24]
224 mulx rcx, rdx, rdx // rcx:rdx = x[3]^2
225 adcx rax, rax
226 adox rax, rdx
227 mov [z+48], rax
228 mov rax, [z+56]
229 adcx rax, rax
230 adox rax, rcx
231 mov [z+56], rax
232
233 mov rax, [z+64]
234 mov rdx, [x+32]
235 mulx rcx, rdx, rdx // rcx:rdx = x[4]^2
236 adcx rax, rax
237 adox rax, rdx
238 mov [z+64], rax
239 adcx r9, r9 // from here on the words come from registers
240 adox r9, rcx
241 mov [z+72], r9
242
243 mov rdx, [x+40]
244 mulx rcx, rdx, rdx // rcx:rdx = x[5]^2
245 adcx r10, r10
246 adox r10, rdx
247 mov [z+80], r10
248 adcx r11, r11
249 adox r11, rcx
250 mov [z+88], r11
251
252 mov rdx, [x+48]
253 mulx rcx, rdx, rdx // rcx:rdx = x[6]^2
254 adcx r12, r12
255 adox r12, rdx
256 mov [z+96], r12
257 adcx r13, r13
258 adox r13, rcx
259 mov [z+104], r13
260
261 mov rdx, [x+56]
262 mulx r15, rdx, rdx // r15:rdx = x[7]^2
263 adcx r14, r14
264 adox r14, rdx
265 mov [z+112], r14
266 adcx r15, zero // fold the outstanding CF carry into the top word
267 adox r15, zero // and likewise the outstanding OF carry
268 mov [z+120], r15
269
270.endm
271
272
273S2N_BN_SYMBOL(bignum_sqr_8_16):
// Entry point: writes the 16-word square of the 8-word input x to z.
// The body is the diagonals macro; this wrapper only handles ABI
// argument placement and saving/restoring callee-saved registers.
// NOTE(review): _CET_ENDBR comes from s2n_bignum_internal.h; presumably
// it emits the CET landing pad (endbr64) when enabled -- confirm there.
274 _CET_ENDBR
275
276#if WINDOWS_ABI
// Microsoft x64 passes z, x in rcx, rdx. rdi and rsi are callee-saved
// under that ABI, so preserve them before loading the arguments.
277 push rdi
278 push rsi
279 mov rdi, rcx
280 mov rsi, rdx
281#endif
282
283// Save the callee-saved registers that the diagonals macro uses as scratch
// (rbp as the zero register, r12..r15 as part of the r8..r15 window)
284
285 push rbp
286 push r12
287 push r13
288 push r14
289 push r15
290
291// Do the squaring
292
293 diagonals
294
295// Real epilog: restore in the reverse order of the pushes above
296
297 pop r15
298 pop r14
299 pop r13
300 pop r12
301 pop rbp
302
303#if WINDOWS_ABI
304 pop rsi
305 pop rdi
306#endif
307 ret
308
309#if defined(__linux__) && defined(__ELF__)
310.section .note.GNU-stack,"",%progbits
311#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16_alt.S
index ac0b6f96c2..cb10ba2a12 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16_alt.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16_alt.S
@@ -1,3 +1,5 @@
1// $OpenBSD: bignum_sqr_8_16_alt.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
2//
1// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// 4//
3// Permission to use, copy, modify, and/or distribute this software for any 5// Permission to use, copy, modify, and/or distribute this software for any
@@ -16,7 +18,8 @@
16// Square, z := x^2 18// Square, z := x^2
17// Input x[8]; output z[16] 19// Input x[8]; output z[16]
18// 20//
19// extern void bignum_sqr_8_16_alt (uint64_t z[static 16], uint64_t x[static 8]); 21// extern void bignum_sqr_8_16_alt(uint64_t z[static 16],
22// const uint64_t x[static 8]);
20// 23//
21// Standard x86-64 ABI: RDI = z, RSI = x 24// Standard x86-64 ABI: RDI = z, RSI = x
22// Microsoft x64 ABI: RCX = z, RDX = x 25// Microsoft x64 ABI: RCX = z, RDX = x
@@ -103,7 +106,7 @@
103 adc c, 0 106 adc c, 0
104 107
105S2N_BN_SYMBOL(bignum_sqr_8_16_alt): 108S2N_BN_SYMBOL(bignum_sqr_8_16_alt):
106 _CET_ENDBR 109 _CET_ENDBR
107 110
108#if WINDOWS_ABI 111#if WINDOWS_ABI
109 push rdi 112 push rdi
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sub.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sub.S
index 3ff8a30510..7324d3a71e 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_sub.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sub.S
@@ -1,3 +1,5 @@
1// $OpenBSD: bignum_sub.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
2//
1// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// 4//
3// Permission to use, copy, modify, and/or distribute this software for any 5// Permission to use, copy, modify, and/or distribute this software for any
@@ -16,9 +18,8 @@
16// Subtract, z := x - y 18// Subtract, z := x - y
17// Inputs x[m], y[n]; outputs function return (carry-out) and z[p] 19// Inputs x[m], y[n]; outputs function return (carry-out) and z[p]
18// 20//
19// extern uint64_t bignum_sub 21// extern uint64_t bignum_sub(uint64_t p, uint64_t *z, uint64_t m,
20// (uint64_t p, uint64_t *z, 22// const uint64_t *x, uint64_t n, const uint64_t *y);
21// uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
22// 23//
23// Does the z := x - y operation, truncating modulo p words in general and 24// Does the z := x - y operation, truncating modulo p words in general and
24// returning a top borrow (0 or 1) in the p'th place, only subtracting input 25// returning a top borrow (0 or 1) in the p'th place, only subtracting input
@@ -49,7 +50,7 @@
49 50
50 51
51S2N_BN_SYMBOL(bignum_sub): 52S2N_BN_SYMBOL(bignum_sub):
52 _CET_ENDBR 53 _CET_ENDBR
53 54
54#if WINDOWS_ABI 55#if WINDOWS_ABI
55 push rdi 56 push rdi
@@ -75,7 +76,7 @@ S2N_BN_SYMBOL(bignum_sub):
75 cmp p, n 76 cmp p, n
76 cmovc n, p 77 cmovc n, p
77 cmp m, n 78 cmp m, n
78 jc ylonger 79 jc bignum_sub_ylonger
79 80
80// The case where x is longer or of the same size (p >= m >= n) 81// The case where x is longer or of the same size (p >= m >= n)
81 82
@@ -83,32 +84,32 @@ S2N_BN_SYMBOL(bignum_sub):
83 sub m, n 84 sub m, n
84 inc m 85 inc m
85 test n, n 86 test n, n
86 jz xtest 87 jz bignum_sub_xtest
87xmainloop: 88bignum_sub_xmainloop:
88 mov a, [x+8*i] 89 mov a, [x+8*i]
89 sbb a, [y+8*i] 90 sbb a, [y+8*i]
90 mov [z+8*i],a 91 mov [z+8*i],a
91 inc i 92 inc i
92 dec n 93 dec n
93 jnz xmainloop 94 jnz bignum_sub_xmainloop
94 jmp xtest 95 jmp bignum_sub_xtest
95xtoploop: 96bignum_sub_xtoploop:
96 mov a, [x+8*i] 97 mov a, [x+8*i]
97 sbb a, 0 98 sbb a, 0
98 mov [z+8*i],a 99 mov [z+8*i],a
99 inc i 100 inc i
100xtest: 101bignum_sub_xtest:
101 dec m 102 dec m
102 jnz xtoploop 103 jnz bignum_sub_xtoploop
103 sbb a, a 104 sbb a, a
104 test p, p 105 test p, p
105 jz tailskip 106 jz bignum_sub_tailskip
106tailloop: 107bignum_sub_tailloop:
107 mov [z+8*i],a 108 mov [z+8*i],a
108 inc i 109 inc i
109 dec p 110 dec p
110 jnz tailloop 111 jnz bignum_sub_tailloop
111tailskip: 112bignum_sub_tailskip:
112 neg a 113 neg a
113#if WINDOWS_ABI 114#if WINDOWS_ABI
114 pop rsi 115 pop rsi
@@ -118,29 +119,29 @@ tailskip:
118 119
119// The case where y is longer (p >= n > m) 120// The case where y is longer (p >= n > m)
120 121
121ylonger: 122bignum_sub_ylonger:
122 123
123 sub p, n 124 sub p, n
124 sub n, m 125 sub n, m
125 test m, m 126 test m, m
126 jz ytoploop 127 jz bignum_sub_ytoploop
127ymainloop: 128bignum_sub_ymainloop:
128 mov a, [x+8*i] 129 mov a, [x+8*i]
129 sbb a, [y+8*i] 130 sbb a, [y+8*i]
130 mov [z+8*i],a 131 mov [z+8*i],a
131 inc i 132 inc i
132 dec m 133 dec m
133 jnz ymainloop 134 jnz bignum_sub_ymainloop
134ytoploop: 135bignum_sub_ytoploop:
135 mov ashort, 0 136 mov ashort, 0
136 sbb a, [y+8*i] 137 sbb a, [y+8*i]
137 mov [z+8*i],a 138 mov [z+8*i],a
138 inc i 139 inc i
139 dec n 140 dec n
140 jnz ytoploop 141 jnz bignum_sub_ytoploop
141 sbb a, a 142 sbb a, a
142 test p, p 143 test p, p
143 jnz tailloop 144 jnz bignum_sub_tailloop
144 neg a 145 neg a
145#if WINDOWS_ABI 146#if WINDOWS_ABI
146 pop rsi 147 pop rsi
diff --git a/src/lib/libcrypto/bn/arch/amd64/bn_arch.c b/src/lib/libcrypto/bn/arch/amd64/bn_arch.c
index a377a05681..9ff8920ca2 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bn_arch.c
+++ b/src/lib/libcrypto/bn/arch/amd64/bn_arch.c
@@ -1,4 +1,4 @@
1/* $OpenBSD: bn_arch.c,v 1.7 2023/06/24 16:01:44 jsing Exp $ */ 1/* $OpenBSD: bn_arch.c,v 1.12 2025/08/14 15:29:17 jsing Exp $ */
2/* 2/*
3 * Copyright (c) 2023 Joel Sing <jsing@openbsd.org> 3 * Copyright (c) 2023 Joel Sing <jsing@openbsd.org>
4 * 4 *
@@ -19,6 +19,7 @@
19 19
20#include "bn_arch.h" 20#include "bn_arch.h"
21#include "bn_local.h" 21#include "bn_local.h"
22#include "crypto_arch.h"
22#include "s2n_bignum.h" 23#include "s2n_bignum.h"
23 24
24#ifdef HAVE_BN_ADD 25#ifdef HAVE_BN_ADD
@@ -26,8 +27,8 @@ BN_ULONG
26bn_add(BN_ULONG *r, int r_len, const BN_ULONG *a, int a_len, const BN_ULONG *b, 27bn_add(BN_ULONG *r, int r_len, const BN_ULONG *a, int a_len, const BN_ULONG *b,
27 int b_len) 28 int b_len)
28{ 29{
29 return bignum_add(r_len, (uint64_t *)r, a_len, (uint64_t *)a, 30 return bignum_add(r_len, (uint64_t *)r, a_len, (const uint64_t *)a,
30 b_len, (uint64_t *)b); 31 b_len, (const uint64_t *)b);
31} 32}
32#endif 33#endif
33 34
@@ -36,8 +37,8 @@ bn_add(BN_ULONG *r, int r_len, const BN_ULONG *a, int a_len, const BN_ULONG *b,
36BN_ULONG 37BN_ULONG
37bn_add_words(BN_ULONG *rd, const BN_ULONG *ad, const BN_ULONG *bd, int n) 38bn_add_words(BN_ULONG *rd, const BN_ULONG *ad, const BN_ULONG *bd, int n)
38{ 39{
39 return bignum_add(n, (uint64_t *)rd, n, (uint64_t *)ad, n, 40 return bignum_add(n, (uint64_t *)rd, n, (const uint64_t *)ad, n,
40 (uint64_t *)bd); 41 (const uint64_t *)bd);
41} 42}
42#endif 43#endif
43 44
@@ -46,8 +47,8 @@ BN_ULONG
46bn_sub(BN_ULONG *r, int r_len, const BN_ULONG *a, int a_len, const BN_ULONG *b, 47bn_sub(BN_ULONG *r, int r_len, const BN_ULONG *a, int a_len, const BN_ULONG *b,
47 int b_len) 48 int b_len)
48{ 49{
49 return bignum_sub(r_len, (uint64_t *)r, a_len, (uint64_t *)a, 50 return bignum_sub(r_len, (uint64_t *)r, a_len, (const uint64_t *)a,
50 b_len, (uint64_t *)b); 51 b_len, (const uint64_t *)b);
51} 52}
52#endif 53#endif
53 54
@@ -55,8 +56,28 @@ bn_sub(BN_ULONG *r, int r_len, const BN_ULONG *a, int a_len, const BN_ULONG *b,
55BN_ULONG 56BN_ULONG
56bn_sub_words(BN_ULONG *rd, const BN_ULONG *ad, const BN_ULONG *bd, int n) 57bn_sub_words(BN_ULONG *rd, const BN_ULONG *ad, const BN_ULONG *bd, int n)
57{ 58{
58 return bignum_sub(n, (uint64_t *)rd, n, (uint64_t *)ad, n, 59 return bignum_sub(n, (uint64_t *)rd, n, (const uint64_t *)ad, n,
59 (uint64_t *)bd); 60 (const uint64_t *)bd);
61}
62#endif
63
64#ifdef HAVE_BN_MOD_ADD_WORDS
/*
 * bn_mod_add_words() forwards to the s2n-bignum routine bignum_modadd(),
 * passing n as the word count and viewing the BN_ULONG arrays as uint64_t
 * arrays (r is the output; a, b and m are read-only).
 * NOTE(review): presumably r = (a + b) mod m for inputs already reduced
 * modulo m -- confirm against the bignum_modadd() contract in s2n_bignum.h.
 */
65void
66bn_mod_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
67 const BN_ULONG *m, size_t n)
68{
69 bignum_modadd(n, (uint64_t *)r, (const uint64_t *)a,
70 (const uint64_t *)b, (const uint64_t *)m);
71}
72#endif
73
74#ifdef HAVE_BN_MOD_SUB_WORDS
75void
76bn_mod_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
77 const BN_ULONG *m, size_t n)
78{
79 bignum_modsub(n, (uint64_t *)r, (const uint64_t *)a,
80 (const uint64_t *)b, (const uint64_t *)m);
60} 81}
61#endif 82#endif
62 83
@@ -64,7 +85,7 @@ bn_sub_words(BN_ULONG *rd, const BN_ULONG *ad, const BN_ULONG *bd, int n)
64BN_ULONG 85BN_ULONG
65bn_mul_add_words(BN_ULONG *rd, const BN_ULONG *ad, int num, BN_ULONG w) 86bn_mul_add_words(BN_ULONG *rd, const BN_ULONG *ad, int num, BN_ULONG w)
66{ 87{
67 return bignum_cmadd(num, (uint64_t *)rd, w, num, (uint64_t *)ad); 88 return bignum_cmadd(num, (uint64_t *)rd, w, num, (const uint64_t *)ad);
68} 89}
69#endif 90#endif
70 91
@@ -72,25 +93,52 @@ bn_mul_add_words(BN_ULONG *rd, const BN_ULONG *ad, int num, BN_ULONG w)
72BN_ULONG 93BN_ULONG
73bn_mul_words(BN_ULONG *rd, const BN_ULONG *ad, int num, BN_ULONG w) 94bn_mul_words(BN_ULONG *rd, const BN_ULONG *ad, int num, BN_ULONG w)
74{ 95{
75 return bignum_cmul(num, (uint64_t *)rd, w, num, (uint64_t *)ad); 96 return bignum_cmul(num, (uint64_t *)rd, w, num, (const uint64_t *)ad);
76} 97}
77#endif 98#endif
78 99
79#ifdef HAVE_BN_MUL_COMBA4 100#ifdef HAVE_BN_MUL_COMBA4
80void 101void
81bn_mul_comba4(BN_ULONG *rd, BN_ULONG *ad, BN_ULONG *bd) 102bn_mul_comba4(BN_ULONG *rd, const BN_ULONG *ad, const BN_ULONG *bd)
82{ 103{
83 /* XXX - consider using non-alt on CPUs that have the ADX extension. */ 104 if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_ADX) != 0) {
84 bignum_mul_4_8_alt((uint64_t *)rd, (uint64_t *)ad, (uint64_t *)bd); 105 bignum_mul_4_8((uint64_t *)rd, (const uint64_t *)ad,
106 (const uint64_t *)bd);
107 return;
108 }
109
110 bignum_mul_4_8_alt((uint64_t *)rd, (const uint64_t *)ad,
111 (const uint64_t *)bd);
112}
113#endif
114
115#ifdef HAVE_BN_MUL_COMBA6
116void
117bn_mul_comba6(BN_ULONG *rd, const BN_ULONG *ad, const BN_ULONG *bd)
118{
119 if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_ADX) != 0) {
120 bignum_mul_6_12((uint64_t *)rd, (const uint64_t *)ad,
121 (const uint64_t *)bd);
122 return;
123 }
124
125 bignum_mul_6_12_alt((uint64_t *)rd, (const uint64_t *)ad,
126 (const uint64_t *)bd);
85} 127}
86#endif 128#endif
87 129
88#ifdef HAVE_BN_MUL_COMBA8 130#ifdef HAVE_BN_MUL_COMBA8
89void 131void
90bn_mul_comba8(BN_ULONG *rd, BN_ULONG *ad, BN_ULONG *bd) 132bn_mul_comba8(BN_ULONG *rd, const BN_ULONG *ad, const BN_ULONG *bd)
91{ 133{
92 /* XXX - consider using non-alt on CPUs that have the ADX extension. */ 134 if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_ADX) != 0) {
93 bignum_mul_8_16_alt((uint64_t *)rd, (uint64_t *)ad, (uint64_t *)bd); 135 bignum_mul_8_16((uint64_t *)rd, (const uint64_t *)ad,
136 (const uint64_t *)bd);
137 return;
138 }
139
140 bignum_mul_8_16_alt((uint64_t *)rd, (const uint64_t *)ad,
141 (const uint64_t *)bd);
94} 142}
95#endif 143#endif
96 144
@@ -98,7 +146,7 @@ bn_mul_comba8(BN_ULONG *rd, BN_ULONG *ad, BN_ULONG *bd)
98int 146int
99bn_sqr(BIGNUM *r, const BIGNUM *a, int r_len, BN_CTX *ctx) 147bn_sqr(BIGNUM *r, const BIGNUM *a, int r_len, BN_CTX *ctx)
100{ 148{
101 bignum_sqr(r_len, (uint64_t *)r->d, a->top, (uint64_t *)a->d); 149 bignum_sqr(r_len, (uint64_t *)r->d, a->top, (const uint64_t *)a->d);
102 150
103 return 1; 151 return 1;
104} 152}
@@ -108,8 +156,25 @@ bn_sqr(BIGNUM *r, const BIGNUM *a, int r_len, BN_CTX *ctx)
108void 156void
109bn_sqr_comba4(BN_ULONG *rd, const BN_ULONG *ad) 157bn_sqr_comba4(BN_ULONG *rd, const BN_ULONG *ad)
110{ 158{
111 /* XXX - consider using non-alt on CPUs that have the ADX extension. */ 159 if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_ADX) != 0) {
112 bignum_sqr_4_8_alt((uint64_t *)rd, (uint64_t *)ad); 160 bignum_sqr_4_8((uint64_t *)rd, (const uint64_t *)ad);
161 return;
162 }
163
164 bignum_sqr_4_8_alt((uint64_t *)rd, (const uint64_t *)ad);
165}
166#endif
167
168#ifdef HAVE_BN_SQR_COMBA6
169void
170bn_sqr_comba6(BN_ULONG *rd, const BN_ULONG *ad)
171{
172 if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_ADX) != 0) {
173 bignum_sqr_6_12((uint64_t *)rd, (const uint64_t *)ad);
174 return;
175 }
176
177 bignum_sqr_6_12_alt((uint64_t *)rd, (const uint64_t *)ad);
113} 178}
114#endif 179#endif
115 180
@@ -117,8 +182,12 @@ bn_sqr_comba4(BN_ULONG *rd, const BN_ULONG *ad)
117void 182void
118bn_sqr_comba8(BN_ULONG *rd, const BN_ULONG *ad) 183bn_sqr_comba8(BN_ULONG *rd, const BN_ULONG *ad)
119{ 184{
120 /* XXX - consider using non-alt on CPUs that have the ADX extension. */ 185 if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_ADX) != 0) {
121 bignum_sqr_8_16_alt((uint64_t *)rd, (uint64_t *)ad); 186 bignum_sqr_8_16((uint64_t *)rd, (const uint64_t *)ad);
187 return;
188 }
189
190 bignum_sqr_8_16_alt((uint64_t *)rd, (const uint64_t *)ad);
122} 191}
123#endif 192#endif
124 193
diff --git a/src/lib/libcrypto/bn/arch/amd64/bn_arch.h b/src/lib/libcrypto/bn/arch/amd64/bn_arch.h
index 927cd75208..7359f993a7 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bn_arch.h
+++ b/src/lib/libcrypto/bn/arch/amd64/bn_arch.h
@@ -1,4 +1,4 @@
1/* $OpenBSD: bn_arch.h,v 1.14 2024/03/26 06:09:25 jsing Exp $ */ 1/* $OpenBSD: bn_arch.h,v 1.16 2025/08/14 15:22:54 jsing Exp $ */
2/* 2/*
3 * Copyright (c) 2023 Joel Sing <jsing@openbsd.org> 3 * Copyright (c) 2023 Joel Sing <jsing@openbsd.org>
4 * 4 *
@@ -27,13 +27,18 @@
27 27
28#define HAVE_BN_DIV_WORDS 28#define HAVE_BN_DIV_WORDS
29 29
30#define HAVE_BN_MOD_ADD_WORDS
31#define HAVE_BN_MOD_SUB_WORDS
32
30#define HAVE_BN_MUL_ADD_WORDS 33#define HAVE_BN_MUL_ADD_WORDS
31#define HAVE_BN_MUL_COMBA4 34#define HAVE_BN_MUL_COMBA4
35#define HAVE_BN_MUL_COMBA6
32#define HAVE_BN_MUL_COMBA8 36#define HAVE_BN_MUL_COMBA8
33#define HAVE_BN_MUL_WORDS 37#define HAVE_BN_MUL_WORDS
34 38
35#define HAVE_BN_SQR 39#define HAVE_BN_SQR
36#define HAVE_BN_SQR_COMBA4 40#define HAVE_BN_SQR_COMBA4
41#define HAVE_BN_SQR_COMBA6
37#define HAVE_BN_SQR_COMBA8 42#define HAVE_BN_SQR_COMBA8
38 43
39#define HAVE_BN_SUB 44#define HAVE_BN_SUB
diff --git a/src/lib/libcrypto/bn/arch/amd64/word_clz.S b/src/lib/libcrypto/bn/arch/amd64/word_clz.S
index 3926fcd4b0..705fbdbbda 100644
--- a/src/lib/libcrypto/bn/arch/amd64/word_clz.S
+++ b/src/lib/libcrypto/bn/arch/amd64/word_clz.S
@@ -1,3 +1,5 @@
1// $OpenBSD: word_clz.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
2//
1// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// 4//
3// Permission to use, copy, modify, and/or distribute this software for any 5// Permission to use, copy, modify, and/or distribute this software for any
@@ -16,7 +18,7 @@
16// Count leading zero bits in a single word 18// Count leading zero bits in a single word
17// Input a; output function return 19// Input a; output function return
18// 20//
19// extern uint64_t word_clz (uint64_t a); 21// extern uint64_t word_clz(uint64_t a);
20// 22//
21// Standard x86-64 ABI: RDI = a, returns RAX 23// Standard x86-64 ABI: RDI = a, returns RAX
22// Microsoft x64 ABI: RCX = a, returns RAX 24// Microsoft x64 ABI: RCX = a, returns RAX
@@ -30,7 +32,7 @@
30 .text 32 .text
31 33
32S2N_BN_SYMBOL(word_clz): 34S2N_BN_SYMBOL(word_clz):
33 _CET_ENDBR 35 _CET_ENDBR
34 36
35#if WINDOWS_ABI 37#if WINDOWS_ABI
36 push rdi 38 push rdi
diff --git a/src/lib/libcrypto/bn/bn_internal.h b/src/lib/libcrypto/bn/bn_internal.h
index a1f1515b57..8b5145e225 100644
--- a/src/lib/libcrypto/bn/bn_internal.h
+++ b/src/lib/libcrypto/bn/bn_internal.h
@@ -1,4 +1,4 @@
1/* $OpenBSD: bn_internal.h,v 1.19 2025/05/25 05:12:05 jsing Exp $ */ 1/* $OpenBSD: bn_internal.h,v 1.20 2025/08/02 16:20:00 jsing Exp $ */
2/* 2/*
3 * Copyright (c) 2023 Joel Sing <jsing@openbsd.org> 3 * Copyright (c) 2023 Joel Sing <jsing@openbsd.org>
4 * 4 *
@@ -41,6 +41,8 @@ void bn_mod_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
41 const BN_ULONG *m, size_t n); 41 const BN_ULONG *m, size_t n);
42void bn_mod_mul_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, 42void bn_mod_mul_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
43 const BN_ULONG *m, BN_ULONG *t, BN_ULONG m0, size_t n); 43 const BN_ULONG *m, BN_ULONG *t, BN_ULONG m0, size_t n);
44void bn_mod_sqr_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *m,
45 BN_ULONG *t, BN_ULONG m0, size_t n);
44 46
45void bn_montgomery_multiply_words(BN_ULONG *rp, const BN_ULONG *ap, 47void bn_montgomery_multiply_words(BN_ULONG *rp, const BN_ULONG *ap,
46 const BN_ULONG *bp, const BN_ULONG *np, BN_ULONG *tp, BN_ULONG n0, 48 const BN_ULONG *bp, const BN_ULONG *np, BN_ULONG *tp, BN_ULONG n0,
diff --git a/src/lib/libcrypto/bn/bn_local.h b/src/lib/libcrypto/bn/bn_local.h
index 0eb9a7a745..1bd4c16baf 100644
--- a/src/lib/libcrypto/bn/bn_local.h
+++ b/src/lib/libcrypto/bn/bn_local.h
@@ -1,4 +1,4 @@
1/* $OpenBSD: bn_local.h,v 1.51 2025/05/25 04:30:55 jsing Exp $ */ 1/* $OpenBSD: bn_local.h,v 1.54 2025/08/05 15:08:13 jsing Exp $ */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) 2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved. 3 * All rights reserved.
4 * 4 *
@@ -240,10 +240,12 @@ BN_ULONG bn_sub(BN_ULONG *r, int r_len, const BN_ULONG *a, int a_len,
240 const BN_ULONG *b, int b_len); 240 const BN_ULONG *b, int b_len);
241 241
242void bn_mul_normal(BN_ULONG *r, BN_ULONG *a, int na, BN_ULONG *b, int nb); 242void bn_mul_normal(BN_ULONG *r, BN_ULONG *a, int na, BN_ULONG *b, int nb);
243void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b); 243void bn_mul_comba4(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b);
244void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b); 244void bn_mul_comba6(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b);
245void bn_mul_comba8(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b);
245 246
246void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a); 247void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a);
248void bn_sqr_comba6(BN_ULONG *r, const BN_ULONG *a);
247void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a); 249void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a);
248 250
249int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, 251int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
diff --git a/src/lib/libcrypto/bn/bn_mod_words.c b/src/lib/libcrypto/bn/bn_mod_words.c
index 8971f9f306..d9aee8701a 100644
--- a/src/lib/libcrypto/bn/bn_mod_words.c
+++ b/src/lib/libcrypto/bn/bn_mod_words.c
@@ -1,4 +1,4 @@
1/* $OpenBSD: bn_mod_words.c,v 1.1 2025/05/25 04:58:32 jsing Exp $ */ 1/* $OpenBSD: bn_mod_words.c,v 1.3 2025/08/05 15:15:54 jsing Exp $ */
2/* 2/*
3 * Copyright (c) 2024 Joel Sing <jsing@openbsd.org> 3 * Copyright (c) 2024 Joel Sing <jsing@openbsd.org>
4 * 4 *
@@ -73,6 +73,42 @@ void
73bn_mod_mul_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, 73bn_mod_mul_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
74 const BN_ULONG *m, BN_ULONG *t, BN_ULONG m0, size_t n) 74 const BN_ULONG *m, BN_ULONG *t, BN_ULONG m0, size_t n)
75{ 75{
76 bn_montgomery_multiply_words(r, a, b, m, t, m0, n); 76 if (n == 4) {
77 bn_mul_comba4(t, a, b);
78 bn_montgomery_reduce_words(r, t, m, m0, n);
79 } else if (n == 6) {
80 bn_mul_comba6(t, a, b);
81 bn_montgomery_reduce_words(r, t, m, m0, n);
82 } else if (n == 8) {
83 bn_mul_comba8(t, a, b);
84 bn_montgomery_reduce_words(r, t, m, m0, n);
85 } else {
86 bn_montgomery_multiply_words(r, a, b, m, t, m0, n);
87 }
88}
89#endif
90
91/*
92 * bn_mod_sqr_words() computes r[] = (a[] * a[]) mod m[], where a, r and
93 * m are arrays of words with length n (r may be the same as a) in the
94 * Montgomery domain. The result remains in the Montgomery domain.
95 */
96#ifndef HAVE_BN_MOD_SQR_WORDS
97void
98bn_mod_sqr_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *m,
99 BN_ULONG *t, BN_ULONG m0, size_t n)
100{
101 if (n == 4) {
102 bn_sqr_comba4(t, a);
103 bn_montgomery_reduce_words(r, t, m, m0, n);
104 } else if (n == 6) {
105 bn_sqr_comba6(t, a);
106 bn_montgomery_reduce_words(r, t, m, m0, n);
107 } else if (n == 8) {
108 bn_sqr_comba8(t, a);
109 bn_montgomery_reduce_words(r, t, m, m0, n);
110 } else {
111 bn_montgomery_multiply_words(r, a, a, m, t, m0, n);
112 }
77} 113}
78#endif 114#endif
diff --git a/src/lib/libcrypto/bn/bn_mont.c b/src/lib/libcrypto/bn/bn_mont.c
index 950846fa5b..8280a8db27 100644
--- a/src/lib/libcrypto/bn/bn_mont.c
+++ b/src/lib/libcrypto/bn/bn_mont.c
@@ -1,4 +1,4 @@
1/* $OpenBSD: bn_mont.c,v 1.68 2025/05/25 05:12:05 jsing Exp $ */ 1/* $OpenBSD: bn_mont.c,v 1.69 2025/08/03 10:33:46 tb Exp $ */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) 2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved. 3 * All rights reserved.
4 * 4 *
@@ -116,6 +116,7 @@
116 * sections 3.8 and 4.2 in http://security.ece.orst.edu/koc/papers/r01rsasw.pdf 116 * sections 3.8 and 4.2 in http://security.ece.orst.edu/koc/papers/r01rsasw.pdf
117 */ 117 */
118 118
119#include <limits.h>
119#include <stdio.h> 120#include <stdio.h>
120#include <stdint.h> 121#include <stdint.h>
121#include <string.h> 122#include <string.h>
@@ -214,7 +215,7 @@ BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx)
214 goto err; 215 goto err;
215 mont->N.neg = 0; 216 mont->N.neg = 0;
216 mont->ri = ((BN_num_bits(mod) + BN_BITS2 - 1) / BN_BITS2) * BN_BITS2; 217 mont->ri = ((BN_num_bits(mod) + BN_BITS2 - 1) / BN_BITS2) * BN_BITS2;
217 if (mont->ri * 2 < mont->ri) 218 if (mont->ri > INT_MAX / 2)
218 goto err; 219 goto err;
219 220
220 /* 221 /*
diff --git a/src/lib/libcrypto/bn/bn_mul.c b/src/lib/libcrypto/bn/bn_mul.c
index bdeb9b0fe8..a30d05fb02 100644
--- a/src/lib/libcrypto/bn/bn_mul.c
+++ b/src/lib/libcrypto/bn/bn_mul.c
@@ -1,4 +1,4 @@
1/* $OpenBSD: bn_mul.c,v 1.39 2023/07/08 12:21:58 beck Exp $ */ 1/* $OpenBSD: bn_mul.c,v 1.43 2025/08/14 15:15:04 jsing Exp $ */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) 2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved. 3 * All rights reserved.
4 * 4 *
@@ -57,6 +57,7 @@
57 */ 57 */
58 58
59#include <assert.h> 59#include <assert.h>
60#include <limits.h>
60#include <stdio.h> 61#include <stdio.h>
61#include <string.h> 62#include <string.h>
62 63
@@ -73,7 +74,7 @@
73 */ 74 */
74#ifndef HAVE_BN_MUL_COMBA4 75#ifndef HAVE_BN_MUL_COMBA4
75void 76void
76bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 77bn_mul_comba4(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b)
77{ 78{
78 BN_ULONG c0, c1, c2; 79 BN_ULONG c0, c1, c2;
79 80
@@ -103,13 +104,73 @@ bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
103#endif 104#endif
104 105
105/* 106/*
107 * bn_mul_comba6() computes r[] = a[] * b[] using Comba multiplication
108 * (https://everything2.com/title/Comba+multiplication), where a and b are both
109 * six word arrays, producing a 12 word array result.
110 */
111#ifndef HAVE_BN_MUL_COMBA6
112void
113bn_mul_comba6(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b)
114{
	/*
	 * c2:c1:c0 is the running accumulator threaded through the calls.
	 * Each blank-line separated group handles one result column k: it
	 * feeds every product a[i] * b[j] with i + j == k through
	 * bn_mulw_addtw() and the last call of the group stores its low
	 * output word into r[k].
	 * NOTE(review): bn_mulw_addtw() is defined elsewhere; presumably it
	 * computes a * b plus the three-word value given by its 3rd..5th
	 * arguments and returns a three-word result -- confirm.
	 */
115 BN_ULONG c0, c1, c2;
116
	/* Column 0. */
117 bn_mulw_addtw(a[0], b[0], 0, 0, 0, &c2, &c1, &r[0]);
118
	/* Column 1. */
119 bn_mulw_addtw(a[0], b[1], 0, c2, c1, &c2, &c1, &c0);
120 bn_mulw_addtw(a[1], b[0], c2, c1, c0, &c2, &c1, &r[1]);
121
	/* Column 2. */
122 bn_mulw_addtw(a[2], b[0], 0, c2, c1, &c2, &c1, &c0);
123 bn_mulw_addtw(a[1], b[1], c2, c1, c0, &c2, &c1, &c0);
124 bn_mulw_addtw(a[0], b[2], c2, c1, c0, &c2, &c1, &r[2]);
125
	/* Column 3. */
126 bn_mulw_addtw(a[0], b[3], 0, c2, c1, &c2, &c1, &c0);
127 bn_mulw_addtw(a[1], b[2], c2, c1, c0, &c2, &c1, &c0);
128 bn_mulw_addtw(a[2], b[1], c2, c1, c0, &c2, &c1, &c0);
129 bn_mulw_addtw(a[3], b[0], c2, c1, c0, &c2, &c1, &r[3]);
130
	/* Column 4. */
131 bn_mulw_addtw(a[4], b[0], 0, c2, c1, &c2, &c1, &c0);
132 bn_mulw_addtw(a[3], b[1], c2, c1, c0, &c2, &c1, &c0);
133 bn_mulw_addtw(a[2], b[2], c2, c1, c0, &c2, &c1, &c0);
134 bn_mulw_addtw(a[1], b[3], c2, c1, c0, &c2, &c1, &c0);
135 bn_mulw_addtw(a[0], b[4], c2, c1, c0, &c2, &c1, &r[4]);
136
	/* Column 5 (the widest, with six products). */
137 bn_mulw_addtw(a[0], b[5], 0, c2, c1, &c2, &c1, &c0);
138 bn_mulw_addtw(a[1], b[4], c2, c1, c0, &c2, &c1, &c0);
139 bn_mulw_addtw(a[2], b[3], c2, c1, c0, &c2, &c1, &c0);
140 bn_mulw_addtw(a[3], b[2], c2, c1, c0, &c2, &c1, &c0);
141 bn_mulw_addtw(a[4], b[1], c2, c1, c0, &c2, &c1, &c0);
142 bn_mulw_addtw(a[5], b[0], c2, c1, c0, &c2, &c1, &r[5]);
143
	/* Column 6. */
144 bn_mulw_addtw(a[5], b[1], 0, c2, c1, &c2, &c1, &c0);
145 bn_mulw_addtw(a[4], b[2], c2, c1, c0, &c2, &c1, &c0);
146 bn_mulw_addtw(a[3], b[3], c2, c1, c0, &c2, &c1, &c0);
147 bn_mulw_addtw(a[2], b[4], c2, c1, c0, &c2, &c1, &c0);
148 bn_mulw_addtw(a[1], b[5], c2, c1, c0, &c2, &c1, &r[6]);
149
	/* Column 7. */
150 bn_mulw_addtw(a[2], b[5], 0, c2, c1, &c2, &c1, &c0);
151 bn_mulw_addtw(a[3], b[4], c2, c1, c0, &c2, &c1, &c0);
152 bn_mulw_addtw(a[4], b[3], c2, c1, c0, &c2, &c1, &c0);
153 bn_mulw_addtw(a[5], b[2], c2, c1, c0, &c2, &c1, &r[7]);
154
	/* Column 8. */
155 bn_mulw_addtw(a[5], b[3], 0, c2, c1, &c2, &c1, &c0);
156 bn_mulw_addtw(a[4], b[4], c2, c1, c0, &c2, &c1, &c0);
157 bn_mulw_addtw(a[3], b[5], c2, c1, c0, &c2, &c1, &r[8]);
158
	/* Column 9. */
159 bn_mulw_addtw(a[4], b[5], 0, c2, c1, &c2, &c1, &c0);
160 bn_mulw_addtw(a[5], b[4], c2, c1, c0, &c2, &c1, &r[9]);
161
	/* Column 10; the two remaining output words land in r[10] and r[11]. */
162 bn_mulw_addtw(a[5], b[5], 0, c2, c1, &c2, &r[11], &r[10]);
163}
164#endif
165
166/*
106 * bn_mul_comba8() computes r[] = a[] * b[] using Comba multiplication 167 * bn_mul_comba8() computes r[] = a[] * b[] using Comba multiplication
107 * (https://everything2.com/title/Comba+multiplication), where a and b are both 168 * (https://everything2.com/title/Comba+multiplication), where a and b are both
108 * eight word arrays, producing a 16 word array result. 169 * eight word arrays, producing a 16 word array result.
109 */ 170 */
110#ifndef HAVE_BN_MUL_COMBA8 171#ifndef HAVE_BN_MUL_COMBA8
111void 172void
112bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 173bn_mul_comba8(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b)
113{ 174{
114 BN_ULONG c0, c1, c2; 175 BN_ULONG c0, c1, c2;
115 176
@@ -338,14 +399,16 @@ BN_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx)
338 if (rr == NULL) 399 if (rr == NULL)
339 goto err; 400 goto err;
340 401
341 rn = a->top + b->top; 402 if (a->top > INT_MAX - b->top)
342 if (rn < a->top)
343 goto err; 403 goto err;
404 rn = a->top + b->top;
344 if (!bn_wexpand(rr, rn)) 405 if (!bn_wexpand(rr, rn))
345 goto err; 406 goto err;
346 407
347 if (a->top == 4 && b->top == 4) { 408 if (a->top == 4 && b->top == 4) {
348 bn_mul_comba4(rr->d, a->d, b->d); 409 bn_mul_comba4(rr->d, a->d, b->d);
410 } else if (a->top == 6 && b->top == 6) {
411 bn_mul_comba6(rr->d, a->d, b->d);
349 } else if (a->top == 8 && b->top == 8) { 412 } else if (a->top == 8 && b->top == 8) {
350 bn_mul_comba8(rr->d, a->d, b->d); 413 bn_mul_comba8(rr->d, a->d, b->d);
351 } else { 414 } else {
diff --git a/src/lib/libcrypto/bn/bn_sqr.c b/src/lib/libcrypto/bn/bn_sqr.c
index 0dbccbf85d..2f7f71f819 100644
--- a/src/lib/libcrypto/bn/bn_sqr.c
+++ b/src/lib/libcrypto/bn/bn_sqr.c
@@ -1,4 +1,4 @@
1/* $OpenBSD: bn_sqr.c,v 1.36 2023/07/08 12:21:58 beck Exp $ */ 1/* $OpenBSD: bn_sqr.c,v 1.38 2025/08/14 15:15:04 jsing Exp $ */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) 2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved. 3 * All rights reserved.
4 * 4 *
@@ -97,6 +97,51 @@ bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
97#endif 97#endif
98 98
99/* 99/*
100 * bn_sqr_comba6() computes r[] = a[] * a[] using Comba multiplication
101 * (https://everything2.com/title/Comba+multiplication), where a is a
102 * six word array, producing a 12 word array result.
103 */
104#ifndef HAVE_BN_SQR_COMBA6
105void
106bn_sqr_comba6(BN_ULONG *r, const BN_ULONG *a)
107{
108 BN_ULONG c2, c1, c0;
109
110 bn_mulw_addtw(a[0], a[0], 0, 0, 0, &c2, &c1, &r[0]);
111
112 bn_mul2_mulw_addtw(a[1], a[0], 0, c2, c1, &c2, &c1, &r[1]);
113
114 bn_mulw_addtw(a[1], a[1], 0, c2, c1, &c2, &c1, &c0);
115 bn_mul2_mulw_addtw(a[2], a[0], c2, c1, c0, &c2, &c1, &r[2]);
116
117 bn_mul2_mulw_addtw(a[3], a[0], 0, c2, c1, &c2, &c1, &c0);
118 bn_mul2_mulw_addtw(a[2], a[1], c2, c1, c0, &c2, &c1, &r[3]);
119
120 bn_mulw_addtw(a[2], a[2], 0, c2, c1, &c2, &c1, &c0);
121 bn_mul2_mulw_addtw(a[3], a[1], c2, c1, c0, &c2, &c1, &c0);
122 bn_mul2_mulw_addtw(a[4], a[0], c2, c1, c0, &c2, &c1, &r[4]);
123
124 bn_mul2_mulw_addtw(a[5], a[0], 0, c2, c1, &c2, &c1, &c0);
125 bn_mul2_mulw_addtw(a[4], a[1], c2, c1, c0, &c2, &c1, &c0);
126 bn_mul2_mulw_addtw(a[3], a[2], c2, c1, c0, &c2, &c1, &r[5]);
127
128 bn_mulw_addtw(a[3], a[3], 0, c2, c1, &c2, &c1, &c0);
129 bn_mul2_mulw_addtw(a[4], a[2], c2, c1, c0, &c2, &c1, &c0);
130 bn_mul2_mulw_addtw(a[5], a[1], c2, c1, c0, &c2, &c1, &r[6]);
131
132 bn_mul2_mulw_addtw(a[5], a[2], 0, c2, c1, &c2, &c1, &c0);
133 bn_mul2_mulw_addtw(a[4], a[3], c2, c1, c0, &c2, &c1, &r[7]);
134
135 bn_mulw_addtw(a[4], a[4], 0, c2, c1, &c2, &c1, &c0);
136 bn_mul2_mulw_addtw(a[5], a[3], c2, c1, c0, &c2, &c1, &r[8]);
137
138 bn_mul2_mulw_addtw(a[5], a[4], 0, c2, c1, &c2, &c1, &r[9]);
139
140 bn_mulw_addtw(a[5], a[5], 0, c2, c1, &c2, &r[11], &r[10]);
141}
142#endif
143
144/*
100 * bn_sqr_comba8() computes r[] = a[] * a[] using Comba multiplication 145 * bn_sqr_comba8() computes r[] = a[] * a[] using Comba multiplication
101 * (https://everything2.com/title/Comba+multiplication), where a is an 146 * (https://everything2.com/title/Comba+multiplication), where a is an
102 * eight word array, producing a 16 word array result. 147 * eight word array, producing a 16 word array result.
@@ -281,6 +326,8 @@ BN_sqr(BIGNUM *r, const BIGNUM *a, BN_CTX *ctx)
281 326
282 if (a->top == 4) { 327 if (a->top == 4) {
283 bn_sqr_comba4(rr->d, a->d); 328 bn_sqr_comba4(rr->d, a->d);
329 } else if (a->top == 6) {
330 bn_sqr_comba6(rr->d, a->d);
284 } else if (a->top == 8) { 331 } else if (a->top == 8) {
285 bn_sqr_comba8(rr->d, a->d); 332 bn_sqr_comba8(rr->d, a->d);
286 } else { 333 } else {
diff --git a/src/lib/libcrypto/bn/s2n_bignum.h b/src/lib/libcrypto/bn/s2n_bignum.h
index ce6e8cdc94..7d77894cdc 100644
--- a/src/lib/libcrypto/bn/s2n_bignum.h
+++ b/src/lib/libcrypto/bn/s2n_bignum.h
@@ -1,3 +1,5 @@
1// $OpenBSD: s2n_bignum.h,v 1.4 2025/08/12 10:01:37 jsing Exp $
2//
1// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// 4//
3// Permission to use, copy, modify, and/or distribute this software for any 5// Permission to use, copy, modify, and/or distribute this software for any
@@ -34,182 +36,240 @@
34// throughput, generally offering higher performance there. 36// throughput, generally offering higher performance there.
35// ---------------------------------------------------------------------------- 37// ----------------------------------------------------------------------------
36 38
39
40#if defined(_MSC_VER) || !defined(__STDC_VERSION__) || __STDC_VERSION__ < 199901L || defined(__STDC_NO_VLA__)
41#define S2N_BIGNUM_STATIC
42#else
43#define S2N_BIGNUM_STATIC static
44#endif
45
37// Add, z := x + y 46// Add, z := x + y
38// Inputs x[m], y[n]; outputs function return (carry-out) and z[p] 47// Inputs x[m], y[n]; outputs function return (carry-out) and z[p]
39extern uint64_t bignum_add (uint64_t p, uint64_t *z, uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); 48extern uint64_t bignum_add (uint64_t p, uint64_t *z, uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y);
40 49
41// Add modulo p_25519, z := (x + y) mod p_25519, assuming x and y reduced 50// Add modulo p_25519, z := (x + y) mod p_25519, assuming x and y reduced
42// Inputs x[4], y[4]; output z[4] 51// Inputs x[4], y[4]; output z[4]
43extern void bignum_add_p25519 (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); 52extern void bignum_add_p25519 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
44 53
45// Add modulo p_256, z := (x + y) mod p_256, assuming x and y reduced 54// Add modulo p_256, z := (x + y) mod p_256, assuming x and y reduced
46// Inputs x[4], y[4]; output z[4] 55// Inputs x[4], y[4]; output z[4]
47extern void bignum_add_p256 (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); 56extern void bignum_add_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
48 57
49// Add modulo p_256k1, z := (x + y) mod p_256k1, assuming x and y reduced 58// Add modulo p_256k1, z := (x + y) mod p_256k1, assuming x and y reduced
50// Inputs x[4], y[4]; output z[4] 59// Inputs x[4], y[4]; output z[4]
51extern void bignum_add_p256k1 (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); 60extern void bignum_add_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
52 61
53// Add modulo p_384, z := (x + y) mod p_384, assuming x and y reduced 62// Add modulo p_384, z := (x + y) mod p_384, assuming x and y reduced
54// Inputs x[6], y[6]; output z[6] 63// Inputs x[6], y[6]; output z[6]
55extern void bignum_add_p384 (uint64_t z[static 6], uint64_t x[static 6], uint64_t y[static 6]); 64extern void bignum_add_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]);
56 65
57// Add modulo p_521, z := (x + y) mod p_521, assuming x and y reduced 66// Add modulo p_521, z := (x + y) mod p_521, assuming x and y reduced
58// Inputs x[9], y[9]; output z[9] 67// Inputs x[9], y[9]; output z[9]
59extern void bignum_add_p521 (uint64_t z[static 9], uint64_t x[static 9], uint64_t y[static 9]); 68extern void bignum_add_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9], const uint64_t y[S2N_BIGNUM_STATIC 9]);
69
70// Add modulo p_sm2, z := (x + y) mod p_sm2, assuming x and y reduced
71// Inputs x[4], y[4]; output z[4]
72extern void bignum_add_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
60 73
61// Compute "amontification" constant z :== 2^{128k} (congruent mod m) 74// Compute "amontification" constant z :== 2^{128k} (congruent mod m)
62// Input m[k]; output z[k]; temporary buffer t[>=k] 75// Input m[k]; output z[k]; temporary buffer t[>=k]
63extern void bignum_amontifier (uint64_t k, uint64_t *z, uint64_t *m, uint64_t *t); 76extern void bignum_amontifier (uint64_t k, uint64_t *z, const uint64_t *m, uint64_t *t);
64 77
65// Almost-Montgomery multiply, z :== (x * y / 2^{64k}) (congruent mod m) 78// Almost-Montgomery multiply, z :== (x * y / 2^{64k}) (congruent mod m)
66// Inputs x[k], y[k], m[k]; output z[k] 79// Inputs x[k], y[k], m[k]; output z[k]
67extern void bignum_amontmul (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y, uint64_t *m); 80extern void bignum_amontmul (uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *y, const uint64_t *m);
68 81
69// Almost-Montgomery reduce, z :== (x' / 2^{64p}) (congruent mod m) 82// Almost-Montgomery reduce, z :== (x' / 2^{64p}) (congruent mod m)
70// Inputs x[n], m[k], p; output z[k] 83// Inputs x[n], m[k], p; output z[k]
71extern void bignum_amontredc (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x, uint64_t *m, uint64_t p); 84extern void bignum_amontredc (uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x, const uint64_t *m, uint64_t p);
72 85
73// Almost-Montgomery square, z :== (x^2 / 2^{64k}) (congruent mod m) 86// Almost-Montgomery square, z :== (x^2 / 2^{64k}) (congruent mod m)
74// Inputs x[k], m[k]; output z[k] 87// Inputs x[k], m[k]; output z[k]
75extern void bignum_amontsqr (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *m); 88extern void bignum_amontsqr (uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *m);
76 89
77// Convert 4-digit (256-bit) bignum to/from big-endian form 90// Convert 4-digit (256-bit) bignum to/from big-endian form
78// Input x[4]; output z[4] 91// Input x[4]; output z[4]
79extern void bignum_bigendian_4 (uint64_t z[static 4], uint64_t x[static 4]); 92extern void bignum_bigendian_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
80 93
81// Convert 6-digit (384-bit) bignum to/from big-endian form 94// Convert 6-digit (384-bit) bignum to/from big-endian form
82// Input x[6]; output z[6] 95// Input x[6]; output z[6]
83extern void bignum_bigendian_6 (uint64_t z[static 6], uint64_t x[static 6]); 96extern void bignum_bigendian_6 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
84 97
85// Select bitfield starting at bit n with length l <= 64 98// Select bitfield starting at bit n with length l <= 64
86// Inputs x[k], n, l; output function return 99// Inputs x[k], n, l; output function return
87extern uint64_t bignum_bitfield (uint64_t k, uint64_t *x, uint64_t n, uint64_t l); 100extern uint64_t bignum_bitfield (uint64_t k, const uint64_t *x, uint64_t n, uint64_t l);
88 101
89// Return size of bignum in bits 102// Return size of bignum in bits
90// Input x[k]; output function return 103// Input x[k]; output function return
91extern uint64_t bignum_bitsize (uint64_t k, uint64_t *x); 104extern uint64_t bignum_bitsize (uint64_t k, const uint64_t *x);
92 105
93// Divide by a single (nonzero) word, z := x / m and return x mod m 106// Divide by a single (nonzero) word, z := x / m and return x mod m
94// Inputs x[n], m; outputs function return (remainder) and z[k] 107// Inputs x[n], m; outputs function return (remainder) and z[k]
95extern uint64_t bignum_cdiv (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x, uint64_t m); 108extern uint64_t bignum_cdiv (uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x, uint64_t m);
96 109
97// Divide by a single word, z := x / m when known to be exact 110// Divide by a single word, z := x / m when known to be exact
98// Inputs x[n], m; output z[k] 111// Inputs x[n], m; output z[k]
99extern void bignum_cdiv_exact (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x, uint64_t m); 112extern void bignum_cdiv_exact (uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x, uint64_t m);
100 113
101// Count leading zero digits (64-bit words) 114// Count leading zero digits (64-bit words)
102// Input x[k]; output function return 115// Input x[k]; output function return
103extern uint64_t bignum_cld (uint64_t k, uint64_t *x); 116extern uint64_t bignum_cld (uint64_t k, const uint64_t *x);
104 117
105// Count leading zero bits 118// Count leading zero bits
106// Input x[k]; output function return 119// Input x[k]; output function return
107extern uint64_t bignum_clz (uint64_t k, uint64_t *x); 120extern uint64_t bignum_clz (uint64_t k, const uint64_t *x);
108 121
109// Multiply-add with single-word multiplier, z := z + c * y 122// Multiply-add with single-word multiplier, z := z + c * y
110// Inputs c, y[n]; outputs function return (carry-out) and z[k] 123// Inputs c, y[n]; outputs function return (carry-out) and z[k]
111extern uint64_t bignum_cmadd (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y); 124extern uint64_t bignum_cmadd (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, const uint64_t *y);
112 125
113// Negated multiply-add with single-word multiplier, z := z - c * y 126// Negated multiply-add with single-word multiplier, z := z - c * y
114// Inputs c, y[n]; outputs function return (negative carry-out) and z[k] 127// Inputs c, y[n]; outputs function return (negative carry-out) and z[k]
115extern uint64_t bignum_cmnegadd (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y); 128extern uint64_t bignum_cmnegadd (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, const uint64_t *y);
116 129
117// Find modulus of bignum w.r.t. single nonzero word m, returning x mod m 130// Find modulus of bignum w.r.t. single nonzero word m, returning x mod m
118// Input x[k], m; output function return 131// Input x[k], m; output function return
119extern uint64_t bignum_cmod (uint64_t k, uint64_t *x, uint64_t m); 132extern uint64_t bignum_cmod (uint64_t k, const uint64_t *x, uint64_t m);
120 133
121// Multiply by a single word, z := c * y 134// Multiply by a single word, z := c * y
122// Inputs c, y[n]; outputs function return (carry-out) and z[k] 135// Inputs c, y[n]; outputs function return (carry-out) and z[k]
123extern uint64_t bignum_cmul (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y); 136extern uint64_t bignum_cmul (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, const uint64_t *y);
124 137
125// Multiply by a single word modulo p_25519, z := (c * x) mod p_25519, assuming x reduced 138// Multiply by a single word modulo p_25519, z := (c * x) mod p_25519, assuming x reduced
126// Inputs c, x[4]; output z[4] 139// Inputs c, x[4]; output z[4]
127extern void bignum_cmul_p25519 (uint64_t z[static 4], uint64_t c, uint64_t x[static 4]); 140extern void bignum_cmul_p25519 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 4]);
128extern void bignum_cmul_p25519_alt (uint64_t z[static 4], uint64_t c, uint64_t x[static 4]); 141extern void bignum_cmul_p25519_alt (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 4]);
129 142
130// Multiply by a single word modulo p_256, z := (c * x) mod p_256, assuming x reduced 143// Multiply by a single word modulo p_256, z := (c * x) mod p_256, assuming x reduced
131// Inputs c, x[4]; output z[4] 144// Inputs c, x[4]; output z[4]
132extern void bignum_cmul_p256 (uint64_t z[static 4], uint64_t c, uint64_t x[static 4]); 145extern void bignum_cmul_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 4]);
133extern void bignum_cmul_p256_alt (uint64_t z[static 4], uint64_t c, uint64_t x[static 4]); 146extern void bignum_cmul_p256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 4]);
134 147
135// Multiply by a single word modulo p_256k1, z := (c * x) mod p_256k1, assuming x reduced 148// Multiply by a single word modulo p_256k1, z := (c * x) mod p_256k1, assuming x reduced
136// Inputs c, x[4]; output z[4] 149// Inputs c, x[4]; output z[4]
137extern void bignum_cmul_p256k1 (uint64_t z[static 4], uint64_t c, uint64_t x[static 4]); 150extern void bignum_cmul_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 4]);
138extern void bignum_cmul_p256k1_alt (uint64_t z[static 4], uint64_t c, uint64_t x[static 4]); 151extern void bignum_cmul_p256k1_alt (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 4]);
139 152
140// Multiply by a single word modulo p_384, z := (c * x) mod p_384, assuming x reduced 153// Multiply by a single word modulo p_384, z := (c * x) mod p_384, assuming x reduced
141// Inputs c, x[6]; output z[6] 154// Inputs c, x[6]; output z[6]
142extern void bignum_cmul_p384 (uint64_t z[static 6], uint64_t c, uint64_t x[static 6]); 155extern void bignum_cmul_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 6]);
143extern void bignum_cmul_p384_alt (uint64_t z[static 6], uint64_t c, uint64_t x[static 6]); 156extern void bignum_cmul_p384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 6]);
144 157
145// Multiply by a single word modulo p_521, z := (c * x) mod p_521, assuming x reduced 158// Multiply by a single word modulo p_521, z := (c * x) mod p_521, assuming x reduced
146// Inputs c, x[9]; output z[9] 159// Inputs c, x[9]; output z[9]
147extern void bignum_cmul_p521 (uint64_t z[static 9], uint64_t c, uint64_t x[static 9]); 160extern void bignum_cmul_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 9]);
148extern void bignum_cmul_p521_alt (uint64_t z[static 9], uint64_t c, uint64_t x[static 9]); 161extern void bignum_cmul_p521_alt (uint64_t z[S2N_BIGNUM_STATIC 9], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 9]);
162
163// Multiply by a single word modulo p_sm2, z := (c * x) mod p_sm2, assuming x reduced
164// Inputs c, x[4]; output z[4]
165extern void bignum_cmul_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 4]);
166extern void bignum_cmul_sm2_alt (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 4]);
149 167
150// Test bignums for coprimality, gcd(x,y) = 1 168// Test bignums for coprimality, gcd(x,y) = 1
151// Inputs x[m], y[n]; output function return; temporary buffer t[>=2*max(m,n)] 169// Inputs x[m], y[n]; output function return; temporary buffer t[>=2*max(m,n)]
152extern uint64_t bignum_coprime (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y, uint64_t *t); 170extern uint64_t bignum_coprime (uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y, uint64_t *t);
153 171
154// Copy bignum with zero-extension or truncation, z := x 172// Copy bignum with zero-extension or truncation, z := x
155// Input x[n]; output z[k] 173// Input x[n]; output z[k]
156extern void bignum_copy (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x); 174extern void bignum_copy (uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x);
175
176// Given table: uint64_t[height*width], copy table[idx*width...(idx+1)*width-1]
177// into z[0..width-1].
178// This function is constant-time with respect to the value of `idx`. This is
179// achieved by reading the whole table and using the bit-masking to get the
180// `idx`-th row.
181// Input table[height*width]; output z[width]
182extern void bignum_copy_row_from_table (uint64_t *z, const uint64_t *table, uint64_t height,
183 uint64_t width, uint64_t idx);
184
185// Given table: uint64_t[height*width], copy table[idx*width...(idx+1)*width-1]
186// into z[0..width-1]. width must be a multiple of 8.
187// This function is constant-time with respect to the value of `idx`. This is
188// achieved by reading the whole table and using the bit-masking to get the
189// `idx`-th row.
190// Input table[height*width]; output z[width]
191extern void bignum_copy_row_from_table_8n (uint64_t *z, const uint64_t *table,
192 uint64_t height, uint64_t width, uint64_t idx);
193
194// Given table: uint64_t[height*16], copy table[idx*16...(idx+1)*16-1] into z[0..15].
195// This function is constant-time with respect to the value of `idx`. This is
196// achieved by reading the whole table and using the bit-masking to get the
197// `idx`-th row.
198// Input table[height*16]; output z[16]
199extern void bignum_copy_row_from_table_16 (uint64_t *z, const uint64_t *table,
200 uint64_t height, uint64_t idx);
201
202// Given table: uint64_t[height*32], copy table[idx*32...(idx+1)*32-1] into z[0..31].
203// This function is constant-time with respect to the value of `idx`. This is
204// achieved by reading the whole table and using the bit-masking to get the
205// `idx`-th row.
206// Input table[height*32]; output z[32]
207extern void bignum_copy_row_from_table_32 (uint64_t *z, const uint64_t *table,
208 uint64_t height, uint64_t idx);
157 209
158// Count trailing zero digits (64-bit words) 210// Count trailing zero digits (64-bit words)
159// Input x[k]; output function return 211// Input x[k]; output function return
160extern uint64_t bignum_ctd (uint64_t k, uint64_t *x); 212extern uint64_t bignum_ctd (uint64_t k, const uint64_t *x);
161 213
162// Count trailing zero bits 214// Count trailing zero bits
163// Input x[k]; output function return 215// Input x[k]; output function return
164extern uint64_t bignum_ctz (uint64_t k, uint64_t *x); 216extern uint64_t bignum_ctz (uint64_t k, const uint64_t *x);
165 217
166// Convert from almost-Montgomery form, z := (x / 2^256) mod p_256 218// Convert from almost-Montgomery form, z := (x / 2^256) mod p_256
167// Input x[4]; output z[4] 219// Input x[4]; output z[4]
168extern void bignum_deamont_p256 (uint64_t z[static 4], uint64_t x[static 4]); 220extern void bignum_deamont_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
169extern void bignum_deamont_p256_alt (uint64_t z[static 4], uint64_t x[static 4]); 221extern void bignum_deamont_p256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
170 222
171// Convert from almost-Montgomery form, z := (x / 2^256) mod p_256k1 223// Convert from almost-Montgomery form, z := (x / 2^256) mod p_256k1
172// Input x[4]; output z[4] 224// Input x[4]; output z[4]
173extern void bignum_deamont_p256k1 (uint64_t z[static 4], uint64_t x[static 4]); 225extern void bignum_deamont_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
174 226
175// Convert from almost-Montgomery form, z := (x / 2^384) mod p_384 227// Convert from almost-Montgomery form, z := (x / 2^384) mod p_384
176// Input x[6]; output z[6] 228// Input x[6]; output z[6]
177extern void bignum_deamont_p384 (uint64_t z[static 6], uint64_t x[static 6]); 229extern void bignum_deamont_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
178extern void bignum_deamont_p384_alt (uint64_t z[static 6], uint64_t x[static 6]); 230extern void bignum_deamont_p384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
179 231
180// Convert from almost-Montgomery form z := (x / 2^576) mod p_521 232// Convert from almost-Montgomery form z := (x / 2^576) mod p_521
181// Input x[9]; output z[9] 233// Input x[9]; output z[9]
182extern void bignum_deamont_p521 (uint64_t z[static 9], uint64_t x[static 9]); 234extern void bignum_deamont_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
235
236// Convert from almost-Montgomery form z := (x / 2^256) mod p_sm2
237// Input x[4]; output z[4]
238extern void bignum_deamont_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
183 239
184// Convert from (almost-)Montgomery form z := (x / 2^{64k}) mod m 240// Convert from (almost-)Montgomery form z := (x / 2^{64k}) mod m
185// Inputs x[k], m[k]; output z[k] 241// Inputs x[k], m[k]; output z[k]
186extern void bignum_demont (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *m); 242extern void bignum_demont (uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *m);
187 243
188// Convert from Montgomery form z := (x / 2^256) mod p_256, assuming x reduced 244// Convert from Montgomery form z := (x / 2^256) mod p_256, assuming x reduced
189// Input x[4]; output z[4] 245// Input x[4]; output z[4]
190extern void bignum_demont_p256 (uint64_t z[static 4], uint64_t x[static 4]); 246extern void bignum_demont_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
191extern void bignum_demont_p256_alt (uint64_t z[static 4], uint64_t x[static 4]); 247extern void bignum_demont_p256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
192 248
193// Convert from Montgomery form z := (x / 2^256) mod p_256k1, assuming x reduced 249// Convert from Montgomery form z := (x / 2^256) mod p_256k1, assuming x reduced
194// Input x[4]; output z[4] 250// Input x[4]; output z[4]
195extern void bignum_demont_p256k1 (uint64_t z[static 4], uint64_t x[static 4]); 251extern void bignum_demont_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
196 252
197// Convert from Montgomery form z := (x / 2^384) mod p_384, assuming x reduced 253// Convert from Montgomery form z := (x / 2^384) mod p_384, assuming x reduced
198// Input x[6]; output z[6] 254// Input x[6]; output z[6]
199extern void bignum_demont_p384 (uint64_t z[static 6], uint64_t x[static 6]); 255extern void bignum_demont_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
200extern void bignum_demont_p384_alt (uint64_t z[static 6], uint64_t x[static 6]); 256extern void bignum_demont_p384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
201 257
202// Convert from Montgomery form z := (x / 2^576) mod p_521, assuming x reduced 258// Convert from Montgomery form z := (x / 2^576) mod p_521, assuming x reduced
203// Input x[9]; output z[9] 259// Input x[9]; output z[9]
204extern void bignum_demont_p521 (uint64_t z[static 9], uint64_t x[static 9]); 260extern void bignum_demont_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
261
262// Convert from Montgomery form z := (x / 2^256) mod p_sm2, assuming x reduced
263// Input x[4]; output z[4]
264extern void bignum_demont_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
205 265
206// Select digit x[n] 266// Select digit x[n]
207// Inputs x[k], n; output function return 267// Inputs x[k], n; output function return
208extern uint64_t bignum_digit (uint64_t k, uint64_t *x, uint64_t n); 268extern uint64_t bignum_digit (uint64_t k, const uint64_t *x, uint64_t n);
209 269
210// Return size of bignum in digits (64-bit word) 270// Return size of bignum in digits (64-bit word)
211// Input x[k]; output function return 271// Input x[k]; output function return
212extern uint64_t bignum_digitsize (uint64_t k, uint64_t *x); 272extern uint64_t bignum_digitsize (uint64_t k, const uint64_t *x);
213 273
214// Divide bignum by 10: z' := z div 10, returning remainder z mod 10 274// Divide bignum by 10: z' := z div 10, returning remainder z mod 10
215// Inputs z[k]; outputs function return (remainder) and z[k] 275// Inputs z[k]; outputs function return (remainder) and z[k]
@@ -217,294 +277,391 @@ extern uint64_t bignum_divmod10 (uint64_t k, uint64_t *z);
217 277
218// Double modulo p_25519, z := (2 * x) mod p_25519, assuming x reduced 278// Double modulo p_25519, z := (2 * x) mod p_25519, assuming x reduced
219// Input x[4]; output z[4] 279// Input x[4]; output z[4]
220extern void bignum_double_p25519 (uint64_t z[static 4], uint64_t x[static 4]); 280extern void bignum_double_p25519 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
221 281
222// Double modulo p_256, z := (2 * x) mod p_256, assuming x reduced 282// Double modulo p_256, z := (2 * x) mod p_256, assuming x reduced
223// Input x[4]; output z[4] 283// Input x[4]; output z[4]
224extern void bignum_double_p256 (uint64_t z[static 4], uint64_t x[static 4]); 284extern void bignum_double_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
225 285
226// Double modulo p_256k1, z := (2 * x) mod p_256k1, assuming x reduced 286// Double modulo p_256k1, z := (2 * x) mod p_256k1, assuming x reduced
227// Input x[4]; output z[4] 287// Input x[4]; output z[4]
228extern void bignum_double_p256k1 (uint64_t z[static 4], uint64_t x[static 4]); 288extern void bignum_double_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
229 289
230// Double modulo p_384, z := (2 * x) mod p_384, assuming x reduced 290// Double modulo p_384, z := (2 * x) mod p_384, assuming x reduced
231// Input x[6]; output z[6] 291// Input x[6]; output z[6]
232extern void bignum_double_p384 (uint64_t z[static 6], uint64_t x[static 6]); 292extern void bignum_double_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
233 293
234// Double modulo p_521, z := (2 * x) mod p_521, assuming x reduced 294// Double modulo p_521, z := (2 * x) mod p_521, assuming x reduced
235// Input x[9]; output z[9] 295// Input x[9]; output z[9]
236extern void bignum_double_p521 (uint64_t z[static 9], uint64_t x[static 9]); 296extern void bignum_double_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
297
298// Double modulo p_sm2, z := (2 * x) mod p_sm2, assuming x reduced
299// Input x[4]; output z[4]
300extern void bignum_double_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
237 301
238// Extended Montgomery reduce, returning results in input-output buffer 302// Extended Montgomery reduce, returning results in input-output buffer
239// Inputs z[2*k], m[k], w; outputs function return (extra result bit) and z[2*k] 303// Inputs z[2*k], m[k], w; outputs function return (extra result bit) and z[2*k]
240extern uint64_t bignum_emontredc (uint64_t k, uint64_t *z, uint64_t *m, uint64_t w); 304extern uint64_t bignum_emontredc (uint64_t k, uint64_t *z, const uint64_t *m, uint64_t w);
241 305
242// Extended Montgomery reduce in 8-digit blocks, results in input-output buffer 306// Extended Montgomery reduce in 8-digit blocks, results in input-output buffer
243// Inputs z[2*k], m[k], w; outputs function return (extra result bit) and z[2*k] 307// Inputs z[2*k], m[k], w; outputs function return (extra result bit) and z[2*k]
244extern uint64_t bignum_emontredc_8n (uint64_t k, uint64_t *z, uint64_t *m, uint64_t w); 308extern uint64_t bignum_emontredc_8n (uint64_t k, uint64_t *z, const uint64_t *m, uint64_t w);
309// Inputs z[2*k], m[k], w; outputs function return (extra result bit) and z[2*k]
310// Temporary buffer m_precalc[12*(k/4-1)]
311extern uint64_t bignum_emontredc_8n_cdiff (uint64_t k, uint64_t *z, const uint64_t *m,
312 uint64_t w, uint64_t *m_precalc);
245 313
246// Test bignums for equality, x = y 314// Test bignums for equality, x = y
247// Inputs x[m], y[n]; output function return 315// Inputs x[m], y[n]; output function return
248extern uint64_t bignum_eq (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); 316extern uint64_t bignum_eq (uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y);
249 317
250// Test bignum for even-ness 318// Test bignum for even-ness
251// Input x[k]; output function return 319// Input x[k]; output function return
252extern uint64_t bignum_even (uint64_t k, uint64_t *x); 320extern uint64_t bignum_even (uint64_t k, const uint64_t *x);
253 321
254// Convert 4-digit (256-bit) bignum from big-endian bytes 322// Convert 4-digit (256-bit) bignum from big-endian bytes
255// Input x[32] (bytes); output z[4] 323// Input x[32] (bytes); output z[4]
256extern void bignum_frombebytes_4 (uint64_t z[static 4], uint8_t x[static 32]); 324extern void bignum_frombebytes_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint8_t x[S2N_BIGNUM_STATIC 32]);
257 325
258// Convert 6-digit (384-bit) bignum from big-endian bytes 326// Convert 6-digit (384-bit) bignum from big-endian bytes
259// Input x[48] (bytes); output z[6] 327// Input x[48] (bytes); output z[6]
260extern void bignum_frombebytes_6 (uint64_t z[static 6], uint8_t x[static 48]); 328extern void bignum_frombebytes_6 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint8_t x[S2N_BIGNUM_STATIC 48]);
261 329
262// Convert 4-digit (256-bit) bignum from little-endian bytes 330// Convert 4-digit (256-bit) bignum from little-endian bytes
263// Input x[32] (bytes); output z[4] 331// Input x[32] (bytes); output z[4]
264extern void bignum_fromlebytes_4 (uint64_t z[static 4], uint8_t x[static 32]); 332extern void bignum_fromlebytes_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint8_t x[S2N_BIGNUM_STATIC 32]);
265 333
266// Convert 6-digit (384-bit) bignum from little-endian bytes 334// Convert 6-digit (384-bit) bignum from little-endian bytes
267// Input x[48] (bytes); output z[6] 335// Input x[48] (bytes); output z[6]
268extern void bignum_fromlebytes_6 (uint64_t z[static 6], uint8_t x[static 48]); 336extern void bignum_fromlebytes_6 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint8_t x[S2N_BIGNUM_STATIC 48]);
269 337
270// Convert little-endian bytes to 9-digit 528-bit bignum 338// Convert little-endian bytes to 9-digit 528-bit bignum
271// Input x[66] (bytes); output z[9] 339// Input x[66] (bytes); output z[9]
272extern void bignum_fromlebytes_p521 (uint64_t z[static 9],uint8_t x[static 66]); 340extern void bignum_fromlebytes_p521 (uint64_t z[S2N_BIGNUM_STATIC 9],const uint8_t x[S2N_BIGNUM_STATIC 66]);
273 341
274// Compare bignums, x >= y 342// Compare bignums, x >= y
275// Inputs x[m], y[n]; output function return 343// Inputs x[m], y[n]; output function return
276extern uint64_t bignum_ge (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); 344extern uint64_t bignum_ge (uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y);
277 345
278// Compare bignums, x > y 346// Compare bignums, x > y
279// Inputs x[m], y[n]; output function return 347// Inputs x[m], y[n]; output function return
280extern uint64_t bignum_gt (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); 348extern uint64_t bignum_gt (uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y);
281 349
282// Halve modulo p_256, z := (x / 2) mod p_256, assuming x reduced 350// Halve modulo p_256, z := (x / 2) mod p_256, assuming x reduced
283// Input x[4]; output z[4] 351// Input x[4]; output z[4]
284extern void bignum_half_p256 (uint64_t z[static 4], uint64_t x[static 4]); 352extern void bignum_half_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
285 353
286// Halve modulo p_256k1, z := (x / 2) mod p_256k1, assuming x reduced 354// Halve modulo p_256k1, z := (x / 2) mod p_256k1, assuming x reduced
287// Input x[4]; output z[4] 355// Input x[4]; output z[4]
288extern void bignum_half_p256k1 (uint64_t z[static 4], uint64_t x[static 4]); 356extern void bignum_half_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
289 357
290// Halve modulo p_384, z := (x / 2) mod p_384, assuming x reduced 358// Halve modulo p_384, z := (x / 2) mod p_384, assuming x reduced
291// Input x[6]; output z[6] 359// Input x[6]; output z[6]
292extern void bignum_half_p384 (uint64_t z[static 6], uint64_t x[static 6]); 360extern void bignum_half_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
293 361
294// Halve modulo p_521, z := (x / 2) mod p_521, assuming x reduced 362// Halve modulo p_521, z := (x / 2) mod p_521, assuming x reduced
295// Input x[9]; output z[9] 363// Input x[9]; output z[9]
296extern void bignum_half_p521 (uint64_t z[static 9], uint64_t x[static 9]); 364extern void bignum_half_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
365
366// Halve modulo p_sm2, z := (x / 2) mod p_sm2, assuming x reduced
367// Input x[4]; output z[4]
368extern void bignum_half_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
369
370// Modular inverse modulo p_25519 = 2^255 - 19
371// Input x[4]; output z[4]
372extern void bignum_inv_p25519(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]);
373
374// Modular inverse modulo p_256 = 2^256 - 2^224 + 2^192 + 2^96 - 1
375// Input x[4]; output z[4]
376extern void bignum_inv_p256(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]);
377
378// Modular inverse modulo p_384 = 2^384 - 2^128 - 2^96 + 2^32 - 1
379// Input x[6]; output z[6]
380extern void bignum_inv_p384(uint64_t z[S2N_BIGNUM_STATIC 6],const uint64_t x[S2N_BIGNUM_STATIC 6]);
381
382// Modular inverse modulo p_521 = 2^521 - 1
383// Input x[9]; output z[9]
384extern void bignum_inv_p521(uint64_t z[S2N_BIGNUM_STATIC 9],const uint64_t x[S2N_BIGNUM_STATIC 9]);
385
386// Modular inverse modulo p_sm2 = 2^256 - 2^224 - 2^96 + 2^64 - 1
387// Input x[4]; output z[4]
388extern void bignum_inv_sm2(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]);
389
390// Inverse square root modulo p_25519
391// Input x[4]; output function return (Legendre symbol) and z[4]
392extern int64_t bignum_invsqrt_p25519(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]);
393extern int64_t bignum_invsqrt_p25519_alt(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]);
297 394
298// Test bignum for zero-ness, x = 0 395// Test bignum for zero-ness, x = 0
299// Input x[k]; output function return 396// Input x[k]; output function return
300extern uint64_t bignum_iszero (uint64_t k, uint64_t *x); 397extern uint64_t bignum_iszero (uint64_t k, const uint64_t *x);
301 398
302// Multiply z := x * y 399// Multiply z := x * y
303// Inputs x[16], y[16]; output z[32]; temporary buffer t[>=32] 400// Inputs x[16], y[16]; output z[32]; temporary buffer t[>=32]
304extern void bignum_kmul_16_32 (uint64_t z[static 32], uint64_t x[static 16], uint64_t y[static 16], uint64_t t[static 32]); 401extern void bignum_kmul_16_32 (uint64_t z[S2N_BIGNUM_STATIC 32], const uint64_t x[S2N_BIGNUM_STATIC 16], const uint64_t y[S2N_BIGNUM_STATIC 16], uint64_t t[S2N_BIGNUM_STATIC 32]);
305 402
306// Multiply z := x * y 403// Multiply z := x * y
307// Inputs x[32], y[32]; output z[64]; temporary buffer t[>=96] 404// Inputs x[32], y[32]; output z[64]; temporary buffer t[>=96]
308extern void bignum_kmul_32_64 (uint64_t z[static 64], uint64_t x[static 32], uint64_t y[static 32], uint64_t t[static 96]); 405extern void bignum_kmul_32_64 (uint64_t z[S2N_BIGNUM_STATIC 64], const uint64_t x[S2N_BIGNUM_STATIC 32], const uint64_t y[S2N_BIGNUM_STATIC 32], uint64_t t[S2N_BIGNUM_STATIC 96]);
309 406
310// Square, z := x^2 407// Square, z := x^2
311// Input x[16]; output z[32]; temporary buffer t[>=24] 408// Input x[16]; output z[32]; temporary buffer t[>=24]
312extern void bignum_ksqr_16_32 (uint64_t z[static 32], uint64_t x[static 16], uint64_t t[static 24]); 409extern void bignum_ksqr_16_32 (uint64_t z[S2N_BIGNUM_STATIC 32], const uint64_t x[S2N_BIGNUM_STATIC 16], uint64_t t[S2N_BIGNUM_STATIC 24]);
313 410
314// Square, z := x^2 411// Square, z := x^2
315// Input x[32]; output z[64]; temporary buffer t[>=72] 412// Input x[32]; output z[64]; temporary buffer t[>=72]
316extern void bignum_ksqr_32_64 (uint64_t z[static 64], uint64_t x[static 32], uint64_t t[static 72]); 413extern void bignum_ksqr_32_64 (uint64_t z[S2N_BIGNUM_STATIC 64], const uint64_t x[S2N_BIGNUM_STATIC 32], uint64_t t[S2N_BIGNUM_STATIC 72]);
317 414
318// Compare bignums, x <= y 415// Compare bignums, x <= y
319// Inputs x[m], y[n]; output function return 416// Inputs x[m], y[n]; output function return
320extern uint64_t bignum_le (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); 417extern uint64_t bignum_le (uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y);
321 418
322// Convert 4-digit (256-bit) bignum to/from little-endian form 419// Convert 4-digit (256-bit) bignum to/from little-endian form
323// Input x[4]; output z[4] 420// Input x[4]; output z[4]
324extern void bignum_littleendian_4 (uint64_t z[static 4], uint64_t x[static 4]); 421extern void bignum_littleendian_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
325 422
326// Convert 6-digit (384-bit) bignum to/from little-endian form 423// Convert 6-digit (384-bit) bignum to/from little-endian form
327// Input x[6]; output z[6] 424// Input x[6]; output z[6]
328extern void bignum_littleendian_6 (uint64_t z[static 6], uint64_t x[static 6]); 425extern void bignum_littleendian_6 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
329 426
330// Compare bignums, x < y 427// Compare bignums, x < y
331// Inputs x[m], y[n]; output function return 428// Inputs x[m], y[n]; output function return
332extern uint64_t bignum_lt (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); 429extern uint64_t bignum_lt (uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y);
333 430
334// Multiply-add, z := z + x * y 431// Multiply-add, z := z + x * y
335// Inputs x[m], y[n]; outputs function return (carry-out) and z[k] 432// Inputs x[m], y[n]; outputs function return (carry-out) and z[k]
336extern uint64_t bignum_madd (uint64_t k, uint64_t *z, uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); 433extern uint64_t bignum_madd (uint64_t k, uint64_t *z, uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y);
434
435// Multiply-add modulo the order of the curve25519/edwards25519 basepoint
436// Inputs x[4], y[4], c[4]; output z[4]
437extern void bignum_madd_n25519 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4], const uint64_t c[S2N_BIGNUM_STATIC 4]);
438extern void bignum_madd_n25519_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4], const uint64_t c[S2N_BIGNUM_STATIC 4]);
439
440// Reduce modulo group order, z := x mod m_25519
441// Input x[4]; output z[4]
442extern void bignum_mod_m25519_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
443
444// Reduce modulo basepoint order, z := x mod n_25519
445// Input x[k]; output z[4]
446extern void bignum_mod_n25519 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t k, const uint64_t *x);
447
448// Reduce modulo basepoint order, z := x mod n_25519
449// Input x[4]; output z[4]
450extern void bignum_mod_n25519_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
337 451
338// Reduce modulo group order, z := x mod n_256 452// Reduce modulo group order, z := x mod n_256
339// Input x[k]; output z[4] 453// Input x[k]; output z[4]
340extern void bignum_mod_n256 (uint64_t z[static 4], uint64_t k, uint64_t *x); 454extern void bignum_mod_n256 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t k, const uint64_t *x);
341extern void bignum_mod_n256_alt (uint64_t z[static 4], uint64_t k, uint64_t *x); 455extern void bignum_mod_n256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t k, const uint64_t *x);
342 456
343// Reduce modulo group order, z := x mod n_256 457// Reduce modulo group order, z := x mod n_256
344// Input x[4]; output z[4] 458// Input x[4]; output z[4]
345extern void bignum_mod_n256_4 (uint64_t z[static 4], uint64_t x[static 4]); 459extern void bignum_mod_n256_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
346 460
347// Reduce modulo group order, z := x mod n_256k1 461// Reduce modulo group order, z := x mod n_256k1
348// Input x[4]; output z[4] 462// Input x[4]; output z[4]
349extern void bignum_mod_n256k1_4 (uint64_t z[static 4], uint64_t x[static 4]); 463extern void bignum_mod_n256k1_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
350 464
351// Reduce modulo group order, z := x mod n_384 465// Reduce modulo group order, z := x mod n_384
352// Input x[k]; output z[6] 466// Input x[k]; output z[6]
353extern void bignum_mod_n384 (uint64_t z[static 6], uint64_t k, uint64_t *x); 467extern void bignum_mod_n384 (uint64_t z[S2N_BIGNUM_STATIC 6], uint64_t k, const uint64_t *x);
354extern void bignum_mod_n384_alt (uint64_t z[static 6], uint64_t k, uint64_t *x); 468extern void bignum_mod_n384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], uint64_t k, const uint64_t *x);
355 469
356// Reduce modulo group order, z := x mod n_384 470// Reduce modulo group order, z := x mod n_384
357// Input x[6]; output z[6] 471// Input x[6]; output z[6]
358extern void bignum_mod_n384_6 (uint64_t z[static 6], uint64_t x[static 6]); 472extern void bignum_mod_n384_6 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
359 473
360// Reduce modulo group order, z := x mod n_521 474// Reduce modulo group order, z := x mod n_521
361// Input x[9]; output z[9] 475// Input x[9]; output z[9]
362extern void bignum_mod_n521_9 (uint64_t z[static 9], uint64_t x[static 9]); 476extern void bignum_mod_n521_9 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
363extern void bignum_mod_n521_9_alt (uint64_t z[static 9], uint64_t x[static 9]); 477extern void bignum_mod_n521_9_alt (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
478
479// Reduce modulo group order, z := x mod n_sm2
480// Input x[k]; output z[4]
481extern void bignum_mod_nsm2 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t k, const uint64_t *x);
482extern void bignum_mod_nsm2_alt (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t k, const uint64_t *x);
483
484// Reduce modulo group order, z := x mod n_sm2
485// Input x[4]; output z[4]
486extern void bignum_mod_nsm2_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
364 487
365// Reduce modulo field characteristic, z := x mod p_25519 488// Reduce modulo field characteristic, z := x mod p_25519
366// Input x[4]; output z[4] 489// Input x[4]; output z[4]
367extern void bignum_mod_p25519_4 (uint64_t z[static 4], uint64_t x[static 4]); 490extern void bignum_mod_p25519_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
368 491
369// Reduce modulo field characteristic, z := x mod p_256 492// Reduce modulo field characteristic, z := x mod p_256
370// Input x[k]; output z[4] 493// Input x[k]; output z[4]
371extern void bignum_mod_p256 (uint64_t z[static 4], uint64_t k, uint64_t *x); 494extern void bignum_mod_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t k, const uint64_t *x);
372extern void bignum_mod_p256_alt (uint64_t z[static 4], uint64_t k, uint64_t *x); 495extern void bignum_mod_p256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t k, const uint64_t *x);
373 496
374// Reduce modulo field characteristic, z := x mod p_256 497// Reduce modulo field characteristic, z := x mod p_256
375// Input x[4]; output z[4] 498// Input x[4]; output z[4]
376extern void bignum_mod_p256_4 (uint64_t z[static 4], uint64_t x[static 4]); 499extern void bignum_mod_p256_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
377 500
378// Reduce modulo field characteristic, z := x mod p_256k1 501// Reduce modulo field characteristic, z := x mod p_256k1
379// Input x[4]; output z[4] 502// Input x[4]; output z[4]
380extern void bignum_mod_p256k1_4 (uint64_t z[static 4], uint64_t x[static 4]); 503extern void bignum_mod_p256k1_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
381 504
382// Reduce modulo field characteristic, z := x mod p_384 505// Reduce modulo field characteristic, z := x mod p_384
383// Input x[k]; output z[6] 506// Input x[k]; output z[6]
384extern void bignum_mod_p384 (uint64_t z[static 6], uint64_t k, uint64_t *x); 507extern void bignum_mod_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], uint64_t k, const uint64_t *x);
385extern void bignum_mod_p384_alt (uint64_t z[static 6], uint64_t k, uint64_t *x); 508extern void bignum_mod_p384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], uint64_t k, const uint64_t *x);
386 509
387// Reduce modulo field characteristic, z := x mod p_384 510// Reduce modulo field characteristic, z := x mod p_384
388// Input x[6]; output z[6] 511// Input x[6]; output z[6]
389extern void bignum_mod_p384_6 (uint64_t z[static 6], uint64_t x[static 6]); 512extern void bignum_mod_p384_6 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
390 513
391// Reduce modulo field characteristic, z := x mod p_521 514// Reduce modulo field characteristic, z := x mod p_521
392// Input x[9]; output z[9] 515// Input x[9]; output z[9]
393extern void bignum_mod_p521_9 (uint64_t z[static 9], uint64_t x[static 9]); 516extern void bignum_mod_p521_9 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
517
518// Reduce modulo field characteristic, z := x mod p_sm2
519// Input x[k]; output z[4]
520extern void bignum_mod_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t k, const uint64_t *x);
521
522// Reduce modulo field characteristic, z := x mod p_sm2
523// Input x[4]; output z[4]
524extern void bignum_mod_sm2_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
394 525
395// Add modulo m, z := (x + y) mod m, assuming x and y reduced 526// Add modulo m, z := (x + y) mod m, assuming x and y reduced
396// Inputs x[k], y[k], m[k]; output z[k] 527// Inputs x[k], y[k], m[k]; output z[k]
397extern void bignum_modadd (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y, uint64_t *m); 528extern void bignum_modadd (uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *y, const uint64_t *m);
398 529
399// Double modulo m, z := (2 * x) mod m, assuming x reduced 530// Double modulo m, z := (2 * x) mod m, assuming x reduced
400// Inputs x[k], m[k]; output z[k] 531// Inputs x[k], m[k]; output z[k]
401extern void bignum_moddouble (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *m); 532extern void bignum_moddouble (uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *m);
533
534// Modular exponentiation for arbitrary odd modulus, z := (a^p) mod m
535// Inputs a[k], p[k], m[k]; output z[k], temporary buffer t[>=3*k]
536extern void bignum_modexp(uint64_t k,uint64_t *z, const uint64_t *a,const uint64_t *p,const uint64_t *m,uint64_t *t);
402 537
403// Compute "modification" constant z := 2^{64k} mod m 538// Compute "modification" constant z := 2^{64k} mod m
404// Input m[k]; output z[k]; temporary buffer t[>=k] 539// Input m[k]; output z[k]; temporary buffer t[>=k]
405extern void bignum_modifier (uint64_t k, uint64_t *z, uint64_t *m, uint64_t *t); 540extern void bignum_modifier (uint64_t k, uint64_t *z, const uint64_t *m, uint64_t *t);
406 541
407// Invert modulo m, z = (1/a) mod b, assuming b is an odd number > 1, a coprime to b 542// Invert modulo m, z = (1/a) mod b, assuming b is an odd number > 1, a coprime to b
408// Inputs a[k], b[k]; output z[k]; temporary buffer t[>=3*k] 543// Inputs a[k], b[k]; output z[k]; temporary buffer t[>=3*k]
409extern void bignum_modinv (uint64_t k, uint64_t *z, uint64_t *a, uint64_t *b, uint64_t *t); 544extern void bignum_modinv (uint64_t k, uint64_t *z, const uint64_t *a, const uint64_t *b, uint64_t *t);
410 545
411// Optionally negate modulo m, z := (-x) mod m (if p nonzero) or z := x (if p zero), assuming x reduced 546// Optionally negate modulo m, z := (-x) mod m (if p nonzero) or z := x (if p zero), assuming x reduced
412// Inputs p, x[k], m[k]; output z[k] 547// Inputs p, x[k], m[k]; output z[k]
413extern void bignum_modoptneg (uint64_t k, uint64_t *z, uint64_t p, uint64_t *x, uint64_t *m); 548extern void bignum_modoptneg (uint64_t k, uint64_t *z, uint64_t p, const uint64_t *x, const uint64_t *m);
414 549
415// Subtract modulo m, z := (x - y) mod m, assuming x and y reduced 550// Subtract modulo m, z := (x - y) mod m, assuming x and y reduced
416// Inputs x[k], y[k], m[k]; output z[k] 551// Inputs x[k], y[k], m[k]; output z[k]
417extern void bignum_modsub (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y, uint64_t *m); 552extern void bignum_modsub (uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *y, const uint64_t *m);
418 553
419// Compute "montification" constant z := 2^{128k} mod m 554// Compute "montification" constant z := 2^{128k} mod m
420// Input m[k]; output z[k]; temporary buffer t[>=k] 555// Input m[k]; output z[k]; temporary buffer t[>=k]
421extern void bignum_montifier (uint64_t k, uint64_t *z, uint64_t *m, uint64_t *t); 556extern void bignum_montifier (uint64_t k, uint64_t *z, const uint64_t *m, uint64_t *t);
557
558// Montgomery inverse modulo p_256 = 2^256 - 2^224 + 2^192 + 2^96 - 1
559// Input x[4]; output z[4]
560extern void bignum_montinv_p256(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]);
561
562// Montgomery inverse modulo p_384 = 2^384 - 2^128 - 2^96 + 2^32 - 1
563// Input x[6]; output z[6]
564extern void bignum_montinv_p384(uint64_t z[S2N_BIGNUM_STATIC 6],const uint64_t x[S2N_BIGNUM_STATIC 6]);
565
566// Montgomery inverse modulo p_sm2 = 2^256 - 2^224 - 2^96 + 2^64 - 1
567// Input x[4]; output z[4]
568extern void bignum_montinv_sm2(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]);
422 569
423// Montgomery multiply, z := (x * y / 2^{64k}) mod m 570// Montgomery multiply, z := (x * y / 2^{64k}) mod m
424// Inputs x[k], y[k], m[k]; output z[k] 571// Inputs x[k], y[k], m[k]; output z[k]
425extern void bignum_montmul (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y, uint64_t *m); 572extern void bignum_montmul (uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *y, const uint64_t *m);
426 573
427// Montgomery multiply, z := (x * y / 2^256) mod p_256 574// Montgomery multiply, z := (x * y / 2^256) mod p_256
428// Inputs x[4], y[4]; output z[4] 575// Inputs x[4], y[4]; output z[4]
429extern void bignum_montmul_p256 (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); 576extern void bignum_montmul_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
430extern void bignum_montmul_p256_alt (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); 577extern void bignum_montmul_p256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
431 578
432// Montgomery multiply, z := (x * y / 2^256) mod p_256k1 579// Montgomery multiply, z := (x * y / 2^256) mod p_256k1
433// Inputs x[4], y[4]; output z[4] 580// Inputs x[4], y[4]; output z[4]
434extern void bignum_montmul_p256k1 (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); 581extern void bignum_montmul_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
435extern void bignum_montmul_p256k1_alt (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); 582extern void bignum_montmul_p256k1_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
436 583
437// Montgomery multiply, z := (x * y / 2^384) mod p_384 584// Montgomery multiply, z := (x * y / 2^384) mod p_384
438// Inputs x[6], y[6]; output z[6] 585// Inputs x[6], y[6]; output z[6]
439extern void bignum_montmul_p384 (uint64_t z[static 6], uint64_t x[static 6], uint64_t y[static 6]); 586extern void bignum_montmul_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]);
440extern void bignum_montmul_p384_alt (uint64_t z[static 6], uint64_t x[static 6], uint64_t y[static 6]); 587extern void bignum_montmul_p384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]);
441 588
442// Montgomery multiply, z := (x * y / 2^576) mod p_521 589// Montgomery multiply, z := (x * y / 2^576) mod p_521
443// Inputs x[9], y[9]; output z[9] 590// Inputs x[9], y[9]; output z[9]
444extern void bignum_montmul_p521 (uint64_t z[static 9], uint64_t x[static 9], uint64_t y[static 9]); 591extern void bignum_montmul_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9], const uint64_t y[S2N_BIGNUM_STATIC 9]);
445extern void bignum_montmul_p521_alt (uint64_t z[static 9], uint64_t x[static 9], uint64_t y[static 9]); 592extern void bignum_montmul_p521_alt (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9], const uint64_t y[S2N_BIGNUM_STATIC 9]);
593
594// Montgomery multiply, z := (x * y / 2^256) mod p_sm2
595// Inputs x[4], y[4]; output z[4]
596extern void bignum_montmul_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
597extern void bignum_montmul_sm2_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
446 598
447// Montgomery reduce, z := (x' / 2^{64p}) MOD m 599// Montgomery reduce, z := (x' / 2^{64p}) MOD m
448// Inputs x[n], m[k], p; output z[k] 600// Inputs x[n], m[k], p; output z[k]
449extern void bignum_montredc (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x, uint64_t *m, uint64_t p); 601extern void bignum_montredc (uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x, const uint64_t *m, uint64_t p);
450 602
451// Montgomery square, z := (x^2 / 2^{64k}) mod m 603// Montgomery square, z := (x^2 / 2^{64k}) mod m
452// Inputs x[k], m[k]; output z[k] 604// Inputs x[k], m[k]; output z[k]
453extern void bignum_montsqr (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *m); 605extern void bignum_montsqr (uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *m);
454 606
455// Montgomery square, z := (x^2 / 2^256) mod p_256 607// Montgomery square, z := (x^2 / 2^256) mod p_256
456// Input x[4]; output z[4] 608// Input x[4]; output z[4]
457extern void bignum_montsqr_p256 (uint64_t z[static 4], uint64_t x[static 4]); 609extern void bignum_montsqr_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
458extern void bignum_montsqr_p256_alt (uint64_t z[static 4], uint64_t x[static 4]); 610extern void bignum_montsqr_p256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
459 611
460// Montgomery square, z := (x^2 / 2^256) mod p_256k1 612// Montgomery square, z := (x^2 / 2^256) mod p_256k1
461// Input x[4]; output z[4] 613// Input x[4]; output z[4]
462extern void bignum_montsqr_p256k1 (uint64_t z[static 4], uint64_t x[static 4]); 614extern void bignum_montsqr_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
463extern void bignum_montsqr_p256k1_alt (uint64_t z[static 4], uint64_t x[static 4]); 615extern void bignum_montsqr_p256k1_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
464 616
465// Montgomery square, z := (x^2 / 2^384) mod p_384 617// Montgomery square, z := (x^2 / 2^384) mod p_384
466// Input x[6]; output z[6] 618// Input x[6]; output z[6]
467extern void bignum_montsqr_p384 (uint64_t z[static 6], uint64_t x[static 6]); 619extern void bignum_montsqr_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
468extern void bignum_montsqr_p384_alt (uint64_t z[static 6], uint64_t x[static 6]); 620extern void bignum_montsqr_p384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
469 621
470// Montgomery square, z := (x^2 / 2^576) mod p_521 622// Montgomery square, z := (x^2 / 2^576) mod p_521
471// Input x[9]; output z[9] 623// Input x[9]; output z[9]
472extern void bignum_montsqr_p521 (uint64_t z[static 9], uint64_t x[static 9]); 624extern void bignum_montsqr_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
473extern void bignum_montsqr_p521_alt (uint64_t z[static 9], uint64_t x[static 9]); 625extern void bignum_montsqr_p521_alt (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
626
627// Montgomery square, z := (x^2 / 2^256) mod p_sm2
628// Input x[4]; output z[4]
629extern void bignum_montsqr_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
630extern void bignum_montsqr_sm2_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
474 631
475// Multiply z := x * y 632// Multiply z := x * y
476// Inputs x[m], y[n]; output z[k] 633// Inputs x[m], y[n]; output z[k]
477extern void bignum_mul (uint64_t k, uint64_t *z, uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); 634extern void bignum_mul (uint64_t k, uint64_t *z, uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y);
478 635
479// Multiply z := x * y 636// Multiply z := x * y
480// Inputs x[4], y[4]; output z[8] 637// Inputs x[4], y[4]; output z[8]
481extern void bignum_mul_4_8 (uint64_t z[static 8], uint64_t x[static 4], uint64_t y[static 4]); 638extern void bignum_mul_4_8 (uint64_t z[S2N_BIGNUM_STATIC 8], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
482extern void bignum_mul_4_8_alt (uint64_t z[static 8], uint64_t x[static 4], uint64_t y[static 4]); 639extern void bignum_mul_4_8_alt (uint64_t z[S2N_BIGNUM_STATIC 8], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
483 640
484// Multiply z := x * y 641// Multiply z := x * y
485// Inputs x[6], y[6]; output z[12] 642// Inputs x[6], y[6]; output z[12]
486extern void bignum_mul_6_12 (uint64_t z[static 12], uint64_t x[static 6], uint64_t y[static 6]); 643extern void bignum_mul_6_12 (uint64_t z[S2N_BIGNUM_STATIC 12], const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]);
487extern void bignum_mul_6_12_alt (uint64_t z[static 12], uint64_t x[static 6], uint64_t y[static 6]); 644extern void bignum_mul_6_12_alt (uint64_t z[S2N_BIGNUM_STATIC 12], const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]);
488 645
489// Multiply z := x * y 646// Multiply z := x * y
490// Inputs x[8], y[8]; output z[16] 647// Inputs x[8], y[8]; output z[16]
491extern void bignum_mul_8_16 (uint64_t z[static 16], uint64_t x[static 8], uint64_t y[static 8]); 648extern void bignum_mul_8_16 (uint64_t z[S2N_BIGNUM_STATIC 16], const uint64_t x[S2N_BIGNUM_STATIC 8], const uint64_t y[S2N_BIGNUM_STATIC 8]);
492extern void bignum_mul_8_16_alt (uint64_t z[static 16], uint64_t x[static 8], uint64_t y[static 8]); 649extern void bignum_mul_8_16_alt (uint64_t z[S2N_BIGNUM_STATIC 16], const uint64_t x[S2N_BIGNUM_STATIC 8], const uint64_t y[S2N_BIGNUM_STATIC 8]);
493 650
494// Multiply modulo p_25519, z := (x * y) mod p_25519 651// Multiply modulo p_25519, z := (x * y) mod p_25519
495// Inputs x[4], y[4]; output z[4] 652// Inputs x[4], y[4]; output z[4]
496extern void bignum_mul_p25519 (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); 653extern void bignum_mul_p25519 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
497extern void bignum_mul_p25519_alt (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); 654extern void bignum_mul_p25519_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
498 655
499// Multiply modulo p_256k1, z := (x * y) mod p_256k1 656// Multiply modulo p_256k1, z := (x * y) mod p_256k1
500// Inputs x[4], y[4]; output z[4] 657// Inputs x[4], y[4]; output z[4]
501extern void bignum_mul_p256k1 (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); 658extern void bignum_mul_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
502extern void bignum_mul_p256k1_alt (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); 659extern void bignum_mul_p256k1_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
503 660
504// Multiply modulo p_521, z := (x * y) mod p_521, assuming x and y reduced 661// Multiply modulo p_521, z := (x * y) mod p_521, assuming x and y reduced
505// Inputs x[9], y[9]; output z[9] 662// Inputs x[9], y[9]; output z[9]
506extern void bignum_mul_p521 (uint64_t z[static 9], uint64_t x[static 9], uint64_t y[static 9]); 663extern void bignum_mul_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9], const uint64_t y[S2N_BIGNUM_STATIC 9]);
507extern void bignum_mul_p521_alt (uint64_t z[static 9], uint64_t x[static 9], uint64_t y[static 9]); 664extern void bignum_mul_p521_alt (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9], const uint64_t y[S2N_BIGNUM_STATIC 9]);
508 665
509// Multiply bignum by 10 and add word: z := 10 * z + d 666// Multiply bignum by 10 and add word: z := 10 * z + d
510// Inputs z[k], d; outputs function return (carry) and z[k] 667// Inputs z[k], d; outputs function return (carry) and z[k]
@@ -512,55 +669,59 @@ extern uint64_t bignum_muladd10 (uint64_t k, uint64_t *z, uint64_t d);
512 669
513// Multiplex/select z := x (if p nonzero) or z := y (if p zero) 670// Multiplex/select z := x (if p nonzero) or z := y (if p zero)
514// Inputs p, x[k], y[k]; output z[k] 671// Inputs p, x[k], y[k]; output z[k]
515extern void bignum_mux (uint64_t p, uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y); 672extern void bignum_mux (uint64_t p, uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *y);
516 673
517// 256-bit multiplex/select z := x (if p nonzero) or z := y (if p zero) 674// 256-bit multiplex/select z := x (if p nonzero) or z := y (if p zero)
518// Inputs p, x[4], y[4]; output z[4] 675// Inputs p, x[4], y[4]; output z[4]
519extern void bignum_mux_4 (uint64_t p, uint64_t z[static 4],uint64_t x[static 4], uint64_t y[static 4]); 676extern void bignum_mux_4 (uint64_t p, uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
520 677
521// 384-bit multiplex/select z := x (if p nonzero) or z := y (if p zero) 678// 384-bit multiplex/select z := x (if p nonzero) or z := y (if p zero)
522// Inputs p, x[6], y[6]; output z[6] 679// Inputs p, x[6], y[6]; output z[6]
523extern void bignum_mux_6 (uint64_t p, uint64_t z[static 6],uint64_t x[static 6], uint64_t y[static 6]); 680extern void bignum_mux_6 (uint64_t p, uint64_t z[S2N_BIGNUM_STATIC 6],const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]);
524 681
525// Select element from 16-element table, z := xs[k*i] 682// Select element from 16-element table, z := xs[k*i]
526// Inputs xs[16*k], i; output z[k] 683// Inputs xs[16*k], i; output z[k]
527extern void bignum_mux16 (uint64_t k, uint64_t *z, uint64_t *xs, uint64_t i); 684extern void bignum_mux16 (uint64_t k, uint64_t *z, const uint64_t *xs, uint64_t i);
528 685
529// Negate modulo p_25519, z := (-x) mod p_25519, assuming x reduced 686// Negate modulo p_25519, z := (-x) mod p_25519, assuming x reduced
530// Input x[4]; output z[4] 687// Input x[4]; output z[4]
531extern void bignum_neg_p25519 (uint64_t z[static 4], uint64_t x[static 4]); 688extern void bignum_neg_p25519 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
532 689
533// Negate modulo p_256, z := (-x) mod p_256, assuming x reduced 690// Negate modulo p_256, z := (-x) mod p_256, assuming x reduced
534// Input x[4]; output z[4] 691// Input x[4]; output z[4]
535extern void bignum_neg_p256 (uint64_t z[static 4], uint64_t x[static 4]); 692extern void bignum_neg_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
536 693
537// Negate modulo p_256k1, z := (-x) mod p_256k1, assuming x reduced 694// Negate modulo p_256k1, z := (-x) mod p_256k1, assuming x reduced
538// Input x[4]; output z[4] 695// Input x[4]; output z[4]
539extern void bignum_neg_p256k1 (uint64_t z[static 4], uint64_t x[static 4]); 696extern void bignum_neg_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
540 697
541// Negate modulo p_384, z := (-x) mod p_384, assuming x reduced 698// Negate modulo p_384, z := (-x) mod p_384, assuming x reduced
542// Input x[6]; output z[6] 699// Input x[6]; output z[6]
543extern void bignum_neg_p384 (uint64_t z[static 6], uint64_t x[static 6]); 700extern void bignum_neg_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
544 701
545// Negate modulo p_521, z := (-x) mod p_521, assuming x reduced 702// Negate modulo p_521, z := (-x) mod p_521, assuming x reduced
546// Input x[9]; output z[9] 703// Input x[9]; output z[9]
547extern void bignum_neg_p521 (uint64_t z[static 9], uint64_t x[static 9]); 704extern void bignum_neg_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
705
706// Negate modulo p_sm2, z := (-x) mod p_sm2, assuming x reduced
707// Input x[4]; output z[4]
708extern void bignum_neg_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
548 709
549// Negated modular inverse, z := (-1/x) mod 2^{64k} 710// Negated modular inverse, z := (-1/x) mod 2^{64k}
550// Input x[k]; output z[k] 711// Input x[k]; output z[k]
551extern void bignum_negmodinv (uint64_t k, uint64_t *z, uint64_t *x); 712extern void bignum_negmodinv (uint64_t k, uint64_t *z, const uint64_t *x);
552 713
553// Test bignum for nonzero-ness x =/= 0 714// Test bignum for nonzero-ness x =/= 0
554// Input x[k]; output function return 715// Input x[k]; output function return
555extern uint64_t bignum_nonzero (uint64_t k, uint64_t *x); 716extern uint64_t bignum_nonzero (uint64_t k, const uint64_t *x);
556 717
557// Test 256-bit bignum for nonzero-ness x =/= 0 718// Test 256-bit bignum for nonzero-ness x =/= 0
558// Input x[4]; output function return 719// Input x[4]; output function return
559extern uint64_t bignum_nonzero_4(uint64_t x[static 4]); 720extern uint64_t bignum_nonzero_4(const uint64_t x[S2N_BIGNUM_STATIC 4]);
560 721
561// Test 384-bit bignum for nonzero-ness x =/= 0 722// Test 384-bit bignum for nonzero-ness x =/= 0
562// Input x[6]; output function return 723// Input x[6]; output function return
563extern uint64_t bignum_nonzero_6(uint64_t x[static 6]); 724extern uint64_t bignum_nonzero_6(const uint64_t x[S2N_BIGNUM_STATIC 6]);
564 725
565// Normalize bignum in-place by shifting left till top bit is 1 726// Normalize bignum in-place by shifting left till top bit is 1
566// Input z[k]; outputs function return (bits shifted left) and z[k] 727// Input z[k]; outputs function return (bits shifted left) and z[k]
@@ -568,7 +729,7 @@ extern uint64_t bignum_normalize (uint64_t k, uint64_t *z);
568 729
569// Test bignum for odd-ness 730// Test bignum for odd-ness
570// Input x[k]; output function return 731// Input x[k]; output function return
571extern uint64_t bignum_odd (uint64_t k, uint64_t *x); 732extern uint64_t bignum_odd (uint64_t k, const uint64_t *x);
572 733
573// Convert single digit to bignum, z := n 734// Convert single digit to bignum, z := n
574// Input n; output z[k] 735// Input n; output z[k]
@@ -576,39 +737,43 @@ extern void bignum_of_word (uint64_t k, uint64_t *z, uint64_t n);
576 737
577// Optionally add, z := x + y (if p nonzero) or z := x (if p zero) 738// Optionally add, z := x + y (if p nonzero) or z := x (if p zero)
578// Inputs x[k], p, y[k]; outputs function return (carry-out) and z[k] 739// Inputs x[k], p, y[k]; outputs function return (carry-out) and z[k]
579extern uint64_t bignum_optadd (uint64_t k, uint64_t *z, uint64_t *x, uint64_t p, uint64_t *y); 740extern uint64_t bignum_optadd (uint64_t k, uint64_t *z, const uint64_t *x, uint64_t p, const uint64_t *y);
580 741
581// Optionally negate, z := -x (if p nonzero) or z := x (if p zero) 742// Optionally negate, z := -x (if p nonzero) or z := x (if p zero)
582// Inputs p, x[k]; outputs function return (nonzero input) and z[k] 743// Inputs p, x[k]; outputs function return (nonzero input) and z[k]
583extern uint64_t bignum_optneg (uint64_t k, uint64_t *z, uint64_t p, uint64_t *x); 744extern uint64_t bignum_optneg (uint64_t k, uint64_t *z, uint64_t p, const uint64_t *x);
584 745
585// Optionally negate modulo p_25519, z := (-x) mod p_25519 (if p nonzero) or z := x (if p zero), assuming x reduced 746// Optionally negate modulo p_25519, z := (-x) mod p_25519 (if p nonzero) or z := x (if p zero), assuming x reduced
586// Inputs p, x[4]; output z[4] 747// Inputs p, x[4]; output z[4]
587extern void bignum_optneg_p25519 (uint64_t z[static 4], uint64_t p, uint64_t x[static 4]); 748extern void bignum_optneg_p25519 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t p, const uint64_t x[S2N_BIGNUM_STATIC 4]);
588 749
589// Optionally negate modulo p_256, z := (-x) mod p_256 (if p nonzero) or z := x (if p zero), assuming x reduced 750// Optionally negate modulo p_256, z := (-x) mod p_256 (if p nonzero) or z := x (if p zero), assuming x reduced
590// Inputs p, x[4]; output z[4] 751// Inputs p, x[4]; output z[4]
591extern void bignum_optneg_p256 (uint64_t z[static 4], uint64_t p, uint64_t x[static 4]); 752extern void bignum_optneg_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t p, const uint64_t x[S2N_BIGNUM_STATIC 4]);
592 753
593// Optionally negate modulo p_256k1, z := (-x) mod p_256k1 (if p nonzero) or z := x (if p zero), assuming x reduced 754// Optionally negate modulo p_256k1, z := (-x) mod p_256k1 (if p nonzero) or z := x (if p zero), assuming x reduced
594// Inputs p, x[4]; output z[4] 755// Inputs p, x[4]; output z[4]
595extern void bignum_optneg_p256k1 (uint64_t z[static 4], uint64_t p, uint64_t x[static 4]); 756extern void bignum_optneg_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t p, const uint64_t x[S2N_BIGNUM_STATIC 4]);
596 757
597// Optionally negate modulo p_384, z := (-x) mod p_384 (if p nonzero) or z := x (if p zero), assuming x reduced 758// Optionally negate modulo p_384, z := (-x) mod p_384 (if p nonzero) or z := x (if p zero), assuming x reduced
598// Inputs p, x[6]; output z[6] 759// Inputs p, x[6]; output z[6]
599extern void bignum_optneg_p384 (uint64_t z[static 6], uint64_t p, uint64_t x[static 6]); 760extern void bignum_optneg_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], uint64_t p, const uint64_t x[S2N_BIGNUM_STATIC 6]);
600 761
601// Optionally negate modulo p_521, z := (-x) mod p_521 (if p nonzero) or z := x (if p zero), assuming x reduced 762// Optionally negate modulo p_521, z := (-x) mod p_521 (if p nonzero) or z := x (if p zero), assuming x reduced
602// Inputs p, x[9]; output z[9] 763// Inputs p, x[9]; output z[9]
603extern void bignum_optneg_p521 (uint64_t z[static 9], uint64_t p, uint64_t x[static 9]); 764extern void bignum_optneg_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], uint64_t p, const uint64_t x[S2N_BIGNUM_STATIC 9]);
765
766// Optionally negate modulo p_sm2, z := (-x) mod p_sm2 (if p nonzero) or z := x (if p zero), assuming x reduced
767// Inputs p, x[4]; output z[4]
768extern void bignum_optneg_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t p, const uint64_t x[S2N_BIGNUM_STATIC 4]);
604 769
605// Optionally subtract, z := x - y (if p nonzero) or z := x (if p zero) 770// Optionally subtract, z := x - y (if p nonzero) or z := x (if p zero)
606// Inputs x[k], p, y[k]; outputs function return (carry-out) and z[k] 771// Inputs x[k], p, y[k]; outputs function return (carry-out) and z[k]
607extern uint64_t bignum_optsub (uint64_t k, uint64_t *z, uint64_t *x, uint64_t p, uint64_t *y); 772extern uint64_t bignum_optsub (uint64_t k, uint64_t *z, const uint64_t *x, uint64_t p, const uint64_t *y);
608 773
609// Optionally subtract or add, z := x + sgn(p) * y interpreting p as signed 774// Optionally subtract or add, z := x + sgn(p) * y interpreting p as signed
610// Inputs x[k], p, y[k]; outputs function return (carry-out) and z[k] 775// Inputs x[k], p, y[k]; outputs function return (carry-out) and z[k]
611extern uint64_t bignum_optsubadd (uint64_t k, uint64_t *z, uint64_t *x, uint64_t p, uint64_t *y); 776extern uint64_t bignum_optsubadd (uint64_t k, uint64_t *z, const uint64_t *x, uint64_t p, const uint64_t *y);
612 777
613// Return bignum of power of 2, z := 2^n 778// Return bignum of power of 2, z := 2^n
614// Input n; output z[k] 779// Input n; output z[k]
@@ -616,216 +781,376 @@ extern void bignum_pow2 (uint64_t k, uint64_t *z, uint64_t n);
616 781
617// Shift bignum left by c < 64 bits z := x * 2^c 782// Shift bignum left by c < 64 bits z := x * 2^c
618// Inputs x[n], c; outputs function return (carry-out) and z[k] 783// Inputs x[n], c; outputs function return (carry-out) and z[k]
619extern uint64_t bignum_shl_small (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x, uint64_t c); 784extern uint64_t bignum_shl_small (uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x, uint64_t c);
620 785
621// Shift bignum right by c < 64 bits z := floor(x / 2^c) 786// Shift bignum right by c < 64 bits z := floor(x / 2^c)
622// Inputs x[n], c; outputs function return (bits shifted out) and z[k] 787// Inputs x[n], c; outputs function return (bits shifted out) and z[k]
623extern uint64_t bignum_shr_small (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x, uint64_t c); 788extern uint64_t bignum_shr_small (uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x, uint64_t c);
624 789
625// Square, z := x^2 790// Square, z := x^2
626// Input x[n]; output z[k] 791// Input x[n]; output z[k]
627extern void bignum_sqr (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x); 792extern void bignum_sqr (uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x);
628 793
629// Square, z := x^2 794// Square, z := x^2
630// Input x[4]; output z[8] 795// Input x[4]; output z[8]
631extern void bignum_sqr_4_8 (uint64_t z[static 8], uint64_t x[static 4]); 796extern void bignum_sqr_4_8 (uint64_t z[S2N_BIGNUM_STATIC 8], const uint64_t x[S2N_BIGNUM_STATIC 4]);
632extern void bignum_sqr_4_8_alt (uint64_t z[static 8], uint64_t x[static 4]); 797extern void bignum_sqr_4_8_alt (uint64_t z[S2N_BIGNUM_STATIC 8], const uint64_t x[S2N_BIGNUM_STATIC 4]);
633 798
634// Square, z := x^2 799// Square, z := x^2
635// Input x[6]; output z[12] 800// Input x[6]; output z[12]
636extern void bignum_sqr_6_12 (uint64_t z[static 12], uint64_t x[static 6]); 801extern void bignum_sqr_6_12 (uint64_t z[S2N_BIGNUM_STATIC 12], const uint64_t x[S2N_BIGNUM_STATIC 6]);
637extern void bignum_sqr_6_12_alt (uint64_t z[static 12], uint64_t x[static 6]); 802extern void bignum_sqr_6_12_alt (uint64_t z[S2N_BIGNUM_STATIC 12], const uint64_t x[S2N_BIGNUM_STATIC 6]);
638 803
639// Square, z := x^2 804// Square, z := x^2
640// Input x[8]; output z[16] 805// Input x[8]; output z[16]
641extern void bignum_sqr_8_16 (uint64_t z[static 16], uint64_t x[static 8]); 806extern void bignum_sqr_8_16 (uint64_t z[S2N_BIGNUM_STATIC 16], const uint64_t x[S2N_BIGNUM_STATIC 8]);
642extern void bignum_sqr_8_16_alt (uint64_t z[static 16], uint64_t x[static 8]); 807extern void bignum_sqr_8_16_alt (uint64_t z[S2N_BIGNUM_STATIC 16], const uint64_t x[S2N_BIGNUM_STATIC 8]);
643 808
644// Square modulo p_25519, z := (x^2) mod p_25519 809// Square modulo p_25519, z := (x^2) mod p_25519
645// Input x[4]; output z[4] 810// Input x[4]; output z[4]
646extern void bignum_sqr_p25519 (uint64_t z[static 4], uint64_t x[static 4]); 811extern void bignum_sqr_p25519 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
647extern void bignum_sqr_p25519_alt (uint64_t z[static 4], uint64_t x[static 4]); 812extern void bignum_sqr_p25519_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
648 813
649// Square modulo p_256k1, z := (x^2) mod p_256k1 814// Square modulo p_256k1, z := (x^2) mod p_256k1
650// Input x[4]; output z[4] 815// Input x[4]; output z[4]
651extern void bignum_sqr_p256k1 (uint64_t z[static 4], uint64_t x[static 4]); 816extern void bignum_sqr_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
652extern void bignum_sqr_p256k1_alt (uint64_t z[static 4], uint64_t x[static 4]); 817extern void bignum_sqr_p256k1_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
653 818
654// Square modulo p_521, z := (x^2) mod p_521, assuming x reduced 819// Square modulo p_521, z := (x^2) mod p_521, assuming x reduced
655// Input x[9]; output z[9] 820// Input x[9]; output z[9]
656extern void bignum_sqr_p521 (uint64_t z[static 9], uint64_t x[static 9]); 821extern void bignum_sqr_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
657extern void bignum_sqr_p521_alt (uint64_t z[static 9], uint64_t x[static 9]); 822extern void bignum_sqr_p521_alt (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
823
824// Square root modulo p_25519
825// Input x[4]; output function return (Legendre symbol) and z[4]
826extern int64_t bignum_sqrt_p25519(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]);
827extern int64_t bignum_sqrt_p25519_alt(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]);
658 828
659// Subtract, z := x - y 829// Subtract, z := x - y
660// Inputs x[m], y[n]; outputs function return (carry-out) and z[p] 830// Inputs x[m], y[n]; outputs function return (carry-out) and z[p]
661extern uint64_t bignum_sub (uint64_t p, uint64_t *z, uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); 831extern uint64_t bignum_sub (uint64_t p, uint64_t *z, uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y);
662 832
663// Subtract modulo p_25519, z := (x - y) mod p_25519, assuming x and y reduced 833// Subtract modulo p_25519, z := (x - y) mod p_25519, assuming x and y reduced
664// Inputs x[4], y[4]; output z[4] 834// Inputs x[4], y[4]; output z[4]
665extern void bignum_sub_p25519 (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); 835extern void bignum_sub_p25519 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
666 836
667// Subtract modulo p_256, z := (x - y) mod p_256, assuming x and y reduced 837// Subtract modulo p_256, z := (x - y) mod p_256, assuming x and y reduced
668// Inputs x[4], y[4]; output z[4] 838// Inputs x[4], y[4]; output z[4]
669extern void bignum_sub_p256 (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); 839extern void bignum_sub_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
670 840
671// Subtract modulo p_256k1, z := (x - y) mod p_256k1, assuming x and y reduced 841// Subtract modulo p_256k1, z := (x - y) mod p_256k1, assuming x and y reduced
672// Inputs x[4], y[4]; output z[4] 842// Inputs x[4], y[4]; output z[4]
673extern void bignum_sub_p256k1 (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); 843extern void bignum_sub_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
674 844
675// Subtract modulo p_384, z := (x - y) mod p_384, assuming x and y reduced 845// Subtract modulo p_384, z := (x - y) mod p_384, assuming x and y reduced
676// Inputs x[6], y[6]; output z[6] 846// Inputs x[6], y[6]; output z[6]
677extern void bignum_sub_p384 (uint64_t z[static 6], uint64_t x[static 6], uint64_t y[static 6]); 847extern void bignum_sub_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]);
678 848
679// Subtract modulo p_521, z := (x - y) mod p_521, assuming x and y reduced 849// Subtract modulo p_521, z := (x - y) mod p_521, assuming x and y reduced
680// Inputs x[9], y[9]; output z[9] 850// Inputs x[9], y[9]; output z[9]
681extern void bignum_sub_p521 (uint64_t z[static 9], uint64_t x[static 9], uint64_t y[static 9]); 851extern void bignum_sub_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9], const uint64_t y[S2N_BIGNUM_STATIC 9]);
852
853// Subtract modulo p_sm2, z := (x - y) mod p_sm2, assuming x and y reduced
854// Inputs x[4], y[4]; output z[4]
855extern void bignum_sub_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
682 856
683// Convert 4-digit (256-bit) bignum to big-endian bytes 857// Convert 4-digit (256-bit) bignum to big-endian bytes
684// Input x[4]; output z[32] (bytes) 858// Input x[4]; output z[32] (bytes)
685extern void bignum_tobebytes_4 (uint8_t z[static 32], uint64_t x[static 4]); 859extern void bignum_tobebytes_4 (uint8_t z[S2N_BIGNUM_STATIC 32], const uint64_t x[S2N_BIGNUM_STATIC 4]);
686 860
687// Convert 6-digit (384-bit) bignum to big-endian bytes 861// Convert 6-digit (384-bit) bignum to big-endian bytes
688// Input x[6]; output z[48] (bytes) 862// Input x[6]; output z[48] (bytes)
689extern void bignum_tobebytes_6 (uint8_t z[static 48], uint64_t x[static 6]); 863extern void bignum_tobebytes_6 (uint8_t z[S2N_BIGNUM_STATIC 48], const uint64_t x[S2N_BIGNUM_STATIC 6]);
690 864
691// Convert 4-digit (256-bit) bignum to little-endian bytes 865// Convert 4-digit (256-bit) bignum to little-endian bytes
692// Input x[4]; output z[32] (bytes) 866// Input x[4]; output z[32] (bytes)
693extern void bignum_tolebytes_4 (uint8_t z[static 32], uint64_t x[static 4]); 867extern void bignum_tolebytes_4 (uint8_t z[S2N_BIGNUM_STATIC 32], const uint64_t x[S2N_BIGNUM_STATIC 4]);
694 868
695// Convert 6-digit (384-bit) bignum to little-endian bytes 869// Convert 6-digit (384-bit) bignum to little-endian bytes
696// Input x[6]; output z[48] (bytes) 870// Input x[6]; output z[48] (bytes)
697extern void bignum_tolebytes_6 (uint8_t z[static 48], uint64_t x[static 6]); 871extern void bignum_tolebytes_6 (uint8_t z[S2N_BIGNUM_STATIC 48], const uint64_t x[S2N_BIGNUM_STATIC 6]);
698 872
699// Convert 9-digit 528-bit bignum to little-endian bytes 873// Convert 9-digit 528-bit bignum to little-endian bytes
700// Input x[9]; output z[66] (bytes) 874// Input x[9]; output z[66] (bytes)
701extern void bignum_tolebytes_p521 (uint8_t z[static 66], uint64_t x[static 9]); 875extern void bignum_tolebytes_p521 (uint8_t z[S2N_BIGNUM_STATIC 66], const uint64_t x[S2N_BIGNUM_STATIC 9]);
702 876
703// Convert to Montgomery form z := (2^256 * x) mod p_256 877// Convert to Montgomery form z := (2^256 * x) mod p_256
704// Input x[4]; output z[4] 878// Input x[4]; output z[4]
705extern void bignum_tomont_p256 (uint64_t z[static 4], uint64_t x[static 4]); 879extern void bignum_tomont_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
706extern void bignum_tomont_p256_alt (uint64_t z[static 4], uint64_t x[static 4]); 880extern void bignum_tomont_p256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
707 881
708// Convert to Montgomery form z := (2^256 * x) mod p_256k1 882// Convert to Montgomery form z := (2^256 * x) mod p_256k1
709// Input x[4]; output z[4] 883// Input x[4]; output z[4]
710extern void bignum_tomont_p256k1 (uint64_t z[static 4], uint64_t x[static 4]); 884extern void bignum_tomont_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
711extern void bignum_tomont_p256k1_alt (uint64_t z[static 4], uint64_t x[static 4]); 885extern void bignum_tomont_p256k1_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
712 886
713// Convert to Montgomery form z := (2^384 * x) mod p_384 887// Convert to Montgomery form z := (2^384 * x) mod p_384
714// Input x[6]; output z[6] 888// Input x[6]; output z[6]
715extern void bignum_tomont_p384 (uint64_t z[static 6], uint64_t x[static 6]); 889extern void bignum_tomont_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
716extern void bignum_tomont_p384_alt (uint64_t z[static 6], uint64_t x[static 6]); 890extern void bignum_tomont_p384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
717 891
718// Convert to Montgomery form z := (2^576 * x) mod p_521 892// Convert to Montgomery form z := (2^576 * x) mod p_521
719// Input x[9]; output z[9] 893// Input x[9]; output z[9]
720extern void bignum_tomont_p521 (uint64_t z[static 9], uint64_t x[static 9]); 894extern void bignum_tomont_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
895
896// Convert to Montgomery form z := (2^256 * x) mod p_sm2
897// Input x[4]; output z[4]
898extern void bignum_tomont_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
721 899
722// Triple modulo p_256, z := (3 * x) mod p_256 900// Triple modulo p_256, z := (3 * x) mod p_256
723// Input x[4]; output z[4] 901// Input x[4]; output z[4]
724extern void bignum_triple_p256 (uint64_t z[static 4], uint64_t x[static 4]); 902extern void bignum_triple_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
725extern void bignum_triple_p256_alt (uint64_t z[static 4], uint64_t x[static 4]); 903extern void bignum_triple_p256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
726 904
727// Triple modulo p_256k1, z := (3 * x) mod p_256k1 905// Triple modulo p_256k1, z := (3 * x) mod p_256k1
728// Input x[4]; output z[4] 906// Input x[4]; output z[4]
729extern void bignum_triple_p256k1 (uint64_t z[static 4], uint64_t x[static 4]); 907extern void bignum_triple_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
730extern void bignum_triple_p256k1_alt (uint64_t z[static 4], uint64_t x[static 4]); 908extern void bignum_triple_p256k1_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
731 909
732// Triple modulo p_384, z := (3 * x) mod p_384 910// Triple modulo p_384, z := (3 * x) mod p_384
733// Input x[6]; output z[6] 911// Input x[6]; output z[6]
734extern void bignum_triple_p384 (uint64_t z[static 6], uint64_t x[static 6]); 912extern void bignum_triple_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
735extern void bignum_triple_p384_alt (uint64_t z[static 6], uint64_t x[static 6]); 913extern void bignum_triple_p384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
736 914
737// Triple modulo p_521, z := (3 * x) mod p_521, assuming x reduced 915// Triple modulo p_521, z := (3 * x) mod p_521, assuming x reduced
738// Input x[9]; output z[9] 916// Input x[9]; output z[9]
739extern void bignum_triple_p521 (uint64_t z[static 9], uint64_t x[static 9]); 917extern void bignum_triple_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
740extern void bignum_triple_p521_alt (uint64_t z[static 9], uint64_t x[static 9]); 918extern void bignum_triple_p521_alt (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
919
920// Triple modulo p_sm2, z := (3 * x) mod p_sm2
921// Input x[4]; output z[4]
922extern void bignum_triple_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
923extern void bignum_triple_sm2_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
741 924
742// Montgomery ladder step for curve25519 925// Montgomery ladder step for curve25519
743// Inputs point[8], pp[16], b; output rr[16] 926// Inputs point[8], pp[16], b; output rr[16]
744extern void curve25519_ladderstep(uint64_t rr[16],uint64_t point[8],uint64_t pp[16],uint64_t b); 927extern void curve25519_ladderstep(uint64_t rr[16],const uint64_t point[8],const uint64_t pp[16],uint64_t b);
745extern void curve25519_ladderstep_alt(uint64_t rr[16],uint64_t point[8],uint64_t pp[16],uint64_t b); 928extern void curve25519_ladderstep_alt(uint64_t rr[16],const uint64_t point[8],const uint64_t pp[16],uint64_t b);
746 929
747// Projective scalar multiplication, x coordinate only, for curve25519 930// Projective scalar multiplication, x coordinate only, for curve25519
748// Inputs scalar[4], point[4]; output res[8] 931// Inputs scalar[4], point[4]; output res[8]
749extern void curve25519_pxscalarmul(uint64_t res[static 8],uint64_t scalar[static 4],uint64_t point[static 4]); 932extern void curve25519_pxscalarmul(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 4]);
750extern void curve25519_pxscalarmul_alt(uint64_t res[static 8],uint64_t scalar[static 4],uint64_t point[static 4]); 933extern void curve25519_pxscalarmul_alt(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 4]);
751 934
752// x25519 function for curve25519 935// x25519 function for curve25519
753// Inputs scalar[4], point[4]; output res[4] 936// Inputs scalar[4], point[4]; output res[4]
754extern void curve25519_x25519(uint64_t res[static 4],uint64_t scalar[static 4],uint64_t point[static 4]); 937extern void curve25519_x25519(uint64_t res[S2N_BIGNUM_STATIC 4],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 4]);
755extern void curve25519_x25519_alt(uint64_t res[static 4],uint64_t scalar[static 4],uint64_t point[static 4]); 938extern void curve25519_x25519_alt(uint64_t res[S2N_BIGNUM_STATIC 4],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 4]);
939
940// x25519 function for curve25519 (byte array arguments)
941// Inputs scalar[32] (bytes), point[32] (bytes); output res[32] (bytes)
942extern void curve25519_x25519_byte(uint8_t res[S2N_BIGNUM_STATIC 32],const uint8_t scalar[S2N_BIGNUM_STATIC 32],const uint8_t point[S2N_BIGNUM_STATIC 32]);
943extern void curve25519_x25519_byte_alt(uint8_t res[S2N_BIGNUM_STATIC 32],const uint8_t scalar[S2N_BIGNUM_STATIC 32],const uint8_t point[S2N_BIGNUM_STATIC 32]);
756 944
757// x25519 function for curve25519 on base element 9 945// x25519 function for curve25519 on base element 9
758// Input scalar[4]; output res[4] 946// Input scalar[4]; output res[4]
759extern void curve25519_x25519base(uint64_t res[static 4],uint64_t scalar[static 4]); 947extern void curve25519_x25519base(uint64_t res[S2N_BIGNUM_STATIC 4],const uint64_t scalar[S2N_BIGNUM_STATIC 4]);
760extern void curve25519_x25519base_alt(uint64_t res[static 4],uint64_t scalar[static 4]); 948extern void curve25519_x25519base_alt(uint64_t res[S2N_BIGNUM_STATIC 4],const uint64_t scalar[S2N_BIGNUM_STATIC 4]);
949
950// x25519 function for curve25519 on base element 9 (byte array arguments)
951// Input scalar[32] (bytes); output res[32] (bytes)
952extern void curve25519_x25519base_byte(uint8_t res[S2N_BIGNUM_STATIC 32],const uint8_t scalar[S2N_BIGNUM_STATIC 32]);
953extern void curve25519_x25519base_byte_alt(uint8_t res[S2N_BIGNUM_STATIC 32],const uint8_t scalar[S2N_BIGNUM_STATIC 32]);
954
955// Decode compressed 256-bit form of edwards25519 point
956// Input c[32] (bytes); output function return and z[8]
957extern uint64_t edwards25519_decode(uint64_t z[S2N_BIGNUM_STATIC 8], const uint8_t c[S2N_BIGNUM_STATIC 32]);
958extern uint64_t edwards25519_decode_alt(uint64_t z[S2N_BIGNUM_STATIC 8], const uint8_t c[S2N_BIGNUM_STATIC 32]);
959
960// Encode edwards25519 point into compressed form as 256-bit number
961// Input p[8]; output z[32] (bytes)
962extern void edwards25519_encode(uint8_t z[S2N_BIGNUM_STATIC 32], const uint64_t p[S2N_BIGNUM_STATIC 8]);
761 963
762// Extended projective addition for edwards25519 964// Extended projective addition for edwards25519
763// Inputs p1[16], p2[16]; output p3[16] 965// Inputs p1[16], p2[16]; output p3[16]
764extern void edwards25519_epadd(uint64_t p3[static 16],uint64_t p1[static 16],uint64_t p2[static 16]); 966extern void edwards25519_epadd(uint64_t p3[S2N_BIGNUM_STATIC 16],const uint64_t p1[S2N_BIGNUM_STATIC 16],const uint64_t p2[S2N_BIGNUM_STATIC 16]);
765extern void edwards25519_epadd_alt(uint64_t p3[static 16],uint64_t p1[static 16],uint64_t p2[static 16]); 967extern void edwards25519_epadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 16],const uint64_t p1[S2N_BIGNUM_STATIC 16],const uint64_t p2[S2N_BIGNUM_STATIC 16]);
766 968
767// Extended projective doubling for edwards25519 969// Extended projective doubling for edwards25519
768// Inputs p1[12]; output p3[16] 970// Inputs p1[12]; output p3[16]
769extern void edwards25519_epdouble(uint64_t p3[static 16],uint64_t p1[static 12]); 971extern void edwards25519_epdouble(uint64_t p3[S2N_BIGNUM_STATIC 16],const uint64_t p1[S2N_BIGNUM_STATIC 12]);
770extern void edwards25519_epdouble_alt(uint64_t p3[static 16],uint64_t p1[static 12]); 972extern void edwards25519_epdouble_alt(uint64_t p3[S2N_BIGNUM_STATIC 16],const uint64_t p1[S2N_BIGNUM_STATIC 12]);
771 973
772// Projective doubling for edwards25519 974// Projective doubling for edwards25519
773// Inputs p1[12]; output p3[12] 975// Inputs p1[12]; output p3[12]
774extern void edwards25519_pdouble(uint64_t p3[static 12],uint64_t p1[static 12]); 976extern void edwards25519_pdouble(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12]);
775extern void edwards25519_pdouble_alt(uint64_t p3[static 12],uint64_t p1[static 12]); 977extern void edwards25519_pdouble_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12]);
776 978
777// Extended projective + precomputed mixed addition for edwards25519 979// Extended projective + precomputed mixed addition for edwards25519
778// Inputs p1[16], p2[12]; output p3[16] 980// Inputs p1[16], p2[12]; output p3[16]
779extern void edwards25519_pepadd(uint64_t p3[static 16],uint64_t p1[static 16],uint64_t p2[static 12]); 981extern void edwards25519_pepadd(uint64_t p3[S2N_BIGNUM_STATIC 16],const uint64_t p1[S2N_BIGNUM_STATIC 16],const uint64_t p2[S2N_BIGNUM_STATIC 12]);
780extern void edwards25519_pepadd_alt(uint64_t p3[static 16],uint64_t p1[static 16],uint64_t p2[static 12]); 982extern void edwards25519_pepadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 16],const uint64_t p1[S2N_BIGNUM_STATIC 16],const uint64_t p2[S2N_BIGNUM_STATIC 12]);
983
984// Scalar multiplication by standard basepoint for edwards25519 (Ed25519)
985// Input scalar[4]; output res[8]
986extern void edwards25519_scalarmulbase(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4]);
987extern void edwards25519_scalarmulbase_alt(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4]);
988
989// Double scalar multiplication for edwards25519, fresh and base point
990// Inputs scalar[4], point[8], bscalar[4]; output res[8]
991extern void edwards25519_scalarmuldouble(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4], const uint64_t point[S2N_BIGNUM_STATIC 8],const uint64_t bscalar[S2N_BIGNUM_STATIC 4]);
992extern void edwards25519_scalarmuldouble_alt(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4], const uint64_t point[S2N_BIGNUM_STATIC 8],const uint64_t bscalar[S2N_BIGNUM_STATIC 4]);
993
994// Scalar product of 2-element polynomial vectors in NTT domain, with mulcache
995// Inputs a[512], b[512], bt[256] (signed 16-bit words); output r[256] (signed 16-bit words)
996extern void mlkem_basemul_k2(int16_t r[S2N_BIGNUM_STATIC 256],const int16_t a[S2N_BIGNUM_STATIC 512],const int16_t b[S2N_BIGNUM_STATIC 512],const int16_t bt[S2N_BIGNUM_STATIC 256]);
997
998// Scalar product of 3-element polynomial vectors in NTT domain, with mulcache
999// Inputs a[768], b[768], bt[384] (signed 16-bit words); output r[256] (signed 16-bit words)
1000extern void mlkem_basemul_k3(int16_t r[S2N_BIGNUM_STATIC 256],const int16_t a[S2N_BIGNUM_STATIC 768],const int16_t b[S2N_BIGNUM_STATIC 768],const int16_t bt[S2N_BIGNUM_STATIC 384]);
1001
1002// Scalar product of 4-element polynomial vectors in NTT domain, with mulcache
1003// Inputs a[1024], b[1024], bt[512] (signed 16-bit words); output r[256] (signed 16-bit words)
1004extern void mlkem_basemul_k4(int16_t r[S2N_BIGNUM_STATIC 256],const int16_t a[S2N_BIGNUM_STATIC 1024],const int16_t b[S2N_BIGNUM_STATIC 1024],const int16_t bt[S2N_BIGNUM_STATIC 512]);
1005
1006// Inverse number-theoretic transform from ML-KEM
1007// Inputs a[256] (signed 16-bit words), z_01234[80] (signed 16-bit words), z_56[384] (signed 16-bit words); output a[256] (signed 16-bit words)
1008extern void mlkem_intt(int16_t a[S2N_BIGNUM_STATIC 256],const int16_t z_01234[S2N_BIGNUM_STATIC 80],const int16_t z_56[S2N_BIGNUM_STATIC 384]);
1009
1010// Precompute the mulcache data for a polynomial in the NTT domain
1011// Inputs a[256], z[128] and t[128] (signed 16-bit words); output x[128] (signed 16-bit words)
1012extern void mlkem_mulcache_compute(int16_t x[S2N_BIGNUM_STATIC 128],const int16_t a[S2N_BIGNUM_STATIC 256],const int16_t z[S2N_BIGNUM_STATIC 128],const int16_t t[S2N_BIGNUM_STATIC 128]);
1013
1014// Forward number-theoretic transform from ML-KEM
1015// Inputs a[256] (signed 16-bit words), z_01234[80] (signed 16-bit words), z_56[384] (signed 16-bit words); output a[256] (signed 16-bit words)
1016extern void mlkem_ntt(int16_t a[S2N_BIGNUM_STATIC 256],const int16_t z_01234[S2N_BIGNUM_STATIC 80],const int16_t z_56[S2N_BIGNUM_STATIC 384]);
1017
1018// Canonical modular reduction of polynomial coefficients for ML-KEM
1019// Input a[256] (signed 16-bit words); output a[256] (signed 16-bit words)
1020extern void mlkem_reduce(int16_t a[S2N_BIGNUM_STATIC 256]);
1021
1022// Pack ML-KEM polynomial coefficients as 12-bit numbers
1023// Input a[256] (signed 16-bit words); output r[384] (bytes)
1024extern void mlkem_tobytes(uint8_t r[S2N_BIGNUM_STATIC 384],const int16_t a[S2N_BIGNUM_STATIC 256]);
1025
1026// Conversion of ML-KEM polynomial coefficients to Montgomery form
1027// Input a[256] (signed 16-bit words); output a[256] (signed 16-bit words)
1028extern void mlkem_tomont(int16_t a[S2N_BIGNUM_STATIC 256]);
1029
1030// Uniform rejection sampling for ML-KEM
1031// Inputs *buf (unsigned bytes), buflen, table (unsigned bytes); output r[256] (signed 16-bit words) and function return
1032extern uint64_t mlkem_rej_uniform_VARIABLE_TIME(int16_t r[S2N_BIGNUM_STATIC 256],const uint8_t *buf,uint64_t buflen,const uint8_t *table);
781 1033
782// Point addition on NIST curve P-256 in Montgomery-Jacobian coordinates 1034// Point addition on NIST curve P-256 in Montgomery-Jacobian coordinates
783// Inputs p1[12], p2[12]; output p3[12] 1035// Inputs p1[12], p2[12]; output p3[12]
784extern void p256_montjadd(uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 12]); 1036extern void p256_montjadd(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 12]);
1037extern void p256_montjadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 12]);
785 1038
786// Point doubling on NIST curve P-256 in Montgomery-Jacobian coordinates 1039// Point doubling on NIST curve P-256 in Montgomery-Jacobian coordinates
787// Inputs p1[12]; output p3[12] 1040// Inputs p1[12]; output p3[12]
788extern void p256_montjdouble(uint64_t p3[static 12],uint64_t p1[static 12]); 1041extern void p256_montjdouble(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12]);
1042extern void p256_montjdouble_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12]);
789 1043
790// Point mixed addition on NIST curve P-256 in Montgomery-Jacobian coordinates 1044// Point mixed addition on NIST curve P-256 in Montgomery-Jacobian coordinates
791// Inputs p1[12], p2[8]; output p3[12] 1045// Inputs p1[12], p2[8]; output p3[12]
792extern void p256_montjmixadd(uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 8]); 1046extern void p256_montjmixadd(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 8]);
1047extern void p256_montjmixadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 8]);
1048
1049// Montgomery-Jacobian form scalar multiplication for P-256
1050// Inputs scalar[4], point[12]; output res[12]
1051extern void p256_montjscalarmul(uint64_t res[S2N_BIGNUM_STATIC 12],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 12]);
1052extern void p256_montjscalarmul_alt(uint64_t res[S2N_BIGNUM_STATIC 12],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 12]);
1053
1054// Scalar multiplication for NIST curve P-256
1055// Inputs scalar[4], point[8]; output res[8]
1056extern void p256_scalarmul(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 8]);
1057extern void p256_scalarmul_alt(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 8]);
1058
1059// Scalar multiplication for precomputed point on NIST curve P-256
1060// Inputs scalar[4], blocksize, table[]; output res[8]
1061extern void p256_scalarmulbase(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4],uint64_t blocksize,const uint64_t *table);
1062extern void p256_scalarmulbase_alt(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4],uint64_t blocksize,const uint64_t *table);
793 1063
794// Point addition on NIST curve P-384 in Montgomery-Jacobian coordinates 1064// Point addition on NIST curve P-384 in Montgomery-Jacobian coordinates
795// Inputs p1[18], p2[18]; output p3[18] 1065// Inputs p1[18], p2[18]; output p3[18]
796extern void p384_montjadd(uint64_t p3[static 18],uint64_t p1[static 18],uint64_t p2[static 18]); 1066extern void p384_montjadd(uint64_t p3[S2N_BIGNUM_STATIC 18],const uint64_t p1[S2N_BIGNUM_STATIC 18],const uint64_t p2[S2N_BIGNUM_STATIC 18]);
1067extern void p384_montjadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 18],const uint64_t p1[S2N_BIGNUM_STATIC 18],const uint64_t p2[S2N_BIGNUM_STATIC 18]);
797 1068
798// Point doubling on NIST curve P-384 in Montgomery-Jacobian coordinates 1069// Point doubling on NIST curve P-384 in Montgomery-Jacobian coordinates
799// Inputs p1[18]; output p3[18] 1070// Inputs p1[18]; output p3[18]
800extern void p384_montjdouble(uint64_t p3[static 18],uint64_t p1[static 18]); 1071extern void p384_montjdouble(uint64_t p3[S2N_BIGNUM_STATIC 18],const uint64_t p1[S2N_BIGNUM_STATIC 18]);
1072extern void p384_montjdouble_alt(uint64_t p3[S2N_BIGNUM_STATIC 18],const uint64_t p1[S2N_BIGNUM_STATIC 18]);
801 1073
802// Point mixed addition on NIST curve P-384 in Montgomery-Jacobian coordinates 1074// Point mixed addition on NIST curve P-384 in Montgomery-Jacobian coordinates
803// Inputs p1[18], p2[12]; output p3[18] 1075// Inputs p1[18], p2[12]; output p3[18]
804extern void p384_montjmixadd(uint64_t p3[static 18],uint64_t p1[static 18],uint64_t p2[static 12]); 1076extern void p384_montjmixadd(uint64_t p3[S2N_BIGNUM_STATIC 18],const uint64_t p1[S2N_BIGNUM_STATIC 18],const uint64_t p2[S2N_BIGNUM_STATIC 12]);
1077extern void p384_montjmixadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 18],const uint64_t p1[S2N_BIGNUM_STATIC 18],const uint64_t p2[S2N_BIGNUM_STATIC 12]);
1078
1079// Montgomery-Jacobian form scalar multiplication for P-384
1080// Inputs scalar[6], point[18]; output res[18]
1081extern void p384_montjscalarmul(uint64_t res[S2N_BIGNUM_STATIC 18],const uint64_t scalar[S2N_BIGNUM_STATIC 6],const uint64_t point[S2N_BIGNUM_STATIC 18]);
1082extern void p384_montjscalarmul_alt(uint64_t res[S2N_BIGNUM_STATIC 18],const uint64_t scalar[S2N_BIGNUM_STATIC 6],const uint64_t point[S2N_BIGNUM_STATIC 18]);
805 1083
806// Point addition on NIST curve P-521 in Jacobian coordinates 1084// Point addition on NIST curve P-521 in Jacobian coordinates
807// Inputs p1[27], p2[27]; output p3[27] 1085// Inputs p1[27], p2[27]; output p3[27]
808extern void p521_jadd(uint64_t p3[static 27],uint64_t p1[static 27],uint64_t p2[static 27]); 1086extern void p521_jadd(uint64_t p3[S2N_BIGNUM_STATIC 27],const uint64_t p1[S2N_BIGNUM_STATIC 27],const uint64_t p2[S2N_BIGNUM_STATIC 27]);
1087extern void p521_jadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 27],const uint64_t p1[S2N_BIGNUM_STATIC 27],const uint64_t p2[S2N_BIGNUM_STATIC 27]);
809 1088
810// Point doubling on NIST curve P-521 in Jacobian coordinates 1089// Point doubling on NIST curve P-521 in Jacobian coordinates
811// Input p1[27]; output p3[27] 1090// Input p1[27]; output p3[27]
812extern void p521_jdouble(uint64_t p3[static 27],uint64_t p1[static 27]); 1091extern void p521_jdouble(uint64_t p3[S2N_BIGNUM_STATIC 27],const uint64_t p1[S2N_BIGNUM_STATIC 27]);
1092extern void p521_jdouble_alt(uint64_t p3[S2N_BIGNUM_STATIC 27],const uint64_t p1[S2N_BIGNUM_STATIC 27]);
813 1093
814// Point mixed addition on NIST curve P-521 in Jacobian coordinates 1094// Point mixed addition on NIST curve P-521 in Jacobian coordinates
815// Inputs p1[27], p2[18]; output p3[27] 1095// Inputs p1[27], p2[18]; output p3[27]
816extern void p521_jmixadd(uint64_t p3[static 27],uint64_t p1[static 27],uint64_t p2[static 18]); 1096extern void p521_jmixadd(uint64_t p3[S2N_BIGNUM_STATIC 27],const uint64_t p1[S2N_BIGNUM_STATIC 27],const uint64_t p2[S2N_BIGNUM_STATIC 18]);
1097extern void p521_jmixadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 27],const uint64_t p1[S2N_BIGNUM_STATIC 27],const uint64_t p2[S2N_BIGNUM_STATIC 18]);
1098
1099// Jacobian form scalar multiplication for P-521
1100// Inputs scalar[9], point[27]; output res[27]
1101extern void p521_jscalarmul(uint64_t res[S2N_BIGNUM_STATIC 27],const uint64_t scalar[S2N_BIGNUM_STATIC 9],const uint64_t point[S2N_BIGNUM_STATIC 27]);
1102extern void p521_jscalarmul_alt(uint64_t res[S2N_BIGNUM_STATIC 27],const uint64_t scalar[S2N_BIGNUM_STATIC 9],const uint64_t point[S2N_BIGNUM_STATIC 27]);
817 1103
818// Point addition on SECG curve secp256k1 in Jacobian coordinates 1104// Point addition on SECG curve secp256k1 in Jacobian coordinates
819// Inputs p1[12], p2[12]; output p3[12] 1105// Inputs p1[12], p2[12]; output p3[12]
820extern void secp256k1_jadd(uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 12]); 1106extern void secp256k1_jadd(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 12]);
1107extern void secp256k1_jadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 12]);
821 1108
822// Point doubling on SECG curve secp256k1 in Jacobian coordinates 1109// Point doubling on SECG curve secp256k1 in Jacobian coordinates
823// Input p1[12]; output p3[12] 1110// Input p1[12]; output p3[12]
824extern void secp256k1_jdouble(uint64_t p3[static 12],uint64_t p1[static 12]); 1111extern void secp256k1_jdouble(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12]);
1112extern void secp256k1_jdouble_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12]);
825 1113
826// Point mixed addition on SECG curve secp256k1 in Jacobian coordinates 1114// Point mixed addition on SECG curve secp256k1 in Jacobian coordinates
827// Inputs p1[12], p2[8]; output p3[12] 1115// Inputs p1[12], p2[8]; output p3[12]
828extern void secp256k1_jmixadd(uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 8]); 1116extern void secp256k1_jmixadd(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 8]);
1117extern void secp256k1_jmixadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 8]);
1118
1119// Keccak-f1600 permutation for SHA3
1120// Inputs a[25], rc[24]; output a[25]
1121extern void sha3_keccak_f1600(uint64_t a[S2N_BIGNUM_STATIC 25],const uint64_t rc[S2N_BIGNUM_STATIC 24]);
1122extern void sha3_keccak_f1600_alt(uint64_t a[S2N_BIGNUM_STATIC 25],const uint64_t rc[S2N_BIGNUM_STATIC 24]);
1123
1124// Batched 2-way Keccak-f1600 permutation for SHA3
1125// Inputs a[50], rc[24]; output a[50]
1126extern void sha3_keccak2_f1600(uint64_t a[S2N_BIGNUM_STATIC 50],const uint64_t rc[S2N_BIGNUM_STATIC 24]);
1127extern void sha3_keccak2_f1600_alt(uint64_t a[S2N_BIGNUM_STATIC 50],const uint64_t rc[S2N_BIGNUM_STATIC 24]);
1128
1129// Batched 4-way Keccak-f1600 permutation for SHA3
1130// Inputs a[100], rc[24]; output a[100]
1131extern void sha3_keccak4_f1600(uint64_t a[S2N_BIGNUM_STATIC 100],const uint64_t rc[S2N_BIGNUM_STATIC 24]);
1132extern void sha3_keccak4_f1600_alt(uint64_t a[S2N_BIGNUM_STATIC 100],const uint64_t rc[S2N_BIGNUM_STATIC 24]);
1133extern void sha3_keccak4_f1600_alt2(uint64_t a[S2N_BIGNUM_STATIC 100],const uint64_t rc[S2N_BIGNUM_STATIC 24]);
1134
1135// Point addition on CC curve SM2 in Montgomery-Jacobian coordinates
1136// Inputs p1[12], p2[12]; output p3[12]
1137extern void sm2_montjadd(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 12]);
1138extern void sm2_montjadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 12]);
1139
1140// Point doubling on CC curve SM2 in Montgomery-Jacobian coordinates
1141// Input p1[12]; output p3[12]
1142extern void sm2_montjdouble(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12]);
1143extern void sm2_montjdouble_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12]);
1144
1145// Point mixed addition on CC curve SM2 in Montgomery-Jacobian coordinates
1146// Inputs p1[12], p2[8]; output p3[12]
1147extern void sm2_montjmixadd(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 8]);
1148extern void sm2_montjmixadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 8]);
1149
1150// Montgomery-Jacobian form scalar multiplication for CC curve SM2
1151// Inputs scalar[4], point[12]; output res[12]
1152extern void sm2_montjscalarmul(uint64_t res[S2N_BIGNUM_STATIC 12],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 12]);
1153extern void sm2_montjscalarmul_alt(uint64_t res[S2N_BIGNUM_STATIC 12],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 12]);
829 1154
830// Reverse the bytes in a single word 1155// Reverse the bytes in a single word
831// Input a; output function return 1156// Input a; output function return
@@ -839,6 +1164,10 @@ extern uint64_t word_clz (uint64_t a);
839// Input a; output function return 1164// Input a; output function return
840extern uint64_t word_ctz (uint64_t a); 1165extern uint64_t word_ctz (uint64_t a);
841 1166
1167// Perform 59 "divstep" iterations and return signed matrix of updates
1168// Inputs d, f, g; output m[2][2] and function return
1169extern int64_t word_divstep59(int64_t m[2][2],int64_t d,uint64_t f,uint64_t g);
1170
842// Return maximum of two unsigned 64-bit words 1171// Return maximum of two unsigned 64-bit words
843// Inputs a, b; output function return 1172// Inputs a, b; output function return
844extern uint64_t word_max (uint64_t a, uint64_t b); 1173extern uint64_t word_max (uint64_t a, uint64_t b);
@@ -851,6 +1180,10 @@ extern uint64_t word_min (uint64_t a, uint64_t b);
851// Input a; output function return 1180// Input a; output function return
852extern uint64_t word_negmodinv (uint64_t a); 1181extern uint64_t word_negmodinv (uint64_t a);
853 1182
1183// Count number of set bits in a single 64-bit word (population count)
1184// Input a; output function return
1185extern uint64_t word_popcount (uint64_t a);
1186
854// Single-word reciprocal, 2^64 + ret = ceil(2^128/a) - 1 if MSB of "a" is set 1187// Single-word reciprocal, 2^64 + ret = ceil(2^128/a) - 1 if MSB of "a" is set
855// Input a; output function return 1188// Input a; output function return
856extern uint64_t word_recip (uint64_t a); 1189extern uint64_t word_recip (uint64_t a);
diff --git a/src/lib/libcrypto/bn/s2n_bignum_internal.h b/src/lib/libcrypto/bn/s2n_bignum_internal.h
index b82db7d019..37eebb4fd6 100644
--- a/src/lib/libcrypto/bn/s2n_bignum_internal.h
+++ b/src/lib/libcrypto/bn/s2n_bignum_internal.h
@@ -1,3 +1,5 @@
1// $OpenBSD: s2n_bignum_internal.h,v 1.5 2025/08/12 10:01:37 jsing Exp $
2//
1// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// 4//
3// Permission to use, copy, modify, and/or distribute this software for any 5// Permission to use, copy, modify, and/or distribute this software for any
@@ -14,14 +16,14 @@
14 16
15#ifdef __APPLE__ 17#ifdef __APPLE__
16# define S2N_BN_SYMBOL(NAME) _##NAME 18# define S2N_BN_SYMBOL(NAME) _##NAME
19# if defined(__AARCH64EL__) || defined(__ARMEL__)
20# define __LF %%
21# else
22# define __LF ;
23# endif
17#else 24#else
18# define S2N_BN_SYMBOL(name) name 25# define S2N_BN_SYMBOL(name) name
19#endif 26# define __LF ;
20
21#ifdef __CET__
22# include <cet.h>
23#else
24# define _CET_ENDBR
25#endif 27#endif
26 28
27#define S2N_BN_SYM_VISIBILITY_DIRECTIVE(name) .globl S2N_BN_SYMBOL(name) 29#define S2N_BN_SYM_VISIBILITY_DIRECTIVE(name) .globl S2N_BN_SYMBOL(name)
@@ -34,3 +36,24 @@
34#else 36#else
35# define S2N_BN_SYM_PRIVACY_DIRECTIVE(name) /* NO-OP: S2N_BN_SYM_PRIVACY_DIRECTIVE */ 37# define S2N_BN_SYM_PRIVACY_DIRECTIVE(name) /* NO-OP: S2N_BN_SYM_PRIVACY_DIRECTIVE */
36#endif 38#endif
39
40// Enable indirect branch tracking support unless explicitly disabled
41// with -DNO_IBT. If the platform supports CET, simply inherit this from
42// the usual header. Otherwise manually define _CET_ENDBR, used at each
43// x86 entry point, to be the ENDBR64 instruction, with an explicit byte
44// sequence for compilers/assemblers that don't know about it. Note that
45// it is safe to use ENDBR64 on all platforms, since the encoding is by
46// design interpreted as a NOP on all pre-CET x86_64 processors. The only
47// downside is a small increase in code size and potentially a modest
48// slowdown from executing one more instruction.
49
50#if NO_IBT
51# if defined(_CET_ENDBR)
52# error "The s2n-bignum build option NO_IBT was configured, but _CET_ENDBR is defined in this compilation unit. That is weird, so failing the build."
53# endif
54# define _CET_ENDBR
55#elif defined(__CET__)
56# include <cet.h>
57#elif !defined(_CET_ENDBR)
58# define _CET_ENDBR .byte 0xf3,0x0f,0x1e,0xfa
59#endif