31 files changed, 2988 insertions, 388 deletions
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_add.S b/src/lib/libcrypto/bn/arch/amd64/bignum_add.S
index 5fe4aae7a1..1d4e6d08ef 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_add.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_add.S
@@ -1,3 +1,5 @@
+// $OpenBSD: bignum_add.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
+//
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
@@ -16,9 +18,8 @@
 // Add, z := x + y
 // Inputs x[m], y[n]; outputs function return (carry-out) and z[p]
 //
-//    extern uint64_t bignum_add
+//    extern uint64_t bignum_add(uint64_t p, uint64_t *z, uint64_t m,
-//     (uint64_t p, uint64_t *z,
+//                               const uint64_t *x, uint64_t n, const uint64_t *y);
-//      uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
 //
 // Does the z := x + y operation, truncating modulo p words in general and
 // returning a top carry (0 or 1) in the p'th place, only adding the input
@@ -49,7 +50,7 @@
 S2N_BN_SYMBOL(bignum_add):
-        _CET_ENDBR
+        _CET_ENDBR
 #if WINDOWS_ABI
        push    rdi
@@ -75,7 +76,7 @@ S2N_BN_SYMBOL(bignum_add):
        cmp     p, n
        cmovc   n, p
        cmp     m, n
-        jc      ylonger
+        jc      bignum_add_ylonger
 // The case where x is longer or of the same size (p >= m >= n)
@@ -83,27 +84,27 @@ S2N_BN_SYMBOL(bignum_add):
        sub     m, n
        inc     m
        test    n, n
-        jz      xtest
+        jz      bignum_add_xtest
-xmainloop:
+bignum_add_xmainloop:
        mov     a, [x+8*i]
        adc     a, [y+8*i]
        mov     [z+8*i],a
        inc     i
        dec     n
-        jnz     xmainloop
+        jnz     bignum_add_xmainloop
-        jmp     xtest
+        jmp     bignum_add_xtest
-xtoploop:
+bignum_add_xtoploop:
        mov     a, [x+8*i]
        adc     a, 0
        mov     [z+8*i],a
        inc     i
-xtest:
+bignum_add_xtest:
        dec     m
-        jnz     xtoploop
+        jnz     bignum_add_xtoploop
        mov     ashort, 0
        adc     a, 0
        test    p, p
-        jnz     tails
+        jnz     bignum_add_tails
 #if WINDOWS_ABI
        pop    rsi
        pop    rdi
@@ -112,30 +113,30 @@ xtest:
 // The case where y is longer (p >= n > m)
-ylonger:
+bignum_add_ylonger:
        sub     p, n
        sub     n, m
        test    m, m
-        jz      ytoploop
+        jz      bignum_add_ytoploop
-ymainloop:
+bignum_add_ymainloop:
        mov     a, [x+8*i]
        adc     a, [y+8*i]
        mov     [z+8*i],a
        inc     i
        dec     m
-        jnz     ymainloop
+        jnz     bignum_add_ymainloop
-ytoploop:
+bignum_add_ytoploop:
        mov     a, [y+8*i]
        adc     a, 0
        mov     [z+8*i],a
        inc     i
        dec     n
-        jnz     ytoploop
+        jnz     bignum_add_ytoploop
        mov     ashort, 0
        adc     a, 0
        test    p, p
-        jnz     tails
+        jnz     bignum_add_tails
 #if WINDOWS_ABI
        pop    rsi
        pop    rdi
@@ -144,16 +145,16 @@ ytoploop:
 // Adding a non-trivial tail, when p > max(m,n)
-tails:
+bignum_add_tails:
        mov     [z+8*i],a
        xor     a, a
-        jmp     tail
+        jmp     bignum_add_tail
-tailloop:
+bignum_add_tailloop:
        mov     [z+8*i],a
-tail:
+bignum_add_tail:
        inc     i
        dec     p
-        jnz     tailloop
+        jnz     bignum_add_tailloop
 #if WINDOWS_ABI
        pop    rsi
        pop    rdi
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_cmadd.S b/src/lib/libcrypto/bn/arch/amd64/bignum_cmadd.S
index 25ba17bce2..a611919603 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_cmadd.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_cmadd.S
@@ -1,3 +1,5 @@
+// $OpenBSD: bignum_cmadd.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
+//
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
@@ -16,8 +18,8 @@
 // Multiply-add with single-word multiplier, z := z + c * y
 // Inputs c, y[n]; outputs function return (carry-out) and z[k]
 //
-//    extern uint64_t bignum_cmadd
+//    extern uint64_t bignum_cmadd(uint64_t k, uint64_t *z, uint64_t c, uint64_t n,
-//     (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y);
+//                                 const uint64_t *y);
 //
 // Does the "z := z + c * y" operation where y is n digits, result z is p.
 // Truncates the result in general.
@@ -54,7 +56,7 @@
 S2N_BN_SYMBOL(bignum_cmadd):
-        _CET_ENDBR
+        _CET_ENDBR
 #if WINDOWS_ABI
        push    rdi
@@ -82,7 +84,7 @@ S2N_BN_SYMBOL(bignum_cmadd):
        xor     h, h
        test    n, n
-        jz      end
+        jz      bignum_cmadd_end
 // Move c into a safer register as multiplies overwrite rdx
@@ -96,11 +98,11 @@ S2N_BN_SYMBOL(bignum_cmadd):
        mov     h, rdx
        mov     ishort, 1
        dec     n
-        jz      hightail
+        jz      bignum_cmadd_hightail
 // Main loop, where we always have CF + previous high part h to add in
-loop:
+bignum_cmadd_loop:
        adc     h, [z+8*i]
        sbb     r, r
        mov     rax, [x+8*i]
@@ -111,36 +113,36 @@ loop:
        mov     h, rdx
        inc     i
        dec     n
-        jnz     loop
+        jnz     bignum_cmadd_loop
-hightail:
+bignum_cmadd_hightail:
        adc     h, 0
 // Propagate the carry all the way to the end with h as extra carry word
-tail:
+bignum_cmadd_tail:
        test    p, p
-        jz      end
+        jz      bignum_cmadd_end
        add     [z+8*i], h
        mov     hshort, 0
        inc     i
        dec     p
-        jz      highend
+        jz      bignum_cmadd_highend
-tloop:
+bignum_cmadd_tloop:
        adc     [z+8*i], h
        inc     i
        dec     p
-        jnz     tloop
+        jnz     bignum_cmadd_tloop
-highend:
+bignum_cmadd_highend:
        adc     h, 0
 // Return the high/carry word
-end:
+bignum_cmadd_end:
        mov     rax, h
        pop     rbx
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_cmul.S b/src/lib/libcrypto/bn/arch/amd64/bignum_cmul.S
index 12f785d63a..eb71d9da44 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_cmul.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_cmul.S
@@ -1,3 +1,5 @@
+// $OpenBSD: bignum_cmul.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
+//
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
@@ -16,8 +18,8 @@
 // Multiply by a single word, z := c * y
 // Inputs c, y[n]; outputs function return (carry-out) and z[k]
 //
-//    extern uint64_t bignum_cmul
+//    extern uint64_t bignum_cmul(uint64_t k, uint64_t *z, uint64_t c, uint64_t n,
-//     (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y);
+//                                const uint64_t *y);
 //
 // Does the "z := c * y" operation where y is n digits, result z is p.
 // Truncates the result in general unless p >= n + 1.
@@ -51,7 +53,7 @@
 S2N_BN_SYMBOL(bignum_cmul):
-        _CET_ENDBR
+        _CET_ENDBR
 #if WINDOWS_ABI
        push    rdi
@@ -76,7 +78,7 @@ S2N_BN_SYMBOL(bignum_cmul):
        xor     h, h
        xor     i, i
        test    n, n
-        jz      tail
+        jz      bignum_cmul_tail
 // Move c into a safer register as multiplies overwrite rdx
@@ -90,11 +92,11 @@ S2N_BN_SYMBOL(bignum_cmul):
        mov     h, rdx
        inc     i
        cmp     i, n
-        jz      tail
+        jz      bignum_cmul_tail
 // Main loop doing the multiplications
-loop:
+bignum_cmul_loop:
        mov     rax, [x+8*i]
        mul     c
        add     rax, h
@@ -103,28 +105,28 @@ loop:
        mov     h, rdx
        inc     i
        cmp     i, n
-        jc      loop
+        jc      bignum_cmul_loop
 // Add a tail when the destination is longer
-tail:
+bignum_cmul_tail:
        cmp     i, p
-        jnc     end
+        jnc     bignum_cmul_end
        mov     [z+8*i], h
        xor     h, h
        inc     i
        cmp     i, p
-        jnc     end
+        jnc     bignum_cmul_end
-tloop:
+bignum_cmul_tloop:
        mov     [z+8*i], h
        inc     i
        cmp     i, p
-        jc      tloop
+        jc      bignum_cmul_tloop
 // Return the high/carry word
-end:
+bignum_cmul_end:
        mov     rax, h
 #if WINDOWS_ABI
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_modadd.S b/src/lib/libcrypto/bn/arch/amd64/bignum_modadd.S
new file mode 100644
index 0000000000..baf27fdc7f
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_modadd.S
@@ -0,0 +1,112 @@
+// $OpenBSD: bignum_modadd.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
+//
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+// ----------------------------------------------------------------------------
+// Add modulo m, z := (x + y) mod m, assuming x and y reduced
+// Inputs x[k], y[k], m[k]; output z[k]
+// Computes x + y, then subtracts m once if the (k+1)-word sum is >= m.
+//    extern void bignum_modadd(uint64_t k, uint64_t *z, const uint64_t *x,
+//                              const uint64_t *y, const uint64_t *m);
+// Working registers: R9 = i, R10 = j, RAX = a, R11 = c (volatile in both ABIs)
+// Standard x86-64 ABI: RDI = k, RSI = z, RDX = x, RCX = y, R8 = m
+// Microsoft x64 ABI:   RCX = k, RDX = z, R8 = x, R9 = y, [RSP+40] = m
+// ----------------------------------------------------------------------------
+#include "s2n_bignum_internal.h"
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_modadd)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_modadd)
+        .text
+#define k rdi
+#define z rsi
+#define x rdx
+#define y rcx
+#define m r8
+#define i r9
+#define j r10
+#define a rax
+#define c r11
+S2N_BN_SYMBOL(bignum_modadd):
+        _CET_ENDBR
+#if WINDOWS_ABI
+        push    rdi
+        push    rsi
+        mov     rdi, rcx
+        mov     rsi, rdx
+        mov     rdx, r8
+        mov     rcx, r9
+        mov     r8, [rsp+56]    // 5th argument: [RSP+40] at entry plus the two 8-byte pushes above
+#endif
+// If k = 0 do nothing (no word of z is written)
+        test    k, k
+        jz      bignum_modadd_end
+// First just add (c::z) := x + y, with j counting down the k words
+        xor     c, c
+        mov     j, k
+        xor     i, i            // i = 0; xor also leaves CF = 0 for the first adc
+bignum_modadd_addloop:
+        mov     a, [x+8*i]
+        adc     a, [y+8*i]
+        mov     [z+8*i], a
+        inc     i               // inc and dec do not write CF, so the carry chain survives
+        dec     j
+        jnz     bignum_modadd_addloop
+        adc     c, 0            // c = carry out of the k-word sum (0 or 1)
+// Now do a comparison subtraction (c::z) - m, recording mask for (c::z) >= m
+        mov     j, k
+        xor     i, i            // i = 0 and CF = 0 (no borrow in)
+bignum_modadd_cmploop:
+        mov     a, [z+8*i]
+        sbb     a, [m+8*i]      // the difference is discarded; only the borrow is carried on
+        inc     i
+        dec     j
+        jnz     bignum_modadd_cmploop
+        sbb     c, 0            // fold the final borrow into the top carry word
+        not     c               // mask: all ones when (c::z) >= m, else zero (relies on x, y < m)
+// Now do a masked subtraction z := z - [c] * m, with the borrow parked in j
+        xor     i, i            // i = 0; j is already 0 here, so the first borrow-in is 0
+bignum_modadd_subloop:
+        mov     a, [m+8*i]
+        and     a, c
+        neg     j               // CF := (j != 0), reloading the borrow saved by sbb j, j
+        sbb     [z+8*i], a
+        sbb     j, j            // j := 0 or all ones according to the borrow out
+        inc     i
+        cmp     i, k            // cmp overwrites CF, which is why the borrow lives in j
+        jc      bignum_modadd_subloop
+bignum_modadd_end:
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_modsub.S b/src/lib/libcrypto/bn/arch/amd64/bignum_modsub.S
new file mode 100644
index 0000000000..63b3230e35
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_modsub.S
@@ -0,0 +1,99 @@
+// $OpenBSD: bignum_modsub.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
+//
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+// ----------------------------------------------------------------------------
+// Subtract modulo m, z := (x - y) mod m, assuming x and y reduced
+// Inputs x[k], y[k], m[k]; output z[k]
+// Computes x - y, then adds m back once if that subtraction borrowed.
+//    extern void bignum_modsub(uint64_t k, uint64_t *z, const uint64_t *x,
+//                              const uint64_t *y, const uint64_t *m);
+// Working registers: R9 = i, R10 = j, RAX = a, R11 = c (volatile in both ABIs)
+// Standard x86-64 ABI: RDI = k, RSI = z, RDX = x, RCX = y, R8 = m
+// Microsoft x64 ABI:   RCX = k, RDX = z, R8 = x, R9 = y, [RSP+40] = m
+// ----------------------------------------------------------------------------
+#include "s2n_bignum_internal.h"
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_modsub)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_modsub)
+        .text
+#define k rdi
+#define z rsi
+#define x rdx
+#define y rcx
+#define m r8
+#define i r9
+#define j r10
+#define a rax
+#define c r11
+S2N_BN_SYMBOL(bignum_modsub):
+        _CET_ENDBR
+#if WINDOWS_ABI
+        push    rdi
+        push    rsi
+        mov     rdi, rcx
+        mov     rsi, rdx
+        mov     rdx, r8
+        mov     rcx, r9
+        mov     r8, [rsp+56]    // 5th argument: [RSP+40] at entry plus the two 8-byte pushes above
+#endif
+// If k = 0 do nothing (no word of z is written)
+        test    k, k
+        jz      bignum_modsub_end
+// Subtract z := x - y and record a mask for the carry x - y < 0
+        xor     c, c
+        mov     j, k
+        xor     i, i            // i = 0; xor also leaves CF = 0 for the first sbb
+bignum_modsub_subloop:
+        mov     a, [x+8*i]
+        sbb     a, [y+8*i]
+        mov     [z+8*i], a
+        inc     i               // inc and dec do not write CF, so the borrow chain survives
+        dec     j
+        jnz     bignum_modsub_subloop
+        sbb     c, c            // c := all ones if the subtraction borrowed, else zero
+// Now do a masked addition z := z + [c] * m, with the carry parked in j
+        xor     i, i            // i = 0; j is already 0 here, so the first carry-in is 0
+bignum_modsub_addloop:
+        mov     a, [m+8*i]
+        and     a, c
+        neg     j               // CF := (j != 0), reloading the carry saved by sbb j, j
+        adc     [z+8*i], a
+        sbb     j, j            // j := 0 or all ones according to the carry out
+        inc     i
+        cmp     i, k            // cmp overwrites CF, which is why the carry lives in j
+        jc      bignum_modsub_addloop
+bignum_modsub_end:
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul.S
index a3552679a2..538cce9af7 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_mul.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul.S
@@ -1,3 +1,5 @@
+// $OpenBSD: bignum_mul.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
+//
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
@@ -16,9 +18,8 @@
 // Multiply z := x * y
 // Inputs x[m], y[n]; output z[k]
 //
-//    extern void bignum_mul
+//    extern void bignum_mul(uint64_t k, uint64_t *z, uint64_t m, const uint64_t *x,
-//     (uint64_t k, uint64_t *z,
+//                           uint64_t n, const uint64_t *y);
-//      uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
 //
 // Does the "z := x * y" operation where x is m digits, y is n, result z is k.
 // Truncates the result in general unless k >= m + n
@@ -59,7 +60,7 @@
 S2N_BN_SYMBOL(bignum_mul):
-        _CET_ENDBR
+        _CET_ENDBR
 #if WINDOWS_ABI
        push    rdi
@@ -88,7 +89,7 @@ S2N_BN_SYMBOL(bignum_mul):
 // If we did a multiply-add variant, however, then we could
        test    p, p
-        jz      end
+        jz      bignum_mul_end
 // Set initial 2-part sum to zero (we zero c inside the body)
@@ -99,7 +100,7 @@ S2N_BN_SYMBOL(bignum_mul):
        xor     k, k
-outerloop:
+bignum_mul_outerloop:
 // Zero our carry term first; we eventually want it and a zero is useful now
 // Set a =  max 0 (k + 1 - n), i = min (k + 1) m
@@ -125,11 +126,11 @@ outerloop:
        mov     d, k
        sub     d, i
        sub     i, a
-        jbe     innerend
+        jbe     bignum_mul_innerend
        lea     x,[rcx+8*a]
        lea     y,[r9+8*d-8]
-innerloop:
+bignum_mul_innerloop:
        mov     rax, [y+8*i]
        mul     QWORD PTR  [x]
        add     x, 8
@@ -137,9 +138,9 @@ innerloop:
        adc     h, rdx
        adc     c, 0
        dec     i
-        jnz     innerloop
+        jnz     bignum_mul_innerloop
-innerend:
+bignum_mul_innerend:
        mov     [z], l
        mov     l, h
@@ -147,9 +148,9 @@ innerend:
        add     z, 8
        cmp     k, p
-        jc      outerloop
+        jc      bignum_mul_outerloop
-end:
+bignum_mul_end:
        pop     r15
        pop     r14
        pop     r13
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8.S
new file mode 100644
index 0000000000..d6ad514020
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8.S
@@ -0,0 +1,187 @@
+// $OpenBSD: bignum_mul_4_8.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
+//
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+// ----------------------------------------------------------------------------
+// Multiply z := x * y
+// Inputs x[4], y[4]; output z[8]
+// Uses mulx, adcx and adox, so it needs a CPU with the BMI2 and ADX extensions.
+//    extern void bignum_mul_4_8(uint64_t z[static 8], const uint64_t x[static 4],
+//                               const uint64_t y[static 4]);
+//
+// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
+// Microsoft x64 ABI:   RCX = z, RDX = x, R8 = y
+// ----------------------------------------------------------------------------
+#include "s2n_bignum_internal.h"
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_4_8)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_4_8)
+        .text
+// z and x stay in their standard-ABI argument registers (RDI, RSI)
+#define z rdi
+#define x rsi
+// y arrives in RDX but is copied to RCX, because mulx uses RDX as its implicit multiplicand
+#define y rcx
+// A register kept at zero; it is re-zeroed at the start of every addrow
+#define zero rbp
+#define zeroe ebp
+// mulpadd j,i: add x[i] * rdx at window position (i+j, i+j+1) mod 4: low half via adcx (CF), high half via adox (OF)
+// Clobbers rax and rbx. Would be nice to have conditional expressions reg[i], reg[i+1] ...
+.macro mulpadd arg1,arg2
+        mulx    rbx, rax, [x+8*\arg2]   // rbx:rax = rdx * x[arg2]; mulx leaves the flags alone
+.if ((\arg1 + \arg2) % 4 == 0)
+        adcx    r8, rax
+        adox    r9, rbx
+.elseif ((\arg1 + \arg2) % 4 == 1)
+        adcx    r9, rax
+        adox    r10, rbx
+.elseif ((\arg1 + \arg2) % 4 == 2)
+        adcx    r10, rax
+        adox    r11, rbx
+.elseif ((\arg1 + \arg2) % 4 == 3)
+        adcx    r11, rax
+        adox    r8, rbx
+.endif
+.endm
+// addrow j: add y[j] * x into the 4-register window and store the finished low word to z[j]
+.macro addrow arg1
+        mov     rdx, [y+8*\arg1]
+        xor     zeroe, zeroe            // zero = 0; also clears CF and OF to start both carry chains
+        mulpadd \arg1, 0
+.if (\arg1 % 4 == 0)
+        mov     [z+8*\arg1],r8
+.elseif (\arg1 % 4 == 1)
+        mov     [z+8*\arg1],r9
+.elseif (\arg1 % 4 == 2)
+        mov     [z+8*\arg1],r10
+.elseif (\arg1 % 4 == 3)
+        mov     [z+8*\arg1],r11
+.endif
+        mulpadd \arg1, 1
+        mulpadd \arg1, 2
+.if (\arg1 % 4 == 0)
+        mulx    r8, rax, [x+24]         // high half goes straight into the register just stored to z
+        adcx    r11, rax
+        adox    r8, zero                // absorb the pending OF chain carry
+        adcx    r8, zero                // absorb the pending CF chain carry
+.elseif (\arg1 % 4 == 1)
+        mulx    r9, rax, [x+24]
+        adcx    r8, rax
+        adox    r9, zero
+        adcx    r9, zero
+.elseif (\arg1 % 4 == 2)
+        mulx    r10, rax, [x+24]
+        adcx    r9, rax
+        adox    r10, zero
+        adcx    r10, zero
+.elseif (\arg1 % 4 == 3)
+        mulx    r11, rax, [x+24]
+        adcx    r10, rax
+        adox    r11, zero
+        adcx    r11, zero
+.endif
+.endm
+S2N_BN_SYMBOL(bignum_mul_4_8):
+        _CET_ENDBR
+#if WINDOWS_ABI
+        push    rdi
+        push    rsi
+        mov     rdi, rcx
+        mov     rsi, rdx
+        mov     rdx, r8
+#endif
+// Save callee-saved rbp (the zero register) and rbx (mulx scratch) before using them
+        push    rbp
+        push    rbx
+// Copy y into a safe register to start with, since rdx is reloaded with y[j] for each row
+        mov     y, rdx
+// Zero a register, which also makes sure we don't get a fake carry-in
+        xor     zeroe, zeroe
+// Do the zeroth row, which is a bit different
+// Write back the zero-zero product and then accumulate
+// r8,r11,r10,r9 as y[0] * x from 1..4
+        mov     rdx, [y]
+        mulx    r9, r8, [x]
+        mov     [z], r8
+        mulx    r10, rbx, [x+8]
+        adcx    r9, rbx
+        mulx    r11, rbx, [x+16]
+        adcx    r10, rbx
+        mulx    r8, rbx, [x+24]
+        adcx    r11, rbx
+        adcx    r8, zero                // fold the last carry into the top word of the row
+// Now all the other rows in a uniform pattern
+        addrow  1
+        addrow  2
+        addrow  3
+// Now write back the additional columns (z[4..7]) still held in the register window
+        mov     [z+32], r8
+        mov     [z+40], r9
+        mov     [z+48], r10
+        mov     [z+56], r11
+// Restore registers and return
+        pop     rbx
+        pop     rbp
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8_alt.S
index 70ff69e372..2592d1d658 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8_alt.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8_alt.S
@@ -1,3 +1,5 @@
+// $OpenBSD: bignum_mul_4_8_alt.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
+//
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
@@ -16,8 +18,8 @@
 // Multiply z := x * y
 // Inputs x[4], y[4]; output z[8]
 //
-//    extern void bignum_mul_4_8_alt
+//    extern void bignum_mul_4_8_alt(uint64_t z[static 8], const uint64_t x[static 4],
-//      (uint64_t z[static 8], uint64_t x[static 4], uint64_t y[static 4]);
+//                                   const uint64_t y[static 4]);
 //
 // Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
 // Microsoft x64 ABI:   RCX = z, RDX = x, R8 = y
@@ -72,7 +74,7 @@
        adc     h, rdx
 S2N_BN_SYMBOL(bignum_mul_4_8_alt):
-        _CET_ENDBR
+        _CET_ENDBR
 #if WINDOWS_ABI
        push    rdi
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12.S
new file mode 100644
index 0000000000..56cbdf06e0
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12.S
@@ -0,0 +1,223 @@
+// $OpenBSD: bignum_mul_6_12.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
+//
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+// ----------------------------------------------------------------------------
+// Multiply z := x * y
+// Inputs x[6], y[6]; output z[12]
+// Uses mulx, adcx and adox, so it needs a CPU with the BMI2 and ADX extensions.
+//    extern void bignum_mul_6_12(uint64_t z[static 12], const uint64_t x[static 6],
+//                                const uint64_t y[static 6]);
+//
+// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
+// Microsoft x64 ABI:   RCX = z, RDX = x, R8 = y
+// ----------------------------------------------------------------------------
+#include "s2n_bignum_internal.h"
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_6_12)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_6_12)
+        .text
+// z and x stay in their standard-ABI argument registers (RDI, RSI)
+#define z rdi
+#define x rsi
+// y arrives in RDX but is copied to RCX, because mulx uses RDX as its implicit multiplicand
+#define y rcx
+// A register kept at zero; it is re-zeroed at the start of every addrow
+#define zero rbp
+#define zeroe ebp
+// mulpadd j,i: add x[i] * rdx at window position (i+j, i+j+1) mod 6: low half via adcx (CF), high half via adox (OF)
+// Clobbers rax and rbx. Would be nice to have conditional expressions reg[i], reg[i+1] ...
+.macro mulpadd arg1,arg2
+        mulx    rbx, rax, [x+8*\arg2]   // rbx:rax = rdx * x[arg2]; mulx leaves the flags alone
+.if ((\arg1 + \arg2) % 6 == 0)
+        adcx    r8, rax
+        adox    r9, rbx
+.elseif ((\arg1 + \arg2) % 6 == 1)
+        adcx    r9, rax
+        adox    r10, rbx
+.elseif ((\arg1 + \arg2) % 6 == 2)
+        adcx    r10, rax
+        adox    r11, rbx
+.elseif ((\arg1 + \arg2) % 6 == 3)
+        adcx    r11, rax
+        adox    r12, rbx
+.elseif ((\arg1 + \arg2) % 6 == 4)
+        adcx    r12, rax
+        adox    r13, rbx
+.elseif ((\arg1 + \arg2) % 6 == 5)
+        adcx    r13, rax
+        adox    r8, rbx
+.endif
+.endm
+// addrow j: add y[j] * x into the 6-register window and store the finished low word to z[j]
+.macro addrow arg1
+        mov     rdx, [y+8*\arg1]
+        xor     zeroe, zeroe            // zero = 0; also clears CF and OF to start both carry chains
+        mulpadd \arg1, 0
+.if (\arg1 % 6 == 0)
+        mov     [z+8*\arg1],r8
+.elseif (\arg1 % 6 == 1)
+        mov     [z+8*\arg1],r9
+.elseif (\arg1 % 6 == 2)
+        mov     [z+8*\arg1],r10
+.elseif (\arg1 % 6 == 3)
+        mov     [z+8*\arg1],r11
+.elseif (\arg1 % 6 == 4)
+        mov     [z+8*\arg1],r12
+.elseif (\arg1 % 6 == 5)
+        mov     [z+8*\arg1],r13
+.endif
+        mulpadd \arg1, 1
+        mulpadd \arg1, 2
+        mulpadd \arg1, 3
+        mulpadd \arg1, 4
+.if (\arg1 % 6 == 0)
+        mulx    r8, rax, [x+40]         // high half goes straight into the register just stored to z
+        adcx    r13, rax
+        adox    r8, zero                // absorb the pending OF chain carry
+        adcx    r8, zero                // absorb the pending CF chain carry
+.elseif (\arg1 % 6 == 1)
+        mulx    r9, rax, [x+40]
+        adcx    r8, rax
+        adox    r9, zero
+        adcx    r9, zero
+.elseif (\arg1 % 6 == 2)
+        mulx    r10, rax, [x+40]
+        adcx    r9, rax
+        adox    r10, zero
+        adcx    r10, zero
+.elseif (\arg1 % 6 == 3)
+        mulx    r11, rax, [x+40]
+        adcx    r10, rax
+        adox    r11, zero
+        adcx    r11, zero
+.elseif (\arg1 % 6 == 4)
+        mulx    r12, rax, [x+40]
+        adcx    r11, rax
+        adox    r12, zero
+        adcx    r12, zero
+.elseif (\arg1 % 6 == 5)
+        mulx    r13, rax, [x+40]
+        adcx    r12, rax
+        adox    r13, zero
+        adcx    r13, zero
+.endif
+.endm
+S2N_BN_SYMBOL(bignum_mul_6_12):
+        _CET_ENDBR
+#if WINDOWS_ABI
+        push    rdi
+        push    rsi
+        mov     rdi, rcx
+        mov     rsi, rdx
+        mov     rdx, r8
+#endif
+// Save callee-saved rbp (zero register), rbx (mulx scratch), r12 and r13 (window) before use
+        push    rbp
+        push    rbx
+        push    r12
+        push    r13
+// Copy y into a safe register to start with, since rdx is reloaded with y[j] for each row
+        mov     y, rdx
+// Zero a register, which also makes sure we don't get a fake carry-in
+        xor     zeroe, zeroe
+// Do the zeroth row, which is a bit different
+// Write back the zero-zero product and then accumulate
+// r8,r13,r12,r11,r10,r9 as y[0] * x from 1..6
+        mov     rdx, [y]
+        mulx    r9, r8, [x]
+        mov     [z], r8
+        mulx    r10, rbx, [x+8]
+        adcx    r9, rbx
+        mulx    r11, rbx, [x+16]
+        adcx    r10, rbx
+        mulx    r12, rbx, [x+24]
+        adcx    r11, rbx
+        mulx    r13, rbx, [x+32]
+        adcx    r12, rbx
+        mulx    r8, rbx, [x+40]
+        adcx    r13, rbx
+        adcx    r8, zero                // fold the last carry into the top word of the row
+// Now all the other rows in a uniform pattern
+        addrow  1
+        addrow  2
+        addrow  3
+        addrow  4
+        addrow  5
+// Now write back the additional columns (z[6..11]) still held in the register window
+        mov     [z+48], r8
+        mov     [z+56], r9
+        mov     [z+64], r10
+        mov     [z+72], r11
+        mov     [z+80], r12
+        mov     [z+88], r13
+// Restore registers and return
+        pop     r13
+        pop     r12
+        pop     rbx
+        pop     rbp
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12_alt.S
new file mode 100644
index 0000000000..077c52b38e
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12_alt.S
@@ -0,0 +1,199 @@
+// $OpenBSD: bignum_mul_6_12_alt.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
+//
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+// ----------------------------------------------------------------------------
+// Multiply z := x * y
+// Inputs x[6], y[6]; output z[12]
+//
+//    extern void bignum_mul_6_12_alt(uint64_t z[static 12],
+//                                    const uint64_t x[static 6],
+//                                    const uint64_t y[static 6]);
+//
+// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
+// Microsoft x64 ABI:   RCX = z, RDX = x, R8 = y
+// ----------------------------------------------------------------------------
+#include "s2n_bignum_internal.h"
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_6_12_alt)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_6_12_alt)
+        .text
+// z and x stay in their incoming argument registers (Windows entry code moves them there)
+#define z rdi
+#define x rsi
+// y is copied out of rdx on entry, because one-operand mul writes its result to rdx:rax
+#define y rcx
+// Other variables used as a rotating 3-word window to add terms to
+#define t0 r8
+#define t1 r9
+#define t2 r10
+// combadd: (c,h,l) += numa * numb for a 3-word accumulator, l lowest; clobbers rax, rdx, flags
+#define combadd(c,h,l,numa,numb)                \
+        mov     rax, numa;                      \
+        mul     QWORD PTR numb;                 \
+        add     l, rax;                         \
+        adc     h, rdx;                         \
+        adc     c, 0
+// combadz: as combadd but requires c = 0 on entry, so `adc c, c` leaves just the carry in c
+#define combadz(c,h,l,numa,numb)                \
+        mov     rax, numa;                      \
+        mul     QWORD PTR numb;                 \
+        add     l, rax;                         \
+        adc     h, rdx;                         \
+        adc     c, c
+// combads: (h,l) += numa * numb with no third word; any carry out of h is not recorded
+#define combads(h,l,numa,numb)                  \
+        mov     rax, numa;                      \
+        mul     QWORD PTR numb;                 \
+        add     l, rax;                         \
+        adc     h, rdx
+S2N_BN_SYMBOL(bignum_mul_6_12_alt):
+        _CET_ENDBR
+#if WINDOWS_ABI
+        push    rdi                     // rdi and rsi are callee-saved under the Microsoft x64 ABI
+        push    rsi
+        mov     rdi, rcx                // move the arguments from rcx, rdx, r8 ...
+        mov     rsi, rdx                // ... into rdi, rsi, rdx so the body below is ABI-neutral
+        mov     rdx, r8
+#endif
+// Copy y out of rdx (into rcx), since every mul below overwrites rdx with a high half
+        mov     y, rdx
+// Result term 0: x[0]*y[0]; low word is z[0], high word seeds the window in t0
+        mov     rax, [x]
+        mul     QWORD PTR [y]
+        mov     [z], rax
+        mov     t0, rdx
+        xor     t1, t1
+// Result term 1: x[0]*y[1] + x[1]*y[0]; window (c,h,l) = (t2,t1,t0)
+        xor     t2, t2
+        combads(t1,t0,[x],[y+8])
+        combadz(t2,t1,t0,[x+8],[y])
+        mov     [z+8], t0
+// Result term 2: sum of x[i]*y[j] over i+j = 2; window (c,h,l) = (t0,t2,t1)
+        xor     t0, t0
+        combadz(t0,t2,t1,[x],[y+16])
+        combadd(t0,t2,t1,[x+8],[y+8])
+        combadd(t0,t2,t1,[x+16],[y])
+        mov     [z+16], t1
+// Result term 3: sum of x[i]*y[j] over i+j = 3; window (c,h,l) = (t1,t0,t2)
+        xor     t1, t1
+        combadz(t1,t0,t2,[x],[y+24])
+        combadd(t1,t0,t2,[x+8],[y+16])
+        combadd(t1,t0,t2,[x+16],[y+8])
+        combadd(t1,t0,t2,[x+24],[y])
+        mov     [z+24], t2
+// Result term 4: sum of x[i]*y[j] over i+j = 4; window (c,h,l) = (t2,t1,t0)
+        xor     t2, t2
+        combadz(t2,t1,t0,[x],[y+32])
+        combadd(t2,t1,t0,[x+8],[y+24])
+        combadd(t2,t1,t0,[x+16],[y+16])
+        combadd(t2,t1,t0,[x+24],[y+8])
+        combadd(t2,t1,t0,[x+32],[y])
+        mov     [z+32], t0
+// Result term 5: sum of x[i]*y[j] over i+j = 5; window (c,h,l) = (t0,t2,t1)
+        xor     t0, t0
+        combadz(t0,t2,t1,[x],[y+40])
+        combadd(t0,t2,t1,[x+8],[y+32])
+        combadd(t0,t2,t1,[x+16],[y+24])
+        combadd(t0,t2,t1,[x+24],[y+16])
+        combadd(t0,t2,t1,[x+32],[y+8])
+        combadd(t0,t2,t1,[x+40],[y])
+        mov     [z+40], t1
+// Result term 6: sum of x[i]*y[j] over i+j = 6 (i from 1); window (c,h,l) = (t1,t0,t2)
+        xor     t1, t1
+        combadz(t1,t0,t2,[x+8],[y+40])
+        combadd(t1,t0,t2,[x+16],[y+32])
+        combadd(t1,t0,t2,[x+24],[y+24])
+        combadd(t1,t0,t2,[x+32],[y+16])
+        combadd(t1,t0,t2,[x+40],[y+8])
+        mov     [z+48], t2
+// Result term 7: sum of x[i]*y[j] over i+j = 7 (i from 2); window (c,h,l) = (t2,t1,t0)
+        xor     t2, t2
+        combadz(t2,t1,t0,[x+16],[y+40])
+        combadd(t2,t1,t0,[x+24],[y+32])
+        combadd(t2,t1,t0,[x+32],[y+24])
+        combadd(t2,t1,t0,[x+40],[y+16])
+        mov     [z+56], t0
+// Result term 8: sum of x[i]*y[j] over i+j = 8 (i from 3); window (c,h,l) = (t0,t2,t1)
+        xor     t0, t0
+        combadz(t0,t2,t1,[x+24],[y+40])
+        combadd(t0,t2,t1,[x+32],[y+32])
+        combadd(t0,t2,t1,[x+40],[y+24])
+        mov     [z+64], t1
+// Result term 9: x[4]*y[5] + x[5]*y[4]; window (c,h,l) = (t1,t0,t2)
+        xor     t1, t1
+        combadz(t1,t0,t2,[x+32],[y+40])
+        combadd(t1,t0,t2,[x+40],[y+32])
+        mov     [z+72], t2
+// Result term 10: x[5]*y[5] added into the remaining two words (h,l) = (t1,t0)
+        combads(t1,t0,[x+40],[y+40])
+        mov     [z+80], t0
+// Result term 11: the final high word left in t1
+        mov     [z+88], t1
+// Return; the body only wrote rax, rcx, rdx, r8-r10, so nothing else needs restoring
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16.S
new file mode 100644
index 0000000000..faa0196d8e
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16.S
@@ -0,0 +1,273 @@
+// $OpenBSD: bignum_mul_8_16.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
+//
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+// ----------------------------------------------------------------------------
+// Multiply z := x * y
+// Inputs x[8], y[8]; output z[16]
+//
+//    extern void bignum_mul_8_16(uint64_t z[static 16], const uint64_t x[static 8],
+//                                const uint64_t y[static 8]);
+//
+// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
+// Microsoft x64 ABI:   RCX = z, RDX = x, R8 = y
+// ----------------------------------------------------------------------------
+#include "s2n_bignum_internal.h"
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_8_16)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_8_16)
+        .text
+// z and x stay in their incoming argument registers (Windows entry code moves them there)
+#define z rdi
+#define x rsi
+// y is copied out of rdx on entry, because mulx reads rdx as its implicit multiplicand
+#define y rcx
+// A zero register (rbp), re-zeroed by `xor zeroe, zeroe` wherever the carry flags are reset
+#define zero rbp
+#define zeroe ebp
+// mulpadd i, j adds x[i] * rdx (now assumed = y[j]) into the window at i+j
+.macro mulpadd arg1,arg2
+        mulx    rbx, rax, [x+8*\arg1]   // rbx:rax = rdx * x[i]; clobbers rax and rbx, not flags
+.if ((\arg1 + \arg2) % 8 == 0)
+        adcx    r8, rax                 // low half into column (i+j) mod 8 via the CF chain
+        adox    r9, rbx                 // high half into the next column via the OF chain
+.elseif ((\arg1 + \arg2) % 8 == 1)
+        adcx    r9, rax
+        adox    r10, rbx
+.elseif ((\arg1 + \arg2) % 8 == 2)
+        adcx    r10, rax
+        adox    r11, rbx
+.elseif ((\arg1 + \arg2) % 8 == 3)
+        adcx    r11, rax
+        adox    r12, rbx
+.elseif ((\arg1 + \arg2) % 8 == 4)
+        adcx    r12, rax
+        adox    r13, rbx
+.elseif ((\arg1 + \arg2) % 8 == 5)
+        adcx    r13, rax
+        adox    r14, rbx
+.elseif ((\arg1 + \arg2) % 8 == 6)
+        adcx    r14, rax
+        adox    r15, rbx
+.elseif ((\arg1 + \arg2) % 8 == 7)
+        adcx    r15, rax
+        adox    r8, rbx                 // the window wraps: the column after r15 is r8
+.endif
+.endm
+// mulpade i, j adds x[i] * rdx (now assumed = y[j]) into the window at i+j
+// but re-creates the top word: the next-column register is overwritten, not added to
+.macro mulpade arg1,arg2
+.if ((\arg1 + \arg2) % 8 == 0)
+        mulx    r9, rax, [x+8*\arg1]    // high half written straight into the next-column register
+        adcx    r8, rax                 // low half into column (i+j) mod 8 via the CF chain
+        adox    r9, zero                // fold the pending OF carry into the fresh top word
+.elseif ((\arg1 + \arg2) % 8 == 1)
+        mulx    r10, rax, [x+8*\arg1]
+        adcx    r9, rax
+        adox    r10, zero
+.elseif ((\arg1 + \arg2) % 8 == 2)
+        mulx    r11, rax, [x+8*\arg1]
+        adcx    r10, rax
+        adox    r11, zero
+.elseif ((\arg1 + \arg2) % 8 == 3)
+        mulx    r12, rax, [x+8*\arg1]
+        adcx    r11, rax
+        adox    r12, zero
+.elseif ((\arg1 + \arg2) % 8 == 4)
+        mulx    r13, rax, [x+8*\arg1]
+        adcx    r12, rax
+        adox    r13, zero
+.elseif ((\arg1 + \arg2) % 8 == 5)
+        mulx    r14, rax, [x+8*\arg1]
+        adcx    r13, rax
+        adox    r14, zero
+.elseif ((\arg1 + \arg2) % 8 == 6)
+        mulx    r15, rax, [x+8*\arg1]
+        adcx    r14, rax
+        adox    r15, zero
+.elseif ((\arg1 + \arg2) % 8 == 7)
+        mulx    r8, rax, [x+8*\arg1]    // the window wraps: the column after r15 is r8
+        adcx    r15, rax
+        adox    r8, zero
+.endif
+.endm
+// Add in the whole j'th row: window += y[j] * x[0..7], storing z[j] and re-creating a top word
+.macro addrow arg1
+        mov     rdx, [y+8*\arg1]        // rdx = y[j], the implicit mulx multiplicand
+        xor     zeroe, zeroe            // zero = 0, and CF = OF = 0 to start both carry chains
+        mulpadd 0, \arg1
+.if (\arg1 % 8 == 0)
+        mov     [z+8*\arg1],r8          // column j gets no more terms in this row: store z[j]
+.elseif (\arg1 % 8 == 1)
+        mov     [z+8*\arg1],r9
+.elseif (\arg1 % 8 == 2)
+        mov     [z+8*\arg1],r10
+.elseif (\arg1 % 8 == 3)
+        mov     [z+8*\arg1],r11
+.elseif (\arg1 % 8 == 4)
+        mov     [z+8*\arg1],r12
+.elseif (\arg1 % 8 == 5)
+        mov     [z+8*\arg1],r13
+.elseif (\arg1 % 8 == 6)
+        mov     [z+8*\arg1],r14
+.elseif (\arg1 % 8 == 7)
+        mov     [z+8*\arg1],r15
+.endif
+        mulpadd 1, \arg1
+        mulpadd 2, \arg1
+        mulpadd 3, \arg1
+        mulpadd 4, \arg1
+        mulpadd 5, \arg1
+        mulpadd 6, \arg1
+        mulpade 7, \arg1
+.if (\arg1 % 8 == 0)
+        adc     r8, zero                // absorb the final CF carry into the re-created top word
+.elseif (\arg1 % 8 == 1)
+        adc     r9, zero
+.elseif (\arg1 % 8 == 2)
+        adc     r10, zero
+.elseif (\arg1 % 8 == 3)
+        adc     r11, zero
+.elseif (\arg1 % 8 == 4)
+        adc     r12, zero
+.elseif (\arg1 % 8 == 5)
+        adc     r13, zero
+.elseif (\arg1 % 8 == 6)
+        adc     r14, zero
+.elseif (\arg1 % 8 == 7)
+        adc     r15, zero
+.endif
+.endm
+S2N_BN_SYMBOL(bignum_mul_8_16):
+        _CET_ENDBR
+#if WINDOWS_ABI
+        push    rdi                     // rdi and rsi are callee-saved under the Microsoft x64 ABI
+        push    rsi
+        mov     rdi, rcx                // move the arguments from rcx, rdx, r8 ...
+        mov     rsi, rdx                // ... into rdi, rsi, rdx so the body below is ABI-neutral
+        mov     rdx, r8
+#endif
+// Save the callee-saved registers that the body writes (rbp, rbx, r12-r15)
+        push    rbp
+        push    rbx
+        push    r12
+        push    r13
+        push    r14
+        push    r15
+// Copy y out of rdx (into rcx), since every mulx below takes rdx as its implicit multiplicand
+        mov     y, rdx
+// Zero a register, which also makes sure we don't get a fake carry-in
+        xor     zeroe, zeroe
+// Do the zeroth row, which is a bit different
+// Write back the zero-zero product and then accumulate
+// [r8;r15;r14;r13;r12;r11;r10;r9] = words 8..1 of y[0] * x (word 0 goes straight to z[0])
+        mov     rdx, [y]
+        mulx    r9, r8, [x]             // r9:r8 = y[0] * x[0]
+        mov     [z], r8                 // z[0] is final; r8 is reused below for word 8
+        mulx    r10, rbx, [x+8]         // each step: the high half starts the next word,
+        adc     r9, rbx                 // the low half is added into the current word
+        mulx    r11, rbx, [x+16]
+        adc     r10, rbx
+        mulx    r12, rbx, [x+24]
+        adc     r11, rbx
+        mulx    r13, rbx, [x+32]
+        adc     r12, rbx
+        mulx    r14, rbx, [x+40]
+        adc     r13, rbx
+        mulx    r15, rbx, [x+48]
+        adc     r14, rbx
+        mulx    r8, rbx, [x+56]
+        adc     r15, rbx
+        adc     r8, zero                // absorb the last carry into the top word
+// Now all the other rows in a uniform pattern; row j stores z[j]
+        addrow  1
+        addrow  2
+        addrow  3
+        addrow  4
+        addrow  5
+        addrow  6
+        addrow  7
+// Now write back the eight window registers r8..r15 as z[8..15]
+        mov     [z+64], r8
+        mov     [z+72], r9
+        mov     [z+80], r10
+        mov     [z+88], r11
+        mov     [z+96], r12
+        mov     [z+104], r13
+        mov     [z+112], r14
+        mov     [z+120], r15
+// Real epilog: restore the saved registers in reverse order of the pushes
+        pop     r15
+        pop     r14
+        pop     r13
+        pop     r12
+        pop     rbx
+        pop     rbp
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16_alt.S
index 066403b074..0e30b9170f 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16_alt.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16_alt.S
@@ -1,3 +1,5 @@
+// $OpenBSD: bignum_mul_8_16_alt.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
+//
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
@@ -16,8 +18,9 @@
 // Multiply z := x * y
 // Inputs x[8], y[8]; output z[16]
 //
-//    extern void bignum_mul_8_16_alt
+//    extern void bignum_mul_8_16_alt(uint64_t z[static 16],
-//     (uint64_t z[static 16], uint64_t x[static 8], uint64_t y[static 8]);
+//                                    const uint64_t x[static 8],
+//                                    const uint64_t y[static 8]);
 //
 // Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
 // Microsoft x64 ABI:   RCX = z, RDX = x, R8 = y
@@ -72,7 +75,7 @@
        adc     h, rdx
 S2N_BN_SYMBOL(bignum_mul_8_16_alt):
-        _CET_ENDBR
+        _CET_ENDBR
 #if WINDOWS_ABI
        push    rdi
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr.S
index 54e3f59442..86f1af2ac4 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr.S
@@ -1,3 +1,5 @@
+// $OpenBSD: bignum_sqr.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
+//
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
@@ -16,8 +18,7 @@
 // Square z := x^2
 // Input x[n]; output z[k]
 //
-//    extern void bignum_sqr
+//    extern void bignum_sqr(uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x);
-//     (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x);
 //
 // Does the "z := x^2" operation where x is n digits and result z is k.
 // Truncates the result in general unless k >= 2 * n
@@ -62,7 +63,7 @@
 #define llshort ebp
 S2N_BN_SYMBOL(bignum_sqr):
-        _CET_ENDBR
+        _CET_ENDBR
 #if WINDOWS_ABI
        push    rdi
@@ -86,7 +87,7 @@ S2N_BN_SYMBOL(bignum_sqr):
 // If p = 0 the result is trivial and nothing needs doing
        test    p, p
-        jz      end
+        jz      bignum_sqr_end
 // initialize (hh,ll) = 0
@@ -97,7 +98,7 @@ S2N_BN_SYMBOL(bignum_sqr):
        xor     k, k
-outerloop:
+bignum_sqr_outerloop:
 // First let bot = MAX 0 (k + 1 - n) and top = MIN (k + 1) n
 // We want to accumulate all x[i] * x[k - i] for bot <= i < top
@@ -122,7 +123,7 @@ outerloop:
 // If htop <= bot then main doubled part of the sum is empty
        cmp     i, htop
-        jnc     nosumming
+        jnc     bignum_sqr_nosumming
 // Use a moving pointer for [y] = x[k-i] for the cofactor
@@ -132,7 +133,7 @@ outerloop:
 // Do the main part of the sum x[i] * x[k - i] for 2 * i < k
-innerloop:
+bignum_sqr_innerloop:
        mov     a, [x+8*i]
        mul     QWORD PTR [y]
        add     l, a
@@ -141,7 +142,7 @@ innerloop:
        sub     y, 8
        inc     i
        cmp     i, htop
-        jc      innerloop
+        jc      bignum_sqr_innerloop
 // Now double it
@@ -151,11 +152,11 @@ innerloop:
 // If k is even (which means 2 * i = k) and i < n add the extra x[i]^2 term
-nosumming:
+bignum_sqr_nosumming:
        test    k, 1
-        jnz     innerend
+        jnz     bignum_sqr_innerend
        cmp     i, n
-        jnc     innerend
+        jnc     bignum_sqr_innerend
        mov     a, [x+8*i]
        mul     a
@@ -165,7 +166,7 @@ nosumming:
 // Now add the local sum into the global sum, store and shift
-innerend:
+bignum_sqr_innerend:
        add     l, ll
        mov     [z+8*k], l
        adc     h, hh
@@ -175,11 +176,11 @@ innerend:
        inc     k
        cmp     k, p
-        jc      outerloop
+        jc      bignum_sqr_outerloop
 // Restore registers and return
-end:
+bignum_sqr_end:
        pop     r15
        pop     r14
        pop     r13
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8.S
new file mode 100644
index 0000000000..25664782f7
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8.S
@@ -0,0 +1,158 @@
+// $OpenBSD: bignum_sqr_4_8.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
+//
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+// ----------------------------------------------------------------------------
+// Square, z := x^2
+// Input x[4]; output z[8]
+//
+//    extern void bignum_sqr_4_8(uint64_t z[static 8], const uint64_t x[static 4]);
+//
+// Standard x86-64 ABI: RDI = z, RSI = x
+// Microsoft x64 ABI:   RCX = z, RDX = x
+// ----------------------------------------------------------------------------
+#include "s2n_bignum_internal.h"
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_4_8)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_4_8)
+        .text
+// z and x stay in their incoming argument registers (Windows entry code moves them there)
+#define z rdi
+#define x rsi
+// A zero register
+#define zero rbp
+#define zeroe ebp
+// Window registers d1..d6, accumulating what become result words z[1]..z[6]
+#define d1 r8
+#define d2 r9
+#define d3 r10
+#define d4 r11
+#define d5 r12
+#define d6 r13
+S2N_BN_SYMBOL(bignum_sqr_4_8):
+        _CET_ENDBR
+#if WINDOWS_ABI
+        push    rdi                     // rdi and rsi are callee-saved under the Microsoft x64 ABI
+        push    rsi
+        mov     rdi, rcx                // move the arguments from rcx, rdx ...
+        mov     rsi, rdx                // ... into rdi, rsi so the body below is ABI-neutral
+#endif
+// Save the callee-saved registers that the body writes (rbp, r12, r13)
+        push    rbp
+        push    r12
+        push    r13
+// Set up an initial window [d6;...d1] = [23;03;01], where ij denotes the product x[i] * x[j]
+        mov     rdx, [x]
+        mulx    d2, d1, [x+8]           // d2:d1 = x[0] * x[1]
+        mulx    d4, d3, [x+24]          // d4:d3 = x[0] * x[3]
+        mov     rdx, [x+16]
+        mulx    d6, d5, [x+24]          // d6:d5 = x[2] * x[3]
+// Clear our zero register, and also initialize the flags for the carry chain
+        xor     zeroe, zeroe
+// Chain in the addition of 02 + 12 + 13 to that window (no carry-out possible)
+// This gives all the "heterogeneous" terms of the squaring ready to double
+        mulx    rcx, rax, [x]           // rdx is still x[2]: rcx:rax = x[0] * x[2]
+        adcx    d2, rax                 // low halves ride the CF chain,
+        adox    d3, rcx                 // high halves ride the OF chain
+        mulx    rcx, rax, [x+8]         // x[1] * x[2]
+        adcx    d3, rax
+        adox    d4, rcx
+        mov     rdx, [x+24]
+        mulx    rcx, rax, [x+8]         // x[1] * x[3]
+        adcx    d4, rax
+        adox    d5, rcx
+        adcx    d5, zero                // propagate the remaining CF carry upward
+        adox    d6, zero                // propagate the remaining OF carry upward
+        adcx    d6, zero
+// In principle this is otiose as CF and OF carries are absorbed at this point
+// However it seems helpful for the OOO engine to be told it's a fresh start
+        xor     zeroe, zeroe
+// Double and add to the 00 + 11 + 22 + 33 terms
+//
+// We could use shift-double but this seems tidier and in larger squarings
+// it was actually more efficient. I haven't experimented with this small
+// case to see how much that matters. Note: the writeback here is sprinkled
+// into the sequence in such a way that things still work if z = x, i.e. if
+// the output overwrites the input buffer and beyond.
+        mov     rdx, [x]
+        mulx    rdx, rax, rdx           // rdx:rax = x[0]^2
+        mov     [z], rax                // z[0] is stored only after x[0] has been loaded
+        adcx    d1, d1                  // doubling (d + d) rides the CF chain,
+        adox    d1, rdx                 // adding the square's words rides the OF chain
+        mov     rdx, [x+8]              // load x[1] before z[1] is overwritten
+        mov     [z+8], d1
+        mulx    rdx, rax, rdx           // rdx:rax = x[1]^2
+        adcx    d2, d2
+        adox    d2, rax
+        adcx    d3, d3
+        adox    d3, rdx
+        mov     rdx, [x+16]             // load x[2] before z[2] is overwritten
+        mov     [z+16], d2
+        mulx    rdx, rax, rdx           // rdx:rax = x[2]^2
+        adcx    d4, d4
+        adox    d4, rax
+        adcx    d5, d5
+        adox    d5, rdx
+        mov     rdx, [x+24]             // load x[3] before z[3] is overwritten
+        mov     [z+24], d3
+        mulx    rdx, rax, rdx           // rdx:rax = x[3]^2
+        mov     [z+32], d4              // the stores are mov, so the live CF/OF are untouched
+        adcx    d6, d6
+        mov     [z+40], d5
+        adox    d6, rax
+        mov     [z+48], d6
+        adcx    rdx, zero               // top word: absorb the last CF carry ...
+        adox    rdx, zero               // ... and the last OF carry
+        mov     [z+56], rdx
+// Restore saved registers (reverse order of the pushes) and return
+        pop     r13
+        pop     r12
+        pop     rbp
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8_alt.S
index 7c534ae907..7eafac3284 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8_alt.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8_alt.S
@@ -1,3 +1,5 @@
+// $OpenBSD: bignum_sqr_4_8_alt.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
+//
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
@@ -16,8 +18,8 @@
 // Square, z := x^2
 // Input x[4]; output z[8]
 //
-//    extern void bignum_sqr_4_8_alt
+//    extern void bignum_sqr_4_8_alt(uint64_t z[static 8],
-//      (uint64_t z[static 8], uint64_t x[static 4]);
+//                                   const uint64_t x[static 4]);
 //
 // Standard x86-64 ABI: RDI = z, RSI = x
 // Microsoft x64 ABI:   RCX = z, RDX = x
@@ -71,7 +73,7 @@
        adc     c, 0
 S2N_BN_SYMBOL(bignum_sqr_4_8_alt):
-        _CET_ENDBR
+        _CET_ENDBR
 #if WINDOWS_ABI
        push    rdi
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12.S
new file mode 100644
index 0000000000..3f055e8b75
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12.S
@@ -0,0 +1,227 @@
+// $OpenBSD: bignum_sqr_6_12.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
+//
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+// ----------------------------------------------------------------------------
+// Square, z := x^2
+// Input x[6]; output z[12]
+//
+//    extern void bignum_sqr_6_12(uint64_t z[static 12], const uint64_t x[static 6]);
+//
+// Standard x86-64 ABI: RDI = z, RSI = x
+// Microsoft x64 ABI:   RCX = z, RDX = x
+// ----------------------------------------------------------------------------
+#include "s2n_bignum_internal.h"
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_6_12)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_6_12)
+        .text
+// z and x stay in their incoming argument registers (Windows entry code moves them there)
+#define z rdi
+#define x rsi
+// A zero register
+#define zero rbp
+#define zeroe ebp
+// Window registers d1..d10, accumulating what become result words z[1]..z[10]
+#define d1 r8
+#define d2 r9
+#define d3 r10
+#define d4 r11
+#define d5 r12
+#define d6 r13
+#define d7 r14
+#define d8 r15
+#define d9 rbx
+// Care is needed: d10 aliases the zero register, so zero is unusable once d10 is written
+#define d10 rbp
+S2N_BN_SYMBOL(bignum_sqr_6_12):
+        _CET_ENDBR
+#if WINDOWS_ABI
+        push    rdi                     // rdi and rsi are callee-saved under the Microsoft x64 ABI
+        push    rsi
+        mov     rdi, rcx                // move the arguments from rcx, rdx ...
+        mov     rsi, rdx                // ... into rdi, rsi so the body below is ABI-neutral
+#endif
+// Save the callee-saved registers that the body writes (rbp, rbx, r12-r15)
+        push    rbp
+        push    rbx
+        push    r12
+        push    r13
+        push    r14
+        push    r15
+// Set up an initial window [d8;...d1] = [34;05;03;01], where ij denotes x[i] * x[j]
+        mov     rdx, [x]
+        mulx    d2, d1, [x+8]           // d2:d1 = x[0] * x[1]
+        mulx    d4, d3, [x+24]          // d4:d3 = x[0] * x[3]
+        mulx    d6, d5, [x+40]          // d6:d5 = x[0] * x[5]
+        mov     rdx, [x+24]
+        mulx    d8, d7, [x+32]          // d8:d7 = x[3] * x[4]
+// Clear our zero register, and also initialize the flags for the carry chain
+        xor     zeroe, zeroe
+// Chain in the addition of 02 + 12 + 13 + 14 + 15 to that window
+// (no carry-out possible since we add it to the top of a product)
+        mov     rdx, [x+16]
+        mulx    rcx, rax, [x]           // x[0] * x[2]
+        adcx    d2, rax                 // low halves ride the CF chain,
+        adox    d3, rcx                 // high halves ride the OF chain
+        mulx    rcx, rax, [x+8]         // x[1] * x[2]
+        adcx    d3, rax
+        adox    d4, rcx
+        mov     rdx, [x+8]
+        mulx    rcx, rax, [x+24]        // x[1] * x[3]
+        adcx    d4, rax
+        adox    d5, rcx
+        mulx    rcx, rax, [x+32]        // x[1] * x[4]
+        adcx    d5, rax
+        adox    d6, rcx
+        mulx    rcx, rax, [x+40]        // x[1] * x[5]
+        adcx    d6, rax
+        adox    d7, rcx
+        adcx    d7, zero                // propagate the remaining CF carry upward
+        adox    d8, zero                // propagate the remaining OF carry upward
+        adcx    d8, zero
+// Again zero out the flags. Actually they are already cleared but it may
+// help decouple these in the OOO engine not to wait for the chain above
+        xor     zeroe, zeroe
+// Now chain in the 04 + 23 + 24 + 25 + 35 + 45 terms
+// We are running out of registers: d10 (rbp) takes over the zero register part-way through
+        mov     rdx, [x+32]
+        mulx    rcx, rax, [x]           // x[0] * x[4]
+        adcx    d4, rax
+        adox    d5, rcx
+        mov     rdx, [x+16]
+        mulx    rcx, rax, [x+24]        // x[2] * x[3]
+        adcx    d5, rax
+        adox    d6, rcx
+        mulx    rcx, rax, [x+32]        // x[2] * x[4]
+        adcx    d6, rax
+        adox    d7, rcx
+        mulx    rcx, rax, [x+40]        // x[2] * x[5]
+        adcx    d7, rax
+        adox    d8, rcx
+        mov     rdx, [x+24]
+        mulx    d9, rax, [x+40]         // x[3] * x[5]; the high half creates d9
+        adcx    d8, rax
+        adox    d9, zero                // last use of zero (rbp) before d10 overwrites it
+        mov     rdx, [x+32]
+        mulx    d10, rax, [x+40]        // x[4] * x[5]; the high half creates d10 in rbp
+        adcx    d9, rax
+        mov     eax, 0                  // rax = 0 via mov, which leaves the live CF/OF intact
+        adox    d10, rax
+        adcx    d10, rax
+// Again, just for a clear fresh start for the flags
+        xor     eax, eax
+// Double and add to the 00 + 11 + 22 + 33 + 44 + 55 terms
+//
+// We could use shift-double but this seems tidier and in larger squarings
+// it was actually more efficient. I haven't experimented with this small
+// case to see how much that matters. Note: the writeback here is sprinkled
+// into the sequence in such a way that things still work if z = x, i.e. if
+// the output overwrites the input buffer and beyond.
+        mov     rdx, [x]
+        mulx    rdx, rax, rdx           // rdx:rax = x[0]^2
+        mov     [z], rax                // z[0] is stored only after x[0] has been loaded
+        adcx    d1, d1                  // doubling (d + d) rides the CF chain,
+        adox    d1, rdx                 // adding the square's words rides the OF chain
+        mov     rdx, [x+8]              // load x[1] before z[1] is overwritten
+        mov     [z+8], d1
+        mulx    rdx, rax, rdx           // rdx:rax = x[1]^2
+        adcx    d2, d2
+        adox    d2, rax
+        adcx    d3, d3
+        adox    d3, rdx
+        mov     rdx, [x+16]             // load x[2] before z[2] is overwritten
+        mov     [z+16], d2
+        mulx    rdx, rax, rdx           // rdx:rax = x[2]^2
+        adcx    d4, d4
+        adox    d4, rax
+        adcx    d5, d5
+        adox    d5, rdx
+        mov     rdx, [x+24]             // load x[3] before z[3] is overwritten
+        mov     [z+24], d3
+        mulx    rdx, rax, rdx           // rdx:rax = x[3]^2
+        adcx    d6, d6
+        adox    d6, rax
+        adcx    d7, d7
+        adox    d7, rdx
+        mov     rdx, [x+32]             // load x[4] before z[4] is overwritten
+        mov     [z+32], d4
+        mulx    rdx, rax, rdx           // rdx:rax = x[4]^2
+        adcx    d8, d8
+        adox    d8, rax
+        adcx    d9, d9
+        adox    d9, rdx
+        mov     rdx, [x+40]             // load x[5] before z[5] is overwritten
+        mov     [z+40], d5
+        mulx    rdx, rax, rdx           // rdx:rax = x[5]^2
+        mov     [z+48], d6              // the stores are mov, so the live CF/OF are untouched
+        adcx    d10, d10
+        mov     [z+56], d7
+        adox    d10, rax
+        mov     [z+64], d8
+        mov     eax, 0                  // rax = 0 via mov again: CF and OF are still pending
+        mov     [z+72], d9
+        adcx    rdx, rax                // top word: absorb the last CF carry ...
+        mov     [z+80], d10
+        adox    rdx, rax                // ... and the last OF carry
+        mov     [z+88], rdx
+// Restore saved registers (reverse order of the pushes) and return
+        pop     r15
+        pop     r14
+        pop     r13
+        pop     r12
+        pop     rbx
+        pop     rbp
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12_alt.S
new file mode 100644
index 0000000000..eb43b0a15b
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12_alt.S
@@ -0,0 +1,210 @@
+// $OpenBSD: bignum_sqr_6_12_alt.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
+//
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+// ----------------------------------------------------------------------------
+// Square, z := x^2
+// Input x[6]; output z[12]
+//
+//    extern void bignum_sqr_6_12_alt(uint64_t z[static 12],
+//                                    const uint64_t x[static 6]);
+//
+// Standard x86-64 ABI: RDI = z, RSI = x
+// Microsoft x64 ABI:   RCX = z, RDX = x
+// ----------------------------------------------------------------------------
+#include "s2n_bignum_internal.h"
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_6_12_alt)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_6_12_alt)
+        .text
+// Input arguments
+#define z rdi
+#define x rsi
+// Other variables used as a rotating 3-word window to add terms to
+#define t0 r8
+#define t1 r9
+#define t2 r10
+// Additional temporaries for local windows to share doublings
+#define u0 rcx
+#define u1 r11
+// Key step: (c,h,l) += numa * numb as a 3-word accumulate; clobbers rax, rdx
+#define combadd(c,h,l,numa,numb)                \
+        mov     rax, numa;                      \
+        mul     QWORD PTR numb;                 \
+        add     l, rax;                         \
+        adc     h, rdx;                         \
+        adc     c, 0
+// Set up initial window (c,h,l) = numa * numb with c = 0; clobbers rax, rdx
+#define combaddz(c,h,l,numa,numb)               \
+        mov     rax, numa;                      \
+        mul     QWORD PTR numb;                 \
+        xor     c, c;                           \
+        mov     l, rax;                         \
+        mov     h, rdx
+// Doubling step (c,h,l) = 2 * (c,hh,ll) + (0,h,l); hh and ll are doubled in place
+#define doubladd(c,h,l,hh,ll)                   \
+        add     ll, ll;                         \
+        adc     hh, hh;                         \
+        adc     c, c;                           \
+        add     l, ll;                          \
+        adc     h, hh;                          \
+        adc     c, 0
+// Square term incorporation (c,h,l) += numa^2; clobbers rax, rdx
+#define combadd1(c,h,l,numa)                    \
+        mov     rax, numa;                      \
+        mul     rax;                            \
+        add     l, rax;                         \
+        adc     h, rdx;                         \
+        adc     c, 0
+// Short form (h,l) += numa^2 for when no top carry is expected; clobbers rax, rdx
+#define combads(h,l,numa)                       \
+        mov     rax, numa;                      \
+        mul     rax;                            \
+        add     l, rax;                         \
+        adc     h, rdx
+// (c,h,l) += 2 * numa * numb, doubling the product before adding; for single non-square terms
+#define combadd2(c,h,l,numa,numb)               \
+        mov     rax, numa;                      \
+        mul     QWORD PTR numb;                 \
+        add     rax, rax;                       \
+        adc     rdx, rdx;                       \
+        adc     c, 0;                           \
+        add     l, rax;                         \
+        adc     h, rdx;                         \
+        adc     c, 0
+S2N_BN_SYMBOL(bignum_sqr_6_12_alt):     // z[0..11] := x[0..5]^2, built one output word per result term
+        _CET_ENDBR
+#if WINDOWS_ABI
+        push    rdi                     // saved here, restored before ret
+        push    rsi
+        mov     rdi, rcx                // z arrives in rcx under the Microsoft ABI
+        mov     rsi, rdx                // x arrives in rdx under the Microsoft ABI
+#endif
+// Result term 0: x[0]^2; low word is stored, high word seeds the window
+        mov     rax, [x]
+        mul     rax
+        mov     [z], rax
+        mov     t0, rdx
+        xor     t1, t1
+// Result term 1: 2*x[0]*x[1]
+        xor     t2, t2
+        combadd2(t2,t1,t0,[x],[x+8])
+        mov     [z+8], t0
+// Result term 2: x[1]^2 + 2*x[0]*x[2]
+        xor     t0, t0
+        combadd1(t0,t2,t1,[x+8])
+        combadd2(t0,t2,t1,[x],[x+16])
+        mov     [z+16], t1
+// Result term 3: 2*(x[0]*x[3] + x[1]*x[2]), summed in (t1,u1,u0) then doubled
+        combaddz(t1,u1,u0,[x],[x+24])
+        combadd(t1,u1,u0,[x+8],[x+16])
+        doubladd(t1,t0,t2,u1,u0)
+        mov     [z+24], t2
+// Result term 4: 2*(x[0]*x[4] + x[1]*x[3]) + x[2]^2
+        combaddz(t2,u1,u0,[x],[x+32])
+        combadd(t2,u1,u0,[x+8],[x+24])
+        doubladd(t2,t1,t0,u1,u0)
+        combadd1(t2,t1,t0,[x+16])
+        mov     [z+32], t0
+// Result term 5: 2*(x[0]*x[5] + x[1]*x[4] + x[2]*x[3])
+        combaddz(t0,u1,u0,[x],[x+40])
+        combadd(t0,u1,u0,[x+8],[x+32])
+        combadd(t0,u1,u0,[x+16],[x+24])
+        doubladd(t0,t2,t1,u1,u0)
+        mov     [z+40], t1
+// Result term 6: 2*(x[1]*x[5] + x[2]*x[4]) + x[3]^2
+        combaddz(t1,u1,u0,[x+8],[x+40])
+        combadd(t1,u1,u0,[x+16],[x+32])
+        doubladd(t1,t0,t2,u1,u0)
+        combadd1(t1,t0,t2,[x+24])
+        mov     [z+48], t2
+// Result term 7: 2*(x[2]*x[5] + x[3]*x[4])
+        combaddz(t2,u1,u0,[x+16],[x+40])
+        combadd(t2,u1,u0,[x+24],[x+32])
+        doubladd(t2,t1,t0,u1,u0)
+        mov     [z+56], t0
+// Result term 8: 2*x[3]*x[5] + x[4]^2
+        xor     t0, t0
+        combadd2(t0,t2,t1,[x+24],[x+40])
+        combadd1(t0,t2,t1,[x+32])
+        mov     [z+64], t1
+// Result term 9: 2*x[4]*x[5]
+        xor     t1, t1
+        combadd2(t1,t0,t2,[x+32],[x+40])
+        mov     [z+72], t2
+// Result term 10: x[5]^2, using the short form with no top-carry word
+        combads(t1,t0,[x+40])
+        mov     [z+80], t0
+// Result term 11: the remaining top word of the window
+        mov     [z+88], t1
+// Return
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16.S
new file mode 100644
index 0000000000..41277b5b6a
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16.S
@@ -0,0 +1,311 @@
+// $OpenBSD: bignum_sqr_8_16.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
+//
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+// ----------------------------------------------------------------------------
+// Square, z := x^2
+// Input x[8]; output z[16]
+//
+//    extern void bignum_sqr_8_16(uint64_t z[static 16], const uint64_t x[static 8]);
+//
+// Standard x86-64 ABI: RDI = z, RSI = x
+// Microsoft x64 ABI:   RCX = z, RDX = x
+// ----------------------------------------------------------------------------
+#include "s2n_bignum_internal.h"
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_8_16)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_8_16)
+        .text
+// Input arguments (standard x86-64 ABI registers)
+#define z rdi
+#define x rsi
+// A zero register
+#define zero rbp
+#define zeroe ebp
+// mulpadd i, j adds rdx * x[i] into the r8..r15 window at point (i+j) mod 8; clobbers rax, rcx
+.macro mulpadd arg1,arg2
+        mulx    rcx, rax, [x+8*\arg1]           // rcx:rax = rdx * x[arg1]; arg2 only selects the slot
+.if ((\arg1 + \arg2) % 8 == 0)
+        adcx    r8, rax                         // low word accumulates on the CF chain
+        adox    r9, rcx                         // high word accumulates on the OF chain
+.elseif ((\arg1 + \arg2) % 8 == 1)
+        adcx    r9, rax
+        adox    r10, rcx
+.elseif ((\arg1 + \arg2) % 8 == 2)
+        adcx    r10, rax
+        adox    r11, rcx
+.elseif ((\arg1 + \arg2) % 8 == 3)
+        adcx    r11, rax
+        adox    r12, rcx
+.elseif ((\arg1 + \arg2) % 8 == 4)
+        adcx    r12, rax
+        adox    r13, rcx
+.elseif ((\arg1 + \arg2) % 8 == 5)
+        adcx    r13, rax
+        adox    r14, rcx
+.elseif ((\arg1 + \arg2) % 8 == 6)
+        adcx    r14, rax
+        adox    r15, rcx
+.elseif ((\arg1 + \arg2) % 8 == 7)
+        adcx    r15, rax
+        adox    r8, rcx                         // window wraps around from r15 back to r8
+.endif
+.endm
+// mulpade i, j adds rdx * x[i] into the r8..r15 window at (i+j) mod 8 but
+// re-creates the top word assuming nothing to add there; clobbers rax
+.macro mulpade arg1,arg2
+.if ((\arg1 + \arg2) % 8 == 0)
+        mulx    r9, rax, [x+8*\arg1]            // high word overwrites the next window register
+        adcx    r8, rax                         // low word accumulates on the CF chain
+        adox    r9, zero                        // only the pending OF carry is added to the new top
+.elseif ((\arg1 + \arg2) % 8 == 1)
+        mulx    r10, rax, [x+8*\arg1]
+        adcx    r9, rax
+        adox    r10, zero
+.elseif ((\arg1 + \arg2) % 8 == 2)
+        mulx    r11, rax, [x+8*\arg1]
+        adcx    r10, rax
+        adox    r11, zero
+.elseif ((\arg1 + \arg2) % 8 == 3)
+        mulx    r12, rax, [x+8*\arg1]
+        adcx    r11, rax
+        adox    r12, zero
+.elseif ((\arg1 + \arg2) % 8 == 4)
+        mulx    r13, rax, [x+8*\arg1]
+        adcx    r12, rax
+        adox    r13, zero
+.elseif ((\arg1 + \arg2) % 8 == 5)
+        mulx    r14, rax, [x+8*\arg1]
+        adcx    r13, rax
+        adox    r14, zero
+.elseif ((\arg1 + \arg2) % 8 == 6)
+        mulx    r15, rax, [x+8*\arg1]
+        adcx    r14, rax
+        adox    r15, zero
+.elseif ((\arg1 + \arg2) % 8 == 7)
+        mulx    r8, rax, [x+8*\arg1]            // window wraps around from r15 back to r8
+        adcx    r15, rax
+        adox    r8, zero
+.endif
+.endm
+.macro diagonals                        // sums off-diagonal products x[i]*x[j], then doubles and adds squares; writes z[0..15]
+        xor     zeroe, zeroe            // zero = 0; also clears CF and OF for fresh carry chains
+// Set initial window [r8..r10] + 2 wb = 10 + 20 + 30 + 40 + 50 + 60 + 70
+        mov     rdx, [x]                // multiplier for this row is x[0]; "ij" above means x[i]*x[j]
+        mulx    rax, r9, [x+8]
+        mov     [z+8], r9               // off-diagonal word 1 is parked in z, doubled later
+        mulx    rcx, r10, [x+16]
+        adcx    r10, rax
+        mov     [z+16], r10             // off-diagonal word 2 is parked in z, doubled later
+        mulx    rax, r11, [x+24]
+        adcx    r11, rcx
+        mulx    rcx, r12, [x+32]
+        adcx    r12, rax
+        mulx    rax, r13, [x+40]
+        adcx    r13, rcx
+        mulx    rcx, r14, [x+48]
+        adcx    r14, rax
+        mulx    r8, r15, [x+56]
+        adcx    r15, rcx
+        adcx    r8, zero                // fold the last carry into the top word
+// Add in the next diagonal = 21 + 31 + 41 + 51 + 61 + 71 + 54
+        xor     zeroe, zeroe            // restart both carry chains
+        mov     rdx, [x+8]
+        mulpadd 2, 1
+        mov     [z+24], r11             // word 3 is parked; r11 becomes free for reuse
+        mulpadd 3, 1
+        mov     [z+32], r12             // word 4 is parked; r12 becomes free for reuse
+        mulpadd 4, 1
+        mulpadd 5, 1
+        mulpadd 6, 1
+        mulpade 7, 1
+        mov     rdx, [x+32]             // switch multiplier to x[4] for the 54 term
+        mulpade 5, 4
+        adcx    r10, zero               // flush the CF chain into the new top word
+// And the next one = 32 + 42 + 52 + 62 + 72 + 64 + 65
+        xor     zeroe, zeroe
+        mov     rdx, [x+16]
+        mulpadd 3, 2
+        mov     [z+40], r13             // word 5 is parked; r13 becomes free for reuse
+        mulpadd 4, 2
+        mov     [z+48], r14             // word 6 is parked; r14 becomes free for reuse
+        mulpadd 5, 2
+        mulpadd 6, 2
+        mulpadd 7, 2
+        mov     rdx, [x+48]             // switch multiplier to x[6] for the 64 and 65 terms
+        mulpade 4, 6
+        mulpade 5, 6
+        adcx    r12, zero               // flush the CF chain into the new top word
+// And the final one = 43 + 53 + 63 + 73 + 74 + 75 + 76
+        xor     zeroe, zeroe
+        mov     rdx, [x+24]
+        mulpadd 4, 3
+        mov     [z+56], r15             // word 7 is parked; r15 becomes free for reuse
+        mulpadd 5, 3
+        mov     [z+64], r8              // word 8 is parked
+        mulpadd 6, 3
+        mulpadd 7, 3
+        mov     rdx, [x+56]             // switch multiplier to x[7] for the 74, 75 and 76 terms
+        mulpadd 4, 7
+        mulpade 5, 7
+        mulpade 6, 7
+        adcx    r14, zero               // flush the CF chain into the new top word
+// Double and add things; use z[1]..z[8] and thereafter the registers
+// r9..r14 which haven't been written back yet (r15 is reused for the top word)
+        xor     zeroe, zeroe            // CF chain carries the doubling, OF chain carries the squares
+        mov     rdx, [x]
+        mulx    rcx, rax, rdx           // rcx:rax = x[0]^2
+        mov     [z], rax
+        mov     rax, [z+8]
+        adcx    rax, rax                // double the off-diagonal word
+        adox    rax, rcx                // add the high word of the previous square
+        mov     [z+8], rax
+        mov     rax, [z+16]
+        mov     rdx, [x+8]
+        mulx    rcx, rdx, rdx           // rcx:rdx = x[1]^2 (rdx is overwritten by the low word)
+        adcx    rax, rax
+        adox    rax, rdx
+        mov     [z+16], rax
+        mov     rax, [z+24]
+        adcx    rax, rax
+        adox    rax, rcx
+        mov     [z+24], rax
+        mov     rax, [z+32]
+        mov     rdx, [x+16]
+        mulx    rcx, rdx, rdx           // rcx:rdx = x[2]^2
+        adcx    rax, rax
+        adox    rax, rdx
+        mov     [z+32], rax
+        mov     rax, [z+40]
+        adcx    rax, rax
+        adox    rax, rcx
+        mov     [z+40], rax
+        mov     rax, [z+48]
+        mov     rdx, [x+24]
+        mulx    rcx, rdx, rdx           // rcx:rdx = x[3]^2
+        adcx    rax, rax
+        adox    rax, rdx
+        mov     [z+48], rax
+        mov     rax, [z+56]
+        adcx    rax, rax
+        adox    rax, rcx
+        mov     [z+56], rax
+        mov     rax, [z+64]
+        mov     rdx, [x+32]
+        mulx    rcx, rdx, rdx           // rcx:rdx = x[4]^2
+        adcx    rax, rax
+        adox    rax, rdx
+        mov     [z+64], rax
+        adcx    r9, r9                  // from here on the off-diagonal words are still in registers
+        adox    r9, rcx
+        mov     [z+72], r9
+        mov     rdx, [x+40]
+        mulx    rcx, rdx, rdx           // rcx:rdx = x[5]^2
+        adcx    r10, r10
+        adox    r10, rdx
+        mov     [z+80], r10
+        adcx    r11, r11
+        adox    r11, rcx
+        mov     [z+88], r11
+        mov     rdx, [x+48]
+        mulx    rcx, rdx, rdx           // rcx:rdx = x[6]^2
+        adcx    r12, r12
+        adox    r12, rdx
+        mov     [z+96], r12
+        adcx    r13, r13
+        adox    r13, rcx
+        mov     [z+104], r13
+        mov     rdx, [x+56]
+        mulx    r15, rdx, rdx           // r15:rdx = x[7]^2; r15 becomes the top result word
+        adcx    r14, r14
+        adox    r14, rdx
+        mov     [z+112], r14
+        adcx    r15, zero               // absorb the final carry of the doubling chain
+        adox    r15, zero               // absorb the final carry of the squares chain
+        mov     [z+120], r15
+.endm
+S2N_BN_SYMBOL(bignum_sqr_8_16):         // z[0..15] := x[0..7]^2 using mulx/adcx/adox
+        _CET_ENDBR
+#if WINDOWS_ABI
+        push    rdi                     // saved here, restored before ret
+        push    rsi
+        mov     rdi, rcx                // z arrives in rcx under the Microsoft ABI
+        mov     rsi, rdx                // x arrives in rdx under the Microsoft ABI
+#endif
+// Save the callee-saved registers that the diagonals macro overwrites
+        push    rbp                     // rbp is the "zero" register inside diagonals
+        push    r12                     // r12..r15 are part of the r8..r15 window
+        push    r13
+        push    r14
+        push    r15
+// Do the squaring (the whole computation is the inline diagonals macro)
+        diagonals
+// Restore saved registers in reverse order and return
+        pop     r15
+        pop     r14
+        pop     r13
+        pop     r12
+        pop     rbp
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16_alt.S
index ac0b6f96c2..cb10ba2a12 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16_alt.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16_alt.S
@@ -1,3 +1,5 @@
+// $OpenBSD: bignum_sqr_8_16_alt.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
+//
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
@@ -16,7 +18,8 @@
 // Square, z := x^2
 // Input x[8]; output z[16]
 //
-//    extern void bignum_sqr_8_16_alt (uint64_t z[static 16], uint64_t x[static 8]);
+//    extern void bignum_sqr_8_16_alt(uint64_t z[static 16],
+//                                    const uint64_t x[static 8]);
 //
 // Standard x86-64 ABI: RDI = z, RSI = x
 // Microsoft x64 ABI:   RCX = z, RDX = x
@@ -103,7 +106,7 @@
        adc     c, 0
 S2N_BN_SYMBOL(bignum_sqr_8_16_alt):
-        _CET_ENDBR
+        _CET_ENDBR
 #if WINDOWS_ABI
        push    rdi
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sub.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sub.S
index 3ff8a30510..7324d3a71e 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_sub.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sub.S
@@ -1,3 +1,5 @@
+// $OpenBSD: bignum_sub.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
+//
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
@@ -16,9 +18,8 @@
 // Subtract, z := x - y
 // Inputs x[m], y[n]; outputs function return (carry-out) and z[p]
 //
-//    extern uint64_t bignum_sub
+//    extern uint64_t bignum_sub(uint64_t p, uint64_t *z, uint64_t m,
-//     (uint64_t p, uint64_t *z,
+//                               const uint64_t *x, uint64_t n, const uint64_t *y);
-//      uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
 //
 // Does the z := x - y operation, truncating modulo p words in general and
 // returning a top borrow (0 or 1) in the p'th place, only subtracting input
@@ -49,7 +50,7 @@
 S2N_BN_SYMBOL(bignum_sub):
-        _CET_ENDBR
+        _CET_ENDBR
 #if WINDOWS_ABI
        push    rdi
@@ -75,7 +76,7 @@ S2N_BN_SYMBOL(bignum_sub):
        cmp     p, n
        cmovc   n, p
        cmp     m, n
-        jc      ylonger
+        jc      bignum_sub_ylonger
 // The case where x is longer or of the same size (p >= m >= n)
@@ -83,32 +84,32 @@ S2N_BN_SYMBOL(bignum_sub):
        sub     m, n
        inc     m
        test    n, n
-        jz      xtest
+        jz      bignum_sub_xtest
-xmainloop:
+bignum_sub_xmainloop:
        mov     a, [x+8*i]
        sbb     a, [y+8*i]
        mov     [z+8*i],a
        inc     i
        dec     n
-        jnz     xmainloop
+        jnz     bignum_sub_xmainloop
-        jmp     xtest
+        jmp     bignum_sub_xtest
-xtoploop:
+bignum_sub_xtoploop:
        mov     a, [x+8*i]
        sbb     a, 0
        mov     [z+8*i],a
        inc     i
-xtest:
+bignum_sub_xtest:
        dec     m
-        jnz     xtoploop
+        jnz     bignum_sub_xtoploop
        sbb     a, a
        test    p, p
-        jz      tailskip
+        jz      bignum_sub_tailskip
-tailloop:
+bignum_sub_tailloop:
        mov     [z+8*i],a
        inc     i
        dec     p
-        jnz     tailloop
+        jnz     bignum_sub_tailloop
-tailskip:
+bignum_sub_tailskip:
        neg     a
 #if WINDOWS_ABI
        pop    rsi
@@ -118,29 +119,29 @@ tailskip:
 // The case where y is longer (p >= n > m)
-ylonger:
+bignum_sub_ylonger:
        sub     p, n
        sub     n, m
        test    m, m
-        jz      ytoploop
+        jz      bignum_sub_ytoploop
-ymainloop:
+bignum_sub_ymainloop:
        mov     a, [x+8*i]
        sbb     a, [y+8*i]
        mov     [z+8*i],a
        inc     i
        dec     m
-        jnz     ymainloop
+        jnz     bignum_sub_ymainloop
-ytoploop:
+bignum_sub_ytoploop:
        mov     ashort, 0
        sbb     a, [y+8*i]
        mov     [z+8*i],a
        inc     i
        dec     n
-        jnz     ytoploop
+        jnz     bignum_sub_ytoploop
        sbb     a, a
        test    p, p
-        jnz     tailloop
+        jnz     bignum_sub_tailloop
        neg     a
 #if WINDOWS_ABI
        pop    rsi
diff --git a/src/lib/libcrypto/bn/arch/amd64/bn_arch.c b/src/lib/libcrypto/bn/arch/amd64/bn_arch.c
index a377a05681..9ff8920ca2 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bn_arch.c
+++ b/src/lib/libcrypto/bn/arch/amd64/bn_arch.c
@@ -1,4 +1,4 @@
-/*      $OpenBSD: bn_arch.c,v 1.7 2023/06/24 16:01:44 jsing Exp $ */
+/*      $OpenBSD: bn_arch.c,v 1.12 2025/08/14 15:29:17 jsing Exp $ */
 /*
 * Copyright (c) 2023 Joel Sing <jsing@openbsd.org>
 *
@@ -19,6 +19,7 @@
 #include "bn_arch.h"
 #include "bn_local.h"
+#include "crypto_arch.h"
 #include "s2n_bignum.h"
 #ifdef HAVE_BN_ADD
@@ -26,8 +27,8 @@ BN_ULONG
 bn_add(BN_ULONG *r, int r_len, const BN_ULONG *a, int a_len, const BN_ULONG *b,
    int b_len)
 {
-        return bignum_add(r_len, (uint64_t *)r, a_len, (uint64_t *)a,
+        return bignum_add(r_len, (uint64_t *)r, a_len, (const uint64_t *)a,
-            b_len, (uint64_t *)b);
+            b_len, (const uint64_t *)b);
 }
 #endif
@@ -36,8 +37,8 @@ bn_add(BN_ULONG *r, int r_len, const BN_ULONG *a, int a_len, const BN_ULONG *b,
 BN_ULONG
 bn_add_words(BN_ULONG *rd, const BN_ULONG *ad, const BN_ULONG *bd, int n)
 {
-        return bignum_add(n, (uint64_t *)rd, n, (uint64_t *)ad, n,
+        return bignum_add(n, (uint64_t *)rd, n, (const uint64_t *)ad, n,
-            (uint64_t *)bd);
+            (const uint64_t *)bd);
 }
 #endif
@@ -46,8 +47,8 @@ BN_ULONG
 bn_sub(BN_ULONG *r, int r_len, const BN_ULONG *a, int a_len, const BN_ULONG *b,
    int b_len)
 {
-        return bignum_sub(r_len, (uint64_t *)r, a_len, (uint64_t *)a,
+        return bignum_sub(r_len, (uint64_t *)r, a_len, (const uint64_t *)a,
-            b_len, (uint64_t *)b);
+            b_len, (const uint64_t *)b);
 }
 #endif
@@ -55,8 +56,28 @@ bn_sub(BN_ULONG *r, int r_len, const BN_ULONG *a, int a_len, const BN_ULONG *b,
 BN_ULONG
 bn_sub_words(BN_ULONG *rd, const BN_ULONG *ad, const BN_ULONG *bd, int n)
 {
-        return bignum_sub(n, (uint64_t *)rd, n, (uint64_t *)ad, n,
+        return bignum_sub(n, (uint64_t *)rd, n, (const uint64_t *)ad, n,
-            (uint64_t *)bd);
+            (const uint64_t *)bd);
+}
+#endif
+#ifdef HAVE_BN_MOD_ADD_WORDS
+void
+bn_mod_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
+    const BN_ULONG *m, size_t n)
+{       /* Thin wrapper: forwards r, a, b, m to bignum_modadd() with the word count n first. */
+        bignum_modadd(n, (uint64_t *)r, (const uint64_t *)a,
+            (const uint64_t *)b, (const uint64_t *)m);  /* NOTE(review): casts assume BN_ULONG is 64 bits here - confirm */
+}
+#endif
+#ifdef HAVE_BN_MOD_SUB_WORDS
+void
+bn_mod_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
+    const BN_ULONG *m, size_t n)
+{       /* Thin wrapper: forwards r, a, b, m to bignum_modsub() with the word count n first. */
+        bignum_modsub(n, (uint64_t *)r, (const uint64_t *)a,
+            (const uint64_t *)b, (const uint64_t *)m);  /* NOTE(review): casts assume BN_ULONG is 64 bits here - confirm */
 }
 #endif
@@ -64,7 +85,7 @@ bn_sub_words(BN_ULONG *rd, const BN_ULONG *ad, const BN_ULONG *bd, int n)
 BN_ULONG
 bn_mul_add_words(BN_ULONG *rd, const BN_ULONG *ad, int num, BN_ULONG w)
 {
-        return bignum_cmadd(num, (uint64_t *)rd, w, num, (uint64_t *)ad);
+        return bignum_cmadd(num, (uint64_t *)rd, w, num, (const uint64_t *)ad);
 }
 #endif
@@ -72,25 +93,52 @@ bn_mul_add_words(BN_ULONG *rd, const BN_ULONG *ad, int num, BN_ULONG w)
 BN_ULONG
 bn_mul_words(BN_ULONG *rd, const BN_ULONG *ad, int num, BN_ULONG w)
 {
-        return bignum_cmul(num, (uint64_t *)rd, w, num, (uint64_t *)ad);
+        return bignum_cmul(num, (uint64_t *)rd, w, num, (const uint64_t *)ad);
 }
 #endif
 #ifdef HAVE_BN_MUL_COMBA4
 void
-bn_mul_comba4(BN_ULONG *rd, BN_ULONG *ad, BN_ULONG *bd)
+bn_mul_comba4(BN_ULONG *rd, const BN_ULONG *ad, const BN_ULONG *bd)
 {
-        /* XXX - consider using non-alt on CPUs that have the ADX extension. */
+        if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_ADX) != 0) {
-        bignum_mul_4_8_alt((uint64_t *)rd, (uint64_t *)ad, (uint64_t *)bd);
+                bignum_mul_4_8((uint64_t *)rd, (const uint64_t *)ad,
+                    (const uint64_t *)bd);
+                return;
+        }
+        bignum_mul_4_8_alt((uint64_t *)rd, (const uint64_t *)ad,
+            (const uint64_t *)bd);
+}
+#endif
+#ifdef HAVE_BN_MUL_COMBA6
+void
+bn_mul_comba6(BN_ULONG *rd, const BN_ULONG *ad, const BN_ULONG *bd)
+{       /* 6x6 word multiply; selects the s2n-bignum routine at run time. */
+        if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_ADX) != 0) { /* non-alt variant only when the CPU reports ADX */
+                bignum_mul_6_12((uint64_t *)rd, (const uint64_t *)ad,
+                    (const uint64_t *)bd);
+                return;
+        }
+        bignum_mul_6_12_alt((uint64_t *)rd, (const uint64_t *)ad,      /* fallback used when ADX is not reported */
+            (const uint64_t *)bd);
 }
 #endif
 #ifdef HAVE_BN_MUL_COMBA8
 void
-bn_mul_comba8(BN_ULONG *rd, BN_ULONG *ad, BN_ULONG *bd)
+bn_mul_comba8(BN_ULONG *rd, const BN_ULONG *ad, const BN_ULONG *bd)
 {
-        /* XXX - consider using non-alt on CPUs that have the ADX extension. */
+        if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_ADX) != 0) {
-        bignum_mul_8_16_alt((uint64_t *)rd, (uint64_t *)ad, (uint64_t *)bd);
+                bignum_mul_8_16((uint64_t *)rd, (const uint64_t *)ad,
+                    (const uint64_t *)bd);
+                return;
+        }
+        bignum_mul_8_16_alt((uint64_t *)rd, (const uint64_t *)ad,
+            (const uint64_t *)bd);
 }
 #endif
@@ -98,7 +146,7 @@ bn_mul_comba8(BN_ULONG *rd, BN_ULONG *ad, BN_ULONG *bd)
 int
 bn_sqr(BIGNUM *r, const BIGNUM *a, int r_len, BN_CTX *ctx)
 {
-        bignum_sqr(r_len, (uint64_t *)r->d, a->top, (uint64_t *)a->d);
+        bignum_sqr(r_len, (uint64_t *)r->d, a->top, (const uint64_t *)a->d);
        return 1;
 }
@@ -108,8 +156,25 @@ bn_sqr(BIGNUM *r, const BIGNUM *a, int r_len, BN_CTX *ctx)
 void
 bn_sqr_comba4(BN_ULONG *rd, const BN_ULONG *ad)
 {
-        /* XXX - consider using non-alt on CPUs that have the ADX extension. */
+        if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_ADX) != 0) {
-        bignum_sqr_4_8_alt((uint64_t *)rd, (uint64_t *)ad);
+                bignum_sqr_4_8((uint64_t *)rd, (const uint64_t *)ad);
+                return;
+        }
+        bignum_sqr_4_8_alt((uint64_t *)rd, (const uint64_t *)ad);
+}
+#endif
+#ifdef HAVE_BN_SQR_COMBA6
+void
+bn_sqr_comba6(BN_ULONG *rd, const BN_ULONG *ad)
+{       /* 6 word squaring; selects the s2n-bignum routine at run time. */
+        if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_ADX) != 0) { /* non-alt variant only when the CPU reports ADX */
+                bignum_sqr_6_12((uint64_t *)rd, (const uint64_t *)ad);
+                return;
+        }
+        bignum_sqr_6_12_alt((uint64_t *)rd, (const uint64_t *)ad);     /* fallback used when ADX is not reported */
 }
 #endif
@@ -117,8 +182,12 @@ bn_sqr_comba4(BN_ULONG *rd, const BN_ULONG *ad)
 void
 bn_sqr_comba8(BN_ULONG *rd, const BN_ULONG *ad)
 {
-        /* XXX - consider using non-alt on CPUs that have the ADX extension. */
+        if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_ADX) != 0) {
-        bignum_sqr_8_16_alt((uint64_t *)rd, (uint64_t *)ad);
+                bignum_sqr_8_16((uint64_t *)rd, (const uint64_t *)ad);
+                return;
+        }
+        bignum_sqr_8_16_alt((uint64_t *)rd, (const uint64_t *)ad);
 }
 #endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bn_arch.h b/src/lib/libcrypto/bn/arch/amd64/bn_arch.h
index 927cd75208..7359f993a7 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bn_arch.h
+++ b/src/lib/libcrypto/bn/arch/amd64/bn_arch.h
@@ -1,4 +1,4 @@
-/*      $OpenBSD: bn_arch.h,v 1.14 2024/03/26 06:09:25 jsing Exp $ */
+/*      $OpenBSD: bn_arch.h,v 1.16 2025/08/14 15:22:54 jsing Exp $ */
 /*
 * Copyright (c) 2023 Joel Sing <jsing@openbsd.org>
 *
@@ -27,13 +27,18 @@
 #define HAVE_BN_DIV_WORDS
+#define HAVE_BN_MOD_ADD_WORDS
+#define HAVE_BN_MOD_SUB_WORDS
 #define HAVE_BN_MUL_ADD_WORDS
 #define HAVE_BN_MUL_COMBA4
+#define HAVE_BN_MUL_COMBA6
 #define HAVE_BN_MUL_COMBA8
 #define HAVE_BN_MUL_WORDS
 #define HAVE_BN_SQR
 #define HAVE_BN_SQR_COMBA4
+#define HAVE_BN_SQR_COMBA6
 #define HAVE_BN_SQR_COMBA8
 #define HAVE_BN_SUB
diff --git a/src/lib/libcrypto/bn/arch/amd64/word_clz.S b/src/lib/libcrypto/bn/arch/amd64/word_clz.S
index 3926fcd4b0..705fbdbbda 100644
--- a/src/lib/libcrypto/bn/arch/amd64/word_clz.S
+++ b/src/lib/libcrypto/bn/arch/amd64/word_clz.S
@@ -1,3 +1,5 @@
+// $OpenBSD: word_clz.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
+//
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
@@ -16,7 +18,7 @@
 // Count leading zero bits in a single word
 // Input a; output function return
 //
-//    extern uint64_t word_clz (uint64_t a);
+//    extern uint64_t word_clz(uint64_t a);
 //
 // Standard x86-64 ABI: RDI = a, returns RAX
 // Microsoft x64 ABI:   RCX = a, returns RAX
@@ -30,7 +32,7 @@
        .text
 S2N_BN_SYMBOL(word_clz):
-        _CET_ENDBR
+        _CET_ENDBR
 #if WINDOWS_ABI
        push    rdi
diff --git a/src/lib/libcrypto/bn/bn_internal.h b/src/lib/libcrypto/bn/bn_internal.h
index a1f1515b57..8b5145e225 100644
--- a/src/lib/libcrypto/bn/bn_internal.h
+++ b/src/lib/libcrypto/bn/bn_internal.h
@@ -1,4 +1,4 @@
-/*      $OpenBSD: bn_internal.h,v 1.19 2025/05/25 05:12:05 jsing Exp $ */
+/*      $OpenBSD: bn_internal.h,v 1.20 2025/08/02 16:20:00 jsing Exp $ */
 /*
 * Copyright (c) 2023 Joel Sing <jsing@openbsd.org>
 *
@@ -41,6 +41,8 @@ void bn_mod_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
    const BN_ULONG *m, size_t n);
 void bn_mod_mul_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
    const BN_ULONG *m, BN_ULONG *t, BN_ULONG m0, size_t n);
+void bn_mod_sqr_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *m,
+    BN_ULONG *t, BN_ULONG m0, size_t n);
 void bn_montgomery_multiply_words(BN_ULONG *rp, const BN_ULONG *ap,
    const BN_ULONG *bp, const BN_ULONG *np, BN_ULONG *tp, BN_ULONG n0,
diff --git a/src/lib/libcrypto/bn/bn_local.h b/src/lib/libcrypto/bn/bn_local.h
index 0eb9a7a745..1bd4c16baf 100644
--- a/src/lib/libcrypto/bn/bn_local.h
+++ b/src/lib/libcrypto/bn/bn_local.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: bn_local.h,v 1.51 2025/05/25 04:30:55 jsing Exp $ */
+/* $OpenBSD: bn_local.h,v 1.54 2025/08/05 15:08:13 jsing Exp $ */
 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
 * All rights reserved.
 *
@@ -240,10 +240,12 @@ BN_ULONG bn_sub(BN_ULONG *r, int r_len, const BN_ULONG *a, int a_len,
    const BN_ULONG *b, int b_len);
 void bn_mul_normal(BN_ULONG *r, BN_ULONG *a, int na, BN_ULONG *b, int nb);
-void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b);
+void bn_mul_comba4(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b);
-void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b);
+void bn_mul_comba6(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b);
+void bn_mul_comba8(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b);
 void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a);
+void bn_sqr_comba6(BN_ULONG *r, const BN_ULONG *a);
 void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a);
 int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
diff --git a/src/lib/libcrypto/bn/bn_mod_words.c b/src/lib/libcrypto/bn/bn_mod_words.c
index 8971f9f306..d9aee8701a 100644
--- a/src/lib/libcrypto/bn/bn_mod_words.c
+++ b/src/lib/libcrypto/bn/bn_mod_words.c
@@ -1,4 +1,4 @@
-/*      $OpenBSD: bn_mod_words.c,v 1.1 2025/05/25 04:58:32 jsing Exp $  */
+/*      $OpenBSD: bn_mod_words.c,v 1.3 2025/08/05 15:15:54 jsing Exp $  */
 /*
 * Copyright (c) 2024 Joel Sing <jsing@openbsd.org>
 *
@@ -73,6 +73,42 @@ void
 bn_mod_mul_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
    const BN_ULONG *m, BN_ULONG *t, BN_ULONG m0, size_t n)
 {
-        bn_montgomery_multiply_words(r, a, b, m, t, m0, n);
+        if (n == 4) {
+                bn_mul_comba4(t, a, b);
+                bn_montgomery_reduce_words(r, t, m, m0, n);
+        } else if (n == 6) {
+                bn_mul_comba6(t, a, b);
+                bn_montgomery_reduce_words(r, t, m, m0, n);
+        } else if (n == 8) {
+                bn_mul_comba8(t, a, b);
+                bn_montgomery_reduce_words(r, t, m, m0, n);
+        } else {
+                bn_montgomery_multiply_words(r, a, b, m, t, m0, n);
+        }
+}
+#endif
+/*
+ * bn_mod_sqr_words() computes r[] = (a[] * a[]) mod m[], where a, r and
+ * m are arrays of words with length n (r may be the same as a) in the
+ * Montgomery domain. The result remains in the Montgomery domain.
+ *
+ * For n of 4, 6 or 8 the square is first written to t[] by the matching
+ * fixed-size Comba routine and then reduced; any other n falls back to a
+ * Montgomery multiplication of a by itself. m0 is passed through unchanged
+ * to the reduction/multiplication routine.
+ * NOTE(review): t presumably needs room for at least 2 * n words - confirm
+ * against callers.
+ */
+#ifndef HAVE_BN_MOD_SQR_WORDS
+void
+bn_mod_sqr_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *m,
+    BN_ULONG *t, BN_ULONG m0, size_t n)
+{
+        if (n == 4) {
+                bn_sqr_comba4(t, a);    /* t = a^2, unreduced */
+                bn_montgomery_reduce_words(r, t, m, m0, n);
+        } else if (n == 6) {
+                bn_sqr_comba6(t, a);    /* t = a^2, unreduced */
+                bn_montgomery_reduce_words(r, t, m, m0, n);
+        } else if (n == 8) {
+                bn_sqr_comba8(t, a);    /* t = a^2, unreduced */
+                bn_montgomery_reduce_words(r, t, m, m0, n);
+        } else {
+                bn_montgomery_multiply_words(r, a, a, m, t, m0, n);    /* generic size: multiply a by itself */
+        }
 }
 #endif
diff --git a/src/lib/libcrypto/bn/bn_mont.c b/src/lib/libcrypto/bn/bn_mont.c
index 950846fa5b..8280a8db27 100644
--- a/src/lib/libcrypto/bn/bn_mont.c
+++ b/src/lib/libcrypto/bn/bn_mont.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: bn_mont.c,v 1.68 2025/05/25 05:12:05 jsing Exp $ */
+/* $OpenBSD: bn_mont.c,v 1.69 2025/08/03 10:33:46 tb Exp $ */
 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
 * All rights reserved.
 *
@@ -116,6 +116,7 @@
 * sections 3.8 and 4.2 in http://security.ece.orst.edu/koc/papers/r01rsasw.pdf
 */
+#include <limits.h>
 #include <stdio.h>
 #include <stdint.h>
 #include <string.h>
@@ -214,7 +215,7 @@ BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx)
                 goto err;
        mont->N.neg = 0;
        mont->ri = ((BN_num_bits(mod) + BN_BITS2 - 1) / BN_BITS2) * BN_BITS2;
-        if (mont->ri * 2 < mont->ri)
+        if (mont->ri > INT_MAX / 2)
                goto err;
        /*
diff --git a/src/lib/libcrypto/bn/bn_mul.c b/src/lib/libcrypto/bn/bn_mul.c
index bdeb9b0fe8..a30d05fb02 100644
--- a/src/lib/libcrypto/bn/bn_mul.c
+++ b/src/lib/libcrypto/bn/bn_mul.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: bn_mul.c,v 1.39 2023/07/08 12:21:58 beck Exp $ */
+/* $OpenBSD: bn_mul.c,v 1.43 2025/08/14 15:15:04 jsing Exp $ */
 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
 * All rights reserved.
 *
@@ -57,6 +57,7 @@
 */
 #include <assert.h>
+#include <limits.h>
 #include <stdio.h>
 #include <string.h>
@@ -73,7 +74,7 @@
 */
 #ifndef HAVE_BN_MUL_COMBA4
 void
-bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
+bn_mul_comba4(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b)
 {
        BN_ULONG c0, c1, c2;
@@ -103,13 +104,73 @@ bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 #endif
 /*
+ * bn_mul_comba6() computes r[] = a[] * b[] using Comba multiplication
+ * (https://everything2.com/title/Comba+multiplication), where a and b are both
+ * six word arrays, producing a 12 word array result.
+ */
+#ifndef HAVE_BN_MUL_COMBA6
+void
+bn_mul_comba6(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b)
+{
+        /*
+         * The calls below are grouped by output column k: a column holds
+         * every product a[i] * b[j] with i + j == k, and the last call of
+         * the column stores its low output word directly into r[k].
+         *
+         * NOTE(review): bn_mulw_addtw() is defined elsewhere; from its use
+         * here it presumably computes a * b plus the three word value given
+         * by its 3rd-5th arguments and returns a three word result through
+         * the three pointers (high, middle, low) - confirm against the
+         * helper. The first call of each column passes (0, c2, c1), i.e. the
+         * previous column's upper two output words moved down one position.
+         */
+        BN_ULONG c0, c1, c2;
+        /* Column 0. */
+        bn_mulw_addtw(a[0], b[0],  0,  0,  0, &c2, &c1, &r[0]);
+        /* Column 1. */
+        bn_mulw_addtw(a[0], b[1],  0, c2, c1, &c2, &c1, &c0);
+        bn_mulw_addtw(a[1], b[0], c2, c1, c0, &c2, &c1, &r[1]);
+        /* Column 2. */
+        bn_mulw_addtw(a[2], b[0],  0, c2, c1, &c2, &c1, &c0);
+        bn_mulw_addtw(a[1], b[1], c2, c1, c0, &c2, &c1, &c0);
+        bn_mulw_addtw(a[0], b[2], c2, c1, c0, &c2, &c1, &r[2]);
+        /* Column 3. */
+        bn_mulw_addtw(a[0], b[3],  0, c2, c1, &c2, &c1, &c0);
+        bn_mulw_addtw(a[1], b[2], c2, c1, c0, &c2, &c1, &c0);
+        bn_mulw_addtw(a[2], b[1], c2, c1, c0, &c2, &c1, &c0);
+        bn_mulw_addtw(a[3], b[0], c2, c1, c0, &c2, &c1, &r[3]);
+        /* Column 4. */
+        bn_mulw_addtw(a[4], b[0],  0, c2, c1, &c2, &c1, &c0);
+        bn_mulw_addtw(a[3], b[1], c2, c1, c0, &c2, &c1, &c0);
+        bn_mulw_addtw(a[2], b[2], c2, c1, c0, &c2, &c1, &c0);
+        bn_mulw_addtw(a[1], b[3], c2, c1, c0, &c2, &c1, &c0);
+        bn_mulw_addtw(a[0], b[4], c2, c1, c0, &c2, &c1, &r[4]);
+        /* Column 5 (the widest column, six products). */
+        bn_mulw_addtw(a[0], b[5],  0, c2, c1, &c2, &c1, &c0);
+        bn_mulw_addtw(a[1], b[4], c2, c1, c0, &c2, &c1, &c0);
+        bn_mulw_addtw(a[2], b[3], c2, c1, c0, &c2, &c1, &c0);
+        bn_mulw_addtw(a[3], b[2], c2, c1, c0, &c2, &c1, &c0);
+        bn_mulw_addtw(a[4], b[1], c2, c1, c0, &c2, &c1, &c0);
+        bn_mulw_addtw(a[5], b[0], c2, c1, c0, &c2, &c1, &r[5]);
+        /* Column 6. */
+        bn_mulw_addtw(a[5], b[1],  0, c2, c1, &c2, &c1, &c0);
+        bn_mulw_addtw(a[4], b[2], c2, c1, c0, &c2, &c1, &c0);
+        bn_mulw_addtw(a[3], b[3], c2, c1, c0, &c2, &c1, &c0);
+        bn_mulw_addtw(a[2], b[4], c2, c1, c0, &c2, &c1, &c0);
+        bn_mulw_addtw(a[1], b[5], c2, c1, c0, &c2, &c1, &r[6]);
+        /* Column 7. */
+        bn_mulw_addtw(a[2], b[5],  0, c2, c1, &c2, &c1, &c0);
+        bn_mulw_addtw(a[3], b[4], c2, c1, c0, &c2, &c1, &c0);
+        bn_mulw_addtw(a[4], b[3], c2, c1, c0, &c2, &c1, &c0);
+        bn_mulw_addtw(a[5], b[2], c2, c1, c0, &c2, &c1, &r[7]);
+        /* Column 8. */
+        bn_mulw_addtw(a[5], b[3],  0, c2, c1, &c2, &c1, &c0);
+        bn_mulw_addtw(a[4], b[4], c2, c1, c0, &c2, &c1, &c0);
+        bn_mulw_addtw(a[3], b[5], c2, c1, c0, &c2, &c1, &r[8]);
+        /* Column 9. */
+        bn_mulw_addtw(a[4], b[5],  0, c2, c1, &c2, &c1, &c0);
+        bn_mulw_addtw(a[5], b[4], c2, c1, c0, &c2, &c1, &r[9]);
+        /*
+         * Column 10: the single remaining product fills both r[10] and
+         * r[11]; the top output word written to c2 is not used afterwards.
+         */
+        bn_mulw_addtw(a[5], b[5],  0, c2, c1, &c2, &r[11], &r[10]);
+}
+#endif
+/*
 * bn_mul_comba8() computes r[] = a[] * b[] using Comba multiplication
 * (https://everything2.com/title/Comba+multiplication), where a and b are both
 * eight word arrays, producing a 16 word array result.
 */
 #ifndef HAVE_BN_MUL_COMBA8
 void
-bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
+bn_mul_comba8(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b)
 {
        BN_ULONG c0, c1, c2;
@@ -338,14 +399,16 @@ BN_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx)
        if (rr == NULL)
                goto err;
-        rn = a->top + b->top;
+        if (a->top > INT_MAX - b->top)
-        if (rn < a->top)
                goto err;
+        rn = a->top + b->top;
        if (!bn_wexpand(rr, rn))
                goto err;
        if (a->top == 4 && b->top == 4) {
                bn_mul_comba4(rr->d, a->d, b->d);
+        } else if (a->top == 6 && b->top == 6) {
+                bn_mul_comba6(rr->d, a->d, b->d);
        } else if (a->top == 8 && b->top == 8) {
                bn_mul_comba8(rr->d, a->d, b->d);
        } else {
diff --git a/src/lib/libcrypto/bn/bn_sqr.c b/src/lib/libcrypto/bn/bn_sqr.c
index 0dbccbf85d..2f7f71f819 100644
--- a/src/lib/libcrypto/bn/bn_sqr.c
+++ b/src/lib/libcrypto/bn/bn_sqr.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: bn_sqr.c,v 1.36 2023/07/08 12:21:58 beck Exp $ */
+/* $OpenBSD: bn_sqr.c,v 1.38 2025/08/14 15:15:04 jsing Exp $ */
 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
 * All rights reserved.
 *
@@ -97,6 +97,51 @@ bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
 #endif
 /*
+ * bn_sqr_comba6() computes r[] = a[] * a[] using Comba multiplication
+ * (https://everything2.com/title/Comba+multiplication), where a is a
+ * six word array, producing a 12 word array result.
+ */
+#ifndef HAVE_BN_SQR_COMBA6
+void
+bn_sqr_comba6(BN_ULONG *r, const BN_ULONG *a)
+{
+        /*
+         * The calls below are grouped by output column k: a column holds the
+         * products a[i] * a[j] with i + j == k, and the last call of the
+         * column stores its low output word directly into r[k]. Each cross
+         * term (i != j) appears once and goes through bn_mul2_mulw_addtw();
+         * the square terms a[i] * a[i] go through bn_mulw_addtw().
+         *
+         * NOTE(review): both helpers are defined elsewhere; from their use
+         * here bn_mulw_addtw() presumably adds a single product and
+         * bn_mul2_mulw_addtw() a doubled product to the three word value
+         * passed in, returning three words (high, middle, low) - confirm
+         * against the helpers. A leading 0 in the input triple, as in
+         * (0, c2, c1), moves the previous outputs down one word position.
+         */
+        BN_ULONG c2, c1, c0;
+        /* Column 0. */
+        bn_mulw_addtw(a[0], a[0], 0, 0, 0, &c2, &c1, &r[0]);
+        /* Column 1. */
+        bn_mul2_mulw_addtw(a[1], a[0], 0, c2, c1, &c2, &c1, &r[1]);
+        /* Column 2. */
+        bn_mulw_addtw(a[1], a[1], 0, c2, c1, &c2, &c1, &c0);
+        bn_mul2_mulw_addtw(a[2], a[0], c2, c1, c0, &c2, &c1, &r[2]);
+        /* Column 3. */
+        bn_mul2_mulw_addtw(a[3], a[0], 0, c2, c1, &c2, &c1, &c0);
+        bn_mul2_mulw_addtw(a[2], a[1], c2, c1, c0, &c2, &c1, &r[3]);
+        /* Column 4. */
+        bn_mulw_addtw(a[2], a[2], 0, c2, c1, &c2, &c1, &c0);
+        bn_mul2_mulw_addtw(a[3], a[1], c2, c1, c0, &c2, &c1, &c0);
+        bn_mul2_mulw_addtw(a[4], a[0], c2, c1, c0, &c2, &c1, &r[4]);
+        /* Column 5. */
+        bn_mul2_mulw_addtw(a[5], a[0], 0, c2, c1, &c2, &c1, &c0);
+        bn_mul2_mulw_addtw(a[4], a[1], c2, c1, c0, &c2, &c1, &c0);
+        bn_mul2_mulw_addtw(a[3], a[2], c2, c1, c0, &c2, &c1, &r[5]);
+        /* Column 6. */
+        bn_mulw_addtw(a[3], a[3], 0, c2, c1, &c2, &c1, &c0);
+        bn_mul2_mulw_addtw(a[4], a[2], c2, c1, c0, &c2, &c1, &c0);
+        bn_mul2_mulw_addtw(a[5], a[1], c2, c1, c0, &c2, &c1, &r[6]);
+        /* Column 7. */
+        bn_mul2_mulw_addtw(a[5], a[2], 0, c2, c1, &c2, &c1, &c0);
+        bn_mul2_mulw_addtw(a[4], a[3], c2, c1, c0, &c2, &c1, &r[7]);
+        /* Column 8. */
+        bn_mulw_addtw(a[4], a[4], 0, c2, c1, &c2, &c1, &c0);
+        bn_mul2_mulw_addtw(a[5], a[3], c2, c1, c0, &c2, &c1, &r[8]);
+        /* Column 9. */
+        bn_mul2_mulw_addtw(a[5], a[4], 0, c2, c1, &c2, &c1, &r[9]);
+        /*
+         * Column 10: the final square term fills both r[10] and r[11]; the
+         * top output word written to c2 is not used afterwards.
+         */
+        bn_mulw_addtw(a[5], a[5], 0, c2, c1, &c2, &r[11], &r[10]);
+}
+#endif
+/*
 * bn_sqr_comba8() computes r[] = a[] * a[] using Comba multiplication
 * (https://everything2.com/title/Comba+multiplication), where a is an
 * eight word array, producing an 16 word array result.
@@ -281,6 +326,8 @@ BN_sqr(BIGNUM *r, const BIGNUM *a, BN_CTX *ctx)
        if (a->top == 4) {
                bn_sqr_comba4(rr->d, a->d);
+        } else if (a->top == 6) {
+                bn_sqr_comba6(rr->d, a->d);
        } else if (a->top == 8) {
                bn_sqr_comba8(rr->d, a->d);
        } else {
diff --git a/src/lib/libcrypto/bn/s2n_bignum.h b/src/lib/libcrypto/bn/s2n_bignum.h
index ce6e8cdc94..7d77894cdc 100644
--- a/src/lib/libcrypto/bn/s2n_bignum.h
+++ b/src/lib/libcrypto/bn/s2n_bignum.h
@@ -1,3 +1,5 @@
+// $OpenBSD: s2n_bignum.h,v 1.4 2025/08/12 10:01:37 jsing Exp $
+//
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
@@ -34,182 +36,240 @@
 //        throughput, generally offering higher performance there.
 // ----------------------------------------------------------------------------
+// S2N_BIGNUM_STATIC is placed inside the array parameter declarators of the
+// prototypes below, e.g. z[S2N_BIGNUM_STATIC 4]. It expands to the `static`
+// keyword, except that it expands to nothing when _MSC_VER is defined, when
+// __STDC_VERSION__ is undefined or older than 199901L (C99), or when
+// __STDC_NO_VLA__ is defined.
+// NOTE(review): presumably those are the toolchains that reject `static`
+// inside an array declarator - confirm.
+#if defined(_MSC_VER) || !defined(__STDC_VERSION__) || __STDC_VERSION__ < 199901L || defined(__STDC_NO_VLA__)
+#define S2N_BIGNUM_STATIC
+#else
+#define S2N_BIGNUM_STATIC static
+#endif
 // Add, z := x + y
 // Inputs x[m], y[n]; outputs function return (carry-out) and z[p]
-extern uint64_t bignum_add (uint64_t p, uint64_t *z, uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
+extern uint64_t bignum_add (uint64_t p, uint64_t *z, uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y);
 // Add modulo p_25519, z := (x + y) mod p_25519, assuming x and y reduced
 // Inputs x[4], y[4]; output z[4]
-extern void bignum_add_p25519 (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]);
+extern void bignum_add_p25519 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
 // Add modulo p_256, z := (x + y) mod p_256, assuming x and y reduced
 // Inputs x[4], y[4]; output z[4]
-extern void bignum_add_p256 (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]);
+extern void bignum_add_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
 // Add modulo p_256k1, z := (x + y) mod p_256k1, assuming x and y reduced
 // Inputs x[4], y[4]; output z[4]
-extern void bignum_add_p256k1 (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]);
+extern void bignum_add_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
 // Add modulo p_384, z := (x + y) mod p_384, assuming x and y reduced
 // Inputs x[6], y[6]; output z[6]
-extern void bignum_add_p384 (uint64_t z[static 6], uint64_t x[static 6], uint64_t y[static 6]);
+extern void bignum_add_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]);
 // Add modulo p_521, z := (x + y) mod p_521, assuming x and y reduced
 // Inputs x[9], y[9]; output z[9]
-extern void bignum_add_p521 (uint64_t z[static 9], uint64_t x[static 9], uint64_t y[static 9]);
+extern void bignum_add_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9], const uint64_t y[S2N_BIGNUM_STATIC 9]);
+// Add modulo p_sm2, z := (x + y) mod p_sm2, assuming x and y reduced
+// Inputs x[4], y[4]; output z[4]
+extern void bignum_add_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
 // Compute "amontification" constant z :== 2^{128k} (congruent mod m)
 // Input m[k]; output z[k]; temporary buffer t[>=k]
-extern void bignum_amontifier (uint64_t k, uint64_t *z, uint64_t *m, uint64_t *t);
+extern void bignum_amontifier (uint64_t k, uint64_t *z, const uint64_t *m, uint64_t *t);
 // Almost-Montgomery multiply, z :== (x * y / 2^{64k}) (congruent mod m)
 // Inputs x[k], y[k], m[k]; output z[k]
-extern void bignum_amontmul (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y, uint64_t *m);
+extern void bignum_amontmul (uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *y, const uint64_t *m);
 // Almost-Montgomery reduce, z :== (x' / 2^{64p}) (congruent mod m)
 // Inputs x[n], m[k], p; output z[k]
-extern void bignum_amontredc (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x, uint64_t *m, uint64_t p);
+extern void bignum_amontredc (uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x, const uint64_t *m, uint64_t p);
 // Almost-Montgomery square, z :== (x^2 / 2^{64k}) (congruent mod m)
 // Inputs x[k], m[k]; output z[k]
-extern void bignum_amontsqr (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *m);
+extern void bignum_amontsqr (uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *m);
 // Convert 4-digit (256-bit) bignum to/from big-endian form
 // Input x[4]; output z[4]
-extern void bignum_bigendian_4 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_bigendian_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Convert 6-digit (384-bit) bignum to/from big-endian form
 // Input x[6]; output z[6]
-extern void bignum_bigendian_6 (uint64_t z[static 6], uint64_t x[static 6]);
+extern void bignum_bigendian_6 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Select bitfield starting at bit n with length l <= 64
 // Inputs x[k], n, l; output function return
-extern uint64_t bignum_bitfield (uint64_t k, uint64_t *x, uint64_t n, uint64_t l);
+extern uint64_t bignum_bitfield (uint64_t k, const uint64_t *x, uint64_t n, uint64_t l);
 // Return size of bignum in bits
 // Input x[k]; output function return
-extern uint64_t bignum_bitsize (uint64_t k, uint64_t *x);
+extern uint64_t bignum_bitsize (uint64_t k, const uint64_t *x);
 // Divide by a single (nonzero) word, z := x / m and return x mod m
 // Inputs x[n], m; outputs function return (remainder) and z[k]
-extern uint64_t bignum_cdiv (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x, uint64_t m);
+extern uint64_t bignum_cdiv (uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x, uint64_t m);
 // Divide by a single word, z := x / m when known to be exact
 // Inputs x[n], m; output z[k]
-extern void bignum_cdiv_exact (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x, uint64_t m);
+extern void bignum_cdiv_exact (uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x, uint64_t m);
 // Count leading zero digits (64-bit words)
 // Input x[k]; output function return
-extern uint64_t bignum_cld (uint64_t k, uint64_t *x);
+extern uint64_t bignum_cld (uint64_t k, const uint64_t *x);
 // Count leading zero bits
 // Input x[k]; output function return
-extern uint64_t bignum_clz (uint64_t k, uint64_t *x);
+extern uint64_t bignum_clz (uint64_t k, const uint64_t *x);
 // Multiply-add with single-word multiplier, z := z + c * y
 // Inputs c, y[n]; outputs function return (carry-out) and z[k]
-extern uint64_t bignum_cmadd (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y);
+extern uint64_t bignum_cmadd (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, const uint64_t *y);
 // Negated multiply-add with single-word multiplier, z := z - c * y
 // Inputs c, y[n]; outputs function return (negative carry-out) and z[k]
-extern uint64_t bignum_cmnegadd (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y);
+extern uint64_t bignum_cmnegadd (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, const uint64_t *y);
 // Find modulus of bignum w.r.t. single nonzero word m, returning x mod m
 // Input x[k], m; output function return
-extern uint64_t bignum_cmod (uint64_t k, uint64_t *x, uint64_t m);
+extern uint64_t bignum_cmod (uint64_t k, const uint64_t *x, uint64_t m);
 // Multiply by a single word, z := c * y
 // Inputs c, y[n]; outputs function return (carry-out) and z[k]
-extern uint64_t bignum_cmul (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y);
+extern uint64_t bignum_cmul (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, const uint64_t *y);
 // Multiply by a single word modulo p_25519, z := (c * x) mod p_25519, assuming x reduced
 // Inputs c, x[4]; output z[4]
-extern void bignum_cmul_p25519 (uint64_t z[static 4], uint64_t c, uint64_t x[static 4]);
+extern void bignum_cmul_p25519 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 4]);
-extern void bignum_cmul_p25519_alt (uint64_t z[static 4], uint64_t c, uint64_t x[static 4]);
+extern void bignum_cmul_p25519_alt (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Multiply by a single word modulo p_256, z := (c * x) mod p_256, assuming x reduced
 // Inputs c, x[4]; output z[4]
-extern void bignum_cmul_p256 (uint64_t z[static 4], uint64_t c, uint64_t x[static 4]);
+extern void bignum_cmul_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 4]);
-extern void bignum_cmul_p256_alt (uint64_t z[static 4], uint64_t c, uint64_t x[static 4]);
+extern void bignum_cmul_p256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Multiply by a single word modulo p_256k1, z := (c * x) mod p_256k1, assuming x reduced
 // Inputs c, x[4]; output z[4]
-extern void bignum_cmul_p256k1 (uint64_t z[static 4], uint64_t c, uint64_t x[static 4]);
+extern void bignum_cmul_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 4]);
-extern void bignum_cmul_p256k1_alt (uint64_t z[static 4], uint64_t c, uint64_t x[static 4]);
+extern void bignum_cmul_p256k1_alt (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Multiply by a single word modulo p_384, z := (c * x) mod p_384, assuming x reduced
 // Inputs c, x[6]; output z[6]
-extern void bignum_cmul_p384 (uint64_t z[static 6], uint64_t c, uint64_t x[static 6]);
+extern void bignum_cmul_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 6]);
-extern void bignum_cmul_p384_alt (uint64_t z[static 6], uint64_t c, uint64_t x[static 6]);
+extern void bignum_cmul_p384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Multiply by a single word modulo p_521, z := (c * x) mod p_521, assuming x reduced
 // Inputs c, x[9]; output z[9]
-extern void bignum_cmul_p521 (uint64_t z[static 9], uint64_t c, uint64_t x[static 9]);
+extern void bignum_cmul_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 9]);
-extern void bignum_cmul_p521_alt (uint64_t z[static 9], uint64_t c, uint64_t x[static 9]);
+extern void bignum_cmul_p521_alt (uint64_t z[S2N_BIGNUM_STATIC 9], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 9]);
+// Multiply by a single word modulo p_sm2, z := (c * x) mod p_sm2, assuming x reduced
+// Inputs c, x[4]; output z[4]
+extern void bignum_cmul_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 4]);
+extern void bignum_cmul_sm2_alt (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Test bignums for coprimality, gcd(x,y) = 1
 // Inputs x[m], y[n]; output function return; temporary buffer t[>=2*max(m,n)]
-extern uint64_t bignum_coprime (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y, uint64_t *t);
+extern uint64_t bignum_coprime (uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y, uint64_t *t);
 // Copy bignum with zero-extension or truncation, z := x
 // Input x[n]; output z[k]
-extern void bignum_copy (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x);
+extern void bignum_copy (uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x);
+// Given table: uint64_t[height*width], copy table[idx*width...(idx+1)*width-1]
+// into z[0..width-1].
+// This function is constant-time with respect to the value of `idx`. This is
+// achieved by reading the whole table and using the bit-masking to get the
+// `idx`-th row.
+// Input table[height*width]; output z[width]
+extern void bignum_copy_row_from_table (uint64_t *z, const uint64_t *table, uint64_t height,
+        uint64_t width, uint64_t idx);
+// Given table: uint64_t[height*width], copy table[idx*width...(idx+1)*width-1]
+// into z[0..width-1]. width must be a multiple of 8.
+// This function is constant-time with respect to the value of `idx`. This is
+// achieved by reading the whole table and using the bit-masking to get the
+// `idx`-th row.
+// Input table[height*width]; output z[width]
+extern void bignum_copy_row_from_table_8n (uint64_t *z, const uint64_t *table,
+        uint64_t height, uint64_t width, uint64_t idx);
+// Given table: uint64_t[height*16], copy table[idx*16...(idx+1)*16-1] into z[0..15].
+// This function is constant-time with respect to the value of `idx`. This is
+// achieved by reading the whole table and using the bit-masking to get the
+// `idx`-th row.
+// Input table[height*16]; output z[16]
+extern void bignum_copy_row_from_table_16 (uint64_t *z, const uint64_t *table,
+        uint64_t height, uint64_t idx);
+// Given table: uint64_t[height*32], copy table[idx*32...(idx+1)*32-1] into z[0..31].
+// This function is constant-time with respect to the value of `idx`. This is
+// achieved by reading the whole table and using the bit-masking to get the
+// `idx`-th row.
+// Input table[height*32]; output z[32]
+extern void bignum_copy_row_from_table_32 (uint64_t *z, const uint64_t *table,
+        uint64_t height, uint64_t idx);
 // Count trailing zero digits (64-bit words)
 // Input x[k]; output function return
-extern uint64_t bignum_ctd (uint64_t k, uint64_t *x);
+extern uint64_t bignum_ctd (uint64_t k, const uint64_t *x);
 // Count trailing zero bits
 // Input x[k]; output function return
-extern uint64_t bignum_ctz (uint64_t k, uint64_t *x);
+extern uint64_t bignum_ctz (uint64_t k, const uint64_t *x);
 // Convert from almost-Montgomery form, z := (x / 2^256) mod p_256
 // Input x[4]; output z[4]
-extern void bignum_deamont_p256 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_deamont_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
-extern void bignum_deamont_p256_alt (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_deamont_p256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Convert from almost-Montgomery form, z := (x / 2^256) mod p_256k1
 // Input x[4]; output z[4]
-extern void bignum_deamont_p256k1 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_deamont_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Convert from almost-Montgomery form, z := (x / 2^384) mod p_384
 // Input x[6]; output z[6]
-extern void bignum_deamont_p384 (uint64_t z[static 6], uint64_t x[static 6]);
+extern void bignum_deamont_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
-extern void bignum_deamont_p384_alt (uint64_t z[static 6], uint64_t x[static 6]);
+extern void bignum_deamont_p384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Convert from almost-Montgomery form z := (x / 2^576) mod p_521
 // Input x[9]; output z[9]
-extern void bignum_deamont_p521 (uint64_t z[static 9], uint64_t x[static 9]);
+extern void bignum_deamont_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
+// Convert from almost-Montgomery form z := (x / 2^256) mod p_sm2
+// Input x[4]; output z[4]
+extern void bignum_deamont_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Convert from (almost-)Montgomery form z := (x / 2^{64k}) mod m
 // Inputs x[k], m[k]; output z[k]
-extern void bignum_demont (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *m);
+extern void bignum_demont (uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *m);
 // Convert from Montgomery form z := (x / 2^256) mod p_256, assuming x reduced
 // Input x[4]; output z[4]
-extern void bignum_demont_p256 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_demont_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
-extern void bignum_demont_p256_alt (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_demont_p256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Convert from Montgomery form z := (x / 2^256) mod p_256k1, assuming x reduced
 // Input x[4]; output z[4]
-extern void bignum_demont_p256k1 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_demont_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Convert from Montgomery form z := (x / 2^384) mod p_384, assuming x reduced
 // Input x[6]; output z[6]
-extern void bignum_demont_p384 (uint64_t z[static 6], uint64_t x[static 6]);
+extern void bignum_demont_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
-extern void bignum_demont_p384_alt (uint64_t z[static 6], uint64_t x[static 6]);
+extern void bignum_demont_p384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Convert from Montgomery form z := (x / 2^576) mod p_521, assuming x reduced
 // Input x[9]; output z[9]
-extern void bignum_demont_p521 (uint64_t z[static 9], uint64_t x[static 9]);
+extern void bignum_demont_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
+// Convert from Montgomery form z := (x / 2^256) mod p_sm2, assuming x reduced
+// Input x[4]; output z[4]
+extern void bignum_demont_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Select digit x[n]
 // Inputs x[k], n; output function return
-extern uint64_t bignum_digit (uint64_t k, uint64_t *x, uint64_t n);
+extern uint64_t bignum_digit (uint64_t k, const uint64_t *x, uint64_t n);
 // Return size of bignum in digits (64-bit word)
 // Input x[k]; output function return
-extern uint64_t bignum_digitsize (uint64_t k, uint64_t *x);
+extern uint64_t bignum_digitsize (uint64_t k, const uint64_t *x);
 // Divide bignum by 10: z' := z div 10, returning remainder z mod 10
 // Inputs z[k]; outputs function return (remainder) and z[k]
@@ -217,294 +277,391 @@ extern uint64_t bignum_divmod10 (uint64_t k, uint64_t *z);
 // Double modulo p_25519, z := (2 * x) mod p_25519, assuming x reduced
 // Input x[4]; output z[4]
-extern void bignum_double_p25519 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_double_p25519 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Double modulo p_256, z := (2 * x) mod p_256, assuming x reduced
 // Input x[4]; output z[4]
-extern void bignum_double_p256 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_double_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Double modulo p_256k1, z := (2 * x) mod p_256k1, assuming x reduced
 // Input x[4]; output z[4]
-extern void bignum_double_p256k1 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_double_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Double modulo p_384, z := (2 * x) mod p_384, assuming x reduced
 // Input x[6]; output z[6]
-extern void bignum_double_p384 (uint64_t z[static 6], uint64_t x[static 6]);
+extern void bignum_double_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Double modulo p_521, z := (2 * x) mod p_521, assuming x reduced
 // Input x[9]; output z[9]
-extern void bignum_double_p521 (uint64_t z[static 9], uint64_t x[static 9]);
+extern void bignum_double_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
+// Double modulo p_sm2, z := (2 * x) mod p_sm2, assuming x reduced
+// Input x[4]; output z[4]
+extern void bignum_double_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Extended Montgomery reduce, returning results in input-output buffer
 // Inputs z[2*k], m[k], w; outputs function return (extra result bit) and z[2*k]
-extern uint64_t bignum_emontredc (uint64_t k, uint64_t *z, uint64_t *m, uint64_t w);
+extern uint64_t bignum_emontredc (uint64_t k, uint64_t *z, const uint64_t *m, uint64_t w);
 // Extended Montgomery reduce in 8-digit blocks, results in input-output buffer
 // Inputs z[2*k], m[k], w; outputs function return (extra result bit) and z[2*k]
-extern uint64_t bignum_emontredc_8n (uint64_t k, uint64_t *z, uint64_t *m, uint64_t w);
+extern uint64_t bignum_emontredc_8n (uint64_t k, uint64_t *z, const uint64_t *m, uint64_t w);
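+// Extended Montgomery reduce in 8-digit blocks using a temporary buffer, results in input-output buffer
+// Inputs z[2*k], m[k], w; outputs function return (extra result bit) and z[2*k]; temporary buffer m_precalc[12*(k/4-1)]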
+extern uint64_t bignum_emontredc_8n_cdiff (uint64_t k, uint64_t *z, const uint64_t *m,
+                                          uint64_t w, uint64_t *m_precalc);
 // Test bignums for equality, x = y
 // Inputs x[m], y[n]; output function return
-extern uint64_t bignum_eq (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
+extern uint64_t bignum_eq (uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y);
 // Test bignum for even-ness
 // Input x[k]; output function return
-extern uint64_t bignum_even (uint64_t k, uint64_t *x);
+extern uint64_t bignum_even (uint64_t k, const uint64_t *x);
 // Convert 4-digit (256-bit) bignum from big-endian bytes
 // Input x[32] (bytes); output z[4]
-extern void bignum_frombebytes_4 (uint64_t z[static 4], uint8_t x[static 32]);
+extern void bignum_frombebytes_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint8_t x[S2N_BIGNUM_STATIC 32]);
 // Convert 6-digit (384-bit) bignum from big-endian bytes
 // Input x[48] (bytes); output z[6]
-extern void bignum_frombebytes_6 (uint64_t z[static 6], uint8_t x[static 48]);
+extern void bignum_frombebytes_6 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint8_t x[S2N_BIGNUM_STATIC 48]);
 // Convert 4-digit (256-bit) bignum from little-endian bytes
 // Input x[32] (bytes); output z[4]
-extern void bignum_fromlebytes_4 (uint64_t z[static 4], uint8_t x[static 32]);
+extern void bignum_fromlebytes_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint8_t x[S2N_BIGNUM_STATIC 32]);
 // Convert 6-digit (384-bit) bignum from little-endian bytes
 // Input x[48] (bytes); output z[6]
-extern void bignum_fromlebytes_6 (uint64_t z[static 6], uint8_t x[static 48]);
+extern void bignum_fromlebytes_6 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint8_t x[S2N_BIGNUM_STATIC 48]);
 // Convert little-endian bytes to 9-digit 528-bit bignum
 // Input x[66] (bytes); output z[9]
-extern void bignum_fromlebytes_p521 (uint64_t z[static 9],uint8_t x[static 66]);
+extern void bignum_fromlebytes_p521 (uint64_t z[S2N_BIGNUM_STATIC 9],const uint8_t x[S2N_BIGNUM_STATIC 66]);
 // Compare bignums, x >= y
 // Inputs x[m], y[n]; output function return
-extern uint64_t bignum_ge (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
+extern uint64_t bignum_ge (uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y);
 // Compare bignums, x > y
 // Inputs x[m], y[n]; output function return
-extern uint64_t bignum_gt (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
+extern uint64_t bignum_gt (uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y);
 // Halve modulo p_256, z := (x / 2) mod p_256, assuming x reduced
 // Input x[4]; output z[4]
-extern void bignum_half_p256 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_half_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Halve modulo p_256k1, z := (x / 2) mod p_256k1, assuming x reduced
 // Input x[4]; output z[4]
-extern void bignum_half_p256k1 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_half_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Halve modulo p_384, z := (x / 2) mod p_384, assuming x reduced
 // Input x[6]; output z[6]
-extern void bignum_half_p384 (uint64_t z[static 6], uint64_t x[static 6]);
+extern void bignum_half_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Halve modulo p_521, z := (x / 2) mod p_521, assuming x reduced
 // Input x[9]; output z[9]
-extern void bignum_half_p521 (uint64_t z[static 9], uint64_t x[static 9]);
+extern void bignum_half_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
+// Halve modulo p_sm2, z := (x / 2) mod p_sm2, assuming x reduced
+// Input x[4]; output z[4]
+extern void bignum_half_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
+// Modular inverse modulo p_25519 = 2^255 - 19
+// Input x[4]; output z[4]
+extern void bignum_inv_p25519(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]);
+// Modular inverse modulo p_256 = 2^256 - 2^224 + 2^192 + 2^96 - 1
+// Input x[4]; output z[4]
+extern void bignum_inv_p256(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]);
+// Modular inverse modulo p_384 = 2^384 - 2^128 - 2^96 + 2^32 - 1
+// Input x[6]; output z[6]
+extern void bignum_inv_p384(uint64_t z[S2N_BIGNUM_STATIC 6],const uint64_t x[S2N_BIGNUM_STATIC 6]);
+// Modular inverse modulo p_521 = 2^521 - 1
+// Input x[9]; output z[9]
+extern void bignum_inv_p521(uint64_t z[S2N_BIGNUM_STATIC 9],const uint64_t x[S2N_BIGNUM_STATIC 9]);
+// Modular inverse modulo p_sm2 = 2^256 - 2^224 - 2^96 + 2^64 - 1
+// Input x[4]; output z[4]
+extern void bignum_inv_sm2(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]);
+// Inverse square root modulo p_25519
+// Input x[4]; output function return (Legendre symbol) and z[4]
+extern int64_t bignum_invsqrt_p25519(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]);
+extern int64_t bignum_invsqrt_p25519_alt(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Test bignum for zero-ness, x = 0
 // Input x[k]; output function return
-extern uint64_t bignum_iszero (uint64_t k, uint64_t *x);
+extern uint64_t bignum_iszero (uint64_t k, const uint64_t *x);
 // Multiply z := x * y
 // Inputs x[16], y[16]; output z[32]; temporary buffer t[>=32]
-extern void bignum_kmul_16_32 (uint64_t z[static 32], uint64_t x[static 16], uint64_t y[static 16], uint64_t t[static 32]);
+extern void bignum_kmul_16_32 (uint64_t z[S2N_BIGNUM_STATIC 32], const uint64_t x[S2N_BIGNUM_STATIC 16], const uint64_t y[S2N_BIGNUM_STATIC 16], uint64_t t[S2N_BIGNUM_STATIC 32]);
 // Multiply z := x * y
 // Inputs x[32], y[32]; output z[64]; temporary buffer t[>=96]
-extern void bignum_kmul_32_64 (uint64_t z[static 64], uint64_t x[static 32], uint64_t y[static 32], uint64_t t[static 96]);
+extern void bignum_kmul_32_64 (uint64_t z[S2N_BIGNUM_STATIC 64], const uint64_t x[S2N_BIGNUM_STATIC 32], const uint64_t y[S2N_BIGNUM_STATIC 32], uint64_t t[S2N_BIGNUM_STATIC 96]);
 // Square, z := x^2
 // Input x[16]; output z[32]; temporary buffer t[>=24]
-extern void bignum_ksqr_16_32 (uint64_t z[static 32], uint64_t x[static 16], uint64_t t[static 24]);
+extern void bignum_ksqr_16_32 (uint64_t z[S2N_BIGNUM_STATIC 32], const uint64_t x[S2N_BIGNUM_STATIC 16], uint64_t t[S2N_BIGNUM_STATIC 24]);
 // Square, z := x^2
 // Input x[32]; output z[64]; temporary buffer t[>=72]
-extern void bignum_ksqr_32_64 (uint64_t z[static 64], uint64_t x[static 32], uint64_t t[static 72]);
+extern void bignum_ksqr_32_64 (uint64_t z[S2N_BIGNUM_STATIC 64], const uint64_t x[S2N_BIGNUM_STATIC 32], uint64_t t[S2N_BIGNUM_STATIC 72]);
 // Compare bignums, x <= y
 // Inputs x[m], y[n]; output function return
-extern uint64_t bignum_le (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
+extern uint64_t bignum_le (uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y);
 // Convert 4-digit (256-bit) bignum to/from little-endian form
 // Input x[4]; output z[4]
-extern void bignum_littleendian_4 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_littleendian_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Convert 6-digit (384-bit) bignum to/from little-endian form
 // Input x[6]; output z[6]
-extern void bignum_littleendian_6 (uint64_t z[static 6], uint64_t x[static 6]);
+extern void bignum_littleendian_6 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Compare bignums, x < y
 // Inputs x[m], y[n]; output function return
-extern uint64_t bignum_lt (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
+extern uint64_t bignum_lt (uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y);
 // Multiply-add, z := z + x * y
 // Inputs x[m], y[n]; outputs function return (carry-out) and z[k]
-extern uint64_t bignum_madd (uint64_t k, uint64_t *z, uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
+extern uint64_t bignum_madd (uint64_t k, uint64_t *z, uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y);
+// Multiply-add modulo the order of the curve25519/edwards25519 basepoint, z := (x * y + c) mod n_25519
+// Inputs x[4], y[4], c[4]; output z[4]
+extern void bignum_madd_n25519 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4], const uint64_t c[S2N_BIGNUM_STATIC 4]);
+extern void bignum_madd_n25519_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4], const uint64_t c[S2N_BIGNUM_STATIC 4]);
+// Reduce modulo group order, z := x mod m_25519
+// Input x[4]; output z[4]
+extern void bignum_mod_m25519_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
+// Reduce modulo basepoint order, z := x mod n_25519
+// Input x[k]; output z[4]
+extern void bignum_mod_n25519 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t k, const uint64_t *x);
+// Reduce modulo basepoint order, z := x mod n_25519
+// Input x[4]; output z[4]
+extern void bignum_mod_n25519_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Reduce modulo group order, z := x mod n_256
 // Input x[k]; output z[4]
-extern void bignum_mod_n256 (uint64_t z[static 4], uint64_t k, uint64_t *x);
+extern void bignum_mod_n256 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t k, const uint64_t *x);
-extern void bignum_mod_n256_alt (uint64_t z[static 4], uint64_t k, uint64_t *x);
+extern void bignum_mod_n256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t k, const uint64_t *x);
 // Reduce modulo group order, z := x mod n_256
 // Input x[4]; output z[4]
-extern void bignum_mod_n256_4 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_mod_n256_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Reduce modulo group order, z := x mod n_256k1
 // Input x[4]; output z[4]
-extern void bignum_mod_n256k1_4 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_mod_n256k1_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Reduce modulo group order, z := x mod n_384
 // Input x[k]; output z[6]
-extern void bignum_mod_n384 (uint64_t z[static 6], uint64_t k, uint64_t *x);
+extern void bignum_mod_n384 (uint64_t z[S2N_BIGNUM_STATIC 6], uint64_t k, const uint64_t *x);
-extern void bignum_mod_n384_alt (uint64_t z[static 6], uint64_t k, uint64_t *x);
+extern void bignum_mod_n384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], uint64_t k, const uint64_t *x);
 // Reduce modulo group order, z := x mod n_384
 // Input x[6]; output z[6]
-extern void bignum_mod_n384_6 (uint64_t z[static 6], uint64_t x[static 6]);
+extern void bignum_mod_n384_6 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Reduce modulo group order, z := x mod n_521
 // Input x[9]; output z[9]
-extern void bignum_mod_n521_9 (uint64_t z[static 9], uint64_t x[static 9]);
+extern void bignum_mod_n521_9 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
-extern void bignum_mod_n521_9_alt (uint64_t z[static 9], uint64_t x[static 9]);
+extern void bignum_mod_n521_9_alt (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
+// Reduce modulo group order, z := x mod n_sm2
+// Input x[k]; output z[4]
+extern void bignum_mod_nsm2 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t k, const uint64_t *x);
+extern void bignum_mod_nsm2_alt (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t k, const uint64_t *x);
+// Reduce modulo group order, z := x mod n_sm2
+// Input x[4]; output z[4]
+extern void bignum_mod_nsm2_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Reduce modulo field characteristic, z := x mod p_25519
 // Input x[4]; output z[4]
-extern void bignum_mod_p25519_4 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_mod_p25519_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Reduce modulo field characteristic, z := x mod p_256
 // Input x[k]; output z[4]
-extern void bignum_mod_p256 (uint64_t z[static 4], uint64_t k, uint64_t *x);
+extern void bignum_mod_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t k, const uint64_t *x);
-extern void bignum_mod_p256_alt (uint64_t z[static 4], uint64_t k, uint64_t *x);
+extern void bignum_mod_p256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t k, const uint64_t *x);
 // Reduce modulo field characteristic, z := x mod p_256
 // Input x[4]; output z[4]
-extern void bignum_mod_p256_4 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_mod_p256_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Reduce modulo field characteristic, z := x mod p_256k1
 // Input x[4]; output z[4]
-extern void bignum_mod_p256k1_4 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_mod_p256k1_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Reduce modulo field characteristic, z := x mod p_384
 // Input x[k]; output z[6]
-extern void bignum_mod_p384 (uint64_t z[static 6], uint64_t k, uint64_t *x);
+extern void bignum_mod_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], uint64_t k, const uint64_t *x);
-extern void bignum_mod_p384_alt (uint64_t z[static 6], uint64_t k, uint64_t *x);
+extern void bignum_mod_p384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], uint64_t k, const uint64_t *x);
 // Reduce modulo field characteristic, z := x mod p_384
 // Input x[6]; output z[6]
-extern void bignum_mod_p384_6 (uint64_t z[static 6], uint64_t x[static 6]);
+extern void bignum_mod_p384_6 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Reduce modulo field characteristic, z := x mod p_521
 // Input x[9]; output z[9]
-extern void bignum_mod_p521_9 (uint64_t z[static 9], uint64_t x[static 9]);
+extern void bignum_mod_p521_9 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
+// Reduce modulo field characteristic, z := x mod p_sm2
+// Input x[k]; output z[4]
+extern void bignum_mod_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t k, const uint64_t *x);
+// Reduce modulo field characteristic, z := x mod p_sm2
+// Input x[4]; output z[4]
+extern void bignum_mod_sm2_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Add modulo m, z := (x + y) mod m, assuming x and y reduced
 // Inputs x[k], y[k], m[k]; output z[k]
-extern void bignum_modadd (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y, uint64_t *m);
+extern void bignum_modadd (uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *y, const uint64_t *m);
 // Double modulo m, z := (2 * x) mod m, assuming x reduced
 // Inputs x[k], m[k]; output z[k]
-extern void bignum_moddouble (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *m);
+extern void bignum_moddouble (uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *m);
+// Modular exponentiation for arbitrary odd modulus, z := (a^p) mod m
+// Inputs a[k], p[k], m[k]; output z[k]; temporary buffer t[>=3*k]
+extern void bignum_modexp(uint64_t k,uint64_t *z, const uint64_t *a,const uint64_t *p,const uint64_t *m,uint64_t *t);
 // Compute "modification" constant z := 2^{64k} mod m
 // Input m[k]; output z[k]; temporary buffer t[>=k]
-extern void bignum_modifier (uint64_t k, uint64_t *z, uint64_t *m, uint64_t *t);
+extern void bignum_modifier (uint64_t k, uint64_t *z, const uint64_t *m, uint64_t *t);
 // Invert modulo m, z = (1/a) mod b, assuming b is an odd number > 1, a coprime to b
 // Inputs a[k], b[k]; output z[k]; temporary buffer t[>=3*k]
-extern void bignum_modinv (uint64_t k, uint64_t *z, uint64_t *a, uint64_t *b, uint64_t *t);
+extern void bignum_modinv (uint64_t k, uint64_t *z, const uint64_t *a, const uint64_t *b, uint64_t *t);
 // Optionally negate modulo m, z := (-x) mod m (if p nonzero) or z := x (if p zero), assuming x reduced
 // Inputs p, x[k], m[k]; output z[k]
-extern void bignum_modoptneg (uint64_t k, uint64_t *z, uint64_t p, uint64_t *x, uint64_t *m);
+extern void bignum_modoptneg (uint64_t k, uint64_t *z, uint64_t p, const uint64_t *x, const uint64_t *m);
 // Subtract modulo m, z := (x - y) mod m, assuming x and y reduced
 // Inputs x[k], y[k], m[k]; output z[k]
-extern void bignum_modsub (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y, uint64_t *m);
+extern void bignum_modsub (uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *y, const uint64_t *m);
 // Compute "montification" constant z := 2^{128k} mod m
 // Input m[k]; output z[k]; temporary buffer t[>=k]
-extern void bignum_montifier (uint64_t k, uint64_t *z, uint64_t *m, uint64_t *t);
+extern void bignum_montifier (uint64_t k, uint64_t *z, const uint64_t *m, uint64_t *t);
+// Montgomery inverse modulo p_256 = 2^256 - 2^224 + 2^192 + 2^96 - 1
+// Input x[4]; output z[4]
+extern void bignum_montinv_p256(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]);
+// Montgomery inverse modulo p_384 = 2^384 - 2^128 - 2^96 + 2^32 - 1
+// Input x[6]; output z[6]
+extern void bignum_montinv_p384(uint64_t z[S2N_BIGNUM_STATIC 6],const uint64_t x[S2N_BIGNUM_STATIC 6]);
+// Montgomery inverse modulo p_sm2 = 2^256 - 2^224 - 2^96 + 2^64 - 1
+// Input x[4]; output z[4]
+extern void bignum_montinv_sm2(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Montgomery multiply, z := (x * y / 2^{64k}) mod m
 // Inputs x[k], y[k], m[k]; output z[k]
-extern void bignum_montmul (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y, uint64_t *m);
+extern void bignum_montmul (uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *y, const uint64_t *m);
 // Montgomery multiply, z := (x * y / 2^256) mod p_256
 // Inputs x[4], y[4]; output z[4]
-extern void bignum_montmul_p256 (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]);
+extern void bignum_montmul_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
-extern void bignum_montmul_p256_alt (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]);
+extern void bignum_montmul_p256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
 // Montgomery multiply, z := (x * y / 2^256) mod p_256k1
 // Inputs x[4], y[4]; output z[4]
-extern void bignum_montmul_p256k1 (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]);
+extern void bignum_montmul_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
-extern void bignum_montmul_p256k1_alt (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]);
+extern void bignum_montmul_p256k1_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
 // Montgomery multiply, z := (x * y / 2^384) mod p_384
 // Inputs x[6], y[6]; output z[6]
-extern void bignum_montmul_p384 (uint64_t z[static 6], uint64_t x[static 6], uint64_t y[static 6]);
+extern void bignum_montmul_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]);
-extern void bignum_montmul_p384_alt (uint64_t z[static 6], uint64_t x[static 6], uint64_t y[static 6]);
+extern void bignum_montmul_p384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]);
 // Montgomery multiply, z := (x * y / 2^576) mod p_521
 // Inputs x[9], y[9]; output z[9]
-extern void bignum_montmul_p521 (uint64_t z[static 9], uint64_t x[static 9], uint64_t y[static 9]);
+extern void bignum_montmul_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9], const uint64_t y[S2N_BIGNUM_STATIC 9]);
-extern void bignum_montmul_p521_alt (uint64_t z[static 9], uint64_t x[static 9], uint64_t y[static 9]);
+extern void bignum_montmul_p521_alt (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9], const uint64_t y[S2N_BIGNUM_STATIC 9]);
+// Montgomery multiply, z := (x * y / 2^256) mod p_sm2
+// Inputs x[4], y[4]; output z[4]
+extern void bignum_montmul_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
+extern void bignum_montmul_sm2_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
 // Montgomery reduce, z := (x' / 2^{64p}) MOD m
 // Inputs x[n], m[k], p; output z[k]
-extern void bignum_montredc (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x, uint64_t *m, uint64_t p);
+extern void bignum_montredc (uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x, const uint64_t *m, uint64_t p);
 // Montgomery square, z := (x^2 / 2^{64k}) mod m
 // Inputs x[k], m[k]; output z[k]
-extern void bignum_montsqr (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *m);
+extern void bignum_montsqr (uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *m);
 // Montgomery square, z := (x^2 / 2^256) mod p_256
 // Input x[4]; output z[4]
-extern void bignum_montsqr_p256 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_montsqr_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
-extern void bignum_montsqr_p256_alt (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_montsqr_p256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Montgomery square, z := (x^2 / 2^256) mod p_256k1
 // Input x[4]; output z[4]
-extern void bignum_montsqr_p256k1 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_montsqr_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
-extern void bignum_montsqr_p256k1_alt (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_montsqr_p256k1_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Montgomery square, z := (x^2 / 2^384) mod p_384
 // Input x[6]; output z[6]
-extern void bignum_montsqr_p384 (uint64_t z[static 6], uint64_t x[static 6]);
+extern void bignum_montsqr_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
-extern void bignum_montsqr_p384_alt (uint64_t z[static 6], uint64_t x[static 6]);
+extern void bignum_montsqr_p384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Montgomery square, z := (x^2 / 2^576) mod p_521
 // Input x[9]; output z[9]
-extern void bignum_montsqr_p521 (uint64_t z[static 9], uint64_t x[static 9]);
+extern void bignum_montsqr_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
-extern void bignum_montsqr_p521_alt (uint64_t z[static 9], uint64_t x[static 9]);
+extern void bignum_montsqr_p521_alt (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
+// Montgomery square, z := (x^2 / 2^256) mod p_sm2
+// Input x[4]; output z[4]
+extern void bignum_montsqr_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
+extern void bignum_montsqr_sm2_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Multiply z := x * y
 // Inputs x[m], y[n]; output z[k]
-extern void bignum_mul (uint64_t k, uint64_t *z, uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
+extern void bignum_mul (uint64_t k, uint64_t *z, uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y);
 // Multiply z := x * y
 // Inputs x[4], y[4]; output z[8]
-extern void bignum_mul_4_8 (uint64_t z[static 8], uint64_t x[static 4], uint64_t y[static 4]);
+extern void bignum_mul_4_8 (uint64_t z[S2N_BIGNUM_STATIC 8], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
-extern void bignum_mul_4_8_alt (uint64_t z[static 8], uint64_t x[static 4], uint64_t y[static 4]);
+extern void bignum_mul_4_8_alt (uint64_t z[S2N_BIGNUM_STATIC 8], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
 // Multiply z := x * y
 // Inputs x[6], y[6]; output z[12]
-extern void bignum_mul_6_12 (uint64_t z[static 12], uint64_t x[static 6], uint64_t y[static 6]);
+extern void bignum_mul_6_12 (uint64_t z[S2N_BIGNUM_STATIC 12], const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]);
-extern void bignum_mul_6_12_alt (uint64_t z[static 12], uint64_t x[static 6], uint64_t y[static 6]);
+extern void bignum_mul_6_12_alt (uint64_t z[S2N_BIGNUM_STATIC 12], const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]);
 // Multiply z := x * y
 // Inputs x[8], y[8]; output z[16]
-extern void bignum_mul_8_16 (uint64_t z[static 16], uint64_t x[static 8], uint64_t y[static 8]);
+extern void bignum_mul_8_16 (uint64_t z[S2N_BIGNUM_STATIC 16], const uint64_t x[S2N_BIGNUM_STATIC 8], const uint64_t y[S2N_BIGNUM_STATIC 8]);
-extern void bignum_mul_8_16_alt (uint64_t z[static 16], uint64_t x[static 8], uint64_t y[static 8]);
+extern void bignum_mul_8_16_alt (uint64_t z[S2N_BIGNUM_STATIC 16], const uint64_t x[S2N_BIGNUM_STATIC 8], const uint64_t y[S2N_BIGNUM_STATIC 8]);
 // Multiply modulo p_25519, z := (x * y) mod p_25519
 // Inputs x[4], y[4]; output z[4]
-extern void bignum_mul_p25519 (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]);
+extern void bignum_mul_p25519 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
-extern void bignum_mul_p25519_alt (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]);
+extern void bignum_mul_p25519_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
 // Multiply modulo p_256k1, z := (x * y) mod p_256k1
 // Inputs x[4], y[4]; output z[4]
-extern void bignum_mul_p256k1 (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]);
+extern void bignum_mul_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
-extern void bignum_mul_p256k1_alt (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]);
+extern void bignum_mul_p256k1_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
 // Multiply modulo p_521, z := (x * y) mod p_521, assuming x and y reduced
 // Inputs x[9], y[9]; output z[9]
-extern void bignum_mul_p521 (uint64_t z[static 9], uint64_t x[static 9], uint64_t y[static 9]);
+extern void bignum_mul_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9], const uint64_t y[S2N_BIGNUM_STATIC 9]);
-extern void bignum_mul_p521_alt (uint64_t z[static 9], uint64_t x[static 9], uint64_t y[static 9]);
+extern void bignum_mul_p521_alt (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9], const uint64_t y[S2N_BIGNUM_STATIC 9]);
 // Multiply bignum by 10 and add word: z := 10 * z + d
 // Inputs z[k], d; outputs function return (carry) and z[k]
@@ -512,55 +669,59 @@ extern uint64_t bignum_muladd10 (uint64_t k, uint64_t *z, uint64_t d);
 // Multiplex/select z := x (if p nonzero) or z := y (if p zero)
 // Inputs p, x[k], y[k]; output z[k]
-extern void bignum_mux (uint64_t p, uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y);
+extern void bignum_mux (uint64_t p, uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *y);
 // 256-bit multiplex/select z := x (if p nonzero) or z := y (if p zero)
 // Inputs p, x[4], y[4]; output z[4]
-extern void bignum_mux_4 (uint64_t p, uint64_t z[static 4],uint64_t x[static 4], uint64_t y[static 4]);
+extern void bignum_mux_4 (uint64_t p, uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
 // 384-bit multiplex/select z := x (if p nonzero) or z := y (if p zero)
 // Inputs p, x[6], y[6]; output z[6]
-extern void bignum_mux_6 (uint64_t p, uint64_t z[static 6],uint64_t x[static 6], uint64_t y[static 6]);
+extern void bignum_mux_6 (uint64_t p, uint64_t z[S2N_BIGNUM_STATIC 6],const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]);
 // Select element from 16-element table, z := xs[k*i]
 // Inputs xs[16*k], i; output z[k]
-extern void bignum_mux16 (uint64_t k, uint64_t *z, uint64_t *xs, uint64_t i);
+extern void bignum_mux16 (uint64_t k, uint64_t *z, const uint64_t *xs, uint64_t i);
 // Negate modulo p_25519, z := (-x) mod p_25519, assuming x reduced
 // Input x[4]; output z[4]
-extern void bignum_neg_p25519 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_neg_p25519 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Negate modulo p_256, z := (-x) mod p_256, assuming x reduced
 // Input x[4]; output z[4]
-extern void bignum_neg_p256 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_neg_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Negate modulo p_256k1, z := (-x) mod p_256k1, assuming x reduced
 // Input x[4]; output z[4]
-extern void bignum_neg_p256k1 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_neg_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Negate modulo p_384, z := (-x) mod p_384, assuming x reduced
 // Input x[6]; output z[6]
-extern void bignum_neg_p384 (uint64_t z[static 6], uint64_t x[static 6]);
+extern void bignum_neg_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Negate modulo p_521, z := (-x) mod p_521, assuming x reduced
 // Input x[9]; output z[9]
-extern void bignum_neg_p521 (uint64_t z[static 9], uint64_t x[static 9]);
+extern void bignum_neg_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
+// Negate modulo p_sm2, z := (-x) mod p_sm2, assuming x reduced
+// Input x[4]; output z[4]
+extern void bignum_neg_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Negated modular inverse, z := (-1/x) mod 2^{64k}
 // Input x[k]; output z[k]
-extern void bignum_negmodinv (uint64_t k, uint64_t *z, uint64_t *x);
+extern void bignum_negmodinv (uint64_t k, uint64_t *z, const uint64_t *x);
 // Test bignum for nonzero-ness x =/= 0
 // Input x[k]; output function return
-extern uint64_t bignum_nonzero (uint64_t k, uint64_t *x);
+extern uint64_t bignum_nonzero (uint64_t k, const uint64_t *x);
 // Test 256-bit bignum for nonzero-ness x =/= 0
 // Input x[4]; output function return
-extern uint64_t bignum_nonzero_4(uint64_t x[static 4]);
+extern uint64_t bignum_nonzero_4(const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Test 384-bit bignum for nonzero-ness x =/= 0
 // Input x[6]; output function return
-extern uint64_t bignum_nonzero_6(uint64_t x[static 6]);
+extern uint64_t bignum_nonzero_6(const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Normalize bignum in-place by shifting left till top bit is 1
 // Input z[k]; outputs function return (bits shifted left) and z[k]
@@ -568,7 +729,7 @@ extern uint64_t bignum_normalize (uint64_t k, uint64_t *z);
 // Test bignum for odd-ness
 // Input x[k]; output function return
-extern uint64_t bignum_odd (uint64_t k, uint64_t *x);
+extern uint64_t bignum_odd (uint64_t k, const uint64_t *x);
 // Convert single digit to bignum, z := n
 // Input n; output z[k]
@@ -576,39 +737,43 @@ extern void bignum_of_word (uint64_t k, uint64_t *z, uint64_t n);
 // Optionally add, z := x + y (if p nonzero) or z := x (if p zero)
 // Inputs x[k], p, y[k]; outputs function return (carry-out) and z[k]
-extern uint64_t bignum_optadd (uint64_t k, uint64_t *z, uint64_t *x, uint64_t p, uint64_t *y);
+extern uint64_t bignum_optadd (uint64_t k, uint64_t *z, const uint64_t *x, uint64_t p, const uint64_t *y);
 // Optionally negate, z := -x (if p nonzero) or z := x (if p zero)
 // Inputs p, x[k]; outputs function return (nonzero input) and z[k]
-extern uint64_t bignum_optneg (uint64_t k, uint64_t *z, uint64_t p, uint64_t *x);
+extern uint64_t bignum_optneg (uint64_t k, uint64_t *z, uint64_t p, const uint64_t *x);
 // Optionally negate modulo p_25519, z := (-x) mod p_25519 (if p nonzero) or z := x (if p zero), assuming x reduced
 // Inputs p, x[4]; output z[4]
-extern void bignum_optneg_p25519 (uint64_t z[static 4], uint64_t p, uint64_t x[static 4]);
+extern void bignum_optneg_p25519 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t p, const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Optionally negate modulo p_256, z := (-x) mod p_256 (if p nonzero) or z := x (if p zero), assuming x reduced
 // Inputs p, x[4]; output z[4]
-extern void bignum_optneg_p256 (uint64_t z[static 4], uint64_t p, uint64_t x[static 4]);
+extern void bignum_optneg_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t p, const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Optionally negate modulo p_256k1, z := (-x) mod p_256k1 (if p nonzero) or z := x (if p zero), assuming x reduced
 // Inputs p, x[4]; output z[4]
-extern void bignum_optneg_p256k1 (uint64_t z[static 4], uint64_t p, uint64_t x[static 4]);
+extern void bignum_optneg_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t p, const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Optionally negate modulo p_384, z := (-x) mod p_384 (if p nonzero) or z := x (if p zero), assuming x reduced
 // Inputs p, x[6]; output z[6]
-extern void bignum_optneg_p384 (uint64_t z[static 6], uint64_t p, uint64_t x[static 6]);
+extern void bignum_optneg_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], uint64_t p, const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Optionally negate modulo p_521, z := (-x) mod p_521 (if p nonzero) or z := x (if p zero), assuming x reduced
 // Inputs p, x[9]; output z[9]
-extern void bignum_optneg_p521 (uint64_t z[static 9], uint64_t p, uint64_t x[static 9]);
+extern void bignum_optneg_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], uint64_t p, const uint64_t x[S2N_BIGNUM_STATIC 9]);
+// Optionally negate modulo p_sm2, z := (-x) mod p_sm2 (if p nonzero) or z := x (if p zero), assuming x reduced
+// Inputs p, x[4]; output z[4]
+extern void bignum_optneg_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t p, const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Optionally subtract, z := x - y (if p nonzero) or z := x (if p zero)
 // Inputs x[k], p, y[k]; outputs function return (carry-out) and z[k]
-extern uint64_t bignum_optsub (uint64_t k, uint64_t *z, uint64_t *x, uint64_t p, uint64_t *y);
+extern uint64_t bignum_optsub (uint64_t k, uint64_t *z, const uint64_t *x, uint64_t p, const uint64_t *y);
 // Optionally subtract or add, z := x + sgn(p) * y interpreting p as signed
 // Inputs x[k], p, y[k]; outputs function return (carry-out) and z[k]
-extern uint64_t bignum_optsubadd (uint64_t k, uint64_t *z, uint64_t *x, uint64_t p, uint64_t *y);
+extern uint64_t bignum_optsubadd (uint64_t k, uint64_t *z, const uint64_t *x, uint64_t p, const uint64_t *y);
 // Return bignum of power of 2, z := 2^n
 // Input n; output z[k]
@@ -616,216 +781,376 @@ extern void bignum_pow2 (uint64_t k, uint64_t *z, uint64_t n);
 // Shift bignum left by c < 64 bits z := x * 2^c
 // Inputs x[n], c; outputs function return (carry-out) and z[k]
-extern uint64_t bignum_shl_small (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x, uint64_t c);
+extern uint64_t bignum_shl_small (uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x, uint64_t c);
 // Shift bignum right by c < 64 bits z := floor(x / 2^c)
 // Inputs x[n], c; outputs function return (bits shifted out) and z[k]
-extern uint64_t bignum_shr_small (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x, uint64_t c);
+extern uint64_t bignum_shr_small (uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x, uint64_t c);
 // Square, z := x^2
 // Input x[n]; output z[k]
-extern void bignum_sqr (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x);
+extern void bignum_sqr (uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x);
 // Square, z := x^2
 // Input x[4]; output z[8]
-extern void bignum_sqr_4_8 (uint64_t z[static 8], uint64_t x[static 4]);
+extern void bignum_sqr_4_8 (uint64_t z[S2N_BIGNUM_STATIC 8], const uint64_t x[S2N_BIGNUM_STATIC 4]);
-extern void bignum_sqr_4_8_alt (uint64_t z[static 8], uint64_t x[static 4]);
+extern void bignum_sqr_4_8_alt (uint64_t z[S2N_BIGNUM_STATIC 8], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Square, z := x^2
 // Input x[6]; output z[12]
-extern void bignum_sqr_6_12 (uint64_t z[static 12], uint64_t x[static 6]);
+extern void bignum_sqr_6_12 (uint64_t z[S2N_BIGNUM_STATIC 12], const uint64_t x[S2N_BIGNUM_STATIC 6]);
-extern void bignum_sqr_6_12_alt (uint64_t z[static 12], uint64_t x[static 6]);
+extern void bignum_sqr_6_12_alt (uint64_t z[S2N_BIGNUM_STATIC 12], const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Square, z := x^2
 // Input x[8]; output z[16]
-extern void bignum_sqr_8_16 (uint64_t z[static 16], uint64_t x[static 8]);
+extern void bignum_sqr_8_16 (uint64_t z[S2N_BIGNUM_STATIC 16], const uint64_t x[S2N_BIGNUM_STATIC 8]);
-extern void bignum_sqr_8_16_alt (uint64_t z[static 16], uint64_t x[static 8]);
+extern void bignum_sqr_8_16_alt (uint64_t z[S2N_BIGNUM_STATIC 16], const uint64_t x[S2N_BIGNUM_STATIC 8]);
 // Square modulo p_25519, z := (x^2) mod p_25519
 // Input x[4]; output z[4]
-extern void bignum_sqr_p25519 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_sqr_p25519 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
-extern void bignum_sqr_p25519_alt (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_sqr_p25519_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Square modulo p_256k1, z := (x^2) mod p_256k1
 // Input x[4]; output z[4]
-extern void bignum_sqr_p256k1 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_sqr_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
-extern void bignum_sqr_p256k1_alt (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_sqr_p256k1_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Square modulo p_521, z := (x^2) mod p_521, assuming x reduced
 // Input x[9]; output z[9]
-extern void bignum_sqr_p521 (uint64_t z[static 9], uint64_t x[static 9]);
+extern void bignum_sqr_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
-extern void bignum_sqr_p521_alt (uint64_t z[static 9], uint64_t x[static 9]);
+extern void bignum_sqr_p521_alt (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
+// Square root modulo p_25519
+// Input x[4]; output function return (Legendre symbol) and z[4]
+extern int64_t bignum_sqrt_p25519(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]);
+extern int64_t bignum_sqrt_p25519_alt(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Subtract, z := x - y
 // Inputs x[m], y[n]; outputs function return (carry-out) and z[p]
-extern uint64_t bignum_sub (uint64_t p, uint64_t *z, uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
+extern uint64_t bignum_sub (uint64_t p, uint64_t *z, uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y);
 // Subtract modulo p_25519, z := (x - y) mod p_25519, assuming x and y reduced
 // Inputs x[4], y[4]; output z[4]
-extern void bignum_sub_p25519 (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]);
+extern void bignum_sub_p25519 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
 // Subtract modulo p_256, z := (x - y) mod p_256, assuming x and y reduced
 // Inputs x[4], y[4]; output z[4]
-extern void bignum_sub_p256 (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]);
+extern void bignum_sub_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
 // Subtract modulo p_256k1, z := (x - y) mod p_256k1, assuming x and y reduced
 // Inputs x[4], y[4]; output z[4]
-extern void bignum_sub_p256k1 (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]);
+extern void bignum_sub_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
 // Subtract modulo p_384, z := (x - y) mod p_384, assuming x and y reduced
 // Inputs x[6], y[6]; output z[6]
-extern void bignum_sub_p384 (uint64_t z[static 6], uint64_t x[static 6], uint64_t y[static 6]);
+extern void bignum_sub_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]);
 // Subtract modulo p_521, z := (x - y) mod p_521, assuming x and y reduced
 // Inputs x[9], y[9]; output z[9]
-extern void bignum_sub_p521 (uint64_t z[static 9], uint64_t x[static 9], uint64_t y[static 9]);
+extern void bignum_sub_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9], const uint64_t y[S2N_BIGNUM_STATIC 9]);
+// Subtract modulo p_sm2, z := (x - y) mod p_sm2, assuming x and y reduced
+// Inputs x[4], y[4]; output z[4]
+extern void bignum_sub_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
 // Convert 4-digit (256-bit) bignum to big-endian bytes
 // Input x[4]; output z[32] (bytes)
-extern void bignum_tobebytes_4 (uint8_t z[static 32], uint64_t x[static 4]);
+extern void bignum_tobebytes_4 (uint8_t z[S2N_BIGNUM_STATIC 32], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Convert 6-digit (384-bit) bignum to big-endian bytes
 // Input x[6]; output z[48] (bytes)
-extern void bignum_tobebytes_6 (uint8_t z[static 48], uint64_t x[static 6]);
+extern void bignum_tobebytes_6 (uint8_t z[S2N_BIGNUM_STATIC 48], const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Convert 4-digit (256-bit) bignum to little-endian bytes
 // Input x[4]; output z[32] (bytes)
-extern void bignum_tolebytes_4 (uint8_t z[static 32], uint64_t x[static 4]);
+extern void bignum_tolebytes_4 (uint8_t z[S2N_BIGNUM_STATIC 32], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Convert 6-digit (384-bit) bignum to little-endian bytes
 // Input x[6]; output z[48] (bytes)
-extern void bignum_tolebytes_6 (uint8_t z[static 48], uint64_t x[static 6]);
+extern void bignum_tolebytes_6 (uint8_t z[S2N_BIGNUM_STATIC 48], const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Convert 9-digit 528-bit bignum to little-endian bytes
-// Input x[6]; output z[66] (bytes)
+// Input x[9]; output z[66] (bytes)
-extern void bignum_tolebytes_p521 (uint8_t z[static 66], uint64_t x[static 9]);
+extern void bignum_tolebytes_p521 (uint8_t z[S2N_BIGNUM_STATIC 66], const uint64_t x[S2N_BIGNUM_STATIC 9]);
 // Convert to Montgomery form z := (2^256 * x) mod p_256
 // Input x[4]; output z[4]
-extern void bignum_tomont_p256 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_tomont_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
-extern void bignum_tomont_p256_alt (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_tomont_p256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Convert to Montgomery form z := (2^256 * x) mod p_256k1
 // Input x[4]; output z[4]
-extern void bignum_tomont_p256k1 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_tomont_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
-extern void bignum_tomont_p256k1_alt (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_tomont_p256k1_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Convert to Montgomery form z := (2^384 * x) mod p_384
 // Input x[6]; output z[6]
-extern void bignum_tomont_p384 (uint64_t z[static 6], uint64_t x[static 6]);
+extern void bignum_tomont_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
-extern void bignum_tomont_p384_alt (uint64_t z[static 6], uint64_t x[static 6]);
+extern void bignum_tomont_p384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Convert to Montgomery form z := (2^576 * x) mod p_521
 // Input x[9]; output z[9]
-extern void bignum_tomont_p521 (uint64_t z[static 9], uint64_t x[static 9]);
+extern void bignum_tomont_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
+// Convert to Montgomery form z := (2^256 * x) mod p_sm2
+// Input x[4]; output z[4]
+extern void bignum_tomont_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Triple modulo p_256, z := (3 * x) mod p_256
 // Input x[4]; output z[4]
-extern void bignum_triple_p256 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_triple_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
-extern void bignum_triple_p256_alt (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_triple_p256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Triple modulo p_256k1, z := (3 * x) mod p_256k1
 // Input x[4]; output z[4]
-extern void bignum_triple_p256k1 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_triple_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
-extern void bignum_triple_p256k1_alt (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_triple_p256k1_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Triple modulo p_384, z := (3 * x) mod p_384
 // Input x[6]; output z[6]
-extern void bignum_triple_p384 (uint64_t z[static 6], uint64_t x[static 6]);
+extern void bignum_triple_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
-extern void bignum_triple_p384_alt (uint64_t z[static 6], uint64_t x[static 6]);
+extern void bignum_triple_p384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Triple modulo p_521, z := (3 * x) mod p_521, assuming x reduced
 // Input x[9]; output z[9]
-extern void bignum_triple_p521 (uint64_t z[static 9], uint64_t x[static 9]);
+extern void bignum_triple_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
-extern void bignum_triple_p521_alt (uint64_t z[static 9], uint64_t x[static 9]);
+extern void bignum_triple_p521_alt (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
+// Triple modulo p_sm2, z := (3 * x) mod p_sm2
+// Input x[4]; output z[4]
+extern void bignum_triple_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
+extern void bignum_triple_sm2_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Montgomery ladder step for curve25519
 // Inputs point[8], pp[16], b; output rr[16]
-extern void curve25519_ladderstep(uint64_t rr[16],uint64_t point[8],uint64_t pp[16],uint64_t b);
+extern void curve25519_ladderstep(uint64_t rr[16],const uint64_t point[8],const uint64_t pp[16],uint64_t b);
-extern void curve25519_ladderstep_alt(uint64_t rr[16],uint64_t point[8],uint64_t pp[16],uint64_t b);
+extern void curve25519_ladderstep_alt(uint64_t rr[16],const uint64_t point[8],const uint64_t pp[16],uint64_t b);
 // Projective scalar multiplication, x coordinate only, for curve25519
 // Inputs scalar[4], point[4]; output res[8]
-extern void curve25519_pxscalarmul(uint64_t res[static 8],uint64_t scalar[static 4],uint64_t point[static 4]);
+extern void curve25519_pxscalarmul(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 4]);
-extern void curve25519_pxscalarmul_alt(uint64_t res[static 8],uint64_t scalar[static 4],uint64_t point[static 4]);
+extern void curve25519_pxscalarmul_alt(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 4]);
 // x25519 function for curve25519
 // Inputs scalar[4], point[4]; output res[4]
-extern void curve25519_x25519(uint64_t res[static 4],uint64_t scalar[static 4],uint64_t point[static 4]);
+extern void curve25519_x25519(uint64_t res[S2N_BIGNUM_STATIC 4],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 4]);
-extern void curve25519_x25519_alt(uint64_t res[static 4],uint64_t scalar[static 4],uint64_t point[static 4]);
+extern void curve25519_x25519_alt(uint64_t res[S2N_BIGNUM_STATIC 4],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 4]);
+// x25519 function for curve25519 (byte array arguments)
+// Inputs scalar[32] (bytes), point[32] (bytes); output res[32] (bytes)
+extern void curve25519_x25519_byte(uint8_t res[S2N_BIGNUM_STATIC 32],const uint8_t scalar[S2N_BIGNUM_STATIC 32],const uint8_t point[S2N_BIGNUM_STATIC 32]);
+extern void curve25519_x25519_byte_alt(uint8_t res[S2N_BIGNUM_STATIC 32],const uint8_t scalar[S2N_BIGNUM_STATIC 32],const uint8_t point[S2N_BIGNUM_STATIC 32]);
 // x25519 function for curve25519 on base element 9
 // Input scalar[4]; output res[4]
-extern void curve25519_x25519base(uint64_t res[static 4],uint64_t scalar[static 4]);
+extern void curve25519_x25519base(uint64_t res[S2N_BIGNUM_STATIC 4],const uint64_t scalar[S2N_BIGNUM_STATIC 4]);
-extern void curve25519_x25519base_alt(uint64_t res[static 4],uint64_t scalar[static 4]);
+extern void curve25519_x25519base_alt(uint64_t res[S2N_BIGNUM_STATIC 4],const uint64_t scalar[S2N_BIGNUM_STATIC 4]);
+// x25519 function for curve25519 on base element 9 (byte array arguments)
+// Input scalar[32] (bytes); output res[32] (bytes)
+extern void curve25519_x25519base_byte(uint8_t res[S2N_BIGNUM_STATIC 32],const uint8_t scalar[S2N_BIGNUM_STATIC 32]);
+extern void curve25519_x25519base_byte_alt(uint8_t res[S2N_BIGNUM_STATIC 32],const uint8_t scalar[S2N_BIGNUM_STATIC 32]);
+// Decode compressed 256-bit form of edwards25519 point
+// Input c[32] (bytes); output function return and z[8]
+extern uint64_t edwards25519_decode(uint64_t z[S2N_BIGNUM_STATIC 8], const uint8_t c[S2N_BIGNUM_STATIC 32]);
+extern uint64_t edwards25519_decode_alt(uint64_t z[S2N_BIGNUM_STATIC 8], const uint8_t c[S2N_BIGNUM_STATIC 32]);
+// Encode edwards25519 point into compressed form as 256-bit number
+// Input p[8]; output z[32] (bytes)
+extern void edwards25519_encode(uint8_t z[S2N_BIGNUM_STATIC 32], const uint64_t p[S2N_BIGNUM_STATIC 8]);
 // Extended projective addition for edwards25519
 // Inputs p1[16], p2[16]; output p3[16]
-extern void edwards25519_epadd(uint64_t p3[static 16],uint64_t p1[static 16],uint64_t p2[static 16]);
+extern void edwards25519_epadd(uint64_t p3[S2N_BIGNUM_STATIC 16],const uint64_t p1[S2N_BIGNUM_STATIC 16],const uint64_t p2[S2N_BIGNUM_STATIC 16]);
-extern void edwards25519_epadd_alt(uint64_t p3[static 16],uint64_t p1[static 16],uint64_t p2[static 16]);
+extern void edwards25519_epadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 16],const uint64_t p1[S2N_BIGNUM_STATIC 16],const uint64_t p2[S2N_BIGNUM_STATIC 16]);
 // Extended projective doubling for edwards25519
 // Inputs p1[12]; output p3[16]
-extern void edwards25519_epdouble(uint64_t p3[static 16],uint64_t p1[static 12]);
+extern void edwards25519_epdouble(uint64_t p3[S2N_BIGNUM_STATIC 16],const uint64_t p1[S2N_BIGNUM_STATIC 12]);
-extern void edwards25519_epdouble_alt(uint64_t p3[static 16],uint64_t p1[static 12]);
+extern void edwards25519_epdouble_alt(uint64_t p3[S2N_BIGNUM_STATIC 16],const uint64_t p1[S2N_BIGNUM_STATIC 12]);
 // Projective doubling for edwards25519
 // Inputs p1[12]; output p3[12]
-extern void edwards25519_pdouble(uint64_t p3[static 12],uint64_t p1[static 12]);
+extern void edwards25519_pdouble(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12]);
-extern void edwards25519_pdouble_alt(uint64_t p3[static 12],uint64_t p1[static 12]);
+extern void edwards25519_pdouble_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12]);
 // Extended projective + precomputed mixed addition for edwards25519
 // Inputs p1[16], p2[12]; output p3[16]
-extern void edwards25519_pepadd(uint64_t p3[static 16],uint64_t p1[static 16],uint64_t p2[static 12]);
+extern void edwards25519_pepadd(uint64_t p3[S2N_BIGNUM_STATIC 16],const uint64_t p1[S2N_BIGNUM_STATIC 16],const uint64_t p2[S2N_BIGNUM_STATIC 12]);
-extern void edwards25519_pepadd_alt(uint64_t p3[static 16],uint64_t p1[static 16],uint64_t p2[static 12]);
+extern void edwards25519_pepadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 16],const uint64_t p1[S2N_BIGNUM_STATIC 16],const uint64_t p2[S2N_BIGNUM_STATIC 12]);
+// Scalar multiplication by standard basepoint for edwards25519 (Ed25519)
+// Input scalar[4]; output res[8]
+extern void edwards25519_scalarmulbase(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4]);
+extern void edwards25519_scalarmulbase_alt(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4]);
+// Double scalar multiplication for edwards25519, fresh and base point
+// Inputs scalar[4], point[8], bscalar[4]; output res[8]
+extern void edwards25519_scalarmuldouble(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4], const uint64_t point[S2N_BIGNUM_STATIC 8],const uint64_t bscalar[S2N_BIGNUM_STATIC 4]);
+extern void edwards25519_scalarmuldouble_alt(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4], const uint64_t point[S2N_BIGNUM_STATIC 8],const uint64_t bscalar[S2N_BIGNUM_STATIC 4]);
+// Scalar product of 2-element polynomial vectors in NTT domain, with mulcache
+// Inputs a[512], b[512], bt[256] (signed 16-bit words); output r[256] (signed 16-bit words)
+extern void mlkem_basemul_k2(int16_t r[S2N_BIGNUM_STATIC 256],const int16_t a[S2N_BIGNUM_STATIC 512],const int16_t b[S2N_BIGNUM_STATIC 512],const int16_t bt[S2N_BIGNUM_STATIC 256]);
+// Scalar product of 3-element polynomial vectors in NTT domain, with mulcache
+// Inputs a[768], b[768], bt[384] (signed 16-bit words); output r[256] (signed 16-bit words)
+extern void mlkem_basemul_k3(int16_t r[S2N_BIGNUM_STATIC 256],const int16_t a[S2N_BIGNUM_STATIC 768],const int16_t b[S2N_BIGNUM_STATIC 768],const int16_t bt[S2N_BIGNUM_STATIC 384]);
+// Scalar product of 4-element polynomial vectors in NTT domain, with mulcache
+// Inputs a[1024], b[1024], bt[512] (signed 16-bit words); output r[256] (signed 16-bit words)
+extern void mlkem_basemul_k4(int16_t r[S2N_BIGNUM_STATIC 256],const int16_t a[S2N_BIGNUM_STATIC 1024],const int16_t b[S2N_BIGNUM_STATIC 1024],const int16_t bt[S2N_BIGNUM_STATIC 512]);
+// Inverse number-theoretic transform from ML-KEM
+// Inputs a[256] (signed 16-bit words), z_01234[80] (signed 16-bit words), z_56[384] (signed 16-bit words); output a[256] (signed 16-bit words)
+extern void mlkem_intt(int16_t a[S2N_BIGNUM_STATIC 256],const int16_t z_01234[S2N_BIGNUM_STATIC 80],const int16_t z_56[S2N_BIGNUM_STATIC 384]);
+// Precompute the mulcache data for a polynomial in the NTT domain
+// Inputs a[256], z[128] and t[128] (signed 16-bit words); output x[128] (signed 16-bit words)
+extern void mlkem_mulcache_compute(int16_t x[S2N_BIGNUM_STATIC 128],const int16_t a[S2N_BIGNUM_STATIC 256],const int16_t z[S2N_BIGNUM_STATIC 128],const int16_t t[S2N_BIGNUM_STATIC 128]);
+// Forward number-theoretic transform from ML-KEM
+// Inputs a[256] (signed 16-bit words), z_01234[80] (signed 16-bit words), z_56[384] (signed 16-bit words); output a[256] (signed 16-bit words)
+extern void mlkem_ntt(int16_t a[S2N_BIGNUM_STATIC 256],const int16_t z_01234[S2N_BIGNUM_STATIC 80],const int16_t z_56[S2N_BIGNUM_STATIC 384]);
+// Canonical modular reduction of polynomial coefficients for ML-KEM
+// Input a[256] (signed 16-bit words); output a[256] (signed 16-bit words)
+extern void mlkem_reduce(int16_t a[S2N_BIGNUM_STATIC 256]);
+// Pack ML-KEM polynomial coefficients as 12-bit numbers
+// Input a[256] (signed 16-bit words); output r[384] (bytes)
+extern void mlkem_tobytes(uint8_t r[S2N_BIGNUM_STATIC 384],const int16_t a[S2N_BIGNUM_STATIC 256]);
+// Conversion of ML-KEM polynomial coefficients to Montgomery form
+// Input a[256] (signed 16-bit words); output a[256] (signed 16-bit words)
+extern void mlkem_tomont(int16_t a[S2N_BIGNUM_STATIC 256]);
+// Uniform rejection sampling for ML-KEM
+// Inputs *buf (unsigned bytes), buflen, table (unsigned bytes); outputs function return and r[256] (signed 16-bit words)
+extern uint64_t mlkem_rej_uniform_VARIABLE_TIME(int16_t r[S2N_BIGNUM_STATIC 256],const uint8_t *buf,uint64_t buflen,const uint8_t *table);
 // Point addition on NIST curve P-256 in Montgomery-Jacobian coordinates
 // Inputs p1[12], p2[12]; output p3[12]
-extern void p256_montjadd(uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 12]);
+extern void p256_montjadd(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 12]);
+extern void p256_montjadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 12]);
 // Point doubling on NIST curve P-256 in Montgomery-Jacobian coordinates
 // Inputs p1[12]; output p3[12]
-extern void p256_montjdouble(uint64_t p3[static 12],uint64_t p1[static 12]);
+extern void p256_montjdouble(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12]);
+extern void p256_montjdouble_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12]);
 // Point mixed addition on NIST curve P-256 in Montgomery-Jacobian coordinates
 // Inputs p1[12], p2[8]; output p3[12]
-extern void p256_montjmixadd(uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 8]);
+extern void p256_montjmixadd(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 8]);
+extern void p256_montjmixadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 8]);
+// Montgomery-Jacobian form scalar multiplication for P-256
+// Inputs scalar[4], point[12]; output res[12]
+extern void p256_montjscalarmul(uint64_t res[S2N_BIGNUM_STATIC 12],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 12]);
+extern void p256_montjscalarmul_alt(uint64_t res[S2N_BIGNUM_STATIC 12],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 12]);
+// Scalar multiplication for NIST curve P-256
+// Inputs scalar[4], point[8]; output res[8]
+extern void p256_scalarmul(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 8]);
+extern void p256_scalarmul_alt(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 8]);
+// Scalar multiplication for precomputed point on NIST curve P-256
+// Inputs scalar[4], blocksize, table[]; output res[8]
+extern void p256_scalarmulbase(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4],uint64_t blocksize,const uint64_t *table);
+extern void p256_scalarmulbase_alt(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4],uint64_t blocksize,const uint64_t *table);
 // Point addition on NIST curve P-384 in Montgomery-Jacobian coordinates
 // Inputs p1[18], p2[18]; output p3[18]
-extern void p384_montjadd(uint64_t p3[static 18],uint64_t p1[static 18],uint64_t p2[static 18]);
+extern void p384_montjadd(uint64_t p3[S2N_BIGNUM_STATIC 18],const uint64_t p1[S2N_BIGNUM_STATIC 18],const uint64_t p2[S2N_BIGNUM_STATIC 18]);
+extern void p384_montjadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 18],const uint64_t p1[S2N_BIGNUM_STATIC 18],const uint64_t p2[S2N_BIGNUM_STATIC 18]);
 // Point doubling on NIST curve P-384 in Montgomery-Jacobian coordinates
 // Inputs p1[18]; output p3[18]
-extern void p384_montjdouble(uint64_t p3[static 18],uint64_t p1[static 18]);
+extern void p384_montjdouble(uint64_t p3[S2N_BIGNUM_STATIC 18],const uint64_t p1[S2N_BIGNUM_STATIC 18]);
+extern void p384_montjdouble_alt(uint64_t p3[S2N_BIGNUM_STATIC 18],const uint64_t p1[S2N_BIGNUM_STATIC 18]);
 // Point mixed addition on NIST curve P-384 in Montgomery-Jacobian coordinates
 // Inputs p1[18], p2[12]; output p3[18]
-extern void p384_montjmixadd(uint64_t p3[static 18],uint64_t p1[static 18],uint64_t p2[static 12]);
+extern void p384_montjmixadd(uint64_t p3[S2N_BIGNUM_STATIC 18],const uint64_t p1[S2N_BIGNUM_STATIC 18],const uint64_t p2[S2N_BIGNUM_STATIC 12]);
+extern void p384_montjmixadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 18],const uint64_t p1[S2N_BIGNUM_STATIC 18],const uint64_t p2[S2N_BIGNUM_STATIC 12]);
+// Montgomery-Jacobian form scalar multiplication for P-384
+// Inputs scalar[6], point[18]; output res[18]
+extern void p384_montjscalarmul(uint64_t res[S2N_BIGNUM_STATIC 18],const uint64_t scalar[S2N_BIGNUM_STATIC 6],const uint64_t point[S2N_BIGNUM_STATIC 18]);
+extern void p384_montjscalarmul_alt(uint64_t res[S2N_BIGNUM_STATIC 18],const uint64_t scalar[S2N_BIGNUM_STATIC 6],const uint64_t point[S2N_BIGNUM_STATIC 18]);
 // Point addition on NIST curve P-521 in Jacobian coordinates
 // Inputs p1[27], p2[27]; output p3[27]
-extern void p521_jadd(uint64_t p3[static 27],uint64_t p1[static 27],uint64_t p2[static 27]);
+extern void p521_jadd(uint64_t p3[S2N_BIGNUM_STATIC 27],const uint64_t p1[S2N_BIGNUM_STATIC 27],const uint64_t p2[S2N_BIGNUM_STATIC 27]);
+extern void p521_jadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 27],const uint64_t p1[S2N_BIGNUM_STATIC 27],const uint64_t p2[S2N_BIGNUM_STATIC 27]);
 // Point doubling on NIST curve P-521 in Jacobian coordinates
 // Input p1[27]; output p3[27]
-extern void p521_jdouble(uint64_t p3[static 27],uint64_t p1[static 27]);
+extern void p521_jdouble(uint64_t p3[S2N_BIGNUM_STATIC 27],const uint64_t p1[S2N_BIGNUM_STATIC 27]);
+extern void p521_jdouble_alt(uint64_t p3[S2N_BIGNUM_STATIC 27],const uint64_t p1[S2N_BIGNUM_STATIC 27]);
 // Point mixed addition on NIST curve P-521 in Jacobian coordinates
 // Inputs p1[27], p2[18]; output p3[27]
-extern void p521_jmixadd(uint64_t p3[static 27],uint64_t p1[static 27],uint64_t p2[static 18]);
+extern void p521_jmixadd(uint64_t p3[S2N_BIGNUM_STATIC 27],const uint64_t p1[S2N_BIGNUM_STATIC 27],const uint64_t p2[S2N_BIGNUM_STATIC 18]);
+extern void p521_jmixadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 27],const uint64_t p1[S2N_BIGNUM_STATIC 27],const uint64_t p2[S2N_BIGNUM_STATIC 18]);
+// Jacobian form scalar multiplication for P-521
+// Inputs scalar[9], point[27]; output res[27]
+extern void p521_jscalarmul(uint64_t res[S2N_BIGNUM_STATIC 27],const uint64_t scalar[S2N_BIGNUM_STATIC 9],const uint64_t point[S2N_BIGNUM_STATIC 27]);
+extern void p521_jscalarmul_alt(uint64_t res[S2N_BIGNUM_STATIC 27],const uint64_t scalar[S2N_BIGNUM_STATIC 9],const uint64_t point[S2N_BIGNUM_STATIC 27]);
 // Point addition on SECG curve secp256k1 in Jacobian coordinates
 // Inputs p1[12], p2[12]; output p3[12]
-extern void secp256k1_jadd(uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 12]);
+extern void secp256k1_jadd(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 12]);
+extern void secp256k1_jadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 12]);
 // Point doubling on SECG curve secp256k1 in Jacobian coordinates
 // Input p1[12]; output p3[12]
-extern void secp256k1_jdouble(uint64_t p3[static 12],uint64_t p1[static 12]);
+extern void secp256k1_jdouble(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12]);
+extern void secp256k1_jdouble_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12]);
 // Point mixed addition on SECG curve secp256k1 in Jacobian coordinates
 // Inputs p1[12], p2[8]; output p3[12]
-extern void secp256k1_jmixadd(uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 8]);
+extern void secp256k1_jmixadd(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 8]);
+extern void secp256k1_jmixadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 8]);
+// Keccak-f1600 permutation for SHA3
+// Inputs a[25], rc[24]; output a[25]
+extern void sha3_keccak_f1600(uint64_t a[S2N_BIGNUM_STATIC 25],const uint64_t rc[S2N_BIGNUM_STATIC 24]);
+extern void sha3_keccak_f1600_alt(uint64_t a[S2N_BIGNUM_STATIC 25],const uint64_t rc[S2N_BIGNUM_STATIC 24]);
+// Batched 2-way Keccak-f1600 permutation for SHA3
+// Inputs a[50], rc[24]; output a[50]
+extern void sha3_keccak2_f1600(uint64_t a[S2N_BIGNUM_STATIC 50],const uint64_t rc[S2N_BIGNUM_STATIC 24]);
+extern void sha3_keccak2_f1600_alt(uint64_t a[S2N_BIGNUM_STATIC 50],const uint64_t rc[S2N_BIGNUM_STATIC 24]);
+// Batched 4-way Keccak-f1600 permutation for SHA3
+// Inputs a[100], rc[24]; output a[100]
+extern void sha3_keccak4_f1600(uint64_t a[S2N_BIGNUM_STATIC 100],const uint64_t rc[S2N_BIGNUM_STATIC 24]);
+extern void sha3_keccak4_f1600_alt(uint64_t a[S2N_BIGNUM_STATIC 100],const uint64_t rc[S2N_BIGNUM_STATIC 24]);
+extern void sha3_keccak4_f1600_alt2(uint64_t a[S2N_BIGNUM_STATIC 100],const uint64_t rc[S2N_BIGNUM_STATIC 24]);
+// Point addition on CC curve SM2 in Montgomery-Jacobian coordinates
+// Inputs p1[12], p2[12]; output p3[12]
+extern void sm2_montjadd(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 12]);
+extern void sm2_montjadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 12]);
+// Point doubling on CC curve SM2 in Montgomery-Jacobian coordinates
+// Input p1[12]; output p3[12]
+extern void sm2_montjdouble(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12]);
+extern void sm2_montjdouble_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12]);
+// Point mixed addition on CC curve SM2 in Montgomery-Jacobian coordinates
+// Inputs p1[12], p2[8]; output p3[12]
+extern void sm2_montjmixadd(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 8]);
+extern void sm2_montjmixadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 8]);
+// Montgomery-Jacobian form scalar multiplication for CC curve SM2
+// Inputs scalar[4], point[12]; output res[12]
+extern void sm2_montjscalarmul(uint64_t res[S2N_BIGNUM_STATIC 12],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 12]);
+extern void sm2_montjscalarmul_alt(uint64_t res[S2N_BIGNUM_STATIC 12],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 12]);
 // Reverse the bytes in a single word
 // Input a; output function return
@@ -839,6 +1164,10 @@ extern uint64_t word_clz (uint64_t a);
 // Input a; output function return
 extern uint64_t word_ctz (uint64_t a);
+// Perform 59 "divstep" iterations and return signed matrix of updates
+// Inputs d, f, g; output m[2][2] and function return
+extern int64_t word_divstep59(int64_t m[2][2],int64_t d,uint64_t f,uint64_t g);
 // Return maximum of two unsigned 64-bit words
 // Inputs a, b; output function return
 extern uint64_t word_max (uint64_t a, uint64_t b);
@@ -851,6 +1180,10 @@ extern uint64_t word_min (uint64_t a, uint64_t b);
 // Input a; output function return
 extern uint64_t word_negmodinv (uint64_t a);
+// Count number of set bits in a single 64-bit word (population count)
+// Input a; output function return
+extern uint64_t word_popcount (uint64_t a);
 // Single-word reciprocal, 2^64 + ret = ceil(2^128/a) - 1 if MSB of "a" is set
 // Input a; output function return
 extern uint64_t word_recip (uint64_t a);
diff --git a/src/lib/libcrypto/bn/s2n_bignum_internal.h b/src/lib/libcrypto/bn/s2n_bignum_internal.h
index b82db7d019..37eebb4fd6 100644
--- a/src/lib/libcrypto/bn/s2n_bignum_internal.h
+++ b/src/lib/libcrypto/bn/s2n_bignum_internal.h
@@ -1,3 +1,5 @@
+// $OpenBSD: s2n_bignum_internal.h,v 1.5 2025/08/12 10:01:37 jsing Exp $
+//
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
@@ -14,14 +16,14 @@
 #ifdef __APPLE__
 #   define S2N_BN_SYMBOL(NAME) _##NAME
+#   if defined(__AARCH64EL__) || defined(__ARMEL__)
+#     define __LF %%
+#   else
+#     define __LF ;
+#   endif
 #else
 #   define S2N_BN_SYMBOL(name) name
-#endif
+#   define __LF ;
-#ifdef __CET__
-#   include <cet.h>
-#else
-#   define _CET_ENDBR
 #endif
 #define S2N_BN_SYM_VISIBILITY_DIRECTIVE(name) .globl S2N_BN_SYMBOL(name)
@@ -34,3 +36,24 @@
 #else
 #   define S2N_BN_SYM_PRIVACY_DIRECTIVE(name)  /* NO-OP: S2N_BN_SYM_PRIVACY_DIRECTIVE */
 #endif
+// Enable indirect branch tracking support unless explicitly disabled
+// with -DNO_IBT. If the platform supports CET, simply inherit this from
+// the usual header. Otherwise manually define _CET_ENDBR, used at each
+// x86 entry point, to be the ENDBR64 instruction, with an explicit byte
+// sequence for compilers/assemblers that don't know about it. Note that
+// it is safe to use ENDBR64 on all platforms, since the encoding is by
+// design interpreted as a NOP on all pre-CET x86_64 processors. The only
+// downside is a small increase in code size and potentially a modest
+// slowdown from executing one more instruction.
+#if NO_IBT
+#   if defined(_CET_ENDBR)
+#     error "The s2n-bignum build option NO_IBT was configured, but _CET_ENDBR is defined in this compilation unit. That is weird, so failing the build."
+#   endif
+#   define _CET_ENDBR
+#elif defined(__CET__)
+#   include <cet.h>
+#elif !defined(_CET_ENDBR)
+#   define _CET_ENDBR .byte 0xf3,0x0f,0x1e,0xfa
+#endif