56 files changed, 3827 insertions, 743 deletions
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_add.S b/src/lib/libcrypto/bn/arch/amd64/bignum_add.S
index 5fe4aae7a1..1d4e6d08ef 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_add.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_add.S
@@ -1,3 +1,5 @@
+// $OpenBSD: bignum_add.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
+//
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
@@ -16,9 +18,8 @@
 // Add, z := x + y
 // Inputs x[m], y[n]; outputs function return (carry-out) and z[p]
 //
-//    extern uint64_t bignum_add
+//    extern uint64_t bignum_add(uint64_t p, uint64_t *z, uint64_t m,
-//     (uint64_t p, uint64_t *z,
+//                               const uint64_t *x, uint64_t n, const uint64_t *y);
-//      uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
 //
 // Does the z := x + y operation, truncating modulo p words in general and
 // returning a top carry (0 or 1) in the p'th place, only adding the input
@@ -49,7 +50,7 @@
 S2N_BN_SYMBOL(bignum_add):
-        _CET_ENDBR
+        _CET_ENDBR
 #if WINDOWS_ABI
        push    rdi
@@ -75,7 +76,7 @@ S2N_BN_SYMBOL(bignum_add):
        cmp     p, n
        cmovc   n, p
        cmp     m, n
-        jc      ylonger
+        jc      bignum_add_ylonger
 // The case where x is longer or of the same size (p >= m >= n)
@@ -83,27 +84,27 @@ S2N_BN_SYMBOL(bignum_add):
        sub     m, n
        inc     m
        test    n, n
-        jz      xtest
+        jz      bignum_add_xtest
-xmainloop:
+bignum_add_xmainloop:
        mov     a, [x+8*i]
        adc     a, [y+8*i]
        mov     [z+8*i],a
        inc     i
        dec     n
-        jnz     xmainloop
+        jnz     bignum_add_xmainloop
-        jmp     xtest
+        jmp     bignum_add_xtest
-xtoploop:
+bignum_add_xtoploop:
        mov     a, [x+8*i]
        adc     a, 0
        mov     [z+8*i],a
        inc     i
-xtest:
+bignum_add_xtest:
        dec     m
-        jnz     xtoploop
+        jnz     bignum_add_xtoploop
        mov     ashort, 0
        adc     a, 0
        test    p, p
-        jnz     tails
+        jnz     bignum_add_tails
 #if WINDOWS_ABI
        pop    rsi
        pop    rdi
@@ -112,30 +113,30 @@ xtest:
 // The case where y is longer (p >= n > m)
-ylonger:
+bignum_add_ylonger:
        sub     p, n
        sub     n, m
        test    m, m
-        jz      ytoploop
+        jz      bignum_add_ytoploop
-ymainloop:
+bignum_add_ymainloop:
        mov     a, [x+8*i]
        adc     a, [y+8*i]
        mov     [z+8*i],a
        inc     i
        dec     m
-        jnz     ymainloop
+        jnz     bignum_add_ymainloop
-ytoploop:
+bignum_add_ytoploop:
        mov     a, [y+8*i]
        adc     a, 0
        mov     [z+8*i],a
        inc     i
        dec     n
-        jnz     ytoploop
+        jnz     bignum_add_ytoploop
        mov     ashort, 0
        adc     a, 0
        test    p, p
-        jnz     tails
+        jnz     bignum_add_tails
 #if WINDOWS_ABI
        pop    rsi
        pop    rdi
@@ -144,16 +145,16 @@ ytoploop:
 // Adding a non-trivial tail, when p > max(m,n)
-tails:
+bignum_add_tails:
        mov     [z+8*i],a
        xor     a, a
-        jmp     tail
+        jmp     bignum_add_tail
-tailloop:
+bignum_add_tailloop:
        mov     [z+8*i],a
-tail:
+bignum_add_tail:
        inc     i
        dec     p
-        jnz     tailloop
+        jnz     bignum_add_tailloop
 #if WINDOWS_ABI
        pop    rsi
        pop    rdi
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_cmadd.S b/src/lib/libcrypto/bn/arch/amd64/bignum_cmadd.S
index 25ba17bce2..a611919603 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_cmadd.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_cmadd.S
@@ -1,3 +1,5 @@
+// $OpenBSD: bignum_cmadd.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
+//
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
@@ -16,8 +18,8 @@
 // Multiply-add with single-word multiplier, z := z + c * y
 // Inputs c, y[n]; outputs function return (carry-out) and z[k]
 //
-//    extern uint64_t bignum_cmadd
+//    extern uint64_t bignum_cmadd(uint64_t k, uint64_t *z, uint64_t c, uint64_t n,
-//     (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y);
+//                                 const uint64_t *y);
 //
 // Does the "z := z + c * y" operation where y is n digits, result z is p.
 // Truncates the result in general.
@@ -54,7 +56,7 @@
 S2N_BN_SYMBOL(bignum_cmadd):
-        _CET_ENDBR
+        _CET_ENDBR
 #if WINDOWS_ABI
        push    rdi
@@ -82,7 +84,7 @@ S2N_BN_SYMBOL(bignum_cmadd):
        xor     h, h
        test    n, n
-        jz      end
+        jz      bignum_cmadd_end
 // Move c into a safer register as multiplies overwrite rdx
@@ -96,11 +98,11 @@ S2N_BN_SYMBOL(bignum_cmadd):
        mov     h, rdx
        mov     ishort, 1
        dec     n
-        jz      hightail
+        jz      bignum_cmadd_hightail
 // Main loop, where we always have CF + previous high part h to add in
-loop:
+bignum_cmadd_loop:
        adc     h, [z+8*i]
        sbb     r, r
        mov     rax, [x+8*i]
@@ -111,36 +113,36 @@ loop:
        mov     h, rdx
        inc     i
        dec     n
-        jnz     loop
+        jnz     bignum_cmadd_loop
-hightail:
+bignum_cmadd_hightail:
        adc     h, 0
 // Propagate the carry all the way to the end with h as extra carry word
-tail:
+bignum_cmadd_tail:
        test    p, p
-        jz      end
+        jz      bignum_cmadd_end
        add     [z+8*i], h
        mov     hshort, 0
        inc     i
        dec     p
-        jz      highend
+        jz      bignum_cmadd_highend
-tloop:
+bignum_cmadd_tloop:
        adc     [z+8*i], h
        inc     i
        dec     p
-        jnz     tloop
+        jnz     bignum_cmadd_tloop
-highend:
+bignum_cmadd_highend:
        adc     h, 0
 // Return the high/carry word
-end:
+bignum_cmadd_end:
        mov     rax, h
        pop     rbx
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_cmul.S b/src/lib/libcrypto/bn/arch/amd64/bignum_cmul.S
index 12f785d63a..eb71d9da44 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_cmul.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_cmul.S
@@ -1,3 +1,5 @@
+// $OpenBSD: bignum_cmul.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
+//
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
@@ -16,8 +18,8 @@
 // Multiply by a single word, z := c * y
 // Inputs c, y[n]; outputs function return (carry-out) and z[k]
 //
-//    extern uint64_t bignum_cmul
+//    extern uint64_t bignum_cmul(uint64_t k, uint64_t *z, uint64_t c, uint64_t n,
-//     (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y);
+//                                const uint64_t *y);
 //
 // Does the "z := c * y" operation where y is n digits, result z is p.
 // Truncates the result in general unless p >= n + 1.
@@ -51,7 +53,7 @@
 S2N_BN_SYMBOL(bignum_cmul):
-        _CET_ENDBR
+        _CET_ENDBR
 #if WINDOWS_ABI
        push    rdi
@@ -76,7 +78,7 @@ S2N_BN_SYMBOL(bignum_cmul):
        xor     h, h
        xor     i, i
        test    n, n
-        jz      tail
+        jz      bignum_cmul_tail
 // Move c into a safer register as multiplies overwrite rdx
@@ -90,11 +92,11 @@ S2N_BN_SYMBOL(bignum_cmul):
        mov     h, rdx
        inc     i
        cmp     i, n
-        jz      tail
+        jz      bignum_cmul_tail
 // Main loop doing the multiplications
-loop:
+bignum_cmul_loop:
        mov     rax, [x+8*i]
        mul     c
        add     rax, h
@@ -103,28 +105,28 @@ loop:
        mov     h, rdx
        inc     i
        cmp     i, n
-        jc      loop
+        jc      bignum_cmul_loop
 // Add a tail when the destination is longer
-tail:
+bignum_cmul_tail:
        cmp     i, p
-        jnc     end
+        jnc     bignum_cmul_end
        mov     [z+8*i], h
        xor     h, h
        inc     i
        cmp     i, p
-        jnc     end
+        jnc     bignum_cmul_end
-tloop:
+bignum_cmul_tloop:
        mov     [z+8*i], h
        inc     i
        cmp     i, p
-        jc      tloop
+        jc      bignum_cmul_tloop
 // Return the high/carry word
-end:
+bignum_cmul_end:
        mov     rax, h
 #if WINDOWS_ABI
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_modadd.S b/src/lib/libcrypto/bn/arch/amd64/bignum_modadd.S
new file mode 100644
index 0000000000..baf27fdc7f
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_modadd.S
@@ -0,0 +1,112 @@
+// $OpenBSD: bignum_modadd.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
+//
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+// ----------------------------------------------------------------------------
+// Add modulo m, z := (x + y) mod m, assuming x and y reduced
+// Inputs x[k], y[k], m[k]; output z[k]
+//
+//    extern void bignum_modadd(uint64_t k, uint64_t *z, const uint64_t *x,
+//                              const uint64_t *y, const uint64_t *m);
+//
+// Standard x86-64 ABI: RDI = k, RSI = z, RDX = x, RCX = y, R8 = m
+// Microsoft x64 ABI:   RCX = k, RDX = z, R8 = x, R9 = y, [RSP+40] = m
+// ----------------------------------------------------------------------------
+#include "s2n_bignum_internal.h"
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_modadd)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_modadd)
+        .text
+#define k rdi
+#define z rsi
+#define x rdx
+#define y rcx
+#define m r8
+#define i r9
+#define j r10
+#define a rax
+#define c r11
+S2N_BN_SYMBOL(bignum_modadd):
+        _CET_ENDBR
+#if WINDOWS_ABI
+        push    rdi                     // rdi and rsi are callee-saved in the Microsoft x64 ABI
+        push    rsi
+        mov     rdi, rcx                // move k, z, x, y into the standard-ABI registers
+        mov     rsi, rdx
+        mov     rdx, r8
+        mov     rcx, r9
+        mov     r8, [rsp+56]            // m: [RSP+40] at entry, plus 16 bytes for the two pushes
+#endif
+// If k = 0 do nothing (no word of x, y, m or z is touched)
+        test    k, k
+        jz      bignum_modadd_end
+// First just add (c::z) := x + y, where c is the carry word above the k words of z
+        xor     c, c                    // c = 0; it receives the top carry after the loop
+        mov     j, k                    // j = number of words still to add
+        xor     i, i                    // i = 0, and CF = 0 so the first adc has no carry-in
+bignum_modadd_addloop:
+        mov     a, [x+8*i]
+        adc     a, [y+8*i]
+        mov     [z+8*i], a
+        inc     i                       // inc/dec leave CF alone, so the carry survives the loop control
+        dec     j
+        jnz     bignum_modadd_addloop
+        adc     c, 0                    // c = carry out of the k-word sum (0 or 1)
+// Now do a comparison subtraction (c::z) - m, recording mask for (c::z) >= m
+        mov     j, k
+        xor     i, i                    // restart at word 0 with the borrow (CF) cleared
+bignum_modadd_cmploop:
+        mov     a, [z+8*i]
+        sbb     a, [m+8*i]              // the difference is discarded; only the borrow is kept
+        inc     i
+        dec     j
+        jnz     bignum_modadd_cmploop
+        sbb     c, 0                    // for reduced x, y: c = 0 if (c::z) >= m, all ones otherwise
+        not     c                       // flip it: c = all ones exactly when m must be subtracted
+// Now do a masked subtraction z := z - [c] * m (the same instructions run whatever the mask is)
+        xor     i, i
+bignum_modadd_subloop:
+        mov     a, [m+8*i]
+        and     a, c                    // a = m[i] if the mask is set, else 0
+        neg     j                       // CF = (j != 0): reload the saved borrow (j is 0 on the first pass)
+        sbb     [z+8*i], a
+        sbb     j, j                    // j = -borrow, saved because the cmp below overwrites CF
+        inc     i
+        cmp     i, k
+        jc      bignum_modadd_subloop   // continue while i < k (unsigned)
+bignum_modadd_end:
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_modsub.S b/src/lib/libcrypto/bn/arch/amd64/bignum_modsub.S
new file mode 100644
index 0000000000..63b3230e35
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_modsub.S
@@ -0,0 +1,99 @@
+// $OpenBSD: bignum_modsub.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
+//
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+// ----------------------------------------------------------------------------
+// Subtract modulo m, z := (x - y) mod m, assuming x and y reduced
+// Inputs x[k], y[k], m[k]; output z[k]
+//
+//    extern void bignum_modsub(uint64_t k, uint64_t *z, const uint64_t *x,
+//                              const uint64_t *y, const uint64_t *m);
+//
+// Standard x86-64 ABI: RDI = k, RSI = z, RDX = x, RCX = y, R8 = m
+// Microsoft x64 ABI:   RCX = k, RDX = z, R8 = x, R9 = y, [RSP+40] = m
+// ----------------------------------------------------------------------------
+#include "s2n_bignum_internal.h"
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_modsub)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_modsub)
+        .text
+#define k rdi
+#define z rsi
+#define x rdx
+#define y rcx
+#define m r8
+#define i r9
+#define j r10
+#define a rax
+#define c r11
+S2N_BN_SYMBOL(bignum_modsub):
+        _CET_ENDBR
+#if WINDOWS_ABI
+        push    rdi                     // rdi and rsi are callee-saved in the Microsoft x64 ABI
+        push    rsi
+        mov     rdi, rcx                // move k, z, x, y into the standard-ABI registers
+        mov     rsi, rdx
+        mov     rdx, r8
+        mov     rcx, r9
+        mov     r8, [rsp+56]            // m: [RSP+40] at entry, plus 16 bytes for the two pushes
+#endif
+// If k = 0 do nothing (no word of x, y, m or z is touched)
+        test    k, k
+        jz      bignum_modsub_end
+// Subtract z := x - y and record a mask for the carry x - y < 0
+        xor     c, c
+        mov     j, k                    // j = number of words still to subtract
+        xor     i, i                    // i = 0, and CF = 0 so the first sbb has no borrow-in
+bignum_modsub_subloop:
+        mov     a, [x+8*i]
+        sbb     a, [y+8*i]
+        mov     [z+8*i], a
+        inc     i                       // inc/dec leave CF alone, so the borrow survives the loop control
+        dec     j
+        jnz     bignum_modsub_subloop
+        sbb     c, c                    // c = all ones if the subtraction borrowed (x < y), else 0
+// Now do a masked addition z := z + [c] * m (the same instructions run whatever the mask is)
+        xor     i, i
+bignum_modsub_addloop:
+        mov     a, [m+8*i]
+        and     a, c                    // a = m[i] if the mask is set, else 0
+        neg     j                       // CF = (j != 0): reload the saved carry (j is 0 on the first pass)
+        adc     [z+8*i], a
+        sbb     j, j                    // j = -carry, saved because the cmp below overwrites CF
+        inc     i
+        cmp     i, k
+        jc      bignum_modsub_addloop   // continue while i < k (unsigned)
+bignum_modsub_end:
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul.S
index a3552679a2..538cce9af7 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_mul.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul.S
@@ -1,3 +1,5 @@
+// $OpenBSD: bignum_mul.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
+//
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
@@ -16,9 +18,8 @@
 // Multiply z := x * y
 // Inputs x[m], y[n]; output z[k]
 //
-//    extern void bignum_mul
+//    extern void bignum_mul(uint64_t k, uint64_t *z, uint64_t m, const uint64_t *x,
-//     (uint64_t k, uint64_t *z,
+//                           uint64_t n, const uint64_t *y);
-//      uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
 //
 // Does the "z := x * y" operation where x is m digits, y is n, result z is k.
 // Truncates the result in general unless k >= m + n
@@ -59,7 +60,7 @@
 S2N_BN_SYMBOL(bignum_mul):
-        _CET_ENDBR
+        _CET_ENDBR
 #if WINDOWS_ABI
        push    rdi
@@ -88,7 +89,7 @@ S2N_BN_SYMBOL(bignum_mul):
 // If we did a multiply-add variant, however, then we could
        test    p, p
-        jz      end
+        jz      bignum_mul_end
 // Set initial 2-part sum to zero (we zero c inside the body)
@@ -99,7 +100,7 @@ S2N_BN_SYMBOL(bignum_mul):
        xor     k, k
-outerloop:
+bignum_mul_outerloop:
 // Zero our carry term first; we eventually want it and a zero is useful now
 // Set a =  max 0 (k + 1 - n), i = min (k + 1) m
@@ -125,11 +126,11 @@ outerloop:
        mov     d, k
        sub     d, i
        sub     i, a
-        jbe     innerend
+        jbe     bignum_mul_innerend
        lea     x,[rcx+8*a]
        lea     y,[r9+8*d-8]
-innerloop:
+bignum_mul_innerloop:
        mov     rax, [y+8*i]
        mul     QWORD PTR  [x]
        add     x, 8
@@ -137,9 +138,9 @@ innerloop:
        adc     h, rdx
        adc     c, 0
        dec     i
-        jnz     innerloop
+        jnz     bignum_mul_innerloop
-innerend:
+bignum_mul_innerend:
        mov     [z], l
        mov     l, h
@@ -147,9 +148,9 @@ innerend:
        add     z, 8
        cmp     k, p
-        jc      outerloop
+        jc      bignum_mul_outerloop
-end:
+bignum_mul_end:
        pop     r15
        pop     r14
        pop     r13
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8.S
new file mode 100644
index 0000000000..d6ad514020
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8.S
@@ -0,0 +1,187 @@
+// $OpenBSD: bignum_mul_4_8.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
+//
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+// ----------------------------------------------------------------------------
+// Multiply z := x * y
+// Inputs x[4], y[4]; output z[8]
+//
+//    extern void bignum_mul_4_8(uint64_t z[static 8], const uint64_t x[static 4],
+//                               const uint64_t y[static 4]);
+//
+// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
+// Microsoft x64 ABI:   RCX = z, RDX = x, R8 = y
+// ----------------------------------------------------------------------------
+#include "s2n_bignum_internal.h"
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_4_8)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_4_8)
+        .text
+// z and x stay in their standard-ABI argument registers (RDI = z, RSI = x)
+#define z rdi
+#define x rsi
+// Copied in or set up
+#define y rcx
+// A zero register
+#define zero rbp
+#define zeroe ebp
+// Add x[arg2] * rdx into slots (arg1+arg2) % 4 and the following one of the rotating window r8..r11:
+// low half via adcx (CF chain), high half via adox (OF chain); the .if ladder stands in for reg[i], reg[i+1]
+.macro mulpadd arg1,arg2
+        mulx    rbx, rax, [x+8*\arg2]   // rbx:rax = rdx * x[arg2]; mulx does not modify flags
+.if ((\arg1 + \arg2) % 4 == 0)
+        adcx    r8, rax
+        adox    r9, rbx
+.elseif ((\arg1 + \arg2) % 4 == 1)
+        adcx    r9, rax
+        adox    r10, rbx
+.elseif ((\arg1 + \arg2) % 4 == 2)
+        adcx    r10, rax
+        adox    r11, rbx
+.elseif ((\arg1 + \arg2) % 4 == 3)
+        adcx    r11, rax
+        adox    r8, rbx                 // the window wraps round from r11 back to r8
+.endif
+.endm
+// Add in the whole row arg1 (y[arg1] * x), store result word arg1, and reuse its register as the new top word
+.macro addrow arg1
+        mov     rdx, [y+8*\arg1]        // rdx is the implicit multiplicand of mulx
+        xor     zeroe, zeroe            // zero = 0, and CF = OF = 0 to start both carry chains afresh
+        mulpadd \arg1, 0
+.if (\arg1 % 4 == 0)
+        mov     [z+8*\arg1],r8
+.elseif (\arg1 % 4 == 1)
+        mov     [z+8*\arg1],r9
+.elseif (\arg1 % 4 == 2)
+        mov     [z+8*\arg1],r10
+.elseif (\arg1 % 4 == 3)
+        mov     [z+8*\arg1],r11
+.endif
+        mulpadd \arg1, 1
+        mulpadd \arg1, 2
+.if (\arg1 % 4 == 0)
+        mulx    r8, rax, [x+24]         // r8 was stored above, so it is free to take the new top word
+        adcx    r11, rax
+        adox    r8, zero                // absorb the pending OF-chain carry
+        adcx    r8, zero                // absorb the pending CF-chain carry
+.elseif (\arg1 % 4 == 1)
+        mulx    r9, rax, [x+24]
+        adcx    r8, rax
+        adox    r9, zero
+        adcx    r9, zero
+.elseif (\arg1 % 4 == 2)
+        mulx    r10, rax, [x+24]
+        adcx    r9, rax
+        adox    r10, zero
+        adcx    r10, zero
+.elseif (\arg1 % 4 == 3)
+        mulx    r11, rax, [x+24]
+        adcx    r10, rax
+        adox    r11, zero
+        adcx    r11, zero
+.endif
+.endm
+S2N_BN_SYMBOL(bignum_mul_4_8):
+        _CET_ENDBR
+#if WINDOWS_ABI
+        push    rdi                     // rdi and rsi are callee-saved in the Microsoft x64 ABI
+        push    rsi
+        mov     rdi, rcx                // move z, x, y into the standard-ABI registers
+        mov     rsi, rdx
+        mov     rdx, r8
+#endif
+// Save more registers to play with (rbp and rbx are callee-saved)
+        push    rbp
+        push    rbx
+// Copy y into a safe register to start with, since rdx is needed as the mulx multiplicand
+        mov     y, rdx
+// Zero a register, which also makes sure we don't get a fake carry-in
+        xor     zeroe, zeroe
+// Do the zeroth row, which is a bit different
+// Write back the zero-zero product and then accumulate
+// [r8,r11,r10,r9] (most significant first) as words 1..4 of y[0] * x
+        mov     rdx, [y]
+        mulx    r9, r8, [x]             // r9:r8 = y[0] * x[0]
+        mov     [z], r8                 // word 0 is final straight away
+        mulx    r10, rbx, [x+8]
+        adcx    r9, rbx
+        mulx    r11, rbx, [x+16]
+        adcx    r10, rbx
+        mulx    r8, rbx, [x+24]         // r8 is reused as the top word now that z[0] is stored
+        adcx    r11, rbx
+        adcx    r8, zero                // fold the last carry into the top word
+// Now all the other rows in a uniform pattern; each addrow stores one more word of z
+        addrow  1
+        addrow  2
+        addrow  3
+// Now write back the additional columns (words 4..7 are left in r8..r11 in that order)
+        mov     [z+32], r8
+        mov     [z+40], r9
+        mov     [z+48], r10
+        mov     [z+56], r11
+// Restore registers and return
+        pop     rbx
+        pop     rbp
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8_alt.S
index 70ff69e372..2592d1d658 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8_alt.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8_alt.S
@@ -1,3 +1,5 @@
+// $OpenBSD: bignum_mul_4_8_alt.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
+//
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
@@ -16,8 +18,8 @@
 // Multiply z := x * y
 // Inputs x[4], y[4]; output z[8]
 //
-//    extern void bignum_mul_4_8_alt
+//    extern void bignum_mul_4_8_alt(uint64_t z[static 8], const uint64_t x[static 4],
-//      (uint64_t z[static 8], uint64_t x[static 4], uint64_t y[static 4]);
+//                                   const uint64_t y[static 4]);
 //
 // Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
 // Microsoft x64 ABI:   RCX = z, RDX = x, R8 = y
@@ -72,7 +74,7 @@
        adc     h, rdx
 S2N_BN_SYMBOL(bignum_mul_4_8_alt):
-        _CET_ENDBR
+        _CET_ENDBR
 #if WINDOWS_ABI
        push    rdi
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12.S
new file mode 100644
index 0000000000..56cbdf06e0
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12.S
@@ -0,0 +1,223 @@
+// $OpenBSD: bignum_mul_6_12.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
+//
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+// ----------------------------------------------------------------------------
+// Multiply z := x * y
+// Inputs x[6], y[6]; output z[12]
+//
+//    extern void bignum_mul_6_12(uint64_t z[static 12], const uint64_t x[static 6],
+//                                const uint64_t y[static 6]);
+//
+// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
+// Microsoft x64 ABI:   RCX = z, RDX = x, R8 = y
+// ----------------------------------------------------------------------------
+#include "s2n_bignum_internal.h"
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_6_12)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_6_12)
+        .text
+// z and x stay in their standard-ABI argument registers (RDI = z, RSI = x)
+#define z rdi
+#define x rsi
+// Copied in or set up
+#define y rcx
+// A zero register
+#define zero rbp
+#define zeroe ebp
+// Add x[arg2] * rdx into slots (arg1+arg2) % 6 and the following one of the rotating window r8..r13:
+// low half via adcx (CF chain), high half via adox (OF chain); the .if ladder stands in for reg[i], reg[i+1]
+.macro mulpadd arg1,arg2
+        mulx    rbx, rax, [x+8*\arg2]   // rbx:rax = rdx * x[arg2]; mulx does not modify flags
+.if ((\arg1 + \arg2) % 6 == 0)
+        adcx    r8, rax
+        adox    r9, rbx
+.elseif ((\arg1 + \arg2) % 6 == 1)
+        adcx    r9, rax
+        adox    r10, rbx
+.elseif ((\arg1 + \arg2) % 6 == 2)
+        adcx    r10, rax
+        adox    r11, rbx
+.elseif ((\arg1 + \arg2) % 6 == 3)
+        adcx    r11, rax
+        adox    r12, rbx
+.elseif ((\arg1 + \arg2) % 6 == 4)
+        adcx    r12, rax
+        adox    r13, rbx
+.elseif ((\arg1 + \arg2) % 6 == 5)
+        adcx    r13, rax
+        adox    r8, rbx                 // the window wraps round from r13 back to r8
+.endif
+.endm
+// Add in the whole row arg1 (y[arg1] * x), store result word arg1, and reuse its register as the new top word
+.macro addrow arg1
+        mov     rdx, [y+8*\arg1]        // rdx is the implicit multiplicand of mulx
+        xor     zeroe, zeroe            // zero = 0, and CF = OF = 0 to start both carry chains afresh
+        mulpadd \arg1, 0
+.if (\arg1 % 6 == 0)
+        mov     [z+8*\arg1],r8
+.elseif (\arg1 % 6 == 1)
+        mov     [z+8*\arg1],r9
+.elseif (\arg1 % 6 == 2)
+        mov     [z+8*\arg1],r10
+.elseif (\arg1 % 6 == 3)
+        mov     [z+8*\arg1],r11
+.elseif (\arg1 % 6 == 4)
+        mov     [z+8*\arg1],r12
+.elseif (\arg1 % 6 == 5)
+        mov     [z+8*\arg1],r13
+.endif
+        mulpadd \arg1, 1
+        mulpadd \arg1, 2
+        mulpadd \arg1, 3
+        mulpadd \arg1, 4
+.if (\arg1 % 6 == 0)
+        mulx    r8, rax, [x+40]         // r8 was stored above, so it is free to take the new top word
+        adcx    r13, rax
+        adox    r8, zero                // absorb the pending OF-chain carry
+        adcx    r8, zero                // absorb the pending CF-chain carry
+.elseif (\arg1 % 6 == 1)
+        mulx    r9, rax, [x+40]
+        adcx    r8, rax
+        adox    r9, zero
+        adcx    r9, zero
+.elseif (\arg1 % 6 == 2)
+        mulx    r10, rax, [x+40]
+        adcx    r9, rax
+        adox    r10, zero
+        adcx    r10, zero
+.elseif (\arg1 % 6 == 3)
+        mulx    r11, rax, [x+40]
+        adcx    r10, rax
+        adox    r11, zero
+        adcx    r11, zero
+.elseif (\arg1 % 6 == 4)
+        mulx    r12, rax, [x+40]
+        adcx    r11, rax
+        adox    r12, zero
+        adcx    r12, zero
+.elseif (\arg1 % 6 == 5)
+        mulx    r13, rax, [x+40]
+        adcx    r12, rax
+        adox    r13, zero
+        adcx    r13, zero
+.endif
+.endm
+S2N_BN_SYMBOL(bignum_mul_6_12):
+        _CET_ENDBR
+#if WINDOWS_ABI
+        push    rdi                     // rdi and rsi are callee-saved in the Microsoft x64 ABI
+        push    rsi
+        mov     rdi, rcx                // move z, x, y into the standard-ABI registers
+        mov     rsi, rdx
+        mov     rdx, r8
+#endif
+// Save more registers to play with (rbp, rbx, r12 and r13 are callee-saved)
+        push    rbp
+        push    rbx
+        push    r12
+        push    r13
+// Copy y into a safe register to start with, since rdx is needed as the mulx multiplicand
+        mov     y, rdx
+// Zero a register, which also makes sure we don't get a fake carry-in
+        xor     zeroe, zeroe
+// Do the zeroth row, which is a bit different
+// Write back the zero-zero product and then accumulate
+// [r8,r13,r12,r11,r10,r9] (most significant first) as words 1..6 of y[0] * x
+        mov     rdx, [y]
+        mulx    r9, r8, [x]             // r9:r8 = y[0] * x[0]
+        mov     [z], r8                 // word 0 is final straight away
+        mulx    r10, rbx, [x+8]
+        adcx    r9, rbx
+        mulx    r11, rbx, [x+16]
+        adcx    r10, rbx
+        mulx    r12, rbx, [x+24]
+        adcx    r11, rbx
+        mulx    r13, rbx, [x+32]
+        adcx    r12, rbx
+        mulx    r8, rbx, [x+40]         // r8 is reused as the top word now that z[0] is stored
+        adcx    r13, rbx
+        adcx    r8, zero                // fold the last carry into the top word
+// Now all the other rows in a uniform pattern; each addrow stores one more word of z
+        addrow  1
+        addrow  2
+        addrow  3
+        addrow  4
+        addrow  5
+// Now write back the additional columns (words 6..11 are left in r8..r13 in that order)
+        mov     [z+48], r8
+        mov     [z+56], r9
+        mov     [z+64], r10
+        mov     [z+72], r11
+        mov     [z+80], r12
+        mov     [z+88], r13
+// Restore registers and return
+        pop     r13
+        pop     r12
+        pop     rbx
+        pop     rbp
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12_alt.S
new file mode 100644
index 0000000000..077c52b38e
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12_alt.S
@@ -0,0 +1,199 @@
+// $OpenBSD: bignum_mul_6_12_alt.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
+//
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+// ----------------------------------------------------------------------------
+// Multiply z := x * y
+// Inputs x[6], y[6]; output z[12]
+//
+//    extern void bignum_mul_6_12_alt(uint64_t z[static 12],
+//                                    const uint64_t x[static 6],
+//                                    const uint64_t y[static 6]);
+//
+// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
+// Microsoft x64 ABI:   RCX = z, RDX = x, R8 = y
+// ----------------------------------------------------------------------------
+#include "s2n_bignum_internal.h"
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_6_12_alt)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_6_12_alt)
+        .text
+// These are actually right, i.e. z and x stay in their standard-ABI argument registers
+#define z rdi
+#define x rsi
+// This is moved from rdx to free it for muls
+#define y rcx
+// Other variables used as a rotating 3-word window to add terms to
+#define t0 r8
+#define t1 r9
+#define t2 r10
+// Macro for the key "multiply and add to (c,h,l)" step: (c,h,l) += numa * numb, using rax and rdx as scratch
+#define combadd(c,h,l,numa,numb)                \
+        mov     rax, numa;                      \
+        mul     QWORD PTR numb;                 \
+        add     l, rax;                         \
+        adc     h, rdx;                         \
+        adc     c, 0
+// A minutely shorter form for when c = 0 initially: "adc c, c" then computes 0 + 0 + CF, i.e. c = carry out of h
+#define combadz(c,h,l,numa,numb)                \
+        mov     rax, numa;                      \
+        mul     QWORD PTR numb;                 \
+        add     l, rax;                         \
+        adc     h, rdx;                         \
+        adc     c, c
+// A short form where we don't expect a top carry: (h,l) += numa * numb, with no third word tracking a carry out of h
+#define combads(h,l,numa,numb)                  \
+        mov     rax, numa;                      \
+        mul     QWORD PTR numb;                 \
+        add     l, rax;                         \
+        adc     h, rdx
+S2N_BN_SYMBOL(bignum_mul_6_12_alt):
+        _CET_ENDBR
+#if WINDOWS_ABI
+        push    rdi                     // rdi and rsi are overwritten just below and restored before ret
+        push    rsi
+        mov     rdi, rcx                // z
+        mov     rsi, rdx                // x
+        mov     rdx, r8                 // y
+#endif
+// Copy y into a safe register to start with, because every mul overwrites rdx
+        mov     y, rdx
+// Result term 0: z[0] is the low word of x[0] * y[0]; the high word starts the window
+        mov     rax, [x]
+        mul     QWORD PTR [y]
+        mov     [z], rax
+        mov     t0, rdx
+        xor     t1, t1
+// Result term 1: sum of x[i] * y[j] with i + j = 1, window (c,h,l) = (t2,t1,t0)
+        xor     t2, t2
+        combads(t1,t0,[x],[y+8])
+        combadz(t2,t1,t0,[x+8],[y])
+        mov     [z+8], t0
+// Result term 2: sum of x[i] * y[j] with i + j = 2, window (c,h,l) = (t0,t2,t1)
+        xor     t0, t0                  // the word just stored is recycled as the new zeroed top word
+        combadz(t0,t2,t1,[x],[y+16])
+        combadd(t0,t2,t1,[x+8],[y+8])
+        combadd(t0,t2,t1,[x+16],[y])
+        mov     [z+16], t1
+// Result term 3: sum of x[i] * y[j] with i + j = 3, window (c,h,l) = (t1,t0,t2)
+        xor     t1, t1
+        combadz(t1,t0,t2,[x],[y+24])
+        combadd(t1,t0,t2,[x+8],[y+16])
+        combadd(t1,t0,t2,[x+16],[y+8])
+        combadd(t1,t0,t2,[x+24],[y])
+        mov     [z+24], t2
+// Result term 4: sum of x[i] * y[j] with i + j = 4, window (c,h,l) = (t2,t1,t0)
+        xor     t2, t2
+        combadz(t2,t1,t0,[x],[y+32])
+        combadd(t2,t1,t0,[x+8],[y+24])
+        combadd(t2,t1,t0,[x+16],[y+16])
+        combadd(t2,t1,t0,[x+24],[y+8])
+        combadd(t2,t1,t0,[x+32],[y])
+        mov     [z+32], t0
+// Result term 5: sum of x[i] * y[j] with i + j = 5, window (c,h,l) = (t0,t2,t1)
+        xor     t0, t0
+        combadz(t0,t2,t1,[x],[y+40])
+        combadd(t0,t2,t1,[x+8],[y+32])
+        combadd(t0,t2,t1,[x+16],[y+24])
+        combadd(t0,t2,t1,[x+24],[y+16])
+        combadd(t0,t2,t1,[x+32],[y+8])
+        combadd(t0,t2,t1,[x+40],[y])
+        mov     [z+40], t1
+// Result term 6: sum of x[i] * y[j] with i + j = 6, window (c,h,l) = (t1,t0,t2)
+        xor     t1, t1
+        combadz(t1,t0,t2,[x+8],[y+40])
+        combadd(t1,t0,t2,[x+16],[y+32])
+        combadd(t1,t0,t2,[x+24],[y+24])
+        combadd(t1,t0,t2,[x+32],[y+16])
+        combadd(t1,t0,t2,[x+40],[y+8])
+        mov     [z+48], t2
+// Result term 7: sum of x[i] * y[j] with i + j = 7, window (c,h,l) = (t2,t1,t0)
+        xor     t2, t2
+        combadz(t2,t1,t0,[x+16],[y+40])
+        combadd(t2,t1,t0,[x+24],[y+32])
+        combadd(t2,t1,t0,[x+32],[y+24])
+        combadd(t2,t1,t0,[x+40],[y+16])
+        mov     [z+56], t0
+// Result term 8: sum of x[i] * y[j] with i + j = 8, window (c,h,l) = (t0,t2,t1)
+        xor     t0, t0
+        combadz(t0,t2,t1,[x+24],[y+40])
+        combadd(t0,t2,t1,[x+32],[y+32])
+        combadd(t0,t2,t1,[x+40],[y+24])
+        mov     [z+64], t1
+// Result term 9: sum of x[i] * y[j] with i + j = 9, window (c,h,l) = (t1,t0,t2)
+        xor     t1, t1
+        combadz(t1,t0,t2,[x+32],[y+40])
+        combadd(t1,t0,t2,[x+40],[y+32])
+        mov     [z+72], t2
+// Result term 10: only x[5] * y[5] is left; the product fits in 12 words so t1 cannot carry out
+        combads(t1,t0,[x+40],[y+40])
+        mov     [z+80], t0
+// Result term 11: the remaining top word of the window
+        mov     [z+88], t1
+// Return
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16.S
new file mode 100644
index 0000000000..faa0196d8e
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16.S
@@ -0,0 +1,273 @@
+// $OpenBSD: bignum_mul_8_16.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
+//
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+// ----------------------------------------------------------------------------
+// Multiply z := x * y
+// Inputs x[8], y[8]; output z[16]
+//
+//    extern void bignum_mul_8_16(uint64_t z[static 16], const uint64_t x[static 8],
+//                                const uint64_t y[static 8]);
+//
+// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
+// Microsoft x64 ABI:   RCX = z, RDX = x, R8 = y
+// ----------------------------------------------------------------------------
+#include "s2n_bignum_internal.h"
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_8_16)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_8_16)
+        .text
+// These are actually right, i.e. z and x stay in their standard-ABI argument registers
+#define z rdi
+#define x rsi
+// Copied in or set up
+#define y rcx
+// A zero register
+#define zero rbp
+#define zeroe ebp
+// mulpadd i, j adds x[i] * rdx (now assumed = y[j]) into the window at i+j
+.macro mulpadd arg1,arg2
+        mulx    rbx, rax, [x+8*\arg1]   // rbx:rax = rdx * x[i]; mulx does not modify flags
+.if ((\arg1 + \arg2) % 8 == 0)
+        adcx    r8, rax                 // low word goes in on the CF carry chain
+        adox    r9, rbx                 // high word goes into the next register on the separate OF chain
+.elseif ((\arg1 + \arg2) % 8 == 1)
+        adcx    r9, rax
+        adox    r10, rbx
+.elseif ((\arg1 + \arg2) % 8 == 2)
+        adcx    r10, rax
+        adox    r11, rbx
+.elseif ((\arg1 + \arg2) % 8 == 3)
+        adcx    r11, rax
+        adox    r12, rbx
+.elseif ((\arg1 + \arg2) % 8 == 4)
+        adcx    r12, rax
+        adox    r13, rbx
+.elseif ((\arg1 + \arg2) % 8 == 5)
+        adcx    r13, rax
+        adox    r14, rbx
+.elseif ((\arg1 + \arg2) % 8 == 6)
+        adcx    r14, rax
+        adox    r15, rbx
+.elseif ((\arg1 + \arg2) % 8 == 7)
+        adcx    r15, rax
+        adox    r8, rbx                 // the eight-register window wraps round from r15 to r8
+.endif
+.endm
+// mulpade i, j adds x[i] * rdx (now assumed = y[j]) into the window at i+j
+// but re-creates the top word assuming nothing to add there
+.macro mulpade arg1,arg2
+.if ((\arg1 + \arg2) % 8 == 0)
+        mulx    r9, rax, [x+8*\arg1]    // high word overwrites r9 rather than being added to it
+        adcx    r8, rax
+        adox    r9, zero                // fold the pending OF carry into the fresh top word
+.elseif ((\arg1 + \arg2) % 8 == 1)
+        mulx    r10, rax, [x+8*\arg1]
+        adcx    r9, rax
+        adox    r10, zero
+.elseif ((\arg1 + \arg2) % 8 == 2)
+        mulx    r11, rax, [x+8*\arg1]
+        adcx    r10, rax
+        adox    r11, zero
+.elseif ((\arg1 + \arg2) % 8 == 3)
+        mulx    r12, rax, [x+8*\arg1]
+        adcx    r11, rax
+        adox    r12, zero
+.elseif ((\arg1 + \arg2) % 8 == 4)
+        mulx    r13, rax, [x+8*\arg1]
+        adcx    r12, rax
+        adox    r13, zero
+.elseif ((\arg1 + \arg2) % 8 == 5)
+        mulx    r14, rax, [x+8*\arg1]
+        adcx    r13, rax
+        adox    r14, zero
+.elseif ((\arg1 + \arg2) % 8 == 6)
+        mulx    r15, rax, [x+8*\arg1]
+        adcx    r14, rax
+        adox    r15, zero
+.elseif ((\arg1 + \arg2) % 8 == 7)
+        mulx    r8, rax, [x+8*\arg1]
+        adcx    r15, rax
+        adox    r8, zero
+.endif
+.endm
+// Add in the whole j'th row: window += y[j] * x, storing the finished word z[j] on the way
+.macro addrow arg1
+        mov     rdx, [y+8*\arg1]        // rdx = y[j], the implicit multiplier for every mulx in this row
+        xor     zeroe, zeroe            // re-zero the zero register and clear CF and OF for the two carry chains
+        mulpadd 0, \arg1
+.if (\arg1 % 8 == 0)
+        mov     [z+8*\arg1],r8          // nothing later in the row adds to this word again, so store it now
+.elseif (\arg1 % 8 == 1)
+        mov     [z+8*\arg1],r9
+.elseif (\arg1 % 8 == 2)
+        mov     [z+8*\arg1],r10
+.elseif (\arg1 % 8 == 3)
+        mov     [z+8*\arg1],r11
+.elseif (\arg1 % 8 == 4)
+        mov     [z+8*\arg1],r12
+.elseif (\arg1 % 8 == 5)
+        mov     [z+8*\arg1],r13
+.elseif (\arg1 % 8 == 6)
+        mov     [z+8*\arg1],r14
+.elseif (\arg1 % 8 == 7)
+        mov     [z+8*\arg1],r15
+.endif
+        mulpadd 1, \arg1
+        mulpadd 2, \arg1
+        mulpadd 3, \arg1
+        mulpadd 4, \arg1
+        mulpadd 5, \arg1
+        mulpadd 6, \arg1
+        mulpade 7, \arg1                // re-creates the register just stored as the new top word
+.if (\arg1 % 8 == 0)
+        adc     r8, zero                // fold the final CF carry into the new top word
+.elseif (\arg1 % 8 == 1)
+        adc     r9, zero
+.elseif (\arg1 % 8 == 2)
+        adc     r10, zero
+.elseif (\arg1 % 8 == 3)
+        adc     r11, zero
+.elseif (\arg1 % 8 == 4)
+        adc     r12, zero
+.elseif (\arg1 % 8 == 5)
+        adc     r13, zero
+.elseif (\arg1 % 8 == 6)
+        adc     r14, zero
+.elseif (\arg1 % 8 == 7)
+        adc     r15, zero
+.endif
+.endm
+S2N_BN_SYMBOL(bignum_mul_8_16):
+        _CET_ENDBR
+#if WINDOWS_ABI
+        push    rdi                     // rdi and rsi are overwritten just below and restored before ret
+        push    rsi
+        mov     rdi, rcx                // z
+        mov     rsi, rdx                // x
+        mov     rdx, r8                 // y
+#endif
+// Save more registers to play with (popped in reverse order before returning)
+        push    rbp
+        push    rbx
+        push    r12
+        push    r13
+        push    r14
+        push    r15
+// Copy y out of rdx, which is reloaded below with the multiplier word for each mulx
+        mov     y, rdx
+// Zero a register, which also makes sure we don't get a fake carry-in
+        xor     zeroe, zeroe
+// Do the zeroth row, which is a bit different
+// Write back the zero-zero product and then accumulate
+// r8,r15,r14,r13,r12,r11,r10,r9 as y[0] * x from 1..8
+        mov     rdx, [y]
+        mulx    r9, r8, [x]             // r9:r8 = y[0] * x[0]; mulx does not modify flags
+        mov     [z], r8                 // lowest word is final, so r8 is free to become the top word
+        mulx    r10, rbx, [x+8]
+        adc     r9, rbx                 // low half of each product is added to the high half of the previous one
+        mulx    r11, rbx, [x+16]
+        adc     r10, rbx
+        mulx    r12, rbx, [x+24]
+        adc     r11, rbx
+        mulx    r13, rbx, [x+32]
+        adc     r12, rbx
+        mulx    r14, rbx, [x+40]
+        adc     r13, rbx
+        mulx    r15, rbx, [x+48]
+        adc     r14, rbx
+        mulx    r8, rbx, [x+56]
+        adc     r15, rbx
+        adc     r8, zero                // absorb the final carry into the top word
+// Now all the other rows in a uniform pattern (each addrow j also stores z[j])
+        addrow  1
+        addrow  2
+        addrow  3
+        addrow  4
+        addrow  5
+        addrow  6
+        addrow  7
+// Now write back the additional columns: r8..r15 are stored as z[8..15]
+        mov     [z+64], r8
+        mov     [z+72], r9
+        mov     [z+80], r10
+        mov     [z+88], r11
+        mov     [z+96], r12
+        mov     [z+104], r13
+        mov     [z+112], r14
+        mov     [z+120], r15
+// Real epilog
+        pop     r15
+        pop     r14
+        pop     r13
+        pop     r12
+        pop     rbx
+        pop     rbp
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16_alt.S
index 066403b074..0e30b9170f 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16_alt.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16_alt.S
@@ -1,3 +1,5 @@
+// $OpenBSD: bignum_mul_8_16_alt.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
+//
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
@@ -16,8 +18,9 @@
 // Multiply z := x * y
 // Inputs x[8], y[8]; output z[16]
 //
-//    extern void bignum_mul_8_16_alt
+//    extern void bignum_mul_8_16_alt(uint64_t z[static 16],
-//     (uint64_t z[static 16], uint64_t x[static 8], uint64_t y[static 8]);
+//                                    const uint64_t x[static 8],
+//                                    const uint64_t y[static 8]);
 //
 // Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
 // Microsoft x64 ABI:   RCX = z, RDX = x, R8 = y
@@ -72,7 +75,7 @@
        adc     h, rdx
 S2N_BN_SYMBOL(bignum_mul_8_16_alt):
-        _CET_ENDBR
+        _CET_ENDBR
 #if WINDOWS_ABI
        push    rdi
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr.S
index 54e3f59442..86f1af2ac4 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr.S
@@ -1,3 +1,5 @@
+// $OpenBSD: bignum_sqr.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
+//
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
@@ -16,8 +18,7 @@
 // Square z := x^2
 // Input x[n]; output z[k]
 //
-//    extern void bignum_sqr
+//    extern void bignum_sqr(uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x);
-//     (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x);
 //
 // Does the "z := x^2" operation where x is n digits and result z is k.
 // Truncates the result in general unless k >= 2 * n
@@ -62,7 +63,7 @@
 #define llshort ebp
 S2N_BN_SYMBOL(bignum_sqr):
-        _CET_ENDBR
+        _CET_ENDBR
 #if WINDOWS_ABI
        push    rdi
@@ -86,7 +87,7 @@ S2N_BN_SYMBOL(bignum_sqr):
 // If p = 0 the result is trivial and nothing needs doing
        test    p, p
-        jz      end
+        jz      bignum_sqr_end
 // initialize (hh,ll) = 0
@@ -97,7 +98,7 @@ S2N_BN_SYMBOL(bignum_sqr):
        xor     k, k
-outerloop:
+bignum_sqr_outerloop:
 // First let bot = MAX 0 (k + 1 - n) and top = MIN (k + 1) n
 // We want to accumulate all x[i] * x[k - i] for bot <= i < top
@@ -122,7 +123,7 @@ outerloop:
 // If htop <= bot then main doubled part of the sum is empty
        cmp     i, htop
-        jnc     nosumming
+        jnc     bignum_sqr_nosumming
 // Use a moving pointer for [y] = x[k-i] for the cofactor
@@ -132,7 +133,7 @@ outerloop:
 // Do the main part of the sum x[i] * x[k - i] for 2 * i < k
-innerloop:
+bignum_sqr_innerloop:
        mov     a, [x+8*i]
        mul     QWORD PTR [y]
        add     l, a
@@ -141,7 +142,7 @@ innerloop:
        sub     y, 8
        inc     i
        cmp     i, htop
-        jc      innerloop
+        jc      bignum_sqr_innerloop
 // Now double it
@@ -151,11 +152,11 @@ innerloop:
 // If k is even (which means 2 * i = k) and i < n add the extra x[i]^2 term
-nosumming:
+bignum_sqr_nosumming:
        test    k, 1
-        jnz     innerend
+        jnz     bignum_sqr_innerend
        cmp     i, n
-        jnc     innerend
+        jnc     bignum_sqr_innerend
        mov     a, [x+8*i]
        mul     a
@@ -165,7 +166,7 @@ nosumming:
 // Now add the local sum into the global sum, store and shift
-innerend:
+bignum_sqr_innerend:
        add     l, ll
        mov     [z+8*k], l
        adc     h, hh
@@ -175,11 +176,11 @@ innerend:
        inc     k
        cmp     k, p
-        jc      outerloop
+        jc      bignum_sqr_outerloop
 // Restore registers and return
-end:
+bignum_sqr_end:
        pop     r15
        pop     r14
        pop     r13
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8.S
new file mode 100644
index 0000000000..25664782f7
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8.S
@@ -0,0 +1,158 @@
+// $OpenBSD: bignum_sqr_4_8.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
+//
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+// ----------------------------------------------------------------------------
+// Square, z := x^2
+// Input x[4]; output z[8]
+//
+//    extern void bignum_sqr_4_8(uint64_t z[static 8], const uint64_t x[static 4]);
+//
+// Standard x86-64 ABI: RDI = z, RSI = x
+// Microsoft x64 ABI:   RCX = z, RDX = x
+// ----------------------------------------------------------------------------
+#include "s2n_bignum_internal.h"
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_4_8)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_4_8)
+        .text
+// These are actually right, i.e. z and x stay in their standard-ABI argument registers
+#define z rdi
+#define x rsi
+// A zero register
+#define zero rbp
+#define zeroe ebp
+// Other registers
+#define d1 r8
+#define d2 r9
+#define d3 r10
+#define d4 r11
+#define d5 r12
+#define d6 r13
+S2N_BN_SYMBOL(bignum_sqr_4_8):
+        _CET_ENDBR
+#if WINDOWS_ABI
+        push    rdi                     // rdi and rsi are overwritten just below and restored before ret
+        push    rsi
+        mov     rdi, rcx                // z
+        mov     rsi, rdx                // x
+#endif
+// Save more registers to play with (rbp is the zero register, r12 and r13 are d5 and d6)
+        push    rbp
+        push    r12
+        push    r13
+// Set up an initial window [d6;...d1] = [23;03;01], where ij stands for x[i] * x[j]
+        mov     rdx, [x]
+        mulx    d2, d1, [x+8]
+        mulx    d4, d3, [x+24]
+        mov     rdx, [x+16]
+        mulx    d6, d5, [x+24]
+// Clear our zero register, and also initialize the flags for the carry chain
+        xor     zeroe, zeroe
+// Chain in the addition of 02 + 12 + 13 to that window (no carry-out possible)
+// This gives all the "heterogeneous" terms of the squaring ready to double
+        mulx    rcx, rax, [x]           // rdx is still x[2] here, so this is the 02 product
+        adcx    d2, rax                 // low words ride the CF chain ...
+        adox    d3, rcx                 // ... and high words the independent OF chain
+        mulx    rcx, rax, [x+8]
+        adcx    d3, rax
+        adox    d4, rcx
+        mov     rdx, [x+24]
+        mulx    rcx, rax, [x+8]
+        adcx    d4, rax
+        adox    d5, rcx
+        adcx    d5, zero                // flush both pending carries up through d5 and d6
+        adox    d6, zero
+        adcx    d6, zero
+// In principle this is otiose as CF and OF carries are absorbed at this point
+// However it seems helpful for the OOO engine to be told it's a fresh start
+        xor     zeroe, zeroe
+// Double and add to the 00 + 11 + 22 + 33 terms
+//
+// We could use shift-double but this seems tidier and in larger squarings
+// it was actually more efficient. I haven't experimented with this small
+// case to see how much that matters. Note: the writeback here is sprinkled
+// into the sequence in such a way that things still work if z = x, i.e. if
+// the output overwrites the input buffer and beyond.
+        mov     rdx, [x]
+        mulx    rdx, rax, rdx           // rdx:rax = x[0]^2
+        mov     [z], rax
+        adcx    d1, d1                  // doubling runs on the CF chain ...
+        adox    d1, rdx                 // ... while the square terms are added on the OF chain
+        mov     rdx, [x+8]              // x[1] is read before z[1] is written, in case z = x
+        mov     [z+8], d1
+        mulx    rdx, rax, rdx
+        adcx    d2, d2
+        adox    d2, rax
+        adcx    d3, d3
+        adox    d3, rdx
+        mov     rdx, [x+16]
+        mov     [z+16], d2
+        mulx    rdx, rax, rdx
+        adcx    d4, d4
+        adox    d4, rax
+        adcx    d5, d5
+        adox    d5, rdx
+        mov     rdx, [x+24]
+        mov     [z+24], d3
+        mulx    rdx, rax, rdx
+        mov     [z+32], d4
+        adcx    d6, d6
+        mov     [z+40], d5
+        adox    d6, rax
+        mov     [z+48], d6
+        adcx    rdx, zero               // top word = high half of x[3]^2 plus the final CF and OF carries
+        adox    rdx, zero
+        mov     [z+56], rdx
+// Restore saved registers and return
+        pop     r13
+        pop     r12
+        pop     rbp
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8_alt.S
index 7c534ae907..7eafac3284 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8_alt.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8_alt.S
@@ -1,3 +1,5 @@
+// $OpenBSD: bignum_sqr_4_8_alt.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
+//
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
@@ -16,8 +18,8 @@
 // Square, z := x^2
 // Input x[4]; output z[8]
 //
-//    extern void bignum_sqr_4_8_alt
+//    extern void bignum_sqr_4_8_alt(uint64_t z[static 8],
-//      (uint64_t z[static 8], uint64_t x[static 4]);
+//                                   const uint64_t x[static 4]);
 //
 // Standard x86-64 ABI: RDI = z, RSI = x
 // Microsoft x64 ABI:   RCX = z, RDX = x
@@ -71,7 +73,7 @@
        adc     c, 0
 S2N_BN_SYMBOL(bignum_sqr_4_8_alt):
-        _CET_ENDBR
+        _CET_ENDBR
 #if WINDOWS_ABI
        push    rdi
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12.S
new file mode 100644
index 0000000000..3f055e8b75
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12.S
@@ -0,0 +1,227 @@
+// $OpenBSD: bignum_sqr_6_12.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
+//
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+// ----------------------------------------------------------------------------
+// Square, z := x^2
+// Input x[6]; output z[12]
+//
+//    extern void bignum_sqr_6_12(uint64_t z[static 12], const uint64_t x[static 6]);
+//
+// Standard x86-64 ABI: RDI = z, RSI = x
+// Microsoft x64 ABI:   RCX = z, RDX = x
+// ----------------------------------------------------------------------------
+#include "s2n_bignum_internal.h"
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_6_12)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_6_12)
+        .text
+// These are actually right, i.e. z and x stay in their standard-ABI argument registers
+#define z rdi
+#define x rsi
+// A zero register
+#define zero rbp
+#define zeroe ebp
+// Other registers
+#define d1 r8
+#define d2 r9
+#define d3 r10
+#define d4 r11
+#define d5 r12
+#define d6 r13
+#define d7 r14
+#define d8 r15
+#define d9 rbx
+// Care is needed: re-using the zero register
+#define d10 rbp
+S2N_BN_SYMBOL(bignum_sqr_6_12):
+        _CET_ENDBR
+#if WINDOWS_ABI
+        push    rdi                     // rdi and rsi are overwritten just below and restored before ret
+        push    rsi
+        mov     rdi, rcx                // z
+        mov     rsi, rdx                // x
+#endif
+// Save more registers to play with (popped in reverse order before returning)
+        push    rbp
+        push    rbx
+        push    r12
+        push    r13
+        push    r14
+        push    r15
+// Set up an initial window [d8;...d1] = [34;05;03;01], where ij stands for x[i] * x[j]
+        mov     rdx, [x]
+        mulx    d2, d1, [x+8]
+        mulx    d4, d3, [x+24]
+        mulx    d6, d5, [x+40]
+        mov     rdx, [x+24]
+        mulx    d8, d7, [x+32]
+// Clear our zero register, and also initialize the flags for the carry chain
+        xor     zeroe, zeroe
+// Chain in the addition of 02 + 12 + 13 + 14 + 15 to that window
+// (no carry-out possible since we add it to the top of a product)
+        mov     rdx, [x+16]
+        mulx    rcx, rax, [x]
+        adcx    d2, rax                 // low words ride the CF chain ...
+        adox    d3, rcx                 // ... and high words the independent OF chain
+        mulx    rcx, rax, [x+8]
+        adcx    d3, rax
+        adox    d4, rcx
+        mov     rdx, [x+8]
+        mulx    rcx, rax, [x+24]
+        adcx    d4, rax
+        adox    d5, rcx
+        mulx    rcx, rax, [x+32]
+        adcx    d5, rax
+        adox    d6, rcx
+        mulx    rcx, rax, [x+40]
+        adcx    d6, rax
+        adox    d7, rcx
+        adcx    d7, zero                // flush both pending carries up through d7 and d8
+        adox    d8, zero
+        adcx    d8, zero
+// Again zero out the flags. Actually they are already cleared but it may
+// help decouple these in the OOO engine not to wait for the chain above
+        xor     zeroe, zeroe
+// Now chain in the 04 + 23 + 24 + 25 + 35 + 45 terms
+// We are running out of registers and here our zero register is not zero!
+        mov     rdx, [x+32]
+        mulx    rcx, rax, [x]
+        adcx    d4, rax
+        adox    d5, rcx
+        mov     rdx, [x+16]
+        mulx    rcx, rax, [x+24]
+        adcx    d5, rax
+        adox    d6, rcx
+        mulx    rcx, rax, [x+32]
+        adcx    d6, rax
+        adox    d7, rcx
+        mulx    rcx, rax, [x+40]
+        adcx    d7, rax
+        adox    d8, rcx
+        mov     rdx, [x+24]
+        mulx    d9, rax, [x+40]         // high word creates d9 directly instead of being added to it
+        adcx    d8, rax
+        adox    d9, zero                // last use of rbp as zero; it becomes d10 just below
+        mov     rdx, [x+32]
+        mulx    d10, rax, [x+40]        // d10 is rbp, so the zero register is gone from here on
+        adcx    d9, rax
+        mov     eax, 0                  // zero via mov, not xor, so that CF and OF stay live
+        adox    d10, rax
+        adcx    d10, rax
+// Again, just for a clear fresh start for the flags
+        xor     eax, eax
+// Double and add to the 00 + 11 + 22 + 33 + 44 + 55 terms
+//
+// We could use shift-double but this seems tidier and in larger squarings
+// it was actually more efficient. I haven't experimented with this small
+// case to see how much that matters. Note: the writeback here is sprinkled
+// into the sequence in such a way that things still work if z = x, i.e. if
+// the output overwrites the input buffer and beyond.
+        mov     rdx, [x]
+        mulx    rdx, rax, rdx           // rdx:rax = x[0]^2
+        mov     [z], rax
+        adcx    d1, d1                  // doubling runs on the CF chain ...
+        adox    d1, rdx                 // ... while the square terms are added on the OF chain
+        mov     rdx, [x+8]              // x[1] is read before z[1] is written, in case z = x
+        mov     [z+8], d1
+        mulx    rdx, rax, rdx
+        adcx    d2, d2
+        adox    d2, rax
+        adcx    d3, d3
+        adox    d3, rdx
+        mov     rdx, [x+16]
+        mov     [z+16], d2
+        mulx    rdx, rax, rdx
+        adcx    d4, d4
+        adox    d4, rax
+        adcx    d5, d5
+        adox    d5, rdx
+        mov     rdx, [x+24]
+        mov     [z+24], d3
+        mulx    rdx, rax, rdx
+        adcx    d6, d6
+        adox    d6, rax
+        adcx    d7, d7
+        adox    d7, rdx
+        mov     rdx, [x+32]
+        mov     [z+32], d4
+        mulx    rdx, rax, rdx
+        adcx    d8, d8
+        adox    d8, rax
+        adcx    d9, d9
+        adox    d9, rdx
+        mov     rdx, [x+40]
+        mov     [z+40], d5
+        mulx    rdx, rax, rdx
+        mov     [z+48], d6
+        adcx    d10, d10
+        mov     [z+56], d7
+        adox    d10, rax
+        mov     [z+64], d8
+        mov     eax, 0                  // zero via mov, not xor: CF and OF are still needed below
+        mov     [z+72], d9
+        adcx    rdx, rax                // top word = high half of x[5]^2 plus the final CF and OF carries
+        mov     [z+80], d10
+        adox    rdx, rax
+        mov     [z+88], rdx
+// Restore saved registers and return
+        pop     r15
+        pop     r14
+        pop     r13
+        pop     r12
+        pop     rbx
+        pop     rbp
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12_alt.S
new file mode 100644
index 0000000000..eb43b0a15b
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12_alt.S
@@ -0,0 +1,210 @@
+// $OpenBSD: bignum_sqr_6_12_alt.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
+//
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+// ----------------------------------------------------------------------------
+// Square, z := x^2
+// Input x[6]; output z[12]
+//
+//    extern void bignum_sqr_6_12_alt(uint64_t z[static 12],
+//                                    const uint64_t x[static 6]);
+//
+// Standard x86-64 ABI: RDI = z, RSI = x
+// Microsoft x64 ABI:   RCX = z, RDX = x
+// ----------------------------------------------------------------------------
+#include "s2n_bignum_internal.h"
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_6_12_alt)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_6_12_alt)
+        .text
+// Input arguments: z is the output pointer and x the input pointer (these
+// are the System V registers; the Windows entry sequence copies rcx and
+// rdx into them first)
+#define z rdi
+#define x rsi
+// Other variables used as a rotating 3-word window to add terms to.
+// For each result term one of them is the low word (which gets stored to
+// z), one the high word and one the carry word; the roles then rotate.
+#define t0 r8
+#define t1 r9
+#define t2 r10
+// Additional temporaries for local windows to share doublings: the
+// off-diagonal products of one result term are summed here first so the
+// sum is doubled once instead of doubling each product separately
+#define u0 rcx
+#define u1 r11
+// Macro for the key "multiply and add to (c,h,l)" step:
+// (c,h,l) += numa * numb, where (c,h,l) is a 3-word accumulator with l the
+// least significant word. numb is used with an explicit QWORD PTR size, so
+// it has to be a memory operand. The 128-bit product is formed in rdx:rax
+// and the carry out of h is absorbed into c.
+// Clobbers rax, rdx and the flags.
+#define combadd(c,h,l,numa,numb)                \
+        mov     rax, numa;                      \
+        mul     QWORD PTR numb;                 \
+        add     l, rax;                         \
+        adc     h, rdx;                         \
+        adc     c, 0
+// Set up initial window (c,h,l) = numa * numb, that is c = 0 and (h,l) is
+// the full 128-bit product. Whatever c, h and l held before is discarded.
+// The xor sits after the mul; nothing in the macro reads the flags, so
+// the flag clobbering is harmless here.
+// Clobbers rax, rdx and the flags.
+#define combaddz(c,h,l,numa,numb)               \
+        mov     rax, numa;                      \
+        mul     QWORD PTR numb;                 \
+        xor     c, c;                           \
+        mov     l, rax;                         \
+        mov     h, rdx
+// Doubling step (c,h,l) = 2 * (c,hh,ll) + (0,h,l)
+// On entry (c,hh,ll) is a 3-word sum of off-diagonal products and (h,l)
+// the two words carried over from the previous result term. The sum is
+// doubled in place (so hh and ll are destroyed), then (hh,ll) is added
+// into (h,l) and the final carry is absorbed into c.
+// Clobbers the flags.
+#define doubladd(c,h,l,hh,ll)                   \
+        add     ll, ll;                         \
+        adc     hh, hh;                         \
+        adc     c, c;                           \
+        add     l, ll;                          \
+        adc     h, hh;                          \
+        adc     c, 0
+// Square term incorporation (c,h,l) += numa^2
+// The value is loaded into rax and multiplied by itself, giving the
+// 128-bit square in rdx:rax; the carry out of h is absorbed into c.
+// Clobbers rax, rdx and the flags.
+#define combadd1(c,h,l,numa)                    \
+        mov     rax, numa;                      \
+        mul     rax;                            \
+        add     l, rax;                         \
+        adc     h, rdx;                         \
+        adc     c, 0
+// A short form where we don't expect a top carry: (h,l) += numa^2.
+// Any carry out of h is not recorded, so this is only for a point where
+// the sum is known to fit in the two words.
+// Clobbers rax, rdx and the flags.
+#define combads(h,l,numa)                       \
+        mov     rax, numa;                      \
+        mul     rax;                            \
+        add     l, rax;                         \
+        adc     h, rdx
+// A version doubling directly before adding, for single non-square terms:
+// (c,h,l) += 2 * numa * numb
+// The product in rdx:rax is doubled in place; the first "adc c, 0" picks
+// up the bit shifted out of rdx by the doubling and the second one the
+// carry out of adding the doubled product into (h,l).
+// Clobbers rax, rdx and the flags.
+#define combadd2(c,h,l,numa,numb)               \
+        mov     rax, numa;                      \
+        mul     QWORD PTR numb;                 \
+        add     rax, rax;                       \
+        adc     rdx, rdx;                       \
+        adc     c, 0;                           \
+        add     l, rax;                         \
+        adc     h, rdx;                         \
+        adc     c, 0
+// Entry point. The result words z[0..11] are produced in order: word k is
+// the low word of a 3-word window into which all products x[i] * x[j]
+// with i + j = k are summed, each off-diagonal product (i != j) counted
+// twice and each square once. After the low word is stored the window
+// registers rotate roles for the next term.
+// Working registers are rax, rcx, rdx and r8-r11 only, so nothing is
+// pushed in the System V case.
+// Note: words of x are read again after words of z have been stored (for
+// example [x] is re-read after [z] is written), so as written the code
+// relies on z not overlapping x.
+S2N_BN_SYMBOL(bignum_sqr_6_12_alt):
+        _CET_ENDBR
+#if WINDOWS_ABI
+// Microsoft x64: preserve rdi and rsi, then move the arguments from rcx
+// and rdx into the registers the rest of the code uses
+        push    rdi
+        push    rsi
+        mov     rdi, rcx
+        mov     rsi, rdx
+#endif
+// Result term 0
+// x[0]^2: low word is z[0], high word starts the window in t0
+        mov     rax, [x]
+        mul     rax
+        mov     [z], rax
+        mov     t0, rdx
+        xor     t1, t1
+// Result term 1
+// 2 * x[0]*x[1]
+        xor     t2, t2
+        combadd2(t2,t1,t0,[x],[x+8])
+        mov     [z+8], t0
+// Result term 2
+// x[1]^2 + 2 * x[0]*x[2]
+        xor     t0, t0
+        combadd1(t0,t2,t1,[x+8])
+        combadd2(t0,t2,t1,[x],[x+16])
+        mov     [z+16], t1
+// Result term 3
+// 2 * (x[0]*x[3] + x[1]*x[2]), summed in (t1,u1,u0) and doubled once
+        combaddz(t1,u1,u0,[x],[x+24])
+        combadd(t1,u1,u0,[x+8],[x+16])
+        doubladd(t1,t0,t2,u1,u0)
+        mov     [z+24], t2
+// Result term 4
+// 2 * (x[0]*x[4] + x[1]*x[3]) + x[2]^2
+        combaddz(t2,u1,u0,[x],[x+32])
+        combadd(t2,u1,u0,[x+8],[x+24])
+        doubladd(t2,t1,t0,u1,u0)
+        combadd1(t2,t1,t0,[x+16])
+        mov     [z+32], t0
+// Result term 5
+// 2 * (x[0]*x[5] + x[1]*x[4] + x[2]*x[3])
+        combaddz(t0,u1,u0,[x],[x+40])
+        combadd(t0,u1,u0,[x+8],[x+32])
+        combadd(t0,u1,u0,[x+16],[x+24])
+        doubladd(t0,t2,t1,u1,u0)
+        mov     [z+40], t1
+// Result term 6
+// 2 * (x[1]*x[5] + x[2]*x[4]) + x[3]^2
+        combaddz(t1,u1,u0,[x+8],[x+40])
+        combadd(t1,u1,u0,[x+16],[x+32])
+        doubladd(t1,t0,t2,u1,u0)
+        combadd1(t1,t0,t2,[x+24])
+        mov     [z+48], t2
+// Result term 7
+// 2 * (x[2]*x[5] + x[3]*x[4])
+        combaddz(t2,u1,u0,[x+16],[x+40])
+        combadd(t2,u1,u0,[x+24],[x+32])
+        doubladd(t2,t1,t0,u1,u0)
+        mov     [z+56], t0
+// Result term 8
+// 2 * x[3]*x[5] + x[4]^2
+        xor     t0, t0
+        combadd2(t0,t2,t1,[x+24],[x+40])
+        combadd1(t0,t2,t1,[x+32])
+        mov     [z+64], t1
+// Result term 9
+// 2 * x[4]*x[5]
+        xor     t1, t1
+        combadd2(t1,t0,t2,[x+32],[x+40])
+        mov     [z+72], t2
+// Result term 10
+// x[5]^2, added with the short form that records no carry out of t1
+        combads(t1,t0,[x+40])
+        mov     [z+80], t0
+// Result term 11
+// The remaining high word of the window is the top result word
+        mov     [z+88], t1
+// Return
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16.S
new file mode 100644
index 0000000000..41277b5b6a
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16.S
@@ -0,0 +1,311 @@
+// $OpenBSD: bignum_sqr_8_16.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
+//
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+// ----------------------------------------------------------------------------
+// Square, z := x^2
+// Input x[8]; output z[16]
+//
+//    extern void bignum_sqr_8_16(uint64_t z[static 16], const uint64_t x[static 8]);
+//
+// Standard x86-64 ABI: RDI = z, RSI = x
+// Microsoft x64 ABI:   RCX = z, RDX = x
+// ----------------------------------------------------------------------------
+#include "s2n_bignum_internal.h"
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_8_16)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_8_16)
+        .text
+// Input arguments: z is the output pointer and x the input pointer (these
+// are the System V registers; the Windows entry sequence copies rcx and
+// rdx into them first)
+#define z rdi
+#define x rsi
+// A zero register, named in its 64-bit form (zero, used as an operand of
+// adcx/adox) and its 32-bit form (zeroe, used with xor to clear it, which
+// also clears CF and OF)
+#define zero rbp
+#define zeroe ebp
+// mulpadd i, j adds rdx * x[i] into the window at the i+j point.
+// The 8-word window is held in r8..r15, with position k living in the
+// register numbered 8 + (k mod 8). The 128-bit product is formed in
+// rcx:rax; its low word is added at position i+j with adcx (carry chain
+// through CF) and its high word at position i+j+1 with adox (carry chain
+// through OF), so two independent carry chains run interleaved.
+// Clobbers rax and rcx. The caller must have rdx loaded with the common
+// multiplier and both carry chains in a known state.
+.macro mulpadd arg1,arg2
+        mulx    rcx, rax, [x+8*\arg1]
+.if ((\arg1 + \arg2) % 8 == 0)
+        adcx    r8, rax
+        adox    r9, rcx
+.elseif ((\arg1 + \arg2) % 8 == 1)
+        adcx    r9, rax
+        adox    r10, rcx
+.elseif ((\arg1 + \arg2) % 8 == 2)
+        adcx    r10, rax
+        adox    r11, rcx
+.elseif ((\arg1 + \arg2) % 8 == 3)
+        adcx    r11, rax
+        adox    r12, rcx
+.elseif ((\arg1 + \arg2) % 8 == 4)
+        adcx    r12, rax
+        adox    r13, rcx
+.elseif ((\arg1 + \arg2) % 8 == 5)
+        adcx    r13, rax
+        adox    r14, rcx
+.elseif ((\arg1 + \arg2) % 8 == 6)
+        adcx    r14, rax
+        adox    r15, rcx
+.elseif ((\arg1 + \arg2) % 8 == 7)
+        adcx    r15, rax
+        adox    r8, rcx
+.endif
+.endm
+// mulpade i, j adds rdx * x[i] into the window at i+j
+// but re-creates the top word assuming nothing to add there.
+// Unlike mulpadd, the high product word is written by mulx straight into
+// the register for position i+j+1, overwriting its previous contents;
+// the low word is added at position i+j with adcx, and the adox of zero
+// only folds the pending OF carry into the freshly written top word.
+// Clobbers rax.
+.macro mulpade arg1,arg2
+.if ((\arg1 + \arg2) % 8 == 0)
+        mulx    r9, rax, [x+8*\arg1]
+        adcx    r8, rax
+        adox    r9, zero
+.elseif ((\arg1 + \arg2) % 8 == 1)
+        mulx    r10, rax, [x+8*\arg1]
+        adcx    r9, rax
+        adox    r10, zero
+.elseif ((\arg1 + \arg2) % 8 == 2)
+        mulx    r11, rax, [x+8*\arg1]
+        adcx    r10, rax
+        adox    r11, zero
+.elseif ((\arg1 + \arg2) % 8 == 3)
+        mulx    r12, rax, [x+8*\arg1]
+        adcx    r11, rax
+        adox    r12, zero
+.elseif ((\arg1 + \arg2) % 8 == 4)
+        mulx    r13, rax, [x+8*\arg1]
+        adcx    r12, rax
+        adox    r13, zero
+.elseif ((\arg1 + \arg2) % 8 == 5)
+        mulx    r14, rax, [x+8*\arg1]
+        adcx    r13, rax
+        adox    r14, zero
+.elseif ((\arg1 + \arg2) % 8 == 6)
+        mulx    r15, rax, [x+8*\arg1]
+        adcx    r14, rax
+        adox    r15, zero
+.elseif ((\arg1 + \arg2) % 8 == 7)
+        mulx    r8, rax, [x+8*\arg1]
+        adcx    r15, rax
+        adox    r8, zero
+.endif
+.endm
+// diagonals performs the whole squaring in two phases. First the sum of
+// all off-diagonal products x[i] * x[j] with i > j is accumulated, four
+// groups of seven products each; in the comments below a digit pair "ij"
+// stands for the product x[i] * x[j]. Positions 1..8 of that sum are
+// written to z as they become final and positions 9..14 stay in r9..r14.
+// Then the sum is doubled and the squares x[i]^2 are added to give z.
+// Each "xor zeroe, zeroe" zeroes the zero register and clears both CF and
+// OF, starting a fresh pair of adcx/adox carry chains.
+// Note: z is written while words of x are still to be read, so as
+// written the code relies on z not overlapping x.
+.macro diagonals
+        xor     zeroe, zeroe
+// Set initial window [r8..r10] + 2 wb = 10 + 20 + 30 + 40 + 50 + 60 + 70
+// (presumably "2 wb" refers to the two words written back here, z[1] and
+// z[2]). Only the CF chain is needed: each high word is added to the next
+// low word. Afterwards r11..r15 hold positions 3..7 and r8 position 8.
+        mov     rdx, [x]
+        mulx    rax, r9, [x+8]
+        mov     [z+8], r9
+        mulx    rcx, r10, [x+16]
+        adcx    r10, rax
+        mov     [z+16], r10
+        mulx    rax, r11, [x+24]
+        adcx    r11, rcx
+        mulx    rcx, r12, [x+32]
+        adcx    r12, rax
+        mulx    rax, r13, [x+40]
+        adcx    r13, rcx
+        mulx    rcx, r14, [x+48]
+        adcx    r14, rax
+        mulx    r8, r15, [x+56]
+        adcx    r15, rcx
+        adcx    r8, zero
+// Add in the next diagonal = 21 + 31 + 41 + 51 + 61 + 71 + 54
+// Positions 3 and 4 become final and are stored; r9 and r10 are
+// re-created as positions 9 and 10 by the mulpade steps.
+        xor     zeroe, zeroe
+        mov     rdx, [x+8]
+        mulpadd 2, 1
+        mov     [z+24], r11
+        mulpadd 3, 1
+        mov     [z+32], r12
+        mulpadd 4, 1
+        mulpadd 5, 1
+        mulpadd 6, 1
+        mulpade 7, 1
+        mov     rdx, [x+32]
+        mulpade 5, 4
+        adcx    r10, zero
+// And the next one = 32 + 42 + 52 + 62 + 72 + 64 + 65
+// Positions 5 and 6 become final and are stored; r11 and r12 are
+// re-created as positions 11 and 12.
+        xor     zeroe, zeroe
+        mov     rdx, [x+16]
+        mulpadd 3, 2
+        mov     [z+40], r13
+        mulpadd 4, 2
+        mov     [z+48], r14
+        mulpadd 5, 2
+        mulpadd 6, 2
+        mulpadd 7, 2
+        mov     rdx, [x+48]
+        mulpade 4, 6
+        mulpade 5, 6
+        adcx    r12, zero
+// And the final one = 43 + 53 + 63 + 73 + 74 + 75 + 76
+// Positions 7 and 8 become final and are stored; r13 and r14 are
+// re-created as positions 13 and 14.
+        xor     zeroe, zeroe
+        mov     rdx, [x+24]
+        mulpadd 4, 3
+        mov     [z+56], r15
+        mulpadd 5, 3
+        mov     [z+64], r8
+        mulpadd 6, 3
+        mulpadd 7, 3
+        mov     rdx, [x+56]
+        mulpadd 4, 7
+        mulpade 5, 7
+        mulpade 6, 7
+        adcx    r14, zero
+// Double and add things; use z[1]..z[8] and thereafter the registers
+// r9..r15 which haven't been written back yet
+// The CF chain (adcx w, w) doubles the off-diagonal sum word by word,
+// while the OF chain (adox) adds the squares: the low word of x[i]^2 at
+// position 2i and its high word at position 2i+1.
+        xor     zeroe, zeroe
+        mov     rdx, [x]
+        mulx    rcx, rax, rdx
+        mov     [z], rax
+        mov     rax, [z+8]
+        adcx    rax, rax
+        adox    rax, rcx
+        mov     [z+8], rax
+        mov     rax, [z+16]
+        mov     rdx, [x+8]
+        mulx    rcx, rdx, rdx
+        adcx    rax, rax
+        adox    rax, rdx
+        mov     [z+16], rax
+        mov     rax, [z+24]
+        adcx    rax, rax
+        adox    rax, rcx
+        mov     [z+24], rax
+        mov     rax, [z+32]
+        mov     rdx, [x+16]
+        mulx    rcx, rdx, rdx
+        adcx    rax, rax
+        adox    rax, rdx
+        mov     [z+32], rax
+        mov     rax, [z+40]
+        adcx    rax, rax
+        adox    rax, rcx
+        mov     [z+40], rax
+        mov     rax, [z+48]
+        mov     rdx, [x+24]
+        mulx    rcx, rdx, rdx
+        adcx    rax, rax
+        adox    rax, rdx
+        mov     [z+48], rax
+        mov     rax, [z+56]
+        adcx    rax, rax
+        adox    rax, rcx
+        mov     [z+56], rax
+        mov     rax, [z+64]
+        mov     rdx, [x+32]
+        mulx    rcx, rdx, rdx
+        adcx    rax, rax
+        adox    rax, rdx
+        mov     [z+64], rax
+// From here on the words to double are still in registers
+        adcx    r9, r9
+        adox    r9, rcx
+        mov     [z+72], r9
+        mov     rdx, [x+40]
+        mulx    rcx, rdx, rdx
+        adcx    r10, r10
+        adox    r10, rdx
+        mov     [z+80], r10
+        adcx    r11, r11
+        adox    r11, rcx
+        mov     [z+88], r11
+        mov     rdx, [x+48]
+        mulx    rcx, rdx, rdx
+        adcx    r12, r12
+        adox    r12, rdx
+        mov     [z+96], r12
+        adcx    r13, r13
+        adox    r13, rcx
+        mov     [z+104], r13
+        mov     rdx, [x+56]
+        mulx    r15, rdx, rdx
+        adcx    r14, r14
+        adox    r14, rdx
+        mov     [z+112], r14
+// Top word: high half of x[7]^2 plus the final carry of each chain
+        adcx    r15, zero
+        adox    r15, zero
+        mov     [z+120], r15
+.endm
+// Entry point: z[0..15] := x[0..7]^2. All the arithmetic is in the
+// diagonals macro; this wrapper only handles the calling convention and
+// preserves the registers the macro uses beyond the volatile ones.
+// The code uses mulx, adcx and adox, so it must only be run on CPUs that
+// implement those instructions.
+S2N_BN_SYMBOL(bignum_sqr_8_16):
+        _CET_ENDBR
+#if WINDOWS_ABI
+// Microsoft x64: preserve rdi and rsi, then move the arguments from rcx
+// and rdx into the registers the rest of the code uses
+        push    rdi
+        push    rsi
+        mov     rdi, rcx
+        mov     rsi, rdx
+#endif
+// Save more registers to play with
+// (rbp serves as the zero register, r12-r15 as part of the window)
+        push    rbp
+        push    r12
+        push    r13
+        push    r14
+        push    r15
+// Do the multiplication
+        diagonals
+// Real epilog
+// Restore in the reverse order of the pushes above
+        pop     r15
+        pop     r14
+        pop     r13
+        pop     r12
+        pop     rbp
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16_alt.S
index ac0b6f96c2..cb10ba2a12 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16_alt.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16_alt.S
@@ -1,3 +1,5 @@
+// $OpenBSD: bignum_sqr_8_16_alt.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
+//
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
@@ -16,7 +18,8 @@
 // Square, z := x^2
 // Input x[8]; output z[16]
 //
-//    extern void bignum_sqr_8_16_alt (uint64_t z[static 16], uint64_t x[static 8]);
+//    extern void bignum_sqr_8_16_alt(uint64_t z[static 16],
+//                                    const uint64_t x[static 8]);
 //
 // Standard x86-64 ABI: RDI = z, RSI = x
 // Microsoft x64 ABI:   RCX = z, RDX = x
@@ -103,7 +106,7 @@
        adc     c, 0
 S2N_BN_SYMBOL(bignum_sqr_8_16_alt):
-        _CET_ENDBR
+        _CET_ENDBR
 #if WINDOWS_ABI
        push    rdi
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sub.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sub.S
index 3ff8a30510..7324d3a71e 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_sub.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sub.S
@@ -1,3 +1,5 @@
+// $OpenBSD: bignum_sub.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
+//
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
@@ -16,9 +18,8 @@
 // Subtract, z := x - y
 // Inputs x[m], y[n]; outputs function return (carry-out) and z[p]
 //
-//    extern uint64_t bignum_sub
+//    extern uint64_t bignum_sub(uint64_t p, uint64_t *z, uint64_t m,
-//     (uint64_t p, uint64_t *z,
+//                               const uint64_t *x, uint64_t n, const uint64_t *y);
-//      uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
 //
 // Does the z := x - y operation, truncating modulo p words in general and
 // returning a top borrow (0 or 1) in the p'th place, only subtracting input
@@ -49,7 +50,7 @@
 S2N_BN_SYMBOL(bignum_sub):
-        _CET_ENDBR
+        _CET_ENDBR
 #if WINDOWS_ABI
        push    rdi
@@ -75,7 +76,7 @@ S2N_BN_SYMBOL(bignum_sub):
        cmp     p, n
        cmovc   n, p
        cmp     m, n
-        jc      ylonger
+        jc      bignum_sub_ylonger
 // The case where x is longer or of the same size (p >= m >= n)
@@ -83,32 +84,32 @@ S2N_BN_SYMBOL(bignum_sub):
        sub     m, n
        inc     m
        test    n, n
-        jz      xtest
+        jz      bignum_sub_xtest
-xmainloop:
+bignum_sub_xmainloop:
        mov     a, [x+8*i]
        sbb     a, [y+8*i]
        mov     [z+8*i],a
        inc     i
        dec     n
-        jnz     xmainloop
+        jnz     bignum_sub_xmainloop
-        jmp     xtest
+        jmp     bignum_sub_xtest
-xtoploop:
+bignum_sub_xtoploop:
        mov     a, [x+8*i]
        sbb     a, 0
        mov     [z+8*i],a
        inc     i
-xtest:
+bignum_sub_xtest:
        dec     m
-        jnz     xtoploop
+        jnz     bignum_sub_xtoploop
        sbb     a, a
        test    p, p
-        jz      tailskip
+        jz      bignum_sub_tailskip
-tailloop:
+bignum_sub_tailloop:
        mov     [z+8*i],a
        inc     i
        dec     p
-        jnz     tailloop
+        jnz     bignum_sub_tailloop
-tailskip:
+bignum_sub_tailskip:
        neg     a
 #if WINDOWS_ABI
        pop    rsi
@@ -118,29 +119,29 @@ tailskip:
 // The case where y is longer (p >= n > m)
-ylonger:
+bignum_sub_ylonger:
        sub     p, n
        sub     n, m
        test    m, m
-        jz      ytoploop
+        jz      bignum_sub_ytoploop
-ymainloop:
+bignum_sub_ymainloop:
        mov     a, [x+8*i]
        sbb     a, [y+8*i]
        mov     [z+8*i],a
        inc     i
        dec     m
-        jnz     ymainloop
+        jnz     bignum_sub_ymainloop
-ytoploop:
+bignum_sub_ytoploop:
        mov     ashort, 0
        sbb     a, [y+8*i]
        mov     [z+8*i],a
        inc     i
        dec     n
-        jnz     ytoploop
+        jnz     bignum_sub_ytoploop
        sbb     a, a
        test    p, p
-        jnz     tailloop
+        jnz     bignum_sub_tailloop
        neg     a
 #if WINDOWS_ABI
        pop    rsi
diff --git a/src/lib/libcrypto/bn/arch/amd64/bn_arch.c b/src/lib/libcrypto/bn/arch/amd64/bn_arch.c
index a377a05681..6c3888687b 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bn_arch.c
+++ b/src/lib/libcrypto/bn/arch/amd64/bn_arch.c
@@ -1,4 +1,4 @@
-/*      $OpenBSD: bn_arch.c,v 1.7 2023/06/24 16:01:44 jsing Exp $ */
+/*      $OpenBSD: bn_arch.c,v 1.17 2025/09/01 15:33:23 jsing Exp $ */
 /*
 * Copyright (c) 2023 Joel Sing <jsing@openbsd.org>
 *
@@ -19,6 +19,7 @@
 #include "bn_arch.h"
 #include "bn_local.h"
+#include "crypto_arch.h"
 #include "s2n_bignum.h"
 #ifdef HAVE_BN_ADD
@@ -26,8 +27,8 @@ BN_ULONG
 bn_add(BN_ULONG *r, int r_len, const BN_ULONG *a, int a_len, const BN_ULONG *b,
    int b_len)
 {
-        return bignum_add(r_len, (uint64_t *)r, a_len, (uint64_t *)a,
+        return bignum_add(r_len, (uint64_t *)r, a_len, (const uint64_t *)a,
-            b_len, (uint64_t *)b);
+            b_len, (const uint64_t *)b);
 }
 #endif
@@ -36,8 +37,8 @@ bn_add(BN_ULONG *r, int r_len, const BN_ULONG *a, int a_len, const BN_ULONG *b,
 BN_ULONG
 bn_add_words(BN_ULONG *rd, const BN_ULONG *ad, const BN_ULONG *bd, int n)
 {
-        return bignum_add(n, (uint64_t *)rd, n, (uint64_t *)ad, n,
+        return bignum_add(n, (uint64_t *)rd, n, (const uint64_t *)ad, n,
-            (uint64_t *)bd);
+            (const uint64_t *)bd);
 }
 #endif
@@ -46,8 +47,8 @@ BN_ULONG
 bn_sub(BN_ULONG *r, int r_len, const BN_ULONG *a, int a_len, const BN_ULONG *b,
    int b_len)
 {
-        return bignum_sub(r_len, (uint64_t *)r, a_len, (uint64_t *)a,
+        return bignum_sub(r_len, (uint64_t *)r, a_len, (const uint64_t *)a,
-            b_len, (uint64_t *)b);
+            b_len, (const uint64_t *)b);
 }
 #endif
@@ -55,52 +56,99 @@ bn_sub(BN_ULONG *r, int r_len, const BN_ULONG *a, int a_len, const BN_ULONG *b,
 BN_ULONG
 bn_sub_words(BN_ULONG *rd, const BN_ULONG *ad, const BN_ULONG *bd, int n)
 {
-        return bignum_sub(n, (uint64_t *)rd, n, (uint64_t *)ad, n,
+        return bignum_sub(n, (uint64_t *)rd, n, (const uint64_t *)ad, n,
-            (uint64_t *)bd);
+            (const uint64_t *)bd);
 }
 #endif
-#ifdef HAVE_BN_MUL_ADD_WORDS
+#ifdef HAVE_BN_MOD_ADD_WORDS
-BN_ULONG
+void
-bn_mul_add_words(BN_ULONG *rd, const BN_ULONG *ad, int num, BN_ULONG w)
+bn_mod_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
+    const BN_ULONG *m, size_t n)
 {
-        return bignum_cmadd(num, (uint64_t *)rd, w, num, (uint64_t *)ad);
+        bignum_modadd(n, (uint64_t *)r, (const uint64_t *)a,
+            (const uint64_t *)b, (const uint64_t *)m);
 }
 #endif
-#ifdef HAVE_BN_MUL_WORDS
+#ifdef HAVE_BN_MOD_SUB_WORDS
-BN_ULONG
+void
-bn_mul_words(BN_ULONG *rd, const BN_ULONG *ad, int num, BN_ULONG w)
+bn_mod_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
+    const BN_ULONG *m, size_t n)
 {
-        return bignum_cmul(num, (uint64_t *)rd, w, num, (uint64_t *)ad);
+        bignum_modsub(n, (uint64_t *)r, (const uint64_t *)a,
+            (const uint64_t *)b, (const uint64_t *)m);
 }
 #endif
 #ifdef HAVE_BN_MUL_COMBA4
 void
-bn_mul_comba4(BN_ULONG *rd, BN_ULONG *ad, BN_ULONG *bd)
+bn_mul_comba4(BN_ULONG *rd, const BN_ULONG *ad, const BN_ULONG *bd)
+{
+        if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_ADX) != 0) {
+                bignum_mul_4_8((uint64_t *)rd, (const uint64_t *)ad,
+                    (const uint64_t *)bd);
+                return;
+        }
+        bignum_mul_4_8_alt((uint64_t *)rd, (const uint64_t *)ad,
+            (const uint64_t *)bd);
+}
+#endif
+#ifdef HAVE_BN_MUL_COMBA6
+void
+bn_mul_comba6(BN_ULONG *rd, const BN_ULONG *ad, const BN_ULONG *bd)
 {
-        /* XXX - consider using non-alt on CPUs that have the ADX extension. */
+        if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_ADX) != 0) {
-        bignum_mul_4_8_alt((uint64_t *)rd, (uint64_t *)ad, (uint64_t *)bd);
+                bignum_mul_6_12((uint64_t *)rd, (const uint64_t *)ad,
+                    (const uint64_t *)bd);
+                return;
+        }
+        bignum_mul_6_12_alt((uint64_t *)rd, (const uint64_t *)ad,
+            (const uint64_t *)bd);
 }
 #endif
 #ifdef HAVE_BN_MUL_COMBA8
 void
-bn_mul_comba8(BN_ULONG *rd, BN_ULONG *ad, BN_ULONG *bd)
+bn_mul_comba8(BN_ULONG *rd, const BN_ULONG *ad, const BN_ULONG *bd)
 {
-        /* XXX - consider using non-alt on CPUs that have the ADX extension. */
+        if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_ADX) != 0) {
-        bignum_mul_8_16_alt((uint64_t *)rd, (uint64_t *)ad, (uint64_t *)bd);
+                bignum_mul_8_16((uint64_t *)rd, (const uint64_t *)ad,
+                    (const uint64_t *)bd);
+                return;
+        }
+        bignum_mul_8_16_alt((uint64_t *)rd, (const uint64_t *)ad,
+            (const uint64_t *)bd);
 }
 #endif
-#ifdef HAVE_BN_SQR
+#ifdef HAVE_BN_MUL_WORDS
-int
+void
-bn_sqr(BIGNUM *r, const BIGNUM *a, int r_len, BN_CTX *ctx)
+bn_mul_words(BN_ULONG *r, const BN_ULONG *a, int a_len, const BN_ULONG *b,
+    int b_len)
+{
+        bignum_mul(a_len + b_len, (uint64_t *)r, a_len, (const uint64_t *)a,
+            b_len, (const uint64_t *)b);
+}
+#endif
+#ifdef HAVE_BN_MULW_ADD_WORDS
+BN_ULONG
+bn_mulw_add_words(BN_ULONG *rd, const BN_ULONG *ad, int num, BN_ULONG w)
 {
-        bignum_sqr(r_len, (uint64_t *)r->d, a->top, (uint64_t *)a->d);
+        return bignum_cmadd(num, (uint64_t *)rd, w, num, (const uint64_t *)ad);
+}
+#endif
-        return 1;
+#ifdef HAVE_BN_MULW_WORDS
+BN_ULONG
+bn_mulw_words(BN_ULONG *rd, const BN_ULONG *ad, int num, BN_ULONG w)
+{
+        return bignum_cmul(num, (uint64_t *)rd, w, num, (const uint64_t *)ad);
 }
 #endif
@@ -108,8 +156,25 @@ bn_sqr(BIGNUM *r, const BIGNUM *a, int r_len, BN_CTX *ctx)
 void
 bn_sqr_comba4(BN_ULONG *rd, const BN_ULONG *ad)
 {
-        /* XXX - consider using non-alt on CPUs that have the ADX extension. */
+        if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_ADX) != 0) {
-        bignum_sqr_4_8_alt((uint64_t *)rd, (uint64_t *)ad);
+                bignum_sqr_4_8((uint64_t *)rd, (const uint64_t *)ad);
+                return;
+        }
+        bignum_sqr_4_8_alt((uint64_t *)rd, (const uint64_t *)ad);
+}
+#endif
+#ifdef HAVE_BN_SQR_COMBA6
+/*
+ * Square the 6 word value at ad, storing the 12 word result at rd.
+ * bignum_sqr_6_12() is used when the CPU capability flags report ADX
+ * support, otherwise the bignum_sqr_6_12_alt() variant is used.
+ */
+void
+bn_sqr_comba6(BN_ULONG *rd, const BN_ULONG *ad)
+{
+        if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_ADX) != 0) {
+                bignum_sqr_6_12((uint64_t *)rd, (const uint64_t *)ad);
+                return;
+        }
+        bignum_sqr_6_12_alt((uint64_t *)rd, (const uint64_t *)ad);
 }
 #endif
@@ -117,8 +182,20 @@ bn_sqr_comba4(BN_ULONG *rd, const BN_ULONG *ad)
 void
 bn_sqr_comba8(BN_ULONG *rd, const BN_ULONG *ad)
 {
-        /* XXX - consider using non-alt on CPUs that have the ADX extension. */
+        if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_ADX) != 0) {
-        bignum_sqr_8_16_alt((uint64_t *)rd, (uint64_t *)ad);
+                bignum_sqr_8_16((uint64_t *)rd, (const uint64_t *)ad);
+                return;
+        }
+        bignum_sqr_8_16_alt((uint64_t *)rd, (const uint64_t *)ad);
+}
+#endif
+#ifdef HAVE_BN_SQR_WORDS
+/*
+ * Square the a_len word value at ad via bignum_sqr(), requesting a result
+ * of a_len * 2 words at rd.
+ */
+void
+bn_sqr_words(BN_ULONG *rd, const BN_ULONG *ad, int a_len)
+{
+        bignum_sqr(a_len * 2, (uint64_t *)rd, a_len, (const uint64_t *)ad);
 }
 #endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bn_arch.h b/src/lib/libcrypto/bn/arch/amd64/bn_arch.h
index 927cd75208..3cb1d1d274 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bn_arch.h
+++ b/src/lib/libcrypto/bn/arch/amd64/bn_arch.h
@@ -1,4 +1,4 @@
-/*      $OpenBSD: bn_arch.h,v 1.14 2024/03/26 06:09:25 jsing Exp $ */
+/*      $OpenBSD: bn_arch.h,v 1.19 2025/09/01 15:15:44 jsing Exp $ */
 /*
 * Copyright (c) 2023 Joel Sing <jsing@openbsd.org>
 *
@@ -27,14 +27,20 @@
 #define HAVE_BN_DIV_WORDS
-#define HAVE_BN_MUL_ADD_WORDS
+#define HAVE_BN_MOD_ADD_WORDS
+#define HAVE_BN_MOD_SUB_WORDS
 #define HAVE_BN_MUL_COMBA4
+#define HAVE_BN_MUL_COMBA6
 #define HAVE_BN_MUL_COMBA8
 #define HAVE_BN_MUL_WORDS
+#define HAVE_BN_MULW_ADD_WORDS
+#define HAVE_BN_MULW_WORDS
-#define HAVE_BN_SQR
 #define HAVE_BN_SQR_COMBA4
+#define HAVE_BN_SQR_COMBA6
 #define HAVE_BN_SQR_COMBA8
+#define HAVE_BN_SQR_WORDS
 #define HAVE_BN_SUB
 #define HAVE_BN_SUB_WORDS
diff --git a/src/lib/libcrypto/bn/arch/amd64/word_clz.S b/src/lib/libcrypto/bn/arch/amd64/word_clz.S
index 3926fcd4b0..705fbdbbda 100644
--- a/src/lib/libcrypto/bn/arch/amd64/word_clz.S
+++ b/src/lib/libcrypto/bn/arch/amd64/word_clz.S
@@ -1,3 +1,5 @@
+// $OpenBSD: word_clz.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
+//
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
@@ -16,7 +18,7 @@
 // Count leading zero bits in a single word
 // Input a; output function return
 //
-//    extern uint64_t word_clz (uint64_t a);
+//    extern uint64_t word_clz(uint64_t a);
 //
 // Standard x86-64 ABI: RDI = a, returns RAX
 // Microsoft x64 ABI:   RCX = a, returns RAX
@@ -30,7 +32,7 @@
        .text
 S2N_BN_SYMBOL(word_clz):
-        _CET_ENDBR
+        _CET_ENDBR
 #if WINDOWS_ABI
        push    rdi
diff --git a/src/lib/libcrypto/bn/arch/i386/bn_arch.h b/src/lib/libcrypto/bn/arch/i386/bn_arch.h
index eef519fcc7..288cbdeaa9 100644
--- a/src/lib/libcrypto/bn/arch/i386/bn_arch.h
+++ b/src/lib/libcrypto/bn/arch/i386/bn_arch.h
@@ -1,4 +1,4 @@
-/*      $OpenBSD: bn_arch.h,v 1.9 2023/02/16 10:41:03 jsing Exp $ */
+/*      $OpenBSD: bn_arch.h,v 1.11 2025/09/07 03:56:37 jsing Exp $ */
 /*
 * Copyright (c) 2023 Joel Sing <jsing@openbsd.org>
 *
@@ -26,14 +26,13 @@
 #define HAVE_BN_DIV_WORDS
-#define HAVE_BN_MUL_ADD_WORDS
 #define HAVE_BN_MUL_COMBA4
 #define HAVE_BN_MUL_COMBA8
-#define HAVE_BN_MUL_WORDS
+#define HAVE_BN_MULW_ADD_WORDS
+#define HAVE_BN_MULW_WORDS
 #define HAVE_BN_SQR_COMBA4
 #define HAVE_BN_SQR_COMBA8
-#define HAVE_BN_SQR_WORDS
 #define HAVE_BN_SUB_WORDS
diff --git a/src/lib/libcrypto/bn/arch/mips64/bn_arch.h b/src/lib/libcrypto/bn/arch/mips64/bn_arch.h
index 53771bce1e..562a398f33 100644
--- a/src/lib/libcrypto/bn/arch/mips64/bn_arch.h
+++ b/src/lib/libcrypto/bn/arch/mips64/bn_arch.h
@@ -1,4 +1,4 @@
-/*      $OpenBSD: bn_arch.h,v 1.7 2023/01/23 12:17:58 jsing Exp $ */
+/*      $OpenBSD: bn_arch.h,v 1.9 2025/09/07 03:56:37 jsing Exp $ */
 /*
 * Copyright (c) 2023 Joel Sing <jsing@openbsd.org>
 *
@@ -25,14 +25,13 @@
 #define HAVE_BN_DIV_WORDS
 #define HAVE_BN_DIV_3_WORDS
-#define HAVE_BN_MUL_ADD_WORDS
 #define HAVE_BN_MUL_COMBA4
 #define HAVE_BN_MUL_COMBA8
-#define HAVE_BN_MUL_WORDS
+#define HAVE_BN_MULW_ADD_WORDS
+#define HAVE_BN_MULW_WORDS
 #define HAVE_BN_SQR_COMBA4
 #define HAVE_BN_SQR_COMBA8
-#define HAVE_BN_SQR_WORDS
 #define HAVE_BN_SUB_WORDS
diff --git a/src/lib/libcrypto/bn/arch/powerpc/bn_arch.h b/src/lib/libcrypto/bn/arch/powerpc/bn_arch.h
index 46e932a2d5..21bcdf48d3 100644
--- a/src/lib/libcrypto/bn/arch/powerpc/bn_arch.h
+++ b/src/lib/libcrypto/bn/arch/powerpc/bn_arch.h
@@ -1,4 +1,4 @@
-/*      $OpenBSD: bn_arch.h,v 1.6 2023/01/23 12:17:58 jsing Exp $ */
+/*      $OpenBSD: bn_arch.h,v 1.8 2025/09/07 03:56:37 jsing Exp $ */
 /*
 * Copyright (c) 2023 Joel Sing <jsing@openbsd.org>
 *
@@ -24,14 +24,13 @@
 #define HAVE_BN_DIV_WORDS
-#define HAVE_BN_MUL_ADD_WORDS
 #define HAVE_BN_MUL_COMBA4
 #define HAVE_BN_MUL_COMBA8
-#define HAVE_BN_MUL_WORDS
+#define HAVE_BN_MULW_ADD_WORDS
+#define HAVE_BN_MULW_WORDS
 #define HAVE_BN_SQR_COMBA4
 #define HAVE_BN_SQR_COMBA8
-#define HAVE_BN_SQR_WORDS
 #define HAVE_BN_SUB_WORDS
diff --git a/src/lib/libcrypto/bn/asm/bn-586.pl b/src/lib/libcrypto/bn/asm/bn-586.pl
index 71b775af8d..9b4b11ad5b 100644
--- a/src/lib/libcrypto/bn/asm/bn-586.pl
+++ b/src/lib/libcrypto/bn/asm/bn-586.pl
@@ -6,21 +6,20 @@ require "x86asm.pl";
 &asm_init($ARGV[0],$0);
-$sse2=0;
+$sse2=1;
-for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
 &external_label("OPENSSL_ia32cap_P") if ($sse2);
-&bn_mul_add_words("bn_mul_add_words");
+&bn_mulw_add_words("bn_mulw_add_words");
-&bn_mul_words("bn_mul_words");
+&bn_mulw_words("bn_mulw_words");
-&bn_sqr_words("bn_sqr_words");
+&bn_sqr_word_wise("bn_sqr_word_wise");
 &bn_div_words("bn_div_words");
 &bn_add_words("bn_add_words");
 &bn_sub_words("bn_sub_words");
 &asm_finish();
-sub bn_mul_add_words
+sub bn_mulw_add_words
        {
        local($name)=@_;
@@ -207,7 +206,7 @@ sub bn_mul_add_words
        &function_end($name);
        }
-sub bn_mul_words
+sub bn_mulw_words
        {
        local($name)=@_;
@@ -319,7 +318,7 @@ sub bn_mul_words
        &function_end($name);
        }
-sub bn_sqr_words
+sub bn_sqr_word_wise
        {
        local($name)=@_;
diff --git a/src/lib/libcrypto/bn/asm/mips.pl b/src/lib/libcrypto/bn/asm/mips.pl
index 02d43e15b0..aaa0c5d8b0 100644
--- a/src/lib/libcrypto/bn/asm/mips.pl
+++ b/src/lib/libcrypto/bn/asm/mips.pl
@@ -110,19 +110,19 @@ $code.=<<___;
 .set    noat
 .align  5
-.globl  bn_mul_add_words
+.globl  bn_mulw_add_words
-.ent    bn_mul_add_words
+.ent    bn_mulw_add_words
-bn_mul_add_words:
+bn_mulw_add_words:
        .set    noreorder
-        bgtz    $a2,bn_mul_add_words_internal
+        bgtz    $a2,bn_mulw_add_words_internal
        move    $v0,$zero
        jr      $ra
        move    $a0,$v0
-.end    bn_mul_add_words
+.end    bn_mulw_add_words
 .align  5
-.ent    bn_mul_add_words_internal
+.ent    bn_mulw_add_words_internal
-bn_mul_add_words_internal:
+bn_mulw_add_words_internal:
 ___
 $code.=<<___ if ($flavour =~ /nubi/i);
        .frame  $sp,6*$SZREG,$ra
@@ -140,9 +140,9 @@ $code.=<<___;
        .set    reorder
        li      $minus4,-4
        and     $ta0,$a2,$minus4
-        beqz    $ta0,.L_bn_mul_add_words_tail
+        beqz    $ta0,.L_bn_mulw_add_words_tail
-.L_bn_mul_add_words_loop:
+.L_bn_mulw_add_words_loop:
        $LD     $t0,0($a1)
        $MULTU  $t0,$a3
        $LD     $t1,0($a0)
@@ -201,13 +201,13 @@ $code.=<<___;
        sltu    $at,$ta3,$at
        $ST     $ta3,-$BNSZ($a0)
        .set    noreorder
-        bgtz    $ta0,.L_bn_mul_add_words_loop
+        bgtz    $ta0,.L_bn_mulw_add_words_loop
        $ADDU   $v0,$at
-        beqz    $a2,.L_bn_mul_add_words_return
+        beqz    $a2,.L_bn_mulw_add_words_return
        nop
-.L_bn_mul_add_words_tail:
+.L_bn_mulw_add_words_tail:
        .set    reorder
        $LD     $t0,0($a1)
        $MULTU  $t0,$a3
@@ -222,7 +222,7 @@ $code.=<<___;
        sltu    $at,$t1,$at
        $ST     $t1,0($a0)
        $ADDU   $v0,$at
-        beqz    $a2,.L_bn_mul_add_words_return
+        beqz    $a2,.L_bn_mulw_add_words_return
        $LD     $t0,$BNSZ($a1)
        $MULTU  $t0,$a3
@@ -237,7 +237,7 @@ $code.=<<___;
        sltu    $at,$t1,$at
        $ST     $t1,$BNSZ($a0)
        $ADDU   $v0,$at
-        beqz    $a2,.L_bn_mul_add_words_return
+        beqz    $a2,.L_bn_mulw_add_words_return
        $LD     $t0,2*$BNSZ($a1)
        $MULTU  $t0,$a3
@@ -252,7 +252,7 @@ $code.=<<___;
        $ST     $t1,2*$BNSZ($a0)
        $ADDU   $v0,$at
-.L_bn_mul_add_words_return:
+.L_bn_mulw_add_words_return:
        .set    noreorder
 ___
 $code.=<<___ if ($flavour =~ /nubi/i);
@@ -266,22 +266,22 @@ ___
 $code.=<<___;
        jr      $ra
        move    $a0,$v0
-.end    bn_mul_add_words_internal
+.end    bn_mulw_add_words_internal
 .align  5
-.globl  bn_mul_words
+.globl  bn_mulw_words
-.ent    bn_mul_words
+.ent    bn_mulw_words
-bn_mul_words:
+bn_mulw_words:
        .set    noreorder
-        bgtz    $a2,bn_mul_words_internal
+        bgtz    $a2,bn_mulw_words_internal
        move    $v0,$zero
        jr      $ra
        move    $a0,$v0
-.end    bn_mul_words
+.end    bn_mulw_words
 .align  5
-.ent    bn_mul_words_internal
+.ent    bn_mulw_words_internal
-bn_mul_words_internal:
+bn_mulw_words_internal:
 ___
 $code.=<<___ if ($flavour =~ /nubi/i);
        .frame  $sp,6*$SZREG,$ra
@@ -299,9 +299,9 @@ $code.=<<___;
        .set    reorder
        li      $minus4,-4
        and     $ta0,$a2,$minus4
-        beqz    $ta0,.L_bn_mul_words_tail
+        beqz    $ta0,.L_bn_mulw_words_tail
-.L_bn_mul_words_loop:
+.L_bn_mulw_words_loop:
        $LD     $t0,0($a1)
        $MULTU  $t0,$a3
        $LD     $t2,$BNSZ($a1)
@@ -341,13 +341,13 @@ $code.=<<___;
        sltu    $ta3,$v0,$at
        $ST     $v0,-$BNSZ($a0)
        .set    noreorder
-        bgtz    $ta0,.L_bn_mul_words_loop
+        bgtz    $ta0,.L_bn_mulw_words_loop
        $ADDU   $v0,$ta3,$ta2
-        beqz    $a2,.L_bn_mul_words_return
+        beqz    $a2,.L_bn_mulw_words_return
        nop
-.L_bn_mul_words_tail:
+.L_bn_mulw_words_tail:
        .set    reorder
        $LD     $t0,0($a1)
        $MULTU  $t0,$a3
@@ -358,7 +358,7 @@ $code.=<<___;
        sltu    $t1,$v0,$at
        $ST     $v0,0($a0)
        $ADDU   $v0,$t1,$t0
-        beqz    $a2,.L_bn_mul_words_return
+        beqz    $a2,.L_bn_mulw_words_return
        $LD     $t0,$BNSZ($a1)
        $MULTU  $t0,$a3
@@ -369,7 +369,7 @@ $code.=<<___;
        sltu    $t1,$v0,$at
        $ST     $v0,$BNSZ($a0)
        $ADDU   $v0,$t1,$t0
-        beqz    $a2,.L_bn_mul_words_return
+        beqz    $a2,.L_bn_mulw_words_return
        $LD     $t0,2*$BNSZ($a1)
        $MULTU  $t0,$a3
@@ -380,7 +380,7 @@ $code.=<<___;
        $ST     $v0,2*$BNSZ($a0)
        $ADDU   $v0,$t1,$t0
-.L_bn_mul_words_return:
+.L_bn_mulw_words_return:
        .set    noreorder
 ___
 $code.=<<___ if ($flavour =~ /nubi/i);
@@ -394,22 +394,22 @@ ___
 $code.=<<___;
        jr      $ra
        move    $a0,$v0
-.end    bn_mul_words_internal
+.end    bn_mulw_words_internal
 .align  5
-.globl  bn_sqr_words
+.globl  bn_sqr_word_wise
-.ent    bn_sqr_words
+.ent    bn_sqr_word_wise
-bn_sqr_words:
+bn_sqr_word_wise:
        .set    noreorder
-        bgtz    $a2,bn_sqr_words_internal
+        bgtz    $a2,bn_sqr_word_wise_internal
        move    $v0,$zero
        jr      $ra
        move    $a0,$v0
-.end    bn_sqr_words
+.end    bn_sqr_word_wise
 .align  5
-.ent    bn_sqr_words_internal
+.ent    bn_sqr_word_wise_internal
-bn_sqr_words_internal:
+bn_sqr_word_wise_internal:
 ___
 $code.=<<___ if ($flavour =~ /nubi/i);
        .frame  $sp,6*$SZREG,$ra
@@ -427,9 +427,9 @@ $code.=<<___;
        .set    reorder
        li      $minus4,-4
        and     $ta0,$a2,$minus4
-        beqz    $ta0,.L_bn_sqr_words_tail
+        beqz    $ta0,.L_bn_sqr_word_wise_tail
-.L_bn_sqr_words_loop:
+.L_bn_sqr_word_wise_loop:
        $LD     $t0,0($a1)
        $MULTU  $t0,$t0
        $LD     $t2,$BNSZ($a1)
@@ -463,13 +463,13 @@ $code.=<<___;
        $ST     $ta3,-2*$BNSZ($a0)
        .set    noreorder
-        bgtz    $ta0,.L_bn_sqr_words_loop
+        bgtz    $ta0,.L_bn_sqr_word_wise_loop
        $ST     $ta2,-$BNSZ($a0)
-        beqz    $a2,.L_bn_sqr_words_return
+        beqz    $a2,.L_bn_sqr_word_wise_return
        nop
-.L_bn_sqr_words_tail:
+.L_bn_sqr_word_wise_tail:
        .set    reorder
        $LD     $t0,0($a1)
        $MULTU  $t0,$t0
@@ -478,7 +478,7 @@ $code.=<<___;
        mfhi    $t0
        $ST     $t1,0($a0)
        $ST     $t0,$BNSZ($a0)
-        beqz    $a2,.L_bn_sqr_words_return
+        beqz    $a2,.L_bn_sqr_word_wise_return
        $LD     $t0,$BNSZ($a1)
        $MULTU  $t0,$t0
@@ -487,7 +487,7 @@ $code.=<<___;
        mfhi    $t0
        $ST     $t1,2*$BNSZ($a0)
        $ST     $t0,3*$BNSZ($a0)
-        beqz    $a2,.L_bn_sqr_words_return
+        beqz    $a2,.L_bn_sqr_word_wise_return
        $LD     $t0,2*$BNSZ($a1)
        $MULTU  $t0,$t0
@@ -496,7 +496,7 @@ $code.=<<___;
        $ST     $t1,4*$BNSZ($a0)
        $ST     $t0,5*$BNSZ($a0)
-.L_bn_sqr_words_return:
+.L_bn_sqr_word_wise_return:
        .set    noreorder
 ___
 $code.=<<___ if ($flavour =~ /nubi/i);
@@ -511,7 +511,7 @@ $code.=<<___;
        jr      $ra
        move    $a0,$v0
-.end    bn_sqr_words_internal
+.end    bn_sqr_word_wise_internal
 .align  5
 .globl  bn_add_words
diff --git a/src/lib/libcrypto/bn/asm/ppc.pl b/src/lib/libcrypto/bn/asm/ppc.pl
index c9b7f9477d..9b8dc55bff 100644
--- a/src/lib/libcrypto/bn/asm/ppc.pl
+++ b/src/lib/libcrypto/bn/asm/ppc.pl
@@ -204,9 +204,9 @@ $data=<<EOF;
 #       bn_sub_words
 #       bn_add_words
 #       bn_div_words
-#       bn_sqr_words
+#       bn_sqr_word_wise
-#       bn_mul_words
+#       bn_mulw_words
-#       bn_mul_add_words
+#       bn_mulw_add_words
 #
 #       NOTE:   It is possible to optimize this code more for
 #       specific PowerPC or Power architectures. On the Northstar
@@ -248,9 +248,9 @@ $data=<<EOF;
        .globl  .bn_sub_words
        .globl  .bn_add_words
        .globl  .bn_div_words
-        .globl  .bn_sqr_words
+        .globl  .bn_sqr_word_wise
-        .globl  .bn_mul_words
+        .globl  .bn_mulw_words
-        .globl  .bn_mul_add_words
+        .globl  .bn_mulw_add_words
        
 # .text section
        
@@ -1702,16 +1702,16 @@ Lppcasm_div9:
 #
 #       NOTE:   The following label name should be changed to
-#               "bn_sqr_words" i.e. remove the first dot
+#               "bn_sqr_word_wise" i.e. remove the first dot
 #               for the gcc compiler. This should be automatically
 #               done in the build
 #
 .align  4
-.bn_sqr_words:
+.bn_sqr_word_wise:
 #
-#       Optimized version of bn_sqr_words
+#       Optimized version of bn_sqr_word_wise
 #
-#       void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
+#       void bn_sqr_word_wise(BN_ULONG *r, BN_ULONG *a, int n)
 #
 #       r3 = r
 #       r4 = a
@@ -1740,15 +1740,15 @@ Lppcasm_sqr_adios:
 #
 #       NOTE:   The following label name should be changed to
-#               "bn_mul_words" i.e. remove the first dot
+#               "bn_mulw_words" i.e. remove the first dot
 #               for the gcc compiler. This should be automatically
 #               done in the build
 #
 .align  4       
-.bn_mul_words:
+.bn_mulw_words:
 #
-# BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
+# BN_ULONG bn_mulw_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
 #
 # r3 = rp
 # r4 = ap
@@ -1842,15 +1842,15 @@ Lppcasm_mw_OVER:
 #
 #       NOTE:   The following label name should be changed to
-#               "bn_mul_add_words" i.e. remove the first dot
+#               "bn_mulw_add_words" i.e. remove the first dot
 #               for the gcc compiler. This should be automatically
 #               done in the build
 #
 .align  4
-.bn_mul_add_words:
+.bn_mulw_add_words:
 #
-# BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
+# BN_ULONG bn_mulw_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
 #
 # r3 = rp
 # r4 = ap
diff --git a/src/lib/libcrypto/bn/asm/x86-mont.pl b/src/lib/libcrypto/bn/asm/x86-mont.pl
index 6524651748..3be440f11f 100755
--- a/src/lib/libcrypto/bn/asm/x86-mont.pl
+++ b/src/lib/libcrypto/bn/asm/x86-mont.pl
@@ -32,8 +32,7 @@ require "x86asm.pl";
 &asm_init($ARGV[0],$0);
-$sse2=0;
+$sse2=1;
-for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
 &external_label("OPENSSL_ia32cap_P") if ($sse2);
diff --git a/src/lib/libcrypto/bn/bn.h b/src/lib/libcrypto/bn/bn.h
index 7c3c0b142f..3f9e24a868 100644
--- a/src/lib/libcrypto/bn/bn.h
+++ b/src/lib/libcrypto/bn/bn.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: bn.h,v 1.80 2025/03/09 15:22:40 tb Exp $ */
+/* $OpenBSD: bn.h,v 1.85 2025/12/05 17:25:55 tb Exp $ */
 /* Copyright (C) 1995-1997 Eric Young (eay@cryptsoft.com)
 * All rights reserved.
 *
@@ -125,6 +125,8 @@
 #ifndef HEADER_BN_H
 #define HEADER_BN_H
+#include <inttypes.h>
+#include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -138,59 +140,17 @@
 extern "C" {
 #endif
-/* This next option uses the C libraries (2 word)/(1 word) function.
+#if defined(_LP64) || defined(_WIN64)
- * If it is not defined, I use my C version (which is slower).
- * The reason for this flag is that when the particular C compiler
- * library routine is used, and the library is linked with a different
- * compiler, the library is missing.  This mostly happens when the
- * library is built with gcc and then linked using normal cc.  This would
- * be a common occurrence because gcc normally produces code that is
- * 2 times faster than system compilers for the big number stuff.
- * For machines with only one compiler (or shared libraries), this should
- * be on.  Again this in only really a problem on machines
- * using "long long's", are 32bit, and are not using my assembler code. */
-/* #define BN_DIV2W */
-#ifdef _LP64
 #undef  BN_LLONG
-#define BN_ULONG        unsigned long
+#define BN_ULONG        uint64_t
-#define BN_LONG         long
-#define BN_BITS         128
 #define BN_BYTES        8
 #define BN_BITS2        64
-#define BN_BITS4        32
-#define BN_MASK2        (0xffffffffffffffffL)
-#define BN_MASK2l       (0xffffffffL)
-#define BN_MASK2h       (0xffffffff00000000L)
-#define BN_MASK2h1      (0xffffffff80000000L)
-#define BN_TBIT         (0x8000000000000000L)
-#define BN_DEC_CONV     (10000000000000000000UL)
-#define BN_DEC_FMT1     "%lu"
-#define BN_DEC_FMT2     "%019lu"
-#define BN_DEC_NUM      19
-#define BN_HEX_FMT1     "%lX"
-#define BN_HEX_FMT2     "%016lX"
 #else
-#define BN_ULLONG       unsigned long long
+#define BN_ULLONG       uint64_t
 #define BN_LLONG
-#define BN_ULONG        unsigned int
+#define BN_ULONG        uint32_t
-#define BN_LONG         int
-#define BN_BITS         64
 #define BN_BYTES        4
 #define BN_BITS2        32
-#define BN_BITS4        16
-#define BN_MASK         (0xffffffffffffffffLL)
-#define BN_MASK2        (0xffffffffL)
-#define BN_MASK2l       (0xffff)
-#define BN_MASK2h1      (0xffff8000L)
-#define BN_MASK2h       (0xffff0000L)
-#define BN_TBIT         (0x80000000L)
-#define BN_DEC_CONV     (1000000000L)
-#define BN_DEC_FMT1     "%u"
-#define BN_DEC_FMT2     "%09u"
-#define BN_DEC_NUM      9
-#define BN_HEX_FMT1     "%X"
-#define BN_HEX_FMT2     "%08X"
 #endif
 #define BN_FLG_MALLOCED         0x01
diff --git a/src/lib/libcrypto/bn/bn_add.c b/src/lib/libcrypto/bn/bn_add.c
index 86768a312a..81fa60e429 100644
--- a/src/lib/libcrypto/bn/bn_add.c
+++ b/src/lib/libcrypto/bn/bn_add.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: bn_add.c,v 1.26 2023/07/08 12:21:58 beck Exp $ */
+/* $OpenBSD: bn_add.c,v 1.29 2025/05/25 04:53:05 jsing Exp $ */
 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
 * All rights reserved.
 *
@@ -60,44 +60,10 @@
 #include <limits.h>
 #include <stdio.h>
-#include <openssl/err.h>
 #include "bn_arch.h"
 #include "bn_local.h"
 #include "bn_internal.h"
+#include "err_local.h"
-/*
- * bn_add_words() computes (carry:r[i]) = a[i] + b[i] + carry, where a and b
- * are both arrays of words. Any carry resulting from the addition is returned.
- */
-#ifndef HAVE_BN_ADD_WORDS
-BN_ULONG
-bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
-{
-        BN_ULONG carry = 0;
-        assert(n >= 0);
-        if (n <= 0)
-                return 0;
-        while (n & ~3) {
-                bn_qwaddqw(a[3], a[2], a[1], a[0], b[3], b[2], b[1], b[0],
-                    carry, &carry, &r[3], &r[2], &r[1], &r[0]);
-                a += 4;
-                b += 4;
-                r += 4;
-                n -= 4;
-        }
-        while (n) {
-                bn_addw_addw(a[0], b[0], carry, &carry, &r[0]);
-                a++;
-                b++;
-                r++;
-                n--;
-        }
-        return carry;
-}
-#endif
 /*
 * bn_add() computes (carry:r[i]) = a[i] + b[i] + carry, where a and b are both
@@ -147,40 +113,6 @@ bn_add(BN_ULONG *r, int r_len, const BN_ULONG *a, int a_len, const BN_ULONG *b,
 #endif
 /*
- * bn_sub_words() computes (borrow:r[i]) = a[i] - b[i] - borrow, where a and b
- * are both arrays of words. Any borrow resulting from the subtraction is
- * returned.
- */
-#ifndef HAVE_BN_SUB_WORDS
-BN_ULONG
-bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
-{
-        BN_ULONG borrow = 0;
-        assert(n >= 0);
-        if (n <= 0)
-                return 0;
-        while (n & ~3) {
-                bn_qwsubqw(a[3], a[2], a[1], a[0], b[3], b[2], b[1], b[0],
-                    borrow, &borrow, &r[3], &r[2], &r[1], &r[0]);
-                a += 4;
-                b += 4;
-                r += 4;
-                n -= 4;
-        }
-        while (n) {
-                bn_subw_subw(a[0], b[0], borrow, &borrow, &r[0]);
-                a++;
-                b++;
-                r++;
-                n--;
-        }
-        return borrow;
-}
-#endif
-/*
 * bn_sub() computes (borrow:r[i]) = a[i] - b[i] - borrow, where a and b are both
 * arrays of words (r may be the same as a or b). The length of a and b may
 * differ, while r must be at least max(a_len, b_len) in length. Any borrow
@@ -208,7 +140,7 @@ bn_sub(BN_ULONG *r, int r_len, const BN_ULONG *a, int a_len, const BN_ULONG *b,
        /* XXX - consider doing four at a time to match bn_sub_words. */
        while (diff_len < 0) {
                /* Compute r[0] = 0 - b[0] - borrow. */
-                bn_subw(0 - b[0], borrow, &borrow, &r[0]);
+                bn_subw_subw(0, b[0], borrow, &borrow, &r[0]);
                diff_len++;
                b++;
                r++;
@@ -217,7 +149,7 @@ bn_sub(BN_ULONG *r, int r_len, const BN_ULONG *a, int a_len, const BN_ULONG *b,
        /* XXX - consider doing four at a time to match bn_sub_words. */
        while (diff_len > 0) {
                /* Compute r[0] = a[0] - 0 - borrow. */
-                bn_subw(a[0], borrow, &borrow, &r[0]);
+                bn_subw_subw(a[0], 0, borrow, &borrow, &r[0]);
                diff_len--;
                a++;
                r++;
diff --git a/src/lib/libcrypto/bn/bn_add_sub.c b/src/lib/libcrypto/bn/bn_add_sub.c
new file mode 100644
index 0000000000..5c9d5a2b1a
--- /dev/null
+++ b/src/lib/libcrypto/bn/bn_add_sub.c
@@ -0,0 +1,178 @@
+/* $OpenBSD: bn_add_sub.c,v 1.1 2025/05/25 04:30:55 jsing Exp $ */
+/*
+ * Copyright (c) 2023,2024,2025 Joel Sing <jsing@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+#include <openssl/bn.h>
+#include "bn_internal.h"
+/*
+ * bn_add_words() computes (carry:r[i]) = a[i] + b[i] + carry, where a and b
+ * are both arrays of words. Any carry resulting from the addition is returned.
+ *
+ * Exactly n words are read from each of a and b and n words are written to r.
+ * The carry starts at zero and is threaded from each step into the next. If
+ * n <= 0 neither loop runs, nothing is written and zero is returned.
+ *
+ * This C version is only compiled when HAVE_BN_ADD_WORDS is not defined
+ * (presumably an architecture-specific implementation defines it - confirm).
+ */
+#ifndef HAVE_BN_ADD_WORDS
+BN_ULONG
+bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
+{
+        BN_ULONG carry = 0;
+        while (n >= 4) { /* Bulk: four words per step. */
+                bn_qwaddqw(a[3], a[2], a[1], a[0], b[3], b[2], b[1], b[0],
+                    carry, &carry, &r[3], &r[2], &r[1], &r[0]);
+                a += 4;
+                b += 4;
+                r += 4;
+                n -= 4;
+        }
+        while (n > 0) { /* Tail: whatever is left, one word per step. */
+                bn_addw_addw(a[0], b[0], carry, &carry, &r[0]);
+                a++;
+                b++;
+                r++;
+                n--;
+        }
+        return carry;
+}
+#endif
+/*
+ * bn_sub_words() computes (borrow:r[i]) = a[i] - b[i] - borrow, where a and b
+ * are both arrays of words. Any borrow resulting from the subtraction is
+ * returned.
+ *
+ * Exactly n words are read from each of a and b and n words are written to r.
+ * The borrow starts at zero and is threaded from each step into the next. If
+ * n <= 0 neither loop runs, nothing is written and zero is returned.
+ *
+ * This C version is only compiled when HAVE_BN_SUB_WORDS is not defined
+ * (presumably an architecture-specific implementation defines it - confirm).
+ */
+#ifndef HAVE_BN_SUB_WORDS
+BN_ULONG
+bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
+{
+        BN_ULONG borrow = 0;
+        while (n >= 4) { /* Bulk: four words per step. */
+                bn_qwsubqw(a[3], a[2], a[1], a[0], b[3], b[2], b[1], b[0],
+                    borrow, &borrow, &r[3], &r[2], &r[1], &r[0]);
+                a += 4;
+                b += 4;
+                r += 4;
+                n -= 4;
+        }
+        while (n > 0) { /* Tail: whatever is left, one word per step. */
+                bn_subw_subw(a[0], b[0], borrow, &borrow, &r[0]);
+                a++;
+                b++;
+                r++;
+                n--;
+        }
+        return borrow;
+}
+#endif
+/*
+ * bn_sub_words_borrow() computes a[i] - b[i] - borrow over the n words of a
+ * and b, returning the resulting borrow only.
+ *
+ * The per-word differences are written to a single local scratch word and
+ * never stored anywhere else, so neither input array is modified. The borrow
+ * starts at zero; with n == 0 neither loop runs and zero is returned. Note
+ * that n is a size_t here, whereas bn_sub_words() takes an int.
+ *
+ * This C version is only compiled when HAVE_BN_SUB_WORDS_BORROW is not
+ * defined.
+ */
+#ifndef HAVE_BN_SUB_WORDS_BORROW
+BN_ULONG
+bn_sub_words_borrow(const BN_ULONG *a, const BN_ULONG *b, size_t n)
+{
+        BN_ULONG borrow = 0;
+        BN_ULONG r; /* Scratch; the differences themselves are discarded. */
+        while (n >= 4) { /* Bulk: four words per step. */
+                bn_qwsubqw(a[3], a[2], a[1], a[0], b[3], b[2], b[1], b[0],
+                    borrow, &borrow, &r, &r, &r, &r);
+                a += 4;
+                b += 4;
+                n -= 4;
+        }
+        while (n > 0) { /* Tail: whatever is left, one word per step. */
+                bn_subw_subw(a[0], b[0], borrow, &borrow, &r);
+                a++;
+                b++;
+                n--;
+        }
+        return borrow;
+}
+#endif
+/*
+ * bn_add_words_masked() computes r[] = a[] + (b[] & mask), where a, b and r are
+ * arrays of words with length n (r may be the same as a or b).
+ *
+ * Every word of b is ANDed with mask before it is added, so a mask of zero
+ * adds nothing to a and a mask of all ones adds b unchanged (callers
+ * presumably pass only one of these two values - confirm). The carry starts
+ * at zero and the final carry is returned; with n == 0 nothing is written
+ * and zero is returned.
+ *
+ * This C version is only compiled when HAVE_BN_ADD_WORDS_MASKED is not
+ * defined.
+ */
+#ifndef HAVE_BN_ADD_WORDS_MASKED
+BN_ULONG
+bn_add_words_masked(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
+    BN_ULONG mask, size_t n)
+{
+        BN_ULONG carry = 0;
+        /* XXX - consider conditional/masked versions of bn_addw_addw/bn_qwaddqw. */
+        while (n >= 4) { /* Bulk: four words per step. */
+                bn_qwaddqw(a[3], a[2], a[1], a[0], b[3] & mask, b[2] & mask,
+                    b[1] & mask, b[0] & mask, carry, &carry, &r[3], &r[2],
+                    &r[1], &r[0]);
+                a += 4;
+                b += 4;
+                r += 4;
+                n -= 4;
+        }
+        while (n > 0) { /* Tail: whatever is left, one word per step. */
+                bn_addw_addw(a[0], b[0] & mask, carry, &carry, &r[0]);
+                a++;
+                b++;
+                r++;
+                n--;
+        }
+        return carry;
+}
+#endif
+/*
+ * bn_sub_words_masked() computes r[] = a[] - (b[] & mask), where a, b and r are
+ * arrays of words with length n (r may be the same as a or b).
+ *
+ * Every word of b is ANDed with mask before it is subtracted, so a mask of
+ * zero subtracts nothing from a and a mask of all ones subtracts b unchanged
+ * (callers presumably pass only one of these two values - confirm). The
+ * borrow starts at zero and the final borrow is returned; with n == 0
+ * nothing is written and zero is returned.
+ *
+ * This C version is only compiled when HAVE_BN_SUB_WORDS_MASKED is not
+ * defined.
+ */
+#ifndef HAVE_BN_SUB_WORDS_MASKED
+BN_ULONG
+bn_sub_words_masked(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
+    BN_ULONG mask, size_t n)
+{
+        BN_ULONG borrow = 0;
+        /* XXX - consider conditional/masked versions of bn_subw_subw/bn_qwsubqw. */
+        /* Compute conditional r[i] = a[i] - b[i]. */
+        while (n >= 4) { /* Bulk: four words per step. */
+                bn_qwsubqw(a[3], a[2], a[1], a[0], b[3] & mask, b[2] & mask,
+                    b[1] & mask, b[0] & mask, borrow, &borrow, &r[3], &r[2],
+                    &r[1], &r[0]);
+                a += 4;
+                b += 4;
+                r += 4;
+                n -= 4;
+        }
+        while (n > 0) { /* Tail: whatever is left, one word per step. */
+                bn_subw_subw(a[0], b[0] & mask, borrow, &borrow, &r[0]);
+                a++;
+                b++;
+                r++;
+                n--;
+        }
+        return borrow;
+}
+#endif
diff --git a/src/lib/libcrypto/bn/bn_const.c b/src/lib/libcrypto/bn/bn_const.c
index bf684c8a46..389e95ca15 100644
--- a/src/lib/libcrypto/bn/bn_const.c
+++ b/src/lib/libcrypto/bn/bn_const.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: bn_const.c,v 1.8 2023/07/28 10:07:30 tb Exp $ */
+/* $OpenBSD: bn_const.c,v 1.9 2026/01/23 08:29:04 tb Exp $ */
 /* Insert boilerplate */
 #include <openssl/bn.h>
@@ -431,3 +431,295 @@ BN_get_rfc3526_prime_8192(BIGNUM *bn)
        return BN_bin2bn(RFC3526_PRIME_8192, sizeof(RFC3526_PRIME_8192), bn);
 }
 LCRYPTO_ALIAS(BN_get_rfc3526_prime_8192);
+static const unsigned char RFC7919_PRIME_2048[] = {
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xAD, 0xF8, 0x54, 0x58,
+        0xA2, 0xBB, 0x4A, 0x9A, 0xAF, 0xDC, 0x56, 0x20, 0x27, 0x3D, 0x3C, 0xF1,
+        0xD8, 0xB9, 0xC5, 0x83, 0xCE, 0x2D, 0x36, 0x95, 0xA9, 0xE1, 0x36, 0x41,
+        0x14, 0x64, 0x33, 0xFB, 0xCC, 0x93, 0x9D, 0xCE, 0x24, 0x9B, 0x3E, 0xF9,
+        0x7D, 0x2F, 0xE3, 0x63, 0x63, 0x0C, 0x75, 0xD8, 0xF6, 0x81, 0xB2, 0x02,
+        0xAE, 0xC4, 0x61, 0x7A, 0xD3, 0xDF, 0x1E, 0xD5, 0xD5, 0xFD, 0x65, 0x61,
+        0x24, 0x33, 0xF5, 0x1F, 0x5F, 0x06, 0x6E, 0xD0, 0x85, 0x63, 0x65, 0x55,
+        0x3D, 0xED, 0x1A, 0xF3, 0xB5, 0x57, 0x13, 0x5E, 0x7F, 0x57, 0xC9, 0x35,
+        0x98, 0x4F, 0x0C, 0x70, 0xE0, 0xE6, 0x8B, 0x77, 0xE2, 0xA6, 0x89, 0xDA,
+        0xF3, 0xEF, 0xE8, 0x72, 0x1D, 0xF1, 0x58, 0xA1, 0x36, 0xAD, 0xE7, 0x35,
+        0x30, 0xAC, 0xCA, 0x4F, 0x48, 0x3A, 0x79, 0x7A, 0xBC, 0x0A, 0xB1, 0x82,
+        0xB3, 0x24, 0xFB, 0x61, 0xD1, 0x08, 0xA9, 0x4B, 0xB2, 0xC8, 0xE3, 0xFB,
+        0xB9, 0x6A, 0xDA, 0xB7, 0x60, 0xD7, 0xF4, 0x68, 0x1D, 0x4F, 0x42, 0xA3,
+        0xDE, 0x39, 0x4D, 0xF4, 0xAE, 0x56, 0xED, 0xE7, 0x63, 0x72, 0xBB, 0x19,
+        0x0B, 0x07, 0xA7, 0xC8, 0xEE, 0x0A, 0x6D, 0x70, 0x9E, 0x02, 0xFC, 0xE1,
+        0xCD, 0xF7, 0xE2, 0xEC, 0xC0, 0x34, 0x04, 0xCD, 0x28, 0x34, 0x2F, 0x61,
+        0x91, 0x72, 0xFE, 0x9C, 0xE9, 0x85, 0x83, 0xFF, 0x8E, 0x4F, 0x12, 0x32,
+        0xEE, 0xF2, 0x81, 0x83, 0xC3, 0xFE, 0x3B, 0x1B, 0x4C, 0x6F, 0xAD, 0x73,
+        0x3B, 0xB5, 0xFC, 0xBC, 0x2E, 0xC2, 0x20, 0x05, 0xC5, 0x8E, 0xF1, 0x83,
+        0x7D, 0x16, 0x83, 0xB2, 0xC6, 0xF3, 0x4A, 0x26, 0xC1, 0xB2, 0xEF, 0xFA,
+        0x88, 0x6B, 0x42, 0x38, 0x61, 0x28, 0x5C, 0x97, 0xFF, 0xFF, 0xFF, 0xFF,
+        0xFF, 0xFF, 0xFF, 0xFF,
+};
+/*
+ * BN_get_rfc7919_prime_2048() passes the 256-byte (2048-bit)
+ * RFC7919_PRIME_2048 table above, its size and the caller's bn to BN_bin2bn()
+ * and returns the result unchanged. See BN_bin2bn(3) for how a NULL bn and
+ * failures are handled.
+ * NOTE(review): presumably the ffdhe2048 group prime of RFC 7919 - confirm
+ * the table against the RFC.
+ */
+BIGNUM *
+BN_get_rfc7919_prime_2048(BIGNUM *bn)
+{
+        return BN_bin2bn(RFC7919_PRIME_2048, sizeof(RFC7919_PRIME_2048), bn);
+}
+static const unsigned char RFC7919_PRIME_3072[] = {
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xAD, 0xF8, 0x54, 0x58,
+        0xA2, 0xBB, 0x4A, 0x9A, 0xAF, 0xDC, 0x56, 0x20, 0x27, 0x3D, 0x3C, 0xF1,
+        0xD8, 0xB9, 0xC5, 0x83, 0xCE, 0x2D, 0x36, 0x95, 0xA9, 0xE1, 0x36, 0x41,
+        0x14, 0x64, 0x33, 0xFB, 0xCC, 0x93, 0x9D, 0xCE, 0x24, 0x9B, 0x3E, 0xF9,
+        0x7D, 0x2F, 0xE3, 0x63, 0x63, 0x0C, 0x75, 0xD8, 0xF6, 0x81, 0xB2, 0x02,
+        0xAE, 0xC4, 0x61, 0x7A, 0xD3, 0xDF, 0x1E, 0xD5, 0xD5, 0xFD, 0x65, 0x61,
+        0x24, 0x33, 0xF5, 0x1F, 0x5F, 0x06, 0x6E, 0xD0, 0x85, 0x63, 0x65, 0x55,
+        0x3D, 0xED, 0x1A, 0xF3, 0xB5, 0x57, 0x13, 0x5E, 0x7F, 0x57, 0xC9, 0x35,
+        0x98, 0x4F, 0x0C, 0x70, 0xE0, 0xE6, 0x8B, 0x77, 0xE2, 0xA6, 0x89, 0xDA,
+        0xF3, 0xEF, 0xE8, 0x72, 0x1D, 0xF1, 0x58, 0xA1, 0x36, 0xAD, 0xE7, 0x35,
+        0x30, 0xAC, 0xCA, 0x4F, 0x48, 0x3A, 0x79, 0x7A, 0xBC, 0x0A, 0xB1, 0x82,
+        0xB3, 0x24, 0xFB, 0x61, 0xD1, 0x08, 0xA9, 0x4B, 0xB2, 0xC8, 0xE3, 0xFB,
+        0xB9, 0x6A, 0xDA, 0xB7, 0x60, 0xD7, 0xF4, 0x68, 0x1D, 0x4F, 0x42, 0xA3,
+        0xDE, 0x39, 0x4D, 0xF4, 0xAE, 0x56, 0xED, 0xE7, 0x63, 0x72, 0xBB, 0x19,
+        0x0B, 0x07, 0xA7, 0xC8, 0xEE, 0x0A, 0x6D, 0x70, 0x9E, 0x02, 0xFC, 0xE1,
+        0xCD, 0xF7, 0xE2, 0xEC, 0xC0, 0x34, 0x04, 0xCD, 0x28, 0x34, 0x2F, 0x61,
+        0x91, 0x72, 0xFE, 0x9C, 0xE9, 0x85, 0x83, 0xFF, 0x8E, 0x4F, 0x12, 0x32,
+        0xEE, 0xF2, 0x81, 0x83, 0xC3, 0xFE, 0x3B, 0x1B, 0x4C, 0x6F, 0xAD, 0x73,
+        0x3B, 0xB5, 0xFC, 0xBC, 0x2E, 0xC2, 0x20, 0x05, 0xC5, 0x8E, 0xF1, 0x83,
+        0x7D, 0x16, 0x83, 0xB2, 0xC6, 0xF3, 0x4A, 0x26, 0xC1, 0xB2, 0xEF, 0xFA,
+        0x88, 0x6B, 0x42, 0x38, 0x61, 0x1F, 0xCF, 0xDC, 0xDE, 0x35, 0x5B, 0x3B,
+        0x65, 0x19, 0x03, 0x5B, 0xBC, 0x34, 0xF4, 0xDE, 0xF9, 0x9C, 0x02, 0x38,
+        0x61, 0xB4, 0x6F, 0xC9, 0xD6, 0xE6, 0xC9, 0x07, 0x7A, 0xD9, 0x1D, 0x26,
+        0x91, 0xF7, 0xF7, 0xEE, 0x59, 0x8C, 0xB0, 0xFA, 0xC1, 0x86, 0xD9, 0x1C,
+        0xAE, 0xFE, 0x13, 0x09, 0x85, 0x13, 0x92, 0x70, 0xB4, 0x13, 0x0C, 0x93,
+        0xBC, 0x43, 0x79, 0x44, 0xF4, 0xFD, 0x44, 0x52, 0xE2, 0xD7, 0x4D, 0xD3,
+        0x64, 0xF2, 0xE2, 0x1E, 0x71, 0xF5, 0x4B, 0xFF, 0x5C, 0xAE, 0x82, 0xAB,
+        0x9C, 0x9D, 0xF6, 0x9E, 0xE8, 0x6D, 0x2B, 0xC5, 0x22, 0x36, 0x3A, 0x0D,
+        0xAB, 0xC5, 0x21, 0x97, 0x9B, 0x0D, 0xEA, 0xDA, 0x1D, 0xBF, 0x9A, 0x42,
+        0xD5, 0xC4, 0x48, 0x4E, 0x0A, 0xBC, 0xD0, 0x6B, 0xFA, 0x53, 0xDD, 0xEF,
+        0x3C, 0x1B, 0x20, 0xEE, 0x3F, 0xD5, 0x9D, 0x7C, 0x25, 0xE4, 0x1D, 0x2B,
+        0x66, 0xC6, 0x2E, 0x37, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+};
+/*
+ * BN_get_rfc7919_prime_3072() passes the 384-byte (3072-bit)
+ * RFC7919_PRIME_3072 table above, its size and the caller's bn to BN_bin2bn()
+ * and returns the result unchanged. See BN_bin2bn(3) for how a NULL bn and
+ * failures are handled.
+ * NOTE(review): presumably the ffdhe3072 group prime of RFC 7919 - confirm
+ * the table against the RFC.
+ */
+BIGNUM *
+BN_get_rfc7919_prime_3072(BIGNUM *bn)
+{
+        return BN_bin2bn(RFC7919_PRIME_3072, sizeof(RFC7919_PRIME_3072), bn);
+}
+static const unsigned char RFC7919_PRIME_4096[] = {
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xAD, 0xF8, 0x54, 0x58,
+        0xA2, 0xBB, 0x4A, 0x9A, 0xAF, 0xDC, 0x56, 0x20, 0x27, 0x3D, 0x3C, 0xF1,
+        0xD8, 0xB9, 0xC5, 0x83, 0xCE, 0x2D, 0x36, 0x95, 0xA9, 0xE1, 0x36, 0x41,
+        0x14, 0x64, 0x33, 0xFB, 0xCC, 0x93, 0x9D, 0xCE, 0x24, 0x9B, 0x3E, 0xF9,
+        0x7D, 0x2F, 0xE3, 0x63, 0x63, 0x0C, 0x75, 0xD8, 0xF6, 0x81, 0xB2, 0x02,
+        0xAE, 0xC4, 0x61, 0x7A, 0xD3, 0xDF, 0x1E, 0xD5, 0xD5, 0xFD, 0x65, 0x61,
+        0x24, 0x33, 0xF5, 0x1F, 0x5F, 0x06, 0x6E, 0xD0, 0x85, 0x63, 0x65, 0x55,
+        0x3D, 0xED, 0x1A, 0xF3, 0xB5, 0x57, 0x13, 0x5E, 0x7F, 0x57, 0xC9, 0x35,
+        0x98, 0x4F, 0x0C, 0x70, 0xE0, 0xE6, 0x8B, 0x77, 0xE2, 0xA6, 0x89, 0xDA,
+        0xF3, 0xEF, 0xE8, 0x72, 0x1D, 0xF1, 0x58, 0xA1, 0x36, 0xAD, 0xE7, 0x35,
+        0x30, 0xAC, 0xCA, 0x4F, 0x48, 0x3A, 0x79, 0x7A, 0xBC, 0x0A, 0xB1, 0x82,
+        0xB3, 0x24, 0xFB, 0x61, 0xD1, 0x08, 0xA9, 0x4B, 0xB2, 0xC8, 0xE3, 0xFB,
+        0xB9, 0x6A, 0xDA, 0xB7, 0x60, 0xD7, 0xF4, 0x68, 0x1D, 0x4F, 0x42, 0xA3,
+        0xDE, 0x39, 0x4D, 0xF4, 0xAE, 0x56, 0xED, 0xE7, 0x63, 0x72, 0xBB, 0x19,
+        0x0B, 0x07, 0xA7, 0xC8, 0xEE, 0x0A, 0x6D, 0x70, 0x9E, 0x02, 0xFC, 0xE1,
+        0xCD, 0xF7, 0xE2, 0xEC, 0xC0, 0x34, 0x04, 0xCD, 0x28, 0x34, 0x2F, 0x61,
+        0x91, 0x72, 0xFE, 0x9C, 0xE9, 0x85, 0x83, 0xFF, 0x8E, 0x4F, 0x12, 0x32,
+        0xEE, 0xF2, 0x81, 0x83, 0xC3, 0xFE, 0x3B, 0x1B, 0x4C, 0x6F, 0xAD, 0x73,
+        0x3B, 0xB5, 0xFC, 0xBC, 0x2E, 0xC2, 0x20, 0x05, 0xC5, 0x8E, 0xF1, 0x83,
+        0x7D, 0x16, 0x83, 0xB2, 0xC6, 0xF3, 0x4A, 0x26, 0xC1, 0xB2, 0xEF, 0xFA,
+        0x88, 0x6B, 0x42, 0x38, 0x61, 0x1F, 0xCF, 0xDC, 0xDE, 0x35, 0x5B, 0x3B,
+        0x65, 0x19, 0x03, 0x5B, 0xBC, 0x34, 0xF4, 0xDE, 0xF9, 0x9C, 0x02, 0x38,
+        0x61, 0xB4, 0x6F, 0xC9, 0xD6, 0xE6, 0xC9, 0x07, 0x7A, 0xD9, 0x1D, 0x26,
+        0x91, 0xF7, 0xF7, 0xEE, 0x59, 0x8C, 0xB0, 0xFA, 0xC1, 0x86, 0xD9, 0x1C,
+        0xAE, 0xFE, 0x13, 0x09, 0x85, 0x13, 0x92, 0x70, 0xB4, 0x13, 0x0C, 0x93,
+        0xBC, 0x43, 0x79, 0x44, 0xF4, 0xFD, 0x44, 0x52, 0xE2, 0xD7, 0x4D, 0xD3,
+        0x64, 0xF2, 0xE2, 0x1E, 0x71, 0xF5, 0x4B, 0xFF, 0x5C, 0xAE, 0x82, 0xAB,
+        0x9C, 0x9D, 0xF6, 0x9E, 0xE8, 0x6D, 0x2B, 0xC5, 0x22, 0x36, 0x3A, 0x0D,
+        0xAB, 0xC5, 0x21, 0x97, 0x9B, 0x0D, 0xEA, 0xDA, 0x1D, 0xBF, 0x9A, 0x42,
+        0xD5, 0xC4, 0x48, 0x4E, 0x0A, 0xBC, 0xD0, 0x6B, 0xFA, 0x53, 0xDD, 0xEF,
+        0x3C, 0x1B, 0x20, 0xEE, 0x3F, 0xD5, 0x9D, 0x7C, 0x25, 0xE4, 0x1D, 0x2B,
+        0x66, 0x9E, 0x1E, 0xF1, 0x6E, 0x6F, 0x52, 0xC3, 0x16, 0x4D, 0xF4, 0xFB,
+        0x79, 0x30, 0xE9, 0xE4, 0xE5, 0x88, 0x57, 0xB6, 0xAC, 0x7D, 0x5F, 0x42,
+        0xD6, 0x9F, 0x6D, 0x18, 0x77, 0x63, 0xCF, 0x1D, 0x55, 0x03, 0x40, 0x04,
+        0x87, 0xF5, 0x5B, 0xA5, 0x7E, 0x31, 0xCC, 0x7A, 0x71, 0x35, 0xC8, 0x86,
+        0xEF, 0xB4, 0x31, 0x8A, 0xED, 0x6A, 0x1E, 0x01, 0x2D, 0x9E, 0x68, 0x32,
+        0xA9, 0x07, 0x60, 0x0A, 0x91, 0x81, 0x30, 0xC4, 0x6D, 0xC7, 0x78, 0xF9,
+        0x71, 0xAD, 0x00, 0x38, 0x09, 0x29, 0x99, 0xA3, 0x33, 0xCB, 0x8B, 0x7A,
+        0x1A, 0x1D, 0xB9, 0x3D, 0x71, 0x40, 0x00, 0x3C, 0x2A, 0x4E, 0xCE, 0xA9,
+        0xF9, 0x8D, 0x0A, 0xCC, 0x0A, 0x82, 0x91, 0xCD, 0xCE, 0xC9, 0x7D, 0xCF,
+        0x8E, 0xC9, 0xB5, 0x5A, 0x7F, 0x88, 0xA4, 0x6B, 0x4D, 0xB5, 0xA8, 0x51,
+        0xF4, 0x41, 0x82, 0xE1, 0xC6, 0x8A, 0x00, 0x7E, 0x5E, 0x65, 0x5F, 0x6A,
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+};
+/*
+ * BN_get_rfc7919_prime_4096() passes the 512-byte (4096-bit)
+ * RFC7919_PRIME_4096 table above, its size and the caller's bn to BN_bin2bn()
+ * and returns the result unchanged. See BN_bin2bn(3) for how a NULL bn and
+ * failures are handled.
+ * NOTE(review): presumably the ffdhe4096 group prime of RFC 7919 - confirm
+ * the table against the RFC.
+ */
+BIGNUM *
+BN_get_rfc7919_prime_4096(BIGNUM *bn)
+{
+        return BN_bin2bn(RFC7919_PRIME_4096, sizeof(RFC7919_PRIME_4096), bn);
+}
+static const unsigned char RFC7919_PRIME_6144[] = {
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xAD, 0xF8, 0x54, 0x58,
+        0xA2, 0xBB, 0x4A, 0x9A, 0xAF, 0xDC, 0x56, 0x20, 0x27, 0x3D, 0x3C, 0xF1,
+        0xD8, 0xB9, 0xC5, 0x83, 0xCE, 0x2D, 0x36, 0x95, 0xA9, 0xE1, 0x36, 0x41,
+        0x14, 0x64, 0x33, 0xFB, 0xCC, 0x93, 0x9D, 0xCE, 0x24, 0x9B, 0x3E, 0xF9,
+        0x7D, 0x2F, 0xE3, 0x63, 0x63, 0x0C, 0x75, 0xD8, 0xF6, 0x81, 0xB2, 0x02,
+        0xAE, 0xC4, 0x61, 0x7A, 0xD3, 0xDF, 0x1E, 0xD5, 0xD5, 0xFD, 0x65, 0x61,
+        0x24, 0x33, 0xF5, 0x1F, 0x5F, 0x06, 0x6E, 0xD0, 0x85, 0x63, 0x65, 0x55,
+        0x3D, 0xED, 0x1A, 0xF3, 0xB5, 0x57, 0x13, 0x5E, 0x7F, 0x57, 0xC9, 0x35,
+        0x98, 0x4F, 0x0C, 0x70, 0xE0, 0xE6, 0x8B, 0x77, 0xE2, 0xA6, 0x89, 0xDA,
+        0xF3, 0xEF, 0xE8, 0x72, 0x1D, 0xF1, 0x58, 0xA1, 0x36, 0xAD, 0xE7, 0x35,
+        0x30, 0xAC, 0xCA, 0x4F, 0x48, 0x3A, 0x79, 0x7A, 0xBC, 0x0A, 0xB1, 0x82,
+        0xB3, 0x24, 0xFB, 0x61, 0xD1, 0x08, 0xA9, 0x4B, 0xB2, 0xC8, 0xE3, 0xFB,
+        0xB9, 0x6A, 0xDA, 0xB7, 0x60, 0xD7, 0xF4, 0x68, 0x1D, 0x4F, 0x42, 0xA3,
+        0xDE, 0x39, 0x4D, 0xF4, 0xAE, 0x56, 0xED, 0xE7, 0x63, 0x72, 0xBB, 0x19,
+        0x0B, 0x07, 0xA7, 0xC8, 0xEE, 0x0A, 0x6D, 0x70, 0x9E, 0x02, 0xFC, 0xE1,
+        0xCD, 0xF7, 0xE2, 0xEC, 0xC0, 0x34, 0x04, 0xCD, 0x28, 0x34, 0x2F, 0x61,
+        0x91, 0x72, 0xFE, 0x9C, 0xE9, 0x85, 0x83, 0xFF, 0x8E, 0x4F, 0x12, 0x32,
+        0xEE, 0xF2, 0x81, 0x83, 0xC3, 0xFE, 0x3B, 0x1B, 0x4C, 0x6F, 0xAD, 0x73,
+        0x3B, 0xB5, 0xFC, 0xBC, 0x2E, 0xC2, 0x20, 0x05, 0xC5, 0x8E, 0xF1, 0x83,
+        0x7D, 0x16, 0x83, 0xB2, 0xC6, 0xF3, 0x4A, 0x26, 0xC1, 0xB2, 0xEF, 0xFA,
+        0x88, 0x6B, 0x42, 0x38, 0x61, 0x1F, 0xCF, 0xDC, 0xDE, 0x35, 0x5B, 0x3B,
+        0x65, 0x19, 0x03, 0x5B, 0xBC, 0x34, 0xF4, 0xDE, 0xF9, 0x9C, 0x02, 0x38,
+        0x61, 0xB4, 0x6F, 0xC9, 0xD6, 0xE6, 0xC9, 0x07, 0x7A, 0xD9, 0x1D, 0x26,
+        0x91, 0xF7, 0xF7, 0xEE, 0x59, 0x8C, 0xB0, 0xFA, 0xC1, 0x86, 0xD9, 0x1C,
+        0xAE, 0xFE, 0x13, 0x09, 0x85, 0x13, 0x92, 0x70, 0xB4, 0x13, 0x0C, 0x93,
+        0xBC, 0x43, 0x79, 0x44, 0xF4, 0xFD, 0x44, 0x52, 0xE2, 0xD7, 0x4D, 0xD3,
+        0x64, 0xF2, 0xE2, 0x1E, 0x71, 0xF5, 0x4B, 0xFF, 0x5C, 0xAE, 0x82, 0xAB,
+        0x9C, 0x9D, 0xF6, 0x9E, 0xE8, 0x6D, 0x2B, 0xC5, 0x22, 0x36, 0x3A, 0x0D,
+        0xAB, 0xC5, 0x21, 0x97, 0x9B, 0x0D, 0xEA, 0xDA, 0x1D, 0xBF, 0x9A, 0x42,
+        0xD5, 0xC4, 0x48, 0x4E, 0x0A, 0xBC, 0xD0, 0x6B, 0xFA, 0x53, 0xDD, 0xEF,
+        0x3C, 0x1B, 0x20, 0xEE, 0x3F, 0xD5, 0x9D, 0x7C, 0x25, 0xE4, 0x1D, 0x2B,
+        0x66, 0x9E, 0x1E, 0xF1, 0x6E, 0x6F, 0x52, 0xC3, 0x16, 0x4D, 0xF4, 0xFB,
+        0x79, 0x30, 0xE9, 0xE4, 0xE5, 0x88, 0x57, 0xB6, 0xAC, 0x7D, 0x5F, 0x42,
+        0xD6, 0x9F, 0x6D, 0x18, 0x77, 0x63, 0xCF, 0x1D, 0x55, 0x03, 0x40, 0x04,
+        0x87, 0xF5, 0x5B, 0xA5, 0x7E, 0x31, 0xCC, 0x7A, 0x71, 0x35, 0xC8, 0x86,
+        0xEF, 0xB4, 0x31, 0x8A, 0xED, 0x6A, 0x1E, 0x01, 0x2D, 0x9E, 0x68, 0x32,
+        0xA9, 0x07, 0x60, 0x0A, 0x91, 0x81, 0x30, 0xC4, 0x6D, 0xC7, 0x78, 0xF9,
+        0x71, 0xAD, 0x00, 0x38, 0x09, 0x29, 0x99, 0xA3, 0x33, 0xCB, 0x8B, 0x7A,
+        0x1A, 0x1D, 0xB9, 0x3D, 0x71, 0x40, 0x00, 0x3C, 0x2A, 0x4E, 0xCE, 0xA9,
+        0xF9, 0x8D, 0x0A, 0xCC, 0x0A, 0x82, 0x91, 0xCD, 0xCE, 0xC9, 0x7D, 0xCF,
+        0x8E, 0xC9, 0xB5, 0x5A, 0x7F, 0x88, 0xA4, 0x6B, 0x4D, 0xB5, 0xA8, 0x51,
+        0xF4, 0x41, 0x82, 0xE1, 0xC6, 0x8A, 0x00, 0x7E, 0x5E, 0x0D, 0xD9, 0x02,
+        0x0B, 0xFD, 0x64, 0xB6, 0x45, 0x03, 0x6C, 0x7A, 0x4E, 0x67, 0x7D, 0x2C,
+        0x38, 0x53, 0x2A, 0x3A, 0x23, 0xBA, 0x44, 0x42, 0xCA, 0xF5, 0x3E, 0xA6,
+        0x3B, 0xB4, 0x54, 0x32, 0x9B, 0x76, 0x24, 0xC8, 0x91, 0x7B, 0xDD, 0x64,
+        0xB1, 0xC0, 0xFD, 0x4C, 0xB3, 0x8E, 0x8C, 0x33, 0x4C, 0x70, 0x1C, 0x3A,
+        0xCD, 0xAD, 0x06, 0x57, 0xFC, 0xCF, 0xEC, 0x71, 0x9B, 0x1F, 0x5C, 0x3E,
+        0x4E, 0x46, 0x04, 0x1F, 0x38, 0x81, 0x47, 0xFB, 0x4C, 0xFD, 0xB4, 0x77,
+        0xA5, 0x24, 0x71, 0xF7, 0xA9, 0xA9, 0x69, 0x10, 0xB8, 0x55, 0x32, 0x2E,
+        0xDB, 0x63, 0x40, 0xD8, 0xA0, 0x0E, 0xF0, 0x92, 0x35, 0x05, 0x11, 0xE3,
+        0x0A, 0xBE, 0xC1, 0xFF, 0xF9, 0xE3, 0xA2, 0x6E, 0x7F, 0xB2, 0x9F, 0x8C,
+        0x18, 0x30, 0x23, 0xC3, 0x58, 0x7E, 0x38, 0xDA, 0x00, 0x77, 0xD9, 0xB4,
+        0x76, 0x3E, 0x4E, 0x4B, 0x94, 0xB2, 0xBB, 0xC1, 0x94, 0xC6, 0x65, 0x1E,
+        0x77, 0xCA, 0xF9, 0x92, 0xEE, 0xAA, 0xC0, 0x23, 0x2A, 0x28, 0x1B, 0xF6,
+        0xB3, 0xA7, 0x39, 0xC1, 0x22, 0x61, 0x16, 0x82, 0x0A, 0xE8, 0xDB, 0x58,
+        0x47, 0xA6, 0x7C, 0xBE, 0xF9, 0xC9, 0x09, 0x1B, 0x46, 0x2D, 0x53, 0x8C,
+        0xD7, 0x2B, 0x03, 0x74, 0x6A, 0xE7, 0x7F, 0x5E, 0x62, 0x29, 0x2C, 0x31,
+        0x15, 0x62, 0xA8, 0x46, 0x50, 0x5D, 0xC8, 0x2D, 0xB8, 0x54, 0x33, 0x8A,
+        0xE4, 0x9F, 0x52, 0x35, 0xC9, 0x5B, 0x91, 0x17, 0x8C, 0xCF, 0x2D, 0xD5,
+        0xCA, 0xCE, 0xF4, 0x03, 0xEC, 0x9D, 0x18, 0x10, 0xC6, 0x27, 0x2B, 0x04,
+        0x5B, 0x3B, 0x71, 0xF9, 0xDC, 0x6B, 0x80, 0xD6, 0x3F, 0xDD, 0x4A, 0x8E,
+        0x9A, 0xDB, 0x1E, 0x69, 0x62, 0xA6, 0x95, 0x26, 0xD4, 0x31, 0x61, 0xC1,
+        0xA4, 0x1D, 0x57, 0x0D, 0x79, 0x38, 0xDA, 0xD4, 0xA4, 0x0E, 0x32, 0x9C,
+        0xD0, 0xE4, 0x0E, 0x65, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+};
+/*
+ * BN_get_rfc7919_prime_6144() passes the 768-byte (6144-bit)
+ * RFC7919_PRIME_6144 table above, its size and the caller's bn to BN_bin2bn()
+ * and returns the result unchanged. See BN_bin2bn(3) for how a NULL bn and
+ * failures are handled.
+ * NOTE(review): presumably the ffdhe6144 group prime of RFC 7919 - confirm
+ * the table against the RFC.
+ */
+BIGNUM *
+BN_get_rfc7919_prime_6144(BIGNUM *bn)
+{
+        return BN_bin2bn(RFC7919_PRIME_6144, sizeof(RFC7919_PRIME_6144), bn);
+}
+static const unsigned char RFC7919_PRIME_8192[] = {
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xAD, 0xF8, 0x54, 0x58,
+        0xA2, 0xBB, 0x4A, 0x9A, 0xAF, 0xDC, 0x56, 0x20, 0x27, 0x3D, 0x3C, 0xF1,
+        0xD8, 0xB9, 0xC5, 0x83, 0xCE, 0x2D, 0x36, 0x95, 0xA9, 0xE1, 0x36, 0x41,
+        0x14, 0x64, 0x33, 0xFB, 0xCC, 0x93, 0x9D, 0xCE, 0x24, 0x9B, 0x3E, 0xF9,
+        0x7D, 0x2F, 0xE3, 0x63, 0x63, 0x0C, 0x75, 0xD8, 0xF6, 0x81, 0xB2, 0x02,
+        0xAE, 0xC4, 0x61, 0x7A, 0xD3, 0xDF, 0x1E, 0xD5, 0xD5, 0xFD, 0x65, 0x61,
+        0x24, 0x33, 0xF5, 0x1F, 0x5F, 0x06, 0x6E, 0xD0, 0x85, 0x63, 0x65, 0x55,
+        0x3D, 0xED, 0x1A, 0xF3, 0xB5, 0x57, 0x13, 0x5E, 0x7F, 0x57, 0xC9, 0x35,
+        0x98, 0x4F, 0x0C, 0x70, 0xE0, 0xE6, 0x8B, 0x77, 0xE2, 0xA6, 0x89, 0xDA,
+        0xF3, 0xEF, 0xE8, 0x72, 0x1D, 0xF1, 0x58, 0xA1, 0x36, 0xAD, 0xE7, 0x35,
+        0x30, 0xAC, 0xCA, 0x4F, 0x48, 0x3A, 0x79, 0x7A, 0xBC, 0x0A, 0xB1, 0x82,
+        0xB3, 0x24, 0xFB, 0x61, 0xD1, 0x08, 0xA9, 0x4B, 0xB2, 0xC8, 0xE3, 0xFB,
+        0xB9, 0x6A, 0xDA, 0xB7, 0x60, 0xD7, 0xF4, 0x68, 0x1D, 0x4F, 0x42, 0xA3,
+        0xDE, 0x39, 0x4D, 0xF4, 0xAE, 0x56, 0xED, 0xE7, 0x63, 0x72, 0xBB, 0x19,
+        0x0B, 0x07, 0xA7, 0xC8, 0xEE, 0x0A, 0x6D, 0x70, 0x9E, 0x02, 0xFC, 0xE1,
+        0xCD, 0xF7, 0xE2, 0xEC, 0xC0, 0x34, 0x04, 0xCD, 0x28, 0x34, 0x2F, 0x61,
+        0x91, 0x72, 0xFE, 0x9C, 0xE9, 0x85, 0x83, 0xFF, 0x8E, 0x4F, 0x12, 0x32,
+        0xEE, 0xF2, 0x81, 0x83, 0xC3, 0xFE, 0x3B, 0x1B, 0x4C, 0x6F, 0xAD, 0x73,
+        0x3B, 0xB5, 0xFC, 0xBC, 0x2E, 0xC2, 0x20, 0x05, 0xC5, 0x8E, 0xF1, 0x83,
+        0x7D, 0x16, 0x83, 0xB2, 0xC6, 0xF3, 0x4A, 0x26, 0xC1, 0xB2, 0xEF, 0xFA,
+        0x88, 0x6B, 0x42, 0x38, 0x61, 0x1F, 0xCF, 0xDC, 0xDE, 0x35, 0x5B, 0x3B,
+        0x65, 0x19, 0x03, 0x5B, 0xBC, 0x34, 0xF4, 0xDE, 0xF9, 0x9C, 0x02, 0x38,
+        0x61, 0xB4, 0x6F, 0xC9, 0xD6, 0xE6, 0xC9, 0x07, 0x7A, 0xD9, 0x1D, 0x26,
+        0x91, 0xF7, 0xF7, 0xEE, 0x59, 0x8C, 0xB0, 0xFA, 0xC1, 0x86, 0xD9, 0x1C,
+        0xAE, 0xFE, 0x13, 0x09, 0x85, 0x13, 0x92, 0x70, 0xB4, 0x13, 0x0C, 0x93,
+        0xBC, 0x43, 0x79, 0x44, 0xF4, 0xFD, 0x44, 0x52, 0xE2, 0xD7, 0x4D, 0xD3,
+        0x64, 0xF2, 0xE2, 0x1E, 0x71, 0xF5, 0x4B, 0xFF, 0x5C, 0xAE, 0x82, 0xAB,
+        0x9C, 0x9D, 0xF6, 0x9E, 0xE8, 0x6D, 0x2B, 0xC5, 0x22, 0x36, 0x3A, 0x0D,
+        0xAB, 0xC5, 0x21, 0x97, 0x9B, 0x0D, 0xEA, 0xDA, 0x1D, 0xBF, 0x9A, 0x42,
+        0xD5, 0xC4, 0x48, 0x4E, 0x0A, 0xBC, 0xD0, 0x6B, 0xFA, 0x53, 0xDD, 0xEF,
+        0x3C, 0x1B, 0x20, 0xEE, 0x3F, 0xD5, 0x9D, 0x7C, 0x25, 0xE4, 0x1D, 0x2B,
+        0x66, 0x9E, 0x1E, 0xF1, 0x6E, 0x6F, 0x52, 0xC3, 0x16, 0x4D, 0xF4, 0xFB,
+        0x79, 0x30, 0xE9, 0xE4, 0xE5, 0x88, 0x57, 0xB6, 0xAC, 0x7D, 0x5F, 0x42,
+        0xD6, 0x9F, 0x6D, 0x18, 0x77, 0x63, 0xCF, 0x1D, 0x55, 0x03, 0x40, 0x04,
+        0x87, 0xF5, 0x5B, 0xA5, 0x7E, 0x31, 0xCC, 0x7A, 0x71, 0x35, 0xC8, 0x86,
+        0xEF, 0xB4, 0x31, 0x8A, 0xED, 0x6A, 0x1E, 0x01, 0x2D, 0x9E, 0x68, 0x32,
+        0xA9, 0x07, 0x60, 0x0A, 0x91, 0x81, 0x30, 0xC4, 0x6D, 0xC7, 0x78, 0xF9,
+        0x71, 0xAD, 0x00, 0x38, 0x09, 0x29, 0x99, 0xA3, 0x33, 0xCB, 0x8B, 0x7A,
+        0x1A, 0x1D, 0xB9, 0x3D, 0x71, 0x40, 0x00, 0x3C, 0x2A, 0x4E, 0xCE, 0xA9,
+        0xF9, 0x8D, 0x0A, 0xCC, 0x0A, 0x82, 0x91, 0xCD, 0xCE, 0xC9, 0x7D, 0xCF,
+        0x8E, 0xC9, 0xB5, 0x5A, 0x7F, 0x88, 0xA4, 0x6B, 0x4D, 0xB5, 0xA8, 0x51,
+        0xF4, 0x41, 0x82, 0xE1, 0xC6, 0x8A, 0x00, 0x7E, 0x5E, 0x0D, 0xD9, 0x02,
+        0x0B, 0xFD, 0x64, 0xB6, 0x45, 0x03, 0x6C, 0x7A, 0x4E, 0x67, 0x7D, 0x2C,
+        0x38, 0x53, 0x2A, 0x3A, 0x23, 0xBA, 0x44, 0x42, 0xCA, 0xF5, 0x3E, 0xA6,
+        0x3B, 0xB4, 0x54, 0x32, 0x9B, 0x76, 0x24, 0xC8, 0x91, 0x7B, 0xDD, 0x64,
+        0xB1, 0xC0, 0xFD, 0x4C, 0xB3, 0x8E, 0x8C, 0x33, 0x4C, 0x70, 0x1C, 0x3A,
+        0xCD, 0xAD, 0x06, 0x57, 0xFC, 0xCF, 0xEC, 0x71, 0x9B, 0x1F, 0x5C, 0x3E,
+        0x4E, 0x46, 0x04, 0x1F, 0x38, 0x81, 0x47, 0xFB, 0x4C, 0xFD, 0xB4, 0x77,
+        0xA5, 0x24, 0x71, 0xF7, 0xA9, 0xA9, 0x69, 0x10, 0xB8, 0x55, 0x32, 0x2E,
+        0xDB, 0x63, 0x40, 0xD8, 0xA0, 0x0E, 0xF0, 0x92, 0x35, 0x05, 0x11, 0xE3,
+        0x0A, 0xBE, 0xC1, 0xFF, 0xF9, 0xE3, 0xA2, 0x6E, 0x7F, 0xB2, 0x9F, 0x8C,
+        0x18, 0x30, 0x23, 0xC3, 0x58, 0x7E, 0x38, 0xDA, 0x00, 0x77, 0xD9, 0xB4,
+        0x76, 0x3E, 0x4E, 0x4B, 0x94, 0xB2, 0xBB, 0xC1, 0x94, 0xC6, 0x65, 0x1E,
+        0x77, 0xCA, 0xF9, 0x92, 0xEE, 0xAA, 0xC0, 0x23, 0x2A, 0x28, 0x1B, 0xF6,
+        0xB3, 0xA7, 0x39, 0xC1, 0x22, 0x61, 0x16, 0x82, 0x0A, 0xE8, 0xDB, 0x58,
+        0x47, 0xA6, 0x7C, 0xBE, 0xF9, 0xC9, 0x09, 0x1B, 0x46, 0x2D, 0x53, 0x8C,
+        0xD7, 0x2B, 0x03, 0x74, 0x6A, 0xE7, 0x7F, 0x5E, 0x62, 0x29, 0x2C, 0x31,
+        0x15, 0x62, 0xA8, 0x46, 0x50, 0x5D, 0xC8, 0x2D, 0xB8, 0x54, 0x33, 0x8A,
+        0xE4, 0x9F, 0x52, 0x35, 0xC9, 0x5B, 0x91, 0x17, 0x8C, 0xCF, 0x2D, 0xD5,
+        0xCA, 0xCE, 0xF4, 0x03, 0xEC, 0x9D, 0x18, 0x10, 0xC6, 0x27, 0x2B, 0x04,
+        0x5B, 0x3B, 0x71, 0xF9, 0xDC, 0x6B, 0x80, 0xD6, 0x3F, 0xDD, 0x4A, 0x8E,
+        0x9A, 0xDB, 0x1E, 0x69, 0x62, 0xA6, 0x95, 0x26, 0xD4, 0x31, 0x61, 0xC1,
+        0xA4, 0x1D, 0x57, 0x0D, 0x79, 0x38, 0xDA, 0xD4, 0xA4, 0x0E, 0x32, 0x9C,
+        0xCF, 0xF4, 0x6A, 0xAA, 0x36, 0xAD, 0x00, 0x4C, 0xF6, 0x00, 0xC8, 0x38,
+        0x1E, 0x42, 0x5A, 0x31, 0xD9, 0x51, 0xAE, 0x64, 0xFD, 0xB2, 0x3F, 0xCE,
+        0xC9, 0x50, 0x9D, 0x43, 0x68, 0x7F, 0xEB, 0x69, 0xED, 0xD1, 0xCC, 0x5E,
+        0x0B, 0x8C, 0xC3, 0xBD, 0xF6, 0x4B, 0x10, 0xEF, 0x86, 0xB6, 0x31, 0x42,
+        0xA3, 0xAB, 0x88, 0x29, 0x55, 0x5B, 0x2F, 0x74, 0x7C, 0x93, 0x26, 0x65,
+        0xCB, 0x2C, 0x0F, 0x1C, 0xC0, 0x1B, 0xD7, 0x02, 0x29, 0x38, 0x88, 0x39,
+        0xD2, 0xAF, 0x05, 0xE4, 0x54, 0x50, 0x4A, 0xC7, 0x8B, 0x75, 0x82, 0x82,
+        0x28, 0x46, 0xC0, 0xBA, 0x35, 0xC3, 0x5F, 0x5C, 0x59, 0x16, 0x0C, 0xC0,
+        0x46, 0xFD, 0x82, 0x51, 0x54, 0x1F, 0xC6, 0x8C, 0x9C, 0x86, 0xB0, 0x22,
+        0xBB, 0x70, 0x99, 0x87, 0x6A, 0x46, 0x0E, 0x74, 0x51, 0xA8, 0xA9, 0x31,
+        0x09, 0x70, 0x3F, 0xEE, 0x1C, 0x21, 0x7E, 0x6C, 0x38, 0x26, 0xE5, 0x2C,
+        0x51, 0xAA, 0x69, 0x1E, 0x0E, 0x42, 0x3C, 0xFC, 0x99, 0xE9, 0xE3, 0x16,
+        0x50, 0xC1, 0x21, 0x7B, 0x62, 0x48, 0x16, 0xCD, 0xAD, 0x9A, 0x95, 0xF9,
+        0xD5, 0xB8, 0x01, 0x94, 0x88, 0xD9, 0xC0, 0xA0, 0xA1, 0xFE, 0x30, 0x75,
+        0xA5, 0x77, 0xE2, 0x31, 0x83, 0xF8, 0x1D, 0x4A, 0x3F, 0x2F, 0xA4, 0x57,
+        0x1E, 0xFC, 0x8C, 0xE0, 0xBA, 0x8A, 0x4F, 0xE8, 0xB6, 0x85, 0x5D, 0xFE,
+        0x72, 0xB0, 0xA6, 0x6E, 0xDE, 0xD2, 0xFB, 0xAB, 0xFB, 0xE5, 0x8A, 0x30,
+        0xFA, 0xFA, 0xBE, 0x1C, 0x5D, 0x71, 0xA8, 0x7E, 0x2F, 0x74, 0x1E, 0xF8,
+        0xC1, 0xFE, 0x86, 0xFE, 0xA6, 0xBB, 0xFD, 0xE5, 0x30, 0x67, 0x7F, 0x0D,
+        0x97, 0xD1, 0x1D, 0x49, 0xF7, 0xA8, 0x44, 0x3D, 0x08, 0x22, 0xE5, 0x06,
+        0xA9, 0xF4, 0x61, 0x4E, 0x01, 0x1E, 0x2A, 0x94, 0x83, 0x8F, 0xF8, 0x8C,
+        0xD6, 0x8C, 0x8B, 0xB7, 0xC5, 0xC6, 0x42, 0x4C, 0xFF, 0xFF, 0xFF, 0xFF,
+        0xFF, 0xFF, 0xFF, 0xFF,
+};
+/*
+ * BN_get_rfc7919_prime_8192() passes the 1024-byte (8192-bit)
+ * RFC7919_PRIME_8192 table above, its size and the caller's bn to BN_bin2bn()
+ * and returns the result unchanged. See BN_bin2bn(3) for how a NULL bn and
+ * failures are handled.
+ * NOTE(review): presumably the ffdhe8192 group prime of RFC 7919 - confirm
+ * the table against the RFC.
+ */
+BIGNUM *
+BN_get_rfc7919_prime_8192(BIGNUM *bn)
+{
+        return BN_bin2bn(RFC7919_PRIME_8192, sizeof(RFC7919_PRIME_8192), bn);
+}
diff --git a/src/lib/libcrypto/bn/bn_convert.c b/src/lib/libcrypto/bn/bn_convert.c
index 6a6354f44e..ab5bc519c8 100644
--- a/src/lib/libcrypto/bn/bn_convert.c
+++ b/src/lib/libcrypto/bn/bn_convert.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: bn_convert.c,v 1.23 2024/11/08 14:18:44 jsing Exp $ */
+/* $OpenBSD: bn_convert.c,v 1.25 2025/12/05 14:12:32 tb Exp $ */
 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
 * All rights reserved.
 *
@@ -65,11 +65,19 @@
 #include <openssl/bio.h>
 #include <openssl/buffer.h>
-#include <openssl/err.h>
 #include "bn_local.h"
 #include "bytestring.h"
 #include "crypto_internal.h"
+#include "err_local.h"
+/*
+ * BN_DEC_CONV is ten to the power BN_DEC_NUM: 10^19 when BN_BYTES is 8 and
+ * 10^9 otherwise. These are the largest powers of ten below 2^64 and 2^32
+ * respectively.
+ * NOTE(review): presumably the decimal conversion code handles BN_DEC_NUM
+ * digits per word using these; the users are outside this hunk - confirm.
+ */
+#if BN_BYTES == 8
+#define BN_DEC_CONV     UINT64_C(10000000000000000000)
+#define BN_DEC_NUM      19
+#else
+#define BN_DEC_CONV     UINT32_C(1000000000)
+#define BN_DEC_NUM      9
+#endif
 static int bn_dec2bn_cbs(BIGNUM **bnp, CBS *cbs);
 static int bn_hex2bn_cbs(BIGNUM **bnp, CBS *cbs);
diff --git a/src/lib/libcrypto/bn/bn_ctx.c b/src/lib/libcrypto/bn/bn_ctx.c
index 129b9c9781..eda93dcaa4 100644
--- a/src/lib/libcrypto/bn/bn_ctx.c
+++ b/src/lib/libcrypto/bn/bn_ctx.c
@@ -1,4 +1,4 @@
-/*      $OpenBSD: bn_ctx.c,v 1.22 2023/07/08 12:21:58 beck Exp $ */
+/*      $OpenBSD: bn_ctx.c,v 1.23 2025/05/10 05:54:38 tb Exp $ */
 /*
 * Copyright (c) 2023 Joel Sing <jsing@openbsd.org>
 *
@@ -19,9 +19,9 @@
 #include <string.h>
 #include <openssl/opensslconf.h>
-#include <openssl/err.h>
 #include "bn_local.h"
+#include "err_local.h"
 #define BN_CTX_INITIAL_LEN      8
diff --git a/src/lib/libcrypto/bn/bn_div.c b/src/lib/libcrypto/bn/bn_div.c
index 09a8a364df..0a914db752 100644
--- a/src/lib/libcrypto/bn/bn_div.c
+++ b/src/lib/libcrypto/bn/bn_div.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: bn_div.c,v 1.41 2024/04/10 14:58:06 beck Exp $ */
+/* $OpenBSD: bn_div.c,v 1.44 2025/09/07 06:28:03 jsing Exp $ */
 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
 * All rights reserved.
 *
@@ -62,25 +62,15 @@
 #include <openssl/opensslconf.h>
 #include <openssl/bn.h>
-#include <openssl/err.h>
 #include "bn_arch.h"
 #include "bn_local.h"
 #include "bn_internal.h"
+#include "err_local.h"
 BN_ULONG bn_div_3_words(const BN_ULONG *m, BN_ULONG d1, BN_ULONG d0);
 #ifndef HAVE_BN_DIV_WORDS
-#if defined(BN_LLONG) && defined(BN_DIV2W)
-BN_ULONG
-bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
-{
-        return ((BN_ULONG)(((((BN_ULLONG)h) << BN_BITS2)|l)/(BN_ULLONG)d));
-}
-#else
 /* Divide h,l by d and return the result. */
 /* I need to test this some more :-( */
 BN_ULONG
@@ -148,7 +138,6 @@ bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
        ret |= q;
        return (ret);
 }
-#endif /* !defined(BN_LLONG) && defined(BN_DIV2W) */
 #endif
 /*
@@ -375,7 +364,7 @@ BN_div_internal(BIGNUM *quotient, BIGNUM *remainder, const BIGNUM *numerator,
                 *  | wnum - sdiv * q | < sdiv
                 */
                q = bn_div_3_words(wnump, d1, d0);
-                l0 = bn_mul_words(tmp->d, sdiv->d, div_n, q);
+                l0 = bn_mulw_words(tmp->d, sdiv->d, div_n, q);
                tmp->d[div_n] = l0;
                wnum.d--;
diff --git a/src/lib/libcrypto/bn/bn_exp.c b/src/lib/libcrypto/bn/bn_exp.c
index e925d325d2..6a5c1c857a 100644
--- a/src/lib/libcrypto/bn/bn_exp.c
+++ b/src/lib/libcrypto/bn/bn_exp.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: bn_exp.c,v 1.58 2025/02/13 11:15:09 tb Exp $ */
+/* $OpenBSD: bn_exp.c,v 1.59 2025/05/10 05:54:38 tb Exp $ */
 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
 * All rights reserved.
 *
@@ -112,10 +112,9 @@
 #include <stdlib.h>
 #include <string.h>
-#include <openssl/err.h>
 #include "bn_local.h"
 #include "constant_time.h"
+#include "err_local.h"
 /* maximum precomputation table size for *variable* sliding windows */
 #define TABLE_SIZE      32
diff --git a/src/lib/libcrypto/bn/bn_gcd.c b/src/lib/libcrypto/bn/bn_gcd.c
index fa5d71a7f3..319d9ca390 100644
--- a/src/lib/libcrypto/bn/bn_gcd.c
+++ b/src/lib/libcrypto/bn/bn_gcd.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: bn_gcd.c,v 1.29 2024/04/10 14:58:06 beck Exp $ */
+/* $OpenBSD: bn_gcd.c,v 1.31 2025/06/02 12:40:10 tb Exp $ */
 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
 * All rights reserved.
 *
@@ -109,9 +109,8 @@
 *
 */
-#include <openssl/err.h>
 #include "bn_local.h"
+#include "err_local.h"
 static BIGNUM *
 euclid(BIGNUM *a, BIGNUM *b)
@@ -681,8 +680,10 @@ BN_mod_inverse_internal(BIGNUM *in, const BIGNUM *a, const BIGNUM *n, BN_CTX *ct
                                        /* A >= 2*B, so D=2 or D=3 */
                                        if (!BN_sub(M, A, T))
                                                goto err;
-                                        if (!BN_add(D,T,B)) goto err; /* use D (:= 3*B) as temp */
+                                        /* use D (:= 3*B) as temp */
-                                                if (BN_ucmp(A, D) < 0) {
+                                        if (!BN_add(D, T, B))
+                                                goto err;
+                                        if (BN_ucmp(A, D) < 0) {
                                                /* A < 3*B, so D=2 */
                                                if (!BN_set_word(D, 2))
                                                        goto err;
diff --git a/src/lib/libcrypto/bn/bn_internal.h b/src/lib/libcrypto/bn/bn_internal.h
index fd04bc9f8a..efe8202aa0 100644
--- a/src/lib/libcrypto/bn/bn_internal.h
+++ b/src/lib/libcrypto/bn/bn_internal.h
@@ -1,4 +1,4 @@
-/*      $OpenBSD: bn_internal.h,v 1.15 2023/06/25 11:42:26 jsing Exp $ */
+/*      $OpenBSD: bn_internal.h,v 1.21 2025/12/05 14:12:32 tb Exp $ */
 /*
 * Copyright (c) 2023 Joel Sing <jsing@openbsd.org>
 *
@@ -18,6 +18,7 @@
 #include <openssl/bn.h>
 #include "bn_arch.h"
+#include "bn_local.h"
 #ifndef HEADER_BN_INTERNAL_H
 #define HEADER_BN_INTERNAL_H
@@ -26,6 +27,30 @@ int bn_word_clz(BN_ULONG w);
 int bn_bitsize(const BIGNUM *bn);
+BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
+    int num);
+BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
+    int num);
+BN_ULONG bn_sub_words_borrow(const BN_ULONG *a, const BN_ULONG *b, size_t n);
+BN_ULONG bn_add_words_masked(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
+    BN_ULONG mask, size_t n);
+BN_ULONG bn_sub_words_masked(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
+    BN_ULONG mask, size_t n);
+void bn_mod_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
+    const BN_ULONG *m, size_t n);
+void bn_mod_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
+    const BN_ULONG *m, size_t n);
+void bn_mod_mul_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
+    const BN_ULONG *m, BN_ULONG *t, BN_ULONG m0, size_t n);
+void bn_mod_sqr_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *m,
+    BN_ULONG *t, BN_ULONG m0, size_t n);
+void bn_montgomery_multiply_words(BN_ULONG *rp, const BN_ULONG *ap,
+    const BN_ULONG *bp, const BN_ULONG *np, BN_ULONG *tp, BN_ULONG n0,
+    int n_len);
+void bn_montgomery_reduce_words(BN_ULONG *r, BN_ULONG *a, const BN_ULONG *n,
+    BN_ULONG n0, int n_len);
 #ifndef HAVE_BN_CT_NE_ZERO
 static inline int
 bn_ct_ne_zero(BN_ULONG w)
diff --git a/src/lib/libcrypto/bn/bn_isqrt.c b/src/lib/libcrypto/bn/bn_isqrt.c
index 018d5f34bd..b725519e1a 100644
--- a/src/lib/libcrypto/bn/bn_isqrt.c
+++ b/src/lib/libcrypto/bn/bn_isqrt.c
@@ -1,4 +1,4 @@
-/*      $OpenBSD: bn_isqrt.c,v 1.10 2023/06/04 17:28:35 tb Exp $ */
+/*      $OpenBSD: bn_isqrt.c,v 1.11 2025/05/10 05:54:38 tb Exp $ */
 /*
 * Copyright (c) 2022 Theo Buehler <tb@openbsd.org>
 *
@@ -19,10 +19,10 @@
 #include <stdint.h>
 #include <openssl/bn.h>
-#include <openssl/err.h>
 #include "bn_local.h"
 #include "crypto_internal.h"
+#include "err_local.h"
 /*
 * Calculate integer square root of |n| using a variant of Newton's method.
diff --git a/src/lib/libcrypto/bn/bn_lib.c b/src/lib/libcrypto/bn/bn_lib.c
index 72b988650c..0326e72c4d 100644
--- a/src/lib/libcrypto/bn/bn_lib.c
+++ b/src/lib/libcrypto/bn/bn_lib.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: bn_lib.c,v 1.93 2024/04/16 13:07:14 jsing Exp $ */
+/* $OpenBSD: bn_lib.c,v 1.95 2025/12/15 12:09:46 tb Exp $ */
 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
 * All rights reserved.
 *
@@ -63,10 +63,9 @@
 #include <openssl/opensslconf.h>
-#include <openssl/err.h>
 #include "bn_local.h"
 #include "bn_internal.h"
+#include "err_local.h"
 BIGNUM *
 BN_new(void)
@@ -350,7 +349,7 @@ BN_ULONG
 BN_get_word(const BIGNUM *a)
 {
        if (a->top > 1)
-                return BN_MASK2;
+                return (BN_ULONG)-1;
        else if (a->top == 1)
                return a->d[0];
        /* a->top == 0 */
diff --git a/src/lib/libcrypto/bn/bn_local.h b/src/lib/libcrypto/bn/bn_local.h
index 067ffab3d9..2f5b58a548 100644
--- a/src/lib/libcrypto/bn/bn_local.h
+++ b/src/lib/libcrypto/bn/bn_local.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: bn_local.h,v 1.50 2025/02/13 11:04:20 tb Exp $ */
+/* $OpenBSD: bn_local.h,v 1.62 2026/01/23 08:29:04 tb Exp $ */
 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
 * All rights reserved.
 *
@@ -116,6 +116,20 @@
 #include <openssl/bn.h>
+/*
+ * Word-size dependent constants, selected on BN_BYTES (64-bit values when
+ * BN_BYTES is 8, 32-bit values otherwise):
+ *   BN_MASK2   every bit of the word set
+ *   BN_MASK2l  only the lower half of the bits set
+ *   BN_MASK2h  only the upper half of the bits set
+ *   BN_BITS    16 * BN_BYTES in both branches (128 or 64)
+ *   BN_BITS4   4 * BN_BYTES in both branches (32 or 16)
+ */
+#if BN_BYTES == 8
+#define BN_MASK2        UINT64_C(0xffffffffffffffff)
+#define BN_MASK2l       UINT64_C(0xffffffff)
+#define BN_MASK2h       UINT64_C(0xffffffff00000000)
+#define BN_BITS         128
+#define BN_BITS4        32
+#else
+#define BN_MASK2        UINT32_C(0xffffffff)
+#define BN_MASK2l       UINT32_C(0xffff)
+#define BN_MASK2h       UINT32_C(0xffff0000)
+#define BN_BITS         64
+#define BN_BITS4        16
+#endif
 __BEGIN_HIDDEN_DECLS
 struct bignum_st {
@@ -239,12 +253,16 @@ BN_ULONG bn_add(BN_ULONG *r, int r_len, const BN_ULONG *a, int a_len,
 BN_ULONG bn_sub(BN_ULONG *r, int r_len, const BN_ULONG *a, int a_len,
    const BN_ULONG *b, int b_len);
-void bn_mul_normal(BN_ULONG *r, BN_ULONG *a, int na, BN_ULONG *b, int nb);
+void bn_mul_comba4(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b);
-void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b);
+void bn_mul_comba6(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b);
-void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b);
+void bn_mul_comba8(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b);
+void bn_mul_words(BN_ULONG *r, const BN_ULONG *a, int a_len, const BN_ULONG *b,
+    int b_len);
 void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a);
+void bn_sqr_comba6(BN_ULONG *r, const BN_ULONG *a);
 void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a);
+void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int a_len);
 int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
    const BN_ULONG *np, const BN_ULONG *n0, int num);
@@ -254,13 +272,8 @@ int bn_expand_bits(BIGNUM *a, size_t bits);
 int bn_expand_bytes(BIGNUM *a, size_t bytes);
 int bn_wexpand(BIGNUM *a, int words);
-BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+BN_ULONG bn_mulw_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w);
-    int num);
+BN_ULONG bn_mulw_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w);
-BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
-    int num);
-BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w);
-BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w);
-void     bn_sqr_words(BN_ULONG *rp, const BN_ULONG *ap, int num);
 BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d);
 void bn_div_rem_words(BN_ULONG h, BN_ULONG l, BN_ULONG d, BN_ULONG *out_q,
    BN_ULONG *out_r);
@@ -331,5 +344,11 @@ int bn_printf(BIO *bio, const BIGNUM *bn, int indent, const char *fmt, ...)
 int bn_bn2hex_nosign(const BIGNUM *bn, char **out, size_t *out_len);
 int bn_bn2hex_nibbles(const BIGNUM *bn, char **out, size_t *out_len);
+BIGNUM *BN_get_rfc7919_prime_2048(BIGNUM *bn);
+BIGNUM *BN_get_rfc7919_prime_3072(BIGNUM *bn);
+BIGNUM *BN_get_rfc7919_prime_4096(BIGNUM *bn);
+BIGNUM *BN_get_rfc7919_prime_6144(BIGNUM *bn);
+BIGNUM *BN_get_rfc7919_prime_8192(BIGNUM *bn);
 __END_HIDDEN_DECLS
 #endif /* !HEADER_BN_LOCAL_H */
diff --git a/src/lib/libcrypto/bn/bn_mod.c b/src/lib/libcrypto/bn/bn_mod.c
index 365f6fcf03..7198c02e3b 100644
--- a/src/lib/libcrypto/bn/bn_mod.c
+++ b/src/lib/libcrypto/bn/bn_mod.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: bn_mod.c,v 1.22 2023/07/08 12:21:58 beck Exp $ */
+/* $OpenBSD: bn_mod.c,v 1.23 2025/05/10 05:54:38 tb Exp $ */
 /* Includes code written by Lenka Fibikova <fibikova@exp-math.uni-essen.de>
 * for the OpenSSL project. */
 /* ====================================================================
@@ -111,9 +111,8 @@
 * [including the GNU Public Licence.]
 */
-#include <openssl/err.h>
 #include "bn_local.h"
+#include "err_local.h"
 int
 BN_mod_ct(BIGNUM *r, const BIGNUM *a, const BIGNUM *m, BN_CTX *ctx)
diff --git a/src/lib/libcrypto/bn/bn_mod_sqrt.c b/src/lib/libcrypto/bn/bn_mod_sqrt.c
index 280002cc48..fc55f84317 100644
--- a/src/lib/libcrypto/bn/bn_mod_sqrt.c
+++ b/src/lib/libcrypto/bn/bn_mod_sqrt.c
@@ -1,4 +1,4 @@
-/*      $OpenBSD: bn_mod_sqrt.c,v 1.3 2023/08/03 18:53:55 tb Exp $ */
+/*      $OpenBSD: bn_mod_sqrt.c,v 1.4 2025/05/10 05:54:38 tb Exp $ */
 /*
 * Copyright (c) 2022 Theo Buehler <tb@openbsd.org>
@@ -16,9 +16,8 @@
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
-#include <openssl/err.h>
 #include "bn_local.h"
+#include "err_local.h"
 /*
 * Tonelli-Shanks according to H. Cohen "A Course in Computational Algebraic
diff --git a/src/lib/libcrypto/bn/bn_mod_words.c b/src/lib/libcrypto/bn/bn_mod_words.c
new file mode 100644
index 0000000000..f368e074db
--- /dev/null
+++ b/src/lib/libcrypto/bn/bn_mod_words.c
@@ -0,0 +1,110 @@
+/*      $OpenBSD: bn_mod_words.c,v 1.7 2025/09/07 05:21:29 jsing Exp $  */
+/*
+ * Copyright (c) 2024 Joel Sing <jsing@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+#include "bn_local.h"
+#include "bn_internal.h"
+/*
+ * bn_mod_add_words() computes r[] = (a[] + b[]) mod m[], where a, b, r and
+ * m are arrays of words with length n (r may be the same as a or b).
+ *
+ * Only a single conditional subtraction of m is performed, so the inputs are
+ * presumably expected to already be reduced modulo m - confirm with callers.
+ * A machine specific implementation can replace this one by defining
+ * HAVE_BN_MOD_ADD_WORDS.
+ */
+#ifndef HAVE_BN_MOD_ADD_WORDS
+void
+bn_mod_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
+    const BN_ULONG *m, size_t n)
+{
+        BN_ULONG carry, mask;
+        /*
+         * Compute a + b, then compute r - m to determine if r >= m, considering
+         * any carry that resulted from the addition. Finally complete a
+         * conditional subtraction of r - m.
+         */
+        /* XXX - change bn_add_words to use size_t. */
+        carry = bn_add_words(r, a, b, n);
+        /*
+         * Assuming carry and the returned borrow are each 0 or 1: the mask is
+         * zero (leave r untouched) when there was no carry and r - m borrowed,
+         * and all ones (subtract m) when carry equals borrow. A carry without
+         * a borrow would produce a mask that is neither; presumably that cannot
+         * occur for reduced inputs - confirm.
+         */
+        mask = ~(carry - bn_sub_words_borrow(r, m, n));
+        bn_sub_words_masked(r, r, m, mask, n);
+}
+#endif
+/*
+ * bn_mod_sub_words() computes r[] = (a[] - b[]) mod m[], where a, b, r and
+ * m are arrays of words with length n (r may be the same as a or b).
+ *
+ * Only a single conditional addition of m is performed, so the inputs are
+ * presumably expected to already be reduced modulo m - confirm with callers.
+ * A machine specific implementation can replace this one by defining
+ * HAVE_BN_MOD_SUB_WORDS.
+ */
+#ifndef HAVE_BN_MOD_SUB_WORDS
+void
+bn_mod_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
+    const BN_ULONG *m, size_t n)
+{
+        BN_ULONG borrow, mask;
+        /*
+         * Compute a - b, then complete a conditional addition of r + m
+         * based on the resulting borrow.
+         */
+        /* XXX - change bn_sub_words to use size_t. */
+        borrow = bn_sub_words(r, a, b, n);
+        /*
+         * Assuming borrow is 0 or 1, this turns it into a mask of all zeros
+         * (a >= b, nothing to add) or all ones (a < b, add m back).
+         */
+        mask = (0 - borrow);
+        bn_add_words_masked(r, r, m, mask, n);
+}
+#endif
+/*
+ * bn_mod_mul_words() computes r[] = (a[] * b[]) mod m[], where a, b, r and
+ * m are arrays of words with length n (r may be the same as a or b) in the
+ * Montgomery domain. The result remains in the Montgomery domain.
+ *
+ * t is scratch space that receives the full product before it is reduced into
+ * r; since bn_montgomery_reduce_words() requires its input to be twice the
+ * length of the modulus, t must hold at least 2 * n words. m0 is handed to
+ * bn_montgomery_reduce_words() as its n0 argument. A machine specific
+ * implementation can replace this one by defining HAVE_BN_MOD_MUL_WORDS.
+ */
+#ifndef HAVE_BN_MOD_MUL_WORDS
+void
+bn_mod_mul_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
+    const BN_ULONG *m, BN_ULONG *t, BN_ULONG m0, size_t n)
+{
+        /* Use the fixed size Comba routines for 4, 6 and 8 word inputs. */
+        if (n == 4) {
+                bn_mul_comba4(t, a, b);
+        } else if (n == 6) {
+                bn_mul_comba6(t, a, b);
+        } else if (n == 8) {
+                bn_mul_comba8(t, a, b);
+        } else {
+                /* XXX - bn_mul_words() takes int lengths, n is a size_t. */
+                bn_mul_words(t, a, n, b, n);
+        }
+        /* Reduce the double width product in t (which is mutated) into r. */
+        bn_montgomery_reduce_words(r, t, m, m0, n);
+}
+#endif
+/*
+ * bn_mod_sqr_words() computes r[] = (a[] * a[]) mod m[], where a, r and
+ * m are arrays of words with length n (r may be the same as a) in the
+ * Montgomery domain. The result remains in the Montgomery domain.
+ *
+ * t is scratch space that receives the full square before it is reduced into
+ * r; since bn_montgomery_reduce_words() requires its input to be twice the
+ * length of the modulus, t must hold at least 2 * n words. m0 is handed to
+ * bn_montgomery_reduce_words() as its n0 argument. A machine specific
+ * implementation can replace this one by defining HAVE_BN_MOD_SQR_WORDS.
+ */
+#ifndef HAVE_BN_MOD_SQR_WORDS
+void
+bn_mod_sqr_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *m,
+    BN_ULONG *t, BN_ULONG m0, size_t n)
+{
+        /* Use the fixed size Comba routines for 4, 6 and 8 word inputs. */
+        if (n == 4) {
+                bn_sqr_comba4(t, a);
+        } else if (n == 6) {
+                bn_sqr_comba6(t, a);
+        } else if (n == 8) {
+                bn_sqr_comba8(t, a);
+        } else {
+                /* XXX - bn_sqr_words() takes an int length, n is a size_t. */
+                bn_sqr_words(t, a, n);
+        }
+        /* Reduce the double width square in t (which is mutated) into r. */
+        bn_montgomery_reduce_words(r, t, m, m0, n);
+}
+#endif
diff --git a/src/lib/libcrypto/bn/bn_mont.c b/src/lib/libcrypto/bn/bn_mont.c
index edd7bcd0c8..c9e95fb08b 100644
--- a/src/lib/libcrypto/bn/bn_mont.c
+++ b/src/lib/libcrypto/bn/bn_mont.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: bn_mont.c,v 1.66 2025/03/09 15:22:40 tb Exp $ */
+/* $OpenBSD: bn_mont.c,v 1.70 2025/08/30 07:54:27 jsing Exp $ */
 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
 * All rights reserved.
 *
@@ -116,6 +116,7 @@
 * sections 3.8 and 4.2 in http://security.ece.orst.edu/koc/papers/r01rsasw.pdf
 */
+#include <limits.h>
 #include <stdio.h>
 #include <stdint.h>
 #include <string.h>
@@ -214,7 +215,7 @@ BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx)
                 goto err;
        mont->N.neg = 0;
        mont->ri = ((BN_num_bits(mod) + BN_BITS2 - 1) / BN_BITS2) * BN_BITS2;
-        if (mont->ri * 2 < mont->ri)
+        if (mont->ri > INT_MAX / 2)
                goto err;
        /*
@@ -316,6 +317,44 @@ BN_MONT_CTX_set_locked(BN_MONT_CTX **pmctx, int lock, const BIGNUM *mod,
 LCRYPTO_ALIAS(BN_MONT_CTX_set_locked);
 /*
+ * bn_montgomery_reduce_words() performs Montgomery reduction, reducing the input
+ * from its Montgomery form aR to a, returning the result in r. a must be twice
+ * the length of the modulus. Note that the input is mutated in the process of
+ * performing the reduction.
+ *
+ * r receives n_len words, a is read and written across 2 * n_len words and n
+ * is the modulus of n_len words. n0 is the per-word multiplier used to select
+ * the multiple of n added for each word of a (presumably -n^-1 modulo the word
+ * base, as held in a BN_MONT_CTX - confirm with callers).
+ */
+void
+bn_montgomery_reduce_words(BN_ULONG *r, BN_ULONG *a, const BN_ULONG *n,
+    BN_ULONG n0, int n_len)
+{
+        BN_ULONG v, mask;
+        BN_ULONG carry = 0;
+        int i;
+        /* Add multiples of the modulus, so that it becomes divisible by R. */
+        for (i = 0; i < n_len; i++) {
+                /* a[i..i+n_len) += n[] * (a[i] * n0), v is the word carried out. */
+                v = bn_mulw_add_words(&a[i], n, n_len, a[i] * n0);
+                /*
+                 * Fold v and the running carry into the next word up;
+                 * presumably (carry:a[i + n_len]) = v + a[i + n_len] + carry.
+                 */
+                bn_addw_addw(v, a[i + n_len], carry, &carry, &a[i + n_len]);
+        }
+        /* Divide by R (this is the equivalent of right shifting by n_len). */
+        a = &a[n_len];
+        /*
+         * The output is now in the range of [0, 2N). Attempt to reduce once by
+         * subtracting the modulus. If the reduction was necessary then the
+         * result is already in r, otherwise copy the value prior to reduction
+         * from the top half of a.
+         */
+        /*
+         * Assuming carry and the returned borrow are each 0 or 1, mask is all
+         * ones only when there was no carry and the subtraction borrowed (the
+         * value was already below n), and zero when carry equals borrow.
+         */
+        mask = carry - bn_sub_words(r, a, n, n_len);
+        /* Branch free select per word: a where mask is set, otherwise r. */
+        for (i = 0; i < n_len; i++) {
+                *r = (*r & ~mask) | (*a & mask);
+                r++;
+                a++;
+        }
+}
+/*
 * bn_montgomery_reduce() performs Montgomery reduction, reducing the input
 * from its Montgomery form aR to a, returning the result in r. Note that the
 * input is mutated in the process of performing the reduction, destroying its
@@ -325,7 +364,6 @@ static int
 bn_montgomery_reduce(BIGNUM *r, BIGNUM *a, BN_MONT_CTX *mctx)
 {
        BIGNUM *n;
-        BN_ULONG *ap, *rp, n0, v, carry, mask;
        int i, max, n_len;
        n = &mctx->N;
@@ -341,7 +379,8 @@ bn_montgomery_reduce(BIGNUM *r, BIGNUM *a, BN_MONT_CTX *mctx)
        /*
         * Expand a to twice the length of the modulus, zero if necessary.
-         * XXX - make this a requirement of the caller.
+         * XXX - make this a requirement of the caller or use a temporary
+         * allocation.
         */
        if ((max = 2 * n_len) < n_len)
                return 0;
@@ -350,33 +389,8 @@ bn_montgomery_reduce(BIGNUM *r, BIGNUM *a, BN_MONT_CTX *mctx)
        for (i = a->top; i < max; i++)
                a->d[i] = 0;
-        carry = 0;
+        bn_montgomery_reduce_words(r->d, a->d, n->d, mctx->n0[0], n_len);
-        n0 = mctx->n0[0];
-        /* Add multiples of the modulus, so that it becomes divisible by R. */
-        for (i = 0; i < n_len; i++) {
-                v = bn_mul_add_words(&a->d[i], n->d, n_len, a->d[i] * n0);
-                bn_addw_addw(v, a->d[i + n_len], carry, &carry,
-                    &a->d[i + n_len]);
-        }
-        /* Divide by R (this is the equivalent of right shifting by n_len). */
-        ap = &a->d[n_len];
-        /*
-         * The output is now in the range of [0, 2N). Attempt to reduce once by
-         * subtracting the modulus. If the reduction was necessary then the
-         * result is already in r, otherwise copy the value prior to reduction
-         * from the top half of a.
-         */
-        mask = carry - bn_sub_words(r->d, ap, n->d, n_len);
-        rp = r->d;
-        for (i = 0; i < n_len; i++) {
-                *rp = (*rp & ~mask) | (*ap & mask);
-                rp++;
-                ap++;
-        }
        r->top = n_len;
        bn_correct_top(r);
@@ -417,7 +431,7 @@ bn_mod_mul_montgomery_simple(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
        return ret;
 }
-static void
+static inline void
 bn_montgomery_multiply_word(const BN_ULONG *ap, BN_ULONG b, const BN_ULONG *np,
    BN_ULONG *tp, BN_ULONG w, BN_ULONG *carry_a, BN_ULONG *carry_n, int n_len)
 {
@@ -452,7 +466,7 @@ bn_montgomery_multiply_word(const BN_ULONG *ap, BN_ULONG b, const BN_ULONG *np,
 * given word arrays. The caller must ensure that rp, ap, bp and np are all
 * n_len words in length, while tp must be n_len * 2 + 2 words in length.
 */
-static void
+void
 bn_montgomery_multiply_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
    const BN_ULONG *np, BN_ULONG *tp, BN_ULONG n0, int n_len)
 {
diff --git a/src/lib/libcrypto/bn/bn_mul.c b/src/lib/libcrypto/bn/bn_mul.c
index bdeb9b0fe8..7db0f61849 100644
--- a/src/lib/libcrypto/bn/bn_mul.c
+++ b/src/lib/libcrypto/bn/bn_mul.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: bn_mul.c,v 1.39 2023/07/08 12:21:58 beck Exp $ */
+/* $OpenBSD: bn_mul.c,v 1.46 2025/09/01 15:39:59 jsing Exp $ */
 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
 * All rights reserved.
 *
@@ -57,6 +57,7 @@
 */
 #include <assert.h>
+#include <limits.h>
 #include <stdio.h>
 #include <string.h>
@@ -73,7 +74,7 @@
 */
 #ifndef HAVE_BN_MUL_COMBA4
 void
-bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
+bn_mul_comba4(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b)
 {
        BN_ULONG c0, c1, c2;
@@ -103,13 +104,73 @@ bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 #endif
 /*
+ * bn_mul_comba6() computes r[] = a[] * b[] using Comba multiplication
+ * (https://everything2.com/title/Comba+multiplication), where a and b are both
+ * six word arrays, producing a 12 word array result.
+ *
+ * The products are processed one result column at a time: column k sums every
+ * a[i] * b[j] with i + j == k. A machine specific implementation can replace
+ * this one by defining HAVE_BN_MUL_COMBA6.
+ */
+#ifndef HAVE_BN_MUL_COMBA6
+void
+bn_mul_comba6(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b)
+{
+        BN_ULONG c0, c1, c2;
+        /*
+         * (c2:c1:c0) is a three word accumulator; bn_mulw_addtw() presumably
+         * adds the product of its first two arguments to the three word value
+         * that follows - confirm in bn_internal.h. The first call of each
+         * column passes (0:c2:c1), shifting the accumulator down by one word,
+         * and the last call of each column stores the low word into r[k].
+         */
+        /* Column 0. */
+        bn_mulw_addtw(a[0], b[0],  0,  0,  0, &c2, &c1, &r[0]);
+        /* Column 1. */
+        bn_mulw_addtw(a[0], b[1],  0, c2, c1, &c2, &c1, &c0);
+        bn_mulw_addtw(a[1], b[0], c2, c1, c0, &c2, &c1, &r[1]);
+        /* Column 2. */
+        bn_mulw_addtw(a[2], b[0],  0, c2, c1, &c2, &c1, &c0);
+        bn_mulw_addtw(a[1], b[1], c2, c1, c0, &c2, &c1, &c0);
+        bn_mulw_addtw(a[0], b[2], c2, c1, c0, &c2, &c1, &r[2]);
+        /* Column 3. */
+        bn_mulw_addtw(a[0], b[3],  0, c2, c1, &c2, &c1, &c0);
+        bn_mulw_addtw(a[1], b[2], c2, c1, c0, &c2, &c1, &c0);
+        bn_mulw_addtw(a[2], b[1], c2, c1, c0, &c2, &c1, &c0);
+        bn_mulw_addtw(a[3], b[0], c2, c1, c0, &c2, &c1, &r[3]);
+        /* Column 4. */
+        bn_mulw_addtw(a[4], b[0],  0, c2, c1, &c2, &c1, &c0);
+        bn_mulw_addtw(a[3], b[1], c2, c1, c0, &c2, &c1, &c0);
+        bn_mulw_addtw(a[2], b[2], c2, c1, c0, &c2, &c1, &c0);
+        bn_mulw_addtw(a[1], b[3], c2, c1, c0, &c2, &c1, &c0);
+        bn_mulw_addtw(a[0], b[4], c2, c1, c0, &c2, &c1, &r[4]);
+        /* Column 5. */
+        bn_mulw_addtw(a[0], b[5],  0, c2, c1, &c2, &c1, &c0);
+        bn_mulw_addtw(a[1], b[4], c2, c1, c0, &c2, &c1, &c0);
+        bn_mulw_addtw(a[2], b[3], c2, c1, c0, &c2, &c1, &c0);
+        bn_mulw_addtw(a[3], b[2], c2, c1, c0, &c2, &c1, &c0);
+        bn_mulw_addtw(a[4], b[1], c2, c1, c0, &c2, &c1, &c0);
+        bn_mulw_addtw(a[5], b[0], c2, c1, c0, &c2, &c1, &r[5]);
+        /* Column 6. */
+        bn_mulw_addtw(a[5], b[1],  0, c2, c1, &c2, &c1, &c0);
+        bn_mulw_addtw(a[4], b[2], c2, c1, c0, &c2, &c1, &c0);
+        bn_mulw_addtw(a[3], b[3], c2, c1, c0, &c2, &c1, &c0);
+        bn_mulw_addtw(a[2], b[4], c2, c1, c0, &c2, &c1, &c0);
+        bn_mulw_addtw(a[1], b[5], c2, c1, c0, &c2, &c1, &r[6]);
+        /* Column 7. */
+        bn_mulw_addtw(a[2], b[5],  0, c2, c1, &c2, &c1, &c0);
+        bn_mulw_addtw(a[3], b[4], c2, c1, c0, &c2, &c1, &c0);
+        bn_mulw_addtw(a[4], b[3], c2, c1, c0, &c2, &c1, &c0);
+        bn_mulw_addtw(a[5], b[2], c2, c1, c0, &c2, &c1, &r[7]);
+        /* Column 8. */
+        bn_mulw_addtw(a[5], b[3],  0, c2, c1, &c2, &c1, &c0);
+        bn_mulw_addtw(a[4], b[4], c2, c1, c0, &c2, &c1, &c0);
+        bn_mulw_addtw(a[3], b[5], c2, c1, c0, &c2, &c1, &r[8]);
+        /* Column 9. */
+        bn_mulw_addtw(a[4], b[5],  0, c2, c1, &c2, &c1, &c0);
+        bn_mulw_addtw(a[5], b[4], c2, c1, c0, &c2, &c1, &r[9]);
+        /* Column 10, whose upper word becomes r[11]. */
+        bn_mulw_addtw(a[5], b[5],  0, c2, c1, &c2, &r[11], &r[10]);
+}
+#endif
+/*
 * bn_mul_comba8() computes r[] = a[] * b[] using Comba multiplication
 * (https://everything2.com/title/Comba+multiplication), where a and b are both
 * eight word arrays, producing a 16 word array result.
 */
 #ifndef HAVE_BN_MUL_COMBA8
 void
-bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
+bn_mul_comba8(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b)
 {
        BN_ULONG c0, c1, c2;
@@ -195,14 +256,13 @@ bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 #endif
 /*
- * bn_mul_words() computes (carry:r[i]) = a[i] * w + carry, where a is an array
+ * bn_mulw_words() computes (carry:r[i]) = a[i] * w + carry, where a is an array
- * of words and w is a single word. This should really be called bn_mulw_words()
+ * of words and w is a single word. This is used as a step in the multiplication
- * since only one input is an array. This is used as a step in the multiplication
 * of word arrays.
 */
-#ifndef HAVE_BN_MUL_WORDS
+#ifndef HAVE_BN_MULW_WORDS
 BN_ULONG
-bn_mul_words(BN_ULONG *r, const BN_ULONG *a, int num, BN_ULONG w)
+bn_mulw_words(BN_ULONG *r, const BN_ULONG *a, int num, BN_ULONG w)
 {
        BN_ULONG carry = 0;
@@ -228,14 +288,13 @@ bn_mul_words(BN_ULONG *r, const BN_ULONG *a, int num, BN_ULONG w)
 #endif
 /*
- * bn_mul_add_words() computes (carry:r[i]) = a[i] * w + r[i] + carry, where
+ * bn_mulw_add_words() computes (carry:r[i]) = a[i] * w + r[i] + carry, where
- * a is an array of words and w is a single word. This should really be called
+ * a is an array of words and w is a single word. This is used as a step in the
- * bn_mulw_add_words() since only one input is an array. This is used as a step
+ * multiplication of word arrays.
- * in the multiplication of word arrays.
 */
-#ifndef HAVE_BN_MUL_ADD_WORDS
+#ifndef HAVE_BN_MULW_ADD_WORDS
 BN_ULONG
-bn_mul_add_words(BN_ULONG *r, const BN_ULONG *a, int num, BN_ULONG w)
+bn_mulw_add_words(BN_ULONG *r, const BN_ULONG *a, int num, BN_ULONG w)
 {
        BN_ULONG carry = 0;
@@ -262,62 +321,60 @@ bn_mul_add_words(BN_ULONG *r, const BN_ULONG *a, int num, BN_ULONG w)
 }
 #endif
+#ifndef HAVE_BN_MUL_WORDS
 void
-bn_mul_normal(BN_ULONG *r, BN_ULONG *a, int na, BN_ULONG *b, int nb)
+bn_mul_words(BN_ULONG *r, const BN_ULONG *a, int a_len, const BN_ULONG *b,
+    int b_len)
 {
        BN_ULONG *rr;
+        if (a_len < b_len) {
-        if (na < nb) {
                int itmp;
-                BN_ULONG *ltmp;
+                const BN_ULONG *ltmp;
-                itmp = na;
+                itmp = a_len;
-                na = nb;
+                a_len = b_len;
-                nb = itmp;
+                b_len = itmp;
                ltmp = a;
                a = b;
                b = ltmp;
        }
-        rr = &(r[na]);
+        rr = &(r[a_len]);
-        if (nb <= 0) {
+        if (b_len <= 0) {
-                (void)bn_mul_words(r, a, na, 0);
+                (void)bn_mulw_words(r, a, a_len, 0);
                return;
        } else
-                rr[0] = bn_mul_words(r, a, na, b[0]);
+                rr[0] = bn_mulw_words(r, a, a_len, b[0]);
        for (;;) {
-                if (--nb <= 0)
+                if (--b_len <= 0)
                        return;
-                rr[1] = bn_mul_add_words(&(r[1]), a, na, b[1]);
+                rr[1] = bn_mulw_add_words(&(r[1]), a, a_len, b[1]);
-                if (--nb <= 0)
+                if (--b_len <= 0)
                        return;
-                rr[2] = bn_mul_add_words(&(r[2]), a, na, b[2]);
+                rr[2] = bn_mulw_add_words(&(r[2]), a, a_len, b[2]);
-                if (--nb <= 0)
+                if (--b_len <= 0)
                        return;
-                rr[3] = bn_mul_add_words(&(r[3]), a, na, b[3]);
+                rr[3] = bn_mulw_add_words(&(r[3]), a, a_len, b[3]);
-                if (--nb <= 0)
+                if (--b_len <= 0)
                        return;
-                rr[4] = bn_mul_add_words(&(r[4]), a, na, b[4]);
+                rr[4] = bn_mulw_add_words(&(r[4]), a, a_len, b[4]);
                rr += 4;
                r += 4;
                b += 4;
        }
 }
+#endif
+static int
-#ifndef HAVE_BN_MUL
-int
 bn_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, int rn, BN_CTX *ctx)
 {
-        bn_mul_normal(r->d, a->d, a->top, b->d, b->top);
+        bn_mul_words(r->d, a->d, a->top, b->d, b->top);
        return 1;
 }
-#endif /* HAVE_BN_MUL */
 int
 BN_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx)
 {
@@ -338,14 +395,16 @@ BN_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx)
        if (rr == NULL)
                goto err;
-        rn = a->top + b->top;
+        if (a->top > INT_MAX - b->top)
-        if (rn < a->top)
                goto err;
+        rn = a->top + b->top;
        if (!bn_wexpand(rr, rn))
                goto err;
        if (a->top == 4 && b->top == 4) {
                bn_mul_comba4(rr->d, a->d, b->d);
+        } else if (a->top == 6 && b->top == 6) {
+                bn_mul_comba6(rr->d, a->d, b->d);
        } else if (a->top == 8 && b->top == 8) {
                bn_mul_comba8(rr->d, a->d, b->d);
        } else {
diff --git a/src/lib/libcrypto/bn/bn_prime.c b/src/lib/libcrypto/bn/bn_prime.c
index 5a4aa50bf1..3d7f18a8ea 100644
--- a/src/lib/libcrypto/bn/bn_prime.c
+++ b/src/lib/libcrypto/bn/bn_prime.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: bn_prime.c,v 1.34 2023/07/20 06:26:27 tb Exp $ */
+/* $OpenBSD: bn_prime.c,v 1.37 2025/11/08 16:27:33 tb Exp $ */
 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
 * All rights reserved.
 *
@@ -109,12 +109,12 @@
 *
 */
-#include <stdio.h>
+#include <stddef.h>
-#include <time.h>
-#include <openssl/err.h>
+#include <openssl/bn.h>
 #include "bn_local.h"
+#include "err_local.h"
 /* The quick sieve algorithm approach to weeding out primes is
 * Philip Zimmermann's, as implemented in PGP.  I have had a read of
@@ -339,7 +339,7 @@ probable_prime_dh(BIGNUM *rnd, int bits, const BIGNUM *add, const BIGNUM *rem,
 loop:
        for (i = 1; i < NUMPRIMES; i++) {
                /* check that rnd is a prime */
-                BN_LONG mod = BN_mod_word(rnd, primes[i]);
+                BN_ULONG mod = BN_mod_word(rnd, primes[i]);
                if (mod == (BN_ULONG)-1)
                        goto err;
                if (mod <= 1) {
diff --git a/src/lib/libcrypto/bn/bn_rand.c b/src/lib/libcrypto/bn/bn_rand.c
index 9cfcd8e2c0..d3b16f70a0 100644
--- a/src/lib/libcrypto/bn/bn_rand.c
+++ b/src/lib/libcrypto/bn/bn_rand.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: bn_rand.c,v 1.30 2024/03/16 20:42:33 tb Exp $ */
+/* $OpenBSD: bn_rand.c,v 1.31 2025/05/10 05:54:38 tb Exp $ */
 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
 * All rights reserved.
 *
@@ -115,9 +115,8 @@
 #include <string.h>
 #include <time.h>
-#include <openssl/err.h>
 #include "bn_local.h"
+#include "err_local.h"
 static int
 bnrand(int pseudorand, BIGNUM *rnd, int bits, int top, int bottom)
diff --git a/src/lib/libcrypto/bn/bn_recp.c b/src/lib/libcrypto/bn/bn_recp.c
index e3f22c52a9..ed5049b772 100644
--- a/src/lib/libcrypto/bn/bn_recp.c
+++ b/src/lib/libcrypto/bn/bn_recp.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: bn_recp.c,v 1.33 2025/02/04 20:22:20 tb Exp $ */
+/* $OpenBSD: bn_recp.c,v 1.34 2025/05/10 05:54:38 tb Exp $ */
 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
 * All rights reserved.
 *
@@ -58,9 +58,8 @@
 #include <stdio.h>
-#include <openssl/err.h>
 #include "bn_local.h"
+#include "err_local.h"
 struct bn_recp_ctx_st {
        BIGNUM *N;      /* the divisor */
diff --git a/src/lib/libcrypto/bn/bn_shift.c b/src/lib/libcrypto/bn/bn_shift.c
index 12edc7c0a0..b9f73cc322 100644
--- a/src/lib/libcrypto/bn/bn_shift.c
+++ b/src/lib/libcrypto/bn/bn_shift.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: bn_shift.c,v 1.22 2023/07/08 12:21:58 beck Exp $ */
+/* $OpenBSD: bn_shift.c,v 1.23 2025/05/10 05:54:38 tb Exp $ */
 /*
 * Copyright (c) 2022, 2023 Joel Sing <jsing@openbsd.org>
 *
@@ -16,9 +16,9 @@
 */
 #include <openssl/bn.h>
-#include <openssl/err.h>
 #include "bn_local.h"
+#include "err_local.h"
 static inline int
 bn_lshift(BIGNUM *r, const BIGNUM *a, int n)
diff --git a/src/lib/libcrypto/bn/bn_sqr.c b/src/lib/libcrypto/bn/bn_sqr.c
index 0dbccbf85d..27e08bdf13 100644
--- a/src/lib/libcrypto/bn/bn_sqr.c
+++ b/src/lib/libcrypto/bn/bn_sqr.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: bn_sqr.c,v 1.36 2023/07/08 12:21:58 beck Exp $ */
+/* $OpenBSD: bn_sqr.c,v 1.42 2025/09/07 05:21:29 jsing Exp $ */
 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
 * All rights reserved.
 *
@@ -64,8 +64,6 @@
 #include "bn_local.h"
 #include "bn_internal.h"
-int bn_sqr(BIGNUM *r, const BIGNUM *a, int max, BN_CTX *ctx);
 /*
 * bn_sqr_comba4() computes r[] = a[] * a[] using Comba multiplication
 * (https://everything2.com/title/Comba+multiplication), where a is a
@@ -97,6 +95,51 @@ bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
 #endif
 /*
+ * bn_sqr_comba6() computes r[] = a[] * a[] using Comba multiplication
+ * (https://everything2.com/title/Comba+multiplication), where a is a
+ * six word array, producing a 12 word array result.
+ *
+ * The products are processed one result column at a time: column k sums every
+ * a[i] * a[j] with i + j == k. Each pair with i != j appears once and goes
+ * through bn_mul2_mulw_addtw(), while the squares a[i] * a[i] go through
+ * bn_mulw_addtw(). A machine specific implementation can replace this one by
+ * defining HAVE_BN_SQR_COMBA6.
+ */
+#ifndef HAVE_BN_SQR_COMBA6
+void
+bn_sqr_comba6(BN_ULONG *r, const BN_ULONG *a)
+{
+        BN_ULONG c2, c1, c0;
+        /*
+         * (c2:c1:c0) is a three word accumulator; bn_mulw_addtw() presumably
+         * adds the product of its first two arguments to the three word value
+         * that follows and bn_mul2_mulw_addtw() presumably adds twice that
+         * product - confirm in bn_internal.h. The first call of each column
+         * passes (0:c2:c1), shifting the accumulator down by one word, and the
+         * last call of each column stores the low word into r[k].
+         */
+        /* Column 0. */
+        bn_mulw_addtw(a[0], a[0], 0, 0, 0, &c2, &c1, &r[0]);
+        /* Column 1. */
+        bn_mul2_mulw_addtw(a[1], a[0], 0, c2, c1, &c2, &c1, &r[1]);
+        /* Column 2. */
+        bn_mulw_addtw(a[1], a[1], 0, c2, c1, &c2, &c1, &c0);
+        bn_mul2_mulw_addtw(a[2], a[0], c2, c1, c0, &c2, &c1, &r[2]);
+        /* Column 3. */
+        bn_mul2_mulw_addtw(a[3], a[0], 0, c2, c1, &c2, &c1, &c0);
+        bn_mul2_mulw_addtw(a[2], a[1], c2, c1, c0, &c2, &c1, &r[3]);
+        /* Column 4. */
+        bn_mulw_addtw(a[2], a[2], 0, c2, c1, &c2, &c1, &c0);
+        bn_mul2_mulw_addtw(a[3], a[1], c2, c1, c0, &c2, &c1, &c0);
+        bn_mul2_mulw_addtw(a[4], a[0], c2, c1, c0, &c2, &c1, &r[4]);
+        /* Column 5. */
+        bn_mul2_mulw_addtw(a[5], a[0], 0, c2, c1, &c2, &c1, &c0);
+        bn_mul2_mulw_addtw(a[4], a[1], c2, c1, c0, &c2, &c1, &c0);
+        bn_mul2_mulw_addtw(a[3], a[2], c2, c1, c0, &c2, &c1, &r[5]);
+        /* Column 6. */
+        bn_mulw_addtw(a[3], a[3], 0, c2, c1, &c2, &c1, &c0);
+        bn_mul2_mulw_addtw(a[4], a[2], c2, c1, c0, &c2, &c1, &c0);
+        bn_mul2_mulw_addtw(a[5], a[1], c2, c1, c0, &c2, &c1, &r[6]);
+        /* Column 7. */
+        bn_mul2_mulw_addtw(a[5], a[2], 0, c2, c1, &c2, &c1, &c0);
+        bn_mul2_mulw_addtw(a[4], a[3], c2, c1, c0, &c2, &c1, &r[7]);
+        /* Column 8. */
+        bn_mulw_addtw(a[4], a[4], 0, c2, c1, &c2, &c1, &c0);
+        bn_mul2_mulw_addtw(a[5], a[3], c2, c1, c0, &c2, &c1, &r[8]);
+        /* Column 9. */
+        bn_mul2_mulw_addtw(a[5], a[4], 0, c2, c1, &c2, &c1, &r[9]);
+        /* Column 10, whose upper word becomes r[11]. */
+        bn_mulw_addtw(a[5], a[5], 0, c2, c1, &c2, &r[11], &r[10]);
+}
+#endif
+/*
 * bn_sqr_comba8() computes r[] = a[] * a[] using Comba multiplication
 * (https://everything2.com/title/Comba+multiplication), where a is an
 * eight word array, producing an 16 word array result.
@@ -160,7 +203,7 @@ bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
 }
 #endif
-#ifndef HAVE_BN_SQR
+#ifndef HAVE_BN_SQR_WORDS
 /*
 * bn_sqr_add_words() computes (r[i*2+1]:r[i*2]) = (r[i*2+1]:r[i*2]) + a[i] * a[i].
 */
@@ -197,12 +240,16 @@ bn_sqr_add_words(BN_ULONG *r, const BN_ULONG *a, int n)
        }
 }
-static void
+/*
-bn_sqr_normal(BN_ULONG *r, int r_len, const BN_ULONG *a, int a_len)
+ * bn_sqr_words() computes r[] = a[] * a[].
+ */
+void
+bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int a_len)
 {
        const BN_ULONG *ap;
        BN_ULONG *rp;
        BN_ULONG w;
+        int r_len;
        int n;
        if (a_len <= 0)
@@ -213,13 +260,14 @@ bn_sqr_normal(BN_ULONG *r, int r_len, const BN_ULONG *a, int a_len)
        ap++;
        rp = r;
+        r_len = a_len * 2;
        rp[0] = rp[r_len - 1] = 0;
        rp++;
        /* Compute initial product - r[n:1] = a[n:1] * a[0] */
        n = a_len - 1;
        if (n > 0) {
-                rp[n] = bn_mul_words(rp, ap, n, w);
+                rp[n] = bn_mulw_words(rp, ap, n, w);
        }
        rp += 2;
        n--;
@@ -229,7 +277,7 @@ bn_sqr_normal(BN_ULONG *r, int r_len, const BN_ULONG *a, int a_len)
                w = ap[0];
                ap++;
-                rp[n] = bn_mul_add_words(rp, ap, n, w);
+                rp[n] = bn_mulw_add_words(rp, ap, n, w);
                rp += 2;
                n--;
        }
@@ -240,20 +288,20 @@ bn_sqr_normal(BN_ULONG *r, int r_len, const BN_ULONG *a, int a_len)
        /* Add squares. */
        bn_sqr_add_words(r, a, a_len);
 }
+#endif
 /*
 * bn_sqr() computes a * a, storing the result in r. The caller must ensure that
 * r is not the same BIGNUM as a and that r has been expanded to rn = a->top * 2
 * words.
 */
-int
+static int
-bn_sqr(BIGNUM *r, const BIGNUM *a, int r_len, BN_CTX *ctx)
+bn_sqr(BIGNUM *r, const BIGNUM *a, BN_CTX *ctx)
 {
-        bn_sqr_normal(r->d, r_len, a->d, a->top);
+        bn_sqr_words(r->d, a->d, a->top);
        return 1;
 }
-#endif
 int
 BN_sqr(BIGNUM *r, const BIGNUM *a, BN_CTX *ctx)
@@ -281,10 +329,12 @@ BN_sqr(BIGNUM *r, const BIGNUM *a, BN_CTX *ctx)
        if (a->top == 4) {
                bn_sqr_comba4(rr->d, a->d);
+        } else if (a->top == 6) {
+                bn_sqr_comba6(rr->d, a->d);
        } else if (a->top == 8) {
                bn_sqr_comba8(rr->d, a->d);
        } else {
-                if (!bn_sqr(rr, a, r_len, ctx))
+                if (!bn_sqr(rr, a, ctx))
                        goto err;
        }
diff --git a/src/lib/libcrypto/bn/bn_word.c b/src/lib/libcrypto/bn/bn_word.c
index a82b911e67..e035878cb9 100644
--- a/src/lib/libcrypto/bn/bn_word.c
+++ b/src/lib/libcrypto/bn/bn_word.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: bn_word.c,v 1.21 2023/07/08 12:21:58 beck Exp $ */
+/* $OpenBSD: bn_word.c,v 1.22 2025/08/30 07:54:27 jsing Exp $ */
 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
 * All rights reserved.
 *
@@ -232,7 +232,7 @@ BN_mul_word(BIGNUM *a, BN_ULONG w)
                if (w == 0)
                        BN_zero(a);
                else {
-                        ll = bn_mul_words(a->d, a->d, a->top, w);
+                        ll = bn_mulw_words(a->d, a->d, a->top, w);
                        if (ll) {
                                if (!bn_wexpand(a, a->top + 1))
                                        return (0);
diff --git a/src/lib/libcrypto/bn/s2n_bignum.h b/src/lib/libcrypto/bn/s2n_bignum.h
index ce6e8cdc94..7d77894cdc 100644
--- a/src/lib/libcrypto/bn/s2n_bignum.h
+++ b/src/lib/libcrypto/bn/s2n_bignum.h
@@ -1,3 +1,5 @@
+// $OpenBSD: s2n_bignum.h,v 1.4 2025/08/12 10:01:37 jsing Exp $
+//
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
@@ -34,182 +36,240 @@
 //        throughput, generally offering higher performance there.
 // ----------------------------------------------------------------------------
+#if defined(_MSC_VER) || !defined(__STDC_VERSION__) || __STDC_VERSION__ < 199901L || defined(__STDC_NO_VLA__)
+#define S2N_BIGNUM_STATIC
+#else
+#define S2N_BIGNUM_STATIC static
+#endif
 // Add, z := x + y
 // Inputs x[m], y[n]; outputs function return (carry-out) and z[p]
-extern uint64_t bignum_add (uint64_t p, uint64_t *z, uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
+extern uint64_t bignum_add (uint64_t p, uint64_t *z, uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y);
 // Add modulo p_25519, z := (x + y) mod p_25519, assuming x and y reduced
 // Inputs x[4], y[4]; output z[4]
-extern void bignum_add_p25519 (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]);
+extern void bignum_add_p25519 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
 // Add modulo p_256, z := (x + y) mod p_256, assuming x and y reduced
 // Inputs x[4], y[4]; output z[4]
-extern void bignum_add_p256 (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]);
+extern void bignum_add_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
 // Add modulo p_256k1, z := (x + y) mod p_256k1, assuming x and y reduced
 // Inputs x[4], y[4]; output z[4]
-extern void bignum_add_p256k1 (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]);
+extern void bignum_add_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
 // Add modulo p_384, z := (x + y) mod p_384, assuming x and y reduced
 // Inputs x[6], y[6]; output z[6]
-extern void bignum_add_p384 (uint64_t z[static 6], uint64_t x[static 6], uint64_t y[static 6]);
+extern void bignum_add_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]);
 // Add modulo p_521, z := (x + y) mod p_521, assuming x and y reduced
 // Inputs x[9], y[9]; output z[9]
-extern void bignum_add_p521 (uint64_t z[static 9], uint64_t x[static 9], uint64_t y[static 9]);
+extern void bignum_add_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9], const uint64_t y[S2N_BIGNUM_STATIC 9]);
+// Add modulo p_sm2, z := (x + y) mod p_sm2, assuming x and y reduced
+// Inputs x[4], y[4]; output z[4]
+extern void bignum_add_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
 // Compute "amontification" constant z :== 2^{128k} (congruent mod m)
 // Input m[k]; output z[k]; temporary buffer t[>=k]
-extern void bignum_amontifier (uint64_t k, uint64_t *z, uint64_t *m, uint64_t *t);
+extern void bignum_amontifier (uint64_t k, uint64_t *z, const uint64_t *m, uint64_t *t);
 // Almost-Montgomery multiply, z :== (x * y / 2^{64k}) (congruent mod m)
 // Inputs x[k], y[k], m[k]; output z[k]
-extern void bignum_amontmul (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y, uint64_t *m);
+extern void bignum_amontmul (uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *y, const uint64_t *m);
 // Almost-Montgomery reduce, z :== (x' / 2^{64p}) (congruent mod m)
 // Inputs x[n], m[k], p; output z[k]
-extern void bignum_amontredc (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x, uint64_t *m, uint64_t p);
+extern void bignum_amontredc (uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x, const uint64_t *m, uint64_t p);
 // Almost-Montgomery square, z :== (x^2 / 2^{64k}) (congruent mod m)
 // Inputs x[k], m[k]; output z[k]
-extern void bignum_amontsqr (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *m);
+extern void bignum_amontsqr (uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *m);
 // Convert 4-digit (256-bit) bignum to/from big-endian form
 // Input x[4]; output z[4]
-extern void bignum_bigendian_4 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_bigendian_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Convert 6-digit (384-bit) bignum to/from big-endian form
 // Input x[6]; output z[6]
-extern void bignum_bigendian_6 (uint64_t z[static 6], uint64_t x[static 6]);
+extern void bignum_bigendian_6 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Select bitfield starting at bit n with length l <= 64
 // Inputs x[k], n, l; output function return
-extern uint64_t bignum_bitfield (uint64_t k, uint64_t *x, uint64_t n, uint64_t l);
+extern uint64_t bignum_bitfield (uint64_t k, const uint64_t *x, uint64_t n, uint64_t l);
 // Return size of bignum in bits
 // Input x[k]; output function return
-extern uint64_t bignum_bitsize (uint64_t k, uint64_t *x);
+extern uint64_t bignum_bitsize (uint64_t k, const uint64_t *x);
 // Divide by a single (nonzero) word, z := x / m and return x mod m
 // Inputs x[n], m; outputs function return (remainder) and z[k]
-extern uint64_t bignum_cdiv (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x, uint64_t m);
+extern uint64_t bignum_cdiv (uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x, uint64_t m);
 // Divide by a single word, z := x / m when known to be exact
 // Inputs x[n], m; output z[k]
-extern void bignum_cdiv_exact (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x, uint64_t m);
+extern void bignum_cdiv_exact (uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x, uint64_t m);
 // Count leading zero digits (64-bit words)
 // Input x[k]; output function return
-extern uint64_t bignum_cld (uint64_t k, uint64_t *x);
+extern uint64_t bignum_cld (uint64_t k, const uint64_t *x);
 // Count leading zero bits
 // Input x[k]; output function return
-extern uint64_t bignum_clz (uint64_t k, uint64_t *x);
+extern uint64_t bignum_clz (uint64_t k, const uint64_t *x);
 // Multiply-add with single-word multiplier, z := z + c * y
 // Inputs c, y[n]; outputs function return (carry-out) and z[k]
-extern uint64_t bignum_cmadd (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y);
+extern uint64_t bignum_cmadd (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, const uint64_t *y);
 // Negated multiply-add with single-word multiplier, z := z - c * y
 // Inputs c, y[n]; outputs function return (negative carry-out) and z[k]
-extern uint64_t bignum_cmnegadd (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y);
+extern uint64_t bignum_cmnegadd (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, const uint64_t *y);
 // Find modulus of bignum w.r.t. single nonzero word m, returning x mod m
 // Input x[k], m; output function return
-extern uint64_t bignum_cmod (uint64_t k, uint64_t *x, uint64_t m);
+extern uint64_t bignum_cmod (uint64_t k, const uint64_t *x, uint64_t m);
 // Multiply by a single word, z := c * y
 // Inputs c, y[n]; outputs function return (carry-out) and z[k]
-extern uint64_t bignum_cmul (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y);
+extern uint64_t bignum_cmul (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, const uint64_t *y);
 // Multiply by a single word modulo p_25519, z := (c * x) mod p_25519, assuming x reduced
 // Inputs c, x[4]; output z[4]
-extern void bignum_cmul_p25519 (uint64_t z[static 4], uint64_t c, uint64_t x[static 4]);
+extern void bignum_cmul_p25519 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 4]);
-extern void bignum_cmul_p25519_alt (uint64_t z[static 4], uint64_t c, uint64_t x[static 4]);
+extern void bignum_cmul_p25519_alt (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Multiply by a single word modulo p_256, z := (c * x) mod p_256, assuming x reduced
 // Inputs c, x[4]; output z[4]
-extern void bignum_cmul_p256 (uint64_t z[static 4], uint64_t c, uint64_t x[static 4]);
+extern void bignum_cmul_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 4]);
-extern void bignum_cmul_p256_alt (uint64_t z[static 4], uint64_t c, uint64_t x[static 4]);
+extern void bignum_cmul_p256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Multiply by a single word modulo p_256k1, z := (c * x) mod p_256k1, assuming x reduced
 // Inputs c, x[4]; output z[4]
-extern void bignum_cmul_p256k1 (uint64_t z[static 4], uint64_t c, uint64_t x[static 4]);
+extern void bignum_cmul_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 4]);
-extern void bignum_cmul_p256k1_alt (uint64_t z[static 4], uint64_t c, uint64_t x[static 4]);
+extern void bignum_cmul_p256k1_alt (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Multiply by a single word modulo p_384, z := (c * x) mod p_384, assuming x reduced
 // Inputs c, x[6]; output z[6]
-extern void bignum_cmul_p384 (uint64_t z[static 6], uint64_t c, uint64_t x[static 6]);
+extern void bignum_cmul_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 6]);
-extern void bignum_cmul_p384_alt (uint64_t z[static 6], uint64_t c, uint64_t x[static 6]);
+extern void bignum_cmul_p384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Multiply by a single word modulo p_521, z := (c * x) mod p_521, assuming x reduced
 // Inputs c, x[9]; output z[9]
-extern void bignum_cmul_p521 (uint64_t z[static 9], uint64_t c, uint64_t x[static 9]);
+extern void bignum_cmul_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 9]);
-extern void bignum_cmul_p521_alt (uint64_t z[static 9], uint64_t c, uint64_t x[static 9]);
+extern void bignum_cmul_p521_alt (uint64_t z[S2N_BIGNUM_STATIC 9], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 9]);
+// Multiply by a single word modulo p_sm2, z := (c * x) mod p_sm2, assuming x reduced
+// Inputs c, x[4]; output z[4]
+extern void bignum_cmul_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 4]);
+extern void bignum_cmul_sm2_alt (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Test bignums for coprimality, gcd(x,y) = 1
 // Inputs x[m], y[n]; output function return; temporary buffer t[>=2*max(m,n)]
-extern uint64_t bignum_coprime (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y, uint64_t *t);
+extern uint64_t bignum_coprime (uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y, uint64_t *t);
 // Copy bignum with zero-extension or truncation, z := x
 // Input x[n]; output z[k]
-extern void bignum_copy (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x);
+extern void bignum_copy (uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x);
+// Given table: uint64_t[height*width], copy table[idx*width...(idx+1)*width-1]
+// into z[0..width-1].
+// This function is constant-time with respect to the value of `idx`. This is
+// achieved by reading the whole table and using bit-masking to select the
+// `idx`-th row.
+// Input table[height*width]; output z[width]
+extern void bignum_copy_row_from_table (uint64_t *z, const uint64_t *table, uint64_t height,
+        uint64_t width, uint64_t idx);
+// Given table: uint64_t[height*width], copy table[idx*width...(idx+1)*width-1]
+// into z[0..width-1]. width must be a multiple of 8.
+// This function is constant-time with respect to the value of `idx`. This is
+// achieved by reading the whole table and using bit-masking to select the
+// `idx`-th row.
+// Input table[height*width]; output z[width]
+extern void bignum_copy_row_from_table_8n (uint64_t *z, const uint64_t *table,
+        uint64_t height, uint64_t width, uint64_t idx);
+// Given table: uint64_t[height*16], copy table[idx*16...(idx+1)*16-1] into z[0..15].
+// This function is constant-time with respect to the value of `idx`. This is
+// achieved by reading the whole table and using bit-masking to select the
+// `idx`-th row.
+// Input table[height*16]; output z[16]
+extern void bignum_copy_row_from_table_16 (uint64_t *z, const uint64_t *table,
+        uint64_t height, uint64_t idx);
+// Given table: uint64_t[height*32], copy table[idx*32...(idx+1)*32-1] into z[0..31].
+// This function is constant-time with respect to the value of `idx`. This is
+// achieved by reading the whole table and using bit-masking to select the
+// `idx`-th row.
+// Input table[height*32]; output z[32]
+extern void bignum_copy_row_from_table_32 (uint64_t *z, const uint64_t *table,
+        uint64_t height, uint64_t idx);
 // Count trailing zero digits (64-bit words)
 // Input x[k]; output function return
-extern uint64_t bignum_ctd (uint64_t k, uint64_t *x);
+extern uint64_t bignum_ctd (uint64_t k, const uint64_t *x);
 // Count trailing zero bits
 // Input x[k]; output function return
-extern uint64_t bignum_ctz (uint64_t k, uint64_t *x);
+extern uint64_t bignum_ctz (uint64_t k, const uint64_t *x);
 // Convert from almost-Montgomery form, z := (x / 2^256) mod p_256
 // Input x[4]; output z[4]
-extern void bignum_deamont_p256 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_deamont_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
-extern void bignum_deamont_p256_alt (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_deamont_p256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Convert from almost-Montgomery form, z := (x / 2^256) mod p_256k1
 // Input x[4]; output z[4]
-extern void bignum_deamont_p256k1 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_deamont_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Convert from almost-Montgomery form, z := (x / 2^384) mod p_384
 // Input x[6]; output z[6]
-extern void bignum_deamont_p384 (uint64_t z[static 6], uint64_t x[static 6]);
+extern void bignum_deamont_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
-extern void bignum_deamont_p384_alt (uint64_t z[static 6], uint64_t x[static 6]);
+extern void bignum_deamont_p384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Convert from almost-Montgomery form z := (x / 2^576) mod p_521
 // Input x[9]; output z[9]
-extern void bignum_deamont_p521 (uint64_t z[static 9], uint64_t x[static 9]);
+extern void bignum_deamont_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
+// Convert from almost-Montgomery form z := (x / 2^256) mod p_sm2
+// Input x[4]; output z[4]
+extern void bignum_deamont_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Convert from (almost-)Montgomery form z := (x / 2^{64k}) mod m
 // Inputs x[k], m[k]; output z[k]
-extern void bignum_demont (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *m);
+extern void bignum_demont (uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *m);
 // Convert from Montgomery form z := (x / 2^256) mod p_256, assuming x reduced
 // Input x[4]; output z[4]
-extern void bignum_demont_p256 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_demont_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
-extern void bignum_demont_p256_alt (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_demont_p256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Convert from Montgomery form z := (x / 2^256) mod p_256k1, assuming x reduced
 // Input x[4]; output z[4]
-extern void bignum_demont_p256k1 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_demont_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Convert from Montgomery form z := (x / 2^384) mod p_384, assuming x reduced
 // Input x[6]; output z[6]
-extern void bignum_demont_p384 (uint64_t z[static 6], uint64_t x[static 6]);
+extern void bignum_demont_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
-extern void bignum_demont_p384_alt (uint64_t z[static 6], uint64_t x[static 6]);
+extern void bignum_demont_p384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Convert from Montgomery form z := (x / 2^576) mod p_521, assuming x reduced
 // Input x[9]; output z[9]
-extern void bignum_demont_p521 (uint64_t z[static 9], uint64_t x[static 9]);
+extern void bignum_demont_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
+// Convert from Montgomery form z := (x / 2^256) mod p_sm2, assuming x reduced
+// Input x[4]; output z[4]
+extern void bignum_demont_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Select digit x[n]
 // Inputs x[k], n; output function return
-extern uint64_t bignum_digit (uint64_t k, uint64_t *x, uint64_t n);
+extern uint64_t bignum_digit (uint64_t k, const uint64_t *x, uint64_t n);
 // Return size of bignum in digits (64-bit word)
 // Input x[k]; output function return
-extern uint64_t bignum_digitsize (uint64_t k, uint64_t *x);
+extern uint64_t bignum_digitsize (uint64_t k, const uint64_t *x);
 // Divide bignum by 10: z' := z div 10, returning remainder z mod 10
 // Inputs z[k]; outputs function return (remainder) and z[k]
@@ -217,294 +277,391 @@ extern uint64_t bignum_divmod10 (uint64_t k, uint64_t *z);
 // Double modulo p_25519, z := (2 * x) mod p_25519, assuming x reduced
 // Input x[4]; output z[4]
-extern void bignum_double_p25519 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_double_p25519 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Double modulo p_256, z := (2 * x) mod p_256, assuming x reduced
 // Input x[4]; output z[4]
-extern void bignum_double_p256 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_double_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Double modulo p_256k1, z := (2 * x) mod p_256k1, assuming x reduced
 // Input x[4]; output z[4]
-extern void bignum_double_p256k1 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_double_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Double modulo p_384, z := (2 * x) mod p_384, assuming x reduced
 // Input x[6]; output z[6]
-extern void bignum_double_p384 (uint64_t z[static 6], uint64_t x[static 6]);
+extern void bignum_double_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Double modulo p_521, z := (2 * x) mod p_521, assuming x reduced
 // Input x[9]; output z[9]
-extern void bignum_double_p521 (uint64_t z[static 9], uint64_t x[static 9]);
+extern void bignum_double_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
+// Double modulo p_sm2, z := (2 * x) mod p_sm2, assuming x reduced
+// Input x[4]; output z[4]
+extern void bignum_double_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Extended Montgomery reduce, returning results in input-output buffer
 // Inputs z[2*k], m[k], w; outputs function return (extra result bit) and z[2*k]
-extern uint64_t bignum_emontredc (uint64_t k, uint64_t *z, uint64_t *m, uint64_t w);
+extern uint64_t bignum_emontredc (uint64_t k, uint64_t *z, const uint64_t *m, uint64_t w);
 // Extended Montgomery reduce in 8-digit blocks, results in input-output buffer
 // Inputs z[2*k], m[k], w; outputs function return (extra result bit) and z[2*k]
-extern uint64_t bignum_emontredc_8n (uint64_t k, uint64_t *z, uint64_t *m, uint64_t w);
+extern uint64_t bignum_emontredc_8n (uint64_t k, uint64_t *z, const uint64_t *m, uint64_t w);
+// Inputs z[2*k], m[k], w; outputs function return (extra result bit) and z[2*k]
+// Temporary buffer m_precalc[12*(k/4-1)]
+extern uint64_t bignum_emontredc_8n_cdiff (uint64_t k, uint64_t *z, const uint64_t *m,
+                                          uint64_t w, uint64_t *m_precalc);
 // Test bignums for equality, x = y
 // Inputs x[m], y[n]; output function return
-extern uint64_t bignum_eq (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
+extern uint64_t bignum_eq (uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y);
 // Test bignum for even-ness
 // Input x[k]; output function return
-extern uint64_t bignum_even (uint64_t k, uint64_t *x);
+extern uint64_t bignum_even (uint64_t k, const uint64_t *x);
 // Convert 4-digit (256-bit) bignum from big-endian bytes
 // Input x[32] (bytes); output z[4]
-extern void bignum_frombebytes_4 (uint64_t z[static 4], uint8_t x[static 32]);
+extern void bignum_frombebytes_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint8_t x[S2N_BIGNUM_STATIC 32]);
 // Convert 6-digit (384-bit) bignum from big-endian bytes
 // Input x[48] (bytes); output z[6]
-extern void bignum_frombebytes_6 (uint64_t z[static 6], uint8_t x[static 48]);
+extern void bignum_frombebytes_6 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint8_t x[S2N_BIGNUM_STATIC 48]);
 // Convert 4-digit (256-bit) bignum from little-endian bytes
 // Input x[32] (bytes); output z[4]
-extern void bignum_fromlebytes_4 (uint64_t z[static 4], uint8_t x[static 32]);
+extern void bignum_fromlebytes_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint8_t x[S2N_BIGNUM_STATIC 32]);
 // Convert 6-digit (384-bit) bignum from little-endian bytes
 // Input x[48] (bytes); output z[6]
-extern void bignum_fromlebytes_6 (uint64_t z[static 6], uint8_t x[static 48]);
+extern void bignum_fromlebytes_6 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint8_t x[S2N_BIGNUM_STATIC 48]);
 // Convert little-endian bytes to 9-digit 528-bit bignum
 // Input x[66] (bytes); output z[9]
-extern void bignum_fromlebytes_p521 (uint64_t z[static 9],uint8_t x[static 66]);
+extern void bignum_fromlebytes_p521 (uint64_t z[S2N_BIGNUM_STATIC 9],const uint8_t x[S2N_BIGNUM_STATIC 66]);
 // Compare bignums, x >= y
 // Inputs x[m], y[n]; output function return
-extern uint64_t bignum_ge (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
+extern uint64_t bignum_ge (uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y);
 // Compare bignums, x > y
 // Inputs x[m], y[n]; output function return
-extern uint64_t bignum_gt (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
+extern uint64_t bignum_gt (uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y);
 // Halve modulo p_256, z := (x / 2) mod p_256, assuming x reduced
 // Input x[4]; output z[4]
-extern void bignum_half_p256 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_half_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Halve modulo p_256k1, z := (x / 2) mod p_256k1, assuming x reduced
 // Input x[4]; output z[4]
-extern void bignum_half_p256k1 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_half_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Halve modulo p_384, z := (x / 2) mod p_384, assuming x reduced
 // Input x[6]; output z[6]
-extern void bignum_half_p384 (uint64_t z[static 6], uint64_t x[static 6]);
+extern void bignum_half_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Halve modulo p_521, z := (x / 2) mod p_521, assuming x reduced
 // Input x[9]; output z[9]
-extern void bignum_half_p521 (uint64_t z[static 9], uint64_t x[static 9]);
+extern void bignum_half_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
+// Halve modulo p_sm2, z := (x / 2) mod p_sm2, assuming x reduced
+// Input x[4]; output z[4]
+extern void bignum_half_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
+// Modular inverse modulo p_25519 = 2^255 - 19
+// Input x[4]; output z[4]
+extern void bignum_inv_p25519(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]);
+// Modular inverse modulo p_256 = 2^256 - 2^224 + 2^192 + 2^96 - 1
+// Input x[4]; output z[4]
+extern void bignum_inv_p256(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]);
+// Modular inverse modulo p_384 = 2^384 - 2^128 - 2^96 + 2^32 - 1
+// Input x[6]; output z[6]
+extern void bignum_inv_p384(uint64_t z[S2N_BIGNUM_STATIC 6],const uint64_t x[S2N_BIGNUM_STATIC 6]);
+// Modular inverse modulo p_521 = 2^521 - 1
+// Input x[9]; output z[9]
+extern void bignum_inv_p521(uint64_t z[S2N_BIGNUM_STATIC 9],const uint64_t x[S2N_BIGNUM_STATIC 9]);
+// Modular inverse modulo p_sm2 = 2^256 - 2^224 - 2^96 + 2^64 - 1
+// Input x[4]; output z[4]
+extern void bignum_inv_sm2(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]);
+// Inverse square root modulo p_25519
+// Input x[4]; output function return (Legendre symbol) and z[4]
+extern int64_t bignum_invsqrt_p25519(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]);
+extern int64_t bignum_invsqrt_p25519_alt(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Test bignum for zero-ness, x = 0
 // Input x[k]; output function return
-extern uint64_t bignum_iszero (uint64_t k, uint64_t *x);
+extern uint64_t bignum_iszero (uint64_t k, const uint64_t *x);
 // Multiply z := x * y
 // Inputs x[16], y[16]; output z[32]; temporary buffer t[>=32]
-extern void bignum_kmul_16_32 (uint64_t z[static 32], uint64_t x[static 16], uint64_t y[static 16], uint64_t t[static 32]);
+extern void bignum_kmul_16_32 (uint64_t z[S2N_BIGNUM_STATIC 32], const uint64_t x[S2N_BIGNUM_STATIC 16], const uint64_t y[S2N_BIGNUM_STATIC 16], uint64_t t[S2N_BIGNUM_STATIC 32]);
 // Multiply z := x * y
 // Inputs x[32], y[32]; output z[64]; temporary buffer t[>=96]
-extern void bignum_kmul_32_64 (uint64_t z[static 64], uint64_t x[static 32], uint64_t y[static 32], uint64_t t[static 96]);
+extern void bignum_kmul_32_64 (uint64_t z[S2N_BIGNUM_STATIC 64], const uint64_t x[S2N_BIGNUM_STATIC 32], const uint64_t y[S2N_BIGNUM_STATIC 32], uint64_t t[S2N_BIGNUM_STATIC 96]);
 // Square, z := x^2
 // Input x[16]; output z[32]; temporary buffer t[>=24]
-extern void bignum_ksqr_16_32 (uint64_t z[static 32], uint64_t x[static 16], uint64_t t[static 24]);
+extern void bignum_ksqr_16_32 (uint64_t z[S2N_BIGNUM_STATIC 32], const uint64_t x[S2N_BIGNUM_STATIC 16], uint64_t t[S2N_BIGNUM_STATIC 24]);
 // Square, z := x^2
 // Input x[32]; output z[64]; temporary buffer t[>=72]
-extern void bignum_ksqr_32_64 (uint64_t z[static 64], uint64_t x[static 32], uint64_t t[static 72]);
+extern void bignum_ksqr_32_64 (uint64_t z[S2N_BIGNUM_STATIC 64], const uint64_t x[S2N_BIGNUM_STATIC 32], uint64_t t[S2N_BIGNUM_STATIC 72]);
 // Compare bignums, x <= y
 // Inputs x[m], y[n]; output function return
-extern uint64_t bignum_le (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
+extern uint64_t bignum_le (uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y);
 // Convert 4-digit (256-bit) bignum to/from little-endian form
 // Input x[4]; output z[4]
-extern void bignum_littleendian_4 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_littleendian_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Convert 6-digit (384-bit) bignum to/from little-endian form
 // Input x[6]; output z[6]
-extern void bignum_littleendian_6 (uint64_t z[static 6], uint64_t x[static 6]);
+extern void bignum_littleendian_6 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Compare bignums, x < y
 // Inputs x[m], y[n]; output function return
-extern uint64_t bignum_lt (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
+extern uint64_t bignum_lt (uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y);
 // Multiply-add, z := z + x * y
 // Inputs x[m], y[n]; outputs function return (carry-out) and z[k]
-extern uint64_t bignum_madd (uint64_t k, uint64_t *z, uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
+extern uint64_t bignum_madd (uint64_t k, uint64_t *z, uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y);
+// Multiply-add modulo the order of the curve25519/edwards25519 basepoint
+// Inputs x[4], y[4], c[4]; output z[4]
+extern void bignum_madd_n25519 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4], const uint64_t c[S2N_BIGNUM_STATIC 4]);
+extern void bignum_madd_n25519_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4], const uint64_t c[S2N_BIGNUM_STATIC 4]);
+// Reduce modulo group order, z := x mod m_25519
+// Input x[4]; output z[4]
+extern void bignum_mod_m25519_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
+// Reduce modulo basepoint order, z := x mod n_25519
+// Input x[k]; output z[4]
+extern void bignum_mod_n25519 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t k, const uint64_t *x);
+// Reduce modulo basepoint order, z := x mod n_25519
+// Input x[4]; output z[4]
+extern void bignum_mod_n25519_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Reduce modulo group order, z := x mod n_256
 // Input x[k]; output z[4]
-extern void bignum_mod_n256 (uint64_t z[static 4], uint64_t k, uint64_t *x);
+extern void bignum_mod_n256 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t k, const uint64_t *x);
-extern void bignum_mod_n256_alt (uint64_t z[static 4], uint64_t k, uint64_t *x);
+extern void bignum_mod_n256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t k, const uint64_t *x);
 // Reduce modulo group order, z := x mod n_256
 // Input x[4]; output z[4]
-extern void bignum_mod_n256_4 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_mod_n256_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Reduce modulo group order, z := x mod n_256k1
 // Input x[4]; output z[4]
-extern void bignum_mod_n256k1_4 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_mod_n256k1_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Reduce modulo group order, z := x mod n_384
 // Input x[k]; output z[6]
-extern void bignum_mod_n384 (uint64_t z[static 6], uint64_t k, uint64_t *x);
+extern void bignum_mod_n384 (uint64_t z[S2N_BIGNUM_STATIC 6], uint64_t k, const uint64_t *x);
-extern void bignum_mod_n384_alt (uint64_t z[static 6], uint64_t k, uint64_t *x);
+extern void bignum_mod_n384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], uint64_t k, const uint64_t *x);
 // Reduce modulo group order, z := x mod n_384
 // Input x[6]; output z[6]
-extern void bignum_mod_n384_6 (uint64_t z[static 6], uint64_t x[static 6]);
+extern void bignum_mod_n384_6 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Reduce modulo group order, z := x mod n_521
 // Input x[9]; output z[9]
-extern void bignum_mod_n521_9 (uint64_t z[static 9], uint64_t x[static 9]);
+extern void bignum_mod_n521_9 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
-extern void bignum_mod_n521_9_alt (uint64_t z[static 9], uint64_t x[static 9]);
+extern void bignum_mod_n521_9_alt (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
+// Reduce modulo group order, z := x mod n_sm2
+// Input x[k]; output z[4]
+extern void bignum_mod_nsm2 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t k, const uint64_t *x);
+extern void bignum_mod_nsm2_alt (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t k, const uint64_t *x);
+// Reduce modulo group order, z := x mod n_sm2
+// Input x[4]; output z[4]
+extern void bignum_mod_nsm2_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Reduce modulo field characteristic, z := x mod p_25519
 // Input x[4]; output z[4]
-extern void bignum_mod_p25519_4 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_mod_p25519_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Reduce modulo field characteristic, z := x mod p_256
 // Input x[k]; output z[4]
-extern void bignum_mod_p256 (uint64_t z[static 4], uint64_t k, uint64_t *x);
+extern void bignum_mod_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t k, const uint64_t *x);
-extern void bignum_mod_p256_alt (uint64_t z[static 4], uint64_t k, uint64_t *x);
+extern void bignum_mod_p256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t k, const uint64_t *x);
 // Reduce modulo field characteristic, z := x mod p_256
 // Input x[4]; output z[4]
-extern void bignum_mod_p256_4 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_mod_p256_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Reduce modulo field characteristic, z := x mod p_256k1
 // Input x[4]; output z[4]
-extern void bignum_mod_p256k1_4 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_mod_p256k1_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Reduce modulo field characteristic, z := x mod p_384
 // Input x[k]; output z[6]
-extern void bignum_mod_p384 (uint64_t z[static 6], uint64_t k, uint64_t *x);
+extern void bignum_mod_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], uint64_t k, const uint64_t *x);
-extern void bignum_mod_p384_alt (uint64_t z[static 6], uint64_t k, uint64_t *x);
+extern void bignum_mod_p384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], uint64_t k, const uint64_t *x);
 // Reduce modulo field characteristic, z := x mod p_384
 // Input x[6]; output z[6]
-extern void bignum_mod_p384_6 (uint64_t z[static 6], uint64_t x[static 6]);
+extern void bignum_mod_p384_6 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Reduce modulo field characteristic, z := x mod p_521
 // Input x[9]; output z[9]
-extern void bignum_mod_p521_9 (uint64_t z[static 9], uint64_t x[static 9]);
+extern void bignum_mod_p521_9 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
+// Reduce modulo field characteristic, z := x mod p_sm2
+// Input x[k]; output z[4]
+extern void bignum_mod_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t k, const uint64_t *x);
+// Reduce modulo field characteristic, z := x mod p_sm2
+// Input x[4]; output z[4]
+extern void bignum_mod_sm2_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Add modulo m, z := (x + y) mod m, assuming x and y reduced
 // Inputs x[k], y[k], m[k]; output z[k]
-extern void bignum_modadd (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y, uint64_t *m);
+extern void bignum_modadd (uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *y, const uint64_t *m);
 // Double modulo m, z := (2 * x) mod m, assuming x reduced
 // Inputs x[k], m[k]; output z[k]
-extern void bignum_moddouble (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *m);
+extern void bignum_moddouble (uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *m);
+// Modular exponentiation for arbitrary odd modulus, z := (a^p) mod m
+// Inputs a[k], p[k], m[k]; output z[k]; temporary buffer t[>=3*k]
+extern void bignum_modexp(uint64_t k,uint64_t *z, const uint64_t *a,const uint64_t *p,const uint64_t *m,uint64_t *t);
 // Compute "modification" constant z := 2^{64k} mod m
 // Input m[k]; output z[k]; temporary buffer t[>=k]
-extern void bignum_modifier (uint64_t k, uint64_t *z, uint64_t *m, uint64_t *t);
+extern void bignum_modifier (uint64_t k, uint64_t *z, const uint64_t *m, uint64_t *t);
 // Invert modulo m, z = (1/a) mod b, assuming b is an odd number > 1, a coprime to b
 // Inputs a[k], b[k]; output z[k]; temporary buffer t[>=3*k]
-extern void bignum_modinv (uint64_t k, uint64_t *z, uint64_t *a, uint64_t *b, uint64_t *t);
+extern void bignum_modinv (uint64_t k, uint64_t *z, const uint64_t *a, const uint64_t *b, uint64_t *t);
 // Optionally negate modulo m, z := (-x) mod m (if p nonzero) or z := x (if p zero), assuming x reduced
 // Inputs p, x[k], m[k]; output z[k]
-extern void bignum_modoptneg (uint64_t k, uint64_t *z, uint64_t p, uint64_t *x, uint64_t *m);
+extern void bignum_modoptneg (uint64_t k, uint64_t *z, uint64_t p, const uint64_t *x, const uint64_t *m);
 // Subtract modulo m, z := (x - y) mod m, assuming x and y reduced
 // Inputs x[k], y[k], m[k]; output z[k]
-extern void bignum_modsub (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y, uint64_t *m);
+extern void bignum_modsub (uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *y, const uint64_t *m);
 // Compute "montification" constant z := 2^{128k} mod m
 // Input m[k]; output z[k]; temporary buffer t[>=k]
-extern void bignum_montifier (uint64_t k, uint64_t *z, uint64_t *m, uint64_t *t);
+extern void bignum_montifier (uint64_t k, uint64_t *z, const uint64_t *m, uint64_t *t);
+// Montgomery inverse modulo p_256 = 2^256 - 2^224 + 2^192 + 2^96 - 1
+// Input x[4]; output z[4]
+extern void bignum_montinv_p256(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]);
+// Montgomery inverse modulo p_384 = 2^384 - 2^128 - 2^96 + 2^32 - 1
+// Input x[6]; output z[6]
+extern void bignum_montinv_p384(uint64_t z[S2N_BIGNUM_STATIC 6],const uint64_t x[S2N_BIGNUM_STATIC 6]);
+// Montgomery inverse modulo p_sm2 = 2^256 - 2^224 - 2^96 + 2^64 - 1
+// Input x[4]; output z[4]
+extern void bignum_montinv_sm2(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Montgomery multiply, z := (x * y / 2^{64k}) mod m
 // Inputs x[k], y[k], m[k]; output z[k]
-extern void bignum_montmul (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y, uint64_t *m);
+extern void bignum_montmul (uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *y, const uint64_t *m);
 // Montgomery multiply, z := (x * y / 2^256) mod p_256
 // Inputs x[4], y[4]; output z[4]
-extern void bignum_montmul_p256 (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]);
+extern void bignum_montmul_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
-extern void bignum_montmul_p256_alt (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]);
+extern void bignum_montmul_p256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
 // Montgomery multiply, z := (x * y / 2^256) mod p_256k1
 // Inputs x[4], y[4]; output z[4]
-extern void bignum_montmul_p256k1 (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]);
+extern void bignum_montmul_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
-extern void bignum_montmul_p256k1_alt (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]);
+extern void bignum_montmul_p256k1_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
 // Montgomery multiply, z := (x * y / 2^384) mod p_384
 // Inputs x[6], y[6]; output z[6]
-extern void bignum_montmul_p384 (uint64_t z[static 6], uint64_t x[static 6], uint64_t y[static 6]);
+extern void bignum_montmul_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]);
-extern void bignum_montmul_p384_alt (uint64_t z[static 6], uint64_t x[static 6], uint64_t y[static 6]);
+extern void bignum_montmul_p384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]);
 // Montgomery multiply, z := (x * y / 2^576) mod p_521
 // Inputs x[9], y[9]; output z[9]
-extern void bignum_montmul_p521 (uint64_t z[static 9], uint64_t x[static 9], uint64_t y[static 9]);
+extern void bignum_montmul_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9], const uint64_t y[S2N_BIGNUM_STATIC 9]);
-extern void bignum_montmul_p521_alt (uint64_t z[static 9], uint64_t x[static 9], uint64_t y[static 9]);
+extern void bignum_montmul_p521_alt (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9], const uint64_t y[S2N_BIGNUM_STATIC 9]);
+// Montgomery multiply, z := (x * y / 2^256) mod p_sm2
+// Inputs x[4], y[4]; output z[4]
+extern void bignum_montmul_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
+extern void bignum_montmul_sm2_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
 // Montgomery reduce, z := (x' / 2^{64p}) MOD m
 // Inputs x[n], m[k], p; output z[k]
-extern void bignum_montredc (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x, uint64_t *m, uint64_t p);
+extern void bignum_montredc (uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x, const uint64_t *m, uint64_t p);
 // Montgomery square, z := (x^2 / 2^{64k}) mod m
 // Inputs x[k], m[k]; output z[k]
-extern void bignum_montsqr (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *m);
+extern void bignum_montsqr (uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *m);
 // Montgomery square, z := (x^2 / 2^256) mod p_256
 // Input x[4]; output z[4]
-extern void bignum_montsqr_p256 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_montsqr_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
-extern void bignum_montsqr_p256_alt (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_montsqr_p256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Montgomery square, z := (x^2 / 2^256) mod p_256k1
 // Input x[4]; output z[4]
-extern void bignum_montsqr_p256k1 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_montsqr_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
-extern void bignum_montsqr_p256k1_alt (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_montsqr_p256k1_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Montgomery square, z := (x^2 / 2^384) mod p_384
 // Input x[6]; output z[6]
-extern void bignum_montsqr_p384 (uint64_t z[static 6], uint64_t x[static 6]);
+extern void bignum_montsqr_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
-extern void bignum_montsqr_p384_alt (uint64_t z[static 6], uint64_t x[static 6]);
+extern void bignum_montsqr_p384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Montgomery square, z := (x^2 / 2^576) mod p_521
 // Input x[9]; output z[9]
-extern void bignum_montsqr_p521 (uint64_t z[static 9], uint64_t x[static 9]);
+extern void bignum_montsqr_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
-extern void bignum_montsqr_p521_alt (uint64_t z[static 9], uint64_t x[static 9]);
+extern void bignum_montsqr_p521_alt (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
+// Montgomery square, z := (x^2 / 2^256) mod p_sm2
+// Input x[4]; output z[4]
+extern void bignum_montsqr_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
+extern void bignum_montsqr_sm2_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Multiply z := x * y
 // Inputs x[m], y[n]; output z[k]
-extern void bignum_mul (uint64_t k, uint64_t *z, uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
+extern void bignum_mul (uint64_t k, uint64_t *z, uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y);
 // Multiply z := x * y
 // Inputs x[4], y[4]; output z[8]
-extern void bignum_mul_4_8 (uint64_t z[static 8], uint64_t x[static 4], uint64_t y[static 4]);
+extern void bignum_mul_4_8 (uint64_t z[S2N_BIGNUM_STATIC 8], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
-extern void bignum_mul_4_8_alt (uint64_t z[static 8], uint64_t x[static 4], uint64_t y[static 4]);
+extern void bignum_mul_4_8_alt (uint64_t z[S2N_BIGNUM_STATIC 8], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
 // Multiply z := x * y
 // Inputs x[6], y[6]; output z[12]
-extern void bignum_mul_6_12 (uint64_t z[static 12], uint64_t x[static 6], uint64_t y[static 6]);
+extern void bignum_mul_6_12 (uint64_t z[S2N_BIGNUM_STATIC 12], const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]);
-extern void bignum_mul_6_12_alt (uint64_t z[static 12], uint64_t x[static 6], uint64_t y[static 6]);
+extern void bignum_mul_6_12_alt (uint64_t z[S2N_BIGNUM_STATIC 12], const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]);
 // Multiply z := x * y
 // Inputs x[8], y[8]; output z[16]
-extern void bignum_mul_8_16 (uint64_t z[static 16], uint64_t x[static 8], uint64_t y[static 8]);
+extern void bignum_mul_8_16 (uint64_t z[S2N_BIGNUM_STATIC 16], const uint64_t x[S2N_BIGNUM_STATIC 8], const uint64_t y[S2N_BIGNUM_STATIC 8]);
-extern void bignum_mul_8_16_alt (uint64_t z[static 16], uint64_t x[static 8], uint64_t y[static 8]);
+extern void bignum_mul_8_16_alt (uint64_t z[S2N_BIGNUM_STATIC 16], const uint64_t x[S2N_BIGNUM_STATIC 8], const uint64_t y[S2N_BIGNUM_STATIC 8]);
 // Multiply modulo p_25519, z := (x * y) mod p_25519
 // Inputs x[4], y[4]; output z[4]
-extern void bignum_mul_p25519 (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]);
+extern void bignum_mul_p25519 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
-extern void bignum_mul_p25519_alt (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]);
+extern void bignum_mul_p25519_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
 // Multiply modulo p_256k1, z := (x * y) mod p_256k1
 // Inputs x[4], y[4]; output z[4]
-extern void bignum_mul_p256k1 (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]);
+extern void bignum_mul_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
-extern void bignum_mul_p256k1_alt (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]);
+extern void bignum_mul_p256k1_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
 // Multiply modulo p_521, z := (x * y) mod p_521, assuming x and y reduced
 // Inputs x[9], y[9]; output z[9]
-extern void bignum_mul_p521 (uint64_t z[static 9], uint64_t x[static 9], uint64_t y[static 9]);
+extern void bignum_mul_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9], const uint64_t y[S2N_BIGNUM_STATIC 9]);
-extern void bignum_mul_p521_alt (uint64_t z[static 9], uint64_t x[static 9], uint64_t y[static 9]);
+extern void bignum_mul_p521_alt (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9], const uint64_t y[S2N_BIGNUM_STATIC 9]);
 // Multiply bignum by 10 and add word: z := 10 * z + d
 // Inputs z[k], d; outputs function return (carry) and z[k]
@@ -512,55 +669,59 @@ extern uint64_t bignum_muladd10 (uint64_t k, uint64_t *z, uint64_t d);
 // Multiplex/select z := x (if p nonzero) or z := y (if p zero)
 // Inputs p, x[k], y[k]; output z[k]
-extern void bignum_mux (uint64_t p, uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y);
+extern void bignum_mux (uint64_t p, uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *y);
 // 256-bit multiplex/select z := x (if p nonzero) or z := y (if p zero)
 // Inputs p, x[4], y[4]; output z[4]
-extern void bignum_mux_4 (uint64_t p, uint64_t z[static 4],uint64_t x[static 4], uint64_t y[static 4]);
+extern void bignum_mux_4 (uint64_t p, uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
 // 384-bit multiplex/select z := x (if p nonzero) or z := y (if p zero)
 // Inputs p, x[6], y[6]; output z[6]
-extern void bignum_mux_6 (uint64_t p, uint64_t z[static 6],uint64_t x[static 6], uint64_t y[static 6]);
+extern void bignum_mux_6 (uint64_t p, uint64_t z[S2N_BIGNUM_STATIC 6],const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]);
 // Select element from 16-element table, z := xs[k*i]
 // Inputs xs[16*k], i; output z[k]
-extern void bignum_mux16 (uint64_t k, uint64_t *z, uint64_t *xs, uint64_t i);
+extern void bignum_mux16 (uint64_t k, uint64_t *z, const uint64_t *xs, uint64_t i);
 // Negate modulo p_25519, z := (-x) mod p_25519, assuming x reduced
 // Input x[4]; output z[4]
-extern void bignum_neg_p25519 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_neg_p25519 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Negate modulo p_256, z := (-x) mod p_256, assuming x reduced
 // Input x[4]; output z[4]
-extern void bignum_neg_p256 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_neg_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Negate modulo p_256k1, z := (-x) mod p_256k1, assuming x reduced
 // Input x[4]; output z[4]
-extern void bignum_neg_p256k1 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_neg_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Negate modulo p_384, z := (-x) mod p_384, assuming x reduced
 // Input x[6]; output z[6]
-extern void bignum_neg_p384 (uint64_t z[static 6], uint64_t x[static 6]);
+extern void bignum_neg_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Negate modulo p_521, z := (-x) mod p_521, assuming x reduced
 // Input x[9]; output z[9]
-extern void bignum_neg_p521 (uint64_t z[static 9], uint64_t x[static 9]);
+extern void bignum_neg_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
+// Negate modulo p_sm2, z := (-x) mod p_sm2, assuming x reduced
+// Input x[4]; output z[4]
+extern void bignum_neg_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Negated modular inverse, z := (-1/x) mod 2^{64k}
 // Input x[k]; output z[k]
-extern void bignum_negmodinv (uint64_t k, uint64_t *z, uint64_t *x);
+extern void bignum_negmodinv (uint64_t k, uint64_t *z, const uint64_t *x);
 // Test bignum for nonzero-ness x =/= 0
 // Input x[k]; output function return
-extern uint64_t bignum_nonzero (uint64_t k, uint64_t *x);
+extern uint64_t bignum_nonzero (uint64_t k, const uint64_t *x);
 // Test 256-bit bignum for nonzero-ness x =/= 0
 // Input x[4]; output function return
-extern uint64_t bignum_nonzero_4(uint64_t x[static 4]);
+extern uint64_t bignum_nonzero_4(const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Test 384-bit bignum for nonzero-ness x =/= 0
 // Input x[6]; output function return
-extern uint64_t bignum_nonzero_6(uint64_t x[static 6]);
+extern uint64_t bignum_nonzero_6(const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Normalize bignum in-place by shifting left till top bit is 1
 // Input z[k]; outputs function return (bits shifted left) and z[k]
@@ -568,7 +729,7 @@ extern uint64_t bignum_normalize (uint64_t k, uint64_t *z);
 // Test bignum for odd-ness
 // Input x[k]; output function return
-extern uint64_t bignum_odd (uint64_t k, uint64_t *x);
+extern uint64_t bignum_odd (uint64_t k, const uint64_t *x);
 // Convert single digit to bignum, z := n
 // Input n; output z[k]
@@ -576,39 +737,43 @@ extern void bignum_of_word (uint64_t k, uint64_t *z, uint64_t n);
 // Optionally add, z := x + y (if p nonzero) or z := x (if p zero)
 // Inputs x[k], p, y[k]; outputs function return (carry-out) and z[k]
-extern uint64_t bignum_optadd (uint64_t k, uint64_t *z, uint64_t *x, uint64_t p, uint64_t *y);
+extern uint64_t bignum_optadd (uint64_t k, uint64_t *z, const uint64_t *x, uint64_t p, const uint64_t *y);
 // Optionally negate, z := -x (if p nonzero) or z := x (if p zero)
 // Inputs p, x[k]; outputs function return (nonzero input) and z[k]
-extern uint64_t bignum_optneg (uint64_t k, uint64_t *z, uint64_t p, uint64_t *x);
+extern uint64_t bignum_optneg (uint64_t k, uint64_t *z, uint64_t p, const uint64_t *x);
 // Optionally negate modulo p_25519, z := (-x) mod p_25519 (if p nonzero) or z := x (if p zero), assuming x reduced
 // Inputs p, x[4]; output z[4]
-extern void bignum_optneg_p25519 (uint64_t z[static 4], uint64_t p, uint64_t x[static 4]);
+extern void bignum_optneg_p25519 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t p, const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Optionally negate modulo p_256, z := (-x) mod p_256 (if p nonzero) or z := x (if p zero), assuming x reduced
 // Inputs p, x[4]; output z[4]
-extern void bignum_optneg_p256 (uint64_t z[static 4], uint64_t p, uint64_t x[static 4]);
+extern void bignum_optneg_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t p, const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Optionally negate modulo p_256k1, z := (-x) mod p_256k1 (if p nonzero) or z := x (if p zero), assuming x reduced
 // Inputs p, x[4]; output z[4]
-extern void bignum_optneg_p256k1 (uint64_t z[static 4], uint64_t p, uint64_t x[static 4]);
+extern void bignum_optneg_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t p, const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Optionally negate modulo p_384, z := (-x) mod p_384 (if p nonzero) or z := x (if p zero), assuming x reduced
 // Inputs p, x[6]; output z[6]
-extern void bignum_optneg_p384 (uint64_t z[static 6], uint64_t p, uint64_t x[static 6]);
+extern void bignum_optneg_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], uint64_t p, const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Optionally negate modulo p_521, z := (-x) mod p_521 (if p nonzero) or z := x (if p zero), assuming x reduced
 // Inputs p, x[9]; output z[9]
-extern void bignum_optneg_p521 (uint64_t z[static 9], uint64_t p, uint64_t x[static 9]);
+extern void bignum_optneg_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], uint64_t p, const uint64_t x[S2N_BIGNUM_STATIC 9]);
+// Optionally negate modulo p_sm2, z := (-x) mod p_sm2 (if p nonzero) or z := x (if p zero), assuming x reduced
+// Inputs p, x[4]; output z[4]
+extern void bignum_optneg_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t p, const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Optionally subtract, z := x - y (if p nonzero) or z := x (if p zero)
 // Inputs x[k], p, y[k]; outputs function return (carry-out) and z[k]
-extern uint64_t bignum_optsub (uint64_t k, uint64_t *z, uint64_t *x, uint64_t p, uint64_t *y);
+extern uint64_t bignum_optsub (uint64_t k, uint64_t *z, const uint64_t *x, uint64_t p, const uint64_t *y);
 // Optionally subtract or add, z := x + sgn(p) * y interpreting p as signed
 // Inputs x[k], p, y[k]; outputs function return (carry-out) and z[k]
-extern uint64_t bignum_optsubadd (uint64_t k, uint64_t *z, uint64_t *x, uint64_t p, uint64_t *y);
+extern uint64_t bignum_optsubadd (uint64_t k, uint64_t *z, const uint64_t *x, uint64_t p, const uint64_t *y);
 // Return bignum of power of 2, z := 2^n
 // Input n; output z[k]
@@ -616,216 +781,376 @@ extern void bignum_pow2 (uint64_t k, uint64_t *z, uint64_t n);
 // Shift bignum left by c < 64 bits z := x * 2^c
 // Inputs x[n], c; outputs function return (carry-out) and z[k]
-extern uint64_t bignum_shl_small (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x, uint64_t c);
+extern uint64_t bignum_shl_small (uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x, uint64_t c);
 // Shift bignum right by c < 64 bits z := floor(x / 2^c)
 // Inputs x[n], c; outputs function return (bits shifted out) and z[k]
-extern uint64_t bignum_shr_small (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x, uint64_t c);
+extern uint64_t bignum_shr_small (uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x, uint64_t c);
 // Square, z := x^2
 // Input x[n]; output z[k]
-extern void bignum_sqr (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x);
+extern void bignum_sqr (uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x);
 // Square, z := x^2
 // Input x[4]; output z[8]
-extern void bignum_sqr_4_8 (uint64_t z[static 8], uint64_t x[static 4]);
+extern void bignum_sqr_4_8 (uint64_t z[S2N_BIGNUM_STATIC 8], const uint64_t x[S2N_BIGNUM_STATIC 4]);
-extern void bignum_sqr_4_8_alt (uint64_t z[static 8], uint64_t x[static 4]);
+extern void bignum_sqr_4_8_alt (uint64_t z[S2N_BIGNUM_STATIC 8], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Square, z := x^2
 // Input x[6]; output z[12]
-extern void bignum_sqr_6_12 (uint64_t z[static 12], uint64_t x[static 6]);
+extern void bignum_sqr_6_12 (uint64_t z[S2N_BIGNUM_STATIC 12], const uint64_t x[S2N_BIGNUM_STATIC 6]);
-extern void bignum_sqr_6_12_alt (uint64_t z[static 12], uint64_t x[static 6]);
+extern void bignum_sqr_6_12_alt (uint64_t z[S2N_BIGNUM_STATIC 12], const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Square, z := x^2
 // Input x[8]; output z[16]
-extern void bignum_sqr_8_16 (uint64_t z[static 16], uint64_t x[static 8]);
+extern void bignum_sqr_8_16 (uint64_t z[S2N_BIGNUM_STATIC 16], const uint64_t x[S2N_BIGNUM_STATIC 8]);
-extern void bignum_sqr_8_16_alt (uint64_t z[static 16], uint64_t x[static 8]);
+extern void bignum_sqr_8_16_alt (uint64_t z[S2N_BIGNUM_STATIC 16], const uint64_t x[S2N_BIGNUM_STATIC 8]);
 // Square modulo p_25519, z := (x^2) mod p_25519
 // Input x[4]; output z[4]
-extern void bignum_sqr_p25519 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_sqr_p25519 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
-extern void bignum_sqr_p25519_alt (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_sqr_p25519_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Square modulo p_256k1, z := (x^2) mod p_256k1
 // Input x[4]; output z[4]
-extern void bignum_sqr_p256k1 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_sqr_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
-extern void bignum_sqr_p256k1_alt (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_sqr_p256k1_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Square modulo p_521, z := (x^2) mod p_521, assuming x reduced
 // Input x[9]; output z[9]
-extern void bignum_sqr_p521 (uint64_t z[static 9], uint64_t x[static 9]);
+extern void bignum_sqr_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
-extern void bignum_sqr_p521_alt (uint64_t z[static 9], uint64_t x[static 9]);
+extern void bignum_sqr_p521_alt (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
+// Square root modulo p_25519
+// Input x[4]; output function return (Legendre symbol) and z[4]
+extern int64_t bignum_sqrt_p25519(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]);
+extern int64_t bignum_sqrt_p25519_alt(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Subtract, z := x - y
 // Inputs x[m], y[n]; outputs function return (carry-out) and z[p]
-extern uint64_t bignum_sub (uint64_t p, uint64_t *z, uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
+extern uint64_t bignum_sub (uint64_t p, uint64_t *z, uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y);
 // Subtract modulo p_25519, z := (x - y) mod p_25519, assuming x and y reduced
 // Inputs x[4], y[4]; output z[4]
-extern void bignum_sub_p25519 (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]);
+extern void bignum_sub_p25519 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
 // Subtract modulo p_256, z := (x - y) mod p_256, assuming x and y reduced
 // Inputs x[4], y[4]; output z[4]
-extern void bignum_sub_p256 (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]);
+extern void bignum_sub_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
 // Subtract modulo p_256k1, z := (x - y) mod p_256k1, assuming x and y reduced
 // Inputs x[4], y[4]; output z[4]
-extern void bignum_sub_p256k1 (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]);
+extern void bignum_sub_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
 // Subtract modulo p_384, z := (x - y) mod p_384, assuming x and y reduced
 // Inputs x[6], y[6]; output z[6]
-extern void bignum_sub_p384 (uint64_t z[static 6], uint64_t x[static 6], uint64_t y[static 6]);
+extern void bignum_sub_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]);
 // Subtract modulo p_521, z := (x - y) mod p_521, assuming x and y reduced
 // Inputs x[9], y[9]; output z[9]
-extern void bignum_sub_p521 (uint64_t z[static 9], uint64_t x[static 9], uint64_t y[static 9]);
+extern void bignum_sub_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9], const uint64_t y[S2N_BIGNUM_STATIC 9]);
+// Subtract modulo p_sm2, z := (x - y) mod p_sm2, assuming x and y reduced
+// Inputs x[4], y[4]; output z[4]
+extern void bignum_sub_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
 // Convert 4-digit (256-bit) bignum to big-endian bytes
 // Input x[4]; output z[32] (bytes)
-extern void bignum_tobebytes_4 (uint8_t z[static 32], uint64_t x[static 4]);
+extern void bignum_tobebytes_4 (uint8_t z[S2N_BIGNUM_STATIC 32], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Convert 6-digit (384-bit) bignum to big-endian bytes
 // Input x[6]; output z[48] (bytes)
-extern void bignum_tobebytes_6 (uint8_t z[static 48], uint64_t x[static 6]);
+extern void bignum_tobebytes_6 (uint8_t z[S2N_BIGNUM_STATIC 48], const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Convert 4-digit (256-bit) bignum to little-endian bytes
 // Input x[4]; output z[32] (bytes)
-extern void bignum_tolebytes_4 (uint8_t z[static 32], uint64_t x[static 4]);
+extern void bignum_tolebytes_4 (uint8_t z[S2N_BIGNUM_STATIC 32], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Convert 6-digit (384-bit) bignum to little-endian bytes
 // Input x[6]; output z[48] (bytes)
-extern void bignum_tolebytes_6 (uint8_t z[static 48], uint64_t x[static 6]);
+extern void bignum_tolebytes_6 (uint8_t z[S2N_BIGNUM_STATIC 48], const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Convert 9-digit 528-bit bignum to little-endian bytes
 // Input x[6]; output z[66] (bytes)
-extern void bignum_tolebytes_p521 (uint8_t z[static 66], uint64_t x[static 9]);
+extern void bignum_tolebytes_p521 (uint8_t z[S2N_BIGNUM_STATIC 66], const uint64_t x[S2N_BIGNUM_STATIC 9]);
 // Convert to Montgomery form z := (2^256 * x) mod p_256
 // Input x[4]; output z[4]
-extern void bignum_tomont_p256 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_tomont_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
-extern void bignum_tomont_p256_alt (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_tomont_p256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Convert to Montgomery form z := (2^256 * x) mod p_256k1
 // Input x[4]; output z[4]
-extern void bignum_tomont_p256k1 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_tomont_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
-extern void bignum_tomont_p256k1_alt (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_tomont_p256k1_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Convert to Montgomery form z := (2^384 * x) mod p_384
 // Input x[6]; output z[6]
-extern void bignum_tomont_p384 (uint64_t z[static 6], uint64_t x[static 6]);
+extern void bignum_tomont_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
-extern void bignum_tomont_p384_alt (uint64_t z[static 6], uint64_t x[static 6]);
+extern void bignum_tomont_p384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Convert to Montgomery form z := (2^576 * x) mod p_521
 // Input x[9]; output z[9]
-extern void bignum_tomont_p521 (uint64_t z[static 9], uint64_t x[static 9]);
+extern void bignum_tomont_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
+// Convert to Montgomery form z := (2^256 * x) mod p_sm2
+// Input x[4]; output z[4]
+extern void bignum_tomont_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Triple modulo p_256, z := (3 * x) mod p_256
 // Input x[4]; output z[4]
-extern void bignum_triple_p256 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_triple_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
-extern void bignum_triple_p256_alt (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_triple_p256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Triple modulo p_256k1, z := (3 * x) mod p_256k1
 // Input x[4]; output z[4]
-extern void bignum_triple_p256k1 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_triple_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
-extern void bignum_triple_p256k1_alt (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_triple_p256k1_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Triple modulo p_384, z := (3 * x) mod p_384
 // Input x[6]; output z[6]
-extern void bignum_triple_p384 (uint64_t z[static 6], uint64_t x[static 6]);
+extern void bignum_triple_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
-extern void bignum_triple_p384_alt (uint64_t z[static 6], uint64_t x[static 6]);
+extern void bignum_triple_p384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Triple modulo p_521, z := (3 * x) mod p_521, assuming x reduced
 // Input x[9]; output z[9]
-extern void bignum_triple_p521 (uint64_t z[static 9], uint64_t x[static 9]);
+extern void bignum_triple_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
-extern void bignum_triple_p521_alt (uint64_t z[static 9], uint64_t x[static 9]);
+extern void bignum_triple_p521_alt (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
+// Triple modulo p_sm2, z := (3 * x) mod p_sm2
+// Input x[4]; output z[4]
+extern void bignum_triple_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
+extern void bignum_triple_sm2_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Montgomery ladder step for curve25519
 // Inputs point[8], pp[16], b; output rr[16]
-extern void curve25519_ladderstep(uint64_t rr[16],uint64_t point[8],uint64_t pp[16],uint64_t b);
+extern void curve25519_ladderstep(uint64_t rr[16],const uint64_t point[8],const uint64_t pp[16],uint64_t b);
-extern void curve25519_ladderstep_alt(uint64_t rr[16],uint64_t point[8],uint64_t pp[16],uint64_t b);
+extern void curve25519_ladderstep_alt(uint64_t rr[16],const uint64_t point[8],const uint64_t pp[16],uint64_t b);
 // Projective scalar multiplication, x coordinate only, for curve25519
 // Inputs scalar[4], point[4]; output res[8]
-extern void curve25519_pxscalarmul(uint64_t res[static 8],uint64_t scalar[static 4],uint64_t point[static 4]);
+extern void curve25519_pxscalarmul(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 4]);
-extern void curve25519_pxscalarmul_alt(uint64_t res[static 8],uint64_t scalar[static 4],uint64_t point[static 4]);
+extern void curve25519_pxscalarmul_alt(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 4]);
 // x25519 function for curve25519
 // Inputs scalar[4], point[4]; output res[4]
-extern void curve25519_x25519(uint64_t res[static 4],uint64_t scalar[static 4],uint64_t point[static 4]);
+extern void curve25519_x25519(uint64_t res[S2N_BIGNUM_STATIC 4],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 4]);
-extern void curve25519_x25519_alt(uint64_t res[static 4],uint64_t scalar[static 4],uint64_t point[static 4]);
+extern void curve25519_x25519_alt(uint64_t res[S2N_BIGNUM_STATIC 4],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 4]);
+// x25519 function for curve25519 (byte array arguments)
+// Inputs scalar[32] (bytes), point[32] (bytes); output res[32] (bytes)
+extern void curve25519_x25519_byte(uint8_t res[S2N_BIGNUM_STATIC 32],const uint8_t scalar[S2N_BIGNUM_STATIC 32],const uint8_t point[S2N_BIGNUM_STATIC 32]);
+extern void curve25519_x25519_byte_alt(uint8_t res[S2N_BIGNUM_STATIC 32],const uint8_t scalar[S2N_BIGNUM_STATIC 32],const uint8_t point[S2N_BIGNUM_STATIC 32]);
 // x25519 function for curve25519 on base element 9
 // Input scalar[4]; output res[4]
-extern void curve25519_x25519base(uint64_t res[static 4],uint64_t scalar[static 4]);
+extern void curve25519_x25519base(uint64_t res[S2N_BIGNUM_STATIC 4],const uint64_t scalar[S2N_BIGNUM_STATIC 4]);
-extern void curve25519_x25519base_alt(uint64_t res[static 4],uint64_t scalar[static 4]);
+extern void curve25519_x25519base_alt(uint64_t res[S2N_BIGNUM_STATIC 4],const uint64_t scalar[S2N_BIGNUM_STATIC 4]);
+// x25519 function for curve25519 on base element 9 (byte array arguments)
+// Input scalar[32] (bytes); output res[32] (bytes)
+extern void curve25519_x25519base_byte(uint8_t res[S2N_BIGNUM_STATIC 32],const uint8_t scalar[S2N_BIGNUM_STATIC 32]);
+extern void curve25519_x25519base_byte_alt(uint8_t res[S2N_BIGNUM_STATIC 32],const uint8_t scalar[S2N_BIGNUM_STATIC 32]);
+// Decode compressed 256-bit form of edwards25519 point
+// Input c[32] (bytes); output function return and z[8]
+extern uint64_t edwards25519_decode(uint64_t z[S2N_BIGNUM_STATIC 8], const uint8_t c[S2N_BIGNUM_STATIC 32]);
+extern uint64_t edwards25519_decode_alt(uint64_t z[S2N_BIGNUM_STATIC 8], const uint8_t c[S2N_BIGNUM_STATIC 32]);
+// Encode edwards25519 point into compressed form as 256-bit number
+// Input p[8]; output z[32] (bytes)
+extern void edwards25519_encode(uint8_t z[S2N_BIGNUM_STATIC 32], const uint64_t p[S2N_BIGNUM_STATIC 8]);
 // Extended projective addition for edwards25519
 // Inputs p1[16], p2[16]; output p3[16]
-extern void edwards25519_epadd(uint64_t p3[static 16],uint64_t p1[static 16],uint64_t p2[static 16]);
+extern void edwards25519_epadd(uint64_t p3[S2N_BIGNUM_STATIC 16],const uint64_t p1[S2N_BIGNUM_STATIC 16],const uint64_t p2[S2N_BIGNUM_STATIC 16]);
-extern void edwards25519_epadd_alt(uint64_t p3[static 16],uint64_t p1[static 16],uint64_t p2[static 16]);
+extern void edwards25519_epadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 16],const uint64_t p1[S2N_BIGNUM_STATIC 16],const uint64_t p2[S2N_BIGNUM_STATIC 16]);
 // Extended projective doubling for edwards25519
 // Inputs p1[12]; output p3[16]
-extern void edwards25519_epdouble(uint64_t p3[static 16],uint64_t p1[static 12]);
+extern void edwards25519_epdouble(uint64_t p3[S2N_BIGNUM_STATIC 16],const uint64_t p1[S2N_BIGNUM_STATIC 12]);
-extern void edwards25519_epdouble_alt(uint64_t p3[static 16],uint64_t p1[static 12]);
+extern void edwards25519_epdouble_alt(uint64_t p3[S2N_BIGNUM_STATIC 16],const uint64_t p1[S2N_BIGNUM_STATIC 12]);
 // Projective doubling for edwards25519
 // Inputs p1[12]; output p3[12]
-extern void edwards25519_pdouble(uint64_t p3[static 12],uint64_t p1[static 12]);
+extern void edwards25519_pdouble(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12]);
-extern void edwards25519_pdouble_alt(uint64_t p3[static 12],uint64_t p1[static 12]);
+extern void edwards25519_pdouble_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12]);
 // Extended projective + precomputed mixed addition for edwards25519
 // Inputs p1[16], p2[12]; output p3[16]
-extern void edwards25519_pepadd(uint64_t p3[static 16],uint64_t p1[static 16],uint64_t p2[static 12]);
+extern void edwards25519_pepadd(uint64_t p3[S2N_BIGNUM_STATIC 16],const uint64_t p1[S2N_BIGNUM_STATIC 16],const uint64_t p2[S2N_BIGNUM_STATIC 12]);
-extern void edwards25519_pepadd_alt(uint64_t p3[static 16],uint64_t p1[static 16],uint64_t p2[static 12]);
+extern void edwards25519_pepadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 16],const uint64_t p1[S2N_BIGNUM_STATIC 16],const uint64_t p2[S2N_BIGNUM_STATIC 12]);
+// Scalar multiplication by standard basepoint for edwards25519 (Ed25519)
+// Input scalar[4]; output res[8]
+extern void edwards25519_scalarmulbase(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4]);
+extern void edwards25519_scalarmulbase_alt(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4]);
+// Double scalar multiplication for edwards25519, fresh and base point
+// Inputs scalar[4], point[8], bscalar[4]; output res[8]
+extern void edwards25519_scalarmuldouble(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4], const uint64_t point[S2N_BIGNUM_STATIC 8],const uint64_t bscalar[S2N_BIGNUM_STATIC 4]);
+extern void edwards25519_scalarmuldouble_alt(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4], const uint64_t point[S2N_BIGNUM_STATIC 8],const uint64_t bscalar[S2N_BIGNUM_STATIC 4]);
+// Scalar product of 2-element polynomial vectors in NTT domain, with mulcache
+// Inputs a[512], b[512], bt[256] (signed 16-bit words); output r[256] (signed 16-bit words)
+extern void mlkem_basemul_k2(int16_t r[S2N_BIGNUM_STATIC 256],const int16_t a[S2N_BIGNUM_STATIC 512],const int16_t b[S2N_BIGNUM_STATIC 512],const int16_t bt[S2N_BIGNUM_STATIC 256]);
+// Scalar product of 3-element polynomial vectors in NTT domain, with mulcache
+// Inputs a[768], b[768], bt[384] (signed 16-bit words); output r[256] (signed 16-bit words)
+extern void mlkem_basemul_k3(int16_t r[S2N_BIGNUM_STATIC 256],const int16_t a[S2N_BIGNUM_STATIC 768],const int16_t b[S2N_BIGNUM_STATIC 768],const int16_t bt[S2N_BIGNUM_STATIC 384]);
+// Scalar product of 4-element polynomial vectors in NTT domain, with mulcache
+// Inputs a[1024], b[1024], bt[512] (signed 16-bit words); output r[256] (signed 16-bit words)
+extern void mlkem_basemul_k4(int16_t r[S2N_BIGNUM_STATIC 256],const int16_t a[S2N_BIGNUM_STATIC 1024],const int16_t b[S2N_BIGNUM_STATIC 1024],const int16_t bt[S2N_BIGNUM_STATIC 512]);
+// Inverse number-theoretic transform from ML-KEM
+// Inputs a[256] (signed 16-bit words), z_01234[80] (signed 16-bit words), z_56[384] (signed 16-bit words); output a[256] (signed 16-bit words)
+extern void mlkem_intt(int16_t a[S2N_BIGNUM_STATIC 256],const int16_t z_01234[S2N_BIGNUM_STATIC 80],const int16_t z_56[S2N_BIGNUM_STATIC 384]);
+// Precompute the mulcache data for a polynomial in the NTT domain
+// Inputs a[256], z[128] and t[128] (signed 16-bit words); output x[128] (signed 16-bit words)
+extern void mlkem_mulcache_compute(int16_t x[S2N_BIGNUM_STATIC 128],const int16_t a[S2N_BIGNUM_STATIC 256],const int16_t z[S2N_BIGNUM_STATIC 128],const int16_t t[S2N_BIGNUM_STATIC 128]);
+// Forward number-theoretic transform from ML-KEM
+// Inputs a[256] (signed 16-bit words), z_01234[80] (signed 16-bit words), z_56[384] (signed 16-bit words); output a[256] (signed 16-bit words)
+extern void mlkem_ntt(int16_t a[S2N_BIGNUM_STATIC 256],const int16_t z_01234[S2N_BIGNUM_STATIC 80],const int16_t z_56[S2N_BIGNUM_STATIC 384]);
+// Canonical modular reduction of polynomial coefficients for ML-KEM
+// Input a[256] (signed 16-bit words); output a[256] (signed 16-bit words)
+extern void mlkem_reduce(int16_t a[S2N_BIGNUM_STATIC 256]);
+// Pack ML-KEM polynomial coefficients as 12-bit numbers
+// Input a[256] (signed 16-bit words); output r[384] (bytes)
+extern void mlkem_tobytes(uint8_t r[S2N_BIGNUM_STATIC 384],const int16_t a[S2N_BIGNUM_STATIC 256]);
+// Conversion of ML-KEM polynomial coefficients to Montgomery form
+// Input a[256] (signed 16-bit words); output a[256] (signed 16-bit words)
+extern void mlkem_tomont(int16_t a[S2N_BIGNUM_STATIC 256]);
+// Uniform rejection sampling for ML-KEM
+// Inputs buf[buflen] (unsigned bytes), buflen, table (unsigned bytes); outputs function return and r[256] (signed 16-bit words)
+extern uint64_t mlkem_rej_uniform_VARIABLE_TIME(int16_t r[S2N_BIGNUM_STATIC 256],const uint8_t *buf,uint64_t buflen,const uint8_t *table);
 // Point addition on NIST curve P-256 in Montgomery-Jacobian coordinates
 // Inputs p1[12], p2[12]; output p3[12]
-extern void p256_montjadd(uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 12]);
+extern void p256_montjadd(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 12]);
+extern void p256_montjadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 12]);
 // Point doubling on NIST curve P-256 in Montgomery-Jacobian coordinates
 // Inputs p1[12]; output p3[12]
-extern void p256_montjdouble(uint64_t p3[static 12],uint64_t p1[static 12]);
+extern void p256_montjdouble(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12]);
+extern void p256_montjdouble_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12]);
 // Point mixed addition on NIST curve P-256 in Montgomery-Jacobian coordinates
 // Inputs p1[12], p2[8]; output p3[12]
-extern void p256_montjmixadd(uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 8]);
+extern void p256_montjmixadd(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 8]);
+extern void p256_montjmixadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 8]);
+// Montgomery-Jacobian form scalar multiplication for P-256
+// Inputs scalar[4], point[12]; output res[12]
+extern void p256_montjscalarmul(uint64_t res[S2N_BIGNUM_STATIC 12],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 12]);
+extern void p256_montjscalarmul_alt(uint64_t res[S2N_BIGNUM_STATIC 12],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 12]);
+// Scalar multiplication for NIST curve P-256
+// Inputs scalar[4], point[8]; output res[8]
+extern void p256_scalarmul(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 8]);
+extern void p256_scalarmul_alt(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 8]);
+// Scalar multiplication for precomputed point on NIST curve P-256
+// Inputs scalar[4], blocksize, table[]; output res[8]
+extern void p256_scalarmulbase(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4],uint64_t blocksize,const uint64_t *table);
+extern void p256_scalarmulbase_alt(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4],uint64_t blocksize,const uint64_t *table);
 // Point addition on NIST curve P-384 in Montgomery-Jacobian coordinates
 // Inputs p1[18], p2[18]; output p3[18]
-extern void p384_montjadd(uint64_t p3[static 18],uint64_t p1[static 18],uint64_t p2[static 18]);
+extern void p384_montjadd(uint64_t p3[S2N_BIGNUM_STATIC 18],const uint64_t p1[S2N_BIGNUM_STATIC 18],const uint64_t p2[S2N_BIGNUM_STATIC 18]);
+extern void p384_montjadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 18],const uint64_t p1[S2N_BIGNUM_STATIC 18],const uint64_t p2[S2N_BIGNUM_STATIC 18]);
 // Point doubling on NIST curve P-384 in Montgomery-Jacobian coordinates
 // Inputs p1[18]; output p3[18]
-extern void p384_montjdouble(uint64_t p3[static 18],uint64_t p1[static 18]);
+extern void p384_montjdouble(uint64_t p3[S2N_BIGNUM_STATIC 18],const uint64_t p1[S2N_BIGNUM_STATIC 18]);
+extern void p384_montjdouble_alt(uint64_t p3[S2N_BIGNUM_STATIC 18],const uint64_t p1[S2N_BIGNUM_STATIC 18]);
 // Point mixed addition on NIST curve P-384 in Montgomery-Jacobian coordinates
 // Inputs p1[18], p2[12]; output p3[18]
-extern void p384_montjmixadd(uint64_t p3[static 18],uint64_t p1[static 18],uint64_t p2[static 12]);
+extern void p384_montjmixadd(uint64_t p3[S2N_BIGNUM_STATIC 18],const uint64_t p1[S2N_BIGNUM_STATIC 18],const uint64_t p2[S2N_BIGNUM_STATIC 12]);
+extern void p384_montjmixadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 18],const uint64_t p1[S2N_BIGNUM_STATIC 18],const uint64_t p2[S2N_BIGNUM_STATIC 12]);
+// Montgomery-Jacobian form scalar multiplication for P-384
+// Inputs scalar[6], point[18]; output res[18]
+extern void p384_montjscalarmul(uint64_t res[S2N_BIGNUM_STATIC 18],const uint64_t scalar[S2N_BIGNUM_STATIC 6],const uint64_t point[S2N_BIGNUM_STATIC 18]);
+extern void p384_montjscalarmul_alt(uint64_t res[S2N_BIGNUM_STATIC 18],const uint64_t scalar[S2N_BIGNUM_STATIC 6],const uint64_t point[S2N_BIGNUM_STATIC 18]);
 // Point addition on NIST curve P-521 in Jacobian coordinates
 // Inputs p1[27], p2[27]; output p3[27]
-extern void p521_jadd(uint64_t p3[static 27],uint64_t p1[static 27],uint64_t p2[static 27]);
+extern void p521_jadd(uint64_t p3[S2N_BIGNUM_STATIC 27],const uint64_t p1[S2N_BIGNUM_STATIC 27],const uint64_t p2[S2N_BIGNUM_STATIC 27]);
+extern void p521_jadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 27],const uint64_t p1[S2N_BIGNUM_STATIC 27],const uint64_t p2[S2N_BIGNUM_STATIC 27]);
 // Point doubling on NIST curve P-521 in Jacobian coordinates
 // Input p1[27]; output p3[27]
-extern void p521_jdouble(uint64_t p3[static 27],uint64_t p1[static 27]);
+extern void p521_jdouble(uint64_t p3[S2N_BIGNUM_STATIC 27],const uint64_t p1[S2N_BIGNUM_STATIC 27]);
+extern void p521_jdouble_alt(uint64_t p3[S2N_BIGNUM_STATIC 27],const uint64_t p1[S2N_BIGNUM_STATIC 27]);
 // Point mixed addition on NIST curve P-521 in Jacobian coordinates
 // Inputs p1[27], p2[18]; output p3[27]
-extern void p521_jmixadd(uint64_t p3[static 27],uint64_t p1[static 27],uint64_t p2[static 18]);
+extern void p521_jmixadd(uint64_t p3[S2N_BIGNUM_STATIC 27],const uint64_t p1[S2N_BIGNUM_STATIC 27],const uint64_t p2[S2N_BIGNUM_STATIC 18]);
+extern void p521_jmixadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 27],const uint64_t p1[S2N_BIGNUM_STATIC 27],const uint64_t p2[S2N_BIGNUM_STATIC 18]);
+// Jacobian form scalar multiplication for P-521
+// Inputs scalar[9], point[27]; output res[27]
+extern void p521_jscalarmul(uint64_t res[S2N_BIGNUM_STATIC 27],const uint64_t scalar[S2N_BIGNUM_STATIC 9],const uint64_t point[S2N_BIGNUM_STATIC 27]);
+extern void p521_jscalarmul_alt(uint64_t res[S2N_BIGNUM_STATIC 27],const uint64_t scalar[S2N_BIGNUM_STATIC 9],const uint64_t point[S2N_BIGNUM_STATIC 27]);
 // Point addition on SECG curve secp256k1 in Jacobian coordinates
 // Inputs p1[12], p2[12]; output p3[12]
-extern void secp256k1_jadd(uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 12]);
+extern void secp256k1_jadd(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 12]);
+extern void secp256k1_jadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 12]);
 // Point doubling on SECG curve secp256k1 in Jacobian coordinates
 // Input p1[12]; output p3[12]
-extern void secp256k1_jdouble(uint64_t p3[static 12],uint64_t p1[static 12]);
+extern void secp256k1_jdouble(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12]);
+extern void secp256k1_jdouble_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12]);
 // Point mixed addition on SECG curve secp256k1 in Jacobian coordinates
 // Inputs p1[12], p2[8]; output p3[12]
-extern void secp256k1_jmixadd(uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 8]);
+extern void secp256k1_jmixadd(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 8]);
+extern void secp256k1_jmixadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 8]);
+// Keccak-f1600 permutation for SHA3
+// Inputs a[25], rc[24]; output a[25]
+extern void sha3_keccak_f1600(uint64_t a[S2N_BIGNUM_STATIC 25],const uint64_t rc[S2N_BIGNUM_STATIC 24]);
+extern void sha3_keccak_f1600_alt(uint64_t a[S2N_BIGNUM_STATIC 25],const uint64_t rc[S2N_BIGNUM_STATIC 24]);
+// Batched 2-way Keccak-f1600 permutation for SHA3
+// Inputs a[50], rc[24]; output a[50]
+extern void sha3_keccak2_f1600(uint64_t a[S2N_BIGNUM_STATIC 50],const uint64_t rc[S2N_BIGNUM_STATIC 24]);
+extern void sha3_keccak2_f1600_alt(uint64_t a[S2N_BIGNUM_STATIC 50],const uint64_t rc[S2N_BIGNUM_STATIC 24]);
+// Batched 4-way Keccak-f1600 permutation for SHA3
+// Inputs a[100], rc[24]; output a[100]
+extern void sha3_keccak4_f1600(uint64_t a[S2N_BIGNUM_STATIC 100],const uint64_t rc[S2N_BIGNUM_STATIC 24]);
+extern void sha3_keccak4_f1600_alt(uint64_t a[S2N_BIGNUM_STATIC 100],const uint64_t rc[S2N_BIGNUM_STATIC 24]);
+extern void sha3_keccak4_f1600_alt2(uint64_t a[S2N_BIGNUM_STATIC 100],const uint64_t rc[S2N_BIGNUM_STATIC 24]);
+// Point addition on CC curve SM2 in Montgomery-Jacobian coordinates
+// Inputs p1[12], p2[12]; output p3[12]
+extern void sm2_montjadd(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 12]);
+extern void sm2_montjadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 12]);
+// Point doubling on CC curve SM2 in Montgomery-Jacobian coordinates
+// Input p1[12]; output p3[12]
+extern void sm2_montjdouble(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12]);
+extern void sm2_montjdouble_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12]);
+// Point mixed addition on CC curve SM2 in Montgomery-Jacobian coordinates
+// Inputs p1[12], p2[8]; output p3[12]
+extern void sm2_montjmixadd(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 8]);
+extern void sm2_montjmixadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 8]);
+// Montgomery-Jacobian form scalar multiplication for CC curve SM2
+// Inputs scalar[4], point[12]; output res[12]
+extern void sm2_montjscalarmul(uint64_t res[S2N_BIGNUM_STATIC 12],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 12]);
+extern void sm2_montjscalarmul_alt(uint64_t res[S2N_BIGNUM_STATIC 12],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 12]);
 // Reverse the bytes in a single word
 // Input a; output function return
@@ -839,6 +1164,10 @@ extern uint64_t word_clz (uint64_t a);
 // Input a; output function return
 extern uint64_t word_ctz (uint64_t a);
+// Perform 59 "divstep" iterations and return signed matrix of updates
+// Inputs d, f, g; output m[2][2] and function return
+extern int64_t word_divstep59(int64_t m[2][2],int64_t d,uint64_t f,uint64_t g);
 // Return maximum of two unsigned 64-bit words
 // Inputs a, b; output function return
 extern uint64_t word_max (uint64_t a, uint64_t b);
@@ -851,6 +1180,10 @@ extern uint64_t word_min (uint64_t a, uint64_t b);
 // Input a; output function return
 extern uint64_t word_negmodinv (uint64_t a);
+// Count number of set bits in a single 64-bit word (population count)
+// Input a; output function return
+extern uint64_t word_popcount (uint64_t a);
 // Single-word reciprocal, 2^64 + ret = ceil(2^128/a) - 1 if MSB of "a" is set
 // Input a; output function return
 extern uint64_t word_recip (uint64_t a);
diff --git a/src/lib/libcrypto/bn/s2n_bignum_internal.h b/src/lib/libcrypto/bn/s2n_bignum_internal.h
index b82db7d019..37eebb4fd6 100644
--- a/src/lib/libcrypto/bn/s2n_bignum_internal.h
+++ b/src/lib/libcrypto/bn/s2n_bignum_internal.h
@@ -1,3 +1,5 @@
+// $OpenBSD: s2n_bignum_internal.h,v 1.5 2025/08/12 10:01:37 jsing Exp $
+//
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
@@ -14,14 +16,14 @@
 #ifdef __APPLE__
 #   define S2N_BN_SYMBOL(NAME) _##NAME
+#   if defined(__AARCH64EL__) || defined(__ARMEL__)
+#     define __LF %%
+#   else
+#     define __LF ;
+#   endif
 #else
 #   define S2N_BN_SYMBOL(name) name
-#endif
+#   define __LF ;
-#ifdef __CET__
-#   include <cet.h>
-#else
-#   define _CET_ENDBR
 #endif
 #define S2N_BN_SYM_VISIBILITY_DIRECTIVE(name) .globl S2N_BN_SYMBOL(name)
@@ -34,3 +36,24 @@
 #else
 #   define S2N_BN_SYM_PRIVACY_DIRECTIVE(name)  /* NO-OP: S2N_BN_SYM_PRIVACY_DIRECTIVE */
 #endif
+// Enable indirect branch tracking support unless explicitly disabled
+// with -DNO_IBT. If CET is enabled (__CET__ defined), inherit _CET_ENDBR
+// from <cet.h>. Otherwise manually define _CET_ENDBR, used at each
+// x86 entry point, to be the ENDBR64 instruction, with an explicit byte
+// sequence for compilers/assemblers that don't know about it. Note that
+// it is safe to use ENDBR64 on all platforms, since the encoding is by
+// design interpreted as a NOP on all pre-CET x86_64 processors. The only
+// downside is a small increase in code size and potentially a modest
+// slowdown from executing one more instruction.
+#if NO_IBT
+#   if defined(_CET_ENDBR)
+#     error "The s2n-bignum build option NO_IBT was configured, but _CET_ENDBR is defined in this compilation unit. That is weird, so failing the build."
+#   endif
+#   define _CET_ENDBR
+#elif defined(__CET__)
+#   include <cet.h>
+#elif !defined(_CET_ENDBR)
+#   define _CET_ENDBR .byte 0xf3,0x0f,0x1e,0xfa
+#endif