23 files changed, 2231 insertions, 138 deletions
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_add.S b/src/lib/libcrypto/bn/arch/amd64/bignum_add.S
index 5fe4aae7a1..1d4e6d08ef 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_add.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_add.S
@@ -1,3 +1,5 @@
+// $OpenBSD: bignum_add.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
+//
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
@@ -16,9 +18,8 @@
 // Add, z := x + y
 // Inputs x[m], y[n]; outputs function return (carry-out) and z[p]
 //
-//    extern uint64_t bignum_add
+//    extern uint64_t bignum_add(uint64_t p, uint64_t *z, uint64_t m,
-//     (uint64_t p, uint64_t *z,
+//                               const uint64_t *x, uint64_t n, const uint64_t *y);
-//      uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
 //
 // Does the z := x + y operation, truncating modulo p words in general and
 // returning a top carry (0 or 1) in the p'th place, only adding the input
@@ -49,7 +50,7 @@
 S2N_BN_SYMBOL(bignum_add):
-        _CET_ENDBR
+        _CET_ENDBR
 #if WINDOWS_ABI
        push    rdi
@@ -75,7 +76,7 @@ S2N_BN_SYMBOL(bignum_add):
        cmp     p, n
        cmovc   n, p
        cmp     m, n
-        jc      ylonger
+        jc      bignum_add_ylonger
 // The case where x is longer or of the same size (p >= m >= n)
@@ -83,27 +84,27 @@ S2N_BN_SYMBOL(bignum_add):
        sub     m, n
        inc     m
        test    n, n
-        jz      xtest
+        jz      bignum_add_xtest
-xmainloop:
+bignum_add_xmainloop:
        mov     a, [x+8*i]
        adc     a, [y+8*i]
        mov     [z+8*i],a
        inc     i
        dec     n
-        jnz     xmainloop
+        jnz     bignum_add_xmainloop
-        jmp     xtest
+        jmp     bignum_add_xtest
-xtoploop:
+bignum_add_xtoploop:
        mov     a, [x+8*i]
        adc     a, 0
        mov     [z+8*i],a
        inc     i
-xtest:
+bignum_add_xtest:
        dec     m
-        jnz     xtoploop
+        jnz     bignum_add_xtoploop
        mov     ashort, 0
        adc     a, 0
        test    p, p
-        jnz     tails
+        jnz     bignum_add_tails
 #if WINDOWS_ABI
        pop    rsi
        pop    rdi
@@ -112,30 +113,30 @@ xtest:
 // The case where y is longer (p >= n > m)
-ylonger:
+bignum_add_ylonger:
        sub     p, n
        sub     n, m
        test    m, m
-        jz      ytoploop
+        jz      bignum_add_ytoploop
-ymainloop:
+bignum_add_ymainloop:
        mov     a, [x+8*i]
        adc     a, [y+8*i]
        mov     [z+8*i],a
        inc     i
        dec     m
-        jnz     ymainloop
+        jnz     bignum_add_ymainloop
-ytoploop:
+bignum_add_ytoploop:
        mov     a, [y+8*i]
        adc     a, 0
        mov     [z+8*i],a
        inc     i
        dec     n
-        jnz     ytoploop
+        jnz     bignum_add_ytoploop
        mov     ashort, 0
        adc     a, 0
        test    p, p
-        jnz     tails
+        jnz     bignum_add_tails
 #if WINDOWS_ABI
        pop    rsi
        pop    rdi
@@ -144,16 +145,16 @@ ytoploop:
 // Adding a non-trivial tail, when p > max(m,n)
-tails:
+bignum_add_tails:
        mov     [z+8*i],a
        xor     a, a
-        jmp     tail
+        jmp     bignum_add_tail
-tailloop:
+bignum_add_tailloop:
        mov     [z+8*i],a
-tail:
+bignum_add_tail:
        inc     i
        dec     p
-        jnz     tailloop
+        jnz     bignum_add_tailloop
 #if WINDOWS_ABI
        pop    rsi
        pop    rdi
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_cmadd.S b/src/lib/libcrypto/bn/arch/amd64/bignum_cmadd.S
index 25ba17bce2..a611919603 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_cmadd.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_cmadd.S
@@ -1,3 +1,5 @@
+// $OpenBSD: bignum_cmadd.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
+//
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
@@ -16,8 +18,8 @@
 // Multiply-add with single-word multiplier, z := z + c * y
 // Inputs c, y[n]; outputs function return (carry-out) and z[k]
 //
-//    extern uint64_t bignum_cmadd
+//    extern uint64_t bignum_cmadd(uint64_t k, uint64_t *z, uint64_t c, uint64_t n,
-//     (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y);
+//                                 const uint64_t *y);
 //
 // Does the "z := z + c * y" operation where y is n digits, result z is p.
 // Truncates the result in general.
@@ -54,7 +56,7 @@
 S2N_BN_SYMBOL(bignum_cmadd):
-        _CET_ENDBR
+        _CET_ENDBR
 #if WINDOWS_ABI
        push    rdi
@@ -82,7 +84,7 @@ S2N_BN_SYMBOL(bignum_cmadd):
        xor     h, h
        test    n, n
-        jz      end
+        jz      bignum_cmadd_end
 // Move c into a safer register as multiplies overwrite rdx
@@ -96,11 +98,11 @@ S2N_BN_SYMBOL(bignum_cmadd):
        mov     h, rdx
        mov     ishort, 1
        dec     n
-        jz      hightail
+        jz      bignum_cmadd_hightail
 // Main loop, where we always have CF + previous high part h to add in
-loop:
+bignum_cmadd_loop:
        adc     h, [z+8*i]
        sbb     r, r
        mov     rax, [x+8*i]
@@ -111,36 +113,36 @@ loop:
        mov     h, rdx
        inc     i
        dec     n
-        jnz     loop
+        jnz     bignum_cmadd_loop
-hightail:
+bignum_cmadd_hightail:
        adc     h, 0
 // Propagate the carry all the way to the end with h as extra carry word
-tail:
+bignum_cmadd_tail:
        test    p, p
-        jz      end
+        jz      bignum_cmadd_end
        add     [z+8*i], h
        mov     hshort, 0
        inc     i
        dec     p
-        jz      highend
+        jz      bignum_cmadd_highend
-tloop:
+bignum_cmadd_tloop:
        adc     [z+8*i], h
        inc     i
        dec     p
-        jnz     tloop
+        jnz     bignum_cmadd_tloop
-highend:
+bignum_cmadd_highend:
        adc     h, 0
 // Return the high/carry word
-end:
+bignum_cmadd_end:
        mov     rax, h
        pop     rbx
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_cmul.S b/src/lib/libcrypto/bn/arch/amd64/bignum_cmul.S
index 12f785d63a..eb71d9da44 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_cmul.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_cmul.S
@@ -1,3 +1,5 @@
+// $OpenBSD: bignum_cmul.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
+//
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
@@ -16,8 +18,8 @@
 // Multiply by a single word, z := c * y
 // Inputs c, y[n]; outputs function return (carry-out) and z[k]
 //
-//    extern uint64_t bignum_cmul
+//    extern uint64_t bignum_cmul(uint64_t k, uint64_t *z, uint64_t c, uint64_t n,
-//     (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y);
+//                                const uint64_t *y);
 //
 // Does the "z := c * y" operation where y is n digits, result z is p.
 // Truncates the result in general unless p >= n + 1.
@@ -51,7 +53,7 @@
 S2N_BN_SYMBOL(bignum_cmul):
-        _CET_ENDBR
+        _CET_ENDBR
 #if WINDOWS_ABI
        push    rdi
@@ -76,7 +78,7 @@ S2N_BN_SYMBOL(bignum_cmul):
        xor     h, h
        xor     i, i
        test    n, n
-        jz      tail
+        jz      bignum_cmul_tail
 // Move c into a safer register as multiplies overwrite rdx
@@ -90,11 +92,11 @@ S2N_BN_SYMBOL(bignum_cmul):
        mov     h, rdx
        inc     i
        cmp     i, n
-        jz      tail
+        jz      bignum_cmul_tail
 // Main loop doing the multiplications
-loop:
+bignum_cmul_loop:
        mov     rax, [x+8*i]
        mul     c
        add     rax, h
@@ -103,28 +105,28 @@ loop:
        mov     h, rdx
        inc     i
        cmp     i, n
-        jc      loop
+        jc      bignum_cmul_loop
 // Add a tail when the destination is longer
-tail:
+bignum_cmul_tail:
        cmp     i, p
-        jnc     end
+        jnc     bignum_cmul_end
        mov     [z+8*i], h
        xor     h, h
        inc     i
        cmp     i, p
-        jnc     end
+        jnc     bignum_cmul_end
-tloop:
+bignum_cmul_tloop:
        mov     [z+8*i], h
        inc     i
        cmp     i, p
-        jc      tloop
+        jc      bignum_cmul_tloop
 // Return the high/carry word
-end:
+bignum_cmul_end:
        mov     rax, h
 #if WINDOWS_ABI
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_modadd.S b/src/lib/libcrypto/bn/arch/amd64/bignum_modadd.S
new file mode 100644
index 0000000000..baf27fdc7f
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_modadd.S
@@ -0,0 +1,112 @@
+// $OpenBSD: bignum_modadd.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
+//
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+// ----------------------------------------------------------------------------
+// Add modulo m, z := (x + y) mod m, assuming x and y reduced
+// Inputs x[k], y[k], m[k]; output z[k]
+//
+//    extern void bignum_modadd(uint64_t k, uint64_t *z, const uint64_t *x,
+//                              const uint64_t *y, const uint64_t *m);
+//
+// Standard x86-64 ABI: RDI = k, RSI = z, RDX = x, RCX = y, R8 = m
+// Microsoft x64 ABI:   RCX = k, RDX = z, R8 = x, R9 = y, [RSP+40] = m
+// ----------------------------------------------------------------------------
+#include "s2n_bignum_internal.h"
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_modadd)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_modadd)
+        .text
+#define k rdi
+#define z rsi
+#define x rdx
+#define y rcx
+#define m r8
+#define i r9
+#define j r10
+#define a rax
+#define c r11
+S2N_BN_SYMBOL(bignum_modadd):
+        _CET_ENDBR
+#if WINDOWS_ABI
+        push    rdi                     // rdi and rsi are callee-saved in the Microsoft x64 ABI
+        push    rsi
+        mov     rdi, rcx                // shuffle the Microsoft argument registers into the standard ABI ones
+        mov     rsi, rdx
+        mov     rdx, r8
+        mov     rcx, r9
+        mov     r8, [rsp+56]            // fifth argument m: [RSP+40] at entry plus 16 bytes for the two pushes
+#endif
+// If k = 0 there are no words to process, so return without touching z
+        test    k, k
+        jz      bignum_modadd_end
+// First just add (c::z) := x + y, a k-word sum in z with the carry-out in c
+        xor     c, c                    // c = 0; xor also clears CF for the first adc
+        mov     j, k                    // j = words remaining
+        xor     i, i                    // i = word index
+bignum_modadd_addloop:
+        mov     a, [x+8*i]
+        adc     a, [y+8*i]
+        mov     [z+8*i], a
+        inc     i                       // inc and dec leave CF alone, so the carry survives the loop control
+        dec     j
+        jnz     bignum_modadd_addloop
+        adc     c, 0                    // c = carry out of the top word (0 or 1)
+// Now do a comparison subtraction (c::z) - m, recording mask for (c::z) >= m
+        mov     j, k
+        xor     i, i                    // also clears CF: no borrow into the first sbb
+bignum_modadd_cmploop:
+        mov     a, [z+8*i]
+        sbb     a, [m+8*i]              // difference is discarded, only the borrow is carried on
+        inc     i
+        dec     j
+        jnz     bignum_modadd_cmploop
+        sbb     c, 0                    // with reduced inputs (x + y < 2m): c = 0 if (c::z) >= m, else all ones
+        not     c                       // c = all ones exactly when m has to be subtracted
+// Now do a masked subtraction z := z - [c] * m
+        xor     i, i
+bignum_modadd_subloop:
+        mov     a, [m+8*i]
+        and     a, c                    // a = m[i] or 0 according to the mask
+        neg     j                       // CF = (j != 0): reloads the saved borrow (j is 0 on the first pass)
+        sbb     [z+8*i], a
+        sbb     j, j                    // j = 0 or all ones: saves the borrow across the cmp below
+        inc     i
+        cmp     i, k
+        jc      bignum_modadd_subloop   // unsigned i < k
+bignum_modadd_end:
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_modsub.S b/src/lib/libcrypto/bn/arch/amd64/bignum_modsub.S
new file mode 100644
index 0000000000..63b3230e35
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_modsub.S
@@ -0,0 +1,99 @@
+// $OpenBSD: bignum_modsub.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
+//
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+// ----------------------------------------------------------------------------
+// Subtract modulo m, z := (x - y) mod m, assuming x and y reduced
+// Inputs x[k], y[k], m[k]; output z[k]
+//
+//    extern void bignum_modsub(uint64_t k, uint64_t *z, const uint64_t *x,
+//                              const uint64_t *y, const uint64_t *m);
+//
+// Standard x86-64 ABI: RDI = k, RSI = z, RDX = x, RCX = y, R8 = m
+// Microsoft x64 ABI:   RCX = k, RDX = z, R8 = x, R9 = y, [RSP+40] = m
+// ----------------------------------------------------------------------------
+#include "s2n_bignum_internal.h"
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_modsub)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_modsub)
+        .text
+#define k rdi
+#define z rsi
+#define x rdx
+#define y rcx
+#define m r8
+#define i r9
+#define j r10
+#define a rax
+#define c r11
+S2N_BN_SYMBOL(bignum_modsub):
+        _CET_ENDBR
+#if WINDOWS_ABI
+        push    rdi                     // rdi and rsi are callee-saved in the Microsoft x64 ABI
+        push    rsi
+        mov     rdi, rcx                // shuffle the Microsoft argument registers into the standard ABI ones
+        mov     rsi, rdx
+        mov     rdx, r8
+        mov     rcx, r9
+        mov     r8, [rsp+56]            // fifth argument m: [RSP+40] at entry plus 16 bytes for the two pushes
+#endif
+// If k = 0 there are no words to process, so return without touching z
+        test    k, k
+        jz      bignum_modsub_end
+// Subtract z := x - y and record a mask for the carry x - y < 0
+        xor     c, c                    // xor clears CF for the first sbb; c is overwritten after the loop
+        mov     j, k                    // j = words remaining
+        xor     i, i                    // i = word index
+bignum_modsub_subloop:
+        mov     a, [x+8*i]
+        sbb     a, [y+8*i]
+        mov     [z+8*i], a
+        inc     i                       // inc and dec leave CF alone, so the borrow survives the loop control
+        dec     j
+        jnz     bignum_modsub_subloop
+        sbb     c, c                    // c = all ones if the subtraction borrowed (x < y), else 0
+// Now do a masked addition z := z + [c] * m
+        xor     i, i
+bignum_modsub_addloop:
+        mov     a, [m+8*i]
+        and     a, c                    // a = m[i] or 0 according to the mask
+        neg     j                       // CF = (j != 0): reloads the saved carry (j is 0 on the first pass)
+        adc     [z+8*i], a
+        sbb     j, j                    // j = 0 or all ones: saves the carry across the cmp below
+        inc     i
+        cmp     i, k
+        jc      bignum_modsub_addloop   // unsigned i < k
+bignum_modsub_end:
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul.S
index a3552679a2..538cce9af7 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_mul.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul.S
@@ -1,3 +1,5 @@
+// $OpenBSD: bignum_mul.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
+//
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
@@ -16,9 +18,8 @@
 // Multiply z := x * y
 // Inputs x[m], y[n]; output z[k]
 //
-//    extern void bignum_mul
+//    extern void bignum_mul(uint64_t k, uint64_t *z, uint64_t m, const uint64_t *x,
-//     (uint64_t k, uint64_t *z,
+//                           uint64_t n, const uint64_t *y);
-//      uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
 //
 // Does the "z := x * y" operation where x is m digits, y is n, result z is k.
 // Truncates the result in general unless k >= m + n
@@ -59,7 +60,7 @@
 S2N_BN_SYMBOL(bignum_mul):
-        _CET_ENDBR
+        _CET_ENDBR
 #if WINDOWS_ABI
        push    rdi
@@ -88,7 +89,7 @@ S2N_BN_SYMBOL(bignum_mul):
 // If we did a multiply-add variant, however, then we could
        test    p, p
-        jz      end
+        jz      bignum_mul_end
 // Set initial 2-part sum to zero (we zero c inside the body)
@@ -99,7 +100,7 @@ S2N_BN_SYMBOL(bignum_mul):
        xor     k, k
-outerloop:
+bignum_mul_outerloop:
 // Zero our carry term first; we eventually want it and a zero is useful now
 // Set a =  max 0 (k + 1 - n), i = min (k + 1) m
@@ -125,11 +126,11 @@ outerloop:
        mov     d, k
        sub     d, i
        sub     i, a
-        jbe     innerend
+        jbe     bignum_mul_innerend
        lea     x,[rcx+8*a]
        lea     y,[r9+8*d-8]
-innerloop:
+bignum_mul_innerloop:
        mov     rax, [y+8*i]
        mul     QWORD PTR  [x]
        add     x, 8
@@ -137,9 +138,9 @@ innerloop:
        adc     h, rdx
        adc     c, 0
        dec     i
-        jnz     innerloop
+        jnz     bignum_mul_innerloop
-innerend:
+bignum_mul_innerend:
        mov     [z], l
        mov     l, h
@@ -147,9 +148,9 @@ innerend:
        add     z, 8
        cmp     k, p
-        jc      outerloop
+        jc      bignum_mul_outerloop
-end:
+bignum_mul_end:
        pop     r15
        pop     r14
        pop     r13
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8.S
new file mode 100644
index 0000000000..d6ad514020
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8.S
@@ -0,0 +1,187 @@
+// $OpenBSD: bignum_mul_4_8.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
+//
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+// ----------------------------------------------------------------------------
+// Multiply z := x * y
+// Inputs x[4], y[4]; output z[8]
+//
+//    extern void bignum_mul_4_8(uint64_t z[static 8], const uint64_t x[static 4],
+//                               const uint64_t y[static 4]);
+//
+// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
+// Microsoft x64 ABI:   RCX = z, RDX = x, R8 = y
+// ----------------------------------------------------------------------------
+#include "s2n_bignum_internal.h"
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_4_8)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_4_8)
+        .text
+// These are actually right
+#define z rdi
+#define x rsi
+// Copied in or set up
+#define y rcx
+// A zero register
+#define zero rbp
+#define zeroe ebp
+// mulpadd arg1,arg2: add the 128-bit product x[arg2] * rdx into the rotating window r8..r11, low half into slot (arg1+arg2) % 4 on the CF chain and high half into the next slot on the OF chain; clobbers rax and rbx
+// The .if ladder below only picks register names, since there is no conditional expression form such as reg[i], reg[i+1]
+.macro mulpadd arg1,arg2
+        mulx    rbx, rax, [x+8*\arg2]   // rbx:rax = rdx * x[arg2]; mulx leaves the flags untouched
+.if ((\arg1 + \arg2) % 4 == 0)
+        adcx    r8, rax                 // low half: carry chain through CF only
+        adox    r9, rbx                 // high half: independent carry chain through OF only
+.elseif ((\arg1 + \arg2) % 4 == 1)
+        adcx    r9, rax
+        adox    r10, rbx
+.elseif ((\arg1 + \arg2) % 4 == 2)
+        adcx    r10, rax
+        adox    r11, rbx
+.elseif ((\arg1 + \arg2) % 4 == 3)
+        adcx    r11, rax
+        adox    r8, rbx                 // window wraps around from r11 back to r8
+.endif
+.endm
+// addrow arg1: add the whole row y[arg1] * x (four words of x) into the window, store the finished word z[arg1], and reuse the register just stored as the new top word of the window
+.macro addrow arg1
+        mov     rdx, [y+8*\arg1]        // rdx is the implicit multiplicand of mulx
+        xor     zeroe, zeroe            // zero = 0, and CF and OF both cleared so the two carry chains start clean
+        mulpadd \arg1, 0
+.if (\arg1 % 4 == 0)
+        mov     [z+8*\arg1],r8          // lowest window word is final after the first product of the row
+.elseif (\arg1 % 4 == 1)
+        mov     [z+8*\arg1],r9
+.elseif (\arg1 % 4 == 2)
+        mov     [z+8*\arg1],r10
+.elseif (\arg1 % 4 == 3)
+        mov     [z+8*\arg1],r11
+.endif
+        mulpadd \arg1, 1
+        mulpadd \arg1, 2
+.if (\arg1 % 4 == 0)
+        mulx    r8, rax, [x+24]         // last product: high half goes straight into the register freed by the store
+        adcx    r11, rax
+        adox    r8, zero                // fold in the final carry of the OF chain
+        adcx    r8, zero                // fold in the final carry of the CF chain
+.elseif (\arg1 % 4 == 1)
+        mulx    r9, rax, [x+24]
+        adcx    r8, rax
+        adox    r9, zero
+        adcx    r9, zero
+.elseif (\arg1 % 4 == 2)
+        mulx    r10, rax, [x+24]
+        adcx    r9, rax
+        adox    r10, zero
+        adcx    r10, zero
+.elseif (\arg1 % 4 == 3)
+        mulx    r11, rax, [x+24]
+        adcx    r10, rax
+        adox    r11, zero
+        adcx    r11, zero
+.endif
+.endm
+S2N_BN_SYMBOL(bignum_mul_4_8):
+        _CET_ENDBR
+#if WINDOWS_ABI
+        push    rdi                     // rdi and rsi are callee-saved in the Microsoft x64 ABI
+        push    rsi
+        mov     rdi, rcx                // shuffle the Microsoft argument registers into the standard ABI ones
+        mov     rsi, rdx
+        mov     rdx, r8
+#endif
+// Save the callee-saved registers used below: rbp as the zero register, rbx as mulx scratch
+        push    rbp
+        push    rbx
+// Copy y into a safe register to start with, since rdx is needed as the implicit mulx operand
+        mov     y, rdx
+// Zero a register, which also makes sure we don't get a fake carry-in
+        xor     zeroe, zeroe
+// Do the zeroth row, which is a bit different
+// Write back the zero-zero product and then accumulate
+// words 1..4 of y[0] * x in r9,r10,r11,r8 (lowest first), using the CF chain only
+        mov     rdx, [y]
+        mulx    r9, r8, [x]             // r9:r8 = y[0] * x[0]
+        mov     [z], r8                 // z[0] is already final
+        mulx    r10, rbx, [x+8]
+        adcx    r9, rbx
+        mulx    r11, rbx, [x+16]
+        adcx    r10, rbx
+        mulx    r8, rbx, [x+24]         // r8 is free again after the store of z[0]
+        adcx    r11, rbx
+        adcx    r8, zero                // absorb the last carry into the top word
+// Now all the other rows in a uniform pattern
+        addrow  1
+        addrow  2
+        addrow  3
+// Now write back the additional columns z[4..7], held in r8..r11 after the last row
+        mov     [z+32], r8
+        mov     [z+40], r9
+        mov     [z+48], r10
+        mov     [z+56], r11
+// Restore registers and return
+        pop     rbx
+        pop     rbp
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8_alt.S
index 70ff69e372..2592d1d658 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8_alt.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8_alt.S
@@ -1,3 +1,5 @@
+// $OpenBSD: bignum_mul_4_8_alt.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
+//
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
@@ -16,8 +18,8 @@
 // Multiply z := x * y
 // Inputs x[4], y[4]; output z[8]
 //
-//    extern void bignum_mul_4_8_alt
+//    extern void bignum_mul_4_8_alt(uint64_t z[static 8], const uint64_t x[static 4],
-//      (uint64_t z[static 8], uint64_t x[static 4], uint64_t y[static 4]);
+//                                   const uint64_t y[static 4]);
 //
 // Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
 // Microsoft x64 ABI:   RCX = z, RDX = x, R8 = y
@@ -72,7 +74,7 @@
        adc     h, rdx
 S2N_BN_SYMBOL(bignum_mul_4_8_alt):
-        _CET_ENDBR
+        _CET_ENDBR
 #if WINDOWS_ABI
        push    rdi
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12.S
new file mode 100644
index 0000000000..56cbdf06e0
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12.S
@@ -0,0 +1,223 @@
+// $OpenBSD: bignum_mul_6_12.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
+//
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+// ----------------------------------------------------------------------------
+// Multiply z := x * y
+// Inputs x[6], y[6]; output z[12]
+//
+//    extern void bignum_mul_6_12(uint64_t z[static 12], const uint64_t x[static 6],
+//                                const uint64_t y[static 6]);
+//
+// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
+// Microsoft x64 ABI:   RCX = z, RDX = x, R8 = y
+// ----------------------------------------------------------------------------
+#include "s2n_bignum_internal.h"
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_6_12)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_6_12)
+        .text
+// These are actually right
+#define z rdi
+#define x rsi
+// Copied in or set up
+#define y rcx
+// A zero register
+#define zero rbp
+#define zeroe ebp
+// mulpadd arg1,arg2: add the 128-bit product x[arg2] * rdx into the rotating window r8..r13, low half into slot (arg1+arg2) % 6 on the CF chain and high half into the next slot on the OF chain; clobbers rax and rbx
+// The .if ladder below only picks register names, since there is no conditional expression form such as reg[i], reg[i+1]
+.macro mulpadd arg1,arg2
+        mulx    rbx, rax, [x+8*\arg2]   // rbx:rax = rdx * x[arg2]; mulx leaves the flags untouched
+.if ((\arg1 + \arg2) % 6 == 0)
+        adcx    r8, rax                 // low half: carry chain through CF only
+        adox    r9, rbx                 // high half: independent carry chain through OF only
+.elseif ((\arg1 + \arg2) % 6 == 1)
+        adcx    r9, rax
+        adox    r10, rbx
+.elseif ((\arg1 + \arg2) % 6 == 2)
+        adcx    r10, rax
+        adox    r11, rbx
+.elseif ((\arg1 + \arg2) % 6 == 3)
+        adcx    r11, rax
+        adox    r12, rbx
+.elseif ((\arg1 + \arg2) % 6 == 4)
+        adcx    r12, rax
+        adox    r13, rbx
+.elseif ((\arg1 + \arg2) % 6 == 5)
+        adcx    r13, rax
+        adox    r8, rbx                 // window wraps around from r13 back to r8
+.endif
+.endm
+// addrow arg1: add the whole row y[arg1] * x (six words of x) into the window, store the finished word z[arg1], and reuse the register just stored as the new top word of the window
+.macro addrow arg1
+        mov     rdx, [y+8*\arg1]        // rdx is the implicit multiplicand of mulx
+        xor     zeroe, zeroe            // zero = 0, and CF and OF both cleared so the two carry chains start clean
+        mulpadd \arg1, 0
+.if (\arg1 % 6 == 0)
+        mov     [z+8*\arg1],r8          // lowest window word is final after the first product of the row
+.elseif (\arg1 % 6 == 1)
+        mov     [z+8*\arg1],r9
+.elseif (\arg1 % 6 == 2)
+        mov     [z+8*\arg1],r10
+.elseif (\arg1 % 6 == 3)
+        mov     [z+8*\arg1],r11
+.elseif (\arg1 % 6 == 4)
+        mov     [z+8*\arg1],r12
+.elseif (\arg1 % 6 == 5)
+        mov     [z+8*\arg1],r13
+.endif
+        mulpadd \arg1, 1
+        mulpadd \arg1, 2
+        mulpadd \arg1, 3
+        mulpadd \arg1, 4
+.if (\arg1 % 6 == 0)
+        mulx    r8, rax, [x+40]         // last product: high half goes straight into the register freed by the store
+        adcx    r13, rax
+        adox    r8, zero                // fold in the final carry of the OF chain
+        adcx    r8, zero                // fold in the final carry of the CF chain
+.elseif (\arg1 % 6 == 1)
+        mulx    r9, rax, [x+40]
+        adcx    r8, rax
+        adox    r9, zero
+        adcx    r9, zero
+.elseif (\arg1 % 6 == 2)
+        mulx    r10, rax, [x+40]
+        adcx    r9, rax
+        adox    r10, zero
+        adcx    r10, zero
+.elseif (\arg1 % 6 == 3)
+        mulx    r11, rax, [x+40]
+        adcx    r10, rax
+        adox    r11, zero
+        adcx    r11, zero
+.elseif (\arg1 % 6 == 4)
+        mulx    r12, rax, [x+40]
+        adcx    r11, rax
+        adox    r12, zero
+        adcx    r12, zero
+.elseif (\arg1 % 6 == 5)
+        mulx    r13, rax, [x+40]
+        adcx    r12, rax
+        adox    r13, zero
+        adcx    r13, zero
+.endif
+.endm
+S2N_BN_SYMBOL(bignum_mul_6_12):
+        _CET_ENDBR
+#if WINDOWS_ABI
+        push    rdi                     // rdi and rsi are callee-saved in the Microsoft x64 ABI
+        push    rsi
+        mov     rdi, rcx                // shuffle the Microsoft argument registers into the standard ABI ones
+        mov     rsi, rdx
+        mov     rdx, r8
+#endif
+// Save the callee-saved registers used below: rbp as the zero register, rbx as mulx scratch, r12 and r13 as window words
+        push    rbp
+        push    rbx
+        push    r12
+        push    r13
+// Copy y into a safe register to start with, since rdx is needed as the implicit mulx operand
+        mov     y, rdx
+// Zero a register, which also makes sure we don't get a fake carry-in
+        xor     zeroe, zeroe
+// Do the zeroth row, which is a bit different
+// Write back the zero-zero product and then accumulate
+// words 1..6 of y[0] * x in r9,r10,r11,r12,r13,r8 (lowest first), using the CF chain only
+        mov     rdx, [y]
+        mulx    r9, r8, [x]             // r9:r8 = y[0] * x[0]
+        mov     [z], r8                 // z[0] is already final
+        mulx    r10, rbx, [x+8]
+        adcx    r9, rbx
+        mulx    r11, rbx, [x+16]
+        adcx    r10, rbx
+        mulx    r12, rbx, [x+24]
+        adcx    r11, rbx
+        mulx    r13, rbx, [x+32]
+        adcx    r12, rbx
+        mulx    r8, rbx, [x+40]         // r8 is free again after the store of z[0]
+        adcx    r13, rbx
+        adcx    r8, zero                // absorb the last carry into the top word
+// Now all the other rows in a uniform pattern
+        addrow  1
+        addrow  2
+        addrow  3
+        addrow  4
+        addrow  5
+// Now write back the additional columns z[6..11], held in r8..r13 after the last row
+        mov     [z+48], r8
+        mov     [z+56], r9
+        mov     [z+64], r10
+        mov     [z+72], r11
+        mov     [z+80], r12
+        mov     [z+88], r13
+// Restore registers and return
+        pop     r13
+        pop     r12
+        pop     rbx
+        pop     rbp
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12_alt.S
new file mode 100644
index 0000000000..077c52b38e
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12_alt.S
@@ -0,0 +1,199 @@
+// $OpenBSD: bignum_mul_6_12_alt.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
+//
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+// ----------------------------------------------------------------------------
+// Multiply z := x * y
+// Inputs x[6], y[6]; output z[12]
+//
+//    extern void bignum_mul_6_12_alt(uint64_t z[static 12],
+//                                    const uint64_t x[static 6],
+//                                    const uint64_t y[static 6]);
+//
+// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
+// Microsoft x64 ABI:   RCX = z, RDX = x, R8 = y
+// ----------------------------------------------------------------------------
+#include "s2n_bignum_internal.h"
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_6_12_alt)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_6_12_alt)
+        .text
+// z and x are used directly in their standard-ABI argument registers
+#define z rdi
+#define x rsi
+// This is moved from rdx to free it for muls
+#define y rcx
+// Other variables used as a rotating 3-word window to add terms to
+#define t0 r8
+#define t1 r9
+#define t2 r10
+// combadd: (c,h,l) += numa * numb, using rax and rdx for the product; c absorbs the carry out of h
+#define combadd(c,h,l,numa,numb)                \
+        mov     rax, numa;                      \
+        mul     QWORD PTR numb;                 \
+        add     l, rax;                         \
+        adc     h, rdx;                         \
+        adc     c, 0
+// combadz: as combadd, for use when c = 0 on entry, so that "adc c, c" leaves just the carry in c
+#define combadz(c,h,l,numa,numb)                \
+        mov     rax, numa;                      \
+        mul     QWORD PTR numb;                 \
+        add     l, rax;                         \
+        adc     h, rdx;                         \
+        adc     c, c
+// combads: two-word form, (h,l) += numa * numb, for use where no carry out of h is expected
+#define combads(h,l,numa,numb)                  \
+        mov     rax, numa;                      \
+        mul     QWORD PTR numb;                 \
+        add     l, rax;                         \
+        adc     h, rdx
+S2N_BN_SYMBOL(bignum_mul_6_12_alt):     // z[0..11] := x[0..5] * y[0..5], one result word at a time
+        _CET_ENDBR
+#if WINDOWS_ABI
+        push    rdi                     // rdi and rsi are popped again on this path before ret
+        push    rsi
+        mov     rdi, rcx                // Microsoft x64 arguments rcx, rdx, r8 are moved
+        mov     rsi, rdx                // into the standard-ABI registers rdi, rsi, rdx
+        mov     rdx, r8
+#endif
+// Keep the y pointer in rcx, since every mul below overwrites rdx with a high product word
+        mov     y, rdx
+// Result term 0: low word of x[0] * y[0]; the high word seeds the window in t0
+        mov     rax, [x]
+        mul     QWORD PTR [y]
+        mov     [z], rax
+        mov     t0, rdx
+        xor     t1, t1
+// Result term 1: add x[0] * y[1] and x[1] * y[0], then store the lowest window word
+        xor     t2, t2
+        combads(t1,t0,[x],[y+8])
+        combadz(t2,t1,t0,[x+8],[y])
+        mov     [z+8], t0
+// Result term 2: the products x[i] * y[j] with i + j = 2
+        xor     t0, t0                  // the word just stored is cleared and becomes the new top word
+        combadz(t0,t2,t1,[x],[y+16])
+        combadd(t0,t2,t1,[x+8],[y+8])
+        combadd(t0,t2,t1,[x+16],[y])
+        mov     [z+16], t1
+// Result term 3: the products x[i] * y[j] with i + j = 3
+        xor     t1, t1
+        combadz(t1,t0,t2,[x],[y+24])
+        combadd(t1,t0,t2,[x+8],[y+16])
+        combadd(t1,t0,t2,[x+16],[y+8])
+        combadd(t1,t0,t2,[x+24],[y])
+        mov     [z+24], t2
+// Result term 4: the products x[i] * y[j] with i + j = 4
+        xor     t2, t2
+        combadz(t2,t1,t0,[x],[y+32])
+        combadd(t2,t1,t0,[x+8],[y+24])
+        combadd(t2,t1,t0,[x+16],[y+16])
+        combadd(t2,t1,t0,[x+24],[y+8])
+        combadd(t2,t1,t0,[x+32],[y])
+        mov     [z+32], t0
+// Result term 5: the products x[i] * y[j] with i + j = 5 (the longest column, six products)
+        xor     t0, t0
+        combadz(t0,t2,t1,[x],[y+40])
+        combadd(t0,t2,t1,[x+8],[y+32])
+        combadd(t0,t2,t1,[x+16],[y+24])
+        combadd(t0,t2,t1,[x+24],[y+16])
+        combadd(t0,t2,t1,[x+32],[y+8])
+        combadd(t0,t2,t1,[x+40],[y])
+        mov     [z+40], t1
+// Result term 6: the products x[i] * y[j] with i + j = 6, i = 1..5
+        xor     t1, t1
+        combadz(t1,t0,t2,[x+8],[y+40])
+        combadd(t1,t0,t2,[x+16],[y+32])
+        combadd(t1,t0,t2,[x+24],[y+24])
+        combadd(t1,t0,t2,[x+32],[y+16])
+        combadd(t1,t0,t2,[x+40],[y+8])
+        mov     [z+48], t2
+// Result term 7: the products x[i] * y[j] with i + j = 7, i = 2..5
+        xor     t2, t2
+        combadz(t2,t1,t0,[x+16],[y+40])
+        combadd(t2,t1,t0,[x+24],[y+32])
+        combadd(t2,t1,t0,[x+32],[y+24])
+        combadd(t2,t1,t0,[x+40],[y+16])
+        mov     [z+56], t0
+// Result term 8: the products x[i] * y[j] with i + j = 8, i = 3..5
+        xor     t0, t0
+        combadz(t0,t2,t1,[x+24],[y+40])
+        combadd(t0,t2,t1,[x+32],[y+32])
+        combadd(t0,t2,t1,[x+40],[y+24])
+        mov     [z+64], t1
+// Result term 9: x[4] * y[5] + x[5] * y[4]
+        xor     t1, t1
+        combadz(t1,t0,t2,[x+32],[y+40])
+        combadd(t1,t0,t2,[x+40],[y+32])
+        mov     [z+72], t2
+// Result term 10: x[5] * y[5], added with the two-word form (no third window word)
+        combads(t1,t0,[x+40],[y+40])
+        mov     [z+80], t0
+// Result term 11: the remaining top window word
+        mov     [z+88], t1
+// Return; no other callee-saved register was modified
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16.S
new file mode 100644
index 0000000000..faa0196d8e
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16.S
@@ -0,0 +1,273 @@
+// $OpenBSD: bignum_mul_8_16.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
+//
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+// ----------------------------------------------------------------------------
+// Multiply z := x * y
+// Inputs x[8], y[8]; output z[16]
+//
+//    extern void bignum_mul_8_16(uint64_t z[static 16], const uint64_t x[static 8],
+//                                const uint64_t y[static 8]);
+//
+// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
+// Microsoft x64 ABI:   RCX = z, RDX = x, R8 = y
+// ----------------------------------------------------------------------------
+#include "s2n_bignum_internal.h"
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_8_16)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_8_16)
+        .text
+// z and x are used directly in their standard-ABI argument registers
+#define z rdi
+#define x rsi
+// Copied in or set up
+#define y rcx
+// A zero register
+#define zero rbp
+#define zeroe ebp
+// mulpadd i, j: add x[i] * rdx (rdx assumed = y[j]) into window words (i+j) % 8 and (i+j+1) % 8 of r8..r15
+.macro mulpadd arg1,arg2
+        mulx    rbx, rax, [x+8*\arg1]   // rbx:rax = rdx * x[i]; mulx leaves the flags untouched
+.if ((\arg1 + \arg2) % 8 == 0)
+        adcx    r8, rax                 // low product word is added on the CF carry chain
+        adox    r9, rbx                 // high product word is added on the OF carry chain
+.elseif ((\arg1 + \arg2) % 8 == 1)
+        adcx    r9, rax
+        adox    r10, rbx
+.elseif ((\arg1 + \arg2) % 8 == 2)
+        adcx    r10, rax
+        adox    r11, rbx
+.elseif ((\arg1 + \arg2) % 8 == 3)
+        adcx    r11, rax
+        adox    r12, rbx
+.elseif ((\arg1 + \arg2) % 8 == 4)
+        adcx    r12, rax
+        adox    r13, rbx
+.elseif ((\arg1 + \arg2) % 8 == 5)
+        adcx    r13, rax
+        adox    r14, rbx
+.elseif ((\arg1 + \arg2) % 8 == 6)
+        adcx    r14, rax
+        adox    r15, rbx
+.elseif ((\arg1 + \arg2) % 8 == 7)
+        adcx    r15, rax
+        adox    r8, rbx                 // the window is circular: r8 follows r15
+.endif
+.endm
+// mulpade i, j: like mulpadd, except that mulx writes the high word of x[i] * rdx
+// straight into the next window word, which then only receives the OF carry
+.macro mulpade arg1,arg2
+.if ((\arg1 + \arg2) % 8 == 0)
+        mulx    r9, rax, [x+8*\arg1]    // r9 is overwritten with the high product word
+        adcx    r8, rax                 // low product word is added on the CF carry chain
+        adox    r9, zero                // only the pending OF carry is added to the new top word
+.elseif ((\arg1 + \arg2) % 8 == 1)
+        mulx    r10, rax, [x+8*\arg1]
+        adcx    r9, rax
+        adox    r10, zero
+.elseif ((\arg1 + \arg2) % 8 == 2)
+        mulx    r11, rax, [x+8*\arg1]
+        adcx    r10, rax
+        adox    r11, zero
+.elseif ((\arg1 + \arg2) % 8 == 3)
+        mulx    r12, rax, [x+8*\arg1]
+        adcx    r11, rax
+        adox    r12, zero
+.elseif ((\arg1 + \arg2) % 8 == 4)
+        mulx    r13, rax, [x+8*\arg1]
+        adcx    r12, rax
+        adox    r13, zero
+.elseif ((\arg1 + \arg2) % 8 == 5)
+        mulx    r14, rax, [x+8*\arg1]
+        adcx    r13, rax
+        adox    r14, zero
+.elseif ((\arg1 + \arg2) % 8 == 6)
+        mulx    r15, rax, [x+8*\arg1]
+        adcx    r14, rax
+        adox    r15, zero
+.elseif ((\arg1 + \arg2) % 8 == 7)
+        mulx    r8, rax, [x+8*\arg1]    // the window is circular: r8 follows r15
+        adcx    r15, rax
+        adox    r8, zero
+.endif
+.endm
+// addrow j: add the whole row y[j] * x into the window r8..r15 and store the finished word z[j]
+.macro addrow arg1
+        mov     rdx, [y+8*\arg1]        // rdx = y[j], the implicit mulx multiplicand
+        xor     zeroe, zeroe            // zero register = 0; also clears CF and OF for the two chains
+        mulpadd 0, \arg1
+.if (\arg1 % 8 == 0)
+        mov     [z+8*\arg1],r8          // window word j % 8 is stored as result word j
+.elseif (\arg1 % 8 == 1)
+        mov     [z+8*\arg1],r9
+.elseif (\arg1 % 8 == 2)
+        mov     [z+8*\arg1],r10
+.elseif (\arg1 % 8 == 3)
+        mov     [z+8*\arg1],r11
+.elseif (\arg1 % 8 == 4)
+        mov     [z+8*\arg1],r12
+.elseif (\arg1 % 8 == 5)
+        mov     [z+8*\arg1],r13
+.elseif (\arg1 % 8 == 6)
+        mov     [z+8*\arg1],r14
+.elseif (\arg1 % 8 == 7)
+        mov     [z+8*\arg1],r15
+.endif
+        mulpadd 1, \arg1
+        mulpadd 2, \arg1
+        mulpadd 3, \arg1
+        mulpadd 4, \arg1
+        mulpadd 5, \arg1
+        mulpadd 6, \arg1
+        mulpade 7, \arg1
+.if (\arg1 % 8 == 0)
+        adc     r8, zero                // add the remaining CF carry to the word mulpade just re-created
+.elseif (\arg1 % 8 == 1)
+        adc     r9, zero
+.elseif (\arg1 % 8 == 2)
+        adc     r10, zero
+.elseif (\arg1 % 8 == 3)
+        adc     r11, zero
+.elseif (\arg1 % 8 == 4)
+        adc     r12, zero
+.elseif (\arg1 % 8 == 5)
+        adc     r13, zero
+.elseif (\arg1 % 8 == 6)
+        adc     r14, zero
+.elseif (\arg1 % 8 == 7)
+        adc     r15, zero
+.endif
+.endm
+S2N_BN_SYMBOL(bignum_mul_8_16):         // z[0..15] := x[0..7] * y[0..7], accumulated row by row
+        _CET_ENDBR
+#if WINDOWS_ABI
+        push    rdi                     // rdi and rsi are popped again on this path before ret
+        push    rsi
+        mov     rdi, rcx                // Microsoft x64 arguments rcx, rdx, r8 are moved
+        mov     rsi, rdx                // into the standard-ABI registers rdi, rsi, rdx
+        mov     rdx, r8
+#endif
+// Save rbp, rbx and r12..r15; all six are popped again before returning
+        push    rbp
+        push    rbx
+        push    r12
+        push    r13
+        push    r14
+        push    r15
+// Move the y pointer to rcx, since rdx is reloaded below as the implicit mulx operand
+        mov     y, rdx
+// Zero rbp; the xor also clears CF, so the first adc below has no stale carry-in
+        xor     zeroe, zeroe
+// Row 0 is handled separately: with rdx = y[0], the low word of x[0] * y[0]
+// is stored to z[0] and the remaining eight words of y[0] * x are accumulated
+// in r9,r10,r11,r12,r13,r14,r15,r8 (lowest first), chaining the carry through CF
+        mov     rdx, [y]
+        mulx    r9, r8, [x]             // r9:r8 = y[0] * x[0]; mulx does not touch the flags
+        mov     [z], r8
+        mulx    r10, rbx, [x+8]
+        adc     r9, rbx                 // high word of the previous product + low word of this one
+        mulx    r11, rbx, [x+16]
+        adc     r10, rbx
+        mulx    r12, rbx, [x+24]
+        adc     r11, rbx
+        mulx    r13, rbx, [x+32]
+        adc     r12, rbx
+        mulx    r14, rbx, [x+40]
+        adc     r13, rbx
+        mulx    r15, rbx, [x+48]
+        adc     r14, rbx
+        mulx    r8, rbx, [x+56]         // r8 is free again now that z[0] has been stored
+        adc     r15, rbx
+        adc     r8, zero                // top word only absorbs the final carry
+// Rows 1..7: each addrow adds y[j] * x into the window and stores z[j]
+        addrow  1
+        addrow  2
+        addrow  3
+        addrow  4
+        addrow  5
+        addrow  6
+        addrow  7
+// Store the eight words still held in r8..r15 as result words 8..15
+        mov     [z+64], r8
+        mov     [z+72], r9
+        mov     [z+80], r10
+        mov     [z+88], r11
+        mov     [z+96], r12
+        mov     [z+104], r13
+        mov     [z+112], r14
+        mov     [z+120], r15
+// Restore the saved registers in reverse push order and return
+        pop     r15
+        pop     r14
+        pop     r13
+        pop     r12
+        pop     rbx
+        pop     rbp
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16_alt.S
index 066403b074..0e30b9170f 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16_alt.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16_alt.S
@@ -1,3 +1,5 @@
+// $OpenBSD: bignum_mul_8_16_alt.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
+//
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
@@ -16,8 +18,9 @@
 // Multiply z := x * y
 // Inputs x[8], y[8]; output z[16]
 //
-//    extern void bignum_mul_8_16_alt
+//    extern void bignum_mul_8_16_alt(uint64_t z[static 16],
-//     (uint64_t z[static 16], uint64_t x[static 8], uint64_t y[static 8]);
+//                                    const uint64_t x[static 8],
+//                                    const uint64_t y[static 8]);
 //
 // Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
 // Microsoft x64 ABI:   RCX = z, RDX = x, R8 = y
@@ -72,7 +75,7 @@
        adc     h, rdx
 S2N_BN_SYMBOL(bignum_mul_8_16_alt):
-        _CET_ENDBR
+        _CET_ENDBR
 #if WINDOWS_ABI
        push    rdi
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr.S
index 54e3f59442..86f1af2ac4 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr.S
@@ -1,3 +1,5 @@
+// $OpenBSD: bignum_sqr.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
+//
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
@@ -16,8 +18,7 @@
 // Square z := x^2
 // Input x[n]; output z[k]
 //
-//    extern void bignum_sqr
+//    extern void bignum_sqr(uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x);
-//     (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x);
 //
 // Does the "z := x^2" operation where x is n digits and result z is k.
 // Truncates the result in general unless k >= 2 * n
@@ -62,7 +63,7 @@
 #define llshort ebp
 S2N_BN_SYMBOL(bignum_sqr):
-        _CET_ENDBR
+        _CET_ENDBR
 #if WINDOWS_ABI
        push    rdi
@@ -86,7 +87,7 @@ S2N_BN_SYMBOL(bignum_sqr):
 // If p = 0 the result is trivial and nothing needs doing
        test    p, p
-        jz      end
+        jz      bignum_sqr_end
 // initialize (hh,ll) = 0
@@ -97,7 +98,7 @@ S2N_BN_SYMBOL(bignum_sqr):
        xor     k, k
-outerloop:
+bignum_sqr_outerloop:
 // First let bot = MAX 0 (k + 1 - n) and top = MIN (k + 1) n
 // We want to accumulate all x[i] * x[k - i] for bot <= i < top
@@ -122,7 +123,7 @@ outerloop:
 // If htop <= bot then main doubled part of the sum is empty
        cmp     i, htop
-        jnc     nosumming
+        jnc     bignum_sqr_nosumming
 // Use a moving pointer for [y] = x[k-i] for the cofactor
@@ -132,7 +133,7 @@ outerloop:
 // Do the main part of the sum x[i] * x[k - i] for 2 * i < k
-innerloop:
+bignum_sqr_innerloop:
        mov     a, [x+8*i]
        mul     QWORD PTR [y]
        add     l, a
@@ -141,7 +142,7 @@ innerloop:
        sub     y, 8
        inc     i
        cmp     i, htop
-        jc      innerloop
+        jc      bignum_sqr_innerloop
 // Now double it
@@ -151,11 +152,11 @@ innerloop:
 // If k is even (which means 2 * i = k) and i < n add the extra x[i]^2 term
-nosumming:
+bignum_sqr_nosumming:
        test    k, 1
-        jnz     innerend
+        jnz     bignum_sqr_innerend
        cmp     i, n
-        jnc     innerend
+        jnc     bignum_sqr_innerend
        mov     a, [x+8*i]
        mul     a
@@ -165,7 +166,7 @@ nosumming:
 // Now add the local sum into the global sum, store and shift
-innerend:
+bignum_sqr_innerend:
        add     l, ll
        mov     [z+8*k], l
        adc     h, hh
@@ -175,11 +176,11 @@ innerend:
        inc     k
        cmp     k, p
-        jc      outerloop
+        jc      bignum_sqr_outerloop
 // Restore registers and return
-end:
+bignum_sqr_end:
        pop     r15
        pop     r14
        pop     r13
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8.S
new file mode 100644
index 0000000000..25664782f7
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8.S
@@ -0,0 +1,158 @@
+// $OpenBSD: bignum_sqr_4_8.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
+//
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+// ----------------------------------------------------------------------------
+// Square, z := x^2
+// Input x[4]; output z[8]
+//
+//    extern void bignum_sqr_4_8(uint64_t z[static 8], const uint64_t x[static 4]);
+//
+// Standard x86-64 ABI: RDI = z, RSI = x
+// Microsoft x64 ABI:   RCX = z, RDX = x
+// ----------------------------------------------------------------------------
+#include "s2n_bignum_internal.h"
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_4_8)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_4_8)
+        .text
+// z and x are used directly in their standard-ABI argument registers
+#define z rdi
+#define x rsi
+// A zero register
+#define zero rbp
+#define zeroe ebp
+// Other registers
+#define d1 r8
+#define d2 r9
+#define d3 r10
+#define d4 r11
+#define d5 r12
+#define d6 r13
+S2N_BN_SYMBOL(bignum_sqr_4_8):          // z[0..7] := x[0..3]^2
+        _CET_ENDBR
+#if WINDOWS_ABI
+        push    rdi                     // rdi and rsi are popped again on this path before ret
+        push    rsi
+        mov     rdi, rcx                // Microsoft x64 arguments rcx, rdx are moved
+        mov     rsi, rdx                // into the standard-ABI registers rdi, rsi
+#endif
+// Save rbp, r12 and r13; all three are popped again before returning
+        push    rbp
+        push    r12
+        push    r13
+// Set up an initial window [d6;...d1] = [23;03;01], where ij denotes x[i] * x[j]
+        mov     rdx, [x]
+        mulx    d2, d1, [x+8]           // d2:d1 = x[0] * x[1]
+        mulx    d4, d3, [x+24]          // d4:d3 = x[0] * x[3]
+        mov     rdx, [x+16]
+        mulx    d6, d5, [x+24]          // d6:d5 = x[2] * x[3]
+// Clear our zero register, and also initialize the flags for the carry chain
+        xor     zeroe, zeroe
+// Chain in the addition of 02 + 12 + 13 to that window (no carry-out possible)
+// This gives all the "heterogeneous" terms of the squaring ready to double
+        mulx    rcx, rax, [x]           // rdx is still x[2], so this is x[2] * x[0]
+        adcx    d2, rax                 // low words are added on the CF carry chain
+        adox    d3, rcx                 // high words are added on the OF carry chain
+        mulx    rcx, rax, [x+8]         // x[2] * x[1]
+        adcx    d3, rax
+        adox    d4, rcx
+        mov     rdx, [x+24]
+        mulx    rcx, rax, [x+8]         // x[3] * x[1]
+        adcx    d4, rax
+        adox    d5, rcx
+        adcx    d5, zero                // propagate the CF carry
+        adox    d6, zero                // propagate the OF carry
+        adcx    d6, zero
+// In principle this is otiose as CF and OF carries are absorbed at this point
+// However it seems helpful for the OOO engine to be told it's a fresh start
+        xor     zeroe, zeroe
+// Double and add to the 00 + 11 + 22 + 33 terms
+//
+// We could use shift-double but this seems tidier and in larger squarings
+// it was actually more efficient. I haven't experimented with this small
+// case to see how much that matters. Note: the writeback here is sprinkled
+// into the sequence in such a way that things still work if z = x, i.e. if
+// the output overwrites the input buffer and beyond.
+        mov     rdx, [x]
+        mulx    rdx, rax, rdx           // rdx:rax = x[0]^2
+        mov     [z], rax
+        adcx    d1, d1                  // doubling runs on the CF carry chain
+        adox    d1, rdx                 // the square words are added on the OF carry chain
+        mov     rdx, [x+8]              // x[1] is loaded before z[1] is written
+        mov     [z+8], d1
+        mulx    rdx, rax, rdx           // rdx:rax = x[1]^2
+        adcx    d2, d2
+        adox    d2, rax
+        adcx    d3, d3
+        adox    d3, rdx
+        mov     rdx, [x+16]             // x[2] is loaded before z[2] is written
+        mov     [z+16], d2
+        mulx    rdx, rax, rdx           // rdx:rax = x[2]^2
+        adcx    d4, d4
+        adox    d4, rax
+        adcx    d5, d5
+        adox    d5, rdx
+        mov     rdx, [x+24]             // x[3] is loaded before z[3] is written
+        mov     [z+24], d3
+        mulx    rdx, rax, rdx           // rdx:rax = x[3]^2
+        mov     [z+32], d4
+        adcx    d6, d6
+        mov     [z+40], d5
+        adox    d6, rax
+        mov     [z+48], d6
+        adcx    rdx, zero               // the top word absorbs the final CF carry
+        adox    rdx, zero               // and the final OF carry
+        mov     [z+56], rdx
+// Restore saved registers and return
+        pop     r13
+        pop     r12
+        pop     rbp
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8_alt.S
index 7c534ae907..7eafac3284 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8_alt.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8_alt.S
@@ -1,3 +1,5 @@
+// $OpenBSD: bignum_sqr_4_8_alt.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
+//
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
@@ -16,8 +18,8 @@
 // Square, z := x^2
 // Input x[4]; output z[8]
 //
-//    extern void bignum_sqr_4_8_alt
+//    extern void bignum_sqr_4_8_alt(uint64_t z[static 8],
-//      (uint64_t z[static 8], uint64_t x[static 4]);
+//                                   const uint64_t x[static 4]);
 //
 // Standard x86-64 ABI: RDI = z, RSI = x
 // Microsoft x64 ABI:   RCX = z, RDX = x
@@ -71,7 +73,7 @@
        adc     c, 0
 S2N_BN_SYMBOL(bignum_sqr_4_8_alt):
-        _CET_ENDBR
+        _CET_ENDBR
 #if WINDOWS_ABI
        push    rdi
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12.S
new file mode 100644
index 0000000000..3f055e8b75
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12.S
@@ -0,0 +1,227 @@
+// $OpenBSD: bignum_sqr_6_12.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
+//
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+// ----------------------------------------------------------------------------
+// Square, z := x^2
+// Input x[6]; output z[12]
+//
+//    extern void bignum_sqr_6_12(uint64_t z[static 12], const uint64_t x[static 6]);
+//
+// Standard x86-64 ABI: RDI = z, RSI = x
+// Microsoft x64 ABI:   RCX = z, RDX = x
+// ----------------------------------------------------------------------------
+#include "s2n_bignum_internal.h"
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_6_12)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_6_12)
+        .text
+// z and x are used directly in their standard-ABI argument registers
+#define z rdi
+#define x rsi
+// A zero register
+#define zero rbp
+#define zeroe ebp
+// Other registers
+#define d1 r8
+#define d2 r9
+#define d3 r10
+#define d4 r11
+#define d5 r12
+#define d6 r13
+#define d7 r14
+#define d8 r15
+#define d9 rbx
+// Care is needed: re-using the zero register
+#define d10 rbp
+S2N_BN_SYMBOL(bignum_sqr_6_12):         // z[0..11] := x[0..5]^2
+        _CET_ENDBR
+#if WINDOWS_ABI
+        push    rdi                     // rdi and rsi are popped again on this path before ret
+        push    rsi
+        mov     rdi, rcx                // Microsoft x64 arguments rcx, rdx are moved
+        mov     rsi, rdx                // into the standard-ABI registers rdi, rsi
+#endif
+// Save rbp, rbx and r12..r15; all six are popped again before returning
+        push    rbp
+        push    rbx
+        push    r12
+        push    r13
+        push    r14
+        push    r15
+// Set up an initial window [d8;...d1] = [34;05;03;01], where ij denotes x[i] * x[j]
+        mov     rdx, [x]
+        mulx    d2, d1, [x+8]           // d2:d1 = x[0] * x[1]
+        mulx    d4, d3, [x+24]          // d4:d3 = x[0] * x[3]
+        mulx    d6, d5, [x+40]          // d6:d5 = x[0] * x[5]
+        mov     rdx, [x+24]
+        mulx    d8, d7, [x+32]          // d8:d7 = x[3] * x[4]
+// Clear our zero register, and also initialize the flags for the carry chain
+        xor     zeroe, zeroe
+// Chain in the addition of 02 + 12 + 13 + 14 + 15 to that window
+// (no carry-out possible since we add it to the top of a product)
+        mov     rdx, [x+16]
+        mulx    rcx, rax, [x]           // x[2] * x[0]
+        adcx    d2, rax                 // low words are added on the CF carry chain
+        adox    d3, rcx                 // high words are added on the OF carry chain
+        mulx    rcx, rax, [x+8]         // x[2] * x[1]
+        adcx    d3, rax
+        adox    d4, rcx
+        mov     rdx, [x+8]
+        mulx    rcx, rax, [x+24]        // x[1] * x[3]
+        adcx    d4, rax
+        adox    d5, rcx
+        mulx    rcx, rax, [x+32]        // x[1] * x[4]
+        adcx    d5, rax
+        adox    d6, rcx
+        mulx    rcx, rax, [x+40]        // x[1] * x[5]
+        adcx    d6, rax
+        adox    d7, rcx
+        adcx    d7, zero                // propagate the CF carry
+        adox    d8, zero                // propagate the OF carry
+        adcx    d8, zero
+// Again zero out the flags. Actually they are already cleared but it may
+// help decouple these in the OOO engine not to wait for the chain above
+        xor     zeroe, zeroe
+// Now chain in the 04 + 23 + 24 + 25 + 35 + 45 terms
+// We are running out of registers: rbp stops being zero once d10 is written below
+        mov     rdx, [x+32]
+        mulx    rcx, rax, [x]           // x[4] * x[0]
+        adcx    d4, rax
+        adox    d5, rcx
+        mov     rdx, [x+16]
+        mulx    rcx, rax, [x+24]        // x[2] * x[3]
+        adcx    d5, rax
+        adox    d6, rcx
+        mulx    rcx, rax, [x+32]        // x[2] * x[4]
+        adcx    d6, rax
+        adox    d7, rcx
+        mulx    rcx, rax, [x+40]        // x[2] * x[5]
+        adcx    d7, rax
+        adox    d8, rcx
+        mov     rdx, [x+24]
+        mulx    d9, rax, [x+40]         // x[3] * x[5]; the high word creates d9
+        adcx    d8, rax
+        adox    d9, zero                // rbp is still zero at this point
+        mov     rdx, [x+32]
+        mulx    d10, rax, [x+40]        // x[4] * x[5]; the high word creates d10 in rbp
+        adcx    d9, rax
+        mov     eax, 0                  // zero via mov, which unlike xor leaves CF and OF intact
+        adox    d10, rax
+        adcx    d10, rax
+// Again, just for a clear fresh start for the flags
+        xor     eax, eax
+// Double and add to the 00 + 11 + 22 + 33 + 44 + 55 terms
+//
+// We could use shift-double but this seems tidier and in larger squarings
+// it was actually more efficient. I haven't experimented with this small
+// case to see how much that matters. Note: the writeback here is sprinkled
+// into the sequence in such a way that things still work if z = x, i.e. if
+// the output overwrites the input buffer and beyond.
+        mov     rdx, [x]
+        mulx    rdx, rax, rdx           // rdx:rax = x[0]^2
+        mov     [z], rax
+        adcx    d1, d1                  // doubling runs on the CF carry chain
+        adox    d1, rdx                 // the square words are added on the OF carry chain
+        mov     rdx, [x+8]              // x[1] is loaded before z[1] is written
+        mov     [z+8], d1
+        mulx    rdx, rax, rdx           // rdx:rax = x[1]^2
+        adcx    d2, d2
+        adox    d2, rax
+        adcx    d3, d3
+        adox    d3, rdx
+        mov     rdx, [x+16]             // x[2] is loaded before z[2] is written
+        mov     [z+16], d2
+        mulx    rdx, rax, rdx           // rdx:rax = x[2]^2
+        adcx    d4, d4
+        adox    d4, rax
+        adcx    d5, d5
+        adox    d5, rdx
+        mov     rdx, [x+24]             // x[3] is loaded before z[3] is written
+        mov     [z+24], d3
+        mulx    rdx, rax, rdx           // rdx:rax = x[3]^2
+        adcx    d6, d6
+        adox    d6, rax
+        adcx    d7, d7
+        adox    d7, rdx
+        mov     rdx, [x+32]             // x[4] is loaded before z[4] is written
+        mov     [z+32], d4
+        mulx    rdx, rax, rdx           // rdx:rax = x[4]^2
+        adcx    d8, d8
+        adox    d8, rax
+        adcx    d9, d9
+        adox    d9, rdx
+        mov     rdx, [x+40]             // x[5] is loaded before z[5] is written
+        mov     [z+40], d5
+        mulx    rdx, rax, rdx           // rdx:rax = x[5]^2
+        mov     [z+48], d6
+        adcx    d10, d10
+        mov     [z+56], d7
+        adox    d10, rax
+        mov     [z+64], d8
+        mov     eax, 0                  // zero via mov so that the pending CF and OF survive
+        mov     [z+72], d9
+        adcx    rdx, rax                // the top word absorbs the final CF carry
+        mov     [z+80], d10
+        adox    rdx, rax                // and the final OF carry
+        mov     [z+88], rdx
+// Restore saved registers and return
+        pop     r15
+        pop     r14
+        pop     r13
+        pop     r12
+        pop     rbx
+        pop     rbp
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12_alt.S
new file mode 100644
index 0000000000..eb43b0a15b
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12_alt.S
@@ -0,0 +1,210 @@
+// $OpenBSD: bignum_sqr_6_12_alt.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
+//
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+// ----------------------------------------------------------------------------
+// Square, z := x^2
+// Input x[6]; output z[12]
+//
+//    extern void bignum_sqr_6_12_alt(uint64_t z[static 12],
+//                                    const uint64_t x[static 6]);
+//
+// Standard x86-64 ABI: RDI = z, RSI = x
+// Microsoft x64 ABI:   RCX = z, RDX = x
+// ----------------------------------------------------------------------------
+#include "s2n_bignum_internal.h"
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_6_12_alt)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_6_12_alt)
+        .text
+// Input arguments
+#define z rdi
+#define x rsi
+// Other variables used as a rotating 3-word window to add terms to
+#define t0 r8
+#define t1 r9
+#define t2 r10
+// Additional temporaries for local windows to share doublings
+#define u0 rcx
+#define u1 r11
+// Macro for the key "multiply and add to (c,h,l)" step:
+// (c,h,l) += numa * numb, where (c,h,l) is a three-word accumulator with l
+// the least significant word. numb is used with QWORD PTR and so has to be
+// a memory operand. Clobbers rax, rdx and the flags.
+#define combadd(c,h,l,numa,numb)                \
+        mov     rax, numa;                      \
+        mul     QWORD PTR numb;                 \
+        add     l, rax;                         \
+        adc     h, rdx;                         \
+        adc     c, 0
+// Set up initial window (c,h,l) = numa * numb: the 128-bit product is
+// placed in (h,l) and c is zeroed. numb is used with QWORD PTR and so has
+// to be a memory operand. Clobbers rax, rdx and the flags.
+#define combaddz(c,h,l,numa,numb)               \
+        mov     rax, numa;                      \
+        mul     QWORD PTR numb;                 \
+        xor     c, c;                           \
+        mov     l, rax;                         \
+        mov     h, rdx
+// Doubling step (c,h,l) = 2 * (c,hh,ll) + (0,h,l)
+// The incoming c is the top word of the value being doubled. hh and ll are
+// doubled in place, so their original contents are lost.
+#define doubladd(c,h,l,hh,ll)                   \
+        add     ll, ll;                         \
+        adc     hh, hh;                         \
+        adc     c, c;                           \
+        add     l, ll;                          \
+        adc     h, hh;                          \
+        adc     c, 0
+// Square term incorporation (c,h,l) += numa^2
+// Clobbers rax, rdx and the flags.
+#define combadd1(c,h,l,numa)                    \
+        mov     rax, numa;                      \
+        mul     rax;                            \
+        add     l, rax;                         \
+        adc     h, rdx;                         \
+        adc     c, 0
+// A short form where we don't expect a top carry: (h,l) += numa^2, with
+// no carry propagated beyond h. Clobbers rax, rdx and the flags.
+#define combads(h,l,numa)                       \
+        mov     rax, numa;                      \
+        mul     rax;                            \
+        add     l, rax;                         \
+        adc     h, rdx
+// A version doubling directly before adding, for single non-square terms:
+// (c,h,l) += 2 * numa * numb. The product is doubled in rdx:rax, the bit
+// shifted out of the top is added into c, and the doubled product is then
+// accumulated. numb is used with QWORD PTR and so has to be a memory
+// operand. Clobbers rax, rdx and the flags.
+#define combadd2(c,h,l,numa,numb)               \
+        mov     rax, numa;                      \
+        mul     QWORD PTR numb;                 \
+        add     rax, rax;                       \
+        adc     rdx, rdx;                       \
+        adc     c, 0;                           \
+        add     l, rax;                         \
+        adc     h, rdx;                         \
+        adc     c, 0
+// Compute z[0..11] = x[0..5]^2 one result word at a time: result word k
+// collects every product x[i] * x[j] with i + j = k in a rotating
+// three-word window made of t0, t1 and t2. Cross products (i != j) are
+// counted twice, either directly via combadd2 or by summing them in
+// (u1,u0) and applying doubladd; the square x[k/2]^2 is added once for
+// even k. Besides the argument registers only rax, rcx, rdx and r8-r11
+// are written.
+S2N_BN_SYMBOL(bignum_sqr_6_12_alt):
+        _CET_ENDBR
+#if WINDOWS_ABI
+// Preserve rdi and rsi and move the arguments from rcx and rdx into the
+// registers used by the body
+        push    rdi
+        push    rsi
+        mov     rdi, rcx
+        mov     rsi, rdx
+#endif
+// Result term 0
+// (low word of x[0]^2; the high word seeds the window)
+        mov     rax, [x]
+        mul     rax
+        mov     [z], rax
+        mov     t0, rdx
+        xor     t1, t1
+// Result term 1
+        xor     t2, t2
+        combadd2(t2,t1,t0,[x],[x+8])
+        mov     [z+8], t0
+// Result term 2
+        xor     t0, t0
+        combadd1(t0,t2,t1,[x+8])
+        combadd2(t0,t2,t1,[x],[x+16])
+        mov     [z+16], t1
+// Result term 3
+// (sum the cross products in (t1,u1,u0), then double them into the window)
+        combaddz(t1,u1,u0,[x],[x+24])
+        combadd(t1,u1,u0,[x+8],[x+16])
+        doubladd(t1,t0,t2,u1,u0)
+        mov     [z+24], t2
+// Result term 4
+        combaddz(t2,u1,u0,[x],[x+32])
+        combadd(t2,u1,u0,[x+8],[x+24])
+        doubladd(t2,t1,t0,u1,u0)
+        combadd1(t2,t1,t0,[x+16])
+        mov     [z+32], t0
+// Result term 5
+        combaddz(t0,u1,u0,[x],[x+40])
+        combadd(t0,u1,u0,[x+8],[x+32])
+        combadd(t0,u1,u0,[x+16],[x+24])
+        doubladd(t0,t2,t1,u1,u0)
+        mov     [z+40], t1
+// Result term 6
+        combaddz(t1,u1,u0,[x+8],[x+40])
+        combadd(t1,u1,u0,[x+16],[x+32])
+        doubladd(t1,t0,t2,u1,u0)
+        combadd1(t1,t0,t2,[x+24])
+        mov     [z+48], t2
+// Result term 7
+        combaddz(t2,u1,u0,[x+16],[x+40])
+        combadd(t2,u1,u0,[x+24],[x+32])
+        doubladd(t2,t1,t0,u1,u0)
+        mov     [z+56], t0
+// Result term 8
+        xor     t0, t0
+        combadd2(t0,t2,t1,[x+24],[x+40])
+        combadd1(t0,t2,t1,[x+32])
+        mov     [z+64], t1
+// Result term 9
+        xor     t1, t1
+        combadd2(t1,t0,t2,[x+32],[x+40])
+        mov     [z+72], t2
+// Result term 10
+// (x[5]^2 is the last contribution; combads propagates no carry past t1)
+        combads(t1,t0,[x+40])
+        mov     [z+80], t0
+// Result term 11
+        mov     [z+88], t1
+// Return
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16.S
new file mode 100644
index 0000000000..41277b5b6a
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16.S
@@ -0,0 +1,311 @@
+// $OpenBSD: bignum_sqr_8_16.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
+//
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+// ----------------------------------------------------------------------------
+// Square, z := x^2
+// Input x[8]; output z[16]
+//
+//    extern void bignum_sqr_8_16(uint64_t z[static 16], const uint64_t x[static 8]);
+//
+// Standard x86-64 ABI: RDI = z, RSI = x
+// Microsoft x64 ABI:   RCX = z, RDX = x
+// ----------------------------------------------------------------------------
+#include "s2n_bignum_internal.h"
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_8_16)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_8_16)
+        .text
+// Input arguments: z is the output pointer and x is the input pointer
+#define z rdi
+#define x rsi
+// A zero register
+#define zero rbp
+#define zeroe ebp
+// mulpadd i, j adds rdx * x[i] into the window at the i+j point.
+// The window is the eight registers r8..r15, selected by (i+j) mod 8: the
+// low product word is added to that register on the CF chain (adcx) and
+// the high product word to the following register on the OF chain (adox),
+// wrapping from r15 back to r8. Clobbers rax and rcx.
+.macro mulpadd arg1,arg2
+        mulx    rcx, rax, [x+8*\arg1]
+.if ((\arg1 + \arg2) % 8 == 0)
+        adcx    r8, rax
+        adox    r9, rcx
+.elseif ((\arg1 + \arg2) % 8 == 1)
+        adcx    r9, rax
+        adox    r10, rcx
+.elseif ((\arg1 + \arg2) % 8 == 2)
+        adcx    r10, rax
+        adox    r11, rcx
+.elseif ((\arg1 + \arg2) % 8 == 3)
+        adcx    r11, rax
+        adox    r12, rcx
+.elseif ((\arg1 + \arg2) % 8 == 4)
+        adcx    r12, rax
+        adox    r13, rcx
+.elseif ((\arg1 + \arg2) % 8 == 5)
+        adcx    r13, rax
+        adox    r14, rcx
+.elseif ((\arg1 + \arg2) % 8 == 6)
+        adcx    r14, rax
+        adox    r15, rcx
+.elseif ((\arg1 + \arg2) % 8 == 7)
+        adcx    r15, rax
+        adox    r8, rcx
+.endif
+.endm
+// mulpade i, j adds rdx * x[i] into the window at i+j
+// but re-creates the top word assuming nothing to add there
+// That is, mulx writes the high product word straight into the following
+// window register, overwriting its previous contents, and the pending OF
+// carry is then absorbed into it by an adox with the zero register.
+// The low product word is added on the CF chain as usual. Clobbers rax.
+.macro mulpade arg1,arg2
+.if ((\arg1 + \arg2) % 8 == 0)
+        mulx    r9, rax, [x+8*\arg1]
+        adcx    r8, rax
+        adox    r9, zero
+.elseif ((\arg1 + \arg2) % 8 == 1)
+        mulx    r10, rax, [x+8*\arg1]
+        adcx    r9, rax
+        adox    r10, zero
+.elseif ((\arg1 + \arg2) % 8 == 2)
+        mulx    r11, rax, [x+8*\arg1]
+        adcx    r10, rax
+        adox    r11, zero
+.elseif ((\arg1 + \arg2) % 8 == 3)
+        mulx    r12, rax, [x+8*\arg1]
+        adcx    r11, rax
+        adox    r12, zero
+.elseif ((\arg1 + \arg2) % 8 == 4)
+        mulx    r13, rax, [x+8*\arg1]
+        adcx    r12, rax
+        adox    r13, zero
+.elseif ((\arg1 + \arg2) % 8 == 5)
+        mulx    r14, rax, [x+8*\arg1]
+        adcx    r13, rax
+        adox    r14, zero
+.elseif ((\arg1 + \arg2) % 8 == 6)
+        mulx    r15, rax, [x+8*\arg1]
+        adcx    r14, rax
+        adox    r15, zero
+.elseif ((\arg1 + \arg2) % 8 == 7)
+        mulx    r8, rax, [x+8*\arg1]
+        adcx    r15, rax
+        adox    r8, zero
+.endif
+.endm
+// diagonals computes the whole 16-word square z = x^2 in two phases. First
+// the sum of the cross products x[i] * x[j] with i > j is accumulated in
+// the rotating window r8..r15, with words 1..8 of that sum stored to z as
+// they are finished and words 9..14 left in r9..r14. Then the sum is
+// doubled and the squares x[i]^2 are added in to give the final result.
+// Writes rax, rcx, rdx, r8-r15 and rbp (the zero register).
+.macro diagonals
+// Zero rbp. The xor also clears CF and OF, which is why it is repeated
+// before each group below: the adcx/adox carry chains then start afresh.
+        xor     zeroe, zeroe
+// Set initial window [r8..r10] + 2 wb = 10 + 20 + 30 + 40 + 50 + 60 + 70
+        mov     rdx, [x]
+        mulx    rax, r9, [x+8]
+        mov     [z+8], r9
+        mulx    rcx, r10, [x+16]
+        adcx    r10, rax
+        mov     [z+16], r10
+        mulx    rax, r11, [x+24]
+        adcx    r11, rcx
+        mulx    rcx, r12, [x+32]
+        adcx    r12, rax
+        mulx    rax, r13, [x+40]
+        adcx    r13, rcx
+        mulx    rcx, r14, [x+48]
+        adcx    r14, rax
+        mulx    r8, r15, [x+56]
+        adcx    r15, rcx
+        adcx    r8, zero
+// Add in the next diagonal = 21 + 31 + 41 + 51 + 61 + 71 + 54
+        xor     zeroe, zeroe
+        mov     rdx, [x+8]
+        mulpadd 2, 1
+        mov     [z+24], r11
+        mulpadd 3, 1
+        mov     [z+32], r12
+        mulpadd 4, 1
+        mulpadd 5, 1
+        mulpadd 6, 1
+        mulpade 7, 1
+        mov     rdx, [x+32]
+        mulpade 5, 4
+// Fold the last CF carry of this group into the top word just created
+        adcx    r10, zero
+// And the next one = 32 + 42 + 52 + 62 + 72 + 64 + 65
+        xor     zeroe, zeroe
+        mov     rdx, [x+16]
+        mulpadd 3, 2
+        mov     [z+40], r13
+        mulpadd 4, 2
+        mov     [z+48], r14
+        mulpadd 5, 2
+        mulpadd 6, 2
+        mulpadd 7, 2
+        mov     rdx, [x+48]
+        mulpade 4, 6
+        mulpade 5, 6
+        adcx    r12, zero
+// And the final one = 43 + 53 + 63 + 73 + 74 + 75 + 76
+        xor     zeroe, zeroe
+        mov     rdx, [x+24]
+        mulpadd 4, 3
+        mov     [z+56], r15
+        mulpadd 5, 3
+        mov     [z+64], r8
+        mulpadd 6, 3
+        mulpadd 7, 3
+        mov     rdx, [x+56]
+        mulpadd 4, 7
+        mulpade 5, 7
+        mulpade 6, 7
+        adcx    r14, zero
+// Double and add things; use z[1]..z[8] and thereafter the registers
+// r9..r15 which haven't been written back yet
+// (r9..r14 hold words 9..14 of the cross-product sum; r15 is reused to
+// receive the high half of x[7]^2). Each "adcx w, w" doubles one word of
+// the sum, passing the shifted-out bit along the CF chain, while adox adds
+// the matching half of a square x[i]^2 along the OF chain.
+        xor     zeroe, zeroe
+        mov     rdx, [x]
+        mulx    rcx, rax, rdx
+        mov     [z], rax
+        mov     rax, [z+8]
+        adcx    rax, rax
+        adox    rax, rcx
+        mov     [z+8], rax
+        mov     rax, [z+16]
+        mov     rdx, [x+8]
+        mulx    rcx, rdx, rdx
+        adcx    rax, rax
+        adox    rax, rdx
+        mov     [z+16], rax
+        mov     rax, [z+24]
+        adcx    rax, rax
+        adox    rax, rcx
+        mov     [z+24], rax
+        mov     rax, [z+32]
+        mov     rdx, [x+16]
+        mulx    rcx, rdx, rdx
+        adcx    rax, rax
+        adox    rax, rdx
+        mov     [z+32], rax
+        mov     rax, [z+40]
+        adcx    rax, rax
+        adox    rax, rcx
+        mov     [z+40], rax
+        mov     rax, [z+48]
+        mov     rdx, [x+24]
+        mulx    rcx, rdx, rdx
+        adcx    rax, rax
+        adox    rax, rdx
+        mov     [z+48], rax
+        mov     rax, [z+56]
+        adcx    rax, rax
+        adox    rax, rcx
+        mov     [z+56], rax
+        mov     rax, [z+64]
+        mov     rdx, [x+32]
+        mulx    rcx, rdx, rdx
+        adcx    rax, rax
+        adox    rax, rdx
+        mov     [z+64], rax
+        adcx    r9, r9
+        adox    r9, rcx
+        mov     [z+72], r9
+        mov     rdx, [x+40]
+        mulx    rcx, rdx, rdx
+        adcx    r10, r10
+        adox    r10, rdx
+        mov     [z+80], r10
+        adcx    r11, r11
+        adox    r11, rcx
+        mov     [z+88], r11
+        mov     rdx, [x+48]
+        mulx    rcx, rdx, rdx
+        adcx    r12, r12
+        adox    r12, rdx
+        mov     [z+96], r12
+        adcx    r13, r13
+        adox    r13, rcx
+        mov     [z+104], r13
+        mov     rdx, [x+56]
+        mulx    r15, rdx, rdx
+        adcx    r14, r14
+        adox    r14, rdx
+        mov     [z+112], r14
+// Fold the final carries of both chains into the top word
+        adcx    r15, zero
+        adox    r15, zero
+        mov     [z+120], r15
+.endm
+// Entry point: save the callee-saved registers written by the diagonals
+// macro (rbp and r12-r15), expand the macro to do the squaring, and then
+// restore the registers in reverse order.
+S2N_BN_SYMBOL(bignum_sqr_8_16):
+        _CET_ENDBR
+#if WINDOWS_ABI
+// Preserve rdi and rsi and move the arguments from rcx and rdx into the
+// registers used by the body
+        push    rdi
+        push    rsi
+        mov     rdi, rcx
+        mov     rsi, rdx
+#endif
+// Save more registers to play with
+        push    rbp
+        push    r12
+        push    r13
+        push    r14
+        push    r15
+// Do the multiplication
+        diagonals
+// Real epilog
+        pop     r15
+        pop     r14
+        pop     r13
+        pop     r12
+        pop     rbp
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16_alt.S
index ac0b6f96c2..cb10ba2a12 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16_alt.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16_alt.S
@@ -1,3 +1,5 @@
+// $OpenBSD: bignum_sqr_8_16_alt.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
+//
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
@@ -16,7 +18,8 @@
 // Square, z := x^2
 // Input x[8]; output z[16]
 //
-//    extern void bignum_sqr_8_16_alt (uint64_t z[static 16], uint64_t x[static 8]);
+//    extern void bignum_sqr_8_16_alt(uint64_t z[static 16],
+//                                    const uint64_t x[static 8]);
 //
 // Standard x86-64 ABI: RDI = z, RSI = x
 // Microsoft x64 ABI:   RCX = z, RDX = x
@@ -103,7 +106,7 @@
        adc     c, 0
 S2N_BN_SYMBOL(bignum_sqr_8_16_alt):
-        _CET_ENDBR
+        _CET_ENDBR
 #if WINDOWS_ABI
        push    rdi
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sub.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sub.S
index 3ff8a30510..7324d3a71e 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_sub.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sub.S
@@ -1,3 +1,5 @@
+// $OpenBSD: bignum_sub.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
+//
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
@@ -16,9 +18,8 @@
 // Subtract, z := x - y
 // Inputs x[m], y[n]; outputs function return (carry-out) and z[p]
 //
-//    extern uint64_t bignum_sub
+//    extern uint64_t bignum_sub(uint64_t p, uint64_t *z, uint64_t m,
-//     (uint64_t p, uint64_t *z,
+//                               const uint64_t *x, uint64_t n, const uint64_t *y);
-//      uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
 //
 // Does the z := x - y operation, truncating modulo p words in general and
 // returning a top borrow (0 or 1) in the p'th place, only subtracting input
@@ -49,7 +50,7 @@
 S2N_BN_SYMBOL(bignum_sub):
-        _CET_ENDBR
+        _CET_ENDBR
 #if WINDOWS_ABI
        push    rdi
@@ -75,7 +76,7 @@ S2N_BN_SYMBOL(bignum_sub):
        cmp     p, n
        cmovc   n, p
        cmp     m, n
-        jc      ylonger
+        jc      bignum_sub_ylonger
 // The case where x is longer or of the same size (p >= m >= n)
@@ -83,32 +84,32 @@ S2N_BN_SYMBOL(bignum_sub):
        sub     m, n
        inc     m
        test    n, n
-        jz      xtest
+        jz      bignum_sub_xtest
-xmainloop:
+bignum_sub_xmainloop:
        mov     a, [x+8*i]
        sbb     a, [y+8*i]
        mov     [z+8*i],a
        inc     i
        dec     n
-        jnz     xmainloop
+        jnz     bignum_sub_xmainloop
-        jmp     xtest
+        jmp     bignum_sub_xtest
-xtoploop:
+bignum_sub_xtoploop:
        mov     a, [x+8*i]
        sbb     a, 0
        mov     [z+8*i],a
        inc     i
-xtest:
+bignum_sub_xtest:
        dec     m
-        jnz     xtoploop
+        jnz     bignum_sub_xtoploop
        sbb     a, a
        test    p, p
-        jz      tailskip
+        jz      bignum_sub_tailskip
-tailloop:
+bignum_sub_tailloop:
        mov     [z+8*i],a
        inc     i
        dec     p
-        jnz     tailloop
+        jnz     bignum_sub_tailloop
-tailskip:
+bignum_sub_tailskip:
        neg     a
 #if WINDOWS_ABI
        pop    rsi
@@ -118,29 +119,29 @@ tailskip:
 // The case where y is longer (p >= n > m)
-ylonger:
+bignum_sub_ylonger:
        sub     p, n
        sub     n, m
        test    m, m
-        jz      ytoploop
+        jz      bignum_sub_ytoploop
-ymainloop:
+bignum_sub_ymainloop:
        mov     a, [x+8*i]
        sbb     a, [y+8*i]
        mov     [z+8*i],a
        inc     i
        dec     m
-        jnz     ymainloop
+        jnz     bignum_sub_ymainloop
-ytoploop:
+bignum_sub_ytoploop:
        mov     ashort, 0
        sbb     a, [y+8*i]
        mov     [z+8*i],a
        inc     i
        dec     n
-        jnz     ytoploop
+        jnz     bignum_sub_ytoploop
        sbb     a, a
        test    p, p
-        jnz     tailloop
+        jnz     bignum_sub_tailloop
        neg     a
 #if WINDOWS_ABI
        pop    rsi
diff --git a/src/lib/libcrypto/bn/arch/amd64/bn_arch.c b/src/lib/libcrypto/bn/arch/amd64/bn_arch.c
index a377a05681..9ff8920ca2 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bn_arch.c
+++ b/src/lib/libcrypto/bn/arch/amd64/bn_arch.c
@@ -1,4 +1,4 @@
-/*      $OpenBSD: bn_arch.c,v 1.7 2023/06/24 16:01:44 jsing Exp $ */
+/*      $OpenBSD: bn_arch.c,v 1.12 2025/08/14 15:29:17 jsing Exp $ */
 /*
 * Copyright (c) 2023 Joel Sing <jsing@openbsd.org>
 *
@@ -19,6 +19,7 @@
 #include "bn_arch.h"
 #include "bn_local.h"
+#include "crypto_arch.h"
 #include "s2n_bignum.h"
 #ifdef HAVE_BN_ADD
@@ -26,8 +27,8 @@ BN_ULONG
 bn_add(BN_ULONG *r, int r_len, const BN_ULONG *a, int a_len, const BN_ULONG *b,
    int b_len)
 {
-        return bignum_add(r_len, (uint64_t *)r, a_len, (uint64_t *)a,
+        return bignum_add(r_len, (uint64_t *)r, a_len, (const uint64_t *)a,
-            b_len, (uint64_t *)b);
+            b_len, (const uint64_t *)b);
 }
 #endif
@@ -36,8 +37,8 @@ bn_add(BN_ULONG *r, int r_len, const BN_ULONG *a, int a_len, const BN_ULONG *b,
 BN_ULONG
 bn_add_words(BN_ULONG *rd, const BN_ULONG *ad, const BN_ULONG *bd, int n)
 {
-        return bignum_add(n, (uint64_t *)rd, n, (uint64_t *)ad, n,
+        return bignum_add(n, (uint64_t *)rd, n, (const uint64_t *)ad, n,
-            (uint64_t *)bd);
+            (const uint64_t *)bd);
 }
 #endif
@@ -46,8 +47,8 @@ BN_ULONG
 bn_sub(BN_ULONG *r, int r_len, const BN_ULONG *a, int a_len, const BN_ULONG *b,
    int b_len)
 {
-        return bignum_sub(r_len, (uint64_t *)r, a_len, (uint64_t *)a,
+        return bignum_sub(r_len, (uint64_t *)r, a_len, (const uint64_t *)a,
-            b_len, (uint64_t *)b);
+            b_len, (const uint64_t *)b);
 }
 #endif
@@ -55,8 +56,28 @@ bn_sub(BN_ULONG *r, int r_len, const BN_ULONG *a, int a_len, const BN_ULONG *b,
 BN_ULONG
 bn_sub_words(BN_ULONG *rd, const BN_ULONG *ad, const BN_ULONG *bd, int n)
 {
-        return bignum_sub(n, (uint64_t *)rd, n, (uint64_t *)ad, n,
+        return bignum_sub(n, (uint64_t *)rd, n, (const uint64_t *)ad, n,
-            (uint64_t *)bd);
+            (const uint64_t *)bd);
+}
+#endif
+#ifdef HAVE_BN_MOD_ADD_WORDS
+/*
+ * Thin wrapper around bignum_modadd(): n is passed as the word count and
+ * r, a, b and m are passed as arrays of 64-bit words. Presumably this
+ * computes r = (a + b) mod m - confirm against bignum_modadd().
+ */
+void
+bn_mod_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
+    const BN_ULONG *m, size_t n)
+{
+        bignum_modadd(n, (uint64_t *)r, (const uint64_t *)a,
+            (const uint64_t *)b, (const uint64_t *)m);
+}
+#endif
+#ifdef HAVE_BN_MOD_SUB_WORDS
+/*
+ * Thin wrapper around bignum_modsub(): n is passed as the word count and
+ * r, a, b and m are passed as arrays of 64-bit words. Presumably this
+ * computes r = (a - b) mod m - confirm against bignum_modsub().
+ */
+void
+bn_mod_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
+    const BN_ULONG *m, size_t n)
+{
+        bignum_modsub(n, (uint64_t *)r, (const uint64_t *)a,
+            (const uint64_t *)b, (const uint64_t *)m);
 }
 #endif
@@ -64,7 +85,7 @@ bn_sub_words(BN_ULONG *rd, const BN_ULONG *ad, const BN_ULONG *bd, int n)
 BN_ULONG
 bn_mul_add_words(BN_ULONG *rd, const BN_ULONG *ad, int num, BN_ULONG w)
 {
-        return bignum_cmadd(num, (uint64_t *)rd, w, num, (uint64_t *)ad);
+        return bignum_cmadd(num, (uint64_t *)rd, w, num, (const uint64_t *)ad);
 }
 #endif
@@ -72,25 +93,52 @@ bn_mul_add_words(BN_ULONG *rd, const BN_ULONG *ad, int num, BN_ULONG w)
 BN_ULONG
 bn_mul_words(BN_ULONG *rd, const BN_ULONG *ad, int num, BN_ULONG w)
 {
-        return bignum_cmul(num, (uint64_t *)rd, w, num, (uint64_t *)ad);
+        return bignum_cmul(num, (uint64_t *)rd, w, num, (const uint64_t *)ad);
 }
 #endif
 #ifdef HAVE_BN_MUL_COMBA4
 void
-bn_mul_comba4(BN_ULONG *rd, BN_ULONG *ad, BN_ULONG *bd)
+bn_mul_comba4(BN_ULONG *rd, const BN_ULONG *ad, const BN_ULONG *bd)
 {
-        /* XXX - consider using non-alt on CPUs that have the ADX extension. */
+        /*
+         * Use bignum_mul_4_8() when the CPU capability flags report ADX,
+         * otherwise fall back to bignum_mul_4_8_alt().
+         */
+        if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_ADX) != 0) {
-        bignum_mul_4_8_alt((uint64_t *)rd, (uint64_t *)ad, (uint64_t *)bd);
+                bignum_mul_4_8((uint64_t *)rd, (const uint64_t *)ad,
+                    (const uint64_t *)bd);
+                return;
+        }
+        bignum_mul_4_8_alt((uint64_t *)rd, (const uint64_t *)ad,
+            (const uint64_t *)bd);
+}
+#endif
+#ifdef HAVE_BN_MUL_COMBA6
+void
+bn_mul_comba6(BN_ULONG *rd, const BN_ULONG *ad, const BN_ULONG *bd)
+{
+        /*
+         * Use bignum_mul_6_12() when the CPU capability flags report ADX,
+         * otherwise fall back to bignum_mul_6_12_alt().
+         */
+        if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_ADX) != 0) {
+                bignum_mul_6_12((uint64_t *)rd, (const uint64_t *)ad,
+                    (const uint64_t *)bd);
+                return;
+        }
+        bignum_mul_6_12_alt((uint64_t *)rd, (const uint64_t *)ad,
+            (const uint64_t *)bd);
 }
 #endif
 #ifdef HAVE_BN_MUL_COMBA8
 void
-bn_mul_comba8(BN_ULONG *rd, BN_ULONG *ad, BN_ULONG *bd)
+bn_mul_comba8(BN_ULONG *rd, const BN_ULONG *ad, const BN_ULONG *bd)
 {
-        /* XXX - consider using non-alt on CPUs that have the ADX extension. */
+        if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_ADX) != 0) {
-        bignum_mul_8_16_alt((uint64_t *)rd, (uint64_t *)ad, (uint64_t *)bd);
+                bignum_mul_8_16((uint64_t *)rd, (const uint64_t *)ad,
+                    (const uint64_t *)bd);
+                return;
+        }
+        bignum_mul_8_16_alt((uint64_t *)rd, (const uint64_t *)ad,
+            (const uint64_t *)bd);
 }
 #endif
@@ -98,7 +146,7 @@ bn_mul_comba8(BN_ULONG *rd, BN_ULONG *ad, BN_ULONG *bd)
 int
 bn_sqr(BIGNUM *r, const BIGNUM *a, int r_len, BN_CTX *ctx)
 {
-        bignum_sqr(r_len, (uint64_t *)r->d, a->top, (uint64_t *)a->d);
+        bignum_sqr(r_len, (uint64_t *)r->d, a->top, (const uint64_t *)a->d);
        return 1;
 }
@@ -108,8 +156,25 @@ bn_sqr(BIGNUM *r, const BIGNUM *a, int r_len, BN_CTX *ctx)
 void
 bn_sqr_comba4(BN_ULONG *rd, const BN_ULONG *ad)
 {
-        /* XXX - consider using non-alt on CPUs that have the ADX extension. */
+        /*
+         * Use bignum_sqr_4_8() when the CPU capability flags report ADX,
+         * otherwise fall back to bignum_sqr_4_8_alt().
+         */
+        if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_ADX) != 0) {
-        bignum_sqr_4_8_alt((uint64_t *)rd, (uint64_t *)ad);
+                bignum_sqr_4_8((uint64_t *)rd, (const uint64_t *)ad);
+                return;
+        }
+        bignum_sqr_4_8_alt((uint64_t *)rd, (const uint64_t *)ad);
+}
+#endif
+#ifdef HAVE_BN_SQR_COMBA6
+void
+bn_sqr_comba6(BN_ULONG *rd, const BN_ULONG *ad)
+{
+        /*
+         * Use bignum_sqr_6_12() when the CPU capability flags report ADX,
+         * otherwise fall back to bignum_sqr_6_12_alt().
+         */
+        if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_ADX) != 0) {
+                bignum_sqr_6_12((uint64_t *)rd, (const uint64_t *)ad);
+                return;
+        }
+        bignum_sqr_6_12_alt((uint64_t *)rd, (const uint64_t *)ad);
 }
 #endif
@@ -117,8 +182,12 @@ bn_sqr_comba4(BN_ULONG *rd, const BN_ULONG *ad)
 void
 bn_sqr_comba8(BN_ULONG *rd, const BN_ULONG *ad)
 {
-        /* XXX - consider using non-alt on CPUs that have the ADX extension. */
+        /*
+         * Use bignum_sqr_8_16() when the CPU capability flags report ADX,
+         * otherwise fall back to bignum_sqr_8_16_alt().
+         */
+        if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_ADX) != 0) {
-        bignum_sqr_8_16_alt((uint64_t *)rd, (uint64_t *)ad);
+                bignum_sqr_8_16((uint64_t *)rd, (const uint64_t *)ad);
+                return;
+        }
+        bignum_sqr_8_16_alt((uint64_t *)rd, (const uint64_t *)ad);
 }
 #endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bn_arch.h b/src/lib/libcrypto/bn/arch/amd64/bn_arch.h
index 927cd75208..7359f993a7 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bn_arch.h
+++ b/src/lib/libcrypto/bn/arch/amd64/bn_arch.h
@@ -1,4 +1,4 @@
-/*      $OpenBSD: bn_arch.h,v 1.14 2024/03/26 06:09:25 jsing Exp $ */
+/*      $OpenBSD: bn_arch.h,v 1.16 2025/08/14 15:22:54 jsing Exp $ */
 /*
 * Copyright (c) 2023 Joel Sing <jsing@openbsd.org>
 *
@@ -27,13 +27,18 @@
 #define HAVE_BN_DIV_WORDS
+#define HAVE_BN_MOD_ADD_WORDS
+#define HAVE_BN_MOD_SUB_WORDS
 #define HAVE_BN_MUL_ADD_WORDS
 #define HAVE_BN_MUL_COMBA4
+#define HAVE_BN_MUL_COMBA6
 #define HAVE_BN_MUL_COMBA8
 #define HAVE_BN_MUL_WORDS
 #define HAVE_BN_SQR
 #define HAVE_BN_SQR_COMBA4
+#define HAVE_BN_SQR_COMBA6
 #define HAVE_BN_SQR_COMBA8
 #define HAVE_BN_SUB
diff --git a/src/lib/libcrypto/bn/arch/amd64/word_clz.S b/src/lib/libcrypto/bn/arch/amd64/word_clz.S
index 3926fcd4b0..705fbdbbda 100644
--- a/src/lib/libcrypto/bn/arch/amd64/word_clz.S
+++ b/src/lib/libcrypto/bn/arch/amd64/word_clz.S
@@ -1,3 +1,5 @@
+// $OpenBSD: word_clz.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
+//
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
@@ -16,7 +18,7 @@
 // Count leading zero bits in a single word
 // Input a; output function return
 //
-//    extern uint64_t word_clz (uint64_t a);
+//    extern uint64_t word_clz(uint64_t a);
 //
 // Standard x86-64 ABI: RDI = a, returns RAX
 // Microsoft x64 ABI:   RCX = a, returns RAX
@@ -30,7 +32,7 @@
        .text
 S2N_BN_SYMBOL(word_clz):
-        _CET_ENDBR
+        _CET_ENDBR
 #if WINDOWS_ABI
        push    rdi