57 files changed, 4103 insertions, 1350 deletions
diff --git a/src/lib/libcrypto/Makefile b/src/lib/libcrypto/Makefile
index b0ab507983..459b0c9235 100644
--- a/src/lib/libcrypto/Makefile
+++ b/src/lib/libcrypto/Makefile
@@ -1,4 +1,4 @@
-# $OpenBSD: Makefile,v 1.241 2025/07/16 15:59:26 tb Exp $
+# $OpenBSD: Makefile,v 1.242 2025/08/14 15:48:48 beck Exp $
 LIB=    crypto
 LIBREBUILD=y
@@ -374,8 +374,10 @@ SRCS+= md4.c
 SRCS+= md5.c
 # mlkem/
+SRCS+= mlkem.c
 SRCS+= mlkem768.c
 SRCS+= mlkem1024.c
+SRCS+= mlkem_key.c
 # modes/
 SRCS+= cbc128.c
@@ -668,6 +670,7 @@ HDRS=\
        ${LCRYPTO_SRC}/lhash/lhash.h \
        ${LCRYPTO_SRC}/md4/md4.h \
        ${LCRYPTO_SRC}/md5/md5.h \
+        ${LCRYPTO_SRC}/mlkem/mlkem.h \
        ${LCRYPTO_SRC}/modes/modes.h \
        ${LCRYPTO_SRC}/objects/objects.h \
        ${LCRYPTO_SRC}/ocsp/ocsp.h \
diff --git a/src/lib/libcrypto/arch/aarch64/opensslconf.h b/src/lib/libcrypto/arch/aarch64/opensslconf.h
index 868066c75e..c31bcc01ad 100644
--- a/src/lib/libcrypto/arch/aarch64/opensslconf.h
+++ b/src/lib/libcrypto/arch/aarch64/opensslconf.h
@@ -11,13 +11,3 @@
 #define OPENSSL_LINE __LINE__
 #endif
 #endif
-#if defined(HEADER_RC4_H)
-#if !defined(RC4_CHUNK)
-/*
- * This enables code handling data aligned at natural CPU word
- * boundary. See crypto/rc4/rc4_enc.c for further details.
- */
-#define RC4_CHUNK unsigned long
-#endif
-#endif
diff --git a/src/lib/libcrypto/arch/alpha/opensslconf.h b/src/lib/libcrypto/arch/alpha/opensslconf.h
index 868066c75e..c31bcc01ad 100644
--- a/src/lib/libcrypto/arch/alpha/opensslconf.h
+++ b/src/lib/libcrypto/arch/alpha/opensslconf.h
@@ -11,13 +11,3 @@
 #define OPENSSL_LINE __LINE__
 #endif
 #endif
-#if defined(HEADER_RC4_H)
-#if !defined(RC4_CHUNK)
-/*
- * This enables code handling data aligned at natural CPU word
- * boundary. See crypto/rc4/rc4_enc.c for further details.
- */
-#define RC4_CHUNK unsigned long
-#endif
-#endif
diff --git a/src/lib/libcrypto/arch/amd64/Makefile.inc b/src/lib/libcrypto/arch/amd64/Makefile.inc
index 649c507189..de9666afdb 100644
--- a/src/lib/libcrypto/arch/amd64/Makefile.inc
+++ b/src/lib/libcrypto/arch/amd64/Makefile.inc
@@ -1,4 +1,4 @@
-# $OpenBSD: Makefile.inc,v 1.41 2025/06/28 12:39:10 jsing Exp $
+# $OpenBSD: Makefile.inc,v 1.42 2025/08/14 15:12:53 jsing Exp $
 # amd64-specific libcrypto build rules
@@ -26,11 +26,21 @@ SRCS += bn_arch.c
 SRCS += bignum_add.S
 SRCS += bignum_cmadd.S
 SRCS += bignum_cmul.S
+SRCS += bignum_modadd.S
+SRCS += bignum_modsub.S
 SRCS += bignum_mul.S
+SRCS += bignum_mul_4_8.S
 SRCS += bignum_mul_4_8_alt.S
+SRCS += bignum_mul_6_12.S
+SRCS += bignum_mul_6_12_alt.S
+SRCS += bignum_mul_8_16.S
 SRCS += bignum_mul_8_16_alt.S
 SRCS += bignum_sqr.S
+SRCS += bignum_sqr_4_8.S
 SRCS += bignum_sqr_4_8_alt.S
+SRCS += bignum_sqr_6_12.S
+SRCS += bignum_sqr_6_12_alt.S
+SRCS += bignum_sqr_8_16.S
 SRCS += bignum_sqr_8_16_alt.S
 SRCS += bignum_sub.S
 SRCS += word_clz.S
diff --git a/src/lib/libcrypto/arch/amd64/crypto_arch.h b/src/lib/libcrypto/arch/amd64/crypto_arch.h
index e869fbba35..a8f64cf235 100644
--- a/src/lib/libcrypto/arch/amd64/crypto_arch.h
+++ b/src/lib/libcrypto/arch/amd64/crypto_arch.h
@@ -1,4 +1,4 @@
-/*      $OpenBSD: crypto_arch.h,v 1.13 2025/07/22 09:18:02 jsing Exp $ */
+/*      $OpenBSD: crypto_arch.h,v 1.14 2025/08/14 15:11:01 jsing Exp $ */
 /*
 * Copyright (c) 2024 Joel Sing <jsing@openbsd.org>
 *
@@ -26,9 +26,10 @@
 extern uint64_t crypto_cpu_caps_amd64;
 #endif
-#define CRYPTO_CPU_CAPS_AMD64_AES       (1ULL << 0)
+#define CRYPTO_CPU_CAPS_AMD64_ADX       (1ULL << 0)
-#define CRYPTO_CPU_CAPS_AMD64_CLMUL     (1ULL << 1)
+#define CRYPTO_CPU_CAPS_AMD64_AES       (1ULL << 1)
-#define CRYPTO_CPU_CAPS_AMD64_SHA       (1ULL << 2)
+#define CRYPTO_CPU_CAPS_AMD64_CLMUL     (1ULL << 2)
+#define CRYPTO_CPU_CAPS_AMD64_SHA       (1ULL << 3)
 #ifndef OPENSSL_NO_ASM
diff --git a/src/lib/libcrypto/arch/amd64/crypto_cpu_caps.c b/src/lib/libcrypto/arch/amd64/crypto_cpu_caps.c
index 0bc440d34f..51a2da4616 100644
--- a/src/lib/libcrypto/arch/amd64/crypto_cpu_caps.c
+++ b/src/lib/libcrypto/arch/amd64/crypto_cpu_caps.c
@@ -1,4 +1,4 @@
-/*      $OpenBSD: crypto_cpu_caps.c,v 1.7 2025/07/22 09:18:02 jsing Exp $ */
+/*      $OpenBSD: crypto_cpu_caps.c,v 1.8 2025/08/14 15:11:01 jsing Exp $ */
 /*
 * Copyright (c) 2024 Joel Sing <jsing@openbsd.org>
 *
@@ -119,6 +119,10 @@ crypto_cpu_caps_init(void)
        if (max_cpuid >= 7) {
                cpuid(7, NULL, &ebx, NULL, NULL);
+                /* Intel ADX feature bit - ebx[19]. */
+                if (((ebx >> 19) & 1) != 0)
+                        crypto_cpu_caps_amd64 |= CRYPTO_CPU_CAPS_AMD64_ADX;
                /* Intel SHA extensions feature bit - ebx[29]. */
                if (((ebx >> 29) & 1) != 0)
                        crypto_cpu_caps_amd64 |= CRYPTO_CPU_CAPS_AMD64_SHA;
diff --git a/src/lib/libcrypto/arch/amd64/opensslconf.h b/src/lib/libcrypto/arch/amd64/opensslconf.h
index 868066c75e..c31bcc01ad 100644
--- a/src/lib/libcrypto/arch/amd64/opensslconf.h
+++ b/src/lib/libcrypto/arch/amd64/opensslconf.h
@@ -11,13 +11,3 @@
 #define OPENSSL_LINE __LINE__
 #endif
 #endif
-#if defined(HEADER_RC4_H)
-#if !defined(RC4_CHUNK)
-/*
- * This enables code handling data aligned at natural CPU word
- * boundary. See crypto/rc4/rc4_enc.c for further details.
- */
-#define RC4_CHUNK unsigned long
-#endif
-#endif
diff --git a/src/lib/libcrypto/arch/arm/opensslconf.h b/src/lib/libcrypto/arch/arm/opensslconf.h
index dcbe113864..c31bcc01ad 100644
--- a/src/lib/libcrypto/arch/arm/opensslconf.h
+++ b/src/lib/libcrypto/arch/arm/opensslconf.h
@@ -11,13 +11,3 @@
 #define OPENSSL_LINE __LINE__
 #endif
 #endif
-#if defined(HEADER_RC4_H)
-#if !defined(RC4_CHUNK)
-/*
- * This enables code handling data aligned at natural CPU word
- * boundary. See crypto/rc4/rc4_enc.c for further details.
- */
-#undef RC4_CHUNK
-#endif
-#endif
diff --git a/src/lib/libcrypto/arch/hppa/opensslconf.h b/src/lib/libcrypto/arch/hppa/opensslconf.h
index dcbe113864..c31bcc01ad 100644
--- a/src/lib/libcrypto/arch/hppa/opensslconf.h
+++ b/src/lib/libcrypto/arch/hppa/opensslconf.h
@@ -11,13 +11,3 @@
 #define OPENSSL_LINE __LINE__
 #endif
 #endif
-#if defined(HEADER_RC4_H)
-#if !defined(RC4_CHUNK)
-/*
- * This enables code handling data aligned at natural CPU word
- * boundary. See crypto/rc4/rc4_enc.c for further details.
- */
-#undef RC4_CHUNK
-#endif
-#endif
diff --git a/src/lib/libcrypto/arch/i386/opensslconf.h b/src/lib/libcrypto/arch/i386/opensslconf.h
index dcbe113864..c31bcc01ad 100644
--- a/src/lib/libcrypto/arch/i386/opensslconf.h
+++ b/src/lib/libcrypto/arch/i386/opensslconf.h
@@ -11,13 +11,3 @@
 #define OPENSSL_LINE __LINE__
 #endif
 #endif
-#if defined(HEADER_RC4_H)
-#if !defined(RC4_CHUNK)
-/*
- * This enables code handling data aligned at natural CPU word
- * boundary. See crypto/rc4/rc4_enc.c for further details.
- */
-#undef RC4_CHUNK
-#endif
-#endif
diff --git a/src/lib/libcrypto/arch/m88k/opensslconf.h b/src/lib/libcrypto/arch/m88k/opensslconf.h
index dcbe113864..c31bcc01ad 100644
--- a/src/lib/libcrypto/arch/m88k/opensslconf.h
+++ b/src/lib/libcrypto/arch/m88k/opensslconf.h
@@ -11,13 +11,3 @@
 #define OPENSSL_LINE __LINE__
 #endif
 #endif
-#if defined(HEADER_RC4_H)
-#if !defined(RC4_CHUNK)
-/*
- * This enables code handling data aligned at natural CPU word
- * boundary. See crypto/rc4/rc4_enc.c for further details.
- */
-#undef RC4_CHUNK
-#endif
-#endif
diff --git a/src/lib/libcrypto/arch/mips64/opensslconf.h b/src/lib/libcrypto/arch/mips64/opensslconf.h
index 868066c75e..c31bcc01ad 100644
--- a/src/lib/libcrypto/arch/mips64/opensslconf.h
+++ b/src/lib/libcrypto/arch/mips64/opensslconf.h
@@ -11,13 +11,3 @@
 #define OPENSSL_LINE __LINE__
 #endif
 #endif
-#if defined(HEADER_RC4_H)
-#if !defined(RC4_CHUNK)
-/*
- * This enables code handling data aligned at natural CPU word
- * boundary. See crypto/rc4/rc4_enc.c for further details.
- */
-#define RC4_CHUNK unsigned long
-#endif
-#endif
diff --git a/src/lib/libcrypto/arch/powerpc/opensslconf.h b/src/lib/libcrypto/arch/powerpc/opensslconf.h
index dcbe113864..c31bcc01ad 100644
--- a/src/lib/libcrypto/arch/powerpc/opensslconf.h
+++ b/src/lib/libcrypto/arch/powerpc/opensslconf.h
@@ -11,13 +11,3 @@
 #define OPENSSL_LINE __LINE__
 #endif
 #endif
-#if defined(HEADER_RC4_H)
-#if !defined(RC4_CHUNK)
-/*
- * This enables code handling data aligned at natural CPU word
- * boundary. See crypto/rc4/rc4_enc.c for further details.
- */
-#undef RC4_CHUNK
-#endif
-#endif
diff --git a/src/lib/libcrypto/arch/powerpc64/opensslconf.h b/src/lib/libcrypto/arch/powerpc64/opensslconf.h
index 868066c75e..c31bcc01ad 100644
--- a/src/lib/libcrypto/arch/powerpc64/opensslconf.h
+++ b/src/lib/libcrypto/arch/powerpc64/opensslconf.h
@@ -11,13 +11,3 @@
 #define OPENSSL_LINE __LINE__
 #endif
 #endif
-#if defined(HEADER_RC4_H)
-#if !defined(RC4_CHUNK)
-/*
- * This enables code handling data aligned at natural CPU word
- * boundary. See crypto/rc4/rc4_enc.c for further details.
- */
-#define RC4_CHUNK unsigned long
-#endif
-#endif
diff --git a/src/lib/libcrypto/arch/riscv64/opensslconf.h b/src/lib/libcrypto/arch/riscv64/opensslconf.h
index 868066c75e..c31bcc01ad 100644
--- a/src/lib/libcrypto/arch/riscv64/opensslconf.h
+++ b/src/lib/libcrypto/arch/riscv64/opensslconf.h
@@ -11,13 +11,3 @@
 #define OPENSSL_LINE __LINE__
 #endif
 #endif
-#if defined(HEADER_RC4_H)
-#if !defined(RC4_CHUNK)
-/*
- * This enables code handling data aligned at natural CPU word
- * boundary. See crypto/rc4/rc4_enc.c for further details.
- */
-#define RC4_CHUNK unsigned long
-#endif
-#endif
diff --git a/src/lib/libcrypto/arch/sh/opensslconf.h b/src/lib/libcrypto/arch/sh/opensslconf.h
index dcbe113864..c31bcc01ad 100644
--- a/src/lib/libcrypto/arch/sh/opensslconf.h
+++ b/src/lib/libcrypto/arch/sh/opensslconf.h
@@ -11,13 +11,3 @@
 #define OPENSSL_LINE __LINE__
 #endif
 #endif
-#if defined(HEADER_RC4_H)
-#if !defined(RC4_CHUNK)
-/*
- * This enables code handling data aligned at natural CPU word
- * boundary. See crypto/rc4/rc4_enc.c for further details.
- */
-#undef RC4_CHUNK
-#endif
-#endif
diff --git a/src/lib/libcrypto/arch/sparc64/opensslconf.h b/src/lib/libcrypto/arch/sparc64/opensslconf.h
index 868066c75e..c31bcc01ad 100644
--- a/src/lib/libcrypto/arch/sparc64/opensslconf.h
+++ b/src/lib/libcrypto/arch/sparc64/opensslconf.h
@@ -11,13 +11,3 @@
 #define OPENSSL_LINE __LINE__
 #endif
 #endif
-#if defined(HEADER_RC4_H)
-#if !defined(RC4_CHUNK)
-/*
- * This enables code handling data aligned at natural CPU word
- * boundary. See crypto/rc4/rc4_enc.c for further details.
- */
-#define RC4_CHUNK unsigned long
-#endif
-#endif
diff --git a/src/lib/libcrypto/asn1/tasn_fre.c b/src/lib/libcrypto/asn1/tasn_fre.c
index 0e259a13ab..c3de668483 100644
--- a/src/lib/libcrypto/asn1/tasn_fre.c
+++ b/src/lib/libcrypto/asn1/tasn_fre.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: tasn_fre.c,v 1.24 2024/12/11 11:22:06 tb Exp $ */
+/* $OpenBSD: tasn_fre.c,v 1.25 2025/08/14 19:02:17 tb Exp $ */
 /* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
 * project 2000.
 */
@@ -147,8 +147,9 @@ asn1_item_free(ASN1_VALUE **pval, const ASN1_ITEM *it)
                                return;
                }
                asn1_enc_cleanup(pval, it);
-                /* If we free up as normal we will invalidate any
+                /*
-                 * ANY DEFINED BY field and we wont be able to
+                 * If we free up as normal, we will invalidate any
+                 * ANY DEFINED BY field and we won't be able to
                 * determine the type of the field it defines. So
                 * free up in reverse order.
                 */
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_add.S b/src/lib/libcrypto/bn/arch/amd64/bignum_add.S
index 5fe4aae7a1..1d4e6d08ef 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_add.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_add.S
@@ -1,3 +1,5 @@
+// $OpenBSD: bignum_add.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
+//
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
@@ -16,9 +18,8 @@
 // Add, z := x + y
 // Inputs x[m], y[n]; outputs function return (carry-out) and z[p]
 //
-//    extern uint64_t bignum_add
+//    extern uint64_t bignum_add(uint64_t p, uint64_t *z, uint64_t m,
-//     (uint64_t p, uint64_t *z,
+//                               const uint64_t *x, uint64_t n, const uint64_t *y);
-//      uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
 //
 // Does the z := x + y operation, truncating modulo p words in general and
 // returning a top carry (0 or 1) in the p'th place, only adding the input
@@ -49,7 +50,7 @@
 S2N_BN_SYMBOL(bignum_add):
-        _CET_ENDBR
+        _CET_ENDBR
 #if WINDOWS_ABI
        push    rdi
@@ -75,7 +76,7 @@ S2N_BN_SYMBOL(bignum_add):
        cmp     p, n
        cmovc   n, p
        cmp     m, n
-        jc      ylonger
+        jc      bignum_add_ylonger
 // The case where x is longer or of the same size (p >= m >= n)
@@ -83,27 +84,27 @@ S2N_BN_SYMBOL(bignum_add):
        sub     m, n
        inc     m
        test    n, n
-        jz      xtest
+        jz      bignum_add_xtest
-xmainloop:
+bignum_add_xmainloop:
        mov     a, [x+8*i]
        adc     a, [y+8*i]
        mov     [z+8*i],a
        inc     i
        dec     n
-        jnz     xmainloop
+        jnz     bignum_add_xmainloop
-        jmp     xtest
+        jmp     bignum_add_xtest
-xtoploop:
+bignum_add_xtoploop:
        mov     a, [x+8*i]
        adc     a, 0
        mov     [z+8*i],a
        inc     i
-xtest:
+bignum_add_xtest:
        dec     m
-        jnz     xtoploop
+        jnz     bignum_add_xtoploop
        mov     ashort, 0
        adc     a, 0
        test    p, p
-        jnz     tails
+        jnz     bignum_add_tails
 #if WINDOWS_ABI
        pop    rsi
        pop    rdi
@@ -112,30 +113,30 @@ xtest:
 // The case where y is longer (p >= n > m)
-ylonger:
+bignum_add_ylonger:
        sub     p, n
        sub     n, m
        test    m, m
-        jz      ytoploop
+        jz      bignum_add_ytoploop
-ymainloop:
+bignum_add_ymainloop:
        mov     a, [x+8*i]
        adc     a, [y+8*i]
        mov     [z+8*i],a
        inc     i
        dec     m
-        jnz     ymainloop
+        jnz     bignum_add_ymainloop
-ytoploop:
+bignum_add_ytoploop:
        mov     a, [y+8*i]
        adc     a, 0
        mov     [z+8*i],a
        inc     i
        dec     n
-        jnz     ytoploop
+        jnz     bignum_add_ytoploop
        mov     ashort, 0
        adc     a, 0
        test    p, p
-        jnz     tails
+        jnz     bignum_add_tails
 #if WINDOWS_ABI
        pop    rsi
        pop    rdi
@@ -144,16 +145,16 @@ ytoploop:
 // Adding a non-trivial tail, when p > max(m,n)
-tails:
+bignum_add_tails:
        mov     [z+8*i],a
        xor     a, a
-        jmp     tail
+        jmp     bignum_add_tail
-tailloop:
+bignum_add_tailloop:
        mov     [z+8*i],a
-tail:
+bignum_add_tail:
        inc     i
        dec     p
-        jnz     tailloop
+        jnz     bignum_add_tailloop
 #if WINDOWS_ABI
        pop    rsi
        pop    rdi
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_cmadd.S b/src/lib/libcrypto/bn/arch/amd64/bignum_cmadd.S
index 25ba17bce2..a611919603 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_cmadd.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_cmadd.S
@@ -1,3 +1,5 @@
+// $OpenBSD: bignum_cmadd.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
+//
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
@@ -16,8 +18,8 @@
 // Multiply-add with single-word multiplier, z := z + c * y
 // Inputs c, y[n]; outputs function return (carry-out) and z[k]
 //
-//    extern uint64_t bignum_cmadd
+//    extern uint64_t bignum_cmadd(uint64_t k, uint64_t *z, uint64_t c, uint64_t n,
-//     (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y);
+//                                 const uint64_t *y);
 //
 // Does the "z := z + c * y" operation where y is n digits, result z is p.
 // Truncates the result in general.
@@ -54,7 +56,7 @@
 S2N_BN_SYMBOL(bignum_cmadd):
-        _CET_ENDBR
+        _CET_ENDBR
 #if WINDOWS_ABI
        push    rdi
@@ -82,7 +84,7 @@ S2N_BN_SYMBOL(bignum_cmadd):
        xor     h, h
        test    n, n
-        jz      end
+        jz      bignum_cmadd_end
 // Move c into a safer register as multiplies overwrite rdx
@@ -96,11 +98,11 @@ S2N_BN_SYMBOL(bignum_cmadd):
        mov     h, rdx
        mov     ishort, 1
        dec     n
-        jz      hightail
+        jz      bignum_cmadd_hightail
 // Main loop, where we always have CF + previous high part h to add in
-loop:
+bignum_cmadd_loop:
        adc     h, [z+8*i]
        sbb     r, r
        mov     rax, [x+8*i]
@@ -111,36 +113,36 @@ loop:
        mov     h, rdx
        inc     i
        dec     n
-        jnz     loop
+        jnz     bignum_cmadd_loop
-hightail:
+bignum_cmadd_hightail:
        adc     h, 0
 // Propagate the carry all the way to the end with h as extra carry word
-tail:
+bignum_cmadd_tail:
        test    p, p
-        jz      end
+        jz      bignum_cmadd_end
        add     [z+8*i], h
        mov     hshort, 0
        inc     i
        dec     p
-        jz      highend
+        jz      bignum_cmadd_highend
-tloop:
+bignum_cmadd_tloop:
        adc     [z+8*i], h
        inc     i
        dec     p
-        jnz     tloop
+        jnz     bignum_cmadd_tloop
-highend:
+bignum_cmadd_highend:
        adc     h, 0
 // Return the high/carry word
-end:
+bignum_cmadd_end:
        mov     rax, h
        pop     rbx
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_cmul.S b/src/lib/libcrypto/bn/arch/amd64/bignum_cmul.S
index 12f785d63a..eb71d9da44 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_cmul.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_cmul.S
@@ -1,3 +1,5 @@
+// $OpenBSD: bignum_cmul.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
+//
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
@@ -16,8 +18,8 @@
 // Multiply by a single word, z := c * y
 // Inputs c, y[n]; outputs function return (carry-out) and z[k]
 //
-//    extern uint64_t bignum_cmul
+//    extern uint64_t bignum_cmul(uint64_t k, uint64_t *z, uint64_t c, uint64_t n,
-//     (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y);
+//                                const uint64_t *y);
 //
 // Does the "z := c * y" operation where y is n digits, result z is p.
 // Truncates the result in general unless p >= n + 1.
@@ -51,7 +53,7 @@
 S2N_BN_SYMBOL(bignum_cmul):
-        _CET_ENDBR
+        _CET_ENDBR
 #if WINDOWS_ABI
        push    rdi
@@ -76,7 +78,7 @@ S2N_BN_SYMBOL(bignum_cmul):
        xor     h, h
        xor     i, i
        test    n, n
-        jz      tail
+        jz      bignum_cmul_tail
 // Move c into a safer register as multiplies overwrite rdx
@@ -90,11 +92,11 @@ S2N_BN_SYMBOL(bignum_cmul):
        mov     h, rdx
        inc     i
        cmp     i, n
-        jz      tail
+        jz      bignum_cmul_tail
 // Main loop doing the multiplications
-loop:
+bignum_cmul_loop:
        mov     rax, [x+8*i]
        mul     c
        add     rax, h
@@ -103,28 +105,28 @@ loop:
        mov     h, rdx
        inc     i
        cmp     i, n
-        jc      loop
+        jc      bignum_cmul_loop
 // Add a tail when the destination is longer
-tail:
+bignum_cmul_tail:
        cmp     i, p
-        jnc     end
+        jnc     bignum_cmul_end
        mov     [z+8*i], h
        xor     h, h
        inc     i
        cmp     i, p
-        jnc     end
+        jnc     bignum_cmul_end
-tloop:
+bignum_cmul_tloop:
        mov     [z+8*i], h
        inc     i
        cmp     i, p
-        jc      tloop
+        jc      bignum_cmul_tloop
 // Return the high/carry word
-end:
+bignum_cmul_end:
        mov     rax, h
 #if WINDOWS_ABI
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_modadd.S b/src/lib/libcrypto/bn/arch/amd64/bignum_modadd.S
new file mode 100644
index 0000000000..baf27fdc7f
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_modadd.S
@@ -0,0 +1,112 @@
+// $OpenBSD: bignum_modadd.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
+//
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+// ----------------------------------------------------------------------------
+// Add modulo m, z := (x + y) mod m, assuming x and y reduced
+// Inputs x[k], y[k], m[k]; output z[k]
+//
+//    extern void bignum_modadd(uint64_t k, uint64_t *z, const uint64_t *x,
+//                              const uint64_t *y, const uint64_t *m);
+//
+// Standard x86-64 ABI: RDI = k, RSI = z, RDX = x, RCX = y, R8 = m
+// Microsoft x64 ABI:   RCX = k, RDX = z, R8 = x, R9 = y, [RSP+40] = m
+// ----------------------------------------------------------------------------
+#include "s2n_bignum_internal.h"
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_modadd)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_modadd)
+        .text
+#define k rdi
+#define z rsi
+#define x rdx
+#define y rcx
+#define m r8
+#define i r9
+#define j r10
+#define a rax
+#define c r11
+S2N_BN_SYMBOL(bignum_modadd):
+        _CET_ENDBR
+#if WINDOWS_ABI
+        push    rdi
+        push    rsi
+        mov     rdi, rcx
+        mov     rsi, rdx
+        mov     rdx, r8
+        mov     rcx, r9
+        mov     r8, [rsp+56]
+#endif
+// If k = 0 do nothing
+        test    k, k
+        jz      bignum_modadd_end
+// First just add (c::z) := x + y
+        xor     c, c
+        mov     j, k
+        xor     i, i
+bignum_modadd_addloop:
+        mov     a, [x+8*i]
+        adc     a, [y+8*i]
+        mov     [z+8*i], a
+        inc     i
+        dec     j
+        jnz     bignum_modadd_addloop
+        adc     c, 0
+// Now do a comparison subtraction (c::z) - m, recording mask for (c::z) >= m
+        mov     j, k
+        xor     i, i
+bignum_modadd_cmploop:
+        mov     a, [z+8*i]
+        sbb     a, [m+8*i]
+        inc     i
+        dec     j
+        jnz     bignum_modadd_cmploop
+        sbb     c, 0
+        not     c
+// Now do a masked subtraction z := z - [c] * m
+        xor     i, i
+bignum_modadd_subloop:
+        mov     a, [m+8*i]
+        and     a, c
+        neg     j
+        sbb     [z+8*i], a
+        sbb     j, j
+        inc     i
+        cmp     i, k
+        jc      bignum_modadd_subloop
+bignum_modadd_end:
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_modsub.S b/src/lib/libcrypto/bn/arch/amd64/bignum_modsub.S
new file mode 100644
index 0000000000..63b3230e35
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_modsub.S
@@ -0,0 +1,99 @@
+// $OpenBSD: bignum_modsub.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
+//
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+// ----------------------------------------------------------------------------
+// Subtract modulo m, z := (x - y) mod m, assuming x and y reduced
+// Inputs x[k], y[k], m[k]; output z[k]
+//
+//    extern void bignum_modsub(uint64_t k, uint64_t *z, const uint64_t *x,
+//                              const uint64_t *y, const uint64_t *m);
+//
+// Standard x86-64 ABI: RDI = k, RSI = z, RDX = x, RCX = y, R8 = m
+// Microsoft x64 ABI:   RCX = k, RDX = z, R8 = x, R9 = y, [RSP+40] = m
+// ----------------------------------------------------------------------------
+#include "s2n_bignum_internal.h"
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_modsub)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_modsub)
+        .text
+#define k rdi
+#define z rsi
+#define x rdx
+#define y rcx
+#define m r8
+#define i r9
+#define j r10
+#define a rax
+#define c r11
+S2N_BN_SYMBOL(bignum_modsub):
+        _CET_ENDBR
+#if WINDOWS_ABI
+        push    rdi
+        push    rsi
+        mov     rdi, rcx
+        mov     rsi, rdx
+        mov     rdx, r8
+        mov     rcx, r9
+        mov     r8, [rsp+56]
+#endif
+// If k = 0 do nothing
+        test    k, k
+        jz      bignum_modsub_end
+// Subtract z := x - y and record a mask for the carry x - y < 0
+        xor     c, c
+        mov     j, k
+        xor     i, i
+bignum_modsub_subloop:
+        mov     a, [x+8*i]
+        sbb     a, [y+8*i]
+        mov     [z+8*i], a
+        inc     i
+        dec     j
+        jnz     bignum_modsub_subloop
+        sbb     c, c
+// Now do a masked addition z := z + [c] * m
+        xor     i, i
+bignum_modsub_addloop:
+        mov     a, [m+8*i]
+        and     a, c
+        neg     j
+        adc     [z+8*i], a
+        sbb     j, j
+        inc     i
+        cmp     i, k
+        jc      bignum_modsub_addloop
+bignum_modsub_end:
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul.S
index a3552679a2..538cce9af7 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_mul.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul.S
@@ -1,3 +1,5 @@
+// $OpenBSD: bignum_mul.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
+//
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
@@ -16,9 +18,8 @@
 // Multiply z := x * y
 // Inputs x[m], y[n]; output z[k]
 //
-//    extern void bignum_mul
+//    extern void bignum_mul(uint64_t k, uint64_t *z, uint64_t m, const uint64_t *x,
-//     (uint64_t k, uint64_t *z,
+//                           uint64_t n, const uint64_t *y);
-//      uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
 //
 // Does the "z := x * y" operation where x is m digits, y is n, result z is k.
 // Truncates the result in general unless k >= m + n
@@ -59,7 +60,7 @@
 S2N_BN_SYMBOL(bignum_mul):
-        _CET_ENDBR
+        _CET_ENDBR
 #if WINDOWS_ABI
        push    rdi
@@ -88,7 +89,7 @@ S2N_BN_SYMBOL(bignum_mul):
 // If we did a multiply-add variant, however, then we could
        test    p, p
-        jz      end
+        jz      bignum_mul_end
 // Set initial 2-part sum to zero (we zero c inside the body)
@@ -99,7 +100,7 @@ S2N_BN_SYMBOL(bignum_mul):
        xor     k, k
-outerloop:
+bignum_mul_outerloop:
 // Zero our carry term first; we eventually want it and a zero is useful now
 // Set a =  max 0 (k + 1 - n), i = min (k + 1) m
@@ -125,11 +126,11 @@ outerloop:
        mov     d, k
        sub     d, i
        sub     i, a
-        jbe     innerend
+        jbe     bignum_mul_innerend
        lea     x,[rcx+8*a]
        lea     y,[r9+8*d-8]
-innerloop:
+bignum_mul_innerloop:
        mov     rax, [y+8*i]
        mul     QWORD PTR  [x]
        add     x, 8
@@ -137,9 +138,9 @@ innerloop:
        adc     h, rdx
        adc     c, 0
        dec     i
-        jnz     innerloop
+        jnz     bignum_mul_innerloop
-innerend:
+bignum_mul_innerend:
        mov     [z], l
        mov     l, h
@@ -147,9 +148,9 @@ innerend:
        add     z, 8
        cmp     k, p
-        jc      outerloop
+        jc      bignum_mul_outerloop
-end:
+bignum_mul_end:
        pop     r15
        pop     r14
        pop     r13
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8.S
new file mode 100644
index 0000000000..d6ad514020
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8.S
@@ -0,0 +1,187 @@
+// $OpenBSD: bignum_mul_4_8.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
+//
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+// ----------------------------------------------------------------------------
+// Multiply z := x * y
+// Inputs x[4], y[4]; output z[8]
+//
+//    extern void bignum_mul_4_8(uint64_t z[static 8], const uint64_t x[static 4],
+//                               const uint64_t y[static 4]);
+//
+// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
+// Microsoft x64 ABI:   RCX = z, RDX = x, R8 = y
+// ----------------------------------------------------------------------------
+#include "s2n_bignum_internal.h"
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_4_8)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_4_8)
+        .text
+// z and x are used in their argument registers (after any WINDOWS_ABI moves)
+#define z rdi
+#define x rsi
+// y arrives in rdx but is copied here, as rdx holds the mulx multiplier
+#define y rcx
+// A zero register
+#define zero rbp
+#define zeroe ebp
+// Add in x[i] * rdx to the (i,i+1) position with the register window
+// Would be nice to have conditional expressions reg[i], reg[i+1] ...
+.macro mulpadd arg1,arg2
+        mulx    rbx, rax, [x+8*\arg2]
+.if ((\arg1 + \arg2) % 4 == 0)
+        adcx    r8, rax
+        adox    r9, rbx
+.elseif ((\arg1 + \arg2) % 4 == 1)
+        adcx    r9, rax
+        adox    r10, rbx
+.elseif ((\arg1 + \arg2) % 4 == 2)
+        adcx    r10, rax
+        adox    r11, rbx
+.elseif ((\arg1 + \arg2) % 4 == 3)
+        adcx    r11, rax
+        adox    r8, rbx
+.endif
+.endm
+// Add in the whole row y[arg1] * x, storing the finished word z[arg1]
+.macro addrow arg1
+        mov     rdx, [y+8*\arg1]
+        xor     zeroe, zeroe
+        mulpadd \arg1, 0
+.if (\arg1 % 4 == 0)
+        mov     [z+8*\arg1],r8
+.elseif (\arg1 % 4 == 1)
+        mov     [z+8*\arg1],r9
+.elseif (\arg1 % 4 == 2)
+        mov     [z+8*\arg1],r10
+.elseif (\arg1 % 4 == 3)
+        mov     [z+8*\arg1],r11
+.endif
+        mulpadd \arg1, 1
+        mulpadd \arg1, 2
+.if (\arg1 % 4 == 0)
+        mulx    r8, rax, [x+24]
+        adcx    r11, rax
+        adox    r8, zero
+        adcx    r8, zero
+.elseif (\arg1 % 4 == 1)
+        mulx    r9, rax, [x+24]
+        adcx    r8, rax
+        adox    r9, zero
+        adcx    r9, zero
+.elseif (\arg1 % 4 == 2)
+        mulx    r10, rax, [x+24]
+        adcx    r9, rax
+        adox    r10, zero
+        adcx    r10, zero
+.elseif (\arg1 % 4 == 3)
+        mulx    r11, rax, [x+24]
+        adcx    r10, rax
+        adox    r11, zero
+        adcx    r11, zero
+.endif
+.endm
+S2N_BN_SYMBOL(bignum_mul_4_8):
+        _CET_ENDBR
+#if WINDOWS_ABI
+        push    rdi
+        push    rsi
+        mov     rdi, rcx
+        mov     rsi, rdx
+        mov     rdx, r8
+#endif
+// Save more registers to play with
+        push    rbp
+        push    rbx
+// Copy y into a safe register to start with
+        mov     y, rdx
+// Zero a register, which also makes sure we don't get a fake carry-in
+        xor     zeroe, zeroe
+// Do the zeroth row, which is a bit different
+// Write back the zero-zero product z[0], then accumulate words 1..4 of
+// y[0] * x in r9,r10,r11,r8 respectively (r8 is reused for the top word)
+        mov     rdx, [y]
+        mulx    r9, r8, [x]
+        mov     [z], r8
+        mulx    r10, rbx, [x+8]
+        adcx    r9, rbx
+        mulx    r11, rbx, [x+16]
+        adcx    r10, rbx
+        mulx    r8, rbx, [x+24]
+        adcx    r11, rbx
+        adcx    r8, zero
+// Now all the other rows in a uniform pattern
+        addrow  1
+        addrow  2
+        addrow  3
+// Now write back the additional columns
+        mov     [z+32], r8
+        mov     [z+40], r9
+        mov     [z+48], r10
+        mov     [z+56], r11
+// Restore registers and return
+        pop     rbx
+        pop     rbp
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8_alt.S
index 70ff69e372..2592d1d658 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8_alt.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8_alt.S
@@ -1,3 +1,5 @@
+// $OpenBSD: bignum_mul_4_8_alt.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
+//
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
@@ -16,8 +18,8 @@
 // Multiply z := x * y
 // Inputs x[4], y[4]; output z[8]
 //
-//    extern void bignum_mul_4_8_alt
+//    extern void bignum_mul_4_8_alt(uint64_t z[static 8], const uint64_t x[static 4],
-//      (uint64_t z[static 8], uint64_t x[static 4], uint64_t y[static 4]);
+//                                   const uint64_t y[static 4]);
 //
 // Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
 // Microsoft x64 ABI:   RCX = z, RDX = x, R8 = y
@@ -72,7 +74,7 @@
        adc     h, rdx
 S2N_BN_SYMBOL(bignum_mul_4_8_alt):
-        _CET_ENDBR
+        _CET_ENDBR
 #if WINDOWS_ABI
        push    rdi
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12.S
new file mode 100644
index 0000000000..56cbdf06e0
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12.S
@@ -0,0 +1,223 @@
+// $OpenBSD: bignum_mul_6_12.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
+//
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+// ----------------------------------------------------------------------------
+// Multiply z := x * y
+// Inputs x[6], y[6]; output z[12]
+//
+//    extern void bignum_mul_6_12(uint64_t z[static 12], const uint64_t x[static 6],
+//                                const uint64_t y[static 6]);
+//
+// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
+// Microsoft x64 ABI:   RCX = z, RDX = x, R8 = y
+// ----------------------------------------------------------------------------
+#include "s2n_bignum_internal.h"
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_6_12)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_6_12)
+        .text
+// z and x stay in rdi and rsi, the standard-ABI argument registers noted above
+#define z rdi
+#define x rsi
+// y is copied here from rdx by the prologue, as rdx is then loaded with y[j]
+#define y rcx
+// A zero register
+#define zero rbp
+#define zeroe ebp
+// mulpadd j, i: add x[i] * rdx into the rotating window r8..r13, the low word
+// going to slot (j+i) mod 6 and the high word to the slot after it
+.macro mulpadd arg1,arg2
+        mulx    rbx, rax, [x+8*\arg2]  // rbx:rax = x[arg2] * rdx (high:low)
+.if ((\arg1 + \arg2) % 6 == 0)
+        adcx    r8, rax  // low word is added on the CF carry chain
+        adox    r9, rbx  // high word is added on the OF carry chain
+.elseif ((\arg1 + \arg2) % 6 == 1)
+        adcx    r9, rax
+        adox    r10, rbx
+.elseif ((\arg1 + \arg2) % 6 == 2)
+        adcx    r10, rax
+        adox    r11, rbx
+.elseif ((\arg1 + \arg2) % 6 == 3)
+        adcx    r11, rax
+        adox    r12, rbx
+.elseif ((\arg1 + \arg2) % 6 == 4)
+        adcx    r12, rax
+        adox    r13, rbx
+.elseif ((\arg1 + \arg2) % 6 == 5)
+        adcx    r13, rax
+        adox    r8, rbx  // the window wraps from r13 back round to r8
+.endif
+.endm
+// addrow j: add x * y[j] into the window and store the finished word to z[j]
+.macro addrow arg1
+        mov     rdx, [y+8*\arg1]  // rdx = y[j], the implicit mulx operand
+        xor     zeroe, zeroe  // zero = 0; also clears CF and OF for both chains
+        mulpadd \arg1, 0
+.if (\arg1 % 6 == 0)
+        mov     [z+8*\arg1],r8  // slot j has just received its low word: store it
+.elseif (\arg1 % 6 == 1)
+        mov     [z+8*\arg1],r9
+.elseif (\arg1 % 6 == 2)
+        mov     [z+8*\arg1],r10
+.elseif (\arg1 % 6 == 3)
+        mov     [z+8*\arg1],r11
+.elseif (\arg1 % 6 == 4)
+        mov     [z+8*\arg1],r12
+.elseif (\arg1 % 6 == 5)
+        mov     [z+8*\arg1],r13
+.endif
+        mulpadd \arg1, 1
+        mulpadd \arg1, 2
+        mulpadd \arg1, 3
+        mulpadd \arg1, 4
+.if (\arg1 % 6 == 0)
+        mulx    r8, rax, [x+40]  // x[5] * y[j]: high word overwrites the stored slot
+        adcx    r13, rax
+        adox    r8, zero  // absorb the pending OF carry into the new top word
+        adcx    r8, zero  // absorb the pending CF carry into the new top word
+.elseif (\arg1 % 6 == 1)
+        mulx    r9, rax, [x+40]
+        adcx    r8, rax
+        adox    r9, zero
+        adcx    r9, zero
+.elseif (\arg1 % 6 == 2)
+        mulx    r10, rax, [x+40]
+        adcx    r9, rax
+        adox    r10, zero
+        adcx    r10, zero
+.elseif (\arg1 % 6 == 3)
+        mulx    r11, rax, [x+40]
+        adcx    r10, rax
+        adox    r11, zero
+        adcx    r11, zero
+.elseif (\arg1 % 6 == 4)
+        mulx    r12, rax, [x+40]
+        adcx    r11, rax
+        adox    r12, zero
+        adcx    r12, zero
+.elseif (\arg1 % 6 == 5)
+        mulx    r13, rax, [x+40]
+        adcx    r12, rax
+        adox    r13, zero
+        adcx    r13, zero
+.endif
+.endm
+S2N_BN_SYMBOL(bignum_mul_6_12):
+        _CET_ENDBR
+#if WINDOWS_ABI
+        push    rdi  // rdi/rsi are saved here and restored before ret
+        push    rsi
+        mov     rdi, rcx  // move z, x, y into rdi, rsi, rdx
+        mov     rsi, rdx
+        mov     rdx, r8
+#endif
+// Save rbp (the zero register), rbx (scratch) and r12, r13 (window words)
+        push    rbp
+        push    rbx
+        push    r12
+        push    r13
+// Move the y pointer from rdx to rcx; rdx is loaded with each y[j] in turn
+        mov     y, rdx
+// Zero a register; the xor also clears CF so the adcx chain has no carry-in
+        xor     zeroe, zeroe
+// Row 0 is handled separately from the later rows: store the low word of
+// x[0] * y[0] to z[0], then accumulate the rest of y[0] * x into the
+// window r9,r10,r11,r12,r13,r8 (result words 1..6, in that order)
+        mov     rdx, [y]
+        mulx    r9, r8, [x]
+        mov     [z], r8  // z[0] is final; r8 is reused as the window top below
+        mulx    r10, rbx, [x+8]
+        adcx    r9, rbx
+        mulx    r11, rbx, [x+16]
+        adcx    r10, rbx
+        mulx    r12, rbx, [x+24]
+        adcx    r11, rbx
+        mulx    r13, rbx, [x+32]
+        adcx    r12, rbx
+        mulx    r8, rbx, [x+40]
+        adcx    r13, rbx
+        adcx    r8, zero  // fold the last carry into the top window word
+// Rows 1..5: each addrow adds x * y[j] and stores the completed word z[j]
+        addrow  1
+        addrow  2
+        addrow  3
+        addrow  4
+        addrow  5
+// Write back the words still held in the window as z[6]..z[11]
+        mov     [z+48], r8
+        mov     [z+56], r9
+        mov     [z+64], r10
+        mov     [z+72], r11
+        mov     [z+80], r12
+        mov     [z+88], r13
+// Restore registers (reverse of the push order) and return
+        pop     r13
+        pop     r12
+        pop     rbx
+        pop     rbp
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12_alt.S
new file mode 100644
index 0000000000..077c52b38e
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12_alt.S
@@ -0,0 +1,199 @@
+// $OpenBSD: bignum_mul_6_12_alt.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
+//
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+// ----------------------------------------------------------------------------
+// Multiply z := x * y
+// Inputs x[6], y[6]; output z[12]
+//
+//    extern void bignum_mul_6_12_alt(uint64_t z[static 12],
+//                                    const uint64_t x[static 6],
+//                                    const uint64_t y[static 6]);
+//
+// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
+// Microsoft x64 ABI:   RCX = z, RDX = x, R8 = y
+// ----------------------------------------------------------------------------
+#include "s2n_bignum_internal.h"
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_6_12_alt)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_6_12_alt)
+        .text
+// z and x stay in rdi and rsi, the standard-ABI argument registers noted above
+#define z rdi
+#define x rsi
+// This is moved from rdx to free it for muls
+#define y rcx
+// Other variables used as a rotating 3-word window to add terms to
+#define t0 r8
+#define t1 r9
+#define t2 r10
+// combadd: (c,h,l) += numa * numb on a 3-word accumulator, l lowest; uses rax, rdx
+#define combadd(c,h,l,numa,numb)                \
+        mov     rax, numa;                      \
+        mul     QWORD PTR numb;                 \
+        add     l, rax;                         \
+        adc     h, rdx;                         \
+        adc     c, 0
+// combadz: as combadd, but assumes c = 0 on entry so that "adc c, c" yields just the carry
+#define combadz(c,h,l,numa,numb)                \
+        mov     rax, numa;                      \
+        mul     QWORD PTR numb;                 \
+        add     l, rax;                         \
+        adc     h, rdx;                         \
+        adc     c, c
+// combads: two-word form (h,l) += numa * numb, for use where no carry out of h is expected
+#define combads(h,l,numa,numb)                  \
+        mov     rax, numa;                      \
+        mul     QWORD PTR numb;                 \
+        add     l, rax;                         \
+        adc     h, rdx
+S2N_BN_SYMBOL(bignum_mul_6_12_alt):
+        _CET_ENDBR
+#if WINDOWS_ABI
+        push    rdi  // rdi/rsi are saved here and restored before ret
+        push    rsi
+        mov     rdi, rcx  // move z, x, y into rdi, rsi, rdx
+        mov     rsi, rdx
+        mov     rdx, r8
+#endif
+// Move the y pointer from rdx to rcx, since every mul below overwrites rdx
+        mov     y, rdx
+// Result term 0: z[0] = low word of x[0] * y[0]; the high word seeds t0
+        mov     rax, [x]
+        mul     QWORD PTR [y]
+        mov     [z], rax
+        mov     t0, rdx
+        xor     t1, t1
+// Result term 1: x[0]*y[1] + x[1]*y[0] added to the carried-in window
+        xor     t2, t2
+        combads(t1,t0,[x],[y+8])
+        combadz(t2,t1,t0,[x+8],[y])
+        mov     [z+8], t0
+// Result term 2: from here on the just-stored register is zeroed and reused as carry word
+        xor     t0, t0
+        combadz(t0,t2,t1,[x],[y+16])
+        combadd(t0,t2,t1,[x+8],[y+8])
+        combadd(t0,t2,t1,[x+16],[y])
+        mov     [z+16], t1
+// Result term 3: all products x[i]*y[j] with i + j = 3
+        xor     t1, t1
+        combadz(t1,t0,t2,[x],[y+24])
+        combadd(t1,t0,t2,[x+8],[y+16])
+        combadd(t1,t0,t2,[x+16],[y+8])
+        combadd(t1,t0,t2,[x+24],[y])
+        mov     [z+24], t2
+// Result term 4: all products x[i]*y[j] with i + j = 4
+        xor     t2, t2
+        combadz(t2,t1,t0,[x],[y+32])
+        combadd(t2,t1,t0,[x+8],[y+24])
+        combadd(t2,t1,t0,[x+16],[y+16])
+        combadd(t2,t1,t0,[x+24],[y+8])
+        combadd(t2,t1,t0,[x+32],[y])
+        mov     [z+32], t0
+// Result term 5: all products x[i]*y[j] with i + j = 5 (the longest column)
+        xor     t0, t0
+        combadz(t0,t2,t1,[x],[y+40])
+        combadd(t0,t2,t1,[x+8],[y+32])
+        combadd(t0,t2,t1,[x+16],[y+24])
+        combadd(t0,t2,t1,[x+24],[y+16])
+        combadd(t0,t2,t1,[x+32],[y+8])
+        combadd(t0,t2,t1,[x+40],[y])
+        mov     [z+40], t1
+// Result term 6: all products x[i]*y[j] with i + j = 6 (i now starts at 1)
+        xor     t1, t1
+        combadz(t1,t0,t2,[x+8],[y+40])
+        combadd(t1,t0,t2,[x+16],[y+32])
+        combadd(t1,t0,t2,[x+24],[y+24])
+        combadd(t1,t0,t2,[x+32],[y+16])
+        combadd(t1,t0,t2,[x+40],[y+8])
+        mov     [z+48], t2
+// Result term 7: all products x[i]*y[j] with i + j = 7
+        xor     t2, t2
+        combadz(t2,t1,t0,[x+16],[y+40])
+        combadd(t2,t1,t0,[x+24],[y+32])
+        combadd(t2,t1,t0,[x+32],[y+24])
+        combadd(t2,t1,t0,[x+40],[y+16])
+        mov     [z+56], t0
+// Result term 8: all products x[i]*y[j] with i + j = 8
+        xor     t0, t0
+        combadz(t0,t2,t1,[x+24],[y+40])
+        combadd(t0,t2,t1,[x+32],[y+32])
+        combadd(t0,t2,t1,[x+40],[y+24])
+        mov     [z+64], t1
+// Result term 9: x[4]*y[5] + x[5]*y[4]
+        xor     t1, t1
+        combadz(t1,t0,t2,[x+32],[y+40])
+        combadd(t1,t0,t2,[x+40],[y+32])
+        mov     [z+72], t2
+// Result term 10: the single product x[5]*y[5], added in two-word form
+        combads(t1,t0,[x+40],[y+40])
+        mov     [z+80], t0
+// Result term 11: the remaining high word of the accumulator
+        mov     [z+88], t1
+// Return
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16.S
new file mode 100644
index 0000000000..faa0196d8e
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16.S
@@ -0,0 +1,273 @@
+// $OpenBSD: bignum_mul_8_16.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
+//
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+// ----------------------------------------------------------------------------
+// Multiply z := x * y
+// Inputs x[8], y[8]; output z[16]
+//
+//    extern void bignum_mul_8_16(uint64_t z[static 16], const uint64_t x[static 8],
+//                                const uint64_t y[static 8]);
+//
+// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
+// Microsoft x64 ABI:   RCX = z, RDX = x, R8 = y
+// ----------------------------------------------------------------------------
+#include "s2n_bignum_internal.h"
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_8_16)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_8_16)
+        .text
+// z and x stay in rdi and rsi, the standard-ABI argument registers noted above
+#define z rdi
+#define x rsi
+// y is copied here from rdx by the prologue, as rdx is then loaded with y[j]
+#define y rcx
+// A zero register
+#define zero rbp
+#define zeroe ebp
+// mulpadd i, j: add x[i] * rdx (rdx assumed = y[j]) into window slots (i+j) mod 8 and the next
+.macro mulpadd arg1,arg2
+        mulx    rbx, rax, [x+8*\arg1]  // rbx:rax = x[arg1] * rdx (high:low)
+.if ((\arg1 + \arg2) % 8 == 0)
+        adcx    r8, rax  // low word is added on the CF carry chain
+        adox    r9, rbx  // high word is added on the OF carry chain
+.elseif ((\arg1 + \arg2) % 8 == 1)
+        adcx    r9, rax
+        adox    r10, rbx
+.elseif ((\arg1 + \arg2) % 8 == 2)
+        adcx    r10, rax
+        adox    r11, rbx
+.elseif ((\arg1 + \arg2) % 8 == 3)
+        adcx    r11, rax
+        adox    r12, rbx
+.elseif ((\arg1 + \arg2) % 8 == 4)
+        adcx    r12, rax
+        adox    r13, rbx
+.elseif ((\arg1 + \arg2) % 8 == 5)
+        adcx    r13, rax
+        adox    r14, rbx
+.elseif ((\arg1 + \arg2) % 8 == 6)
+        adcx    r14, rax
+        adox    r15, rbx
+.elseif ((\arg1 + \arg2) % 8 == 7)
+        adcx    r15, rax
+        adox    r8, rbx  // the window wraps from r15 back round to r8
+.endif
+.endm
+// mulpade i, j: like mulpadd, but the upper slot is overwritten by the high
+// product word (plus the OF carry) instead of being accumulated into
+.macro mulpade arg1,arg2
+.if ((\arg1 + \arg2) % 8 == 0)
+        mulx    r9, rax, [x+8*\arg1]  // high word lands directly in the next slot
+        adcx    r8, rax  // low word is added on the CF carry chain
+        adox    r9, zero  // absorb the pending OF carry into the fresh top word
+.elseif ((\arg1 + \arg2) % 8 == 1)
+        mulx    r10, rax, [x+8*\arg1]
+        adcx    r9, rax
+        adox    r10, zero
+.elseif ((\arg1 + \arg2) % 8 == 2)
+        mulx    r11, rax, [x+8*\arg1]
+        adcx    r10, rax
+        adox    r11, zero
+.elseif ((\arg1 + \arg2) % 8 == 3)
+        mulx    r12, rax, [x+8*\arg1]
+        adcx    r11, rax
+        adox    r12, zero
+.elseif ((\arg1 + \arg2) % 8 == 4)
+        mulx    r13, rax, [x+8*\arg1]
+        adcx    r12, rax
+        adox    r13, zero
+.elseif ((\arg1 + \arg2) % 8 == 5)
+        mulx    r14, rax, [x+8*\arg1]
+        adcx    r13, rax
+        adox    r14, zero
+.elseif ((\arg1 + \arg2) % 8 == 6)
+        mulx    r15, rax, [x+8*\arg1]
+        adcx    r14, rax
+        adox    r15, zero
+.elseif ((\arg1 + \arg2) % 8 == 7)
+        mulx    r8, rax, [x+8*\arg1]
+        adcx    r15, rax
+        adox    r8, zero
+.endif
+.endm
+// addrow j: add x * y[j] into the window and store the finished word to z[j]
+.macro addrow arg1
+        mov     rdx, [y+8*\arg1]  // rdx = y[j], the implicit mulx operand
+        xor     zeroe, zeroe  // zero = 0; also clears CF and OF for both chains
+        mulpadd 0, \arg1
+.if (\arg1 % 8 == 0)
+        mov     [z+8*\arg1],r8  // slot j has just received its low word: store it
+.elseif (\arg1 % 8 == 1)
+        mov     [z+8*\arg1],r9
+.elseif (\arg1 % 8 == 2)
+        mov     [z+8*\arg1],r10
+.elseif (\arg1 % 8 == 3)
+        mov     [z+8*\arg1],r11
+.elseif (\arg1 % 8 == 4)
+        mov     [z+8*\arg1],r12
+.elseif (\arg1 % 8 == 5)
+        mov     [z+8*\arg1],r13
+.elseif (\arg1 % 8 == 6)
+        mov     [z+8*\arg1],r14
+.elseif (\arg1 % 8 == 7)
+        mov     [z+8*\arg1],r15
+.endif
+        mulpadd 1, \arg1
+        mulpadd 2, \arg1
+        mulpadd 3, \arg1
+        mulpadd 4, \arg1
+        mulpadd 5, \arg1
+        mulpadd 6, \arg1
+        mulpade 7, \arg1  // last product: re-creates the slot that was stored above
+.if (\arg1 % 8 == 0)
+        adc     r8, zero  // absorb the remaining CF carry into the new top word
+.elseif (\arg1 % 8 == 1)
+        adc     r9, zero
+.elseif (\arg1 % 8 == 2)
+        adc     r10, zero
+.elseif (\arg1 % 8 == 3)
+        adc     r11, zero
+.elseif (\arg1 % 8 == 4)
+        adc     r12, zero
+.elseif (\arg1 % 8 == 5)
+        adc     r13, zero
+.elseif (\arg1 % 8 == 6)
+        adc     r14, zero
+.elseif (\arg1 % 8 == 7)
+        adc     r15, zero
+.endif
+.endm
+S2N_BN_SYMBOL(bignum_mul_8_16):
+        _CET_ENDBR
+#if WINDOWS_ABI
+        push    rdi  // rdi/rsi are saved here and restored before ret
+        push    rsi
+        mov     rdi, rcx  // move z, x, y into rdi, rsi, rdx
+        mov     rsi, rdx
+        mov     rdx, r8
+#endif
+// Save rbp (the zero register), rbx (scratch) and r12..r15 (window words)
+        push    rbp
+        push    rbx
+        push    r12
+        push    r13
+        push    r14
+        push    r15
+// Move the y pointer from rdx to rcx; rdx is loaded with each y[j] in turn
+        mov     y, rdx
+// Zero a register; the xor also clears CF so the adc chain has no carry-in
+        xor     zeroe, zeroe
+// Row 0 is handled separately from the later rows: store the low word of
+// x[0] * y[0] to z[0], then accumulate the rest of y[0] * x into the
+// window r9,r10,r11,r12,r13,r14,r15,r8 (result words 1..8, in that order)
+        mov     rdx, [y]
+        mulx    r9, r8, [x]
+        mov     [z], r8  // z[0] is final; r8 is reused as the window top below
+        mulx    r10, rbx, [x+8]
+        adc     r9, rbx  // this row uses a single plain adc carry chain
+        mulx    r11, rbx, [x+16]
+        adc     r10, rbx
+        mulx    r12, rbx, [x+24]
+        adc     r11, rbx
+        mulx    r13, rbx, [x+32]
+        adc     r12, rbx
+        mulx    r14, rbx, [x+40]
+        adc     r13, rbx
+        mulx    r15, rbx, [x+48]
+        adc     r14, rbx
+        mulx    r8, rbx, [x+56]
+        adc     r15, rbx
+        adc     r8, zero  // fold the last carry into the top window word
+// Rows 1..7: each addrow adds x * y[j] and stores the completed word z[j]
+        addrow  1
+        addrow  2
+        addrow  3
+        addrow  4
+        addrow  5
+        addrow  6
+        addrow  7
+// Write back the words still held in the window as z[8]..z[15]
+        mov     [z+64], r8
+        mov     [z+72], r9
+        mov     [z+80], r10
+        mov     [z+88], r11
+        mov     [z+96], r12
+        mov     [z+104], r13
+        mov     [z+112], r14
+        mov     [z+120], r15
+// Restore registers (reverse of the push order) and return
+        pop     r15
+        pop     r14
+        pop     r13
+        pop     r12
+        pop     rbx
+        pop     rbp
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16_alt.S
index 066403b074..0e30b9170f 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16_alt.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16_alt.S
@@ -1,3 +1,5 @@
+// $OpenBSD: bignum_mul_8_16_alt.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
+//
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
@@ -16,8 +18,9 @@
 // Multiply z := x * y
 // Inputs x[8], y[8]; output z[16]
 //
-//    extern void bignum_mul_8_16_alt
+//    extern void bignum_mul_8_16_alt(uint64_t z[static 16],
-//     (uint64_t z[static 16], uint64_t x[static 8], uint64_t y[static 8]);
+//                                    const uint64_t x[static 8],
+//                                    const uint64_t y[static 8]);
 //
 // Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
 // Microsoft x64 ABI:   RCX = z, RDX = x, R8 = y
@@ -72,7 +75,7 @@
        adc     h, rdx
 S2N_BN_SYMBOL(bignum_mul_8_16_alt):
-        _CET_ENDBR
+        _CET_ENDBR
 #if WINDOWS_ABI
        push    rdi
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr.S
index 54e3f59442..86f1af2ac4 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr.S
@@ -1,3 +1,5 @@
+// $OpenBSD: bignum_sqr.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
+//
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
@@ -16,8 +18,7 @@
 // Square z := x^2
 // Input x[n]; output z[k]
 //
-//    extern void bignum_sqr
+//    extern void bignum_sqr(uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x);
-//     (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x);
 //
 // Does the "z := x^2" operation where x is n digits and result z is k.
 // Truncates the result in general unless k >= 2 * n
@@ -62,7 +63,7 @@
 #define llshort ebp
 S2N_BN_SYMBOL(bignum_sqr):
-        _CET_ENDBR
+        _CET_ENDBR
 #if WINDOWS_ABI
        push    rdi
@@ -86,7 +87,7 @@ S2N_BN_SYMBOL(bignum_sqr):
 // If p = 0 the result is trivial and nothing needs doing
        test    p, p
-        jz      end
+        jz      bignum_sqr_end
 // initialize (hh,ll) = 0
@@ -97,7 +98,7 @@ S2N_BN_SYMBOL(bignum_sqr):
        xor     k, k
-outerloop:
+bignum_sqr_outerloop:
 // First let bot = MAX 0 (k + 1 - n) and top = MIN (k + 1) n
 // We want to accumulate all x[i] * x[k - i] for bot <= i < top
@@ -122,7 +123,7 @@ outerloop:
 // If htop <= bot then main doubled part of the sum is empty
        cmp     i, htop
-        jnc     nosumming
+        jnc     bignum_sqr_nosumming
 // Use a moving pointer for [y] = x[k-i] for the cofactor
@@ -132,7 +133,7 @@ outerloop:
 // Do the main part of the sum x[i] * x[k - i] for 2 * i < k
-innerloop:
+bignum_sqr_innerloop:
        mov     a, [x+8*i]
        mul     QWORD PTR [y]
        add     l, a
@@ -141,7 +142,7 @@ innerloop:
        sub     y, 8
        inc     i
        cmp     i, htop
-        jc      innerloop
+        jc      bignum_sqr_innerloop
 // Now double it
@@ -151,11 +152,11 @@ innerloop:
 // If k is even (which means 2 * i = k) and i < n add the extra x[i]^2 term
-nosumming:
+bignum_sqr_nosumming:
        test    k, 1
-        jnz     innerend
+        jnz     bignum_sqr_innerend
        cmp     i, n
-        jnc     innerend
+        jnc     bignum_sqr_innerend
        mov     a, [x+8*i]
        mul     a
@@ -165,7 +166,7 @@ nosumming:
 // Now add the local sum into the global sum, store and shift
-innerend:
+bignum_sqr_innerend:
        add     l, ll
        mov     [z+8*k], l
        adc     h, hh
@@ -175,11 +176,11 @@ innerend:
        inc     k
        cmp     k, p
-        jc      outerloop
+        jc      bignum_sqr_outerloop
 // Restore registers and return
-end:
+bignum_sqr_end:
        pop     r15
        pop     r14
        pop     r13
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8.S
new file mode 100644
index 0000000000..25664782f7
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8.S
@@ -0,0 +1,158 @@
+// $OpenBSD: bignum_sqr_4_8.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
+//
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+// ----------------------------------------------------------------------------
+// Square, z := x^2
+// Input x[4]; output z[8]
+//
+//    extern void bignum_sqr_4_8(uint64_t z[static 8], const uint64_t x[static 4]);
+//
+// Standard x86-64 ABI: RDI = z, RSI = x
+// Microsoft x64 ABI:   RCX = z, RDX = x
+// ----------------------------------------------------------------------------
+#include "s2n_bignum_internal.h"
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_4_8)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_4_8)
+        .text
+// z and x stay in rdi and rsi, the standard-ABI argument registers noted above
+#define z rdi
+#define x rsi
+// A zero register
+#define zero rbp
+#define zeroe ebp
+// Other registers
+#define d1 r8
+#define d2 r9
+#define d3 r10
+#define d4 r11
+#define d5 r12
+#define d6 r13
+S2N_BN_SYMBOL(bignum_sqr_4_8):
+        _CET_ENDBR
+#if WINDOWS_ABI
+        push    rdi  // rdi/rsi are saved here and restored before ret
+        push    rsi
+        mov     rdi, rcx  // move z, x into rdi, rsi
+        mov     rsi, rdx
+#endif
+// Save rbp (the zero register) and r12, r13 (used as d5, d6)
+        push    rbp
+        push    r12
+        push    r13
+// Initial window: [d2:d1] = x[0]*x[1], [d4:d3] = x[0]*x[3], [d6:d5] = x[2]*x[3]
+        mov     rdx, [x]
+        mulx    d2, d1, [x+8]
+        mulx    d4, d3, [x+24]
+        mov     rdx, [x+16]
+        mulx    d6, d5, [x+24]
+// Clear our zero register, and also initialize the flags for the carry chain
+        xor     zeroe, zeroe
+// Chain in x[0]*x[2], x[1]*x[2] and x[1]*x[3] (no carry-out possible), so the
+// window then holds every cross term x[i]*x[j] with i < j, ready to be doubled
+        mulx    rcx, rax, [x]  // rdx still holds x[2] here
+        adcx    d2, rax
+        adox    d3, rcx
+        mulx    rcx, rax, [x+8]
+        adcx    d3, rax
+        adox    d4, rcx
+        mov     rdx, [x+24]
+        mulx    rcx, rax, [x+8]
+        adcx    d4, rax
+        adox    d5, rcx
+        adcx    d5, zero  // propagate the CF and OF carries up through d5 and d6
+        adox    d6, zero
+        adcx    d6, zero
+// Strictly redundant, since the CF and OF carries have been absorbed by now,
+// but it seems to help the out-of-order engine to be told it's a fresh start
+        xor     zeroe, zeroe
+// Double the cross terms and add in the squares x[0]^2, x[1]^2, x[2]^2, x[3]^2
+//
+// Each "adcx d,d" doubles a window word on the CF chain, while the matching
+// adox adds in a word of a square on the OF chain. Shift-double would be
+// an alternative; this form was more efficient in larger squarings. Note:
+// the writeback is interleaved in such a way that things still work if z = x,
+// i.e. if the output overwrites the input buffer and beyond.
+        mov     rdx, [x]
+        mulx    rdx, rax, rdx  // rdx:rax = x[0]^2
+        mov     [z], rax
+        adcx    d1, d1
+        adox    d1, rdx
+        mov     rdx, [x+8]  // x[1] is loaded before z[1] is written
+        mov     [z+8], d1
+        mulx    rdx, rax, rdx
+        adcx    d2, d2
+        adox    d2, rax
+        adcx    d3, d3
+        adox    d3, rdx
+        mov     rdx, [x+16]  // x[2] is loaded before z[2] is written
+        mov     [z+16], d2
+        mulx    rdx, rax, rdx
+        adcx    d4, d4
+        adox    d4, rax
+        adcx    d5, d5
+        adox    d5, rdx
+        mov     rdx, [x+24]  // x[3] is loaded before z[3] is written
+        mov     [z+24], d3
+        mulx    rdx, rax, rdx
+        mov     [z+32], d4
+        adcx    d6, d6
+        mov     [z+40], d5
+        adox    d6, rax
+        mov     [z+48], d6
+        adcx    rdx, zero  // top word: high half of x[3]^2 plus both final carries
+        adox    rdx, zero
+        mov     [z+56], rdx
+// Restore saved registers and return
+        pop     r13
+        pop     r12
+        pop     rbp
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8_alt.S
index 7c534ae907..7eafac3284 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8_alt.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8_alt.S
@@ -1,3 +1,5 @@
+// $OpenBSD: bignum_sqr_4_8_alt.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
+//
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
@@ -16,8 +18,8 @@
 // Square, z := x^2
 // Input x[4]; output z[8]
 //
-//    extern void bignum_sqr_4_8_alt
+//    extern void bignum_sqr_4_8_alt(uint64_t z[static 8],
-//      (uint64_t z[static 8], uint64_t x[static 4]);
+//                                   const uint64_t x[static 4]);
 //
 // Standard x86-64 ABI: RDI = z, RSI = x
 // Microsoft x64 ABI:   RCX = z, RDX = x
@@ -71,7 +73,7 @@
        adc     c, 0
 S2N_BN_SYMBOL(bignum_sqr_4_8_alt):
-        _CET_ENDBR
+        _CET_ENDBR
 #if WINDOWS_ABI
        push    rdi
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12.S
new file mode 100644
index 0000000000..3f055e8b75
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12.S
@@ -0,0 +1,227 @@
+// $OpenBSD: bignum_sqr_6_12.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
+//
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+// ----------------------------------------------------------------------------
+// Square, z := x^2
+// Input x[6]; output z[12]
+//
+//    extern void bignum_sqr_6_12(uint64_t z[static 12], const uint64_t x[static 6]);
+//
+// Standard x86-64 ABI: RDI = z, RSI = x
+// Microsoft x64 ABI:   RCX = z, RDX = x
+// ----------------------------------------------------------------------------
+#include "s2n_bignum_internal.h"
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_6_12)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_6_12)
+        .text
+// Arguments z and x are used directly in their standard ABI registers
+#define z rdi
+#define x rsi
+// A zero register
+#define zero rbp
+#define zeroe ebp
+// Other registers
+#define d1 r8
+#define d2 r9
+#define d3 r10
+#define d4 r11
+#define d5 r12
+#define d6 r13
+#define d7 r14
+#define d8 r15
+#define d9 rbx
+// Care is needed: re-using the zero register
+#define d10 rbp
+S2N_BN_SYMBOL(bignum_sqr_6_12):
+        _CET_ENDBR
+#if WINDOWS_ABI
+        push    rdi
+        push    rsi
+        mov     rdi, rcx
+        mov     rsi, rdx
+#endif
+// Save more registers to play with
+        push    rbp
+        push    rbx
+        push    r12
+        push    r13
+        push    r14
+        push    r15
+// Set up an initial window [d8;...d1] = [34;05;03;01]
+        mov     rdx, [x]
+        mulx    d2, d1, [x+8]
+        mulx    d4, d3, [x+24]
+        mulx    d6, d5, [x+40]
+        mov     rdx, [x+24]
+        mulx    d8, d7, [x+32]
+// Clear our zero register, and also initialize the flags for the carry chain
+        xor     zeroe, zeroe
+// Chain in the addition of 02 + 12 + 13 + 14 + 15 to that window
+// (no carry-out possible since we add it to the top of a product)
+        mov     rdx, [x+16]
+        mulx    rcx, rax, [x]
+        adcx    d2, rax
+        adox    d3, rcx
+        mulx    rcx, rax, [x+8]
+        adcx    d3, rax
+        adox    d4, rcx
+        mov     rdx, [x+8]
+        mulx    rcx, rax, [x+24]
+        adcx    d4, rax
+        adox    d5, rcx
+        mulx    rcx, rax, [x+32]
+        adcx    d5, rax
+        adox    d6, rcx
+        mulx    rcx, rax, [x+40]
+        adcx    d6, rax
+        adox    d7, rcx
+        adcx    d7, zero
+        adox    d8, zero
+        adcx    d8, zero
+// Again zero out the flags. Actually they are already cleared but it may
+// help decouple these in the OOO engine not to wait for the chain above
+        xor     zeroe, zeroe
+// Now chain in the 04 + 23 + 24 + 25 + 35 + 45 terms
+// We are running out of registers and here our zero register is not zero!
+        mov     rdx, [x+32]
+        mulx    rcx, rax, [x]
+        adcx    d4, rax
+        adox    d5, rcx
+        mov     rdx, [x+16]
+        mulx    rcx, rax, [x+24]
+        adcx    d5, rax
+        adox    d6, rcx
+        mulx    rcx, rax, [x+32]
+        adcx    d6, rax
+        adox    d7, rcx
+        mulx    rcx, rax, [x+40]
+        adcx    d7, rax
+        adox    d8, rcx
+        mov     rdx, [x+24]
+        mulx    d9, rax, [x+40]
+        adcx    d8, rax
+        adox    d9, zero
+        mov     rdx, [x+32]
+        mulx    d10, rax, [x+40]
+        adcx    d9, rax
+        mov     eax, 0
+        adox    d10, rax
+        adcx    d10, rax
+// Again, just for a clear fresh start for the flags
+        xor     eax, eax
+// Double and add to the 00 + 11 + 22 + 33 + 44 + 55 terms
+//
+// We could use shift-double but this seems tidier and in larger squarings
+// it was actually more efficient. I haven't experimented with this small
+// case to see how much that matters. Note: the writeback here is sprinkled
+// into the sequence in such a way that things still work if z = x, i.e. if
+// the output overwrites the input buffer and beyond.
+        mov     rdx, [x]
+        mulx    rdx, rax, rdx
+        mov     [z], rax
+        adcx    d1, d1
+        adox    d1, rdx
+        mov     rdx, [x+8]
+        mov     [z+8], d1
+        mulx    rdx, rax, rdx
+        adcx    d2, d2
+        adox    d2, rax
+        adcx    d3, d3
+        adox    d3, rdx
+        mov     rdx, [x+16]
+        mov     [z+16], d2
+        mulx    rdx, rax, rdx
+        adcx    d4, d4
+        adox    d4, rax
+        adcx    d5, d5
+        adox    d5, rdx
+        mov     rdx, [x+24]
+        mov     [z+24], d3
+        mulx    rdx, rax, rdx
+        adcx    d6, d6
+        adox    d6, rax
+        adcx    d7, d7
+        adox    d7, rdx
+        mov     rdx, [x+32]
+        mov     [z+32], d4
+        mulx    rdx, rax, rdx
+        adcx    d8, d8
+        adox    d8, rax
+        adcx    d9, d9
+        adox    d9, rdx
+        mov     rdx, [x+40]
+        mov     [z+40], d5
+        mulx    rdx, rax, rdx
+        mov     [z+48], d6
+        adcx    d10, d10
+        mov     [z+56], d7
+        adox    d10, rax
+        mov     [z+64], d8
+        mov     eax, 0
+        mov     [z+72], d9
+        adcx    rdx, rax
+        mov     [z+80], d10
+        adox    rdx, rax
+        mov     [z+88], rdx
+// Restore saved registers and return
+        pop     r15
+        pop     r14
+        pop     r13
+        pop     r12
+        pop     rbx
+        pop     rbp
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12_alt.S
new file mode 100644
index 0000000000..eb43b0a15b
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12_alt.S
@@ -0,0 +1,210 @@
+// $OpenBSD: bignum_sqr_6_12_alt.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
+//
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+// ----------------------------------------------------------------------------
+// Square, z := x^2
+// Input x[6]; output z[12]
+//
+//    extern void bignum_sqr_6_12_alt(uint64_t z[static 12],
+//                                    const uint64_t x[static 6]);
+//
+// Standard x86-64 ABI: RDI = z, RSI = x
+// Microsoft x64 ABI:   RCX = z, RDX = x
+// ----------------------------------------------------------------------------
+#include "s2n_bignum_internal.h"
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_6_12_alt)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_6_12_alt)
+        .text
+// Input arguments
+#define z rdi
+#define x rsi
+// Other variables used as a rotating 3-word window to add terms to
+#define t0 r8
+#define t1 r9
+#define t2 r10
+// Additional temporaries for local windows to share doublings
+#define u0 rcx
+#define u1 r11
+// Macro for the key "multiply and add to (c,h,l)" step
+#define combadd(c,h,l,numa,numb)                \
+        mov     rax, numa;                      \
+        mul     QWORD PTR numb;                 \
+        add     l, rax;                         \
+        adc     h, rdx;                         \
+        adc     c, 0
+// Set up initial window (c,h,l) = numa * numb
+#define combaddz(c,h,l,numa,numb)               \
+        mov     rax, numa;                      \
+        mul     QWORD PTR numb;                 \
+        xor     c, c;                           \
+        mov     l, rax;                         \
+        mov     h, rdx
+// Doubling step (c,h,l) = 2 * (c,hh,ll) + (0,h,l)
+#define doubladd(c,h,l,hh,ll)                   \
+        add     ll, ll;                         \
+        adc     hh, hh;                         \
+        adc     c, c;                           \
+        add     l, ll;                          \
+        adc     h, hh;                          \
+        adc     c, 0
+// Square term incorporation (c,h,l) += numa^2
+#define combadd1(c,h,l,numa)                    \
+        mov     rax, numa;                      \
+        mul     rax;                            \
+        add     l, rax;                         \
+        adc     h, rdx;                         \
+        adc     c, 0
+// A short form where we don't expect a top carry
+#define combads(h,l,numa)                       \
+        mov     rax, numa;                      \
+        mul     rax;                            \
+        add     l, rax;                         \
+        adc     h, rdx
+// A version doubling directly before adding, for single non-square terms
+#define combadd2(c,h,l,numa,numb)               \
+        mov     rax, numa;                      \
+        mul     QWORD PTR numb;                 \
+        add     rax, rax;                       \
+        adc     rdx, rdx;                       \
+        adc     c, 0;                           \
+        add     l, rax;                         \
+        adc     h, rdx;                         \
+        adc     c, 0
+S2N_BN_SYMBOL(bignum_sqr_6_12_alt):
+        _CET_ENDBR
+#if WINDOWS_ABI
+        push    rdi
+        push    rsi
+        mov     rdi, rcx
+        mov     rsi, rdx
+#endif
+// Result term 0
+        mov     rax, [x]
+        mul     rax
+        mov     [z], rax
+        mov     t0, rdx
+        xor     t1, t1
+// Result term 1
+        xor     t2, t2
+        combadd2(t2,t1,t0,[x],[x+8])
+        mov     [z+8], t0
+// Result term 2
+        xor     t0, t0
+        combadd1(t0,t2,t1,[x+8])
+        combadd2(t0,t2,t1,[x],[x+16])
+        mov     [z+16], t1
+// Result term 3
+        combaddz(t1,u1,u0,[x],[x+24])
+        combadd(t1,u1,u0,[x+8],[x+16])
+        doubladd(t1,t0,t2,u1,u0)
+        mov     [z+24], t2
+// Result term 4
+        combaddz(t2,u1,u0,[x],[x+32])
+        combadd(t2,u1,u0,[x+8],[x+24])
+        doubladd(t2,t1,t0,u1,u0)
+        combadd1(t2,t1,t0,[x+16])
+        mov     [z+32], t0
+// Result term 5
+        combaddz(t0,u1,u0,[x],[x+40])
+        combadd(t0,u1,u0,[x+8],[x+32])
+        combadd(t0,u1,u0,[x+16],[x+24])
+        doubladd(t0,t2,t1,u1,u0)
+        mov     [z+40], t1
+// Result term 6
+        combaddz(t1,u1,u0,[x+8],[x+40])
+        combadd(t1,u1,u0,[x+16],[x+32])
+        doubladd(t1,t0,t2,u1,u0)
+        combadd1(t1,t0,t2,[x+24])
+        mov     [z+48], t2
+// Result term 7
+        combaddz(t2,u1,u0,[x+16],[x+40])
+        combadd(t2,u1,u0,[x+24],[x+32])
+        doubladd(t2,t1,t0,u1,u0)
+        mov     [z+56], t0
+// Result term 8
+        xor     t0, t0
+        combadd2(t0,t2,t1,[x+24],[x+40])
+        combadd1(t0,t2,t1,[x+32])
+        mov     [z+64], t1
+// Result term 9
+        xor     t1, t1
+        combadd2(t1,t0,t2,[x+32],[x+40])
+        mov     [z+72], t2
+// Result term 10
+        combads(t1,t0,[x+40])
+        mov     [z+80], t0
+// Result term 11
+        mov     [z+88], t1
+// Return
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16.S
new file mode 100644
index 0000000000..41277b5b6a
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16.S
@@ -0,0 +1,311 @@
+// $OpenBSD: bignum_sqr_8_16.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
+//
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+// ----------------------------------------------------------------------------
+// Square, z := x^2
+// Input x[8]; output z[16]
+//
+//    extern void bignum_sqr_8_16(uint64_t z[static 16], const uint64_t x[static 8]);
+//
+// Standard x86-64 ABI: RDI = z, RSI = x
+// Microsoft x64 ABI:   RCX = z, RDX = x
+// ----------------------------------------------------------------------------
+#include "s2n_bignum_internal.h"
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_8_16)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_8_16)
+        .text
+// Arguments z and x are used directly in their standard ABI registers
+#define z rdi
+#define x rsi
+// A zero register
+#define zero rbp
+#define zeroe ebp
+// mulpadd i, j adds rdx * x[i] into the window at the i+j point
+.macro mulpadd arg1,arg2
+        mulx    rcx, rax, [x+8*\arg1]
+.if ((\arg1 + \arg2) % 8 == 0)
+        adcx    r8, rax
+        adox    r9, rcx
+.elseif ((\arg1 + \arg2) % 8 == 1)
+        adcx    r9, rax
+        adox    r10, rcx
+.elseif ((\arg1 + \arg2) % 8 == 2)
+        adcx    r10, rax
+        adox    r11, rcx
+.elseif ((\arg1 + \arg2) % 8 == 3)
+        adcx    r11, rax
+        adox    r12, rcx
+.elseif ((\arg1 + \arg2) % 8 == 4)
+        adcx    r12, rax
+        adox    r13, rcx
+.elseif ((\arg1 + \arg2) % 8 == 5)
+        adcx    r13, rax
+        adox    r14, rcx
+.elseif ((\arg1 + \arg2) % 8 == 6)
+        adcx    r14, rax
+        adox    r15, rcx
+.elseif ((\arg1 + \arg2) % 8 == 7)
+        adcx    r15, rax
+        adox    r8, rcx
+.endif
+.endm
+// mulpade i, j adds rdx * x[i] into the window at i+j
+// but re-creates the top word assuming nothing to add there
+.macro mulpade arg1,arg2
+.if ((\arg1 + \arg2) % 8 == 0)
+        mulx    r9, rax, [x+8*\arg1]
+        adcx    r8, rax
+        adox    r9, zero
+.elseif ((\arg1 + \arg2) % 8 == 1)
+        mulx    r10, rax, [x+8*\arg1]
+        adcx    r9, rax
+        adox    r10, zero
+.elseif ((\arg1 + \arg2) % 8 == 2)
+        mulx    r11, rax, [x+8*\arg1]
+        adcx    r10, rax
+        adox    r11, zero
+.elseif ((\arg1 + \arg2) % 8 == 3)
+        mulx    r12, rax, [x+8*\arg1]
+        adcx    r11, rax
+        adox    r12, zero
+.elseif ((\arg1 + \arg2) % 8 == 4)
+        mulx    r13, rax, [x+8*\arg1]
+        adcx    r12, rax
+        adox    r13, zero
+.elseif ((\arg1 + \arg2) % 8 == 5)
+        mulx    r14, rax, [x+8*\arg1]
+        adcx    r13, rax
+        adox    r14, zero
+.elseif ((\arg1 + \arg2) % 8 == 6)
+        mulx    r15, rax, [x+8*\arg1]
+        adcx    r14, rax
+        adox    r15, zero
+.elseif ((\arg1 + \arg2) % 8 == 7)
+        mulx    r8, rax, [x+8*\arg1]
+        adcx    r15, rax
+        adox    r8, zero
+.endif
+.endm
+.macro diagonals
+        xor     zeroe, zeroe
+// Set initial window r9..r15,r8 (r9, r10 written back) = 10 + 20 + 30 + 40 + 50 + 60 + 70
+        mov     rdx, [x]
+        mulx    rax, r9, [x+8]
+        mov     [z+8], r9
+        mulx    rcx, r10, [x+16]
+        adcx    r10, rax
+        mov     [z+16], r10
+        mulx    rax, r11, [x+24]
+        adcx    r11, rcx
+        mulx    rcx, r12, [x+32]
+        adcx    r12, rax
+        mulx    rax, r13, [x+40]
+        adcx    r13, rcx
+        mulx    rcx, r14, [x+48]
+        adcx    r14, rax
+        mulx    r8, r15, [x+56]
+        adcx    r15, rcx
+        adcx    r8, zero
+// Add in the next diagonal = 21 + 31 + 41 + 51 + 61 + 71 + 54
+        xor     zeroe, zeroe
+        mov     rdx, [x+8]
+        mulpadd 2, 1
+        mov     [z+24], r11
+        mulpadd 3, 1
+        mov     [z+32], r12
+        mulpadd 4, 1
+        mulpadd 5, 1
+        mulpadd 6, 1
+        mulpade 7, 1
+        mov     rdx, [x+32]
+        mulpade 5, 4
+        adcx    r10, zero
+// And the next one = 32 + 42 + 52 + 62 + 72 + 64 + 65
+        xor     zeroe, zeroe
+        mov     rdx, [x+16]
+        mulpadd 3, 2
+        mov     [z+40], r13
+        mulpadd 4, 2
+        mov     [z+48], r14
+        mulpadd 5, 2
+        mulpadd 6, 2
+        mulpadd 7, 2
+        mov     rdx, [x+48]
+        mulpade 4, 6
+        mulpade 5, 6
+        adcx    r12, zero
+// And the final one = 43 + 53 + 63 + 73 + 74 + 75 + 76
+        xor     zeroe, zeroe
+        mov     rdx, [x+24]
+        mulpadd 4, 3
+        mov     [z+56], r15
+        mulpadd 5, 3
+        mov     [z+64], r8
+        mulpadd 6, 3
+        mulpadd 7, 3
+        mov     rdx, [x+56]
+        mulpadd 4, 7
+        mulpade 5, 7
+        mulpade 6, 7
+        adcx    r14, zero
+// Double and add things; use z[1]..z[8] and thereafter the registers
+// r9..r15 which haven't been written back yet
+        xor     zeroe, zeroe
+        mov     rdx, [x]
+        mulx    rcx, rax, rdx
+        mov     [z], rax
+        mov     rax, [z+8]
+        adcx    rax, rax
+        adox    rax, rcx
+        mov     [z+8], rax
+        mov     rax, [z+16]
+        mov     rdx, [x+8]
+        mulx    rcx, rdx, rdx
+        adcx    rax, rax
+        adox    rax, rdx
+        mov     [z+16], rax
+        mov     rax, [z+24]
+        adcx    rax, rax
+        adox    rax, rcx
+        mov     [z+24], rax
+        mov     rax, [z+32]
+        mov     rdx, [x+16]
+        mulx    rcx, rdx, rdx
+        adcx    rax, rax
+        adox    rax, rdx
+        mov     [z+32], rax
+        mov     rax, [z+40]
+        adcx    rax, rax
+        adox    rax, rcx
+        mov     [z+40], rax
+        mov     rax, [z+48]
+        mov     rdx, [x+24]
+        mulx    rcx, rdx, rdx
+        adcx    rax, rax
+        adox    rax, rdx
+        mov     [z+48], rax
+        mov     rax, [z+56]
+        adcx    rax, rax
+        adox    rax, rcx
+        mov     [z+56], rax
+        mov     rax, [z+64]
+        mov     rdx, [x+32]
+        mulx    rcx, rdx, rdx
+        adcx    rax, rax
+        adox    rax, rdx
+        mov     [z+64], rax
+        adcx    r9, r9
+        adox    r9, rcx
+        mov     [z+72], r9
+        mov     rdx, [x+40]
+        mulx    rcx, rdx, rdx
+        adcx    r10, r10
+        adox    r10, rdx
+        mov     [z+80], r10
+        adcx    r11, r11
+        adox    r11, rcx
+        mov     [z+88], r11
+        mov     rdx, [x+48]
+        mulx    rcx, rdx, rdx
+        adcx    r12, r12
+        adox    r12, rdx
+        mov     [z+96], r12
+        adcx    r13, r13
+        adox    r13, rcx
+        mov     [z+104], r13
+        mov     rdx, [x+56]
+        mulx    r15, rdx, rdx
+        adcx    r14, r14
+        adox    r14, rdx
+        mov     [z+112], r14
+        adcx    r15, zero
+        adox    r15, zero
+        mov     [z+120], r15
+.endm
+S2N_BN_SYMBOL(bignum_sqr_8_16):
+        _CET_ENDBR
+#if WINDOWS_ABI
+        push    rdi
+        push    rsi
+        mov     rdi, rcx
+        mov     rsi, rdx
+#endif
+// Save more registers to play with
+        push    rbp
+        push    r12
+        push    r13
+        push    r14
+        push    r15
+// Do the multiplication
+        diagonals
+// Real epilog
+        pop     r15
+        pop     r14
+        pop     r13
+        pop     r12
+        pop     rbp
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16_alt.S
index ac0b6f96c2..cb10ba2a12 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16_alt.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16_alt.S
@@ -1,3 +1,5 @@
+// $OpenBSD: bignum_sqr_8_16_alt.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
+//
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
@@ -16,7 +18,8 @@
 // Square, z := x^2
 // Input x[8]; output z[16]
 //
-//    extern void bignum_sqr_8_16_alt (uint64_t z[static 16], uint64_t x[static 8]);
+//    extern void bignum_sqr_8_16_alt(uint64_t z[static 16],
+//                                    const uint64_t x[static 8]);
 //
 // Standard x86-64 ABI: RDI = z, RSI = x
 // Microsoft x64 ABI:   RCX = z, RDX = x
@@ -103,7 +106,7 @@
        adc     c, 0
 S2N_BN_SYMBOL(bignum_sqr_8_16_alt):
-        _CET_ENDBR
+        _CET_ENDBR
 #if WINDOWS_ABI
        push    rdi
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sub.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sub.S
index 3ff8a30510..7324d3a71e 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bignum_sub.S
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sub.S
@@ -1,3 +1,5 @@
+// $OpenBSD: bignum_sub.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
+//
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
@@ -16,9 +18,8 @@
 // Subtract, z := x - y
 // Inputs x[m], y[n]; outputs function return (carry-out) and z[p]
 //
-//    extern uint64_t bignum_sub
+//    extern uint64_t bignum_sub(uint64_t p, uint64_t *z, uint64_t m,
-//     (uint64_t p, uint64_t *z,
+//                               const uint64_t *x, uint64_t n, const uint64_t *y);
-//      uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
 //
 // Does the z := x - y operation, truncating modulo p words in general and
 // returning a top borrow (0 or 1) in the p'th place, only subtracting input
@@ -49,7 +50,7 @@
 S2N_BN_SYMBOL(bignum_sub):
-        _CET_ENDBR
+        _CET_ENDBR
 #if WINDOWS_ABI
        push    rdi
@@ -75,7 +76,7 @@ S2N_BN_SYMBOL(bignum_sub):
        cmp     p, n
        cmovc   n, p
        cmp     m, n
-        jc      ylonger
+        jc      bignum_sub_ylonger
 // The case where x is longer or of the same size (p >= m >= n)
@@ -83,32 +84,32 @@ S2N_BN_SYMBOL(bignum_sub):
        sub     m, n
        inc     m
        test    n, n
-        jz      xtest
+        jz      bignum_sub_xtest
-xmainloop:
+bignum_sub_xmainloop:
        mov     a, [x+8*i]
        sbb     a, [y+8*i]
        mov     [z+8*i],a
        inc     i
        dec     n
-        jnz     xmainloop
+        jnz     bignum_sub_xmainloop
-        jmp     xtest
+        jmp     bignum_sub_xtest
-xtoploop:
+bignum_sub_xtoploop:
        mov     a, [x+8*i]
        sbb     a, 0
        mov     [z+8*i],a
        inc     i
-xtest:
+bignum_sub_xtest:
        dec     m
-        jnz     xtoploop
+        jnz     bignum_sub_xtoploop
        sbb     a, a
        test    p, p
-        jz      tailskip
+        jz      bignum_sub_tailskip
-tailloop:
+bignum_sub_tailloop:
        mov     [z+8*i],a
        inc     i
        dec     p
-        jnz     tailloop
+        jnz     bignum_sub_tailloop
-tailskip:
+bignum_sub_tailskip:
        neg     a
 #if WINDOWS_ABI
        pop    rsi
@@ -118,29 +119,29 @@ tailskip:
 // The case where y is longer (p >= n > m)
-ylonger:
+bignum_sub_ylonger:
        sub     p, n
        sub     n, m
        test    m, m
-        jz      ytoploop
+        jz      bignum_sub_ytoploop
-ymainloop:
+bignum_sub_ymainloop:
        mov     a, [x+8*i]
        sbb     a, [y+8*i]
        mov     [z+8*i],a
        inc     i
        dec     m
-        jnz     ymainloop
+        jnz     bignum_sub_ymainloop
-ytoploop:
+bignum_sub_ytoploop:
        mov     ashort, 0
        sbb     a, [y+8*i]
        mov     [z+8*i],a
        inc     i
        dec     n
-        jnz     ytoploop
+        jnz     bignum_sub_ytoploop
        sbb     a, a
        test    p, p
-        jnz     tailloop
+        jnz     bignum_sub_tailloop
        neg     a
 #if WINDOWS_ABI
        pop    rsi
diff --git a/src/lib/libcrypto/bn/arch/amd64/bn_arch.c b/src/lib/libcrypto/bn/arch/amd64/bn_arch.c
index 8eb3670def..9ff8920ca2 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bn_arch.c
+++ b/src/lib/libcrypto/bn/arch/amd64/bn_arch.c
@@ -1,4 +1,4 @@
-/*      $OpenBSD: bn_arch.c,v 1.8 2025/08/05 15:01:13 jsing Exp $ */
+/*      $OpenBSD: bn_arch.c,v 1.12 2025/08/14 15:29:17 jsing Exp $ */
 /*
 * Copyright (c) 2023 Joel Sing <jsing@openbsd.org>
 *
@@ -19,6 +19,7 @@
 #include "bn_arch.h"
 #include "bn_local.h"
+#include "crypto_arch.h"
 #include "s2n_bignum.h"
 #ifdef HAVE_BN_ADD
@@ -26,8 +27,8 @@ BN_ULONG
 bn_add(BN_ULONG *r, int r_len, const BN_ULONG *a, int a_len, const BN_ULONG *b,
    int b_len)
 {
-        return bignum_add(r_len, (uint64_t *)r, a_len, (uint64_t *)a,
+        return bignum_add(r_len, (uint64_t *)r, a_len, (const uint64_t *)a,
-            b_len, (uint64_t *)b);
+            b_len, (const uint64_t *)b);
 }
 #endif
@@ -36,8 +37,8 @@ bn_add(BN_ULONG *r, int r_len, const BN_ULONG *a, int a_len, const BN_ULONG *b,
 BN_ULONG
 bn_add_words(BN_ULONG *rd, const BN_ULONG *ad, const BN_ULONG *bd, int n)
 {
-        return bignum_add(n, (uint64_t *)rd, n, (uint64_t *)ad, n,
+        return bignum_add(n, (uint64_t *)rd, n, (const uint64_t *)ad, n,
-            (uint64_t *)bd);
+            (const uint64_t *)bd);
 }
 #endif
@@ -46,8 +47,8 @@ BN_ULONG
 bn_sub(BN_ULONG *r, int r_len, const BN_ULONG *a, int a_len, const BN_ULONG *b,
    int b_len)
 {
-        return bignum_sub(r_len, (uint64_t *)r, a_len, (uint64_t *)a,
+        return bignum_sub(r_len, (uint64_t *)r, a_len, (const uint64_t *)a,
-            b_len, (uint64_t *)b);
+            b_len, (const uint64_t *)b);
 }
 #endif
@@ -55,8 +56,28 @@ bn_sub(BN_ULONG *r, int r_len, const BN_ULONG *a, int a_len, const BN_ULONG *b,
 BN_ULONG
 bn_sub_words(BN_ULONG *rd, const BN_ULONG *ad, const BN_ULONG *bd, int n)
 {
-        return bignum_sub(n, (uint64_t *)rd, n, (uint64_t *)ad, n,
+        return bignum_sub(n, (uint64_t *)rd, n, (const uint64_t *)ad, n,
-            (uint64_t *)bd);
+            (const uint64_t *)bd);
+}
+#endif
+#ifdef HAVE_BN_MOD_ADD_WORDS
+void
+bn_mod_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
+    const BN_ULONG *m, size_t n)
+{
+        bignum_modadd(n, (uint64_t *)r, (const uint64_t *)a,
+            (const uint64_t *)b, (const uint64_t *)m);
+}
+#endif
+#ifdef HAVE_BN_MOD_SUB_WORDS
+void
+bn_mod_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
+    const BN_ULONG *m, size_t n)
+{
+        bignum_modsub(n, (uint64_t *)r, (const uint64_t *)a,
+            (const uint64_t *)b, (const uint64_t *)m);
 }
 #endif
@@ -64,7 +85,7 @@ bn_sub_words(BN_ULONG *rd, const BN_ULONG *ad, const BN_ULONG *bd, int n)
 BN_ULONG
 bn_mul_add_words(BN_ULONG *rd, const BN_ULONG *ad, int num, BN_ULONG w)
 {
-        return bignum_cmadd(num, (uint64_t *)rd, w, num, (uint64_t *)ad);
+        return bignum_cmadd(num, (uint64_t *)rd, w, num, (const uint64_t *)ad);
 }
 #endif
@@ -72,7 +93,7 @@ bn_mul_add_words(BN_ULONG *rd, const BN_ULONG *ad, int num, BN_ULONG w)
 BN_ULONG
 bn_mul_words(BN_ULONG *rd, const BN_ULONG *ad, int num, BN_ULONG w)
 {
-        return bignum_cmul(num, (uint64_t *)rd, w, num, (uint64_t *)ad);
+        return bignum_cmul(num, (uint64_t *)rd, w, num, (const uint64_t *)ad);
 }
 #endif
@@ -80,8 +101,29 @@ bn_mul_words(BN_ULONG *rd, const BN_ULONG *ad, int num, BN_ULONG w)
 void
 bn_mul_comba4(BN_ULONG *rd, const BN_ULONG *ad, const BN_ULONG *bd)
 {
-        /* XXX - consider using non-alt on CPUs that have the ADX extension. */
+        if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_ADX) != 0) {
-        bignum_mul_4_8_alt((uint64_t *)rd, (uint64_t *)ad, (uint64_t *)bd);
+                bignum_mul_4_8((uint64_t *)rd, (const uint64_t *)ad,
+                    (const uint64_t *)bd);
+                return;
+        }
+        bignum_mul_4_8_alt((uint64_t *)rd, (const uint64_t *)ad,
+            (const uint64_t *)bd);
+}
+#endif
+#ifdef HAVE_BN_MUL_COMBA6
+void
+bn_mul_comba6(BN_ULONG *rd, const BN_ULONG *ad, const BN_ULONG *bd)
+{
+        if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_ADX) != 0) {
+                bignum_mul_6_12((uint64_t *)rd, (const uint64_t *)ad,
+                    (const uint64_t *)bd);
+                return;
+        }
+        bignum_mul_6_12_alt((uint64_t *)rd, (const uint64_t *)ad,
+            (const uint64_t *)bd);
 }
 #endif
@@ -89,8 +131,14 @@ bn_mul_comba4(BN_ULONG *rd, const BN_ULONG *ad, const BN_ULONG *bd)
 void
 bn_mul_comba8(BN_ULONG *rd, const BN_ULONG *ad, const BN_ULONG *bd)
 {
-        /* XXX - consider using non-alt on CPUs that have the ADX extension. */
+        if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_ADX) != 0) {
-        bignum_mul_8_16_alt((uint64_t *)rd, (uint64_t *)ad, (uint64_t *)bd);
+                bignum_mul_8_16((uint64_t *)rd, (const uint64_t *)ad,
+                    (const uint64_t *)bd);
+                return;
+        }
+        bignum_mul_8_16_alt((uint64_t *)rd, (const uint64_t *)ad,
+            (const uint64_t *)bd);
 }
 #endif
@@ -98,7 +146,7 @@ bn_mul_comba8(BN_ULONG *rd, const BN_ULONG *ad, const BN_ULONG *bd)
 int
 bn_sqr(BIGNUM *r, const BIGNUM *a, int r_len, BN_CTX *ctx)
 {
-        bignum_sqr(r_len, (uint64_t *)r->d, a->top, (uint64_t *)a->d);
+        bignum_sqr(r_len, (uint64_t *)r->d, a->top, (const uint64_t *)a->d);
        return 1;
 }
@@ -108,8 +156,25 @@ bn_sqr(BIGNUM *r, const BIGNUM *a, int r_len, BN_CTX *ctx)
 void
 bn_sqr_comba4(BN_ULONG *rd, const BN_ULONG *ad)
 {
-        /* XXX - consider using non-alt on CPUs that have the ADX extension. */
+        if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_ADX) != 0) {
-        bignum_sqr_4_8_alt((uint64_t *)rd, (uint64_t *)ad);
+                bignum_sqr_4_8((uint64_t *)rd, (const uint64_t *)ad);
+                return;
+        }
+        bignum_sqr_4_8_alt((uint64_t *)rd, (const uint64_t *)ad);
+}
+#endif
+#ifdef HAVE_BN_SQR_COMBA6
+void
+bn_sqr_comba6(BN_ULONG *rd, const BN_ULONG *ad)
+{
+        if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_ADX) != 0) {
+                bignum_sqr_6_12((uint64_t *)rd, (const uint64_t *)ad);
+                return;
+        }
+        bignum_sqr_6_12_alt((uint64_t *)rd, (const uint64_t *)ad);
 }
 #endif
@@ -117,8 +182,12 @@ bn_sqr_comba4(BN_ULONG *rd, const BN_ULONG *ad)
 void
 bn_sqr_comba8(BN_ULONG *rd, const BN_ULONG *ad)
 {
-        /* XXX - consider using non-alt on CPUs that have the ADX extension. */
+        if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_ADX) != 0) {
-        bignum_sqr_8_16_alt((uint64_t *)rd, (uint64_t *)ad);
+                bignum_sqr_8_16((uint64_t *)rd, (const uint64_t *)ad);
+                return;
+        }
+        bignum_sqr_8_16_alt((uint64_t *)rd, (const uint64_t *)ad);
 }
 #endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bn_arch.h b/src/lib/libcrypto/bn/arch/amd64/bn_arch.h
index 927cd75208..7359f993a7 100644
--- a/src/lib/libcrypto/bn/arch/amd64/bn_arch.h
+++ b/src/lib/libcrypto/bn/arch/amd64/bn_arch.h
@@ -1,4 +1,4 @@
-/*      $OpenBSD: bn_arch.h,v 1.14 2024/03/26 06:09:25 jsing Exp $ */
+/*      $OpenBSD: bn_arch.h,v 1.16 2025/08/14 15:22:54 jsing Exp $ */
 /*
 * Copyright (c) 2023 Joel Sing <jsing@openbsd.org>
 *
@@ -27,13 +27,18 @@
 #define HAVE_BN_DIV_WORDS
+#define HAVE_BN_MOD_ADD_WORDS
+#define HAVE_BN_MOD_SUB_WORDS
 #define HAVE_BN_MUL_ADD_WORDS
 #define HAVE_BN_MUL_COMBA4
+#define HAVE_BN_MUL_COMBA6
 #define HAVE_BN_MUL_COMBA8
 #define HAVE_BN_MUL_WORDS
 #define HAVE_BN_SQR
 #define HAVE_BN_SQR_COMBA4
+#define HAVE_BN_SQR_COMBA6
 #define HAVE_BN_SQR_COMBA8
 #define HAVE_BN_SUB
diff --git a/src/lib/libcrypto/bn/arch/amd64/word_clz.S b/src/lib/libcrypto/bn/arch/amd64/word_clz.S
index 3926fcd4b0..705fbdbbda 100644
--- a/src/lib/libcrypto/bn/arch/amd64/word_clz.S
+++ b/src/lib/libcrypto/bn/arch/amd64/word_clz.S
@@ -1,3 +1,5 @@
+// $OpenBSD: word_clz.S,v 1.7 2025/08/11 14:13:56 jsing Exp $
+//
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
@@ -16,7 +18,7 @@
 // Count leading zero bits in a single word
 // Input a; output function return
 //
-//    extern uint64_t word_clz (uint64_t a);
+//    extern uint64_t word_clz(uint64_t a);
 //
 // Standard x86-64 ABI: RDI = a, returns RAX
 // Microsoft x64 ABI:   RCX = a, returns RAX
@@ -30,7 +32,7 @@
        .text
 S2N_BN_SYMBOL(word_clz):
-        _CET_ENDBR
+        _CET_ENDBR
 #if WINDOWS_ABI
        push    rdi
diff --git a/src/lib/libcrypto/bn/bn_mul.c b/src/lib/libcrypto/bn/bn_mul.c
index 70f6534b8f..a30d05fb02 100644
--- a/src/lib/libcrypto/bn/bn_mul.c
+++ b/src/lib/libcrypto/bn/bn_mul.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: bn_mul.c,v 1.42 2025/08/05 15:06:13 jsing Exp $ */
+/* $OpenBSD: bn_mul.c,v 1.43 2025/08/14 15:15:04 jsing Exp $ */
 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
 * All rights reserved.
 *
@@ -407,6 +407,8 @@ BN_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx)
        if (a->top == 4 && b->top == 4) {
                bn_mul_comba4(rr->d, a->d, b->d);
+        } else if (a->top == 6 && b->top == 6) {
+                bn_mul_comba6(rr->d, a->d, b->d);
        } else if (a->top == 8 && b->top == 8) {
                bn_mul_comba8(rr->d, a->d, b->d);
        } else {
diff --git a/src/lib/libcrypto/bn/bn_sqr.c b/src/lib/libcrypto/bn/bn_sqr.c
index ab1282e3b1..2f7f71f819 100644
--- a/src/lib/libcrypto/bn/bn_sqr.c
+++ b/src/lib/libcrypto/bn/bn_sqr.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: bn_sqr.c,v 1.37 2025/08/05 15:08:13 jsing Exp $ */
+/* $OpenBSD: bn_sqr.c,v 1.38 2025/08/14 15:15:04 jsing Exp $ */
 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
 * All rights reserved.
 *
@@ -326,6 +326,8 @@ BN_sqr(BIGNUM *r, const BIGNUM *a, BN_CTX *ctx)
        if (a->top == 4) {
                bn_sqr_comba4(rr->d, a->d);
+        } else if (a->top == 6) {
+                bn_sqr_comba6(rr->d, a->d);
        } else if (a->top == 8) {
                bn_sqr_comba8(rr->d, a->d);
        } else {
diff --git a/src/lib/libcrypto/bn/s2n_bignum.h b/src/lib/libcrypto/bn/s2n_bignum.h
index ce6e8cdc94..7d77894cdc 100644
--- a/src/lib/libcrypto/bn/s2n_bignum.h
+++ b/src/lib/libcrypto/bn/s2n_bignum.h
@@ -1,3 +1,5 @@
+// $OpenBSD: s2n_bignum.h,v 1.4 2025/08/12 10:01:37 jsing Exp $
+//
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
@@ -34,182 +36,240 @@
 //        throughput, generally offering higher performance there.
 // ----------------------------------------------------------------------------
+#if defined(_MSC_VER) || !defined(__STDC_VERSION__) || __STDC_VERSION__ < 199901L || defined(__STDC_NO_VLA__)
+#define S2N_BIGNUM_STATIC
+#else
+#define S2N_BIGNUM_STATIC static
+#endif
 // Add, z := x + y
 // Inputs x[m], y[n]; outputs function return (carry-out) and z[p]
-extern uint64_t bignum_add (uint64_t p, uint64_t *z, uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
+extern uint64_t bignum_add (uint64_t p, uint64_t *z, uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y);
 // Add modulo p_25519, z := (x + y) mod p_25519, assuming x and y reduced
 // Inputs x[4], y[4]; output z[4]
-extern void bignum_add_p25519 (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]);
+extern void bignum_add_p25519 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
 // Add modulo p_256, z := (x + y) mod p_256, assuming x and y reduced
 // Inputs x[4], y[4]; output z[4]
-extern void bignum_add_p256 (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]);
+extern void bignum_add_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
 // Add modulo p_256k1, z := (x + y) mod p_256k1, assuming x and y reduced
 // Inputs x[4], y[4]; output z[4]
-extern void bignum_add_p256k1 (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]);
+extern void bignum_add_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
 // Add modulo p_384, z := (x + y) mod p_384, assuming x and y reduced
 // Inputs x[6], y[6]; output z[6]
-extern void bignum_add_p384 (uint64_t z[static 6], uint64_t x[static 6], uint64_t y[static 6]);
+extern void bignum_add_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]);
 // Add modulo p_521, z := (x + y) mod p_521, assuming x and y reduced
 // Inputs x[9], y[9]; output z[9]
-extern void bignum_add_p521 (uint64_t z[static 9], uint64_t x[static 9], uint64_t y[static 9]);
+extern void bignum_add_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9], const uint64_t y[S2N_BIGNUM_STATIC 9]);
+// Add modulo p_sm2, z := (x + y) mod p_sm2, assuming x and y reduced
+// Inputs x[4], y[4]; output z[4]
+extern void bignum_add_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
 // Compute "amontification" constant z :== 2^{128k} (congruent mod m)
 // Input m[k]; output z[k]; temporary buffer t[>=k]
-extern void bignum_amontifier (uint64_t k, uint64_t *z, uint64_t *m, uint64_t *t);
+extern void bignum_amontifier (uint64_t k, uint64_t *z, const uint64_t *m, uint64_t *t);
 // Almost-Montgomery multiply, z :== (x * y / 2^{64k}) (congruent mod m)
 // Inputs x[k], y[k], m[k]; output z[k]
-extern void bignum_amontmul (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y, uint64_t *m);
+extern void bignum_amontmul (uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *y, const uint64_t *m);
 // Almost-Montgomery reduce, z :== (x' / 2^{64p}) (congruent mod m)
 // Inputs x[n], m[k], p; output z[k]
-extern void bignum_amontredc (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x, uint64_t *m, uint64_t p);
+extern void bignum_amontredc (uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x, const uint64_t *m, uint64_t p);
 // Almost-Montgomery square, z :== (x^2 / 2^{64k}) (congruent mod m)
 // Inputs x[k], m[k]; output z[k]
-extern void bignum_amontsqr (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *m);
+extern void bignum_amontsqr (uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *m);
 // Convert 4-digit (256-bit) bignum to/from big-endian form
 // Input x[4]; output z[4]
-extern void bignum_bigendian_4 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_bigendian_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Convert 6-digit (384-bit) bignum to/from big-endian form
 // Input x[6]; output z[6]
-extern void bignum_bigendian_6 (uint64_t z[static 6], uint64_t x[static 6]);
+extern void bignum_bigendian_6 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Select bitfield starting at bit n with length l <= 64
 // Inputs x[k], n, l; output function return
-extern uint64_t bignum_bitfield (uint64_t k, uint64_t *x, uint64_t n, uint64_t l);
+extern uint64_t bignum_bitfield (uint64_t k, const uint64_t *x, uint64_t n, uint64_t l);
 // Return size of bignum in bits
 // Input x[k]; output function return
-extern uint64_t bignum_bitsize (uint64_t k, uint64_t *x);
+extern uint64_t bignum_bitsize (uint64_t k, const uint64_t *x);
 // Divide by a single (nonzero) word, z := x / m and return x mod m
 // Inputs x[n], m; outputs function return (remainder) and z[k]
-extern uint64_t bignum_cdiv (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x, uint64_t m);
+extern uint64_t bignum_cdiv (uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x, uint64_t m);
 // Divide by a single word, z := x / m when known to be exact
 // Inputs x[n], m; output z[k]
-extern void bignum_cdiv_exact (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x, uint64_t m);
+extern void bignum_cdiv_exact (uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x, uint64_t m);
 // Count leading zero digits (64-bit words)
 // Input x[k]; output function return
-extern uint64_t bignum_cld (uint64_t k, uint64_t *x);
+extern uint64_t bignum_cld (uint64_t k, const uint64_t *x);
 // Count leading zero bits
 // Input x[k]; output function return
-extern uint64_t bignum_clz (uint64_t k, uint64_t *x);
+extern uint64_t bignum_clz (uint64_t k, const uint64_t *x);
 // Multiply-add with single-word multiplier, z := z + c * y
 // Inputs c, y[n]; outputs function return (carry-out) and z[k]
-extern uint64_t bignum_cmadd (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y);
+extern uint64_t bignum_cmadd (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, const uint64_t *y);
 // Negated multiply-add with single-word multiplier, z := z - c * y
 // Inputs c, y[n]; outputs function return (negative carry-out) and z[k]
-extern uint64_t bignum_cmnegadd (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y);
+extern uint64_t bignum_cmnegadd (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, const uint64_t *y);
 // Find modulus of bignum w.r.t. single nonzero word m, returning x mod m
 // Input x[k], m; output function return
-extern uint64_t bignum_cmod (uint64_t k, uint64_t *x, uint64_t m);
+extern uint64_t bignum_cmod (uint64_t k, const uint64_t *x, uint64_t m);
 // Multiply by a single word, z := c * y
 // Inputs c, y[n]; outputs function return (carry-out) and z[k]
-extern uint64_t bignum_cmul (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y);
+extern uint64_t bignum_cmul (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, const uint64_t *y);
 // Multiply by a single word modulo p_25519, z := (c * x) mod p_25519, assuming x reduced
 // Inputs c, x[4]; output z[4]
-extern void bignum_cmul_p25519 (uint64_t z[static 4], uint64_t c, uint64_t x[static 4]);
+extern void bignum_cmul_p25519 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 4]);
-extern void bignum_cmul_p25519_alt (uint64_t z[static 4], uint64_t c, uint64_t x[static 4]);
+extern void bignum_cmul_p25519_alt (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Multiply by a single word modulo p_256, z := (c * x) mod p_256, assuming x reduced
 // Inputs c, x[4]; output z[4]
-extern void bignum_cmul_p256 (uint64_t z[static 4], uint64_t c, uint64_t x[static 4]);
+extern void bignum_cmul_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 4]);
-extern void bignum_cmul_p256_alt (uint64_t z[static 4], uint64_t c, uint64_t x[static 4]);
+extern void bignum_cmul_p256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Multiply by a single word modulo p_256k1, z := (c * x) mod p_256k1, assuming x reduced
 // Inputs c, x[4]; output z[4]
-extern void bignum_cmul_p256k1 (uint64_t z[static 4], uint64_t c, uint64_t x[static 4]);
+extern void bignum_cmul_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 4]);
-extern void bignum_cmul_p256k1_alt (uint64_t z[static 4], uint64_t c, uint64_t x[static 4]);
+extern void bignum_cmul_p256k1_alt (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Multiply by a single word modulo p_384, z := (c * x) mod p_384, assuming x reduced
 // Inputs c, x[6]; output z[6]
-extern void bignum_cmul_p384 (uint64_t z[static 6], uint64_t c, uint64_t x[static 6]);
+extern void bignum_cmul_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 6]);
-extern void bignum_cmul_p384_alt (uint64_t z[static 6], uint64_t c, uint64_t x[static 6]);
+extern void bignum_cmul_p384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Multiply by a single word modulo p_521, z := (c * x) mod p_521, assuming x reduced
 // Inputs c, x[9]; output z[9]
-extern void bignum_cmul_p521 (uint64_t z[static 9], uint64_t c, uint64_t x[static 9]);
+extern void bignum_cmul_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 9]);
-extern void bignum_cmul_p521_alt (uint64_t z[static 9], uint64_t c, uint64_t x[static 9]);
+extern void bignum_cmul_p521_alt (uint64_t z[S2N_BIGNUM_STATIC 9], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 9]);
+// Multiply by a single word modulo p_sm2, z := (c * x) mod p_sm2, assuming x reduced
+// Inputs c, x[4]; output z[4]
+extern void bignum_cmul_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 4]);
+extern void bignum_cmul_sm2_alt (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Test bignums for coprimality, gcd(x,y) = 1
 // Inputs x[m], y[n]; output function return; temporary buffer t[>=2*max(m,n)]
-extern uint64_t bignum_coprime (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y, uint64_t *t);
+extern uint64_t bignum_coprime (uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y, uint64_t *t);
 // Copy bignum with zero-extension or truncation, z := x
 // Input x[n]; output z[k]
-extern void bignum_copy (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x);
+extern void bignum_copy (uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x);
+// Given table: uint64_t[height*width], copy table[idx*width...(idx+1)*width-1]
+// into z[0..width-1].
+// This function is constant-time with respect to the value of `idx`. This is
+// achieved by reading the whole table and using the bit-masking to get the
+// `idx`-th row.
+// Input table[height*width]; output z[width]
+extern void bignum_copy_row_from_table (uint64_t *z, const uint64_t *table, uint64_t height,
+        uint64_t width, uint64_t idx);
+// Given table: uint64_t[height*width], copy table[idx*width...(idx+1)*width-1]
+// into z[0..width-1]. width must be a multiple of 8.
+// This function is constant-time with respect to the value of `idx`. This is
+// achieved by reading the whole table and using the bit-masking to get the
+// `idx`-th row.
+// Input table[height*width]; output z[width]
+extern void bignum_copy_row_from_table_8n (uint64_t *z, const uint64_t *table,
+        uint64_t height, uint64_t width, uint64_t idx);
+// Given table: uint64_t[height*16], copy table[idx*16...(idx+1)*16-1] into z[0..15].
+// This function is constant-time with respect to the value of `idx`. This is
+// achieved by reading the whole table and using the bit-masking to get the
+// `idx`-th row.
+// Input table[height*16]; output z[16]
+extern void bignum_copy_row_from_table_16 (uint64_t *z, const uint64_t *table,
+        uint64_t height, uint64_t idx);
+// Given table: uint64_t[height*32], copy table[idx*32...(idx+1)*32-1] into z[0..31].
+// This function is constant-time with respect to the value of `idx`. This is
+// achieved by reading the whole table and using the bit-masking to get the
+// `idx`-th row.
+// Input table[height*32]; output z[32]
+extern void bignum_copy_row_from_table_32 (uint64_t *z, const uint64_t *table,
+        uint64_t height, uint64_t idx);
 // Count trailing zero digits (64-bit words)
 // Input x[k]; output function return
-extern uint64_t bignum_ctd (uint64_t k, uint64_t *x);
+extern uint64_t bignum_ctd (uint64_t k, const uint64_t *x);
 // Count trailing zero bits
 // Input x[k]; output function return
-extern uint64_t bignum_ctz (uint64_t k, uint64_t *x);
+extern uint64_t bignum_ctz (uint64_t k, const uint64_t *x);
 // Convert from almost-Montgomery form, z := (x / 2^256) mod p_256
 // Input x[4]; output z[4]
-extern void bignum_deamont_p256 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_deamont_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
-extern void bignum_deamont_p256_alt (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_deamont_p256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Convert from almost-Montgomery form, z := (x / 2^256) mod p_256k1
 // Input x[4]; output z[4]
-extern void bignum_deamont_p256k1 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_deamont_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Convert from almost-Montgomery form, z := (x / 2^384) mod p_384
 // Input x[6]; output z[6]
-extern void bignum_deamont_p384 (uint64_t z[static 6], uint64_t x[static 6]);
+extern void bignum_deamont_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
-extern void bignum_deamont_p384_alt (uint64_t z[static 6], uint64_t x[static 6]);
+extern void bignum_deamont_p384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Convert from almost-Montgomery form z := (x / 2^576) mod p_521
 // Input x[9]; output z[9]
-extern void bignum_deamont_p521 (uint64_t z[static 9], uint64_t x[static 9]);
+extern void bignum_deamont_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
+// Convert from almost-Montgomery form z := (x / 2^256) mod p_sm2
+// Input x[4]; output z[4]
+extern void bignum_deamont_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Convert from (almost-)Montgomery form z := (x / 2^{64k}) mod m
 // Inputs x[k], m[k]; output z[k]
-extern void bignum_demont (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *m);
+extern void bignum_demont (uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *m);
 // Convert from Montgomery form z := (x / 2^256) mod p_256, assuming x reduced
 // Input x[4]; output z[4]
-extern void bignum_demont_p256 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_demont_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
-extern void bignum_demont_p256_alt (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_demont_p256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Convert from Montgomery form z := (x / 2^256) mod p_256k1, assuming x reduced
 // Input x[4]; output z[4]
-extern void bignum_demont_p256k1 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_demont_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Convert from Montgomery form z := (x / 2^384) mod p_384, assuming x reduced
 // Input x[6]; output z[6]
-extern void bignum_demont_p384 (uint64_t z[static 6], uint64_t x[static 6]);
+extern void bignum_demont_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
-extern void bignum_demont_p384_alt (uint64_t z[static 6], uint64_t x[static 6]);
+extern void bignum_demont_p384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Convert from Montgomery form z := (x / 2^576) mod p_521, assuming x reduced
 // Input x[9]; output z[9]
-extern void bignum_demont_p521 (uint64_t z[static 9], uint64_t x[static 9]);
+extern void bignum_demont_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
+// Convert from Montgomery form z := (x / 2^256) mod p_sm2, assuming x reduced
+// Input x[4]; output z[4]
+extern void bignum_demont_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Select digit x[n]
 // Inputs x[k], n; output function return
-extern uint64_t bignum_digit (uint64_t k, uint64_t *x, uint64_t n);
+extern uint64_t bignum_digit (uint64_t k, const uint64_t *x, uint64_t n);
 // Return size of bignum in digits (64-bit word)
 // Input x[k]; output function return
-extern uint64_t bignum_digitsize (uint64_t k, uint64_t *x);
+extern uint64_t bignum_digitsize (uint64_t k, const uint64_t *x);
 // Divide bignum by 10: z' := z div 10, returning remainder z mod 10
 // Inputs z[k]; outputs function return (remainder) and z[k]
@@ -217,294 +277,391 @@ extern uint64_t bignum_divmod10 (uint64_t k, uint64_t *z);
 // Double modulo p_25519, z := (2 * x) mod p_25519, assuming x reduced
 // Input x[4]; output z[4]
-extern void bignum_double_p25519 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_double_p25519 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Double modulo p_256, z := (2 * x) mod p_256, assuming x reduced
 // Input x[4]; output z[4]
-extern void bignum_double_p256 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_double_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Double modulo p_256k1, z := (2 * x) mod p_256k1, assuming x reduced
 // Input x[4]; output z[4]
-extern void bignum_double_p256k1 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_double_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Double modulo p_384, z := (2 * x) mod p_384, assuming x reduced
 // Input x[6]; output z[6]
-extern void bignum_double_p384 (uint64_t z[static 6], uint64_t x[static 6]);
+extern void bignum_double_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Double modulo p_521, z := (2 * x) mod p_521, assuming x reduced
 // Input x[9]; output z[9]
-extern void bignum_double_p521 (uint64_t z[static 9], uint64_t x[static 9]);
+extern void bignum_double_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
+// Double modulo p_sm2, z := (2 * x) mod p_sm2, assuming x reduced
+// Input x[4]; output z[4]
+extern void bignum_double_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Extended Montgomery reduce, returning results in input-output buffer
 // Inputs z[2*k], m[k], w; outputs function return (extra result bit) and z[2*k]
-extern uint64_t bignum_emontredc (uint64_t k, uint64_t *z, uint64_t *m, uint64_t w);
+extern uint64_t bignum_emontredc (uint64_t k, uint64_t *z, const uint64_t *m, uint64_t w);
 // Extended Montgomery reduce in 8-digit blocks, results in input-output buffer
 // Inputs z[2*k], m[k], w; outputs function return (extra result bit) and z[2*k]
-extern uint64_t bignum_emontredc_8n (uint64_t k, uint64_t *z, uint64_t *m, uint64_t w);
+extern uint64_t bignum_emontredc_8n (uint64_t k, uint64_t *z, const uint64_t *m, uint64_t w);
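+// Extended Montgomery reduce in 8-digit blocks, results in input-output buffer;
+// variant of bignum_emontredc_8n that takes a caller-supplied temporary buffer
+// Inputs z[2*k], m[k], w; outputs function return (extra result bit) and z[2*k]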
+// Temporary buffer m_precalc[12*(k/4-1)]
+extern uint64_t bignum_emontredc_8n_cdiff (uint64_t k, uint64_t *z, const uint64_t *m,
+                                          uint64_t w, uint64_t *m_precalc);
 // Test bignums for equality, x = y
 // Inputs x[m], y[n]; output function return
-extern uint64_t bignum_eq (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
+extern uint64_t bignum_eq (uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y);
 // Test bignum for even-ness
 // Input x[k]; output function return
-extern uint64_t bignum_even (uint64_t k, uint64_t *x);
+extern uint64_t bignum_even (uint64_t k, const uint64_t *x);
 // Convert 4-digit (256-bit) bignum from big-endian bytes
 // Input x[32] (bytes); output z[4]
-extern void bignum_frombebytes_4 (uint64_t z[static 4], uint8_t x[static 32]);
+extern void bignum_frombebytes_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint8_t x[S2N_BIGNUM_STATIC 32]);
 // Convert 6-digit (384-bit) bignum from big-endian bytes
 // Input x[48] (bytes); output z[6]
-extern void bignum_frombebytes_6 (uint64_t z[static 6], uint8_t x[static 48]);
+extern void bignum_frombebytes_6 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint8_t x[S2N_BIGNUM_STATIC 48]);
 // Convert 4-digit (256-bit) bignum from little-endian bytes
 // Input x[32] (bytes); output z[4]
-extern void bignum_fromlebytes_4 (uint64_t z[static 4], uint8_t x[static 32]);
+extern void bignum_fromlebytes_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint8_t x[S2N_BIGNUM_STATIC 32]);
 // Convert 6-digit (384-bit) bignum from little-endian bytes
 // Input x[48] (bytes); output z[6]
-extern void bignum_fromlebytes_6 (uint64_t z[static 6], uint8_t x[static 48]);
+extern void bignum_fromlebytes_6 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint8_t x[S2N_BIGNUM_STATIC 48]);
 // Convert little-endian bytes to 9-digit 528-bit bignum
 // Input x[66] (bytes); output z[9]
-extern void bignum_fromlebytes_p521 (uint64_t z[static 9],uint8_t x[static 66]);
+extern void bignum_fromlebytes_p521 (uint64_t z[S2N_BIGNUM_STATIC 9],const uint8_t x[S2N_BIGNUM_STATIC 66]);
 // Compare bignums, x >= y
 // Inputs x[m], y[n]; output function return
-extern uint64_t bignum_ge (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
+extern uint64_t bignum_ge (uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y);
 // Compare bignums, x > y
 // Inputs x[m], y[n]; output function return
-extern uint64_t bignum_gt (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
+extern uint64_t bignum_gt (uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y);
 // Halve modulo p_256, z := (x / 2) mod p_256, assuming x reduced
 // Input x[4]; output z[4]
-extern void bignum_half_p256 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_half_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Halve modulo p_256k1, z := (x / 2) mod p_256k1, assuming x reduced
 // Input x[4]; output z[4]
-extern void bignum_half_p256k1 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_half_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Halve modulo p_384, z := (x / 2) mod p_384, assuming x reduced
 // Input x[6]; output z[6]
-extern void bignum_half_p384 (uint64_t z[static 6], uint64_t x[static 6]);
+extern void bignum_half_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Halve modulo p_521, z := (x / 2) mod p_521, assuming x reduced
 // Input x[9]; output z[9]
-extern void bignum_half_p521 (uint64_t z[static 9], uint64_t x[static 9]);
+extern void bignum_half_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
+// Halve modulo p_sm2, z := (x / 2) mod p_sm2, assuming x reduced
+// Input x[4]; output z[4]
+extern void bignum_half_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
+// Modular inverse modulo p_25519 = 2^255 - 19
+// Input x[4]; output z[4]
+extern void bignum_inv_p25519(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]);
+// Modular inverse modulo p_256 = 2^256 - 2^224 + 2^192 + 2^96 - 1
+// Input x[4]; output z[4]
+extern void bignum_inv_p256(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]);
+// Modular inverse modulo p_384 = 2^384 - 2^128 - 2^96 + 2^32 - 1
+// Input x[6]; output z[6]
+extern void bignum_inv_p384(uint64_t z[S2N_BIGNUM_STATIC 6],const uint64_t x[S2N_BIGNUM_STATIC 6]);
+// Modular inverse modulo p_521 = 2^521 - 1
+// Input x[9]; output z[9]
+extern void bignum_inv_p521(uint64_t z[S2N_BIGNUM_STATIC 9],const uint64_t x[S2N_BIGNUM_STATIC 9]);
+// Modular inverse modulo p_sm2 = 2^256 - 2^224 - 2^96 + 2^64 - 1
+// Input x[4]; output z[4]
+extern void bignum_inv_sm2(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]);
+// Inverse square root modulo p_25519
+// Input x[4]; output function return (Legendre symbol) and z[4]
+extern int64_t bignum_invsqrt_p25519(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]);
+extern int64_t bignum_invsqrt_p25519_alt(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Test bignum for zero-ness, x = 0
 // Input x[k]; output function return
-extern uint64_t bignum_iszero (uint64_t k, uint64_t *x);
+extern uint64_t bignum_iszero (uint64_t k, const uint64_t *x);
 // Multiply z := x * y
 // Inputs x[16], y[16]; output z[32]; temporary buffer t[>=32]
-extern void bignum_kmul_16_32 (uint64_t z[static 32], uint64_t x[static 16], uint64_t y[static 16], uint64_t t[static 32]);
+extern void bignum_kmul_16_32 (uint64_t z[S2N_BIGNUM_STATIC 32], const uint64_t x[S2N_BIGNUM_STATIC 16], const uint64_t y[S2N_BIGNUM_STATIC 16], uint64_t t[S2N_BIGNUM_STATIC 32]);
 // Multiply z := x * y
 // Inputs x[32], y[32]; output z[64]; temporary buffer t[>=96]
-extern void bignum_kmul_32_64 (uint64_t z[static 64], uint64_t x[static 32], uint64_t y[static 32], uint64_t t[static 96]);
+extern void bignum_kmul_32_64 (uint64_t z[S2N_BIGNUM_STATIC 64], const uint64_t x[S2N_BIGNUM_STATIC 32], const uint64_t y[S2N_BIGNUM_STATIC 32], uint64_t t[S2N_BIGNUM_STATIC 96]);
 // Square, z := x^2
 // Input x[16]; output z[32]; temporary buffer t[>=24]
-extern void bignum_ksqr_16_32 (uint64_t z[static 32], uint64_t x[static 16], uint64_t t[static 24]);
+extern void bignum_ksqr_16_32 (uint64_t z[S2N_BIGNUM_STATIC 32], const uint64_t x[S2N_BIGNUM_STATIC 16], uint64_t t[S2N_BIGNUM_STATIC 24]);
 // Square, z := x^2
 // Input x[32]; output z[64]; temporary buffer t[>=72]
-extern void bignum_ksqr_32_64 (uint64_t z[static 64], uint64_t x[static 32], uint64_t t[static 72]);
+extern void bignum_ksqr_32_64 (uint64_t z[S2N_BIGNUM_STATIC 64], const uint64_t x[S2N_BIGNUM_STATIC 32], uint64_t t[S2N_BIGNUM_STATIC 72]);
 // Compare bignums, x <= y
 // Inputs x[m], y[n]; output function return
-extern uint64_t bignum_le (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
+extern uint64_t bignum_le (uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y);
 // Convert 4-digit (256-bit) bignum to/from little-endian form
 // Input x[4]; output z[4]
-extern void bignum_littleendian_4 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_littleendian_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Convert 6-digit (384-bit) bignum to/from little-endian form
 // Input x[6]; output z[6]
-extern void bignum_littleendian_6 (uint64_t z[static 6], uint64_t x[static 6]);
+extern void bignum_littleendian_6 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Compare bignums, x < y
 // Inputs x[m], y[n]; output function return
-extern uint64_t bignum_lt (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
+extern uint64_t bignum_lt (uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y);
 // Multiply-add, z := z + x * y
 // Inputs x[m], y[n]; outputs function return (carry-out) and z[k]
-extern uint64_t bignum_madd (uint64_t k, uint64_t *z, uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
+extern uint64_t bignum_madd (uint64_t k, uint64_t *z, uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y);
+// Multiply-add modulo the order of the curve25519/edwards25519 basepoint
+// Inputs x[4], y[4], c[4]; output z[4]
+extern void bignum_madd_n25519 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4], const uint64_t c[S2N_BIGNUM_STATIC 4]);
+extern void bignum_madd_n25519_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4], const uint64_t c[S2N_BIGNUM_STATIC 4]);
+// Reduce modulo group order, z := x mod m_25519
+// Input x[4]; output z[4]
+extern void bignum_mod_m25519_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
+// Reduce modulo basepoint order, z := x mod n_25519
+// Input x[k]; output z[4]
+extern void bignum_mod_n25519 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t k, const uint64_t *x);
+// Reduce modulo basepoint order, z := x mod n_25519
+// Input x[4]; output z[4]
+extern void bignum_mod_n25519_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Reduce modulo group order, z := x mod n_256
 // Input x[k]; output z[4]
-extern void bignum_mod_n256 (uint64_t z[static 4], uint64_t k, uint64_t *x);
+extern void bignum_mod_n256 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t k, const uint64_t *x);
-extern void bignum_mod_n256_alt (uint64_t z[static 4], uint64_t k, uint64_t *x);
+extern void bignum_mod_n256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t k, const uint64_t *x);
 // Reduce modulo group order, z := x mod n_256
 // Input x[4]; output z[4]
-extern void bignum_mod_n256_4 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_mod_n256_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Reduce modulo group order, z := x mod n_256k1
 // Input x[4]; output z[4]
-extern void bignum_mod_n256k1_4 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_mod_n256k1_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Reduce modulo group order, z := x mod n_384
 // Input x[k]; output z[6]
-extern void bignum_mod_n384 (uint64_t z[static 6], uint64_t k, uint64_t *x);
+extern void bignum_mod_n384 (uint64_t z[S2N_BIGNUM_STATIC 6], uint64_t k, const uint64_t *x);
-extern void bignum_mod_n384_alt (uint64_t z[static 6], uint64_t k, uint64_t *x);
+extern void bignum_mod_n384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], uint64_t k, const uint64_t *x);
 // Reduce modulo group order, z := x mod n_384
 // Input x[6]; output z[6]
-extern void bignum_mod_n384_6 (uint64_t z[static 6], uint64_t x[static 6]);
+extern void bignum_mod_n384_6 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Reduce modulo group order, z := x mod n_521
 // Input x[9]; output z[9]
-extern void bignum_mod_n521_9 (uint64_t z[static 9], uint64_t x[static 9]);
+extern void bignum_mod_n521_9 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
-extern void bignum_mod_n521_9_alt (uint64_t z[static 9], uint64_t x[static 9]);
+extern void bignum_mod_n521_9_alt (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
+// Reduce modulo group order, z := x mod n_sm2
+// Input x[k]; output z[4]
+extern void bignum_mod_nsm2 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t k, const uint64_t *x);
+extern void bignum_mod_nsm2_alt (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t k, const uint64_t *x);
+// Reduce modulo group order, z := x mod n_sm2
+// Input x[4]; output z[4]
+extern void bignum_mod_nsm2_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Reduce modulo field characteristic, z := x mod p_25519
 // Input x[4]; output z[4]
-extern void bignum_mod_p25519_4 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_mod_p25519_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Reduce modulo field characteristic, z := x mod p_256
 // Input x[k]; output z[4]
-extern void bignum_mod_p256 (uint64_t z[static 4], uint64_t k, uint64_t *x);
+extern void bignum_mod_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t k, const uint64_t *x);
-extern void bignum_mod_p256_alt (uint64_t z[static 4], uint64_t k, uint64_t *x);
+extern void bignum_mod_p256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t k, const uint64_t *x);
 // Reduce modulo field characteristic, z := x mod p_256
 // Input x[4]; output z[4]
-extern void bignum_mod_p256_4 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_mod_p256_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Reduce modulo field characteristic, z := x mod p_256k1
 // Input x[4]; output z[4]
-extern void bignum_mod_p256k1_4 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_mod_p256k1_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Reduce modulo field characteristic, z := x mod p_384
 // Input x[k]; output z[6]
-extern void bignum_mod_p384 (uint64_t z[static 6], uint64_t k, uint64_t *x);
+extern void bignum_mod_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], uint64_t k, const uint64_t *x);
-extern void bignum_mod_p384_alt (uint64_t z[static 6], uint64_t k, uint64_t *x);
+extern void bignum_mod_p384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], uint64_t k, const uint64_t *x);
 // Reduce modulo field characteristic, z := x mod p_384
 // Input x[6]; output z[6]
-extern void bignum_mod_p384_6 (uint64_t z[static 6], uint64_t x[static 6]);
+extern void bignum_mod_p384_6 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Reduce modulo field characteristic, z := x mod p_521
 // Input x[9]; output z[9]
-extern void bignum_mod_p521_9 (uint64_t z[static 9], uint64_t x[static 9]);
+extern void bignum_mod_p521_9 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
+// Reduce modulo field characteristic, z := x mod p_sm2
+// Input x[k]; output z[4]
+extern void bignum_mod_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t k, const uint64_t *x);
+// Reduce modulo field characteristic, z := x mod p_sm2
+// Input x[4]; output z[4]
+extern void bignum_mod_sm2_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Add modulo m, z := (x + y) mod m, assuming x and y reduced
 // Inputs x[k], y[k], m[k]; output z[k]
-extern void bignum_modadd (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y, uint64_t *m);
+extern void bignum_modadd (uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *y, const uint64_t *m);
 // Double modulo m, z := (2 * x) mod m, assuming x reduced
 // Inputs x[k], m[k]; output z[k]
-extern void bignum_moddouble (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *m);
+extern void bignum_moddouble (uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *m);
+// Modular exponentiation for arbitrary odd modulus, z := (a^p) mod m
+// Inputs a[k], p[k], m[k]; output z[k]; temporary buffer t[>=3*k]
+extern void bignum_modexp(uint64_t k,uint64_t *z, const uint64_t *a,const uint64_t *p,const uint64_t *m,uint64_t *t);
 // Compute "modification" constant z := 2^{64k} mod m
 // Input m[k]; output z[k]; temporary buffer t[>=k]
-extern void bignum_modifier (uint64_t k, uint64_t *z, uint64_t *m, uint64_t *t);
+extern void bignum_modifier (uint64_t k, uint64_t *z, const uint64_t *m, uint64_t *t);
 // Invert modulo m, z = (1/a) mod b, assuming b is an odd number > 1, a coprime to b
 // Inputs a[k], b[k]; output z[k]; temporary buffer t[>=3*k]
-extern void bignum_modinv (uint64_t k, uint64_t *z, uint64_t *a, uint64_t *b, uint64_t *t);
+extern void bignum_modinv (uint64_t k, uint64_t *z, const uint64_t *a, const uint64_t *b, uint64_t *t);
 // Optionally negate modulo m, z := (-x) mod m (if p nonzero) or z := x (if p zero), assuming x reduced
 // Inputs p, x[k], m[k]; output z[k]
-extern void bignum_modoptneg (uint64_t k, uint64_t *z, uint64_t p, uint64_t *x, uint64_t *m);
+extern void bignum_modoptneg (uint64_t k, uint64_t *z, uint64_t p, const uint64_t *x, const uint64_t *m);
 // Subtract modulo m, z := (x - y) mod m, assuming x and y reduced
 // Inputs x[k], y[k], m[k]; output z[k]
-extern void bignum_modsub (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y, uint64_t *m);
+extern void bignum_modsub (uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *y, const uint64_t *m);
 // Compute "montification" constant z := 2^{128k} mod m
 // Input m[k]; output z[k]; temporary buffer t[>=k]
-extern void bignum_montifier (uint64_t k, uint64_t *z, uint64_t *m, uint64_t *t);
+extern void bignum_montifier (uint64_t k, uint64_t *z, const uint64_t *m, uint64_t *t);
+// Montgomery inverse modulo p_256 = 2^256 - 2^224 + 2^192 + 2^96 - 1
+// Input x[4]; output z[4]
+extern void bignum_montinv_p256(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]);
+// Montgomery inverse modulo p_384 = 2^384 - 2^128 - 2^96 + 2^32 - 1
+// Input x[6]; output z[6]
+extern void bignum_montinv_p384(uint64_t z[S2N_BIGNUM_STATIC 6],const uint64_t x[S2N_BIGNUM_STATIC 6]);
+// Montgomery inverse modulo p_sm2 = 2^256 - 2^224 - 2^96 + 2^64 - 1
+// Input x[4]; output z[4]
+extern void bignum_montinv_sm2(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Montgomery multiply, z := (x * y / 2^{64k}) mod m
 // Inputs x[k], y[k], m[k]; output z[k]
-extern void bignum_montmul (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y, uint64_t *m);
+extern void bignum_montmul (uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *y, const uint64_t *m);
 // Montgomery multiply, z := (x * y / 2^256) mod p_256
 // Inputs x[4], y[4]; output z[4]
-extern void bignum_montmul_p256 (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]);
+extern void bignum_montmul_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
-extern void bignum_montmul_p256_alt (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]);
+extern void bignum_montmul_p256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
 // Montgomery multiply, z := (x * y / 2^256) mod p_256k1
 // Inputs x[4], y[4]; output z[4]
-extern void bignum_montmul_p256k1 (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]);
+extern void bignum_montmul_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
-extern void bignum_montmul_p256k1_alt (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]);
+extern void bignum_montmul_p256k1_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
 // Montgomery multiply, z := (x * y / 2^384) mod p_384
 // Inputs x[6], y[6]; output z[6]
-extern void bignum_montmul_p384 (uint64_t z[static 6], uint64_t x[static 6], uint64_t y[static 6]);
+extern void bignum_montmul_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]);
-extern void bignum_montmul_p384_alt (uint64_t z[static 6], uint64_t x[static 6], uint64_t y[static 6]);
+extern void bignum_montmul_p384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]);
 // Montgomery multiply, z := (x * y / 2^576) mod p_521
 // Inputs x[9], y[9]; output z[9]
-extern void bignum_montmul_p521 (uint64_t z[static 9], uint64_t x[static 9], uint64_t y[static 9]);
+extern void bignum_montmul_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9], const uint64_t y[S2N_BIGNUM_STATIC 9]);
-extern void bignum_montmul_p521_alt (uint64_t z[static 9], uint64_t x[static 9], uint64_t y[static 9]);
+extern void bignum_montmul_p521_alt (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9], const uint64_t y[S2N_BIGNUM_STATIC 9]);
+// Montgomery multiply, z := (x * y / 2^256) mod p_sm2
+// Inputs x[4], y[4]; output z[4]
+extern void bignum_montmul_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
+extern void bignum_montmul_sm2_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
 // Montgomery reduce, z := (x' / 2^{64p}) MOD m
 // Inputs x[n], m[k], p; output z[k]
-extern void bignum_montredc (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x, uint64_t *m, uint64_t p);
+extern void bignum_montredc (uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x, const uint64_t *m, uint64_t p);
 // Montgomery square, z := (x^2 / 2^{64k}) mod m
 // Inputs x[k], m[k]; output z[k]
-extern void bignum_montsqr (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *m);
+extern void bignum_montsqr (uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *m);
 // Montgomery square, z := (x^2 / 2^256) mod p_256
 // Input x[4]; output z[4]
-extern void bignum_montsqr_p256 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_montsqr_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
-extern void bignum_montsqr_p256_alt (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_montsqr_p256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Montgomery square, z := (x^2 / 2^256) mod p_256k1
 // Input x[4]; output z[4]
-extern void bignum_montsqr_p256k1 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_montsqr_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
-extern void bignum_montsqr_p256k1_alt (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_montsqr_p256k1_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Montgomery square, z := (x^2 / 2^384) mod p_384
 // Input x[6]; output z[6]
-extern void bignum_montsqr_p384 (uint64_t z[static 6], uint64_t x[static 6]);
+extern void bignum_montsqr_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
-extern void bignum_montsqr_p384_alt (uint64_t z[static 6], uint64_t x[static 6]);
+extern void bignum_montsqr_p384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Montgomery square, z := (x^2 / 2^576) mod p_521
 // Input x[9]; output z[9]
-extern void bignum_montsqr_p521 (uint64_t z[static 9], uint64_t x[static 9]);
+extern void bignum_montsqr_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
-extern void bignum_montsqr_p521_alt (uint64_t z[static 9], uint64_t x[static 9]);
+extern void bignum_montsqr_p521_alt (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
+// Montgomery square, z := (x^2 / 2^256) mod p_sm2
+// Input x[4]; output z[4]
+extern void bignum_montsqr_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
+extern void bignum_montsqr_sm2_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Multiply z := x * y
 // Inputs x[m], y[n]; output z[k]
-extern void bignum_mul (uint64_t k, uint64_t *z, uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
+extern void bignum_mul (uint64_t k, uint64_t *z, uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y);
 // Multiply z := x * y
 // Inputs x[4], y[4]; output z[8]
-extern void bignum_mul_4_8 (uint64_t z[static 8], uint64_t x[static 4], uint64_t y[static 4]);
+extern void bignum_mul_4_8 (uint64_t z[S2N_BIGNUM_STATIC 8], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
-extern void bignum_mul_4_8_alt (uint64_t z[static 8], uint64_t x[static 4], uint64_t y[static 4]);
+extern void bignum_mul_4_8_alt (uint64_t z[S2N_BIGNUM_STATIC 8], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
 // Multiply z := x * y
 // Inputs x[6], y[6]; output z[12]
-extern void bignum_mul_6_12 (uint64_t z[static 12], uint64_t x[static 6], uint64_t y[static 6]);
+extern void bignum_mul_6_12 (uint64_t z[S2N_BIGNUM_STATIC 12], const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]);
-extern void bignum_mul_6_12_alt (uint64_t z[static 12], uint64_t x[static 6], uint64_t y[static 6]);
+extern void bignum_mul_6_12_alt (uint64_t z[S2N_BIGNUM_STATIC 12], const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]);
 // Multiply z := x * y
 // Inputs x[8], y[8]; output z[16]
-extern void bignum_mul_8_16 (uint64_t z[static 16], uint64_t x[static 8], uint64_t y[static 8]);
+extern void bignum_mul_8_16 (uint64_t z[S2N_BIGNUM_STATIC 16], const uint64_t x[S2N_BIGNUM_STATIC 8], const uint64_t y[S2N_BIGNUM_STATIC 8]);
-extern void bignum_mul_8_16_alt (uint64_t z[static 16], uint64_t x[static 8], uint64_t y[static 8]);
+extern void bignum_mul_8_16_alt (uint64_t z[S2N_BIGNUM_STATIC 16], const uint64_t x[S2N_BIGNUM_STATIC 8], const uint64_t y[S2N_BIGNUM_STATIC 8]);
 // Multiply modulo p_25519, z := (x * y) mod p_25519
 // Inputs x[4], y[4]; output z[4]
-extern void bignum_mul_p25519 (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]);
+extern void bignum_mul_p25519 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
-extern void bignum_mul_p25519_alt (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]);
+extern void bignum_mul_p25519_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
 // Multiply modulo p_256k1, z := (x * y) mod p_256k1
 // Inputs x[4], y[4]; output z[4]
-extern void bignum_mul_p256k1 (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]);
+extern void bignum_mul_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
-extern void bignum_mul_p256k1_alt (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]);
+extern void bignum_mul_p256k1_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
 // Multiply modulo p_521, z := (x * y) mod p_521, assuming x and y reduced
 // Inputs x[9], y[9]; output z[9]
-extern void bignum_mul_p521 (uint64_t z[static 9], uint64_t x[static 9], uint64_t y[static 9]);
+extern void bignum_mul_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9], const uint64_t y[S2N_BIGNUM_STATIC 9]);
-extern void bignum_mul_p521_alt (uint64_t z[static 9], uint64_t x[static 9], uint64_t y[static 9]);
+extern void bignum_mul_p521_alt (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9], const uint64_t y[S2N_BIGNUM_STATIC 9]);
 // Multiply bignum by 10 and add word: z := 10 * z + d
 // Inputs z[k], d; outputs function return (carry) and z[k]
@@ -512,55 +669,59 @@ extern uint64_t bignum_muladd10 (uint64_t k, uint64_t *z, uint64_t d);
 // Multiplex/select z := x (if p nonzero) or z := y (if p zero)
 // Inputs p, x[k], y[k]; output z[k]
-extern void bignum_mux (uint64_t p, uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y);
+extern void bignum_mux (uint64_t p, uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *y);
 // 256-bit multiplex/select z := x (if p nonzero) or z := y (if p zero)
 // Inputs p, x[4], y[4]; output z[4]
-extern void bignum_mux_4 (uint64_t p, uint64_t z[static 4],uint64_t x[static 4], uint64_t y[static 4]);
+extern void bignum_mux_4 (uint64_t p, uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
 // 384-bit multiplex/select z := x (if p nonzero) or z := y (if p zero)
 // Inputs p, x[6], y[6]; output z[6]
-extern void bignum_mux_6 (uint64_t p, uint64_t z[static 6],uint64_t x[static 6], uint64_t y[static 6]);
+extern void bignum_mux_6 (uint64_t p, uint64_t z[S2N_BIGNUM_STATIC 6],const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]);
 // Select element from 16-element table, z := xs[k*i]
 // Inputs xs[16*k], i; output z[k]
-extern void bignum_mux16 (uint64_t k, uint64_t *z, uint64_t *xs, uint64_t i);
+extern void bignum_mux16 (uint64_t k, uint64_t *z, const uint64_t *xs, uint64_t i);
 // Negate modulo p_25519, z := (-x) mod p_25519, assuming x reduced
 // Input x[4]; output z[4]
-extern void bignum_neg_p25519 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_neg_p25519 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Negate modulo p_256, z := (-x) mod p_256, assuming x reduced
 // Input x[4]; output z[4]
-extern void bignum_neg_p256 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_neg_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Negate modulo p_256k1, z := (-x) mod p_256k1, assuming x reduced
 // Input x[4]; output z[4]
-extern void bignum_neg_p256k1 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_neg_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Negate modulo p_384, z := (-x) mod p_384, assuming x reduced
 // Input x[6]; output z[6]
-extern void bignum_neg_p384 (uint64_t z[static 6], uint64_t x[static 6]);
+extern void bignum_neg_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Negate modulo p_521, z := (-x) mod p_521, assuming x reduced
 // Input x[9]; output z[9]
-extern void bignum_neg_p521 (uint64_t z[static 9], uint64_t x[static 9]);
+extern void bignum_neg_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
+// Negate modulo p_sm2, z := (-x) mod p_sm2, assuming x reduced
+// Input x[4]; output z[4]
+extern void bignum_neg_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Negated modular inverse, z := (-1/x) mod 2^{64k}
 // Input x[k]; output z[k]
-extern void bignum_negmodinv (uint64_t k, uint64_t *z, uint64_t *x);
+extern void bignum_negmodinv (uint64_t k, uint64_t *z, const uint64_t *x);
 // Test bignum for nonzero-ness x =/= 0
 // Input x[k]; output function return
-extern uint64_t bignum_nonzero (uint64_t k, uint64_t *x);
+extern uint64_t bignum_nonzero (uint64_t k, const uint64_t *x);
 // Test 256-bit bignum for nonzero-ness x =/= 0
 // Input x[4]; output function return
-extern uint64_t bignum_nonzero_4(uint64_t x[static 4]);
+extern uint64_t bignum_nonzero_4(const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Test 384-bit bignum for nonzero-ness x =/= 0
 // Input x[6]; output function return
-extern uint64_t bignum_nonzero_6(uint64_t x[static 6]);
+extern uint64_t bignum_nonzero_6(const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Normalize bignum in-place by shifting left till top bit is 1
 // Input z[k]; outputs function return (bits shifted left) and z[k]
@@ -568,7 +729,7 @@ extern uint64_t bignum_normalize (uint64_t k, uint64_t *z);
 // Test bignum for odd-ness
 // Input x[k]; output function return
-extern uint64_t bignum_odd (uint64_t k, uint64_t *x);
+extern uint64_t bignum_odd (uint64_t k, const uint64_t *x);
 // Convert single digit to bignum, z := n
 // Input n; output z[k]
@@ -576,39 +737,43 @@ extern void bignum_of_word (uint64_t k, uint64_t *z, uint64_t n);
 // Optionally add, z := x + y (if p nonzero) or z := x (if p zero)
 // Inputs x[k], p, y[k]; outputs function return (carry-out) and z[k]
-extern uint64_t bignum_optadd (uint64_t k, uint64_t *z, uint64_t *x, uint64_t p, uint64_t *y);
+extern uint64_t bignum_optadd (uint64_t k, uint64_t *z, const uint64_t *x, uint64_t p, const uint64_t *y);
 // Optionally negate, z := -x (if p nonzero) or z := x (if p zero)
 // Inputs p, x[k]; outputs function return (nonzero input) and z[k]
-extern uint64_t bignum_optneg (uint64_t k, uint64_t *z, uint64_t p, uint64_t *x);
+extern uint64_t bignum_optneg (uint64_t k, uint64_t *z, uint64_t p, const uint64_t *x);
 // Optionally negate modulo p_25519, z := (-x) mod p_25519 (if p nonzero) or z := x (if p zero), assuming x reduced
 // Inputs p, x[4]; output z[4]
-extern void bignum_optneg_p25519 (uint64_t z[static 4], uint64_t p, uint64_t x[static 4]);
+extern void bignum_optneg_p25519 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t p, const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Optionally negate modulo p_256, z := (-x) mod p_256 (if p nonzero) or z := x (if p zero), assuming x reduced
 // Inputs p, x[4]; output z[4]
-extern void bignum_optneg_p256 (uint64_t z[static 4], uint64_t p, uint64_t x[static 4]);
+extern void bignum_optneg_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t p, const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Optionally negate modulo p_256k1, z := (-x) mod p_256k1 (if p nonzero) or z := x (if p zero), assuming x reduced
 // Inputs p, x[4]; output z[4]
-extern void bignum_optneg_p256k1 (uint64_t z[static 4], uint64_t p, uint64_t x[static 4]);
+extern void bignum_optneg_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t p, const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Optionally negate modulo p_384, z := (-x) mod p_384 (if p nonzero) or z := x (if p zero), assuming x reduced
 // Inputs p, x[6]; output z[6]
-extern void bignum_optneg_p384 (uint64_t z[static 6], uint64_t p, uint64_t x[static 6]);
+extern void bignum_optneg_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], uint64_t p, const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Optionally negate modulo p_521, z := (-x) mod p_521 (if p nonzero) or z := x (if p zero), assuming x reduced
 // Inputs p, x[9]; output z[9]
-extern void bignum_optneg_p521 (uint64_t z[static 9], uint64_t p, uint64_t x[static 9]);
+extern void bignum_optneg_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], uint64_t p, const uint64_t x[S2N_BIGNUM_STATIC 9]);
+// Optionally negate modulo p_sm2, z := (-x) mod p_sm2 (if p nonzero) or z := x (if p zero), assuming x reduced
+// Inputs p, x[4]; output z[4]
+extern void bignum_optneg_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t p, const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Optionally subtract, z := x - y (if p nonzero) or z := x (if p zero)
 // Inputs x[k], p, y[k]; outputs function return (carry-out) and z[k]
-extern uint64_t bignum_optsub (uint64_t k, uint64_t *z, uint64_t *x, uint64_t p, uint64_t *y);
+extern uint64_t bignum_optsub (uint64_t k, uint64_t *z, const uint64_t *x, uint64_t p, const uint64_t *y);
 // Optionally subtract or add, z := x + sgn(p) * y interpreting p as signed
 // Inputs x[k], p, y[k]; outputs function return (carry-out) and z[k]
-extern uint64_t bignum_optsubadd (uint64_t k, uint64_t *z, uint64_t *x, uint64_t p, uint64_t *y);
+extern uint64_t bignum_optsubadd (uint64_t k, uint64_t *z, const uint64_t *x, uint64_t p, const uint64_t *y);
 // Return bignum of power of 2, z := 2^n
 // Input n; output z[k]
@@ -616,216 +781,376 @@ extern void bignum_pow2 (uint64_t k, uint64_t *z, uint64_t n);
 // Shift bignum left by c < 64 bits z := x * 2^c
 // Inputs x[n], c; outputs function return (carry-out) and z[k]
-extern uint64_t bignum_shl_small (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x, uint64_t c);
+extern uint64_t bignum_shl_small (uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x, uint64_t c);
 // Shift bignum right by c < 64 bits z := floor(x / 2^c)
 // Inputs x[n], c; outputs function return (bits shifted out) and z[k]
-extern uint64_t bignum_shr_small (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x, uint64_t c);
+extern uint64_t bignum_shr_small (uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x, uint64_t c);
 // Square, z := x^2
 // Input x[n]; output z[k]
-extern void bignum_sqr (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x);
+extern void bignum_sqr (uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x);
 // Square, z := x^2
 // Input x[4]; output z[8]
-extern void bignum_sqr_4_8 (uint64_t z[static 8], uint64_t x[static 4]);
+extern void bignum_sqr_4_8 (uint64_t z[S2N_BIGNUM_STATIC 8], const uint64_t x[S2N_BIGNUM_STATIC 4]);
-extern void bignum_sqr_4_8_alt (uint64_t z[static 8], uint64_t x[static 4]);
+extern void bignum_sqr_4_8_alt (uint64_t z[S2N_BIGNUM_STATIC 8], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Square, z := x^2
 // Input x[6]; output z[12]
-extern void bignum_sqr_6_12 (uint64_t z[static 12], uint64_t x[static 6]);
+extern void bignum_sqr_6_12 (uint64_t z[S2N_BIGNUM_STATIC 12], const uint64_t x[S2N_BIGNUM_STATIC 6]);
-extern void bignum_sqr_6_12_alt (uint64_t z[static 12], uint64_t x[static 6]);
+extern void bignum_sqr_6_12_alt (uint64_t z[S2N_BIGNUM_STATIC 12], const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Square, z := x^2
 // Input x[8]; output z[16]
-extern void bignum_sqr_8_16 (uint64_t z[static 16], uint64_t x[static 8]);
+extern void bignum_sqr_8_16 (uint64_t z[S2N_BIGNUM_STATIC 16], const uint64_t x[S2N_BIGNUM_STATIC 8]);
-extern void bignum_sqr_8_16_alt (uint64_t z[static 16], uint64_t x[static 8]);
+extern void bignum_sqr_8_16_alt (uint64_t z[S2N_BIGNUM_STATIC 16], const uint64_t x[S2N_BIGNUM_STATIC 8]);
 // Square modulo p_25519, z := (x^2) mod p_25519
 // Input x[4]; output z[4]
-extern void bignum_sqr_p25519 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_sqr_p25519 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
-extern void bignum_sqr_p25519_alt (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_sqr_p25519_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Square modulo p_256k1, z := (x^2) mod p_256k1
 // Input x[4]; output z[4]
-extern void bignum_sqr_p256k1 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_sqr_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
-extern void bignum_sqr_p256k1_alt (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_sqr_p256k1_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Square modulo p_521, z := (x^2) mod p_521, assuming x reduced
 // Input x[9]; output z[9]
-extern void bignum_sqr_p521 (uint64_t z[static 9], uint64_t x[static 9]);
+extern void bignum_sqr_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
-extern void bignum_sqr_p521_alt (uint64_t z[static 9], uint64_t x[static 9]);
+extern void bignum_sqr_p521_alt (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
+// Square root modulo p_25519
+// Input x[4]; output function return (Legendre symbol) and z[4]
+extern int64_t bignum_sqrt_p25519(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]);
+extern int64_t bignum_sqrt_p25519_alt(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Subtract, z := x - y
 // Inputs x[m], y[n]; outputs function return (carry-out) and z[p]
-extern uint64_t bignum_sub (uint64_t p, uint64_t *z, uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
+extern uint64_t bignum_sub (uint64_t p, uint64_t *z, uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y);
 // Subtract modulo p_25519, z := (x - y) mod p_25519, assuming x and y reduced
 // Inputs x[4], y[4]; output z[4]
-extern void bignum_sub_p25519 (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]);
+extern void bignum_sub_p25519 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
 // Subtract modulo p_256, z := (x - y) mod p_256, assuming x and y reduced
 // Inputs x[4], y[4]; output z[4]
-extern void bignum_sub_p256 (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]);
+extern void bignum_sub_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
 // Subtract modulo p_256k1, z := (x - y) mod p_256k1, assuming x and y reduced
 // Inputs x[4], y[4]; output z[4]
-extern void bignum_sub_p256k1 (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]);
+extern void bignum_sub_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
 // Subtract modulo p_384, z := (x - y) mod p_384, assuming x and y reduced
 // Inputs x[6], y[6]; output z[6]
-extern void bignum_sub_p384 (uint64_t z[static 6], uint64_t x[static 6], uint64_t y[static 6]);
+extern void bignum_sub_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]);
 // Subtract modulo p_521, z := (x - y) mod p_521, assuming x and y reduced
 // Inputs x[9], y[9]; output z[9]
-extern void bignum_sub_p521 (uint64_t z[static 9], uint64_t x[static 9], uint64_t y[static 9]);
+extern void bignum_sub_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9], const uint64_t y[S2N_BIGNUM_STATIC 9]);
+// Subtract modulo p_sm2, z := (x - y) mod p_sm2, assuming x and y reduced
+// Inputs x[4], y[4]; output z[4]
+extern void bignum_sub_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]);
 // Convert 4-digit (256-bit) bignum to big-endian bytes
 // Input x[4]; output z[32] (bytes)
-extern void bignum_tobebytes_4 (uint8_t z[static 32], uint64_t x[static 4]);
+extern void bignum_tobebytes_4 (uint8_t z[S2N_BIGNUM_STATIC 32], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Convert 6-digit (384-bit) bignum to big-endian bytes
 // Input x[6]; output z[48] (bytes)
-extern void bignum_tobebytes_6 (uint8_t z[static 48], uint64_t x[static 6]);
+extern void bignum_tobebytes_6 (uint8_t z[S2N_BIGNUM_STATIC 48], const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Convert 4-digit (256-bit) bignum to little-endian bytes
 // Input x[4]; output z[32] (bytes)
-extern void bignum_tolebytes_4 (uint8_t z[static 32], uint64_t x[static 4]);
+extern void bignum_tolebytes_4 (uint8_t z[S2N_BIGNUM_STATIC 32], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Convert 6-digit (384-bit) bignum to little-endian bytes
 // Input x[6]; output z[48] (bytes)
-extern void bignum_tolebytes_6 (uint8_t z[static 48], uint64_t x[static 6]);
+extern void bignum_tolebytes_6 (uint8_t z[S2N_BIGNUM_STATIC 48], const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Convert 9-digit 528-bit bignum to little-endian bytes
 // Input x[6]; output z[66] (bytes)
-extern void bignum_tolebytes_p521 (uint8_t z[static 66], uint64_t x[static 9]);
+extern void bignum_tolebytes_p521 (uint8_t z[S2N_BIGNUM_STATIC 66], const uint64_t x[S2N_BIGNUM_STATIC 9]);
 // Convert to Montgomery form z := (2^256 * x) mod p_256
 // Input x[4]; output z[4]
-extern void bignum_tomont_p256 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_tomont_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
-extern void bignum_tomont_p256_alt (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_tomont_p256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Convert to Montgomery form z := (2^256 * x) mod p_256k1
 // Input x[4]; output z[4]
-extern void bignum_tomont_p256k1 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_tomont_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
-extern void bignum_tomont_p256k1_alt (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_tomont_p256k1_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Convert to Montgomery form z := (2^384 * x) mod p_384
 // Input x[6]; output z[6]
-extern void bignum_tomont_p384 (uint64_t z[static 6], uint64_t x[static 6]);
+extern void bignum_tomont_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
-extern void bignum_tomont_p384_alt (uint64_t z[static 6], uint64_t x[static 6]);
+extern void bignum_tomont_p384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Convert to Montgomery form z := (2^576 * x) mod p_521
 // Input x[9]; output z[9]
-extern void bignum_tomont_p521 (uint64_t z[static 9], uint64_t x[static 9]);
+extern void bignum_tomont_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
+// Convert to Montgomery form z := (2^256 * x) mod p_sm2
+// Input x[4]; output z[4]
+extern void bignum_tomont_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Triple modulo p_256, z := (3 * x) mod p_256
 // Input x[4]; output z[4]
-extern void bignum_triple_p256 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_triple_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
-extern void bignum_triple_p256_alt (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_triple_p256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Triple modulo p_256k1, z := (3 * x) mod p_256k1
 // Input x[4]; output z[4]
-extern void bignum_triple_p256k1 (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_triple_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
-extern void bignum_triple_p256k1_alt (uint64_t z[static 4], uint64_t x[static 4]);
+extern void bignum_triple_p256k1_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Triple modulo p_384, z := (3 * x) mod p_384
 // Input x[6]; output z[6]
-extern void bignum_triple_p384 (uint64_t z[static 6], uint64_t x[static 6]);
+extern void bignum_triple_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
-extern void bignum_triple_p384_alt (uint64_t z[static 6], uint64_t x[static 6]);
+extern void bignum_triple_p384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]);
 // Triple modulo p_521, z := (3 * x) mod p_521, assuming x reduced
 // Input x[9]; output z[9]
-extern void bignum_triple_p521 (uint64_t z[static 9], uint64_t x[static 9]);
+extern void bignum_triple_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
-extern void bignum_triple_p521_alt (uint64_t z[static 9], uint64_t x[static 9]);
+extern void bignum_triple_p521_alt (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]);
+// Triple modulo p_sm2, z := (3 * x) mod p_sm2
+// Input x[4]; output z[4]
+extern void bignum_triple_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
+extern void bignum_triple_sm2_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]);
 // Montgomery ladder step for curve25519
 // Inputs point[8], pp[16], b; output rr[16]
-extern void curve25519_ladderstep(uint64_t rr[16],uint64_t point[8],uint64_t pp[16],uint64_t b);
+extern void curve25519_ladderstep(uint64_t rr[16],const uint64_t point[8],const uint64_t pp[16],uint64_t b);
-extern void curve25519_ladderstep_alt(uint64_t rr[16],uint64_t point[8],uint64_t pp[16],uint64_t b);
+extern void curve25519_ladderstep_alt(uint64_t rr[16],const uint64_t point[8],const uint64_t pp[16],uint64_t b);
 // Projective scalar multiplication, x coordinate only, for curve25519
 // Inputs scalar[4], point[4]; output res[8]
-extern void curve25519_pxscalarmul(uint64_t res[static 8],uint64_t scalar[static 4],uint64_t point[static 4]);
+extern void curve25519_pxscalarmul(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 4]);
-extern void curve25519_pxscalarmul_alt(uint64_t res[static 8],uint64_t scalar[static 4],uint64_t point[static 4]);
+extern void curve25519_pxscalarmul_alt(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 4]);
 // x25519 function for curve25519
 // Inputs scalar[4], point[4]; output res[4]
-extern void curve25519_x25519(uint64_t res[static 4],uint64_t scalar[static 4],uint64_t point[static 4]);
+extern void curve25519_x25519(uint64_t res[S2N_BIGNUM_STATIC 4],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 4]);
-extern void curve25519_x25519_alt(uint64_t res[static 4],uint64_t scalar[static 4],uint64_t point[static 4]);
+extern void curve25519_x25519_alt(uint64_t res[S2N_BIGNUM_STATIC 4],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 4]);
+// x25519 function for curve25519 (byte array arguments)
+// Inputs scalar[32] (bytes), point[32] (bytes); output res[32] (bytes)
+extern void curve25519_x25519_byte(uint8_t res[S2N_BIGNUM_STATIC 32],const uint8_t scalar[S2N_BIGNUM_STATIC 32],const uint8_t point[S2N_BIGNUM_STATIC 32]);
+extern void curve25519_x25519_byte_alt(uint8_t res[S2N_BIGNUM_STATIC 32],const uint8_t scalar[S2N_BIGNUM_STATIC 32],const uint8_t point[S2N_BIGNUM_STATIC 32]);
 // x25519 function for curve25519 on base element 9
 // Input scalar[4]; output res[4]
-extern void curve25519_x25519base(uint64_t res[static 4],uint64_t scalar[static 4]);
+extern void curve25519_x25519base(uint64_t res[S2N_BIGNUM_STATIC 4],const uint64_t scalar[S2N_BIGNUM_STATIC 4]);
-extern void curve25519_x25519base_alt(uint64_t res[static 4],uint64_t scalar[static 4]);
+extern void curve25519_x25519base_alt(uint64_t res[S2N_BIGNUM_STATIC 4],const uint64_t scalar[S2N_BIGNUM_STATIC 4]);
+// x25519 function for curve25519 on base element 9 (byte array arguments)
+// Input scalar[32] (bytes); output res[32] (bytes)
+extern void curve25519_x25519base_byte(uint8_t res[S2N_BIGNUM_STATIC 32],const uint8_t scalar[S2N_BIGNUM_STATIC 32]);
+extern void curve25519_x25519base_byte_alt(uint8_t res[S2N_BIGNUM_STATIC 32],const uint8_t scalar[S2N_BIGNUM_STATIC 32]);
+// Decode compressed 256-bit form of edwards25519 point
+// Input c[32] (bytes); output function return and z[8]
+extern uint64_t edwards25519_decode(uint64_t z[S2N_BIGNUM_STATIC 8], const uint8_t c[S2N_BIGNUM_STATIC 32]);
+extern uint64_t edwards25519_decode_alt(uint64_t z[S2N_BIGNUM_STATIC 8], const uint8_t c[S2N_BIGNUM_STATIC 32]);
+// Encode edwards25519 point into compressed form as 256-bit number
+// Input p[8]; output z[32] (bytes)
+extern void edwards25519_encode(uint8_t z[S2N_BIGNUM_STATIC 32], const uint64_t p[S2N_BIGNUM_STATIC 8]);
 // Extended projective addition for edwards25519
 // Inputs p1[16], p2[16]; output p3[16]
-extern void edwards25519_epadd(uint64_t p3[static 16],uint64_t p1[static 16],uint64_t p2[static 16]);
+extern void edwards25519_epadd(uint64_t p3[S2N_BIGNUM_STATIC 16],const uint64_t p1[S2N_BIGNUM_STATIC 16],const uint64_t p2[S2N_BIGNUM_STATIC 16]);
-extern void edwards25519_epadd_alt(uint64_t p3[static 16],uint64_t p1[static 16],uint64_t p2[static 16]);
+extern void edwards25519_epadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 16],const uint64_t p1[S2N_BIGNUM_STATIC 16],const uint64_t p2[S2N_BIGNUM_STATIC 16]);
 // Extended projective doubling for edwards25519
 // Inputs p1[12]; output p3[16]
-extern void edwards25519_epdouble(uint64_t p3[static 16],uint64_t p1[static 12]);
+extern void edwards25519_epdouble(uint64_t p3[S2N_BIGNUM_STATIC 16],const uint64_t p1[S2N_BIGNUM_STATIC 12]);
-extern void edwards25519_epdouble_alt(uint64_t p3[static 16],uint64_t p1[static 12]);
+extern void edwards25519_epdouble_alt(uint64_t p3[S2N_BIGNUM_STATIC 16],const uint64_t p1[S2N_BIGNUM_STATIC 12]);
 // Projective doubling for edwards25519
 // Inputs p1[12]; output p3[12]
-extern void edwards25519_pdouble(uint64_t p3[static 12],uint64_t p1[static 12]);
+extern void edwards25519_pdouble(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12]);
-extern void edwards25519_pdouble_alt(uint64_t p3[static 12],uint64_t p1[static 12]);
+extern void edwards25519_pdouble_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12]);
 // Extended projective + precomputed mixed addition for edwards25519
 // Inputs p1[16], p2[12]; output p3[16]
-extern void edwards25519_pepadd(uint64_t p3[static 16],uint64_t p1[static 16],uint64_t p2[static 12]);
+extern void edwards25519_pepadd(uint64_t p3[S2N_BIGNUM_STATIC 16],const uint64_t p1[S2N_BIGNUM_STATIC 16],const uint64_t p2[S2N_BIGNUM_STATIC 12]);
-extern void edwards25519_pepadd_alt(uint64_t p3[static 16],uint64_t p1[static 16],uint64_t p2[static 12]);
+extern void edwards25519_pepadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 16],const uint64_t p1[S2N_BIGNUM_STATIC 16],const uint64_t p2[S2N_BIGNUM_STATIC 12]);
+// Scalar multiplication by standard basepoint for edwards25519 (Ed25519)
+// Input scalar[4]; output res[8]
+extern void edwards25519_scalarmulbase(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4]);
+extern void edwards25519_scalarmulbase_alt(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4]);
+// Double scalar multiplication for edwards25519, fresh and base point
+// Input scalar[4], point[8], bscalar[4]; output res[8]
+extern void edwards25519_scalarmuldouble(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4], const uint64_t point[S2N_BIGNUM_STATIC 8],const uint64_t bscalar[S2N_BIGNUM_STATIC 4]);
+extern void edwards25519_scalarmuldouble_alt(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4], const uint64_t point[S2N_BIGNUM_STATIC 8],const uint64_t bscalar[S2N_BIGNUM_STATIC 4]);
+// Scalar product of 2-element polynomial vectors in NTT domain, with mulcache
+// Inputs a[512], b[512], bt[256] (signed 16-bit words); output r[256] (signed 16-bit words)
+extern void mlkem_basemul_k2(int16_t r[S2N_BIGNUM_STATIC 256],const int16_t a[S2N_BIGNUM_STATIC 512],const int16_t b[S2N_BIGNUM_STATIC 512],const int16_t bt[S2N_BIGNUM_STATIC 256]);
+// Scalar product of 3-element polynomial vectors in NTT domain, with mulcache
+// Inputs a[768], b[768], bt[384] (signed 16-bit words); output r[256] (signed 16-bit words)
+extern void mlkem_basemul_k3(int16_t r[S2N_BIGNUM_STATIC 256],const int16_t a[S2N_BIGNUM_STATIC 768],const int16_t b[S2N_BIGNUM_STATIC 768],const int16_t bt[S2N_BIGNUM_STATIC 384]);
+// Scalar product of 4-element polynomial vectors in NTT domain, with mulcache
+// Inputs a[1024], b[1024], bt[512] (signed 16-bit words); output r[256] (signed 16-bit words)
+extern void mlkem_basemul_k4(int16_t r[S2N_BIGNUM_STATIC 256],const int16_t a[S2N_BIGNUM_STATIC 1024],const int16_t b[S2N_BIGNUM_STATIC 1024],const int16_t bt[S2N_BIGNUM_STATIC 512]);
+// Inverse number-theoretic transform from ML-KEM
+// Input a[256] (signed 16-bit words), z_01234[80] (signed 16-bit words), z_56[384] (signed 16-bit words); output a[256] (signed 16-bit words)
+extern void mlkem_intt(int16_t a[S2N_BIGNUM_STATIC 256],const int16_t z_01234[S2N_BIGNUM_STATIC 80],const int16_t z_56[S2N_BIGNUM_STATIC 384]);
+// Precompute the mulcache data for a polynomial in the NTT domain
+// Inputs a[256], z[128] and t[128] (signed 16-bit words); output x[128] (signed 16-bit words)
+extern void mlkem_mulcache_compute(int16_t x[S2N_BIGNUM_STATIC 128],const int16_t a[S2N_BIGNUM_STATIC 256],const int16_t z[S2N_BIGNUM_STATIC 128],const int16_t t[S2N_BIGNUM_STATIC 128]);
+// Forward number-theoretic transform from ML-KEM
+// Input a[256] (signed 16-bit words), z_01234[80] (signed 16-bit words), z_56[384] (signed 16-bit words); output a[256] (signed 16-bit words)
+extern void mlkem_ntt(int16_t a[S2N_BIGNUM_STATIC 256],const int16_t z_01234[S2N_BIGNUM_STATIC 80],const int16_t z_56[S2N_BIGNUM_STATIC 384]);
+// Canonical modular reduction of polynomial coefficients for ML-KEM
+// Input a[256] (signed 16-bit words); output a[256] (signed 16-bit words)
+extern void mlkem_reduce(int16_t a[S2N_BIGNUM_STATIC 256]);
+// Pack ML-KEM polynomial coefficients as 12-bit numbers
+// Input a[256] (signed 16-bit words); output r[384] (bytes)
+extern void mlkem_tobytes(uint8_t r[S2N_BIGNUM_STATIC 384],const int16_t a[S2N_BIGNUM_STATIC 256]);
+// Conversion of ML-KEM polynomial coefficients to Montgomery form
+// Input a[256] (signed 16-bit words); output a[256] (signed 16-bit words)
+extern void mlkem_tomont(int16_t a[S2N_BIGNUM_STATIC 256]);
+// Uniform rejection sampling for ML-KEM
+// Inputs *buf (unsigned bytes), buflen, table (unsigned bytes); outputs r[256] (signed 16-bit words) and function return
+extern uint64_t mlkem_rej_uniform_VARIABLE_TIME(int16_t r[S2N_BIGNUM_STATIC 256],const uint8_t *buf,uint64_t buflen,const uint8_t *table);
 // Point addition on NIST curve P-256 in Montgomery-Jacobian coordinates
 // Inputs p1[12], p2[12]; output p3[12]
-extern void p256_montjadd(uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 12]);
+extern void p256_montjadd(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 12]);
+extern void p256_montjadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 12]);
 // Point doubling on NIST curve P-256 in Montgomery-Jacobian coordinates
 // Inputs p1[12]; output p3[12]
-extern void p256_montjdouble(uint64_t p3[static 12],uint64_t p1[static 12]);
+extern void p256_montjdouble(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12]);
+extern void p256_montjdouble_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12]);
 // Point mixed addition on NIST curve P-256 in Montgomery-Jacobian coordinates
 // Inputs p1[12], p2[8]; output p3[12]
-extern void p256_montjmixadd(uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 8]);
+extern void p256_montjmixadd(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 8]);
+extern void p256_montjmixadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 8]);
+// Montgomery-Jacobian form scalar multiplication for P-256
+// Input scalar[4], point[12]; output res[12]
+extern void p256_montjscalarmul(uint64_t res[S2N_BIGNUM_STATIC 12],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 12]);
+extern void p256_montjscalarmul_alt(uint64_t res[S2N_BIGNUM_STATIC 12],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 12]);
+// Scalar multiplication for NIST curve P-256
+// Input scalar[4], point[8]; output res[8]
+extern void p256_scalarmul(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 8]);
+extern void p256_scalarmul_alt(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 8]);
+// Scalar multiplication for precomputed point on NIST curve P-256
+// Input scalar[4], blocksize, table[]; output res[8]
+extern void p256_scalarmulbase(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4],uint64_t blocksize,const uint64_t *table);
+extern void p256_scalarmulbase_alt(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4],uint64_t blocksize,const uint64_t *table);
 // Point addition on NIST curve P-384 in Montgomery-Jacobian coordinates
 // Inputs p1[18], p2[18]; output p3[18]
-extern void p384_montjadd(uint64_t p3[static 18],uint64_t p1[static 18],uint64_t p2[static 18]);
+extern void p384_montjadd(uint64_t p3[S2N_BIGNUM_STATIC 18],const uint64_t p1[S2N_BIGNUM_STATIC 18],const uint64_t p2[S2N_BIGNUM_STATIC 18]);
+extern void p384_montjadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 18],const uint64_t p1[S2N_BIGNUM_STATIC 18],const uint64_t p2[S2N_BIGNUM_STATIC 18]);
 // Point doubling on NIST curve P-384 in Montgomery-Jacobian coordinates
 // Inputs p1[18]; output p3[18]
-extern void p384_montjdouble(uint64_t p3[static 18],uint64_t p1[static 18]);
+extern void p384_montjdouble(uint64_t p3[S2N_BIGNUM_STATIC 18],const uint64_t p1[S2N_BIGNUM_STATIC 18]);
+extern void p384_montjdouble_alt(uint64_t p3[S2N_BIGNUM_STATIC 18],const uint64_t p1[S2N_BIGNUM_STATIC 18]);
 // Point mixed addition on NIST curve P-384 in Montgomery-Jacobian coordinates
 // Inputs p1[18], p2[12]; output p3[18]
-extern void p384_montjmixadd(uint64_t p3[static 18],uint64_t p1[static 18],uint64_t p2[static 12]);
+extern void p384_montjmixadd(uint64_t p3[S2N_BIGNUM_STATIC 18],const uint64_t p1[S2N_BIGNUM_STATIC 18],const uint64_t p2[S2N_BIGNUM_STATIC 12]);
+extern void p384_montjmixadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 18],const uint64_t p1[S2N_BIGNUM_STATIC 18],const uint64_t p2[S2N_BIGNUM_STATIC 12]);
+// Montgomery-Jacobian form scalar multiplication for P-384
+// Input scalar[6], point[18]; output res[18]
+extern void p384_montjscalarmul(uint64_t res[S2N_BIGNUM_STATIC 18],const uint64_t scalar[S2N_BIGNUM_STATIC 6],const uint64_t point[S2N_BIGNUM_STATIC 18]);
+extern void p384_montjscalarmul_alt(uint64_t res[S2N_BIGNUM_STATIC 18],const uint64_t scalar[S2N_BIGNUM_STATIC 6],const uint64_t point[S2N_BIGNUM_STATIC 18]);
 // Point addition on NIST curve P-521 in Jacobian coordinates
 // Inputs p1[27], p2[27]; output p3[27]
-extern void p521_jadd(uint64_t p3[static 27],uint64_t p1[static 27],uint64_t p2[static 27]);
+extern void p521_jadd(uint64_t p3[S2N_BIGNUM_STATIC 27],const uint64_t p1[S2N_BIGNUM_STATIC 27],const uint64_t p2[S2N_BIGNUM_STATIC 27]);
+extern void p521_jadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 27],const uint64_t p1[S2N_BIGNUM_STATIC 27],const uint64_t p2[S2N_BIGNUM_STATIC 27]);
 // Point doubling on NIST curve P-521 in Jacobian coordinates
 // Input p1[27]; output p3[27]
-extern void p521_jdouble(uint64_t p3[static 27],uint64_t p1[static 27]);
+extern void p521_jdouble(uint64_t p3[S2N_BIGNUM_STATIC 27],const uint64_t p1[S2N_BIGNUM_STATIC 27]);
+extern void p521_jdouble_alt(uint64_t p3[S2N_BIGNUM_STATIC 27],const uint64_t p1[S2N_BIGNUM_STATIC 27]);
 // Point mixed addition on NIST curve P-521 in Jacobian coordinates
 // Inputs p1[27], p2[18]; output p3[27]
-extern void p521_jmixadd(uint64_t p3[static 27],uint64_t p1[static 27],uint64_t p2[static 18]);
+extern void p521_jmixadd(uint64_t p3[S2N_BIGNUM_STATIC 27],const uint64_t p1[S2N_BIGNUM_STATIC 27],const uint64_t p2[S2N_BIGNUM_STATIC 18]);
+extern void p521_jmixadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 27],const uint64_t p1[S2N_BIGNUM_STATIC 27],const uint64_t p2[S2N_BIGNUM_STATIC 18]);
+// Jacobian form scalar multiplication for P-521
+// Input scalar[9], point[27]; output res[27]
+extern void p521_jscalarmul(uint64_t res[S2N_BIGNUM_STATIC 27],const uint64_t scalar[S2N_BIGNUM_STATIC 9],const uint64_t point[S2N_BIGNUM_STATIC 27]);
+extern void p521_jscalarmul_alt(uint64_t res[S2N_BIGNUM_STATIC 27],const uint64_t scalar[S2N_BIGNUM_STATIC 9],const uint64_t point[S2N_BIGNUM_STATIC 27]);
 // Point addition on SECG curve secp256k1 in Jacobian coordinates
 // Inputs p1[12], p2[12]; output p3[12]
-extern void secp256k1_jadd(uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 12]);
+extern void secp256k1_jadd(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 12]);
+extern void secp256k1_jadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 12]);
 // Point doubling on SECG curve secp256k1 in Jacobian coordinates
 // Input p1[12]; output p3[12]
-extern void secp256k1_jdouble(uint64_t p3[static 12],uint64_t p1[static 12]);
+extern void secp256k1_jdouble(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12]);
+extern void secp256k1_jdouble_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12]);
 // Point mixed addition on SECG curve secp256k1 in Jacobian coordinates
 // Inputs p1[12], p2[8]; output p3[12]
-extern void secp256k1_jmixadd(uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 8]);
+extern void secp256k1_jmixadd(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 8]);
+extern void secp256k1_jmixadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 8]);
+// Keccak-f1600 permutation for SHA3
+// Inputs a[25], rc[24]; output a[25]
+extern void sha3_keccak_f1600(uint64_t a[S2N_BIGNUM_STATIC 25],const uint64_t rc[S2N_BIGNUM_STATIC 24]);
+extern void sha3_keccak_f1600_alt(uint64_t a[S2N_BIGNUM_STATIC 25],const uint64_t rc[S2N_BIGNUM_STATIC 24]);
+// Batched 2-way Keccak-f1600 permutation for SHA3
+// Inputs a[50], rc[24]; output a[50]
+extern void sha3_keccak2_f1600(uint64_t a[S2N_BIGNUM_STATIC 50],const uint64_t rc[S2N_BIGNUM_STATIC 24]);
+extern void sha3_keccak2_f1600_alt(uint64_t a[S2N_BIGNUM_STATIC 50],const uint64_t rc[S2N_BIGNUM_STATIC 24]);
+// Batched 4-way Keccak-f1600 permutation for SHA3
+// Inputs a[100], rc[24]; output a[100]
+extern void sha3_keccak4_f1600(uint64_t a[S2N_BIGNUM_STATIC 100],const uint64_t rc[S2N_BIGNUM_STATIC 24]);
+extern void sha3_keccak4_f1600_alt(uint64_t a[S2N_BIGNUM_STATIC 100],const uint64_t rc[S2N_BIGNUM_STATIC 24]);
+extern void sha3_keccak4_f1600_alt2(uint64_t a[S2N_BIGNUM_STATIC 100],const uint64_t rc[S2N_BIGNUM_STATIC 24]);
+// Point addition on CC curve SM2 in Montgomery-Jacobian coordinates
+// Inputs p1[12], p2[12]; output p3[12]
+extern void sm2_montjadd(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 12]);
+extern void sm2_montjadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 12]);
+// Point doubling on CC curve SM2 in Montgomery-Jacobian coordinates
+// Inputs p1[12]; output p3[12]
+extern void sm2_montjdouble(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12]);
+extern void sm2_montjdouble_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12]);
+// Point mixed addition on CC curve SM2 in Montgomery-Jacobian coordinates
+// Inputs p1[12], p2[8]; output p3[12]
+extern void sm2_montjmixadd(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 8]);
+extern void sm2_montjmixadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 8]);
+// Montgomery-Jacobian form scalar multiplication for CC curve SM2
+// Input scalar[4], point[12]; output res[12]
+extern void sm2_montjscalarmul(uint64_t res[S2N_BIGNUM_STATIC 12],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 12]);
+extern void sm2_montjscalarmul_alt(uint64_t res[S2N_BIGNUM_STATIC 12],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 12]);
 // Reverse the bytes in a single word
 // Input a; output function return
@@ -839,6 +1164,10 @@ extern uint64_t word_clz (uint64_t a);
 // Input a; output function return
 extern uint64_t word_ctz (uint64_t a);
+// Perform 59 "divstep" iterations and return signed matrix of updates
+// Inputs d, f, g; output m[2][2] and function return
+extern int64_t word_divstep59(int64_t m[2][2],int64_t d,uint64_t f,uint64_t g);
 // Return maximum of two unsigned 64-bit words
 // Inputs a, b; output function return
 extern uint64_t word_max (uint64_t a, uint64_t b);
@@ -851,6 +1180,10 @@ extern uint64_t word_min (uint64_t a, uint64_t b);
 // Input a; output function return
 extern uint64_t word_negmodinv (uint64_t a);
+// Count number of set bits in a single 64-bit word (population count)
+// Input a; output function return
+extern uint64_t word_popcount (uint64_t a);
 // Single-word reciprocal, 2^64 + ret = ceil(2^128/a) - 1 if MSB of "a" is set
 // Input a; output function return
 extern uint64_t word_recip (uint64_t a);
diff --git a/src/lib/libcrypto/bn/s2n_bignum_internal.h b/src/lib/libcrypto/bn/s2n_bignum_internal.h
index b82db7d019..37eebb4fd6 100644
--- a/src/lib/libcrypto/bn/s2n_bignum_internal.h
+++ b/src/lib/libcrypto/bn/s2n_bignum_internal.h
@@ -1,3 +1,5 @@
+// $OpenBSD: s2n_bignum_internal.h,v 1.5 2025/08/12 10:01:37 jsing Exp $
+//
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
@@ -14,14 +16,14 @@
 #ifdef __APPLE__
 #   define S2N_BN_SYMBOL(NAME) _##NAME
+#   if defined(__AARCH64EL__) || defined(__ARMEL__)
+#     define __LF %%
+#   else
+#     define __LF ;
+#   endif
 #else
 #   define S2N_BN_SYMBOL(name) name
-#endif
+#   define __LF ;
-#ifdef __CET__
-#   include <cet.h>
-#else
-#   define _CET_ENDBR
 #endif
 #define S2N_BN_SYM_VISIBILITY_DIRECTIVE(name) .globl S2N_BN_SYMBOL(name)
@@ -34,3 +36,24 @@
 #else
 #   define S2N_BN_SYM_PRIVACY_DIRECTIVE(name)  /* NO-OP: S2N_BN_SYM_PRIVACY_DIRECTIVE */
 #endif
+// Enable indirect branch tracking support unless explicitly disabled
+// with -DNO_IBT. If the platform supports CET, simply inherit this from
+// the usual header. Otherwise manually define _CET_ENDBR, used at each
+// x86 entry point, to be the ENDBR64 instruction, with an explicit byte
+// sequence for compilers/assemblers that don't know about it. Note that
+// it is safe to use ENDBR64 on all platforms, since the encoding is by
+// design interpreted as a NOP on all pre-CET x86_64 processors. The only
+// downside is a small increase in code size and potentially a modest
+// slowdown from executing one more instruction.
+#if NO_IBT
+#   if defined(_CET_ENDBR)
+#     error "The s2n-bignum build option NO_IBT was configured, but _CET_ENDBR is defined in this compilation unit. That is weird, so failing the build."
+#   endif
+#   define _CET_ENDBR
+#elif defined(__CET__)
+#   include <cet.h>
+#elif !defined(_CET_ENDBR)
+#   define _CET_ENDBR .byte 0xf3,0x0f,0x1e,0xfa
+#endif
diff --git a/src/lib/libcrypto/cert.pem b/src/lib/libcrypto/cert.pem
index a7fd3519fb..aadf2deb9b 100644
--- a/src/lib/libcrypto/cert.pem
+++ b/src/lib/libcrypto/cert.pem
@@ -1,4 +1,4 @@
-# $OpenBSD: cert.pem,v 1.31 2025/03/16 07:44:35 tb Exp $
+# $OpenBSD: cert.pem,v 1.32 2025/08/06 09:45:53 sthen Exp $
 ### /C=ES/CN=Autoridad de Certificacion Firmaprofesional CIF A62634068
 === /C=ES/CN=Autoridad de Certificacion Firmaprofesional CIF A62634068
@@ -960,49 +960,6 @@ AgEGMAoGCCqGSM49BAMDA2gAMGUCMBq8W9f+qdJUDkpd0m2xQNz0Q9XSSpkZElaA
 43j4ptZLvZuHjw/l1lOWqzzIQNph91Oj9w==
 -----END CERTIFICATE-----
-### Baltimore
-=== /C=IE/O=Baltimore/OU=CyberTrust/CN=Baltimore CyberTrust Root
-Certificate:
-    Data:
-        Version: 3 (0x2)
-        Serial Number: 33554617 (0x20000b9)
-    Signature Algorithm: sha1WithRSAEncryption
-        Validity
-            Not Before: May 12 18:46:00 2000 GMT
-            Not After : May 12 23:59:00 2025 GMT
-        Subject: C=IE, O=Baltimore, OU=CyberTrust, CN=Baltimore CyberTrust Root
-        X509v3 extensions:
-            X509v3 Subject Key Identifier:
-                E5:9D:59:30:82:47:58:CC:AC:FA:08:54:36:86:7B:3A:B5:04:4D:F0
-            X509v3 Basic Constraints: critical
-                CA:TRUE, pathlen:3
-            X509v3 Key Usage: critical
-                Certificate Sign, CRL Sign
-SHA1 Fingerprint=D4:DE:20:D0:5E:66:FC:53:FE:1A:50:88:2C:78:DB:28:52:CA:E4:74
-SHA256 Fingerprint=16:AF:57:A9:F6:76:B0:AB:12:60:95:AA:5E:BA:DE:F2:2A:B3:11:19:D6:44:AC:95:CD:4B:93:DB:F3:F2:6A:EB
-----BEGIN CERTIFICATE-----
-MIIDdzCCAl+gAwIBAgIEAgAAuTANBgkqhkiG9w0BAQUFADBaMQswCQYDVQQGEwJJ
-RTESMBAGA1UEChMJQmFsdGltb3JlMRMwEQYDVQQLEwpDeWJlclRydXN0MSIwIAYD
-VQQDExlCYWx0aW1vcmUgQ3liZXJUcnVzdCBSb290MB4XDTAwMDUxMjE4NDYwMFoX
-DTI1MDUxMjIzNTkwMFowWjELMAkGA1UEBhMCSUUxEjAQBgNVBAoTCUJhbHRpbW9y
-ZTETMBEGA1UECxMKQ3liZXJUcnVzdDEiMCAGA1UEAxMZQmFsdGltb3JlIEN5YmVy
-VHJ1c3QgUm9vdDCCASIwDQYJKoZIhvcNAQEBBQADggEPADCCAQoCggEBAKMEuyKr
-mD1X6CZymrV51Cni4eiVgLGw41uOKymaZN+hXe2wCQVt2yguzmKiYv60iNoS6zjr
-IZ3AQSsBUnuId9Mcj8e6uYi1agnnc+gRQKfRzMpijS3ljwumUNKoUMMo6vWrJYeK
-mpYcqWe4PwzV9/lSEy/CG9VwcPCPwBLKBsua4dnKM3p31vjsufFoREJIE9LAwqSu
-XmD+tqYF/LTdB1kC1FkYmGP1pWPgkAx9XbIGevOF6uvUA65ehD5f/xXtabz5OTZy
-dc93Uk3zyZAsuT3lySNTPx8kmCFcB5kpvcY67Oduhjprl3RjM71oGDHweI12v/ye
-jl0qhqdNkNwnGjkCAwEAAaNFMEMwHQYDVR0OBBYEFOWdWTCCR1jMrPoIVDaGezq1
-BE3wMBIGA1UdEwEB/wQIMAYBAf8CAQMwDgYDVR0PAQH/BAQDAgEGMA0GCSqGSIb3
-DQEBBQUAA4IBAQCFDF2O5G9RaEIFoN27TyclhAO992T9Ldcw46QQF+vaKSm2eT92
-9hkTI7gQCvlYpNRhcL0EYWoSihfVCr3FvDB81ukMJY2GQE/szKN+OMY3EU/t3Wgx
-jkzSswF07r51XgdIGn9w/xZchMB5hbgF/X++ZRGjD8ACtPhSNzkE1akxehi/oCr0
-Epn3o0WC4zxe9Z2etciefC7IpJ5OCBRLbf1wbWsaY71k5h+3zvDyny67G7fyUIhz
-ksLi4xaNmjICq44Y3ekQEe5+NauQrz4wlHrQMz2nZQ/1/I6eYs9HRCwBXbsdtTLS
-R9I4LtD+gdwyah617jzV/OeBHRnDJELqYzmp
-----END CERTIFICATE-----
 ### Buypass AS-983163327
 === /C=NO/O=Buypass AS-983163327/CN=Buypass Class 2 Root CA
@@ -1728,61 +1685,6 @@ v64fG9PiO/yzcnMcmyiQiRM9HcEARwmWmjgb3bHPDcK0RPOWlc4yOo80nOAXx17O
 rg3bhzjlP1v9mxnhMUF6cKojawHhRUzNlM47ni3niAIi9G7oyOzWPPO5std3eqx7
 -----END CERTIFICATE-----
-### Comodo CA Limited
-=== /C=GB/ST=Greater Manchester/L=Salford/O=Comodo CA Limited/CN=AAA Certificate Services
-Certificate:
-    Data:
-        Version: 3 (0x2)
-        Serial Number: 1 (0x1)
-    Signature Algorithm: sha1WithRSAEncryption
-        Validity
-            Not Before: Jan  1 00:00:00 2004 GMT
-            Not After : Dec 31 23:59:59 2028 GMT
-        Subject: C=GB, ST=Greater Manchester, L=Salford, O=Comodo CA Limited, CN=AAA Certificate Services
-        X509v3 extensions:
-            X509v3 Subject Key Identifier:
-                A0:11:0A:23:3E:96:F1:07:EC:E2:AF:29:EF:82:A5:7F:D0:30:A4:B4
-            X509v3 Key Usage: critical
-                Certificate Sign, CRL Sign
-            X509v3 Basic Constraints: critical
-                CA:TRUE
-            X509v3 CRL Distribution Points:
-                Full Name:
-                  URI:http://crl.comodoca.com/AAACertificateServices.crl
-                Full Name:
-                  URI:http://crl.comodo.net/AAACertificateServices.crl
-SHA1 Fingerprint=D1:EB:23:A4:6D:17:D6:8F:D9:25:64:C2:F1:F1:60:17:64:D8:E3:49
-SHA256 Fingerprint=D7:A7:A0:FB:5D:7E:27:31:D7:71:E9:48:4E:BC:DE:F7:1D:5F:0C:3E:0A:29:48:78:2B:C8:3E:E0:EA:69:9E:F4
-----BEGIN CERTIFICATE-----
-MIIEMjCCAxqgAwIBAgIBATANBgkqhkiG9w0BAQUFADB7MQswCQYDVQQGEwJHQjEb
-MBkGA1UECAwSR3JlYXRlciBNYW5jaGVzdGVyMRAwDgYDVQQHDAdTYWxmb3JkMRow
-GAYDVQQKDBFDb21vZG8gQ0EgTGltaXRlZDEhMB8GA1UEAwwYQUFBIENlcnRpZmlj
-YXRlIFNlcnZpY2VzMB4XDTA0MDEwMTAwMDAwMFoXDTI4MTIzMTIzNTk1OVowezEL
-MAkGA1UEBhMCR0IxGzAZBgNVBAgMEkdyZWF0ZXIgTWFuY2hlc3RlcjEQMA4GA1UE
-BwwHU2FsZm9yZDEaMBgGA1UECgwRQ29tb2RvIENBIExpbWl0ZWQxITAfBgNVBAMM
-GEFBQSBDZXJ0aWZpY2F0ZSBTZXJ2aWNlczCCASIwDQYJKoZIhvcNAQEBBQADggEP
-ADCCAQoCggEBAL5AnfRu4ep2hxxNRUSOvkbIgwadwSr+GB+O5AL686tdUIoWMQua
-BtDFcCLNSS1UY8y2bmhGC1Pqy0wkwLxyTurxFa70VJoSCsN6sjNg4tqJVfMiWPPe
-3M/vg4aijJRPn2jymJBGhCfHdr/jzDUsi14HZGWCwEiwqJH5YZ92IFCokcdmtet4
-YgNW8IoaE+oxox6gmf049vYnMlhvB/VruPsUK6+3qszWY19zjNoFmag4qMsXeDZR
-rOme9Hg6jc8P2ULimAyrL58OAd7vn5lJ8S3frHRNG5i1R8XlKdH5kBjHYpy+g8cm
-ez6KJcfA3Z3mNWgQIJ2P2N7Sw4ScDV7oL8kCAwEAAaOBwDCBvTAdBgNVHQ4EFgQU
-oBEKIz6W8Qfs4q8p74Klf9AwpLQwDgYDVR0PAQH/BAQDAgEGMA8GA1UdEwEB/wQF
-MAMBAf8wewYDVR0fBHQwcjA4oDagNIYyaHR0cDovL2NybC5jb21vZG9jYS5jb20v
-QUFBQ2VydGlmaWNhdGVTZXJ2aWNlcy5jcmwwNqA0oDKGMGh0dHA6Ly9jcmwuY29t
-b2RvLm5ldC9BQUFDZXJ0aWZpY2F0ZVNlcnZpY2VzLmNybDANBgkqhkiG9w0BAQUF
-AAOCAQEACFb8AvCb6P+k+tZ7xkSAzk/ExfYAWMymtrwUSWgEdujm7l3sAg9g1o1Q
-GE8mTgHj5rCl7r+8dFRBv/38ErjHT1r0iWAFf2C3BUrz9vHCv8S5dIa2LX1rzNLz
-Rt0vxuBqw8M0Ayx9lt1awg6nCpnBBYurDC/zXDrPbDdVCYfeU0BsWO/8tqtlbgT2
-G9w84FoVxp7Z8VlIMCFlA2zs6SFz7JsDoeA3raAVGI/6ugLOpyypEBMs1OUIJqsi
-l2D4kF501KKaU73yqWjgom7C12yxow+ev+to51byrvLjKzg6CYG1a4XXvi3tPxq3
-smPi9WIsgtRqAEFQ8TmDn5XpNpaYbg==
-----END CERTIFICATE-----
 ### Cybertrust Japan Co., Ltd.
 === /C=JP/O=Cybertrust Japan Co., Ltd./CN=SecureSign Root CA12
@@ -3070,53 +2972,6 @@ eu6FSqdQgPCnXEqULl8FmTxSQeDNtGPPAUO6nIPcj2A781q0tHuu2guQOHXvgR1m
 0vdXcDazv/wor3ElhVsT/h5/WrQ8
 -----END CERTIFICATE-----
-### Entrust.net
-=== /O=Entrust.net/OU=www.entrust.net/CPS_2048 incorp. by ref. (limits liab.)/OU=(c) 1999 Entrust.net Limited/CN=Entrust.net Certification Authority (2048)
-Certificate:
-    Data:
-        Version: 3 (0x2)
-        Serial Number: 946069240 (0x3863def8)
-    Signature Algorithm: sha1WithRSAEncryption
-        Validity
-            Not Before: Dec 24 17:50:51 1999 GMT
-            Not After : Jul 24 14:15:12 2029 GMT
-        Subject: O=Entrust.net, OU=www.entrust.net/CPS_2048 incorp. by ref. (limits liab.), OU=(c) 1999 Entrust.net Limited, CN=Entrust.net Certification Authority (2048)
-        X509v3 extensions:
-            X509v3 Key Usage: critical
-                Certificate Sign, CRL Sign
-            X509v3 Basic Constraints: critical
-                CA:TRUE
-            X509v3 Subject Key Identifier:
-                55:E4:81:D1:11:80:BE:D8:89:B9:08:A3:31:F9:A1:24:09:16:B9:70
-SHA1 Fingerprint=50:30:06:09:1D:97:D4:F5:AE:39:F7:CB:E7:92:7D:7D:65:2D:34:31
-SHA256 Fingerprint=6D:C4:71:72:E0:1C:BC:B0:BF:62:58:0D:89:5F:E2:B8:AC:9A:D4:F8:73:80:1E:0C:10:B9:C8:37:D2:1E:B1:77
-----BEGIN CERTIFICATE-----
-MIIEKjCCAxKgAwIBAgIEOGPe+DANBgkqhkiG9w0BAQUFADCBtDEUMBIGA1UEChML
-RW50cnVzdC5uZXQxQDA+BgNVBAsUN3d3dy5lbnRydXN0Lm5ldC9DUFNfMjA0OCBp
-bmNvcnAuIGJ5IHJlZi4gKGxpbWl0cyBsaWFiLikxJTAjBgNVBAsTHChjKSAxOTk5
-IEVudHJ1c3QubmV0IExpbWl0ZWQxMzAxBgNVBAMTKkVudHJ1c3QubmV0IENlcnRp
-ZmljYXRpb24gQXV0aG9yaXR5ICgyMDQ4KTAeFw05OTEyMjQxNzUwNTFaFw0yOTA3
-MjQxNDE1MTJaMIG0MRQwEgYDVQQKEwtFbnRydXN0Lm5ldDFAMD4GA1UECxQ3d3d3
-LmVudHJ1c3QubmV0L0NQU18yMDQ4IGluY29ycC4gYnkgcmVmLiAobGltaXRzIGxp
-YWIuKTElMCMGA1UECxMcKGMpIDE5OTkgRW50cnVzdC5uZXQgTGltaXRlZDEzMDEG
-A1UEAxMqRW50cnVzdC5uZXQgQ2VydGlmaWNhdGlvbiBBdXRob3JpdHkgKDIwNDgp
-MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEArU1LqRKGsuqjIAcVFmQq
-K0vRvwtKTY7tgHalZ7d4QMBzQshowNtTK91euHaYNZOLGp18EzoOH1u3Hs/lJBQe
-sYGpjX24zGtLA/ECDNyrpUAkAH90lKGdCCmziAv1h3edVc3kw37XamSrhRSGlVuX
-MlBvPci6Zgzj/L24ScF2iUkZ/cCovYmjZy/Gn7xxGWC4LeksyZB2ZnuU4q941mVT
-XTzWnLLPKQP5L6RQstRIzgUyVYr9smRMDuSYB3Xbf9+5CFVghTAp+XtIpGmG4zU/
-HoZdenoVve8AjhUiVBcAkCaTvA5JaJG/+EfTnZVCwQ5N328mz8MYIWJmQ3DW1cAH
-4QIDAQABo0IwQDAOBgNVHQ8BAf8EBAMCAQYwDwYDVR0TAQH/BAUwAwEB/zAdBgNV
-HQ4EFgQUVeSB0RGAvtiJuQijMfmhJAkWuXAwDQYJKoZIhvcNAQEFBQADggEBADub
-j1abMOdTmXx6eadNl9cZlZD7Bh/KM3xGY4+WZiT6QBshJ8rmcnPyT/4xmf3IDExo
-U8aAghOY+rat2l098c5u9hURlIIM7j+VrxGrD9cv3h8Dj1csHsm7mhpElesYT6Yf
-zX1XEC+bBAlahLVu2B064dae0Wx5XnkcFMXj0EyTO2U87d89vqbllRrDtRnDvV5b
-u/8j72gZyxKTJ1wDLW8w0B62GqzeWvfRqqgnpv55gcR5mTNXuhKwqeBCbJPKVt7+
-bYQLCIt+jerXmCHG8+c8eS9enNFMFY3h7CI3zJpDC5fcgJCNs2ebb0gIFVbPv/Er
-fF6adulZkMV8gzURZVE=
-----END CERTIFICATE-----
 ### FNMT-RCM
 === /C=ES/O=FNMT-RCM/OU=AC RAIZ FNMT-RCM
@@ -3559,47 +3414,6 @@ u+YfjyW6hY0XHgL+XVAEV8/+LbzvXMAaq7afJMbfc2hIkCwU9D9SGuTSyxTDYWnP
 N3ec592kD3ZDZopD8p/7DEJ4Y9HiD2971KE9dJeFt0g5QdYg/NA6s/rob8SKunE3
 vouXsXgxT7PntgMTzlSdriVZzH81Xwj3QEUxeCp6
 -----END CERTIFICATE-----
-=== /C=BE/O=GlobalSign nv-sa/OU=Root CA/CN=GlobalSign Root CA
-Certificate:
-    Data:
-        Version: 3 (0x2)
-        Serial Number:
-            04:00:00:00:00:01:15:4b:5a:c3:94
-    Signature Algorithm: sha1WithRSAEncryption
-        Validity
-            Not Before: Sep  1 12:00:00 1998 GMT
-            Not After : Jan 28 12:00:00 2028 GMT
-        Subject: C=BE, O=GlobalSign nv-sa, OU=Root CA, CN=GlobalSign Root CA
-        X509v3 extensions:
-            X509v3 Key Usage: critical
-                Certificate Sign, CRL Sign
-            X509v3 Basic Constraints: critical
-                CA:TRUE
-            X509v3 Subject Key Identifier:
-                60:7B:66:1A:45:0D:97:CA:89:50:2F:7D:04:CD:34:A8:FF:FC:FD:4B
-SHA1 Fingerprint=B1:BC:96:8B:D4:F4:9D:62:2A:A8:9A:81:F2:15:01:52:A4:1D:82:9C
-SHA256 Fingerprint=EB:D4:10:40:E4:BB:3E:C7:42:C9:E3:81:D3:1E:F2:A4:1A:48:B6:68:5C:96:E7:CE:F3:C1:DF:6C:D4:33:1C:99
-----BEGIN CERTIFICATE-----
-MIIDdTCCAl2gAwIBAgILBAAAAAABFUtaw5QwDQYJKoZIhvcNAQEFBQAwVzELMAkG
-A1UEBhMCQkUxGTAXBgNVBAoTEEdsb2JhbFNpZ24gbnYtc2ExEDAOBgNVBAsTB1Jv
-b3QgQ0ExGzAZBgNVBAMTEkdsb2JhbFNpZ24gUm9vdCBDQTAeFw05ODA5MDExMjAw
-MDBaFw0yODAxMjgxMjAwMDBaMFcxCzAJBgNVBAYTAkJFMRkwFwYDVQQKExBHbG9i
-YWxTaWduIG52LXNhMRAwDgYDVQQLEwdSb290IENBMRswGQYDVQQDExJHbG9iYWxT
-aWduIFJvb3QgQ0EwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQDaDuaZ
-jc6j40+Kfvvxi4Mla+pIH/EqsLmVEQS98GPR4mdmzxzdzxtIK+6NiY6arymAZavp
-xy0Sy6scTHAHoT0KMM0VjU/43dSMUBUc71DuxC73/OlS8pF94G3VNTCOXkNz8kHp
-1Wrjsok6Vjk4bwY8iGlbKk3Fp1S4bInMm/k8yuX9ifUSPJJ4ltbcdG6TRGHRjcdG
-snUOhugZitVtbNV4FpWi6cgKOOvyJBNPc1STE4U6G7weNLWLBYy5d4ux2x8gkasJ
-U26Qzns3dLlwR5EiUWMWea6xrkEmCMgZK9FGqkjWZCrXgzT/LCrBbBlDSgeF59N8
-9iFo7+ryUp9/k5DPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNVHRMBAf8E
-BTADAQH/MB0GA1UdDgQWBBRge2YaRQ2XyolQL30EzTSo//z9SzANBgkqhkiG9w0B
-AQUFAAOCAQEA1nPnfE920I2/7LqivjTFKDK1fPxsnCwrvQmeU79rXqoRSLblCKOz
-yj1hTdNGCbM+w6DjY1Ub8rrvrTnhQ7k4o+YviiY776BQVvnGCv04zcQLcFGUl5gE
-38NflNUVyRRBnMRddWQVDf9VMOyGj/8N7yy5Y0b2qvzfvGn9LhJIZJrglfCm7ymP
-AbEVtQwdpf5pLGkkeB6zpxxxYu7KyJesF12KwvhHhm4qxFYxldBniYUr+WymXUad
-DKqC5JlR3XC321Y9YeRq4VzW9v493kHMB65jUr9TU/Qr6cf9tveCX4XSQRjbgbME
-HMUfpIBvFSDJ3gyICh3WZlXi/EjJKSZp4A==
-----END CERTIFICATE-----
 ### GoDaddy.com, Inc.
@@ -5481,52 +5295,6 @@ CPyI6a6Lf+Ew9Dd+/cYy2i2eRDAwbO4H3tI0/NL/QPZL9GZGBlSm8jIKYyYwa5vR
 ### Starfield Technologies, Inc.
-=== /C=US/O=Starfield Technologies, Inc./OU=Starfield Class 2 Certification Authority
-Certificate:
-    Data:
-        Version: 3 (0x2)
-        Serial Number: 0 (0x0)
-    Signature Algorithm: sha1WithRSAEncryption
-        Validity
-            Not Before: Jun 29 17:39:16 2004 GMT
-            Not After : Jun 29 17:39:16 2034 GMT
-        Subject: C=US, O=Starfield Technologies, Inc., OU=Starfield Class 2 Certification Authority
-        X509v3 extensions:
-            X509v3 Subject Key Identifier:
-                BF:5F:B7:D1:CE:DD:1F:86:F4:5B:55:AC:DC:D7:10:C2:0E:A9:88:E7
-            X509v3 Authority Key Identifier:
-                keyid:BF:5F:B7:D1:CE:DD:1F:86:F4:5B:55:AC:DC:D7:10:C2:0E:A9:88:E7
-                DirName:/C=US/O=Starfield Technologies, Inc./OU=Starfield Class 2 Certification Authority
-                serial:00
-            X509v3 Basic Constraints:
-                CA:TRUE
-SHA1 Fingerprint=AD:7E:1C:28:B0:64:EF:8F:60:03:40:20:14:C3:D0:E3:37:0E:B5:8A
-SHA256 Fingerprint=14:65:FA:20:53:97:B8:76:FA:A6:F0:A9:95:8E:55:90:E4:0F:CC:7F:AA:4F:B7:C2:C8:67:75:21:FB:5F:B6:58
-----BEGIN CERTIFICATE-----
-MIIEDzCCAvegAwIBAgIBADANBgkqhkiG9w0BAQUFADBoMQswCQYDVQQGEwJVUzEl
-MCMGA1UEChMcU3RhcmZpZWxkIFRlY2hub2xvZ2llcywgSW5jLjEyMDAGA1UECxMp
-U3RhcmZpZWxkIENsYXNzIDIgQ2VydGlmaWNhdGlvbiBBdXRob3JpdHkwHhcNMDQw
-NjI5MTczOTE2WhcNMzQwNjI5MTczOTE2WjBoMQswCQYDVQQGEwJVUzElMCMGA1UE
-ChMcU3RhcmZpZWxkIFRlY2hub2xvZ2llcywgSW5jLjEyMDAGA1UECxMpU3RhcmZp
-ZWxkIENsYXNzIDIgQ2VydGlmaWNhdGlvbiBBdXRob3JpdHkwggEgMA0GCSqGSIb3
-DQEBAQUAA4IBDQAwggEIAoIBAQC3Msj+6XGmBIWtDBFk385N78gDGIc/oav7PKaf
-8MOh2tTYbitTkPskpD6E8J7oX+zlJ0T1KKY/e97gKvDIr1MvnsoFAZMej2YcOadN
-+lq2cwQlZut3f+dZxkqZJRRU6ybH838Z1TBwj6+wRir/resp7defqgSHo9T5iaU0
-X9tDkYI22WY8sbi5gv2cOj4QyDvvBmVmepsZGD3/cVE8MC5fvj13c7JdBmzDI1aa
-K4UmkhynArPkPw2vCHmCuDY96pzTNbO8acr1zJ3o/WSNF4Azbl5KXZnJHoe0nRrA
-1W4TNSNe35tfPe/W93bC6j67eA0cQmdrBNj41tpvi/JEoAGrAgEDo4HFMIHCMB0G
-A1UdDgQWBBS/X7fRzt0fhvRbVazc1xDCDqmI5zCBkgYDVR0jBIGKMIGHgBS/X7fR
-zt0fhvRbVazc1xDCDqmI56FspGowaDELMAkGA1UEBhMCVVMxJTAjBgNVBAoTHFN0
-YXJmaWVsZCBUZWNobm9sb2dpZXMsIEluYy4xMjAwBgNVBAsTKVN0YXJmaWVsZCBD
-bGFzcyAyIENlcnRpZmljYXRpb24gQXV0aG9yaXR5ggEAMAwGA1UdEwQFMAMBAf8w
-DQYJKoZIhvcNAQEFBQADggEBAAWdP4id0ckaVaGsafPzWdqbAYcaT1epoXkJKtv3
-L7IezMdeatiDh6GX70k1PncGQVhiv45YuApnP+yz3SFmH8lU+nLMPUxA2IGvd56D
-eruix/U0F47ZEUD0/CwqTRV/p2JdLiXTAAsgGh1o+Re49L2L7ShZ3U0WixeDyLJl
-xy16paq8U4Zt3VekyvggQQto8PT7dL5WXXp59fkdheMtlb71cZBDzI0fmgAKhynp
-VSJYACPq4xJDKVtHCN2MQWplBqjlIapBtJUhlbl90TSrE9atvNziPTnNvT51cKEY
-WQPJIrSPnNVeKtelttQKbfi3QBFGmh95DmK/D5fs4C8fF5Q=
-----END CERTIFICATE-----
 === /C=US/ST=Arizona/L=Scottsdale/O=Starfield Technologies, Inc./CN=Starfield Root Certificate Authority - G2
 Certificate:
    Data:
@@ -6020,55 +5788,6 @@ HL/EVlP6Y2XQ8xwOFvVrhlhNGNTkDY6lnVuR3HYkUD/GKvvZt5y11ubQ2egZixVx
 SK236thZiNSQvxaz2emsWWFUyBy6ysHK4bkgTI86k4mloMy/0/Z1pHWWbVY=
 -----END CERTIFICATE-----
-### The Go Daddy Group, Inc.
-=== /C=US/O=The Go Daddy Group, Inc./OU=Go Daddy Class 2 Certification Authority
-Certificate:
-    Data:
-        Version: 3 (0x2)
-        Serial Number: 0 (0x0)
-    Signature Algorithm: sha1WithRSAEncryption
-        Validity
-            Not Before: Jun 29 17:06:20 2004 GMT
-            Not After : Jun 29 17:06:20 2034 GMT
-        Subject: C=US, O=The Go Daddy Group, Inc., OU=Go Daddy Class 2 Certification Authority
-        X509v3 extensions:
-            X509v3 Subject Key Identifier:
-                D2:C4:B0:D2:91:D4:4C:11:71:B3:61:CB:3D:A1:FE:DD:A8:6A:D4:E3
-            X509v3 Authority Key Identifier:
-                keyid:D2:C4:B0:D2:91:D4:4C:11:71:B3:61:CB:3D:A1:FE:DD:A8:6A:D4:E3
-                DirName:/C=US/O=The Go Daddy Group, Inc./OU=Go Daddy Class 2 Certification Authority
-                serial:00
-            X509v3 Basic Constraints:
-                CA:TRUE
-SHA1 Fingerprint=27:96:BA:E6:3F:18:01:E2:77:26:1B:A0:D7:77:70:02:8F:20:EE:E4
-SHA256 Fingerprint=C3:84:6B:F2:4B:9E:93:CA:64:27:4C:0E:C6:7C:1E:CC:5E:02:4F:FC:AC:D2:D7:40:19:35:0E:81:FE:54:6A:E4
-----BEGIN CERTIFICATE-----
-MIIEADCCAuigAwIBAgIBADANBgkqhkiG9w0BAQUFADBjMQswCQYDVQQGEwJVUzEh
-MB8GA1UEChMYVGhlIEdvIERhZGR5IEdyb3VwLCBJbmMuMTEwLwYDVQQLEyhHbyBE
-YWRkeSBDbGFzcyAyIENlcnRpZmljYXRpb24gQXV0aG9yaXR5MB4XDTA0MDYyOTE3
-MDYyMFoXDTM0MDYyOTE3MDYyMFowYzELMAkGA1UEBhMCVVMxITAfBgNVBAoTGFRo
-ZSBHbyBEYWRkeSBHcm91cCwgSW5jLjExMC8GA1UECxMoR28gRGFkZHkgQ2xhc3Mg
-MiBDZXJ0aWZpY2F0aW9uIEF1dGhvcml0eTCCASAwDQYJKoZIhvcNAQEBBQADggEN
-ADCCAQgCggEBAN6d1+pXGEmhW+vXX0iG6r7d/+TvZxz0ZWizV3GgXne77ZtJ6XCA
-PVYYYwhv2vLM0D9/AlQiVBDYsoHUwHU9S3/Hd8M+eKsaA7Ugay9qK7HFiH7Eux6w
-wdhFJ2+qN1j3hybX2C32qRe3H3I2TqYXP2WYktsqbl2i/ojgC95/5Y0V4evLOtXi
-EqITLdiOr18SPaAIBQi2XKVlOARFmR6jYGB0xUGlcmIbYsUfb18aQr4CUWWoriMY
-avx4A6lNf4DD+qta/KFApMoZFv6yyO9ecw3ud72a9nmYvLEHZ6IVDd2gWMZEewo+
-YihfukEHU1jPEX44dMX4/7VpkI+EdOqXG68CAQOjgcAwgb0wHQYDVR0OBBYEFNLE
-sNKR1EwRcbNhyz2h/t2oatTjMIGNBgNVHSMEgYUwgYKAFNLEsNKR1EwRcbNhyz2h
-/t2oatTjoWekZTBjMQswCQYDVQQGEwJVUzEhMB8GA1UEChMYVGhlIEdvIERhZGR5
-IEdyb3VwLCBJbmMuMTEwLwYDVQQLEyhHbyBEYWRkeSBDbGFzcyAyIENlcnRpZmlj
-YXRpb24gQXV0aG9yaXR5ggEAMAwGA1UdEwQFMAMBAf8wDQYJKoZIhvcNAQEFBQAD
-ggEBADJL87LKPpH8EsahB4yOd6AzBhRckB4Y9wimPQoZ+YeAEW5p5JYXMP80kWNy
-OO7MHAGjHZQopDH2esRU1/blMVgDoszOYtuURXO1v0XJJLXVggKtI3lpjbi2Tc7P
-TMozI+gciKqdi0FuFskg5YmezTvacPd+mSYgFFQlq25zheabIZ0KbIIOqPjCDPoQ
-HmyW74cNxA9hi63ugyuV+I6ShHI56yDqg+2DzZduCLzrTia2cyvk0/ZM/iZx4mER
-dEr/VxqHD3VILs9RaRegAhJhldXRQLIQTO7ErBBDpqWeCtWVYpoNz4iCxTIM5Cuf
-ReYNnyicsbkqWletNw+vHX/bvZ8=
-----END CERTIFICATE-----
 ### The USERTRUST Network
 === /C=US/ST=New Jersey/L=Jersey City/O=The USERTRUST Network/CN=USERTrust ECC Certification Authority
@@ -6669,63 +6388,6 @@ rYy0UGYwEAYJKwYBBAGCNxUBBAMCAQAwCgYIKoZIzj0EAwMDaAAwZQIwJsdpW9zV
 Mgj/mkkCtojeFK9dbJlxjRo/i9fgojaGHAeCOnZT/cKi7e97sIBPWA9LUzm9
 -----END CERTIFICATE-----
-### XRamp Security Services Inc
-=== /C=US/OU=www.xrampsecurity.com/O=XRamp Security Services Inc/CN=XRamp Global Certification Authority
-Certificate:
-    Data:
-        Version: 3 (0x2)
-        Serial Number:
-            50:94:6c:ec:18:ea:d5:9c:4d:d5:97:ef:75:8f:a0:ad
-    Signature Algorithm: sha1WithRSAEncryption
-        Validity
-            Not Before: Nov  1 17:14:04 2004 GMT
-            Not After : Jan  1 05:37:19 2035 GMT
-        Subject: C=US, OU=www.xrampsecurity.com, O=XRamp Security Services Inc, CN=XRamp Global Certification Authority
-        X509v3 extensions:
-            1.3.6.1.4.1.311.20.2:
-                ...C.A
-            X509v3 Key Usage:
-                Digital Signature, Certificate Sign, CRL Sign
-            X509v3 Basic Constraints: critical
-                CA:TRUE
-            X509v3 Subject Key Identifier:
-                C6:4F:A2:3D:06:63:84:09:9C:CE:62:E4:04:AC:8D:5C:B5:E9:B6:1B
-            X509v3 CRL Distribution Points:
-                Full Name:
-                  URI:http://crl.xrampsecurity.com/XGCA.crl
-            1.3.6.1.4.1.311.21.1:
-                ...
-SHA1 Fingerprint=B8:01:86:D1:EB:9C:86:A5:41:04:CF:30:54:F3:4C:52:B7:E5:58:C6
-SHA256 Fingerprint=CE:CD:DC:90:50:99:D8:DA:DF:C5:B1:D2:09:B7:37:CB:E2:C1:8C:FB:2C:10:C0:FF:0B:CF:0D:32:86:FC:1A:A2
-----BEGIN CERTIFICATE-----
-MIIEMDCCAxigAwIBAgIQUJRs7Bjq1ZxN1ZfvdY+grTANBgkqhkiG9w0BAQUFADCB
-gjELMAkGA1UEBhMCVVMxHjAcBgNVBAsTFXd3dy54cmFtcHNlY3VyaXR5LmNvbTEk
-MCIGA1UEChMbWFJhbXAgU2VjdXJpdHkgU2VydmljZXMgSW5jMS0wKwYDVQQDEyRY
-UmFtcCBHbG9iYWwgQ2VydGlmaWNhdGlvbiBBdXRob3JpdHkwHhcNMDQxMTAxMTcx
-NDA0WhcNMzUwMTAxMDUzNzE5WjCBgjELMAkGA1UEBhMCVVMxHjAcBgNVBAsTFXd3
-dy54cmFtcHNlY3VyaXR5LmNvbTEkMCIGA1UEChMbWFJhbXAgU2VjdXJpdHkgU2Vy
-dmljZXMgSW5jMS0wKwYDVQQDEyRYUmFtcCBHbG9iYWwgQ2VydGlmaWNhdGlvbiBB
-dXRob3JpdHkwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQCYJB69FbS6
-38eMpSe2OAtp87ZOqCwuIR1cRN8hXX4jdP5efrRKt6atH67gBhbim1vZZ3RrXYCP
-KZ2GG9mcDZhtdhAoWORlsH9KmHmf4MMxfoArtYzAQDsRhtDLooY2YKTVMIJt2W7Q
-DxIEM5dfT2Fa8OT5kavnHTu86M/0ay00fOJIYRyO82FEzG+gSqmUsE3a56k0enI4
-qEHMPJQRfevIpoy3hsvKMzvZPTeL+3o+hiznc9cKV6xkmxnr9A8ECIqsAxcZZPRa
-JSKNNCyy9mgdEm3Tih4U2sSPpuIjhdV6Db1q4Ons7Be7QhtnqiXtRYMh/MHJfNVi
-PvryxS3T/dRlAgMBAAGjgZ8wgZwwEwYJKwYBBAGCNxQCBAYeBABDAEEwCwYDVR0P
-BAQDAgGGMA8GA1UdEwEB/wQFMAMBAf8wHQYDVR0OBBYEFMZPoj0GY4QJnM5i5ASs
-jVy16bYbMDYGA1UdHwQvMC0wK6ApoCeGJWh0dHA6Ly9jcmwueHJhbXBzZWN1cml0
-eS5jb20vWEdDQS5jcmwwEAYJKwYBBAGCNxUBBAMCAQEwDQYJKoZIhvcNAQEFBQAD
-ggEBAJEVOQMBG2f7Shz5CmBbodpNl2L5JFMn14JkTpAuw0kbK5rc/Kh4ZzXxHfAR
-vbdI4xD2Dd8/0sm2qlWkSLoC295ZLhVbO50WfUfXN+pfTXYSNrsf16GBBEYgoyxt
-qZ4Bfj8pzgCT3/3JknOJiWSe5yvkHJEs0rnOfc5vMZnT5r7SHpDwCRR5XCOrTdLa
-IR9NmXmd4c8nnxCbHIgNsIpkQTG4DmyQJKSbXHGPurt+HBvbaoAPIbzp26a3QPSy
-i6mx5O+aGtA9aZnuqCij4Tyz8LIRnM98QObd50N9otg6tamN8jSZxNQQ4Qb9CYQQ
-O+7ETPTsJ3xCwnR8gooJybQDJbw=
-----END CERTIFICATE-----
 ### certSIGN
 === /C=RO/O=certSIGN/OU=certSIGN ROOT CA
diff --git a/src/lib/libcrypto/hidden/crypto_namespace.h b/src/lib/libcrypto/hidden/crypto_namespace.h
index 741ad08549..43c8718ed0 100644
--- a/src/lib/libcrypto/hidden/crypto_namespace.h
+++ b/src/lib/libcrypto/hidden/crypto_namespace.h
@@ -1,4 +1,4 @@
-/*      $OpenBSD: crypto_namespace.h,v 1.4 2024/07/11 21:31:52 miod Exp $       */
+/*      $OpenBSD: crypto_namespace.h,v 1.5 2025/08/18 16:00:05 tb Exp $ */
 /*
 * Copyright (c) 2016 Philip Guenther <guenther@openbsd.org>
 *
@@ -45,7 +45,11 @@
 # define LCRYPTO_UNUSED(x)
 # define LCRYPTO_USED(x)
 # define LCRYPTO_ALIAS1(pre,x)
+#ifdef _MSC_VER
+# define LCRYPTO_ALIAS(x)
+#else
 # define LCRYPTO_ALIAS(x)       asm("")
+#endif /* _MSC_VER */
 #endif
 #endif  /* _LIBCRYPTO_CRYPTO_NAMESPACE_H_ */
diff --git a/src/lib/libcrypto/hidden/openssl/mlkem.h b/src/lib/libcrypto/hidden/openssl/mlkem.h
index 8cd80eb3af..3807b3fa1e 100644
--- a/src/lib/libcrypto/hidden/openssl/mlkem.h
+++ b/src/lib/libcrypto/hidden/openssl/mlkem.h
@@ -1,6 +1,6 @@
-/* $OpenBSD: mlkem.h,v 1.4 2024/12/20 15:10:31 tb Exp $ */
+/* $OpenBSD: mlkem.h,v 1.5 2025/08/14 15:48:48 beck Exp $ */
 /*
- * Copyright (c) 2024 Bob Beck <beck@obtuse.com>
+ * Copyright (c) 2025 Bob Beck <beck@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
@@ -18,9 +18,6 @@
 #ifndef _LIBCRYPTO_MLKEM_H
 #define _LIBCRYPTO_MLKEM_H
-/* Undo when making public */
-#ifdef LIBRESSL_HAS_MLKEM
 #ifndef _MSC_VER
 #include_next <openssl/mlkem.h>
 #else
@@ -28,22 +25,21 @@
 #endif
 #include "crypto_namespace.h"
-LCRYPTO_USED(MLKEM768_generate_key);
+LCRYPTO_USED(MLKEM_private_key_new);
-LCRYPTO_USED(MLKEM768_public_from_private);
+LCRYPTO_USED(MLKEM_private_key_free);
-LCRYPTO_USED(MLKEM768_encap);
+LCRYPTO_USED(MLKEM_private_key_ciphertext_length);
-LCRYPTO_USED(MLKEM768_decap);
+LCRYPTO_USED(MLKEM_private_key_encoded_length);
-LCRYPTO_USED(MLKEM768_marshal_public_key);
+LCRYPTO_USED(MLKEM_public_key_new);
-LCRYPTO_USED(MLKEM768_parse_public_key);
+LCRYPTO_USED(MLKEM_public_key_free);
-LCRYPTO_USED(MLKEM768_private_key_from_seed);
+LCRYPTO_USED(MLKEM_public_key_ciphertext_length);
-LCRYPTO_USED(MLKEM768_parse_private_key);
+LCRYPTO_USED(MLKEM_public_key_encoded_length);
-LCRYPTO_USED(MLKEM1024_generate_key);
+LCRYPTO_USED(MLKEM_generate_key);
-LCRYPTO_USED(MLKEM1024_public_from_private);
+LCRYPTO_USED(MLKEM_private_key_from_seed);
-LCRYPTO_USED(MLKEM1024_encap);
+LCRYPTO_USED(MLKEM_public_from_private);
-LCRYPTO_USED(MLKEM1024_decap);
+LCRYPTO_USED(MLKEM_encap);
-LCRYPTO_USED(MLKEM1024_marshal_public_key);
+LCRYPTO_USED(MLKEM_decap);
-LCRYPTO_USED(MLKEM1024_parse_public_key);
+LCRYPTO_USED(MLKEM_marshal_public_key);
-LCRYPTO_USED(MLKEM1024_private_key_from_seed);
+LCRYPTO_USED(MLKEM_parse_public_key);
-LCRYPTO_USED(MLKEM1024_parse_private_key);
+LCRYPTO_USED(MLKEM_parse_private_key);
-#endif /* LIBRESSL_HAS_MLKEM */
 #endif /* _LIBCRYPTO_MLKEM_H */
diff --git a/src/lib/libcrypto/mlkem/mlkem.c b/src/lib/libcrypto/mlkem/mlkem.c
new file mode 100644
index 0000000000..bf53e5d77a
--- /dev/null
+++ b/src/lib/libcrypto/mlkem/mlkem.c
@@ -0,0 +1,416 @@
+/*      $OpenBSD: mlkem.c,v 1.2 2025/08/14 16:04:01 beck Exp $ */
+/*
+ * Copyright (c) 2025, Bob Beck <beck@obtuse.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+#include <stdlib.h>
+#include <string.h>
+#include <openssl/mlkem.h>
+#include "mlkem_internal.h"
+/*
+ * Returns 1 if |key| is non-NULL, is still in the
+ * MLKEM_PRIVATE_KEY_UNINITIALIZED state, and has rank RANK768 or RANK1024.
+ * Returns 0 otherwise.
+ */
+static inline int
+private_key_is_new(const MLKEM_private_key *key)
+{
+        if (key == NULL)
+                return 0;
+        if (key->state != MLKEM_PRIVATE_KEY_UNINITIALIZED)
+                return 0;
+        return (key->rank == RANK768 || key->rank == RANK1024);
+}
+/*
+ * Returns 1 if |key| is non-NULL, is in the MLKEM_PRIVATE_KEY_INITIALIZED
+ * state, and has rank RANK768 or RANK1024. Returns 0 otherwise.
+ */
+static inline int
+private_key_is_valid(const MLKEM_private_key *key)
+{
+        if (key == NULL)
+                return 0;
+        if (key->state != MLKEM_PRIVATE_KEY_INITIALIZED)
+                return 0;
+        return (key->rank == RANK768 || key->rank == RANK1024);
+}
+/*
+ * Returns 1 if |key| is non-NULL, is still in the
+ * MLKEM_PUBLIC_KEY_UNINITIALIZED state, and has rank RANK768 or RANK1024.
+ * Returns 0 otherwise.
+ */
+static inline int
+public_key_is_new(const MLKEM_public_key *key)
+{
+        if (key == NULL)
+                return 0;
+        if (key->state != MLKEM_PUBLIC_KEY_UNINITIALIZED)
+                return 0;
+        return (key->rank == RANK768 || key->rank == RANK1024);
+}
+/*
+ * Returns 1 if |key| is non-NULL, is in the MLKEM_PUBLIC_KEY_INITIALIZED
+ * state, and has rank RANK768 or RANK1024. Returns 0 otherwise.
+ */
+static inline int
+public_key_is_valid(const MLKEM_public_key *key)
+{
+        if (key == NULL)
+                return 0;
+        if (key->state != MLKEM_PUBLIC_KEY_INITIALIZED)
+                return 0;
+        return (key->rank == RANK768 || key->rank == RANK1024);
+}
+/*
+ * ML-KEM operations
+ */
+/*
+ * Generates a key pair into |private_key| using caller-supplied |entropy| and
+ * returns the encoded public key in a newly allocated buffer.
+ *
+ * |private_key| must be new (uninitialized, rank 768 or 1024); on success its
+ * state becomes MLKEM_PRIVATE_KEY_INITIALIZED. |out_encoded_public_key| and
+ * |out_encoded_public_key_len| must be non-NULL and |*out_encoded_public_key|
+ * must be NULL on entry. On success they receive the buffer and its length;
+ * this function does not free the buffer once it has been handed out.
+ *
+ * Returns 1 on success and 0 on failure. On failure the output arguments are
+ * not written.
+ *
+ * NOTE(review): |entropy| is presumably MLKEM_SEED_LENGTH bytes, since that is
+ * what MLKEM_generate_key() passes; confirm against mlkem_internal.h.
+ */
+int
+MLKEM_generate_key_external_entropy(MLKEM_private_key *private_key,
+    uint8_t **out_encoded_public_key, size_t *out_encoded_public_key_len,
+    const uint8_t *entropy)
+{
+        uint8_t *k = NULL;
+        size_t k_len = 0;
+        int ret = 0;
+        /* Fail cleanly on missing arguments rather than dereferencing NULL. */
+        if (out_encoded_public_key == NULL ||
+            out_encoded_public_key_len == NULL || entropy == NULL)
+                goto err;
+        /* The output pointer is overwritten below, so it must start empty. */
+        if (*out_encoded_public_key != NULL)
+                goto err;
+        if (!private_key_is_new(private_key))
+                goto err;
+        k_len = MLKEM768_PUBLIC_KEY_BYTES;
+        if (private_key->rank == RANK1024)
+                k_len = MLKEM1024_PUBLIC_KEY_BYTES;
+        if ((k = calloc(1, k_len)) == NULL)
+                goto err;
+        /* private_key_is_new() guarantees rank is one of the two cases. */
+        switch (private_key->rank) {
+        case RANK768:
+                if (!MLKEM768_generate_key_external_entropy(k, private_key,
+                    entropy))
+                        goto err;
+                break;
+        case RANK1024:
+                if (!MLKEM1024_generate_key_external_entropy(k, private_key,
+                    entropy))
+                        goto err;
+                break;
+        }
+        private_key->state = MLKEM_PRIVATE_KEY_INITIALIZED;
+        *out_encoded_public_key = k;
+        *out_encoded_public_key_len = k_len;
+        /* Ownership has moved to the caller; keep freezero() off it. */
+        k = NULL;
+        ret = 1;
+ err:
+        freezero(k, k_len);
+        return ret;
+}
+/*
+ * Generates a key pair into |private_key| using MLKEM_SEED_LENGTH bytes from
+ * arc4random_buf(), and returns the encoded public key in a newly allocated
+ * buffer via |out_encoded_public_key| / |out_encoded_public_key_len|.
+ *
+ * If |out_optional_seed| is non-NULL, the seed used is also handed back in
+ * |*out_optional_seed| with its length in |*out_optional_seed_len|; in that
+ * case |out_optional_seed_len| must be non-NULL and |*out_optional_seed| must
+ * be NULL on entry. If |out_optional_seed| is NULL the seed is zeroed and
+ * freed before returning.
+ *
+ * Returns 1 on success and 0 on failure.
+ */
+int
+MLKEM_generate_key(MLKEM_private_key *private_key,
+    uint8_t **out_encoded_public_key, size_t *out_encoded_public_key_len,
+    uint8_t **out_optional_seed, size_t *out_optional_seed_len)
+{
+        uint8_t *entropy_buf = NULL;
+        int ret = 0;
+        /* Fail cleanly on missing arguments rather than dereferencing NULL. */
+        if (out_encoded_public_key == NULL ||
+            out_encoded_public_key_len == NULL)
+                goto err;
+        if (*out_encoded_public_key != NULL)
+                goto err;
+        /*
+         * Validate the optional seed outputs before any key material is
+         * generated, so a bad argument cannot fault after the key exists.
+         */
+        if (out_optional_seed != NULL &&
+            (out_optional_seed_len == NULL || *out_optional_seed != NULL))
+                goto err;
+        if ((entropy_buf = calloc(1, MLKEM_SEED_LENGTH)) == NULL)
+                goto err;
+        arc4random_buf(entropy_buf, MLKEM_SEED_LENGTH);
+        if (!MLKEM_generate_key_external_entropy(private_key,
+            out_encoded_public_key, out_encoded_public_key_len,
+            entropy_buf))
+                goto err;
+        if (out_optional_seed != NULL) {
+                *out_optional_seed = entropy_buf;
+                *out_optional_seed_len = MLKEM_SEED_LENGTH;
+                /* Handed to the caller; keep freezero() off it. */
+                entropy_buf = NULL;
+        }
+        ret = 1;
+ err:
+        freezero(entropy_buf, MLKEM_SEED_LENGTH);
+        return ret;
+}
+LCRYPTO_ALIAS(MLKEM_generate_key);
+/*
+ * Initializes |private_key| from |seed| by delegating to the rank-specific
+ * MLKEM768/MLKEM1024 routine.
+ *
+ * |private_key| must be new (uninitialized, rank 768 or 1024) and |seed_len|
+ * must equal MLKEM_SEED_LENGTH. On success the key's state becomes
+ * MLKEM_PRIVATE_KEY_INITIALIZED and 1 is returned; otherwise 0 is returned
+ * and the state is left unchanged.
+ *
+ * NOTE(review): the two backends are called with their arguments in a
+ * different order (seed first for 768, key first for 1024). This mirrors the
+ * existing calls; confirm against the prototypes in mlkem_internal.h.
+ */
+int
+MLKEM_private_key_from_seed(MLKEM_private_key *private_key,
+    const uint8_t *seed, size_t seed_len)
+{
+        if (!private_key_is_new(private_key))
+                return 0;
+        if (seed_len != MLKEM_SEED_LENGTH)
+                return 0;
+        if (private_key->rank == RANK768) {
+                if (!MLKEM768_private_key_from_seed(seed,
+                    seed_len, private_key))
+                        return 0;
+        } else if (private_key->rank == RANK1024) {
+                if (!MLKEM1024_private_key_from_seed(private_key,
+                    seed, seed_len))
+                        return 0;
+        }
+        private_key->state = MLKEM_PRIVATE_KEY_INITIALIZED;
+        return 1;
+}
+LCRYPTO_ALIAS(MLKEM_private_key_from_seed);
+/*
+ * Populates |public_key| from |private_key| by delegating to the
+ * rank-specific MLKEM768/MLKEM1024 routine.
+ *
+ * |private_key| must be initialized, |public_key| must be new, and both must
+ * have the same rank. On success |public_key| is marked
+ * MLKEM_PUBLIC_KEY_INITIALIZED and 1 is returned; otherwise 0 is returned
+ * and |public_key| is not touched.
+ */
+int
+MLKEM_public_from_private(const MLKEM_private_key *private_key,
+    MLKEM_public_key *public_key)
+{
+        if (!private_key_is_valid(private_key) ||
+            !public_key_is_new(public_key))
+                return 0;
+        if (private_key->rank != public_key->rank)
+                return 0;
+        if (private_key->rank == RANK768)
+                MLKEM768_public_from_private(private_key, public_key);
+        else if (private_key->rank == RANK1024)
+                MLKEM1024_public_from_private(private_key, public_key);
+        public_key->state = MLKEM_PUBLIC_KEY_INITIALIZED;
+        return 1;
+}
+LCRYPTO_ALIAS(MLKEM_public_from_private);
+/*
+ * Encapsulates to |public_key| using caller-supplied |entropy|, returning the
+ * ciphertext and the shared secret in newly allocated buffers.
+ *
+ * |public_key| must be initialized. All four output pointers must be
+ * non-NULL, and |*out_ciphertext| and |*out_shared_secret| must be NULL on
+ * entry. On success they receive the buffers and the *_len arguments their
+ * lengths (the ciphertext length reported by
+ * MLKEM_public_key_ciphertext_length() and MLKEM_SHARED_SECRET_LENGTH); this
+ * function does not free the buffers once they have been handed out.
+ *
+ * Returns 1 on success and 0 on failure. On failure the output arguments are
+ * not written and any partially built buffers are zeroed and freed.
+ *
+ * NOTE(review): |entropy| is presumably MLKEM_ENCAP_ENTROPY bytes, since that
+ * is what MLKEM_encap() passes; confirm against mlkem_internal.h.
+ */
+int
+MLKEM_encap_external_entropy(const MLKEM_public_key *public_key,
+    const uint8_t *entropy, uint8_t **out_ciphertext,
+    size_t *out_ciphertext_len, uint8_t **out_shared_secret,
+    size_t *out_shared_secret_len)
+{
+        uint8_t *secret = NULL;
+        uint8_t *ciphertext = NULL;
+        size_t ciphertext_len = 0;
+        int ret = 0;
+        /* Fail cleanly on missing arguments rather than dereferencing NULL. */
+        if (entropy == NULL || out_ciphertext == NULL ||
+            out_ciphertext_len == NULL || out_shared_secret == NULL ||
+            out_shared_secret_len == NULL)
+                goto err;
+        /* Both output pointers are overwritten below; they must start empty. */
+        if (*out_ciphertext != NULL)
+                goto err;
+        if (*out_shared_secret != NULL)
+                goto err;
+        if (!public_key_is_valid(public_key))
+                goto err;
+        if ((secret = calloc(1, MLKEM_SHARED_SECRET_LENGTH)) == NULL)
+                goto err;
+        ciphertext_len = MLKEM_public_key_ciphertext_length(public_key);
+        if ((ciphertext = calloc(1, ciphertext_len)) == NULL)
+                goto err;
+        /* public_key_is_valid() guarantees rank is one of the two cases. */
+        switch (public_key->rank) {
+        case RANK768:
+                MLKEM768_encap_external_entropy(ciphertext, secret, public_key,
+                    entropy);
+                break;
+        case RANK1024:
+                MLKEM1024_encap_external_entropy(ciphertext, secret, public_key,
+                    entropy);
+                break;
+        }
+        /* Hand both buffers to the caller and keep freezero() off them. */
+        *out_ciphertext = ciphertext;
+        *out_ciphertext_len = ciphertext_len;
+        ciphertext = NULL;
+        *out_shared_secret = secret;
+        *out_shared_secret_len = MLKEM_SHARED_SECRET_LENGTH;
+        secret = NULL;
+        ret = 1;
+ err:
+        freezero(secret, MLKEM_SHARED_SECRET_LENGTH);
+        freezero(ciphertext, ciphertext_len);
+        return ret;
+}
+/*
+ * Encapsulate to |public_key| using MLKEM_ENCAP_ENTROPY bytes drawn from
+ * arc4random_buf(). All argument checking and output handling is delegated
+ * to MLKEM_encap_external_entropy(), whose result is returned unchanged.
+ */
+int
+MLKEM_encap(const MLKEM_public_key *public_key,
+    uint8_t **out_ciphertext, size_t *out_ciphertext_len,
+    uint8_t **out_shared_secret, size_t *out_shared_secret_len)
+{
+        uint8_t entropy[MLKEM_ENCAP_ENTROPY];
+        int ret;
+        arc4random_buf(entropy, sizeof(entropy));
+        ret = MLKEM_encap_external_entropy(public_key, entropy,
+            out_ciphertext, out_ciphertext_len, out_shared_secret,
+            out_shared_secret_len);
+        return ret;
+}
+LCRYPTO_ALIAS(MLKEM_encap);
+/*
+ * Decapsulate |ciphertext| with |private_key|. |*out_shared_secret| must be
+ * NULL on entry, |private_key| must pass private_key_is_valid() and
+ * |ciphertext_len| must equal MLKEM_private_key_ciphertext_length() for the
+ * key. On success 1 is returned and the caller owns the newly allocated
+ * MLKEM_SHARED_SECRET_LENGTH byte secret stored in |*out_shared_secret|.
+ * On failure 0 is returned and the outputs are not written.
+ */
+int
+MLKEM_decap(const MLKEM_private_key *private_key,
+    const uint8_t *ciphertext, size_t ciphertext_len,
+    uint8_t **out_shared_secret, size_t *out_shared_secret_len)
+{
+        uint8_t *s = NULL;
+        int ret = 0;
+        /* Refuse to overwrite a pointer the caller may still own. */
+        if (*out_shared_secret != NULL)
+                goto err;
+        if (!private_key_is_valid(private_key))
+                goto err;
+        if (ciphertext_len != MLKEM_private_key_ciphertext_length(private_key))
+                goto err;
+        if ((s = calloc(1, MLKEM_SHARED_SECRET_LENGTH)) == NULL)
+                goto err;
+        switch (private_key->rank) {
+        case RANK768:
+                /* The helpers report failure through their return value. */
+                if (!MLKEM768_decap(private_key, ciphertext, ciphertext_len,
+                    s))
+                        goto err;
+                break;
+        case RANK1024:
+                if (!MLKEM1024_decap(private_key, ciphertext, ciphertext_len,
+                    s))
+                        goto err;
+                break;
+        default:
+                /* Do not hand back an all zero secret for an unknown rank. */
+                goto err;
+        }
+        *out_shared_secret = s;
+        *out_shared_secret_len = MLKEM_SHARED_SECRET_LENGTH;
+        s = NULL;
+        ret = 1;
+ err:
+        /* |s| is NULL after a successful hand-off. */
+        freezero(s, MLKEM_SHARED_SECRET_LENGTH);
+        return ret;
+}
+LCRYPTO_ALIAS(MLKEM_decap);
+/*
+ * Serializes |public_key| by dispatching on its rank to the matching helper,
+ * whose result is returned. Returns 0 without calling a helper when |*out| is
+ * not NULL, when public_key_is_valid() rejects the key, or when the rank is
+ * neither RANK768 nor RANK1024.
+ */
+int
+MLKEM_marshal_public_key(const MLKEM_public_key *public_key, uint8_t **out,
+    size_t *out_len)
+{
+        /* Refuse to overwrite a caller pointer or to encode an unusable key. */
+        if (*out != NULL || !public_key_is_valid(public_key))
+                return 0;
+        if (public_key->rank == RANK768)
+                return MLKEM768_marshal_public_key(public_key, out, out_len);
+        if (public_key->rank == RANK1024)
+                return MLKEM1024_marshal_public_key(public_key, out, out_len);
+        return 0;
+}
+LCRYPTO_ALIAS(MLKEM_marshal_public_key);
+/*
+ * Serializes |private_key| by dispatching on its rank to the matching helper.
+ * Not exposed publicly, because the NIST private key format is gigantic and
+ * seeds should be used instead. Used for the NIST tests.
+ */
+int
+MLKEM_marshal_private_key(const MLKEM_private_key *private_key, uint8_t **out,
+    size_t *out_len)
+{
+        /* Refuse to overwrite a caller pointer or to encode an unusable key. */
+        if (*out != NULL || !private_key_is_valid(private_key))
+                return 0;
+        if (private_key->rank == RANK768)
+                return MLKEM768_marshal_private_key(private_key, out, out_len);
+        if (private_key->rank == RANK1024)
+                return MLKEM1024_marshal_private_key(private_key, out, out_len);
+        return 0;
+}
+/*
+ * Parse the |in_len| byte encoding at |in| into |public_key|. The key must
+ * pass public_key_is_new() and |in_len| must equal
+ * MLKEM_public_key_encoded_length() for the key. Returns 1 on success, after
+ * marking the key MLKEM_PUBLIC_KEY_INITIALIZED, and 0 otherwise.
+ */
+int
+MLKEM_parse_public_key(MLKEM_public_key *public_key, const uint8_t *in,
+    size_t in_len)
+{
+        if (!public_key_is_new(public_key))
+                return 0;
+        if (in_len != MLKEM_public_key_encoded_length(public_key))
+                return 0;
+        switch (public_key->rank) {
+        case RANK768:
+                if (!MLKEM768_parse_public_key(in, in_len,
+                    public_key))
+                        return 0;
+                break;
+        case RANK1024:
+                if (!MLKEM1024_parse_public_key(in, in_len,
+                    public_key))
+                        return 0;
+                break;
+        default:
+                /* Nothing was parsed, so do not claim an initialized key. */
+                return 0;
+        }
+        public_key->state = MLKEM_PUBLIC_KEY_INITIALIZED;
+        return 1;
+}
+LCRYPTO_ALIAS(MLKEM_parse_public_key);
+/*
+ * Parse the |in_len| byte encoding at |in| into |private_key|. The key must
+ * pass private_key_is_new() and |in_len| must equal
+ * MLKEM_private_key_encoded_length() for the key. Returns 1 on success, after
+ * marking the key MLKEM_PRIVATE_KEY_INITIALIZED, and 0 otherwise.
+ */
+int
+MLKEM_parse_private_key(MLKEM_private_key *private_key, const uint8_t *in,
+    size_t in_len)
+{
+        if (!private_key_is_new(private_key))
+                return 0;
+        if (in_len != MLKEM_private_key_encoded_length(private_key))
+                return 0;
+        switch (private_key->rank) {
+        case RANK768:
+                if (!MLKEM768_parse_private_key(in, in_len, private_key))
+                        return 0;
+                break;
+        case RANK1024:
+                if (!MLKEM1024_parse_private_key(in, in_len, private_key))
+                        return 0;
+                break;
+        default:
+                /* Nothing was parsed, so do not claim an initialized key. */
+                return 0;
+        }
+        private_key->state = MLKEM_PRIVATE_KEY_INITIALIZED;
+        return 1;
+}
+LCRYPTO_ALIAS(MLKEM_parse_private_key);
diff --git a/src/lib/libcrypto/mlkem/mlkem.h b/src/lib/libcrypto/mlkem/mlkem.h
index a2c5d7fed0..31d4858195 100644
--- a/src/lib/libcrypto/mlkem/mlkem.h
+++ b/src/lib/libcrypto/mlkem/mlkem.h
@@ -1,6 +1,6 @@
-/*      $OpenBSD: mlkem.h,v 1.6 2025/05/19 06:47:40 beck Exp $ */
+/*      $OpenBSD: mlkem.h,v 1.7 2025/08/14 15:48:48 beck Exp $ */
 /*
- * Copyright (c) 2024, Google Inc.
+ * Copyright (c) 2025 Bob Beck <beck@obtuse.com>
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
@@ -26,253 +26,202 @@ extern "C" {
 #endif
 /*
- * ML-KEM-768
+ * ML-KEM constants
- *
- * This implements the Module-Lattice-Based Key-Encapsulation Mechanism from
- * https://csrc.nist.gov/pubs/fips/204/final
 */
-/*
+#define RANK768 3
- * MLKEM768_public_key contains a ML-KEM-768 public key. The contents of this
+#define RANK1024 4
- * object should never leave the address space since the format is unstable.
- */
-struct MLKEM768_public_key {
-        union {
-                uint8_t bytes[512 * (3 + 9) + 32 + 32];
-                uint16_t alignment;
-        } opaque;
-};
-/*
- * MLKEM768_private_key contains a ML-KEM-768 private key. The contents of this
- * object should never leave the address space since the format is unstable.
- */
-struct MLKEM768_private_key {
-        union {
-                uint8_t bytes[512 * (3 + 3 + 9) + 32 + 32 + 32];
-                uint16_t alignment;
-        } opaque;
-};
 /*
- * MLKEM768_PUBLIC_KEY_BYTES is the number of bytes in an encoded ML-KEM768 public
+ * ML-KEM keys
- * key.
 */
-#define MLKEM768_PUBLIC_KEY_BYTES 1184
-/* MLKEM_SEED_BYTES is the number of bytes in an ML-KEM seed. */
+typedef struct MLKEM_private_key_st MLKEM_private_key;
-#define MLKEM_SEED_BYTES 64
+typedef struct MLKEM_public_key_st MLKEM_public_key;
 /*
- *  MLKEM_SHARED_SECRET_BYTES is the number of bytes in the ML-KEM768 shared
+ * MLKEM_private_key_new allocates a new uninitialized ML-KEM private key for
- * secret. Although the round-3 specification has a variable-length output, the
+ * |rank|, which must be RANK768 or RANK1024. It returns a pointer to an
- * final ML-KEM construction is expected to use a fixed 32-byte output. To
+ * allocated structure suitable for holding a generated private key of the
- * simplify the future transition, we apply the same restriction.
+ * corresponding rank on success, NULL is returned on failure. The caller is
+ * responsible for deallocating the resulting key with |MLKEM_private_key_free|.
 */
-#define MLKEM_SHARED_SECRET_BYTES 32
+MLKEM_private_key *MLKEM_private_key_new(int rank);
 /*
- * MLKEM_generate_key generates a random public/private key pair, writes the
+ * MLKEM_private_key_free zeroes and frees all memory for |key| if |key| is
- * encoded public key to |out_encoded_public_key| and sets |out_private_key| to
+ * non NULL. If |key| is NULL it does nothing and returns.
- * the private key. If |optional_out_seed| is not NULL then the seed used to
- * generate the private key is written to it.
 */
-int MLKEM768_generate_key(
+void MLKEM_private_key_free(MLKEM_private_key *key);
-    uint8_t out_encoded_public_key[MLKEM768_PUBLIC_KEY_BYTES],
-    uint8_t optional_out_seed[MLKEM_SEED_BYTES],
-    struct MLKEM768_private_key *out_private_key);
 /*
- * MLKEM768_private_key_from_seed derives a private key from a seed that was
+ * MLKEM_private_key_encoded_length the number of bytes used by the encoded form
- * generated by |MLKEM768_generate_key|. It fails and returns 0 if |seed_len| is
+ * of |key|. This corresponds to the length of the buffer allocated for the
- * incorrect, otherwise it writes |*out_private_key| and returns 1.
+ * encoded private key from |MLKEM_marshal_private_key|. Zero is returned if
+ * |key| is NULL or has an invalid rank.
 */
-int MLKEM768_private_key_from_seed(struct MLKEM768_private_key *out_private_key,
+size_t MLKEM_private_key_encoded_length(const MLKEM_private_key *key);
-    const uint8_t *seed, size_t seed_len);
 /*
- * MLKEM_public_from_private sets |*out_public_key| to the public key that
+ * MLKEM_private_key_ciphertext_length returns the number of bytes of ciphertext
- * corresponds to |private_key|. (This is faster than parsing the output of
+ * required to decrypt a shared secret with |key| using |MLKEM_decap|. Zero is
- * |MLKEM_generate_key| if, for some reason, you need to encapsulate to a key
+ * returned if |key| is NULL or has an invalid rank.
- * that was just generated.)
 */
-void MLKEM768_public_from_private(struct MLKEM768_public_key *out_public_key,
+size_t MLKEM_private_key_ciphertext_length(const MLKEM_private_key *key);
-    const struct MLKEM768_private_key *private_key);
-/* MLKEM768_CIPHERTEXT_BYTES is number of bytes in the ML-KEM768 ciphertext. */
-#define MLKEM768_CIPHERTEXT_BYTES 1088
 /*
- * MLKEM768_encap encrypts a random shared secret for |public_key|, writes the
+ * MLKEM_public_key_new allocates a new uninitialized ML-KEM public key for
- * ciphertext to |out_ciphertext|, and writes the random shared secret to
+ * |rank|, which must be RANK768 or RANK1024. It returns a pointer to an
- * |out_shared_secret|.
+ * allocated structure suitable for holding a generated public key of the
+ * corresponding rank on success, NULL is returned on failure. The caller is
+ * responsible for deallocating the resulting key with |MLKEM_public_key_free|.
 */
-void MLKEM768_encap(uint8_t out_ciphertext[MLKEM768_CIPHERTEXT_BYTES],
+MLKEM_public_key *MLKEM_public_key_new(int rank);
-    uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES],
-    const struct MLKEM768_public_key *public_key);
 /*
- * MLKEM768_decap decrypts a shared secret from |ciphertext| using |private_key|
+ * MLKEM_public_key_free zeros and deallocates all memory for |key| if |key| is
- * and writes it to |out_shared_secret|. If |ciphertext_len| is incorrect it
+ * non NULL. If |key| is NULL it does nothing and returns.
- * returns 0, otherwise it rreturns 1. If |ciphertext| is invalid,
- * |out_shared_secret| is filled with a key that will always be the same for the
- * same |ciphertext| and |private_key|, but which appears to be random unless
- * one has access to |private_key|. These alternatives occur in constant time.
- * Any subsequent symmetric encryption using |out_shared_secret| must use an
- * authenticated encryption scheme in order to discover the decapsulation
- * failure.
 */
-int MLKEM768_decap(uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES],
+void MLKEM_public_key_free(MLKEM_public_key *key);
-    const uint8_t *ciphertext, size_t ciphertext_len,
-    const struct MLKEM768_private_key *private_key);
-/* Serialisation of keys. */
 /*
- * MLKEM768_marshal_public_key serializes |public_key| to |out| in the standard
+ * MLKEM_public_key_encoded_length the number of bytes used by the encoded form
- * format for ML-KEM public keys. It returns one on success or zero on allocation
+ * of |key|. This corresponds to the length of the buffer allocated for the
- * error.
+ * encoded_public_key from |MLKEM_generate_key| or |MLKEM_marshal_public_key|.
+ * Zero is returned if |key| is NULL or has an invalid rank.
 */
-int MLKEM768_marshal_public_key(uint8_t **output, size_t *output_len,
+size_t MLKEM_public_key_encoded_length(const MLKEM_public_key *key);
-    const struct MLKEM768_public_key *public_key);
 /*
- * MLKEM768_parse_public_key parses a public key, in the format generated by
+ * MLKEM_public_key_ciphertext_length returns the number of bytes produced as the
- * |MLKEM_marshal_public_key|, from |in| and writes the result to
+ * ciphertext when encrypting a shared secret with |key| using |MLKEM_encap|. Zero
- * |out_public_key|. It returns one on success or zero on parse error or if
+ * is returned if |key| is NULL or has an invalid rank.
- * there are trailing bytes in |in|.
 */
-int MLKEM768_parse_public_key(struct MLKEM768_public_key *out_public_key,
+size_t MLKEM_public_key_ciphertext_length(const MLKEM_public_key *key);
-    const uint8_t *input, size_t input_len);
 /*
- * MLKEM_parse_private_key parses a private key, in the format generated by
+ * ML-KEM operations
- * |MLKEM_marshal_private_key|, from |in| and writes the result to
- * |out_private_key|. It returns one on success or zero on parse error or if
- * there are trailing bytes in |in|. This formate is verbose and should be avoided.
- * Private keys should be stored as seeds and parsed using |MLKEM768_private_key_from_seed|.
 */
-int MLKEM768_parse_private_key(struct MLKEM768_private_key *out_private_key,
-    const uint8_t *input, size_t input_len);
 /*
- * ML-KEM-1024
+ * MLKEM_generate_key generates a random private/public key pair, initializing
+ * |private_key|. It returns one on success, and zero on failure or error.
+ * |private_key| must be a new uninitialized key. |*out_encoded_public_key| and
+ * |*out_optional_seed|, if provided, must have the value of NULL. On success, a
+ * pointer to the encoded public key of the correct size for |key| is returned
+ * in |out_encoded_public_key|, and the length in bytes of
+ * |*out_encoded_public_key| is returned in |out_encoded_public_key_len|. If
+ * |out_optional_seed| is not NULL, a pointer to the seed used to generate the
+ * private key is returned in |*out_optional_seed| and the length in bytes of
+ * the seed is returned in |*out_optional_seed_len|. The caller is responsible
+ * for freeing the values returned in |out_encoded_public_key|, and
+ * |out_optional_seed|.
 *
- * ML-KEM-1024 also exists. You should prefer ML-KEM-768 where possible.
+ * In the event a private key needs to be saved, the normal best practice is to
- */
+ * save |out_optional_seed| as the private key, along with the ML-KEM rank value.
+ * An MLKEM_private_key of the correct rank can then be constructed using
-/*
+ * |MLKEM_private_key_from_seed|.
- * MLKEM1024_public_key contains an ML-KEM-1024 public key. The contents of this
- * object should never leave the address space since the format is unstable.
- */
-struct MLKEM1024_public_key {
-        union {
-                uint8_t bytes[512 * (4 + 16) + 32 + 32];
-                uint16_t alignment;
-        } opaque;
-};
-/*
- * MLKEM1024_private_key contains a ML-KEM-1024 private key. The contents of
- * this object should never leave the address space since the format is
- * unstable.
- */
-struct MLKEM1024_private_key {
-        union {
-                uint8_t bytes[512 * (4 + 4 + 16) + 32 + 32 + 32];
-                uint16_t alignment;
-        } opaque;
-};
-/*
- * MLKEM1024_PUBLIC_KEY_BYTES is the number of bytes in an encoded ML-KEM-1024
- * public key.
 */
-#define MLKEM1024_PUBLIC_KEY_BYTES 1568
+int MLKEM_generate_key(MLKEM_private_key *private_key,
+    uint8_t **out_encoded_public_key, size_t *out_encoded_public_key_len,
+    uint8_t **out_optional_seed, size_t *out_optional_seed_len);
 /*
- * MLKEM1024_generate_key generates a random public/private key pair, writes the
+ * MLKEM_private_key_from_seed derives a private key from a seed that was
- * encoded public key to |out_encoded_public_key| and sets |out_private_key| to
+ * generated by |MLKEM_generate_key| initializing |private_key|. It returns one
- * the private key. If |optional_out_seed| is not NULL then the seed used to
+ * on success, and zero on failure or error. |private_key| must be a new
- * generate the private key is written to it.
+ * uninitialized key. |seed_len| must be MLKEM_SEED_LENGTH.
+ *
+ * For |private_key| to match the key generated by |MLKEM_generate_key|,
+ * |private_key| must have been created with the same rank as used when generating
+ * the key.
 */
-int MLKEM1024_generate_key(
+int MLKEM_private_key_from_seed(MLKEM_private_key *private_key,
-    uint8_t out_encoded_public_key[MLKEM1024_PUBLIC_KEY_BYTES],
+    const uint8_t *seed, size_t seed_len);
-    uint8_t optional_out_seed[MLKEM_SEED_BYTES],
-    struct MLKEM1024_private_key *out_private_key);
 /*
- * MLKEM1024_private_key_from_seed derives a private key from a seed that was
+ * MLKEM_public_from_private initializes |public_key| with the public key that
- * generated by |MLKEM1024_generate_key|. It fails and returns 0 if |seed_len|
+ * corresponds to |private_key|. It returns one on success and zero on
- * is incorrect, otherwise it writes |*out_private_key| and returns 1.
+ * error. This is faster than parsing the output of |MLKEM_generate_key| if, for
+ * some reason, you need to encapsulate to a key that was just
+ * generated. |public_key| must be a new uninitialized key, of the same rank as
+ * |private_key|, which must be initialized.
 */
-int MLKEM1024_private_key_from_seed(
+int MLKEM_public_from_private(const MLKEM_private_key *private_key,
-    struct MLKEM1024_private_key *out_private_key, const uint8_t *seed,
+    MLKEM_public_key *public_key);
-    size_t seed_len);
 /*
- * MLKEM1024_public_from_private sets |*out_public_key| to the public key that
+ * MLKEM_encap encrypts a random shared secret for an initialized
- * corresponds to |private_key|. (This is faster than parsing the output of
+ * |public_key|. It returns one on success, and zero on failure or error.
- * |MLKEM1024_generate_key| if, for some reason, you need to encapsulate to a
+ * |*out_ciphertext| and |*out_shared_secret| must have the value NULL. On
- * key that was just generated.)
+ * success, a pointer to the ciphertext of the correct size for |public_key| is
+ * returned in |out_ciphertext|, the length in bytes of |*out_ciphertext| is
+ * returned in |*out_ciphertext_len|, a pointer to the random shared secret is
+ * returned in |out_shared_secret|, and the length in bytes of
+ * |*out_shared_secret| is returned in |*out_shared_secret_len|. The caller is
+ * responsible for zeroing and freeing the values returned in |out_ciphertext|
+ * and |out_shared_secret|.
 */
-void MLKEM1024_public_from_private(struct MLKEM1024_public_key *out_public_key,
+int MLKEM_encap(const MLKEM_public_key *public_key,
-    const struct MLKEM1024_private_key *private_key);
+    uint8_t **out_ciphertext, size_t *out_ciphertext_len,
+    uint8_t **out_shared_secret, size_t *out_shared_secret_len);
-/* MLKEM1024_CIPHERTEXT_BYTES is number of bytes in the ML-KEM-1024 ciphertext. */
-#define MLKEM1024_CIPHERTEXT_BYTES 1568
 /*
- * MLKEM1024_encap encrypts a random shared secret for |public_key|, writes the
+ * MLKEM_decap decrypts a shared secret from |ciphertext| using an initialized
- * ciphertext to |out_ciphertext|, and writes the random shared secret to
+ * |private_key|. It returns a pointer to the shared secret in
- * |out_shared_secret|.
+ * |out_shared_secret| and the length in bytes of |*out_shared_secret| in
+ * |*out_shared_secret_len|.
- */
+ *
-void MLKEM1024_encap(uint8_t out_ciphertext[MLKEM1024_CIPHERTEXT_BYTES],
+ * If |ciphertext_len| is incorrect for |private_key|, |*out_shared_secret| is
-    uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES],
+ * not NULL, or memory can not be allocated, it returns zero, otherwise it
-    const struct MLKEM1024_public_key *public_key);
+ * returns one. If |ciphertext| is invalid, a pointer is returned in
+ * |out_shared_secret| pointing to a key that will always be the same for the
-/*
+ * same |ciphertext| and |private_key|, but which appears to be random unless
- * MLKEM1024_decap decrypts a shared secret from |ciphertext| using
+ * one has access to |private_key|. These alternatives occur in constant time.
- * |private_key| and writes it to |out_shared_secret|. If |ciphertext_len| is
+ * Any subsequent symmetric encryption using |out_shared_secret| must use an
- * incorrect it returns 0, otherwise it returns 1. If |ciphertext| is invalid
+ * authenticated encryption scheme in order to discover the decapsulation
- * (but of the correct length), |out_shared_secret| is filled with a key that
+ * failure. The caller is responsible for zeroing and freeing the value returned
- * will always be the same for the same |ciphertext| and |private_key|, but
+ * in |out_shared_secret|.
- * which appears to be random unless one has access to |private_key|. These
- * alternatives occur in constant time. Any subsequent symmetric encryption
- * using |out_shared_secret| must use an authenticated encryption scheme in
- * order to discover the decapsulation failure.
 */
-int MLKEM1024_decap(uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES],
+int MLKEM_decap(const MLKEM_private_key *private_key,
    const uint8_t *ciphertext, size_t ciphertext_len,
-    const struct MLKEM1024_private_key *private_key);
+    uint8_t **out_shared_secret, size_t *out_shared_secret_len);
+/* Serialization of ML-KEM keys. */
 /*
- * Serialisation of ML-KEM-1024 keys.
+ * MLKEM_marshal_public_key serializes an initialized |public_key| in the
- * MLKEM1024_marshal_public_key serializes |public_key| to |out| in the standard
+ * standard format for ML-KEM public keys. It returns one on success or zero on
- * format for ML-KEM-1024 public keys. It returns one on success or zero on
+ * allocation error or failure. |*out| must have the value NULL. On success a
- * allocation error.
+ * pointer is returned in |out| to the encoded public key matching |public_key|,
+ * and the length in bytes of the encoded public key is stored in |*out_len|.
+ * The caller is responsible for freeing the value returned in |out|.
 */
-int MLKEM1024_marshal_public_key(uint8_t **output, size_t *output_len,
+int MLKEM_marshal_public_key(const MLKEM_public_key *public_key, uint8_t **out,
-    const struct MLKEM1024_public_key *public_key);
+    size_t *out_len);
 /*
- * MLKEM1024_parse_public_key parses a public key, in the format generated by
+ * MLKEM_parse_public_key parses a public key, in the format generated by
- * |MLKEM1024_marshal_public_key|, from |in| and writes the result to
+ * |MLKEM_marshal_public_key|, from |in|. It returns one on success or zero on
- * |out_public_key|. It returns one on success or zero on parse error or if
+ * error or failure. |public_key| must be a new uninitialized key. |in_len| must
- * there are trailing bytes in |in|.
+ * be the correct length for the encoded format of |public_key|. On success
+ * |public_key| is initialized to the value parsed from |in|.
 */
-int MLKEM1024_parse_public_key(struct MLKEM1024_public_key *out_public_key,
+int MLKEM_parse_public_key(MLKEM_public_key *public_key, const uint8_t *in,
-    const uint8_t *input, size_t input_len);
+    size_t in_len);
 /*
- * MLKEM1024_parse_private_key parses a private key, in NIST's format for
+ * MLKEM_parse_private_key parses a private key, in the format generated by
- * private keys, from |in| and writes the result to |out_private_key|. It
+ * |MLKEM_marshal_private_key|, from |in|. It returns one on success or zero on
- * returns one on success or zero on parse error or if there are trailing bytes
+ * error or failure. |private_key| must be a new uninitialized key. |in_len|
- * in |in|. This format is verbose and should be avoided. Private keys should be
+ * must be the correct length for the encoded format of |private_key|. On success
- * stored as seeds and parsed using |MLKEM1024_private_key_from_seed|.
+ * |private_key| is initialized to the value parsed from |in|.
+ *
+ * This format is wastefully verbose and should be avoided. Private keys should
+ * be stored as seeds from |MLKEM_generate_key|, and then parsed using
+ * |MLKEM_private_key_from_seed|.
 */
-int MLKEM1024_parse_private_key(struct MLKEM1024_private_key *out_private_key,
+int MLKEM_parse_private_key(MLKEM_private_key *private_key, const uint8_t *in,
-    const uint8_t *input, size_t input_len);
+    size_t in_len);
 #if defined(__cplusplus)
 }
diff --git a/src/lib/libcrypto/mlkem/mlkem1024.c b/src/lib/libcrypto/mlkem/mlkem1024.c
index 26c4716539..8f4f41f8ff 100644
--- a/src/lib/libcrypto/mlkem/mlkem1024.c
+++ b/src/lib/libcrypto/mlkem/mlkem1024.c
@@ -1,7 +1,7 @@
-/* $OpenBSD: mlkem1024.c,v 1.11 2025/05/21 02:18:11 kenjiro Exp $ */
+/* $OpenBSD: mlkem1024.c,v 1.12 2025/08/14 15:48:48 beck Exp $ */
 /*
 * Copyright (c) 2024, Google Inc.
- * Copyright (c) 2024, Bob Beck <beck@obtuse.com>
+ * Copyright (c) 2024, 2025 Bob Beck <beck@obtuse.com>
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
@@ -20,18 +20,14 @@
 #include <stdlib.h>
 #include <string.h>
-#include "bytestring.h"
+#include <openssl/mlkem.h>
-#include "mlkem.h"
+#include "bytestring.h"
 #include "sha3_internal.h"
 #include "mlkem_internal.h"
 #include "constant_time.h"
 #include "crypto_internal.h"
-/* Remove later */
-#undef LCRYPTO_ALIAS
-#define LCRYPTO_ALIAS(A)
 /*
 * See
 * https://csrc.nist.gov/pubs/fips/203/final
@@ -80,7 +76,6 @@ kdf(uint8_t out[MLKEM_SHARED_SECRET_BYTES], const uint8_t failure_secret[32],
 }
 #define DEGREE 256
-#define RANK1024 4
 static const size_t kBarrettMultiplier = 5039;
 static const unsigned kBarrettShift = 24;
@@ -809,9 +804,11 @@ struct public_key {
 CTASSERT(sizeof(struct MLKEM1024_public_key) == sizeof(struct public_key));
 static struct public_key *
-public_key_1024_from_external(const struct MLKEM1024_public_key *external)
+public_key_1024_from_external(const MLKEM_public_key *external)
 {
-        return (struct public_key *)external;
+        if (external->rank != RANK1024)
+                return NULL;
+        return (struct public_key *)external->key_1024;
 }
 struct private_key {
@@ -823,9 +820,11 @@ struct private_key {
 CTASSERT(sizeof(struct MLKEM1024_private_key) == sizeof(struct private_key));
 static struct private_key *
-private_key_1024_from_external(const struct MLKEM1024_private_key *external)
+private_key_1024_from_external(const MLKEM_private_key *external)
 {
-        return (struct private_key *)external;
+        if (external->rank != RANK1024)
+                return NULL;
+        return (struct private_key *)external->key_1024;
 }
 /*
@@ -835,7 +834,7 @@ private_key_1024_from_external(const struct MLKEM1024_private_key *external)
 int
 MLKEM1024_generate_key(uint8_t out_encoded_public_key[MLKEM1024_PUBLIC_KEY_BYTES],
    uint8_t optional_out_seed[MLKEM_SEED_BYTES],
-    struct MLKEM1024_private_key *out_private_key)
+    MLKEM_private_key *out_private_key)
 {
        uint8_t entropy_buf[MLKEM_SEED_BYTES];
        uint8_t *entropy = optional_out_seed != NULL ? optional_out_seed :
@@ -845,10 +844,9 @@ MLKEM1024_generate_key(uint8_t out_encoded_public_key[MLKEM1024_PUBLIC_KEY_BYTES
        return MLKEM1024_generate_key_external_entropy(out_encoded_public_key,
            out_private_key, entropy);
 }
-LCRYPTO_ALIAS(MLKEM1024_generate_key);
 int
-MLKEM1024_private_key_from_seed(struct MLKEM1024_private_key *out_private_key,
+MLKEM1024_private_key_from_seed(MLKEM_private_key *out_private_key,
    const uint8_t *seed, size_t seed_len)
 {
        uint8_t public_key_bytes[MLKEM1024_PUBLIC_KEY_BYTES];
@@ -859,7 +857,6 @@ MLKEM1024_private_key_from_seed(struct MLKEM1024_private_key *out_private_key,
        return MLKEM1024_generate_key_external_entropy(public_key_bytes,
            out_private_key, seed);
 }
-LCRYPTO_ALIAS(MLKEM1024_private_key_from_seed);
 static int
 mlkem_marshal_public_key(CBB *out, const struct public_key *pub)
@@ -872,7 +869,7 @@ mlkem_marshal_public_key(CBB *out, const struct public_key *pub)
 int
 MLKEM1024_generate_key_external_entropy(
    uint8_t out_encoded_public_key[MLKEM1024_PUBLIC_KEY_BYTES],
-    struct MLKEM1024_private_key *out_private_key,
+    MLKEM_private_key *out_private_key,
    const uint8_t entropy[MLKEM_SEED_BYTES])
 {
        struct private_key *priv = private_key_1024_from_external(
@@ -920,8 +917,8 @@ MLKEM1024_generate_key_external_entropy(
 }
 void
-MLKEM1024_public_from_private(struct MLKEM1024_public_key *out_public_key,
+MLKEM1024_public_from_private(const MLKEM_private_key *private_key,
-    const struct MLKEM1024_private_key *private_key)
+    MLKEM_public_key *out_public_key)
 {
        struct public_key *const pub = public_key_1024_from_external(
            out_public_key);
@@ -930,7 +927,6 @@ MLKEM1024_public_from_private(struct MLKEM1024_public_key *out_public_key,
        *pub = priv->pub;
 }
-LCRYPTO_ALIAS(MLKEM1024_public_from_private);
 /*
 * Encrypts a message with given randomness to the ciphertext in |out|. Without
@@ -972,9 +968,9 @@ encrypt_cpa(uint8_t out[MLKEM1024_CIPHERTEXT_BYTES],
 /* Calls MLKEM1024_encap_external_entropy| with random bytes */
 void
-MLKEM1024_encap(uint8_t out_ciphertext[MLKEM1024_CIPHERTEXT_BYTES],
+MLKEM1024_encap(const MLKEM_public_key *public_key,
-    uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES],
+    uint8_t out_ciphertext[MLKEM1024_CIPHERTEXT_BYTES],
-    const struct MLKEM1024_public_key *public_key)
+    uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES])
 {
        uint8_t entropy[MLKEM_ENCAP_ENTROPY];
@@ -982,14 +978,13 @@ MLKEM1024_encap(uint8_t out_ciphertext[MLKEM1024_CIPHERTEXT_BYTES],
        MLKEM1024_encap_external_entropy(out_ciphertext, out_shared_secret,
            public_key, entropy);
 }
-LCRYPTO_ALIAS(MLKEM1024_encap);
 /* See section 6.2 of the spec. */
 void
 MLKEM1024_encap_external_entropy(
    uint8_t out_ciphertext[MLKEM1024_CIPHERTEXT_BYTES],
    uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES],
-    const struct MLKEM1024_public_key *public_key,
+    const MLKEM_public_key *public_key,
    const uint8_t entropy[MLKEM_ENCAP_ENTROPY])
 {
        const struct public_key *pub = public_key_1024_from_external(public_key);
@@ -1025,10 +1020,10 @@ decrypt_cpa(uint8_t out[32], const struct private_key *priv,
 /* See section 6.3 */
 int
-MLKEM1024_decap(uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES],
+MLKEM1024_decap(const MLKEM_private_key *private_key,
    const uint8_t *ciphertext, size_t ciphertext_len,
-    const struct MLKEM1024_private_key *private_key)
+    uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES])
-{
+ {
        const struct private_key *priv = private_key_1024_from_external(
            private_key);
        uint8_t expected_ciphertext[MLKEM1024_CIPHERTEXT_BYTES];
@@ -1059,11 +1054,10 @@ MLKEM1024_decap(uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES],
        return 1;
 }
-LCRYPTO_ALIAS(MLKEM1024_decap);
 int
-MLKEM1024_marshal_public_key(uint8_t **output, size_t *output_len,
+MLKEM1024_marshal_public_key(const MLKEM_public_key *public_key,
-    const struct MLKEM1024_public_key *public_key)
+    uint8_t **output, size_t *output_len)
 {
        int ret = 0;
        CBB cbb;
@@ -1083,7 +1077,6 @@ MLKEM1024_marshal_public_key(uint8_t **output, size_t *output_len,
        return ret;
 }
-LCRYPTO_ALIAS(MLKEM1024_marshal_public_key);
 /*
 * mlkem_parse_public_key_no_hash parses |in| into |pub| but doesn't calculate
@@ -1107,8 +1100,8 @@ mlkem_parse_public_key_no_hash(struct public_key *pub, CBS *in)
 }
 int
-MLKEM1024_parse_public_key(struct MLKEM1024_public_key *public_key,
+MLKEM1024_parse_public_key(const uint8_t *input, size_t input_len,
-    const uint8_t *input, size_t input_len)
+    MLKEM_public_key *public_key)
 {
        struct public_key *pub = public_key_1024_from_external(public_key);
        CBS cbs;
@@ -1123,10 +1116,9 @@ MLKEM1024_parse_public_key(struct MLKEM1024_public_key *public_key,
        return 1;
 }
-LCRYPTO_ALIAS(MLKEM1024_parse_public_key);
 int
-MLKEM1024_marshal_private_key(const struct MLKEM1024_private_key *private_key,
+MLKEM1024_marshal_private_key(const MLKEM_private_key *private_key,
    uint8_t **out_private_key, size_t *out_private_key_len)
 {
        const struct private_key *const priv = private_key_1024_from_external(
@@ -1160,8 +1152,8 @@ MLKEM1024_marshal_private_key(const struct MLKEM1024_private_key *private_key,
 }
 int
-MLKEM1024_parse_private_key(struct MLKEM1024_private_key *out_private_key,
+MLKEM1024_parse_private_key(const uint8_t *input, size_t input_len,
-    const uint8_t *input, size_t input_len)
+    MLKEM_private_key *out_private_key)
 {
        struct private_key *const priv = private_key_1024_from_external(
            out_private_key);
@@ -1189,4 +1181,3 @@ MLKEM1024_parse_private_key(struct MLKEM1024_private_key *out_private_key,
        return 1;
 }
-LCRYPTO_ALIAS(MLKEM1024_parse_private_key);
diff --git a/src/lib/libcrypto/mlkem/mlkem768.c b/src/lib/libcrypto/mlkem/mlkem768.c
index 653b92d8d8..1a44b9413f 100644
--- a/src/lib/libcrypto/mlkem/mlkem768.c
+++ b/src/lib/libcrypto/mlkem/mlkem768.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: mlkem768.c,v 1.12 2025/05/20 00:30:38 beck Exp $ */
+/* $OpenBSD: mlkem768.c,v 1.13 2025/08/14 15:48:48 beck Exp $ */
 /*
 * Copyright (c) 2024, Google Inc.
 * Copyright (c) 2024, Bob Beck <beck@obtuse.com>
@@ -19,19 +19,16 @@
 #include <assert.h>
 #include <stdlib.h>
 #include <string.h>
+#include <stdio.h>
-#include "bytestring.h"
+#include <openssl/mlkem.h>
-#include "mlkem.h"
+#include "bytestring.h"
 #include "sha3_internal.h"
 #include "mlkem_internal.h"
 #include "constant_time.h"
 #include "crypto_internal.h"
-/* Remove later */
-#undef LCRYPTO_ALIAS
-#define LCRYPTO_ALIAS(A)
 /*
 * See
 * https://csrc.nist.gov/pubs/fips/203/final
@@ -80,7 +77,6 @@ kdf(uint8_t out[MLKEM_SHARED_SECRET_BYTES], const uint8_t failure_secret[32],
 }
 #define DEGREE 256
-#define RANK768 3
 static const size_t kBarrettMultiplier = 5039;
 static const unsigned kBarrettShift = 24;
@@ -809,9 +805,11 @@ struct public_key {
 CTASSERT(sizeof(struct MLKEM768_public_key) == sizeof(struct public_key));
+/*
+ * Maps the public opaque key wrapper to the internal ML-KEM-768
+ * representation. Returns the wrapper's |key_768| buffer viewed as a
+ * struct public_key, or NULL when the wrapper's rank is not RANK768.
+ * |external| is dereferenced unconditionally and so must not be NULL.
+ *
+ * NOTE(review): callers visible in this file (for example
+ * MLKEM768_public_from_private) dereference the result without a NULL
+ * check - confirm the rank is always validated before they are reached.
+ */
 static struct public_key *
-public_key_768_from_external(const struct MLKEM768_public_key *external)
+public_key_768_from_external(const MLKEM_public_key *external)
 {
-        return (struct public_key *)external;
+        if (external->rank != RANK768)
+                return NULL;
+        return (struct public_key *)external->key_768;
 }
 struct private_key {
@@ -823,9 +821,11 @@ struct private_key {
 CTASSERT(sizeof(struct MLKEM768_private_key) == sizeof(struct private_key));
+/*
+ * Maps the public opaque key wrapper to the internal ML-KEM-768
+ * representation. Returns the wrapper's |key_768| buffer viewed as a
+ * struct private_key, or NULL when the wrapper's rank is not RANK768.
+ * |external| is dereferenced unconditionally and so must not be NULL.
+ *
+ * NOTE(review): the result may be NULL on a rank mismatch - confirm that
+ * every caller either checks it or is only reached with a RANK768 key.
+ */
 static struct private_key *
-private_key_768_from_external(const struct MLKEM768_private_key *external)
+private_key_768_from_external(const MLKEM_private_key *external)
 {
-        return (struct private_key *)external;
+        if (external->rank != RANK768)
+                return NULL;
+        return (struct private_key *)external->key_768;
 }
 /*
@@ -835,7 +835,7 @@ private_key_768_from_external(const struct MLKEM768_private_key *external)
 int
 MLKEM768_generate_key(uint8_t out_encoded_public_key[MLKEM768_PUBLIC_KEY_BYTES],
    uint8_t optional_out_seed[MLKEM_SEED_BYTES],
-    struct MLKEM768_private_key *out_private_key)
+    MLKEM_private_key *out_private_key)
 {
        uint8_t entropy_buf[MLKEM_SEED_BYTES];
        uint8_t *entropy = optional_out_seed != NULL ? optional_out_seed :
@@ -845,12 +845,12 @@ MLKEM768_generate_key(uint8_t out_encoded_public_key[MLKEM768_PUBLIC_KEY_BYTES],
        return MLKEM768_generate_key_external_entropy(out_encoded_public_key,
            out_private_key, entropy);
 }
-LCRYPTO_ALIAS(MLKEM768_generate_key);
 int
-MLKEM768_private_key_from_seed(struct MLKEM768_private_key *out_private_key,
+MLKEM768_private_key_from_seed(const uint8_t *seed, size_t seed_len,
-    const uint8_t *seed, size_t seed_len)
+    MLKEM_private_key *out_private_key)
 {
+        /* XXX stack */
        uint8_t public_key_bytes[MLKEM768_PUBLIC_KEY_BYTES];
        if (seed_len != MLKEM_SEED_BYTES) {
@@ -859,7 +859,6 @@ MLKEM768_private_key_from_seed(struct MLKEM768_private_key *out_private_key,
        return MLKEM768_generate_key_external_entropy(public_key_bytes,
            out_private_key, seed);
 }
-LCRYPTO_ALIAS(MLKEM768_private_key_from_seed);
 static int
 mlkem_marshal_public_key(CBB *out, const struct public_key *pub)
@@ -872,7 +871,7 @@ mlkem_marshal_public_key(CBB *out, const struct public_key *pub)
 int
 MLKEM768_generate_key_external_entropy(
    uint8_t out_encoded_public_key[MLKEM768_PUBLIC_KEY_BYTES],
-    struct MLKEM768_private_key *out_private_key,
+    MLKEM_private_key *out_private_key,
    const uint8_t entropy[MLKEM_SEED_BYTES])
 {
        struct private_key *priv = private_key_768_from_external(
@@ -920,9 +919,8 @@ MLKEM768_generate_key_external_entropy(
 }
 void
-MLKEM768_public_from_private(struct MLKEM768_public_key *out_public_key,
+MLKEM768_public_from_private(const MLKEM_private_key *private_key,
-    const struct MLKEM768_private_key *private_key)
+    MLKEM_public_key *out_public_key) {
-{
        struct public_key *const pub = public_key_768_from_external(
            out_public_key);
        const struct private_key *const priv = private_key_768_from_external(
@@ -930,7 +928,6 @@ MLKEM768_public_from_private(struct MLKEM768_public_key *out_public_key,
        *pub = priv->pub;
 }
-LCRYPTO_ALIAS(MLKEM768_public_from_private);
 /*
 * Encrypts a message with given randomness to the ciphertext in |out|. Without
@@ -972,9 +969,9 @@ encrypt_cpa(uint8_t out[MLKEM768_CIPHERTEXT_BYTES],
 /* Calls MLKEM768_encap_external_entropy| with random bytes */
 void
-MLKEM768_encap(uint8_t out_ciphertext[MLKEM768_CIPHERTEXT_BYTES],
+MLKEM768_encap(const MLKEM_public_key *public_key,
-    uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES],
+    uint8_t out_ciphertext[MLKEM768_CIPHERTEXT_BYTES],
-    const struct MLKEM768_public_key *public_key)
+    uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES])
 {
        uint8_t entropy[MLKEM_ENCAP_ENTROPY];
@@ -982,14 +979,13 @@ MLKEM768_encap(uint8_t out_ciphertext[MLKEM768_CIPHERTEXT_BYTES],
        MLKEM768_encap_external_entropy(out_ciphertext,
            out_shared_secret, public_key, entropy);
 }
-LCRYPTO_ALIAS(MLKEM768_encap);
 /* See section 6.2 of the spec. */
 void
 MLKEM768_encap_external_entropy(
    uint8_t out_ciphertext[MLKEM768_CIPHERTEXT_BYTES],
    uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES],
-    const struct MLKEM768_public_key *public_key,
+    const MLKEM_public_key *public_key,
    const uint8_t entropy[MLKEM_ENCAP_ENTROPY])
 {
        const struct public_key *pub = public_key_768_from_external(public_key);
@@ -1025,9 +1021,8 @@ decrypt_cpa(uint8_t out[32], const struct private_key *priv,
 /* See section 6.3 */
 int
-MLKEM768_decap(uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES],
+MLKEM768_decap(const MLKEM_private_key *private_key, const uint8_t *ciphertext,
-    const uint8_t *ciphertext, size_t ciphertext_len,
+    size_t ciphertext_len, uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES])
-    const struct MLKEM768_private_key *private_key)
 {
        const struct private_key *priv = private_key_768_from_external(
            private_key);
@@ -1059,11 +1054,10 @@ MLKEM768_decap(uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES],
        return 1;
 }
-LCRYPTO_ALIAS(MLKEM768_decap);
 int
-MLKEM768_marshal_public_key(uint8_t **output, size_t *output_len,
+MLKEM768_marshal_public_key(const MLKEM_public_key *public_key,
-    const struct MLKEM768_public_key *public_key)
+    uint8_t **output, size_t *output_len)
 {
        int ret = 0;
        CBB cbb;
@@ -1083,7 +1077,6 @@ MLKEM768_marshal_public_key(uint8_t **output, size_t *output_len,
        return ret;
 }
-LCRYPTO_ALIAS(MLKEM768_marshal_public_key);
 /*
 * mlkem_parse_public_key_no_hash parses |in| into |pub| but doesn't calculate
@@ -1107,8 +1100,8 @@ mlkem_parse_public_key_no_hash(struct public_key *pub, CBS *in)
 }
 int
-MLKEM768_parse_public_key(struct MLKEM768_public_key *public_key,
+MLKEM768_parse_public_key(const uint8_t *input, size_t input_len,
-    const uint8_t *input, size_t input_len)
+    MLKEM_public_key *public_key)
 {
        struct public_key *pub = public_key_768_from_external(public_key);
        CBS cbs;
@@ -1123,10 +1116,9 @@ MLKEM768_parse_public_key(struct MLKEM768_public_key *public_key,
        return 1;
 }
-LCRYPTO_ALIAS(MLKEM768_parse_public_key);
 int
-MLKEM768_marshal_private_key(const struct MLKEM768_private_key *private_key,
+MLKEM768_marshal_private_key(const MLKEM_private_key *private_key,
    uint8_t **out_private_key, size_t *out_private_key_len)
 {
        const struct private_key *const priv = private_key_768_from_external(
@@ -1160,8 +1152,8 @@ MLKEM768_marshal_private_key(const struct MLKEM768_private_key *private_key,
 }
 int
-MLKEM768_parse_private_key(struct MLKEM768_private_key *out_private_key,
+MLKEM768_parse_private_key(const uint8_t *input, size_t input_len,
-    const uint8_t *input, size_t input_len)
+    MLKEM_private_key *out_private_key)
 {
        struct private_key *const priv = private_key_768_from_external(
            out_private_key);
@@ -1189,4 +1181,3 @@ MLKEM768_parse_private_key(struct MLKEM768_private_key *out_private_key,
        return 1;
 }
-LCRYPTO_ALIAS(MLKEM768_parse_private_key);
diff --git a/src/lib/libcrypto/mlkem/mlkem_internal.h b/src/lib/libcrypto/mlkem/mlkem_internal.h
index 1e051970a8..776f8aac17 100644
--- a/src/lib/libcrypto/mlkem/mlkem_internal.h
+++ b/src/lib/libcrypto/mlkem/mlkem_internal.h
@@ -1,4 +1,4 @@
-/*      $OpenBSD: mlkem_internal.h,v 1.7 2025/05/20 00:33:40 beck Exp $ */
+/*      $OpenBSD: mlkem_internal.h,v 1.8 2025/08/14 15:48:48 beck Exp $ */
 /*
 * Copyright (c) 2023, Google Inc.
 *
@@ -26,6 +26,253 @@ extern "C" {
 #endif
 __BEGIN_HIDDEN_DECLS
+/*
+ * MLKEM_SEED_LENGTH is the number of bytes in an ML-KEM seed. An ML-KEM
+ * seed is normally used to represent a private key.
+ */
+#define MLKEM_SEED_LENGTH 64
+/*
+ * MLKEM_SHARED_SECRET_LENGTH is the number of bytes in an ML-KEM shared
+ * secret.
+ */
+#define MLKEM_SHARED_SECRET_LENGTH 32
+/*
+ * |MLKEM_encap_external_entropy| behaves exactly like the public |MLKEM_encap|
+ * with the entropy provided by the caller. It is directly called internally
+ * and by tests.
+ */
+int
+MLKEM_encap_external_entropy(const MLKEM_public_key *public_key,
+    const uint8_t *entropy, uint8_t **out_ciphertext,
+    size_t *out_ciphertext_len, uint8_t **out_shared_secret,
+    size_t *out_shared_secret_len);
+/*
+ * |MLKEM_generate_key_external_entropy| behaves exactly like the public
+ * |MLKEM_generate_key| with the entropy provided by the caller.
+ * It is directly called internally and by tests.
+ */
+int
+MLKEM_generate_key_external_entropy(MLKEM_private_key *private_key,
+    uint8_t **out_encoded_public_key, size_t *out_encoded_public_key_len,
+    const uint8_t *entropy);
+/*
+ * Marshals a private key to encoded format, used for NIST tests.
+ */
+int MLKEM_marshal_private_key(const MLKEM_private_key *private_key,
+    uint8_t **out, size_t *out_len);
+/*
+ * ML-KEM-768
+ *
+ * This implements the Module-Lattice-Based Key-Encapsulation Mechanism from
+ * https://csrc.nist.gov/pubs/fips/203/final
+ */
+/*
+ * MLKEM768_PUBLIC_KEY_BYTES is the number of bytes in an encoded ML-KEM768 public
+ * key.
+ */
+#define MLKEM768_PUBLIC_KEY_BYTES 1184
+/* MLKEM_SEED_BYTES is the number of bytes in an ML-KEM seed. */
+#define MLKEM_SEED_BYTES 64
+/*
+ *  MLKEM_SHARED_SECRET_BYTES is the number of bytes in the ML-KEM768 shared
+ * secret. Although the round-3 specification has a variable-length output, the
+ * final ML-KEM construction is expected to use a fixed 32-byte output. To
+ * simplify the future transition, we apply the same restriction.
+ */
+#define MLKEM_SHARED_SECRET_BYTES 32
+/*
+ * MLKEM768_generate_key generates a random public/private key pair, writes the
+ * encoded public key to |out_encoded_public_key| and sets |out_private_key| to
+ * the private key. If |optional_out_seed| is not NULL then the seed used to
+ * generate the private key is written to it.
+ */
+int MLKEM768_generate_key(
+    uint8_t out_encoded_public_key[MLKEM768_PUBLIC_KEY_BYTES],
+    uint8_t optional_out_seed[MLKEM_SEED_BYTES],
+    MLKEM_private_key *out_private_key);
+/*
+ * MLKEM768_private_key_from_seed derives a private key from a seed that was
+ * generated by |MLKEM768_generate_key|. It fails and returns 0 if |seed_len| is
+ * incorrect, otherwise it writes |*out_private_key| and returns 1.
+ */
+int MLKEM768_private_key_from_seed(const uint8_t *seed, size_t seed_len,
+    MLKEM_private_key *out_private_key);
+/*
+ * MLKEM768_public_from_private sets |*out_public_key| to the public key that
+ * corresponds to |private_key|. (This is faster than parsing the output of
+ * |MLKEM768_generate_key| if, for some reason, you need to encapsulate to a
+ * key that was just generated.)
+ */
+void MLKEM768_public_from_private(const MLKEM_private_key *private_key,
+    MLKEM_public_key *out_public_key);
+/* MLKEM768_CIPHERTEXT_BYTES is number of bytes in the ML-KEM768 ciphertext. */
+#define MLKEM768_CIPHERTEXT_BYTES 1088
+/*
+ * MLKEM768_encap encrypts a random shared secret for |public_key|, writes the
+ * ciphertext to |out_ciphertext|, and writes the random shared secret to
+ * |out_shared_secret|.
+ */
+void MLKEM768_encap(const MLKEM_public_key *public_key,
+    uint8_t out_ciphertext[MLKEM768_CIPHERTEXT_BYTES],
+    uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES]);
+/*
+ * MLKEM768_decap decrypts a shared secret from |ciphertext| using |private_key|
+ * and writes it to |out_shared_secret|. If |ciphertext_len| is incorrect it
+ * returns 0, otherwise it returns 1. If |ciphertext| is invalid,
+ * |out_shared_secret| is filled with a key that will always be the same for the
+ * same |ciphertext| and |private_key|, but which appears to be random unless
+ * one has access to |private_key|. These alternatives occur in constant time.
+ * Any subsequent symmetric encryption using |out_shared_secret| must use an
+ * authenticated encryption scheme in order to discover the decapsulation
+ * failure.
+ */
+int MLKEM768_decap(const MLKEM_private_key *private_key,
+    const uint8_t *ciphertext, size_t ciphertext_len,
+    uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES]);
+/* Serialisation of keys. */
+/*
+ * MLKEM768_marshal_public_key serializes |public_key| to |out| in the standard
+ * format for ML-KEM public keys. It returns one on success or zero on allocation
+ * error.
+ */
+int MLKEM768_marshal_public_key(const MLKEM_public_key *public_key,
+    uint8_t **output, size_t *output_len);
+/*
+ * MLKEM768_parse_public_key parses a public key, in the format generated by
+ * |MLKEM768_marshal_public_key|, from |input| and writes the result to
+ * |out_public_key|. It returns one on success or zero on parse error or if
+ * there are trailing bytes in |input|.
+ */
+int MLKEM768_parse_public_key(const uint8_t *input, size_t input_len,
+    MLKEM_public_key *out_public_key);
+/*
+ * MLKEM768_parse_private_key parses a private key, in the format generated by
+ * |MLKEM768_marshal_private_key|, from |input| and writes the result to
+ * |out_private_key|. It returns one on success or zero on parse error or if
+ * there are trailing bytes in |input|. This format is verbose and should be
+ * avoided. Private keys should be stored as seeds and parsed using
+ * |MLKEM768_private_key_from_seed|.
+ */
+int MLKEM768_parse_private_key(const uint8_t *input, size_t input_len,
+    MLKEM_private_key *out_private_key);
+/*
+ * ML-KEM-1024
+ *
+ * ML-KEM-1024 also exists. You should prefer ML-KEM-768 where possible.
+ */
+/*
+ * MLKEM1024_PUBLIC_KEY_BYTES is the number of bytes in an encoded ML-KEM-1024
+ * public key.
+ */
+#define MLKEM1024_PUBLIC_KEY_BYTES 1568
+/*
+ * MLKEM1024_generate_key generates a random public/private key pair, writes the
+ * encoded public key to |out_encoded_public_key| and sets |out_private_key| to
+ * the private key. If |optional_out_seed| is not NULL then the seed used to
+ * generate the private key is written to it.
+ */
+int MLKEM1024_generate_key(
+    uint8_t out_encoded_public_key[MLKEM1024_PUBLIC_KEY_BYTES],
+    uint8_t optional_out_seed[MLKEM_SEED_BYTES],
+    MLKEM_private_key *out_private_key);
+/*
+ * MLKEM1024_private_key_from_seed derives a private key from a seed that was
+ * generated by |MLKEM1024_generate_key|. It fails and returns 0 if |seed_len|
+ * is incorrect, otherwise it writes |*out_private_key| and returns 1.
+ */
+int MLKEM1024_private_key_from_seed(
+    MLKEM_private_key *out_private_key, const uint8_t *seed,
+    size_t seed_len);
+/*
+ * MLKEM1024_public_from_private sets |*out_public_key| to the public key that
+ * corresponds to |private_key|. (This is faster than parsing the output of
+ * |MLKEM1024_generate_key| if, for some reason, you need to encapsulate to a
+ * key that was just generated.)
+ */
+void MLKEM1024_public_from_private(const MLKEM_private_key *private_key,
+    MLKEM_public_key *out_public_key);
+/* MLKEM1024_CIPHERTEXT_BYTES is number of bytes in the ML-KEM-1024 ciphertext. */
+#define MLKEM1024_CIPHERTEXT_BYTES 1568
+/*
+ * MLKEM1024_encap encrypts a random shared secret for |public_key|, writes the
+ * ciphertext to |out_ciphertext|, and writes the random shared secret to
+ * |out_shared_secret|.
+ */
+void MLKEM1024_encap(const MLKEM_public_key *public_key,
+    uint8_t out_ciphertext[MLKEM1024_CIPHERTEXT_BYTES],
+    uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES]);
+/*
+ * MLKEM1024_decap decrypts a shared secret from |ciphertext| using
+ * |private_key| and writes it to |out_shared_secret|. If |ciphertext_len| is
+ * incorrect it returns 0, otherwise it returns 1. If |ciphertext| is invalid
+ * (but of the correct length), |out_shared_secret| is filled with a key that
+ * will always be the same for the same |ciphertext| and |private_key|, but
+ * which appears to be random unless one has access to |private_key|. These
+ * alternatives occur in constant time. Any subsequent symmetric encryption
+ * using |out_shared_secret| must use an authenticated encryption scheme in
+ * order to discover the decapsulation failure.
+ */
+int MLKEM1024_decap(const MLKEM_private_key *private_key,
+    const uint8_t *ciphertext, size_t ciphertext_len,
+    uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES]);
+/*
+ * Serialisation of ML-KEM-1024 keys.
+ * MLKEM1024_marshal_public_key serializes |public_key| to |out| in the standard
+ * format for ML-KEM-1024 public keys. It returns one on success or zero on
+ * allocation error.
+ */
+int MLKEM1024_marshal_public_key(const MLKEM_public_key *public_key,
+    uint8_t **output, size_t *output_len);
+/*
+ * MLKEM1024_parse_public_key parses a public key, in the format generated by
+ * |MLKEM1024_marshal_public_key|, from |in| and writes the result to
+ * |out_public_key|. It returns one on success or zero on parse error or if
+ * there are trailing bytes in |in|.
+ */
+int MLKEM1024_parse_public_key(const uint8_t *input, size_t input_len,
+    MLKEM_public_key *out_public_key);
+/*
+ * MLKEM1024_parse_private_key parses a private key, in NIST's format for
+ * private keys, from |in| and writes the result to |out_private_key|. It
+ * returns one on success or zero on parse error or if there are trailing bytes
+ * in |in|. This format is verbose and should be avoided. Private keys should be
+ * stored as seeds and parsed using |MLKEM1024_private_key_from_seed|.
+ */
+int MLKEM1024_parse_private_key(const uint8_t *input, size_t input_len,
+ MLKEM_private_key *out_private_key);
+/* XXXX Truly internal stuff below, also in need of de-duping */
 /*
 * MLKEM_ENCAP_ENTROPY is the number of bytes of uniformly random entropy
@@ -35,6 +282,49 @@ __BEGIN_HIDDEN_DECLS
 #define MLKEM_ENCAP_ENTROPY 32
 /*
+ * MLKEM768_public_key contains a ML-KEM-768 public key. The contents of this
+ * object should never leave the address space since the format is unstable.
+ */
+struct MLKEM768_public_key {
+        union {
+                /*
+                 * Raw storage for the internal representation; mlkem768.c
+                 * CTASSERTs that sizeof(struct MLKEM768_public_key) equals
+                 * the size of its private struct public_key.
+                 */
+                uint8_t bytes[512 * (3 + 9) + 32 + 32];
+                /* Gives the union at least uint16_t alignment. */
+                uint16_t alignment;
+        } opaque;
+};
+/*
+ * MLKEM768_private_key contains a ML-KEM-768 private key. The contents of this
+ * object should never leave the address space since the format is unstable.
+ */
+struct MLKEM768_private_key {
+        union {
+                /*
+                 * Raw storage for the internal representation; mlkem768.c
+                 * CTASSERTs that sizeof(struct MLKEM768_private_key) equals
+                 * the size of its private struct private_key.
+                 */
+                uint8_t bytes[512 * (3 + 3 + 9) + 32 + 32 + 32];
+                /* Gives the union at least uint16_t alignment. */
+                uint16_t alignment;
+        } opaque;
+};
+/* Public opaque ML-KEM key structures. */
+#define MLKEM_PUBLIC_KEY_UNINITIALIZED 1
+#define MLKEM_PUBLIC_KEY_INITIALIZED 2
+#define MLKEM_PRIVATE_KEY_UNINITIALIZED 3
+#define MLKEM_PRIVATE_KEY_INITIALIZED 4
+struct MLKEM_public_key_st {
+        /* RANK768 or RANK1024, as set by MLKEM_public_key_new(). */
+        uint16_t rank;
+        /*
+         * MLKEM_PUBLIC_KEY_UNINITIALIZED after MLKEM_public_key_new().
+         * NOTE(review): presumably becomes MLKEM_PUBLIC_KEY_INITIALIZED once
+         * key material is loaded - confirm where the transition happens.
+         */
+        int state;
+        /* Allocated by MLKEM_public_key_new() only for RANK768, else NULL. */
+        struct MLKEM768_public_key *key_768;
+        /* Allocated by MLKEM_public_key_new() only for RANK1024, else NULL. */
+        struct MLKEM1024_public_key *key_1024;
+};
+struct MLKEM_private_key_st {
+        /* RANK768 or RANK1024, as set by MLKEM_private_key_new(). */
+        uint16_t rank;
+        /*
+         * MLKEM_PRIVATE_KEY_UNINITIALIZED after MLKEM_private_key_new().
+         * NOTE(review): presumably becomes MLKEM_PRIVATE_KEY_INITIALIZED once
+         * key material is loaded - confirm where the transition happens.
+         */
+        int state;
+        /* Allocated by MLKEM_private_key_new() only for RANK768, else NULL. */
+        struct MLKEM768_private_key *key_768;
+        /* Allocated by MLKEM_private_key_new() only for RANK1024, else NULL. */
+        struct MLKEM1024_private_key *key_1024;
+};
+/*
 * MLKEM768_generate_key_external_entropy is a deterministic function to create a
 * pair of ML-KEM 768 keys, using the supplied entropy. The entropy needs to be
 * uniformly random generated. This function is should only be used for tests,
@@ -43,7 +333,7 @@ __BEGIN_HIDDEN_DECLS
 */
 int MLKEM768_generate_key_external_entropy(
    uint8_t out_encoded_public_key[MLKEM768_PUBLIC_KEY_BYTES],
-    struct MLKEM768_private_key *out_private_key,
+    MLKEM_private_key *out_private_key,
    const uint8_t entropy[MLKEM_SEED_BYTES]);
 /*
@@ -57,11 +347,11 @@ int MLKEM768_generate_key_external_entropy(
 * format for ML-KEM private keys. It returns one on success or zero on
 * allocation error.
 */
-int MLKEM768_marshal_private_key(const struct MLKEM768_private_key *private_key,
+int MLKEM768_marshal_private_key(const MLKEM_private_key *private_key,
    uint8_t **out_private_key, size_t *out_private_key_len);
 /*
- * MLKEM_encap_external_entropy behaves like |MLKEM_encap|, but uses
+ * MLKEM768_encap_external_entropy behaves like |MLKEM768_encap|, but uses
 * |MLKEM_ENCAP_ENTROPY| bytes of |entropy| for randomization. The decapsulating
 * side will be able to recover |entropy| in full. This function should only be
 * used for tests, regular callers should use the non-deterministic
@@ -70,9 +360,34 @@ int MLKEM768_marshal_private_key(const struct MLKEM768_private_key *private_key,
 void MLKEM768_encap_external_entropy(
    uint8_t out_ciphertext[MLKEM768_CIPHERTEXT_BYTES],
    uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES],
-    const struct MLKEM768_public_key *public_key,
+    const MLKEM_public_key *public_key,
    const uint8_t entropy[MLKEM_ENCAP_ENTROPY]);
+/*
+ * MLKEM1024_public_key contains an ML-KEM-1024 public key. The contents of this
+ * object should never leave the address space since the format is unstable.
+ */
+struct MLKEM1024_public_key {
+        union {
+                /*
+                 * Raw storage for the internal representation.
+                 * NOTE(review): the 768 counterpart is size-checked with a
+                 * CTASSERT in mlkem768.c - confirm mlkem1024.c does the same
+                 * for this type.
+                 */
+                uint8_t bytes[512 * (4 + 16) + 32 + 32];
+                /* Gives the union at least uint16_t alignment. */
+                uint16_t alignment;
+        } opaque;
+};
+/*
+ * MLKEM1024_private_key contains a ML-KEM-1024 private key. The contents of
+ * this object should never leave the address space since the format is
+ * unstable.
+ */
+struct MLKEM1024_private_key {
+        union {
+                /*
+                 * Raw storage for the internal representation.
+                 * NOTE(review): the 768 counterpart is size-checked with a
+                 * CTASSERT in mlkem768.c - confirm mlkem1024.c does the same
+                 * for this type.
+                 */
+                uint8_t bytes[512 * (4 + 4 + 16) + 32 + 32 + 32];
+                /* Gives the union at least uint16_t alignment. */
+                uint16_t alignment;
+        } opaque;
+};
 /*
 * MLKEM1024_generate_key_external_entropy is a deterministic function to create a
 * pair of ML-KEM 1024 keys, using the supplied entropy. The entropy needs to be
@@ -82,7 +397,7 @@ void MLKEM768_encap_external_entropy(
 */
 int MLKEM1024_generate_key_external_entropy(
    uint8_t out_encoded_public_key[MLKEM1024_PUBLIC_KEY_BYTES],
-    struct MLKEM1024_private_key *out_private_key,
+    MLKEM_private_key *out_private_key,
    const uint8_t entropy[MLKEM_SEED_BYTES]);
 /*
@@ -97,7 +412,7 @@ int MLKEM1024_generate_key_external_entropy(
 * allocation error.
 */
 int MLKEM1024_marshal_private_key(
-    const struct MLKEM1024_private_key *private_key, uint8_t **out_private_key,
+    const MLKEM_private_key *private_key, uint8_t **out_private_key,
    size_t *out_private_key_len);
 /*
@@ -110,7 +425,7 @@ int MLKEM1024_marshal_private_key(
 void MLKEM1024_encap_external_entropy(
    uint8_t out_ciphertext[MLKEM1024_CIPHERTEXT_BYTES],
    uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES],
-    const struct MLKEM1024_public_key *public_key,
+    const MLKEM_public_key *public_key,
    const uint8_t entropy[MLKEM_ENCAP_ENTROPY]);
 __END_HIDDEN_DECLS
diff --git a/src/lib/libcrypto/mlkem/mlkem_key.c b/src/lib/libcrypto/mlkem/mlkem_key.c
new file mode 100644
index 0000000000..051d8f2b88
--- /dev/null
+++ b/src/lib/libcrypto/mlkem/mlkem_key.c
@@ -0,0 +1,200 @@
+/* $OpenBSD: mlkem_key.c,v 1.1 2025/08/14 15:48:48 beck Exp $ */
+/*
+ * Copyright (c) 2025 Bob Beck <beck@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+#include <stdlib.h>
+#include <string.h>
+#include <openssl/mlkem.h>
+#include "mlkem_internal.h"
+/*
+ * Allocate a zero-filled private key container for |rank|. Only the key
+ * buffer matching |rank| (RANK768 or RANK1024) is allocated; the other
+ * pointer stays NULL. The new key is marked MLKEM_PRIVATE_KEY_UNINITIALIZED.
+ * Returns NULL if |rank| is not a supported value or an allocation fails.
+ * Release the result with MLKEM_private_key_free().
+ */
+MLKEM_private_key *
+MLKEM_private_key_new(int rank)
+{
+        MLKEM_private_key *priv;
+        if ((priv = calloc(1, sizeof(*priv))) == NULL)
+                return NULL;
+        if (rank == RANK768)
+                priv->key_768 = calloc(1, sizeof(*priv->key_768));
+        else if (rank == RANK1024)
+                priv->key_1024 = calloc(1, sizeof(*priv->key_1024));
+        /* An unsupported rank and a failed allocation both leave these NULL. */
+        if (priv->key_768 == NULL && priv->key_1024 == NULL) {
+                MLKEM_private_key_free(priv);
+                return NULL;
+        }
+        priv->rank = rank;
+        priv->state = MLKEM_PRIVATE_KEY_UNINITIALIZED;
+        return priv;
+}
+LCRYPTO_ALIAS(MLKEM_private_key_new);
+/*
+ * Release |key| together with whichever per-rank buffers it owns, using
+ * freezero() for every allocation. Passing NULL is a no-op.
+ */
+void
+MLKEM_private_key_free(MLKEM_private_key *key)
+{
+        if (key != NULL) {
+                freezero(key->key_1024, sizeof(*key->key_1024));
+                freezero(key->key_768, sizeof(*key->key_768));
+                freezero(key, sizeof(*key));
+        }
+}
+LCRYPTO_ALIAS(MLKEM_private_key_free);
+/*
+ * Return the encoded private key size selected by |key|'s rank:
+ * MLKEM768_PRIVATE_KEY_BYTES or MLKEM1024_PRIVATE_KEY_BYTES. Returns 0
+ * when |key| is NULL or carries any other rank.
+ */
+size_t
+MLKEM_private_key_encoded_length(const MLKEM_private_key *key)
+{
+        size_t len = 0;
+        if (key != NULL) {
+                if (key->rank == RANK768)
+                        len = MLKEM768_PRIVATE_KEY_BYTES;
+                else if (key->rank == RANK1024)
+                        len = MLKEM1024_PRIVATE_KEY_BYTES;
+        }
+        return len;
+}
+LCRYPTO_ALIAS(MLKEM_private_key_encoded_length);
+/*
+ * Return the ciphertext size selected by |key|'s rank:
+ * MLKEM768_CIPHERTEXT_BYTES or MLKEM1024_CIPHERTEXT_BYTES. Returns 0 when
+ * |key| is NULL or carries any other rank.
+ */
+size_t
+MLKEM_private_key_ciphertext_length(const MLKEM_private_key *key)
+{
+        size_t len = 0;
+        if (key != NULL) {
+                if (key->rank == RANK768)
+                        len = MLKEM768_CIPHERTEXT_BYTES;
+                else if (key->rank == RANK1024)
+                        len = MLKEM1024_CIPHERTEXT_BYTES;
+        }
+        return len;
+}
+LCRYPTO_ALIAS(MLKEM_private_key_ciphertext_length);
+/*
+ * Allocate a zero-filled public key container for |rank|. Only the key
+ * buffer matching |rank| (RANK768 or RANK1024) is allocated; the other
+ * pointer stays NULL. The new key is marked MLKEM_PUBLIC_KEY_UNINITIALIZED.
+ * Returns NULL if |rank| is not a supported value or an allocation fails.
+ * Release the result with MLKEM_public_key_free().
+ */
+MLKEM_public_key *
+MLKEM_public_key_new(int rank)
+{
+        MLKEM_public_key *pub;
+        if ((pub = calloc(1, sizeof(*pub))) == NULL)
+                return NULL;
+        if (rank == RANK768)
+                pub->key_768 = calloc(1, sizeof(*pub->key_768));
+        else if (rank == RANK1024)
+                pub->key_1024 = calloc(1, sizeof(*pub->key_1024));
+        /* An unsupported rank and a failed allocation both leave these NULL. */
+        if (pub->key_768 == NULL && pub->key_1024 == NULL) {
+                MLKEM_public_key_free(pub);
+                return NULL;
+        }
+        pub->rank = rank;
+        pub->state = MLKEM_PUBLIC_KEY_UNINITIALIZED;
+        return pub;
+}
+LCRYPTO_ALIAS(MLKEM_public_key_new);
+/*
+ * Release |key| together with whichever per-rank buffers it owns, using
+ * freezero() for every allocation. Passing NULL is a no-op.
+ */
+void
+MLKEM_public_key_free(MLKEM_public_key *key)
+{
+        if (key != NULL) {
+                freezero(key->key_1024, sizeof(*key->key_1024));
+                freezero(key->key_768, sizeof(*key->key_768));
+                freezero(key, sizeof(*key));
+        }
+}
+LCRYPTO_ALIAS(MLKEM_public_key_free);
+/*
+ * Return the encoded public key size selected by |key|'s rank:
+ * MLKEM768_PUBLIC_KEY_BYTES or MLKEM1024_PUBLIC_KEY_BYTES. Returns 0 when
+ * |key| is NULL or carries any other rank.
+ */
+size_t
+MLKEM_public_key_encoded_length(const MLKEM_public_key *key)
+{
+        size_t len = 0;
+        if (key != NULL) {
+                if (key->rank == RANK768)
+                        len = MLKEM768_PUBLIC_KEY_BYTES;
+                else if (key->rank == RANK1024)
+                        len = MLKEM1024_PUBLIC_KEY_BYTES;
+        }
+        return len;
+}
+LCRYPTO_ALIAS(MLKEM_public_key_encoded_length);
+/*
+ * Return the ciphertext size selected by |key|'s rank:
+ * MLKEM768_CIPHERTEXT_BYTES or MLKEM1024_CIPHERTEXT_BYTES. Returns 0 when
+ * |key| is NULL or carries any other rank.
+ */
+size_t
+MLKEM_public_key_ciphertext_length(const MLKEM_public_key *key)
+{
+        size_t len = 0;
+        if (key != NULL) {
+                if (key->rank == RANK768)
+                        len = MLKEM768_CIPHERTEXT_BYTES;
+                else if (key->rank == RANK1024)
+                        len = MLKEM1024_CIPHERTEXT_BYTES;
+        }
+        return len;
+}
+LCRYPTO_ALIAS(MLKEM_public_key_ciphertext_length);
diff --git a/src/lib/libcrypto/rc4/rc4.c b/src/lib/libcrypto/rc4/rc4.c
index 56ed43cba7..69b7d0a815 100644
--- a/src/lib/libcrypto/rc4/rc4.c
+++ b/src/lib/libcrypto/rc4/rc4.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: rc4.c,v 1.13 2025/01/27 14:02:32 jsing Exp $ */
+/* $OpenBSD: rc4.c,v 1.15 2025/08/17 08:04:25 jsing Exp $ */
 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
 * All rights reserved.
 *
@@ -57,234 +57,123 @@
 */
 #include <endian.h>
+#include <stdint.h>
 #include <openssl/rc4.h>
 #include "crypto_arch.h"
-/* RC4 as implemented from a posting from
- * Newsgroups: sci.crypt
- * From: sterndark@netcom.com (David Sterndark)
- * Subject: RC4 Algorithm revealed.
- * Message-ID: <sternCvKL4B.Hyy@netcom.com>
- * Date: Wed, 14 Sep 1994 06:35:31 GMT
- */
 #ifdef HAVE_RC4_INTERNAL
-void rc4_internal(RC4_KEY *key, size_t len, const unsigned char *indata,
+void rc4_internal(RC4_KEY *key, size_t len, const uint8_t *in,
-    unsigned char *outdata);
+    uint8_t *out);
 #else
-static void
+static inline RC4_INT /* Advance state one step; return keystream value. */
-rc4_internal(RC4_KEY *key, size_t len, const unsigned char *indata,
+rc4_step(RC4_INT *d, RC4_INT *x, RC4_INT *y)
-    unsigned char *outdata)
 {
-        RC4_INT *d;
+        RC4_INT tx, ty;
-        RC4_INT x, y,tx, ty;
-        size_t i;
-        x = key->x;
+        *x = (*x + 1) & 0xff;          /* advance x, wrapping at 256 */
-        y = key->y;
+        tx = d[*x];
-        d = key->data;
+        *y = (tx + *y) & 0xff;         /* move y by d[x], wrapping at 256 */
+        d[*x] = ty = d[*y];            /* swap d[x] and d[y] ... */
+        d[*y] = tx;                    /* ... keeping both values in tx, ty */
-#if defined(RC4_CHUNK)
+        return d[(tx + ty) & 0xff];    /* output is indexed by their sum */
-        /*
+}
-         * The original reason for implementing this(*) was the fact that
-         * pre-21164a Alpha CPUs don't have byte load/store instructions
-         * and e.g. a byte store has to be done with 64-bit load, shift,
-         * and, or and finally 64-bit store. Peaking data and operating
-         * at natural word size made it possible to reduce amount of
-         * instructions as well as to perform early read-ahead without
-         * suffering from RAW (read-after-write) hazard. This resulted
-         * in ~40%(**) performance improvement on 21064 box with gcc.
-         * But it's not only Alpha users who win here:-) Thanks to the
-         * early-n-wide read-ahead this implementation also exhibits
-         * >40% speed-up on SPARC and 20-30% on 64-bit MIPS (depending
-         * on sizeof(RC4_INT)).
-         *
-         * (*)  "this" means code which recognizes the case when input
-         *      and output pointers appear to be aligned at natural CPU
-         *      word boundary
-         * (**) i.e. according to 'apps/openssl speed rc4' benchmark,
-         *      crypto/rc4/rc4speed.c exhibits almost 70% speed-up...
-         *
-         * Caveats.
-         *
-         * - RC4_CHUNK="unsigned long long" should be a #1 choice for
-         *   UltraSPARC. Unfortunately gcc generates very slow code
-         *   (2.5-3 times slower than one generated by Sun's WorkShop
-         *   C) and therefore gcc (at least 2.95 and earlier) should
-         *   always be told that RC4_CHUNK="unsigned long".
-         *
-         *                                      <appro@fy.chalmers.se>
-         */
-# define RC4_STEP       ( \
+#if BYTE_ORDER == BIG_ENDIAN
-                        x=(x+1) &0xff,  \
+static inline uint64_t /* Pack 8 steps, first step in the top byte. */
-                        tx=d[x],        \
+rc4_chunk(RC4_INT *d, RC4_INT *x, RC4_INT *y)
-                        y=(tx+y)&0xff,  \
+{
-                        ty=d[y],        \
+        uint64_t chunk = 0;
-                        d[y]=tx,        \
+        size_t i;
-                        d[x]=ty,        \
-                        (RC4_CHUNK)d[(tx+ty)&0xff]\
-                        )
-        if ((((size_t)indata & (sizeof(RC4_CHUNK) - 1)) |
+        for (i = 0; i < 8; i++)        /* earlier values shift upward */
-            ((size_t)outdata & (sizeof(RC4_CHUNK) - 1))) == 0 ) {
+                chunk = chunk << 8 | (uint64_t)rc4_step(d, x, y);
-                RC4_CHUNK ichunk, otp;
+        return chunk;
+}
-                /*
-                 * I reckon we can afford to implement both endian
-                 * cases and to decide which way to take at run-time
-                 * because the machine code appears to be very compact
-                 * and redundant 1-2KB is perfectly tolerable (i.e.
-                 * in case the compiler fails to eliminate it:-). By
-                 * suggestion from Terrel Larson <terr@terralogic.net>.
-                 *
-                 * Special notes.
-                 *
-                 * - compilers (those I've tried) don't seem to have
-                 *   problems eliminating either the operators guarded
-                 *   by "if (sizeof(RC4_CHUNK)==8)" or the condition
-                 *   expressions themselves so I've got 'em to replace
-                 *   corresponding #ifdefs from the previous version;
-                 * - I chose to let the redundant switch cases when
-                 *   sizeof(RC4_CHUNK)!=8 be (were also #ifdefed
-                 *   before);
-                 * - in case you wonder "&(sizeof(RC4_CHUNK)*8-1)" in
-                 *   [LB]ESHFT guards against "shift is out of range"
-                 *   warnings when sizeof(RC4_CHUNK)!=8
-                 *
-                 *                      <appro@fy.chalmers.se>
-                 */
-#if BYTE_ORDER == BIG_ENDIAN
-# define BESHFT(c)      (((sizeof(RC4_CHUNK)-(c)-1)*8)&(sizeof(RC4_CHUNK)*8-1))
-                for (; len & (0 - sizeof(RC4_CHUNK)); len -= sizeof(RC4_CHUNK)) {
-                        ichunk  = *(RC4_CHUNK *)indata;
-                        otp = RC4_STEP << BESHFT(0);
-                        otp |= RC4_STEP << BESHFT(1);
-                        otp |= RC4_STEP << BESHFT(2);
-                        otp |= RC4_STEP << BESHFT(3);
-                        if (sizeof(RC4_CHUNK) == 8) {
-                                otp |= RC4_STEP << BESHFT(4);
-                                otp |= RC4_STEP << BESHFT(5);
-                                otp |= RC4_STEP << BESHFT(6);
-                                otp |= RC4_STEP << BESHFT(7);
-                        }
-                        *(RC4_CHUNK *)outdata = otp^ichunk;
-                        indata += sizeof(RC4_CHUNK);
-                        outdata += sizeof(RC4_CHUNK);
-                }
 #else
-# define LESHFT(c)      (((c)*8)&(sizeof(RC4_CHUNK)*8-1))
+static inline uint64_t /* Pack 8 steps, first step in the low byte. */
-                for (; len & (0 - sizeof(RC4_CHUNK)); len -= sizeof(RC4_CHUNK)) {
+rc4_chunk(RC4_INT *d, RC4_INT *x, RC4_INT *y)
-                        ichunk = *(RC4_CHUNK *)indata;
+{
-                        otp = RC4_STEP;
+        uint64_t chunk = 0;
-                        otp |= RC4_STEP << 8;
+        size_t i;
-                        otp |= RC4_STEP << 16;
-                        otp |= RC4_STEP << 24;
+        for (i = 0; i < 8; i++)        /* step i lands in byte i */
-                        if (sizeof(RC4_CHUNK) == 8) {
+                chunk |= (uint64_t)rc4_step(d, x, y) << (i * 8);
-                                otp |= RC4_STEP << LESHFT(4);
-                                otp |= RC4_STEP << LESHFT(5);
+        return chunk;
-                                otp |= RC4_STEP << LESHFT(6);
+}
-                                otp |= RC4_STEP << LESHFT(7);
-                        }
-                        *(RC4_CHUNK *)outdata = otp ^ ichunk;
-                        indata += sizeof(RC4_CHUNK);
-                        outdata += sizeof(RC4_CHUNK);
-                }
-#endif
-        }
 #endif
-#define RC4_LOOP(in,out) \
-                x=((x+1)&0xff); \
-                tx=d[x]; \
-                y=(tx+y)&0xff; \
-                d[x]=ty=d[y]; \
-                d[y]=tx; \
-                (out) = d[(tx+ty)&0xff]^ (in);
-        i = len >> 3;
+static void /* XOR len bytes of in with the keystream into out. */
-        if (i) {
+rc4_internal(RC4_KEY *key, size_t len, const uint8_t *in, uint8_t *out)
-                for (;;) {
+{
-                        RC4_LOOP(indata[0], outdata[0]);
+        RC4_INT *d, x, y;
-                        RC4_LOOP(indata[1], outdata[1]);
+        size_t i;
-                        RC4_LOOP(indata[2], outdata[2]);
-                        RC4_LOOP(indata[3], outdata[3]);
-                        RC4_LOOP(indata[4], outdata[4]);
-                        RC4_LOOP(indata[5], outdata[5]);
-                        RC4_LOOP(indata[6], outdata[6]);
-                        RC4_LOOP(indata[7], outdata[7]);
-                        indata += 8;
+        x = key->x;     /* work on local copies; written back at the end */
-                        outdata += 8;
+        y = key->y;
+        d = key->data;  /* the table itself is updated in place */
+        /* Process uint64_t chunks if 8 byte aligned. */
+        if ((((size_t)in | (size_t)out) % 8) == 0) {   /* both pointers */
+                while (len >= 8) {
+                        *(uint64_t *)out = *(const uint64_t *)in ^ rc4_chunk(d, &x, &y);
-                        if (--i == 0)
+                        in += 8;
-                                break;
+                        out += 8;
+                        len -= 8;
                 }
         }
-        i = len&0x07;
-        if (i) {
+        while (len >= 8) {     /* only entered if the aligned path was skipped */
-                for (;;) {
+                for (i = 0; i < 8; i++)
-                        RC4_LOOP(indata[0], outdata[0]);
+                        out[i] = rc4_step(d, &x, &y) ^ in[i];
-                        if (--i == 0)
-                                break;
+                in += 8;
-                        RC4_LOOP(indata[1], outdata[1]);
+                out += 8;
-                        if (--i == 0)
+                len -= 8;
-                                break;
-                        RC4_LOOP(indata[2], outdata[2]);
-                        if (--i == 0)
-                                break;
-                        RC4_LOOP(indata[3], outdata[3]);
-                        if (--i == 0)
-                                break;
-                        RC4_LOOP(indata[4], outdata[4]);
-                        if (--i == 0)
-                                break;
-                        RC4_LOOP(indata[5], outdata[5]);
-                        if (--i == 0)
-                                break;
-                        RC4_LOOP(indata[6], outdata[6]);
-                        if (--i == 0)
-                                break;
-                }
         }
+        for (i = 0; i < len; i++)      /* remaining 0-7 bytes */
+                out[i] = rc4_step(d, &x, &y) ^ in[i];
         key->x = x;
         key->y = y;
 }
 #endif
 #ifdef HAVE_RC4_SET_KEY_INTERNAL
-void rc4_set_key_internal(RC4_KEY *key, int len, const unsigned char *data);
+void rc4_set_key_internal(RC4_KEY *key, int len, const uint8_t *data);
 #else
 static inline void
-rc4_set_key_internal(RC4_KEY *key, int len, const unsigned char *data)
+rc4_set_key_internal(RC4_KEY *key, int len, const uint8_t *data)
 {
-        RC4_INT tmp;
+        RC4_INT *d, tmp;        /* d: key's state table, keyed from data */
-        int id1, id2;
+        int idx1, idx2;         /* idx1 walks data; idx2 picks the swap slot */
-        RC4_INT *d;
+        int i, j;
-        unsigned int i;
-        d = &(key->data[0]);
+        d = key->data;
         key->x = 0;
         key->y = 0;
-        id1 = id2 = 0;
+        idx1 = idx2 = 0;
-#define SK_LOOP(d,n) { \
-                tmp=d[(n)]; \
-                id2 = (data[id1] + tmp + id2) & 0xff; \
-                if (++id1 == len) id1=0; \
-                d[(n)]=d[id2]; \
-                d[id2]=tmp; }
         for (i = 0; i < 256; i++)
                 d[i] = i;
         for (i = 0; i < 256; i += 4) {
-                SK_LOOP(d, i + 0);
+                for (j = 0; j < 4; j++) {      /* i + j visits 0..255 in order */
-                SK_LOOP(d, i + 1);
+                        tmp = d[i + j];
-                SK_LOOP(d, i + 2);
+                        idx2 = (data[idx1] + tmp + idx2) & 0xff;
-                SK_LOOP(d, i + 3);
+                        d[i + j] = d[idx2];    /* swap d[i + j] and d[idx2] */
+                        d[idx2] = tmp;
+                        if (++idx1 == len)     /* reuse data cyclically */
+                                idx1 = 0;      /* NOTE(review): needs len >= 1 */
+                }
         }
 }
 #endif
diff --git a/src/lib/libcrypto/x509/x509.h b/src/lib/libcrypto/x509/x509.h
index 729a06d0ed..4148a6398e 100644
--- a/src/lib/libcrypto/x509/x509.h
+++ b/src/lib/libcrypto/x509/x509.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: x509.h,v 1.123 2025/07/16 15:59:26 tb Exp $ */
+/* $OpenBSD: x509.h,v 1.124 2025/08/10 06:36:45 beck Exp $ */
 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
 * All rights reserved.
 *
@@ -995,6 +995,7 @@ void ERR_load_X509_strings(void);
 #define X509_R_ERR_ASN1_LIB                              102
 #define X509_R_INVALID_DIRECTORY                         113
 #define X509_R_INVALID_FIELD_NAME                        119
+#define X509_R_INVALID_POLICY_EXTENSION                  201
 #define X509_R_INVALID_TRUST                             123
 #define X509_R_INVALID_VERSION                           137
 #define X509_R_KEY_TYPE_MISMATCH                         115
diff --git a/src/lib/libcrypto/x509/x509_policy.c b/src/lib/libcrypto/x509/x509_policy.c
index 8267e8dc49..2df965aad1 100644
--- a/src/lib/libcrypto/x509/x509_policy.c
+++ b/src/lib/libcrypto/x509/x509_policy.c
@@ -1,4 +1,4 @@
-/*      $OpenBSD: x509_policy.c,v 1.32 2025/05/10 05:54:39 tb Exp $ */
+/*      $OpenBSD: x509_policy.c,v 1.33 2025/08/10 06:36:45 beck Exp $ */
 /*
 * Copyright (c) 2022, Google Inc.
 *
@@ -27,9 +27,6 @@
 #include "x509_internal.h"
 #include "x509_local.h"
-/* XXX move to proper place */
-#define X509_R_INVALID_POLICY_EXTENSION 201
 /*
 * This file computes the X.509 policy tree, as described in RFC 5280,
 * section 6.1 and RFC 9618. It differs in that: