Add ML-KEM 768 from BoringSSL

Changes include conversion from C++, basic KNF, then adaptation to use our sha3 functions for sha3 and shake instead of the BoringSSL version. This adds unit tests to run against BoringSSL and NIST test vectors. The future public API is the same as Boring's - but is not yet exposed pending making bytestring.h public (which will happen separately) and a minor bump. Currently this will just ensure we build and run regress. ok tb@ to get it into the tree and massage from there.
author: beck <> 2024-12-13 00:03:57 +0000
committer: beck <> 2024-12-13 00:03:57 +0000
commit: ed3c5d3a4797d4b1d8f9769cfc43f8e686a59621 (patch)
tree: a9ff1c725db56dbeb46224505b3dd6fd05a21777 /src/lib
parent: fd906c7b27573203602764309c3cf5faaefdf573 (diff)
download: openbsd-ed3c5d3a4797d4b1d8f9769cfc43f8e686a59621.tar.gz
openbsd-ed3c5d3a4797d4b1d8f9769cfc43f8e686a59621.tar.bz2
openbsd-ed3c5d3a4797d4b1d8f9769cfc43f8e686a59621.zip
5 files changed, 1412 insertions, 1 deletions
diff --git a/src/lib/libcrypto/Makefile b/src/lib/libcrypto/Makefile
index c981a4189f..f43b09d176 100644
--- a/src/lib/libcrypto/Makefile
+++ b/src/lib/libcrypto/Makefile
@@ -1,4 +1,4 @@
-# $OpenBSD: Makefile,v 1.228 2024/11/16 10:38:10 tb Exp $
+# $OpenBSD: Makefile,v 1.229 2024/12/13 00:03:57 beck Exp $
 LIB=    crypto
 LIBREBUILD=y
@@ -43,6 +43,7 @@ CFLAGS+= -I${LCRYPTO_SRC}/hidden
 CFLAGS+= -I${LCRYPTO_SRC}/hmac
 CFLAGS+= -I${LCRYPTO_SRC}/kdf
 CFLAGS+= -I${LCRYPTO_SRC}/lhash
+CFLAGS+= -I${LCRYPTO_SRC}/mlkem
 CFLAGS+= -I${LCRYPTO_SRC}/modes
 CFLAGS+= -I${LCRYPTO_SRC}/ocsp
 CFLAGS+= -I${LCRYPTO_SRC}/pkcs12
@@ -371,6 +372,9 @@ SRCS+= md4.c
 # md5/
 SRCS+= md5.c
+# mlkem/
+SRCS+= mlkem768.c
 # modes/
 SRCS+= cbc128.c
 SRCS+= ccm128.c
@@ -607,6 +611,7 @@ SRCS+= x_all.c
        ${LCRYPTO_SRC}/lhash \
        ${LCRYPTO_SRC}/md4 \
        ${LCRYPTO_SRC}/md5 \
+        ${LCRYPTO_SRC}/mlkem \
        ${LCRYPTO_SRC}/modes \
        ${LCRYPTO_SRC}/objects \
        ${LCRYPTO_SRC}/ocsp \
@@ -639,6 +644,7 @@ HDRS=\
        ${LCRYPTO_SRC}/bio/bio.h \
        ${LCRYPTO_SRC}/bn/bn.h \
        ${LCRYPTO_SRC}/buffer/buffer.h \
+        ${LCRYPTO_SRC}/bytestring/bytestring.h \
        ${LCRYPTO_SRC}/camellia/camellia.h \
        ${LCRYPTO_SRC}/cast/cast.h \
        ${LCRYPTO_SRC}/chacha/chacha.h \
@@ -665,6 +671,7 @@ HDRS=\
        ${LCRYPTO_SRC}/lhash/lhash.h \
        ${LCRYPTO_SRC}/md4/md4.h \
        ${LCRYPTO_SRC}/md5/md5.h \
+        ${LCRYPTO_SRC}/mlkem/mlkem.h \
        ${LCRYPTO_SRC}/modes/modes.h \
        ${LCRYPTO_SRC}/objects/objects.h \
        ${LCRYPTO_SRC}/ocsp/ocsp.h \
diff --git a/src/lib/libcrypto/hidden/openssl/mlkem.h b/src/lib/libcrypto/hidden/openssl/mlkem.h
new file mode 100644
index 0000000000..01ac28cffd
--- /dev/null
+++ b/src/lib/libcrypto/hidden/openssl/mlkem.h
@@ -0,0 +1,40 @@
+/* $OpenBSD: mlkem.h,v 1.1 2024/12/13 00:03:57 beck Exp $ */
+/*
+ * Copyright (c) 2024 Bob Beck <beck@obtuse.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+#ifndef _LIBCRYPTO_MLKEM_H
+#define _LIBCRYPTO_MLKEM_H
+#ifndef _MSC_VER
+#include_next <openssl/mlkem.h>
+#else
+#include "../include/openssl/mlkem.h"
+#endif
+#include "crypto_namespace.h"
+/* Undo when making public */
+#ifdef LIBRESSL_HAS_MLKEM
+LCRYPTO_USED(MLKEM768_generate_key);
+LCRYPTO_USED(MLKEM768_public_from_private);
+LCRYPTO_USED(MLKEM768_encap);
+LCRYPTO_USED(MLKEM768_decap);
+LCRYPTO_USED(MLKEM768_marshal_public_key);
+LCRYPTO_USED(MLKEM768_parse_public_key);
+LCRYPTO_USED(MLKEM768_private_key_from_seed);
+LCRYPTO_USED(MLKEM768_parse_private_key);
+#endif
+#endif /* _LIBCRYPTO_MLKEM_H */
diff --git a/src/lib/libcrypto/mlkem/mlkem.h b/src/lib/libcrypto/mlkem/mlkem.h
new file mode 100644
index 0000000000..8040f4844b
--- /dev/null
+++ b/src/lib/libcrypto/mlkem/mlkem.h
@@ -0,0 +1,168 @@
+/* Copyright (c) 2024, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+#ifndef OPENSSL_HEADER_MLKEM_H
+#define OPENSSL_HEADER_MLKEM_H
+#include <sys/types.h>
+#include <stdint.h>
+#ifdef LIBRESSL_HAS_MLKEM
+/* This needs to become public */
+#include <openssl/bytestring.h>
+#else
+/* Hack for now */
+struct cbs_st;
+struct cbb_st;
+#endif
+#if defined(__cplusplus)
+extern "C" {
+#endif
+/*
+ * ML-KEM-768
+ *
+ * This implements the Module-Lattice-Based Key-Encapsulation Mechanism from
+ * https://csrc.nist.gov/pubs/fips/203/final
+ */
+/*
+ * MLKEM768_public_key contains an ML-KEM-768 public key. The contents of this
+ * object should never leave the address space since the format is unstable.
+ *
+ * NOTE(review): the size expression looks like 512 bytes (256 uint16_t
+ * coefficients) for each of 3 + 9 polynomials plus two 32-byte values; the
+ * internal layout is not declared in this header - confirm against the
+ * implementation.
+ */
+struct MLKEM768_public_key {
+        union {
+                uint8_t bytes[512 * (3 + 9) + 32 + 32];
+                uint16_t alignment;     /* gives the union at least uint16_t alignment */
+        } opaque;
+};
+/*
+ * MLKEM768_private_key contains an ML-KEM-768 private key. The contents of this
+ * object should never leave the address space since the format is unstable.
+ *
+ * NOTE(review): the size expression looks like 512 bytes (256 uint16_t
+ * coefficients) for each of 3 + 3 + 9 polynomials plus three 32-byte values;
+ * the internal layout is not declared in this header - confirm against the
+ * implementation.
+ */
+struct MLKEM768_private_key {
+        union {
+                uint8_t bytes[512 * (3 + 3 + 9) + 32 + 32 + 32];
+                uint16_t alignment;     /* gives the union at least uint16_t alignment */
+        } opaque;
+};
+/*
+ * MLKEM768_PUBLIC_KEY_BYTES is the number of bytes in an encoded ML-KEM768 public
+ * key.
+ */
+#define MLKEM768_PUBLIC_KEY_BYTES 1184
+/* MLKEM_SEED_BYTES is the number of bytes in an ML-KEM seed. */
+#define MLKEM_SEED_BYTES 64
+/*
+ *  MLKEM_SHARED_SECRET_BYTES is the number of bytes in the ML-KEM768 shared
+ * secret. Although the round-3 specification has a variable-length output, the
+ * final ML-KEM construction is expected to use a fixed 32-byte output. To
+ * simplify the future transition, we apply the same restriction.
+ */
+#define MLKEM_SHARED_SECRET_BYTES 32
+/*
+ * MLKEM768_generate_key generates a random public/private key pair, writes the
+ * encoded public key to |out_encoded_public_key| and sets |out_private_key| to
+ * the private key. If |optional_out_seed| is not NULL then the seed used to
+ * generate the private key is written to it.
+ */
+void MLKEM768_generate_key(
+    uint8_t out_encoded_public_key[MLKEM768_PUBLIC_KEY_BYTES],
+    uint8_t optional_out_seed[MLKEM_SEED_BYTES],
+    struct MLKEM768_private_key *out_private_key);
+/*
+ * MLKEM768_private_key_from_seed derives a private key from a seed that was
+ * generated by |MLKEM768_generate_key|. It fails and returns 0 if |seed_len| is
+ * incorrect, otherwise it writes |*out_private_key| and returns 1.
+ */
+int MLKEM768_private_key_from_seed(struct MLKEM768_private_key *out_private_key,
+    const uint8_t *seed, size_t seed_len);
+/*
+ * MLKEM768_public_from_private sets |*out_public_key| to the public key that
+ * corresponds to |private_key|. (This is faster than parsing the output of
+ * |MLKEM768_generate_key| if, for some reason, you need to encapsulate to a key
+ * that was just generated.)
+ */
+void MLKEM768_public_from_private(struct MLKEM768_public_key *out_public_key,
+    const struct MLKEM768_private_key *private_key);
+/* MLKEM768_CIPHERTEXT_BYTES is number of bytes in the ML-KEM768 ciphertext. */
+#define MLKEM768_CIPHERTEXT_BYTES 1088
+/*
+ * MLKEM768_encap encrypts a random shared secret for |public_key|, writes the
+ * ciphertext to |out_ciphertext|, and writes the random shared secret to
+ * |out_shared_secret|.
+ */
+void MLKEM768_encap(uint8_t out_ciphertext[MLKEM768_CIPHERTEXT_BYTES],
+    uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES],
+    const struct MLKEM768_public_key *public_key);
+/*
+ * MLKEM768_decap decrypts a shared secret from |ciphertext| using |private_key|
+ * and writes it to |out_shared_secret|. If |ciphertext_len| is incorrect it
+ * returns 0, otherwise it returns 1. If |ciphertext| is invalid,
+ * |out_shared_secret| is filled with a key that will always be the same for the
+ * same |ciphertext| and |private_key|, but which appears to be random unless
+ * one has access to |private_key|. These alternatives occur in constant time.
+ * Any subsequent symmetric encryption using |out_shared_secret| must use an
+ * authenticated encryption scheme in order to discover the decapsulation
+ * failure.
+ */
+int MLKEM768_decap(uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES],
+    const uint8_t *ciphertext, size_t ciphertext_len,
+    const struct MLKEM768_private_key *private_key);
+/* Serialisation of keys. */
+/*
+ * MLKEM768_marshal_public_key serializes |public_key| to |out| in the standard
+ * format for ML-KEM public keys. It returns one on success or zero on allocation
+ * error.
+ */
+int MLKEM768_marshal_public_key(struct cbb_st *out,
+    const struct MLKEM768_public_key *public_key);
+/*
+ * MLKEM768_parse_public_key parses a public key, in the format generated by
+ * |MLKEM768_marshal_public_key|, from |in| and writes the result to
+ * |out_public_key|. It returns one on success or zero on parse error or if
+ * there are trailing bytes in |in|.
+ */
+int MLKEM768_parse_public_key(struct MLKEM768_public_key *out_public_key,
+    struct cbs_st *in);
+/*
+ * MLKEM768_parse_private_key parses a private key, in the format generated by
+ * |MLKEM768_marshal_private_key| (not declared in this header), from |in| and
+ * writes the result to |out_private_key|. It returns one on success or zero
+ * on parse error or if there are trailing bytes in |in|. This format is
+ * verbose and should be avoided. Private keys should be stored as seeds and
+ * parsed using |MLKEM768_private_key_from_seed|.
+ */
+int MLKEM768_parse_private_key(struct MLKEM768_private_key *out_private_key,
+    struct cbs_st *in);
+#if defined(__cplusplus)
+}
+#endif
+#endif  /* OPENSSL_HEADER_MLKEM_H */
diff --git a/src/lib/libcrypto/mlkem/mlkem768.c b/src/lib/libcrypto/mlkem/mlkem768.c
new file mode 100644
index 0000000000..2ab1f5b0d9
--- /dev/null
+++ b/src/lib/libcrypto/mlkem/mlkem768.c
@@ -0,0 +1,1118 @@
+/* $OpenBSD: mlkem768.c,v 1.1 2024/12/13 00:03:57 beck Exp $ */
+/*
+ * Copyright (c) 2024, Google Inc.
+ * Copyright (c) 2024, Bob Beck <beck@obtuse.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+#include <openssl/mlkem.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include "bytestring.h"
+#include "sha3_internal.h"
+#include "mlkem_internal.h"
+#include "constant_time.h"
+#include "crypto_internal.h"
+/* Remove later */
+#undef LCRYPTO_ALIAS
+#define LCRYPTO_ALIAS(A)
+/*
+ * See
+ * https://csrc.nist.gov/pubs/fips/203/final
+ */
+static void
+prf(uint8_t *out, size_t out_len, const uint8_t in[33])
+{
+        sha3_ctx ctx;   /* prf: |out_len| bytes of SHAKE256 output over the 33-byte |in| */
+        shake256_init(&ctx);
+        shake_update(&ctx, in, 33);     /* absorb exactly 33 bytes */
+        shake_xof(&ctx);        /* presumably ends absorbing; confirm in sha3_internal.h */
+        shake_out(&ctx, out, out_len);  /* write |out_len| output bytes to |out| */
+}
+/*
+ * Section 4.1
+ *
+ * hash_h hashes the |len| bytes at |in| and writes the digest to the 32-byte
+ * |out|.
+ * NOTE(review): sha3_init(&ctx, 32) presumably selects a 32-byte digest
+ * (SHA3-256) - confirm against sha3_internal.h.
+ */
+static void
+hash_h(uint8_t out[32], const uint8_t *in, size_t len)
+{
+        sha3_ctx ctx;
+        sha3_init(&ctx, 32);
+        sha3_update(&ctx, in, len);
+        sha3_final(out, &ctx);
+}
+static void
+hash_g(uint8_t out[64], const uint8_t *in, size_t len)
+{
+        sha3_ctx ctx;   /* hash_g: digest of the |len| bytes at |in| into the 64-byte |out| */
+        sha3_init(&ctx, 64);    /* presumably selects a 64-byte digest (SHA3-512); confirm */
+        sha3_update(&ctx, in, len);
+        sha3_final(out, &ctx);
+}
+/*
+ * this is called 'J' in the spec
+ *
+ * kdf feeds the 32-byte |failure_secret| followed by the |len| bytes at |in|
+ * into SHAKE256 and writes MLKEM_SHARED_SECRET_BYTES bytes of output to |out|.
+ */
+static void
+kdf(uint8_t out[MLKEM_SHARED_SECRET_BYTES], const uint8_t failure_secret[32],
+    const uint8_t *in, size_t len)
+{
+        sha3_ctx ctx;
+        shake256_init(&ctx);
+        shake_update(&ctx, failure_secret, 32);
+        shake_update(&ctx, in, len);
+        shake_xof(&ctx);
+        shake_out(&ctx, out, MLKEM_SHARED_SECRET_BYTES);
+}
+#define DEGREE 256      /* number of coefficients in a scalar */
+#define RANK768 3       /* scalars per vector; a matrix is RANK768 x RANK768 */
+static const size_t kBarrettMultiplier = 5039;  /* floor(2^24 / 3329), paired with kBarrettShift */
+static const unsigned kBarrettShift = 24;
+static const uint16_t kPrime = 3329;
+static const int kLog2Prime = 12;       /* 2^11 < 3329 < 2^12 */
+static const uint16_t kHalfPrime = (/*kPrime=*/3329 - 1) / 2;   /* = 1664 */
+static const int kDU768 = 10;
+static const int kDV768 = 4;
+/*
+ * kInverseDegree is 128^-1 mod 3329; 128 because kPrime does not have a 512th
+ * root of unity.
+ */
+static const uint16_t kInverseDegree = 3303;
+static const size_t kEncodedVectorSize =
+    (/*kLog2Prime=*/12 * DEGREE / 8) * RANK768; /* = 1152 bytes */
+static const size_t kCompressedVectorSize = /*kDU768=*/ 10 * RANK768 * DEGREE /
+    8;  /* = 960 bytes */
+typedef struct scalar {
+        /*
+         * DEGREE coefficients, each stored in a uint16_t.
+         * On every function entry and exit, 0 <= c < kPrime.
+         */
+        uint16_t c[DEGREE];
+} scalar;
+typedef struct vector {
+        scalar v[RANK768];      /* RANK768 scalars, indexed v[i] */
+} vector;
+typedef struct matrix {
+        scalar v[RANK768][RANK768];     /* RANK768 x RANK768 scalars, indexed v[i][j] */
+} matrix;
+/*
+ * This bit of Python will be referenced in some of the following comments:
+ *
+ *  p = 3329
+ *
+ * def bitreverse(i):
+ *     ret = 0
+ *     for n in range(7):
+ *         bit = i & 1
+ *         ret <<= 1
+ *         ret |= bit
+ *         i >>= 1
+ *     return ret
+ */
+/*
+ * kNTTRoots = [pow(17, bitreverse(i), p) for i in range(128)]
+ *
+ * scalar_ntt reads this table as kNTTRoots[i + step] with step = 1, 2, ..., 64
+ * and 0 <= i < step, i.e. entries 1..127.
+ */
+static const uint16_t kNTTRoots[128] = {
+        1,    1729, 2580, 3289, 2642, 630,  1897, 848,  1062, 1919, 193,  797,
+        2786, 3260, 569,  1746, 296,  2447, 1339, 1476, 3046, 56,   2240, 1333,
+        1426, 2094, 535,  2882, 2393, 2879, 1974, 821,  289,  331,  3253, 1756,
+        1197, 2304, 2277, 2055, 650,  1977, 2513, 632,  2865, 33,   1320, 1915,
+        2319, 1435, 807,  452,  1438, 2868, 1534, 2402, 2647, 2617, 1481, 648,
+        2474, 3110, 1227, 910,  17,   2761, 583,  2649, 1637, 723,  2288, 1100,
+        1409, 2662, 3281, 233,  756,  2156, 3015, 3050, 1703, 1651, 2789, 1789,
+        1847, 952,  1461, 2687, 939,  2308, 2437, 2388, 733,  2337, 268,  641,
+        1584, 2298, 2037, 3220, 375,  2549, 2090, 1645, 1063, 319,  2773, 757,
+        2099, 561,  2466, 2594, 2804, 1092, 403,  1026, 1143, 2150, 2775, 886,
+        1722, 1212, 1874, 1029, 2110, 2935, 885,  2154,
+};
+/*
+ * kInverseNTTRoots = [pow(17, -bitreverse(i), p) for i in range(128)]
+ *
+ * scalar_inverse_ntt reads this table as kInverseNTTRoots[i + step] with
+ * step = 64, 32, ..., 1 and 0 <= i < step, i.e. entries 1..127.
+ */
+static const uint16_t kInverseNTTRoots[128] = {
+        1,    1600, 40,   749,  2481, 1432, 2699, 687,  1583, 2760, 69,   543,
+        2532, 3136, 1410, 2267, 2508, 1355, 450,  936,  447,  2794, 1235, 1903,
+        1996, 1089, 3273, 283,  1853, 1990, 882,  3033, 2419, 2102, 219,  855,
+        2681, 1848, 712,  682,  927,  1795, 461,  1891, 2877, 2522, 1894, 1010,
+        1414, 2009, 3296, 464,  2697, 816,  1352, 2679, 1274, 1052, 1025, 2132,
+        1573, 76,   2998, 3040, 1175, 2444, 394,  1219, 2300, 1455, 2117, 1607,
+        2443, 554,  1179, 2186, 2303, 2926, 2237, 525,  735,  863,  2768, 1230,
+        2572, 556,  3010, 2266, 1684, 1239, 780,  2954, 109,  1292, 1031, 1745,
+        2688, 3061, 992,  2596, 941,  892,  1021, 2390, 642,  1868, 2377, 1482,
+        1540, 540,  1678, 1626, 279,  314,  1173, 2573, 3096, 48,   667,  1920,
+        2229, 1041, 2606, 1692, 680,  2746, 568,  3312,
+};
+/*
+ * kModRoots = [pow(17, 2*bitreverse(i) + 1, p) for i in range(128)]
+ *
+ * scalar_mult reads kModRoots[i] for the coefficient pair (2*i, 2*i + 1),
+ * 0 <= i < DEGREE / 2.
+ */
+static const uint16_t kModRoots[128] = {
+        17,   3312, 2761, 568,  583,  2746, 2649, 680,  1637, 1692, 723,  2606,
+        2288, 1041, 1100, 2229, 1409, 1920, 2662, 667,  3281, 48,   233,  3096,
+        756,  2573, 2156, 1173, 3015, 314,  3050, 279,  1703, 1626, 1651, 1678,
+        2789, 540,  1789, 1540, 1847, 1482, 952,  2377, 1461, 1868, 2687, 642,
+        939,  2390, 2308, 1021, 2437, 892,  2388, 941,  733,  2596, 2337, 992,
+        268,  3061, 641,  2688, 1584, 1745, 2298, 1031, 2037, 1292, 3220, 109,
+        375,  2954, 2549, 780,  2090, 1239, 1645, 1684, 1063, 2266, 319,  3010,
+        2773, 556,  757,  2572, 2099, 1230, 561,  2768, 2466, 863,  2594, 735,
+        2804, 525,  1092, 2237, 403,  2926, 1026, 2303, 1143, 2186, 2150, 1179,
+        2775, 554,  886,  2443, 1722, 1607, 1212, 2117, 1874, 1455, 1029, 2300,
+        2110, 1219, 2935, 394,  885,  2444, 2154, 1175,
+};
+/*
+ * reduce_once reduces 0 <= x < 2*kPrime, mod kPrime.
+ *
+ * It computes x - kPrime and selects between that and x with a bit mask
+ * instead of a branch. The precondition is checked with assert().
+ */
+static uint16_t
+reduce_once(uint16_t x)
+{
+        assert(x < 2 * kPrime);
+        const uint16_t subtracted = x - kPrime; /* wraps (bit 15 set) when x < kPrime */
+        uint16_t mask = 0u - (subtracted >> 15);        /* 0xffff if x < kPrime, else 0 */
+        /*
+         * On Aarch64, omitting a |value_barrier_u16| results in a 2x speedup of
+         * ML-KEM overall and Clang still produces constant-time code using
+         * `csel`. On other platforms & compilers on godbolt that we care about,
+         * this code also produces constant-time output.
+         */
+        return (mask & x) | (~mask & subtracted);
+}
+/*
+ * constant time reduce x mod kPrime using Barrett reduction. x must be less
+ * than kPrime + 2×kPrime².
+ *
+ * The quotient is estimated as (x * kBarrettMultiplier) >> kBarrettShift; the
+ * resulting remainder is handed to reduce_once for the final conditional
+ * subtraction.
+ */
+static uint16_t
+reduce(uint32_t x)
+{
+        uint64_t product = (uint64_t)x * kBarrettMultiplier;
+        uint32_t quotient = (uint32_t)(product >> kBarrettShift);
+        uint32_t remainder = x - quotient * kPrime;
+        assert(x < kPrime + 2u * kPrime * kPrime);
+        return reduce_once(remainder);  /* reduce_once asserts remainder < 2*kPrime */
+}
+static void
+scalar_zero(scalar *out)
+{
+        memset(out, 0, sizeof(*out));   /* scalar_zero: set every coefficient of |out| to 0 */
+}
+static void
+vector_zero(vector *out)
+{
+        memset(out, 0, sizeof(*out));   /* vector_zero: zero all RANK768 scalars of |out| */
+}
+/*
+ * In place number theoretic transform of a given scalar.
+ * Note that MLKEM's kPrime 3329 does not have a 512th root of unity, so this
+ * transform leaves off the last iteration of the usual FFT code, with the 128
+ * relevant roots of unity being stored in |kNTTRoots|. This means the output
+ * should be seen as 128 elements in GF(3329^2), with the coefficients of the
+ * elements being consecutive entries in |s->c|.
+ *
+ * The outer loop makes seven passes (step = 1, 2, ..., 64) while |offset|
+ * halves from 128 down to 2. Each pass processes |step| groups of |offset|
+ * coefficient pairs (j, j + offset), using one root per group.
+ */
+static void
+scalar_ntt(scalar *s)
+{
+        int offset = DEGREE;
+        int step;
+        /*
+         * `int` is used here because using `size_t` throughout caused a ~5% slowdown
+         * with Clang 14 on Aarch64.
+         */
+        for (step = 1; step < DEGREE / 2; step <<= 1) {
+                int i, j, k = 0;        /* k is the first index of the current group */
+                offset >>= 1;
+                for (i = 0; i < step; i++) {
+                        const uint32_t step_root = kNTTRoots[i + step];
+                        for (j = k; j < k + offset; j++) {
+                                uint16_t odd, even;
+                                odd = reduce(step_root * s->c[j + offset]);
+                                even = s->c[j];
+                                s->c[j] = reduce_once(odd + even);
+                                s->c[j + offset] = reduce_once(even - odd +
+                                    kPrime);    /* + kPrime keeps the difference non-negative */
+                        }
+                        k += 2 * offset;
+                }
+        }
+}
+static void
+vector_ntt(vector *a)
+{
+        int i;
+        for (i = 0; i < RANK768; i++) {
+                scalar_ntt(&a->v[i]);   /* vector_ntt: transform each entry of |a| in place */
+        }
+}
+/*
+ * In place inverse number theoretic transform of a given scalar, with pairs of
+ * entries of s->c being interpreted as elements of GF(3329^2). Just as with the
+ * number theoretic transform, this leaves off the first step of the normal iFFT
+ * to account for the fact that 3329 does not have a 512th root of unity, using
+ * the precomputed 128 roots of unity stored in |kInverseNTTRoots|.
+ *
+ * The outer loop makes seven passes (offset = 2, 4, ..., 128) while |step|
+ * halves from 64 down to 1. Afterwards every coefficient is multiplied by
+ * kInverseDegree.
+ */
+static void
+scalar_inverse_ntt(scalar *s)
+{
+        int i, j, k, offset, step = DEGREE / 2;
+        /*
+         * `int` is used here because using `size_t` throughout caused a ~5% slowdown
+         * with Clang 14 on Aarch64.
+         */
+        for (offset = 2; offset < DEGREE; offset <<= 1) {
+                step >>= 1;
+                k = 0;  /* k is the first index of the current group */
+                for (i = 0; i < step; i++) {
+                        uint32_t step_root = kInverseNTTRoots[i + step];
+                        for (j = k; j < k + offset; j++) {
+                                uint16_t odd, even;
+                                odd = s->c[j + offset];
+                                even = s->c[j];
+                                s->c[j] = reduce_once(odd + even);
+                                s->c[j + offset] = reduce(step_root *
+                                    (even - odd + kPrime));     /* + kPrime keeps the difference non-negative */
+                        }
+                        k += 2 * offset;
+                }
+        }
+        for (i = 0; i < DEGREE; i++) {
+                s->c[i] = reduce(s->c[i] * kInverseDegree);     /* final scaling by 128^-1 mod kPrime */
+        }
+}
+static void
+vector_inverse_ntt(vector *a)
+{
+        int i;
+        for (i = 0; i < RANK768; i++) {
+                scalar_inverse_ntt(&a->v[i]);   /* inverse-transform each entry of |a| in place */
+        }
+}
+static void
+scalar_add(scalar *lhs, const scalar *rhs)
+{
+        int i;
+        for (i = 0; i < DEGREE; i++) {
+                lhs->c[i] = reduce_once(lhs->c[i] + rhs->c[i]); /* lhs += rhs, coefficient-wise mod kPrime */
+        }
+}
+static void
+scalar_sub(scalar *lhs, const scalar *rhs)
+{
+        int i;
+        for (i = 0; i < DEGREE; i++) {
+                lhs->c[i] = reduce_once(lhs->c[i] - rhs->c[i] + kPrime);        /* lhs -= rhs; + kPrime avoids a negative value */
+        }
+}
+/*
+ * Multiplying two scalars in the number theoretically transformed state. Since
+ * 3329 does not have a 512th root of unity, this means we have to interpret
+ * the 2*ith and (2*i+1)th entries of the scalar as elements of GF(3329)[X]/(X^2
+ * - 17^(2*bitreverse(i)+1)) The value of 17^(2*bitreverse(i)+1) mod 3329 is
+ * stored in the precomputed |kModRoots| table. Note that our Barrett transform
+ * only allows us to multiply two reduced numbers together, so we need some
+ * intermediate reduction steps, even if an uint64_t could hold 3 multiplied
+ * numbers.
+ *
+ * For each pair (a0, a1) of |lhs| and (b0, b1) of |rhs| this writes
+ * (a0*b0 + a1*b1*kModRoots[i], a1*b0 + a0*b1) to |out|.
+ */
+static void
+scalar_mult(scalar *out, const scalar *lhs, const scalar *rhs)
+{
+        int i;
+        for (i = 0; i < DEGREE / 2; i++) {
+                uint32_t real_real = (uint32_t)lhs->c[2 * i] * rhs->c[2 * i];
+                uint32_t img_img = (uint32_t)lhs->c[2 * i + 1] *
+                    rhs->c[2 * i + 1];
+                uint32_t real_img = (uint32_t)lhs->c[2 * i] * rhs->c[2 * i + 1];
+                uint32_t img_real = (uint32_t)lhs->c[2 * i + 1] * rhs->c[2 * i];
+                out->c[2 * i] =
+                    reduce(real_real +
+                    (uint32_t)reduce(img_img) * kModRoots[i]);  /* img_img is reduced before the second multiply */
+                out->c[2 * i + 1] = reduce(img_real + real_img);
+        }
+}
+static void
+vector_add(vector *lhs, const vector *rhs)
+{
+        int i;
+        for (i = 0; i < RANK768; i++) {
+                scalar_add(&lhs->v[i], &rhs->v[i]);     /* lhs += rhs, entry by entry */
+        }
+}
+static void
+matrix_mult(vector *out, const matrix *m, const vector *a)
+{
+        int i, j;
+        vector_zero(out);       /* |out| is cleared before |a| is read, so it must not alias |a| */
+        for (i = 0; i < RANK768; i++) {
+                for (j = 0; j < RANK768; j++) {
+                        scalar product;
+                        scalar_mult(&product, &m->v[i][j], &a->v[j]);
+                        scalar_add(&out->v[i], &product);       /* out[i] += m[i][j] * a[j] */
+                }
+        }
+}
+static void
+matrix_mult_transpose(vector *out, const matrix *m,
+    const vector *a)
+{
+        int i, j;
+        vector_zero(out);       /* |out| is cleared before |a| is read, so it must not alias |a| */
+        for (i = 0; i < RANK768; i++) {
+                for (j = 0; j < RANK768; j++) {
+                        scalar product;
+                        scalar_mult(&product, &m->v[j][i], &a->v[j]);
+                        scalar_add(&out->v[i], &product);       /* out[i] += m[j][i] * a[j], i.e. m is used transposed */
+                }
+        }
+}
+static void
+scalar_inner_product(scalar *out, const vector *lhs,
+    const vector *rhs)
+{
+        int i;
+        scalar_zero(out);       /* |out| is cleared first, so it must not point into |lhs| or |rhs| */
+        for (i = 0; i < RANK768; i++) {
+                scalar product;
+                scalar_mult(&product, &lhs->v[i], &rhs->v[i]);
+                scalar_add(out, &product);      /* out += lhs[i] * rhs[i] */
+        }
+}
+/*
+ * Algorithm 6 of spec. Rejection samples a Keccak stream to get uniformly
+ * distributed elements. This is used for matrix expansion and only operates on
+ * public inputs.
+ *
+ * Output is pulled from |keccak_ctx| in 168-byte blocks. Every 3 bytes yield
+ * two 12-bit candidates; a candidate is kept only if it is below kPrime. The
+ * loop stops as soon as DEGREE coefficients have been stored, so the number
+ * of blocks consumed depends on the stream contents.
+ */
+static void
+scalar_from_keccak_vartime(scalar *out, sha3_ctx *keccak_ctx)
+{
+        int i, done = 0;
+        while (done < DEGREE) {
+                uint8_t block[168];     /* 168 is a multiple of 3, so block[i + 2] stays in bounds */
+                shake_out(keccak_ctx, block, sizeof(block));
+                for (i = 0; i < sizeof(block) && done < DEGREE; i += 3) {
+                        uint16_t d1 = block[i] + 256 * (block[i + 1] % 16);     /* byte 0 plus low nibble of byte 1 */
+                        uint16_t d2 = block[i + 1] / 16 + 16 * block[i + 2];    /* high nibble of byte 1 plus byte 2 */
+                        if (d1 < kPrime) {
+                                out->c[done++] = d1;
+                        }
+                        if (d2 < kPrime && done < DEGREE) {
+                                out->c[done++] = d2;
+                        }
+                }
+        }
+}
+/*
+ * Algorithm 7 of the spec, with eta fixed to two and the PRF call
+ * included. Creates binomially distributed elements by sampling 2*|eta| bits,
+ * and setting the coefficient to the count of the first bits minus the count of
+ * the second bits, resulting in a centered binomial distribution. Since eta is
+ * two this gives -2/2 with a probability of 1/16, -1/1 with probability 1/4,
+ * and 0 with probability 3/8.
+ *
+ * |input| is the 33-byte PRF input. Each byte of PRF output produces two
+ * coefficients, one from its low nibble and one from its high nibble.
+ */
+static void
+scalar_centered_binomial_distribution_eta_2_with_prf(scalar *out,
+    const uint8_t input[33])
+{
+        uint8_t entropy[128];
+        int i;
+        CTASSERT(sizeof(entropy) == 2 * /*kEta=*/ 2 * DEGREE / 8);
+        prf(entropy, sizeof(entropy), input);
+        for (i = 0; i < DEGREE; i += 2) {
+                uint8_t byte = entropy[i / 2];
+                uint16_t value = kPrime;        /* start at kPrime so the subtraction cannot go negative */
+                value += (byte & 1) + ((byte >> 1) & 1);
+                value -= ((byte >> 2) & 1) + ((byte >> 3) & 1);
+                out->c[i] = reduce_once(value);
+                byte >>= 4;     /* move on to the high nibble */
+                value = kPrime;
+                value += (byte & 1) + ((byte >> 1) & 1);
+                value -= ((byte >> 2) & 1) + ((byte >> 3) & 1);
+                out->c[i + 1] = reduce_once(value);
+        }
+}
+/*
+ * Generates a secret vector by using
+ * |scalar_centered_binomial_distribution_eta_2_with_prf|, using the given seed,
+ * appending and incrementing |counter| for each entry of the vector.
+ *
+ * |*counter| is advanced by RANK768 on return, so consecutive calls with the
+ * same counter object use distinct PRF inputs.
+ */
+static void
+vector_generate_secret_eta_2(vector *out, uint8_t *counter,
+    const uint8_t seed[32])
+{
+        uint8_t input[33];
+        int i;
+        memcpy(input, seed, 32);
+        for (i = 0; i < RANK768; i++) {
+                input[32] = (*counter)++;       /* last byte is the per-entry counter value */
+                scalar_centered_binomial_distribution_eta_2_with_prf(&out->v[i],
+                    input);
+        }
+}
+/*
+ * Expands the matrix of a seed for key generation and for encaps-CPA.
+ *
+ * Entry out->v[i][j] is rejection-sampled from a fresh SHAKE128 stream over
+ * the 34-byte input rho || i || j.
+ */
+static void
+matrix_expand(matrix *out, const uint8_t rho[32])
+{
+        uint8_t input[34];
+        int i, j;
+        memcpy(input, rho, 32);
+        for (i = 0; i < RANK768; i++) {
+                for (j = 0; j < RANK768; j++) {
+                        sha3_ctx keccak_ctx;
+                        input[32] = i;
+                        input[33] = j;
+                        shake128_init(&keccak_ctx);
+                        shake_update(&keccak_ctx, input, sizeof(input));
+                        shake_xof(&keccak_ctx); /* presumably ends absorbing; confirm in sha3_internal.h */
+                        scalar_from_keccak_vartime(&out->v[i][j], &keccak_ctx);
+                }
+        }
+}
+static const uint8_t kMasks[8] = {0x01, 0x03, 0x07, 0x0f,
+        0x1f, 0x3f, 0x7f, 0xff};       /* kMasks[n - 1] has the low n bits set, 1 <= n <= 8 */
+static void
+scalar_encode(uint8_t *out, const scalar *s, int bits)
+{
+        uint8_t out_byte = 0;   /* scalar_encode: pack the low |bits| bits of each coefficient, LSB first */
+        int i, out_byte_bits = 0;       /* number of bits already placed in out_byte */
+        assert(bits <= (int)sizeof(*s->c) * 8 && bits != 1);    /* bits == 1 is handled by scalar_encode_1 */
+        for (i = 0; i < DEGREE; i++) {
+                uint16_t element = s->c[i];
+                int element_bits_done = 0;
+                while (element_bits_done < bits) {
+                        int chunk_bits = bits - element_bits_done;
+                        int out_bits_remaining = 8 - out_byte_bits;
+                        if (chunk_bits >= out_bits_remaining) {
+                                chunk_bits = out_bits_remaining;        /* fill the current byte and emit it */
+                                out_byte |= (element &
+                                    kMasks[chunk_bits - 1]) << out_byte_bits;
+                                *out = out_byte;
+                                out++;
+                                out_byte_bits = 0;
+                                out_byte = 0;
+                        } else {
+                                out_byte |= (element &
+                                    kMasks[chunk_bits - 1]) << out_byte_bits;   /* rest of the element fits; keep the byte open */
+                                out_byte_bits += chunk_bits;
+                        }
+                        element_bits_done += chunk_bits;
+                        element >>= chunk_bits;
+                }
+        }
+        if (out_byte_bits > 0) {
+                *out = out_byte;        /* flush a final partially filled byte */
+        }
+}
+/*
+ * scalar_encode_1 is |scalar_encode| specialised for |bits| == 1.
+ *
+ * It writes DEGREE / 8 = 32 bytes; bit j of output byte i / 8 is the low bit
+ * of coefficient i + j.
+ */
+static void
+scalar_encode_1(uint8_t out[32], const scalar *s)
+{
+        int i, j;
+        for (i = 0; i < DEGREE; i += 8) {
+                uint8_t out_byte = 0;
+                for (j = 0; j < 8; j++) {
+                        out_byte |= (s->c[i + j] & 1) << j;
+                }
+                *out = out_byte;
+                out++;
+        }
+}
+/*
+ * Encodes an entire vector into 32*|RANK768|*|bits| bytes. Note that since 256
+ * (DEGREE) is divisible by 8, the individual vector entries will always fill a
+ * whole number of bytes, so we do not need to worry about bit packing here.
+ *
+ * Entry i is written at byte offset i * bits * DEGREE / 8 of |out|. |bits| is
+ * passed to |scalar_encode| and is subject to its assertion.
+ */
+static void
+vector_encode(uint8_t *out, const vector *a, int bits)
+{
+        int i;
+        for (i = 0; i < RANK768; i++) {
+                scalar_encode(out + i * bits * DEGREE / 8, &a->v[i], bits);
+        }
+}
+/*
+ * scalar_decode parses |DEGREE * bits| bits from |in| into |DEGREE| values in
+ * |out|. It returns one on success and zero if any parsed value is >=
+ * |kPrime|.
+ *
+ * On failure the coefficients parsed before the offending value have already
+ * been written to |out|. |bits| == 1 is excluded by the assertion; see
+ * |scalar_decode_1|.
+ */
+static int
+scalar_decode(scalar *out, const uint8_t *in, int bits)
+{
+        uint8_t in_byte = 0;
+        int i, in_byte_bits_left = 0;   /* unread bits remaining in in_byte */
+        assert(bits <= (int)sizeof(*out->c) * 8 && bits != 1);
+        for (i = 0; i < DEGREE; i++) {
+                uint16_t element = 0;
+                int element_bits_done = 0;
+                while (element_bits_done < bits) {
+                        int chunk_bits = bits - element_bits_done;
+                        if (in_byte_bits_left == 0) {
+                                in_byte = *in;  /* fetch the next input byte */
+                                in++;
+                                in_byte_bits_left = 8;
+                        }
+                        if (chunk_bits > in_byte_bits_left) {
+                                chunk_bits = in_byte_bits_left;
+                        }
+                        element |= (in_byte & kMasks[chunk_bits - 1]) <<
+                            element_bits_done;  /* bits are consumed LSB first */
+                        in_byte_bits_left -= chunk_bits;
+                        in_byte >>= chunk_bits;
+                        element_bits_done += chunk_bits;
+                }
+                if (element >= kPrime) {
+                        return 0;
+                }
+                out->c[i] = element;
+        }
+        return 1;
+}
+/*
+ * scalar_decode_1 is |scalar_decode| specialised for |bits| == 1: coefficient
+ * number k of |out| is set to bit (k % 8) of byte (k / 8) of |in|, counting
+ * bits from the least significant end. No range check is needed since every
+ * result is 0 or 1.
+ */
+static void
+scalar_decode_1(scalar *out, const uint8_t in[32])
+{
+        int idx;
+        for (idx = 0; idx < DEGREE; idx++)
+                out->c[idx] = (in[idx / 8] >> (idx % 8)) & 1;
+}
+/*
+ * vector_decode reads 32*|RANK768|*|bits| bytes from |in| and decodes them as
+ * |RANK768| consecutive scalars into |out|. It returns one on success, or
+ * zero as soon as |scalar_decode| rejects a scalar because a parsed value is
+ * >= |kPrime|.
+ */
+static int
+vector_decode(vector *out, const uint8_t *in, int bits)
+{
+        int row = 0;
+        while (row < RANK768) {
+                const uint8_t *src = in + row * bits * DEGREE / 8;
+                if (!scalar_decode(&out->v[row], src, bits))
+                        return 0;
+                row++;
+        }
+        return 1;
+}
+/*
+ * Compresses (lossily) an input |x| mod 3329 into |bits| many bits by grouping
+ * numbers close to each other together. The formula used is
+ * round(2^|bits|/kPrime*x) mod 2^|bits|.
+ * Uses Barrett reduction to achieve constant time. Since we need both the
+ * remainder (for rounding) and the quotient (as the result), we cannot use
+ * |reduce| here, but need to do the Barrett reduction directly.
+ *
+ * NOTE(review): |kBarrettMultiplier|, |kBarrettShift| and |kHalfPrime| are
+ * defined outside this chunk; the assert below relies on them yielding a
+ * quotient estimate that is short by less than two.
+ */
+static uint16_t
+compress(uint16_t x, int bits)
+{
+        /* |shifted| is x * 2^bits, the numerator of the division by kPrime. */
+        uint32_t shifted = (uint32_t)x << bits;
+        uint64_t product = (uint64_t)shifted * kBarrettMultiplier;
+        /* Estimate of shifted / kPrime, never larger than the true quotient. */
+        uint32_t quotient = (uint32_t)(product >> kBarrettShift);
+        uint32_t remainder = shifted - quotient * kPrime;
+        /*
+         * Adjust the quotient to round correctly:
+         * 0 <= remainder <= kHalfPrime round to 0
+         * kHalfPrime < remainder <= kPrime + kHalfPrime round to 1
+         * kPrime + kHalfPrime < remainder < 2 * kPrime round to 2
+         */
+        assert(remainder < 2u * kPrime);
+        /* Each comparison contributes 0 or 1 without branching. */
+        quotient += 1 & constant_time_lt(kHalfPrime, remainder);
+        quotient += 1 & constant_time_lt(kPrime + kHalfPrime, remainder);
+        /* Reduce mod 2^bits, so a value rounding up to 2^bits wraps to 0. */
+        return quotient & ((1 << bits) - 1);
+}
+/*
+ * Decompresses |x| by mapping it to an equi-distant representative, computing
+ * round(kPrime/2^|bits|*x). Dividing by a power of two lets the quotient and
+ * the rounding decision both be taken with shifts and masks.
+ */
+static uint16_t
+decompress(uint16_t x, int bits)
+{
+        const uint32_t scaled = (uint32_t)x * kPrime;
+        const uint32_t modulus = 1 << bits;
+        /* |scaled| mod 2^bits: the fractional part of the division. */
+        const uint32_t frac = scaled & (modulus - 1);
+        /*
+         * The top bit of the |bits|-bit fraction is set exactly when the
+         * fraction is at least half of |modulus|, so adding it rounds the
+         * truncated quotient to nearest. |frac| is unsigned, so the right
+         * shift brings in zeros.
+         */
+        return (scaled >> bits) + (frac >> (bits - 1));
+}
+/* scalar_compress replaces every coefficient of |s| by compress(c, |bits|). */
+static void
+scalar_compress(scalar *s, int bits)
+{
+        int k = 0;
+        while (k < DEGREE) {
+                s->c[k] = compress(s->c[k], bits);
+                k++;
+        }
+}
+/* scalar_decompress replaces every coefficient of |s| by decompress(c, |bits|). */
+static void
+scalar_decompress(scalar *s, int bits)
+{
+        int k = 0;
+        while (k < DEGREE) {
+                s->c[k] = decompress(s->c[k], bits);
+                k++;
+        }
+}
+/* vector_compress applies |scalar_compress| to each of the |RANK768| entries of |a|. */
+static void
+vector_compress(vector *a, int bits)
+{
+        int row = 0;
+        while (row < RANK768) {
+                scalar_compress(&a->v[row], bits);
+                row++;
+        }
+}
+/* vector_decompress applies |scalar_decompress| to each of the |RANK768| entries of |a|. */
+static void
+vector_decompress(vector *a, int bits)
+{
+        int row = 0;
+        while (row < RANK768) {
+                scalar_decompress(&a->v[row], bits);
+                row++;
+        }
+}
+/*
+ * Internal representation of an ML-KEM 768 public key. Pointers to the opaque
+ * |struct MLKEM768_public_key| are cast to this type by
+ * |public_key_768_from_external|.
+ */
+struct public_key {
+        /* Public vector; key generation computes it from |m|, the secret and an error vector. */
+        vector t;
+        /* Seed from which |m| is expanded with |matrix_expand|. */
+        uint8_t rho[32];
+        /* |hash_h| of the encoded public key. */
+        uint8_t public_key_hash[32];
+        /* Matrix expanded from |rho|; cached so it is not recomputed per operation. */
+        matrix m;
+};
+/*
+ * public_key_768_from_external reinterprets the opaque |external| key as the
+ * internal |struct public_key|. Note that the cast also drops the const
+ * qualifier, so callers holding a const key must not write through the result.
+ * NOTE(review): relies on |struct MLKEM768_public_key| being large enough and
+ * suitably aligned for |struct public_key| - confirm against mlkem.h.
+ */
+static struct public_key *
+public_key_768_from_external(const struct MLKEM768_public_key *external)
+{
+        return (struct public_key *)external;
+}
+/*
+ * Internal representation of an ML-KEM 768 private key. Pointers to the opaque
+ * |struct MLKEM768_private_key| are cast to this type by
+ * |private_key_768_from_external|.
+ */
+struct private_key {
+        /* The matching public key, embedded so it can be copied out directly. */
+        struct public_key pub;
+        /* Secret vector; key generation stores it after applying |vector_ntt|. */
+        vector s;
+        /* Keys the |kdf| that produces the shared secret when decapsulation rejects a ciphertext. */
+        uint8_t fo_failure_secret[32];
+};
+/*
+ * private_key_768_from_external reinterprets the opaque |external| key as the
+ * internal |struct private_key|. Note that the cast also drops the const
+ * qualifier, so callers holding a const key must not write through the result.
+ * NOTE(review): relies on |struct MLKEM768_private_key| being large enough and
+ * suitably aligned for |struct private_key| - confirm against mlkem.h.
+ */
+static struct private_key *
+private_key_768_from_external(const struct MLKEM768_private_key *external)
+{
+        return (struct private_key *)external;
+}
+/*
+ * MLKEM768_generate_key generates a fresh key pair. It draws a
+ * |MLKEM_SEED_BYTES| seed from |arc4random_buf| and hands it to
+ * |MLKEM768_generate_key_external_entropy|, which fills
+ * |out_encoded_public_key| and |out_private_key|. If |optional_out_seed| is
+ * not NULL the seed is written there for the caller to keep; otherwise it
+ * lives in a local buffer that is wiped before returning.
+ */
+void
+MLKEM768_generate_key(uint8_t out_encoded_public_key[MLKEM768_PUBLIC_KEY_BYTES],
+    uint8_t optional_out_seed[MLKEM_SEED_BYTES],
+    struct MLKEM768_private_key *out_private_key)
+{
+        uint8_t entropy_buf[MLKEM_SEED_BYTES];
+        uint8_t *entropy = optional_out_seed != NULL ? optional_out_seed :
+            entropy_buf;
+        arc4random_buf(entropy, MLKEM_SEED_BYTES);
+        MLKEM768_generate_key_external_entropy(out_encoded_public_key,
+            out_private_key, entropy);
+        /* The seed determines the private key; do not leave it on the stack. */
+        explicit_bzero(entropy_buf, sizeof(entropy_buf));
+}
+LCRYPTO_ALIAS(MLKEM768_generate_key);
+/*
+ * MLKEM768_private_key_from_seed regenerates |out_private_key| from a seed of
+ * exactly |MLKEM_SEED_BYTES| bytes. The encoded public key produced along the
+ * way is discarded. It returns one on success and zero if |seed_len| is
+ * wrong, in which case |out_private_key| is not touched.
+ */
+int
+MLKEM768_private_key_from_seed(struct MLKEM768_private_key *out_private_key,
+    const uint8_t *seed, size_t seed_len)
+{
+        uint8_t discarded_public_key[MLKEM768_PUBLIC_KEY_BYTES];
+        if (seed_len == MLKEM_SEED_BYTES) {
+                MLKEM768_generate_key_external_entropy(discarded_public_key,
+                    out_private_key, seed);
+                return 1;
+        }
+        return 0;
+}
+LCRYPTO_ALIAS(MLKEM768_private_key_from_seed);
+/*
+ * mlkem_marshal_public_key appends the encoding of |pub| to |out|: the vector
+ * |pub->t| at |kLog2Prime| bits per coefficient (|kEncodedVectorSize| bytes)
+ * followed by |pub->rho|. It returns one on success and zero if |out| cannot
+ * take the bytes.
+ */
+static int
+mlkem_marshal_public_key(CBB *out, const struct public_key *pub)
+{
+        uint8_t *encoded_t;
+        if (!CBB_add_space(out, &encoded_t, kEncodedVectorSize))
+                return 0;
+        vector_encode(encoded_t, &pub->t, kLog2Prime);
+        return CBB_add_bytes(out, pub->rho, sizeof(pub->rho)) ? 1 : 0;
+}
+/*
+ * MLKEM768_generate_key_external_entropy deterministically derives a key pair
+ * from the |MLKEM_SEED_BYTES| bytes of |entropy|. The first 32 bytes, with
+ * |RANK768| appended, are hashed with |hash_g| into |rho| (matrix seed) and
+ * |sigma| (seed for the secret and error vectors); the last 32 bytes become
+ * the failure secret. The encoded public key is written to
+ * |out_encoded_public_key| and the private key to |out_private_key|.
+ * The function aborts if the public key cannot be marshalled.
+ */
+void
+MLKEM768_generate_key_external_entropy(
+    uint8_t out_encoded_public_key[MLKEM768_PUBLIC_KEY_BYTES],
+    struct MLKEM768_private_key *out_private_key,
+    const uint8_t entropy[MLKEM_SEED_BYTES])
+{
+        struct private_key *priv = private_key_768_from_external(
+            out_private_key);
+        uint8_t augmented_seed[33];
+        uint8_t *rho, *sigma;
+        uint8_t counter = 0;
+        uint8_t hashed[64];
+        vector error;
+        CBB cbb;
+        memcpy(augmented_seed, entropy, 32);
+        augmented_seed[32] = RANK768;
+        hash_g(hashed, augmented_seed, 33);
+        rho = hashed;
+        sigma = hashed + 32;
+        memcpy(priv->pub.rho, hashed, sizeof(priv->pub.rho));
+        matrix_expand(&priv->pub.m, rho);
+        /* |counter| is shared so the secret and the error use distinct PRF inputs. */
+        vector_generate_secret_eta_2(&priv->s, &counter, sigma);
+        vector_ntt(&priv->s);
+        vector_generate_secret_eta_2(&error, &counter, sigma);
+        vector_ntt(&error);
+        matrix_mult_transpose(&priv->pub.t, &priv->pub.m, &priv->s);
+        vector_add(&priv->pub.t, &error);
+        /* A failed initialisation is fatal, just like a failed marshal. */
+        if (!CBB_init_fixed(&cbb, out_encoded_public_key,
+            MLKEM768_PUBLIC_KEY_BYTES) ||
+            !mlkem_marshal_public_key(&cbb, &priv->pub)) {
+                abort();
+        }
+        /* Release whatever bookkeeping the CBB holds; the output buffer is ours. */
+        CBB_cleanup(&cbb);
+        hash_h(priv->pub.public_key_hash, out_encoded_public_key,
+            MLKEM768_PUBLIC_KEY_BYTES);
+        memcpy(priv->fo_failure_secret, entropy + 32, 32);
+        /* Wipe the seed material and the error vector derived from it. */
+        explicit_bzero(augmented_seed, sizeof(augmented_seed));
+        explicit_bzero(hashed, sizeof(hashed));
+        explicit_bzero(&error, sizeof(error));
+}
+/*
+ * MLKEM768_public_from_private copies the public key embedded in
+ * |private_key| into |out_public_key|.
+ */
+void
+MLKEM768_public_from_private(struct MLKEM768_public_key *out_public_key,
+    const struct MLKEM768_private_key *private_key)
+{
+        *public_key_768_from_external(out_public_key) =
+            private_key_768_from_external(private_key)->pub;
+}
+LCRYPTO_ALIAS(MLKEM768_public_from_private);
+/*
+ * Encrypts a message with given randomness to the ciphertext in |out|. Without
+ * applying the Fujisaki-Okamoto transform this would not result in a CCA secure
+ * scheme, since lattice schemes are vulnerable to decryption failure oracles.
+ *
+ * The ciphertext is the compressed vector u (|kDU768| bits per coefficient,
+ * |kCompressedVectorSize| bytes) followed by the compressed scalar v
+ * (|kDV768| bits per coefficient). The function is deterministic in its
+ * inputs.
+ */
+static void
+encrypt_cpa(uint8_t out[MLKEM768_CIPHERTEXT_BYTES],
+    const struct public_key *pub, const uint8_t message[32],
+    const uint8_t randomness[32])
+{
+        scalar expanded_message, scalar_error;
+        vector secret, error, u;
+        uint8_t counter = 0;
+        uint8_t input[33];
+        scalar v;
+        /*
+         * |counter| is threaded through all three samplings below so that
+         * each one uses a different PRF input derived from |randomness|.
+         */
+        vector_generate_secret_eta_2(&secret, &counter, randomness);
+        vector_ntt(&secret);
+        vector_generate_secret_eta_2(&error, &counter, randomness);
+        memcpy(input, randomness, 32);
+        input[32] = counter;
+        scalar_centered_binomial_distribution_eta_2_with_prf(&scalar_error,
+            input);
+        /* u = inverse_ntt(m * secret) + error */
+        matrix_mult(&u, &pub->m, &secret);
+        vector_inverse_ntt(&u);
+        vector_add(&u, &error);
+        /* v = inverse_ntt(<t, secret>) + scalar_error + expanded message */
+        scalar_inner_product(&v, &pub->t, &secret);
+        scalar_inverse_ntt(&v);
+        scalar_add(&v, &scalar_error);
+        /* Spread each message bit to a coefficient via 1-bit decompression. */
+        scalar_decode_1(&expanded_message, message);
+        scalar_decompress(&expanded_message, 1);
+        scalar_add(&v, &expanded_message);
+        vector_compress(&u, kDU768);
+        vector_encode(out, &u, kDU768);
+        scalar_compress(&v, kDV768);
+        scalar_encode(out + kCompressedVectorSize, &v, kDV768);
+}
+/*
+ * MLKEM768_encap calls |MLKEM768_encap_external_entropy| with
+ * |MLKEM_ENCAP_ENTROPY| random bytes from |arc4random_buf|, writing the
+ * ciphertext to |out_ciphertext| and the shared secret to
+ * |out_shared_secret|.
+ */
+void
+MLKEM768_encap(uint8_t out_ciphertext[MLKEM768_CIPHERTEXT_BYTES],
+    uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES],
+    const struct MLKEM768_public_key *public_key)
+{
+        uint8_t entropy[MLKEM_ENCAP_ENTROPY];
+        arc4random_buf(entropy, MLKEM_ENCAP_ENTROPY);
+        MLKEM768_encap_external_entropy(out_ciphertext, out_shared_secret,
+            public_key, entropy);
+        /* The shared secret is derived from |entropy|; do not leave it behind. */
+        explicit_bzero(entropy, sizeof(entropy));
+}
+LCRYPTO_ALIAS(MLKEM768_encap);
+/*
+ * MLKEM768_encap_external_entropy encapsulates to |public_key| using the
+ * caller-supplied |entropy| as the message. |hash_g| over the entropy and the
+ * public key hash yields 64 bytes: the first 32 are written to
+ * |out_shared_secret|, the last 32 are the randomness passed to
+ * |encrypt_cpa|, which produces |out_ciphertext|.
+ * See section 6.2 of the spec.
+ */
+void
+MLKEM768_encap_external_entropy(
+    uint8_t out_ciphertext[MLKEM768_CIPHERTEXT_BYTES],
+    uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES],
+    const struct MLKEM768_public_key *public_key,
+    const uint8_t entropy[MLKEM_ENCAP_ENTROPY])
+{
+        const struct public_key *pub = public_key_768_from_external(public_key);
+        uint8_t key_and_randomness[64];
+        uint8_t input[64];
+        memcpy(input, entropy, MLKEM_ENCAP_ENTROPY);
+        memcpy(input + MLKEM_ENCAP_ENTROPY, pub->public_key_hash,
+            sizeof(input) - MLKEM_ENCAP_ENTROPY);
+        hash_g(key_and_randomness, input, sizeof(input));
+        encrypt_cpa(out_ciphertext, pub, entropy, key_and_randomness + 32);
+        memcpy(out_shared_secret, key_and_randomness, 32);
+        /* Both buffers hold secret-derived material; wipe the stack copies. */
+        explicit_bzero(key_and_randomness, sizeof(key_and_randomness));
+        explicit_bzero(input, sizeof(input));
+}
+/*
+ * decrypt_cpa recovers the 32 byte message from |ciphertext| using the secret
+ * vector of |priv| and writes it to |out|. The first |kCompressedVectorSize|
+ * bytes of |ciphertext| carry the vector u at |kDU768| bits per coefficient;
+ * the scalar v at |kDV768| bits per coefficient follows. The message is
+ * compress(v - inverse_ntt(<s, ntt(u)>), 1), packed one bit per coefficient.
+ */
+static void
+decrypt_cpa(uint8_t out[32], const struct private_key *priv,
+    const uint8_t ciphertext[MLKEM768_CIPHERTEXT_BYTES])
+{
+        scalar mask, v;
+        vector u;
+        /*
+         * NOTE(review): the results of |vector_decode| and |scalar_decode|
+         * are ignored. Presumably |kDU768| and |kDV768| are small enough that
+         * no decoded value can reach |kPrime| - confirm.
+         */
+        vector_decode(&u, ciphertext, kDU768);
+        vector_decompress(&u, kDU768);
+        vector_ntt(&u);
+        scalar_decode(&v, ciphertext + kCompressedVectorSize, kDV768);
+        scalar_decompress(&v, kDV768);
+        scalar_inner_product(&mask, &priv->s, &u);
+        scalar_inverse_ntt(&mask);
+        scalar_sub(&v, &mask);
+        scalar_compress(&v, 1);
+        scalar_encode_1(out, &v);
+}
+/*
+ * MLKEM768_decap decapsulates |ciphertext| with |private_key| and writes
+ * |MLKEM_SHARED_SECRET_BYTES| bytes to |out_shared_secret|.
+ *
+ * If |ciphertext_len| is not |MLKEM768_CIPHERTEXT_BYTES| the output is filled
+ * with random bytes and zero is returned. Otherwise one is returned: the
+ * message is decrypted and re-encrypted, and the shared secret is the derived
+ * key when the re-encryption matches |ciphertext|, or a key derived from the
+ * failure secret and the ciphertext when it does not. The comparison and the
+ * selection are done without branching on the result, so a caller cannot
+ * learn from timing whether the ciphertext was valid.
+ * See section 6.3
+ */
+int
+MLKEM768_decap(uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES],
+    const uint8_t *ciphertext, size_t ciphertext_len,
+    const struct MLKEM768_private_key *private_key)
+{
+        const struct private_key *priv = private_key_768_from_external(
+            private_key);
+        uint8_t expected_ciphertext[MLKEM768_CIPHERTEXT_BYTES];
+        uint8_t key_and_randomness[64];
+        uint8_t failure_key[32];
+        uint8_t decrypted[64];
+        uint8_t mask;
+        int i;
+        if (ciphertext_len != MLKEM768_CIPHERTEXT_BYTES) {
+                arc4random_buf(out_shared_secret, MLKEM_SHARED_SECRET_BYTES);
+                return 0;
+        }
+        decrypt_cpa(decrypted, priv, ciphertext);
+        memcpy(decrypted + 32, priv->pub.public_key_hash,
+            sizeof(decrypted) - 32);
+        hash_g(key_and_randomness, decrypted, sizeof(decrypted));
+        encrypt_cpa(expected_ciphertext, &priv->pub, decrypted,
+            key_and_randomness + 32);
+        kdf(failure_key, priv->fo_failure_secret, ciphertext, ciphertext_len);
+        /*
+         * |expected_ciphertext| depends on the private key, so the comparison
+         * must not stop at the first differing byte as memcmp() may.
+         */
+        mask = constant_time_eq_int_8(timingsafe_memcmp(ciphertext,
+            expected_ciphertext, sizeof(expected_ciphertext)), 0);
+        for (i = 0; i < MLKEM_SHARED_SECRET_BYTES; i++) {
+                out_shared_secret[i] = constant_time_select_8(mask,
+                    key_and_randomness[i], failure_key[i]);
+        }
+        /* Wipe every secret-derived temporary before returning. */
+        explicit_bzero(expected_ciphertext, sizeof(expected_ciphertext));
+        explicit_bzero(key_and_randomness, sizeof(key_and_randomness));
+        explicit_bzero(failure_key, sizeof(failure_key));
+        explicit_bzero(decrypted, sizeof(decrypted));
+        return 1;
+}
+LCRYPTO_ALIAS(MLKEM768_decap);
+/*
+ * MLKEM768_marshal_public_key appends the encoding of |public_key| to |out|
+ * by forwarding to |mlkem_marshal_public_key|. It returns one on success and
+ * zero if |out| cannot take the bytes.
+ */
+int
+MLKEM768_marshal_public_key(CBB *out,
+    const struct MLKEM768_public_key *public_key)
+{
+        return mlkem_marshal_public_key(out,
+            public_key_768_from_external(public_key));
+}
+LCRYPTO_ALIAS(MLKEM768_marshal_public_key);
+/*
+ * mlkem_parse_public_key_no_hash parses |in| into |pub| but doesn't calculate
+ * the value of |pub->public_key_hash|. It consumes |kEncodedVectorSize| bytes
+ * for the vector t followed by the bytes of rho, and expands the matrix from
+ * rho. It returns one on success and zero if |in| is too short or a
+ * coefficient of t is out of range. No byte of |in| is read before its
+ * presence has been checked.
+ */
+static int
+mlkem_parse_public_key_no_hash(struct public_key *pub, CBS *in)
+{
+        CBS t_bytes, rho_bytes;
+        if (!CBS_get_bytes(in, &t_bytes, kEncodedVectorSize) ||
+            !vector_decode(&pub->t, CBS_data(&t_bytes), kLog2Prime)) {
+                return 0;
+        }
+        /* Check the length first; copying before the check reads past short input. */
+        if (!CBS_get_bytes(in, &rho_bytes, sizeof(pub->rho)))
+                return 0;
+        memcpy(pub->rho, CBS_data(&rho_bytes), sizeof(pub->rho));
+        matrix_expand(&pub->m, pub->rho);
+        return 1;
+}
+/*
+ * MLKEM768_parse_public_key parses an encoded public key from |in| into
+ * |public_key| and stores |hash_h| of the complete encoding as the public key
+ * hash. It returns one on success and zero if the key is malformed or |in|
+ * holds trailing bytes.
+ */
+int
+MLKEM768_parse_public_key(struct MLKEM768_public_key *public_key, CBS *in)
+{
+        struct public_key *pub = public_key_768_from_external(public_key);
+        /* Remember the full encoding before parsing advances |in|. */
+        const CBS encoded = *in;
+        if (!mlkem_parse_public_key_no_hash(pub, in))
+                return 0;
+        if (CBS_len(in) != 0)
+                return 0;
+        hash_h(pub->public_key_hash, CBS_data(&encoded), CBS_len(&encoded));
+        return 1;
+}
+LCRYPTO_ALIAS(MLKEM768_parse_public_key);
+/*
+ * MLKEM768_marshal_private_key appends |private_key| to |out| as, in order:
+ * the secret vector at |kLog2Prime| bits per coefficient, the encoded public
+ * key, the public key hash and the failure secret. It returns one on success
+ * and zero as soon as |out| cannot take more bytes.
+ */
+int
+MLKEM768_marshal_private_key(CBB *out,
+    const struct MLKEM768_private_key *private_key)
+{
+        const struct private_key *const priv = private_key_768_from_external(
+            private_key);
+        uint8_t *encoded_s;
+        if (!CBB_add_space(out, &encoded_s, kEncodedVectorSize))
+                return 0;
+        vector_encode(encoded_s, &priv->s, kLog2Prime);
+        if (!mlkem_marshal_public_key(out, &priv->pub))
+                return 0;
+        if (!CBB_add_bytes(out, priv->pub.public_key_hash,
+            sizeof(priv->pub.public_key_hash)))
+                return 0;
+        if (!CBB_add_bytes(out, priv->fo_failure_secret,
+            sizeof(priv->fo_failure_secret)))
+                return 0;
+        return 1;
+}
+/*
+ * MLKEM768_parse_private_key parses an encoded private key from |in| into
+ * |out_private_key|. The encoding is, in order: the secret vector at
+ * |kLog2Prime| bits per coefficient, the encoded public key, the public key
+ * hash and the failure secret. It returns one on success and zero if |in| is
+ * too short, holds trailing bytes, or contains an out-of-range coefficient.
+ * No byte of |in| is read before its presence has been checked.
+ */
+int
+MLKEM768_parse_private_key(struct MLKEM768_private_key *out_private_key,
+    CBS *in)
+{
+        struct private_key *const priv = private_key_768_from_external(
+            out_private_key);
+        CBS s_bytes, hash_bytes, secret_bytes;
+        if (!CBS_get_bytes(in, &s_bytes, kEncodedVectorSize) ||
+            !vector_decode(&priv->s, CBS_data(&s_bytes), kLog2Prime) ||
+            !mlkem_parse_public_key_no_hash(&priv->pub, in)) {
+                return 0;
+        }
+        /* Check the lengths first; copying before the check reads past short input. */
+        if (!CBS_get_bytes(in, &hash_bytes,
+            sizeof(priv->pub.public_key_hash)) ||
+            !CBS_get_bytes(in, &secret_bytes,
+            sizeof(priv->fo_failure_secret))) {
+                return 0;
+        }
+        if (CBS_len(in) != 0)
+                return 0;
+        memcpy(priv->pub.public_key_hash, CBS_data(&hash_bytes),
+            sizeof(priv->pub.public_key_hash));
+        memcpy(priv->fo_failure_secret, CBS_data(&secret_bytes),
+            sizeof(priv->fo_failure_secret));
+        return 1;
+}
+LCRYPTO_ALIAS(MLKEM768_parse_private_key);
diff --git a/src/lib/libcrypto/mlkem/mlkem_internal.h b/src/lib/libcrypto/mlkem/mlkem_internal.h
new file mode 100644
index 0000000000..3ef877f6d1
--- /dev/null
+++ b/src/lib/libcrypto/mlkem/mlkem_internal.h
@@ -0,0 +1,78 @@
+/* Copyright (c) 2023, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+#ifndef OPENSSL_HEADER_CRYPTO_MLKEM_INTERNAL_H
+#define OPENSSL_HEADER_CRYPTO_MLKEM_INTERNAL_H
+#include <openssl/mlkem.h>
+#if defined(__cplusplus)
+extern "C" {
+#endif
+__BEGIN_HIDDEN_DECLS
+/*
+ * MLKEM_ENCAP_ENTROPY is the number of bytes of uniformly random entropy
+ * necessary to encapsulate a secret. The entropy will be leaked to the
+ * decapsulating party.
+ */
+#define MLKEM_ENCAP_ENTROPY 32
+/*
+ * MLKEM768_generate_key_external_entropy is a deterministic function to create a
+ * pair of ML-KEM 768 keys, using the supplied entropy. The entropy needs to be
+ * uniformly random. This function should only be used for tests; regular
+ * callers should use the non-deterministic |MLKEM768_generate_key|
+ * directly.
+ */
+void MLKEM768_generate_key_external_entropy(
+    uint8_t out_encoded_public_key[MLKEM768_PUBLIC_KEY_BYTES],
+    struct MLKEM768_private_key *out_private_key,
+    const uint8_t entropy[MLKEM_SEED_BYTES]);
+/*
+ * MLKEM768_PRIVATE_KEY_BYTES is the length of the data produced by
+ * |MLKEM768_marshal_private_key|.
+ */
+#define MLKEM768_PRIVATE_KEY_BYTES 2400
+/*
+ * MLKEM768_marshal_private_key appends |private_key| to |out| in the standard
+ * ML-KEM private key format (|MLKEM768_PRIVATE_KEY_BYTES| bytes). It returns
+ * one on success or zero if |out| could not be extended.
+ */
+int MLKEM768_marshal_private_key(CBB *out,
+    const struct MLKEM768_private_key *private_key);
+/*
+ * MLKEM768_encap_external_entropy behaves like |MLKEM768_encap|, but uses
+ * |MLKEM_ENCAP_ENTROPY| bytes of |entropy| for randomization. The decapsulating
+ * side will be able to recover |entropy| in full. This function should only be
+ * used for tests; regular callers should use the non-deterministic
+ * |MLKEM768_encap| directly.
+ */
+void MLKEM768_encap_external_entropy(
+    uint8_t out_ciphertext[MLKEM768_CIPHERTEXT_BYTES],
+    uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES],
+    const struct MLKEM768_public_key *public_key,
+    const uint8_t entropy[MLKEM_ENCAP_ENTROPY]);
+__END_HIDDEN_DECLS
+#if defined(__cplusplus)
+}
+#endif
+#endif  /* OPENSSL_HEADER_CRYPTO_MLKEM_INTERNAL_H */
author	beck <>	2024-12-13 00:03:57 +0000
committer	beck <>	2024-12-13 00:03:57 +0000
commit	ed3c5d3a4797d4b1d8f9769cfc43f8e686a59621 (patch)
tree	a9ff1c725db56dbeb46224505b3dd6fd05a21777 /src/lib
parent	fd906c7b27573203602764309c3cf5faaefdf573 (diff)
download	openbsd-ed3c5d3a4797d4b1d8f9769cfc43f8e686a59621.tar.gz openbsd-ed3c5d3a4797d4b1d8f9769cfc43f8e686a59621.tar.bz2 openbsd-ed3c5d3a4797d4b1d8f9769cfc43f8e686a59621.zip