From 7c551e383e59324312bfdf0f615ab2da73ce1217 Mon Sep 17 00:00:00 2001
From: beck <>
Date: Tue, 17 Dec 2024 17:06:10 +0000
Subject: Avoid a reduce once that can cause Clang misoptomization.

Some versions of Clang compile this to non-constant time
code. The fix is adapted from boring. For full details see:
https://boringssl-review.googlesource.com/c/boringssl/+/74447

ok tb@
---
 src/lib/libcrypto/mlkem/mlkem1024.c | 38 ++++++++++++++++++++++++++-----------
 src/lib/libcrypto/mlkem/mlkem768.c  | 38 ++++++++++++++++++++++++++-----------
 2 files changed, 54 insertions(+), 22 deletions(-)

(limited to 'src')

diff --git a/src/lib/libcrypto/mlkem/mlkem1024.c b/src/lib/libcrypto/mlkem/mlkem1024.c
index a6256ece83..d016a1de15 100644
--- a/src/lib/libcrypto/mlkem/mlkem1024.c
+++ b/src/lib/libcrypto/mlkem/mlkem1024.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: mlkem1024.c,v 1.2 2024/12/17 07:13:47 tb Exp $ */
+/* $OpenBSD: mlkem1024.c,v 1.3 2024/12/17 17:06:10 beck Exp $ */
 /*
  * Copyright (c) 2024, Google Inc.
  * Copyright (c) 2024, Bob Beck <beck@obtuse.com>
@@ -181,11 +181,18 @@ reduce_once(uint16_t x)
 	assert(x < 2 * kPrime);
 	const uint16_t subtracted = x - kPrime;
 	uint16_t mask = 0u - (subtracted >> 15);
+
 	/*
-	 * On Aarch64, omitting a |value_barrier_u16| results in a 2x speedup of
-	 * ML-KEM overall and Clang still produces constant-time code using
-	 * `csel`. On other platforms & compilers on godbolt that we care about,
-	 * this code also produces constant-time output.
+	 * Although this is a constant-time select, we omit a value barrier here.
+	 * Value barriers impede auto-vectorization (likely because it forces the
+	 * value to transit through a general-purpose register). On AArch64, this
+	 * is a difference of 2x.
+	 *
+	 * We usually add value barriers to selects because Clang turns 
+         * consecutive selects with the same condition into a branch instead of
+	 * CMOV/CSEL. This condition does not occur in ML-KEM, so omitting it
+         * seems to be safe so far  but see
+         * |scalar_centered_binomial_distribution_eta_2_with_prf|.
 	 */
 	return (mask & x) | (~mask & subtracted);
 }
@@ -466,17 +473,26 @@ scalar_centered_binomial_distribution_eta_2_with_prf(scalar *out,
 
 	for (i = 0; i < DEGREE; i += 2) {
 		uint8_t byte = entropy[i / 2];
-		uint16_t value = kPrime;
+		uint16_t mask;
+		uint16_t value = (byte & 1) + ((byte >> 1) & 1);
 
-		value += (byte & 1) + ((byte >> 1) & 1);
 		value -= ((byte >> 2) & 1) + ((byte >> 3) & 1);
-		out->c[i] = reduce_once(value);
+		/*
+		 * Add |kPrime| if |value| underflowed. See |reduce_once| for a
+		 * discussion on why the value barrier is omitted. While this
+		 * could have been written reduce_once(value + kPrime), this is
+		 * one extra addition and small range of |value| tempts some
+		 * versions of Clang to emit a branch.
+		 */
+		mask = 0u - (value >> 15);
+		out->c[i] = ((value + kPrime) & mask) | (value & ~mask);
 
 		byte >>= 4;
-		value = kPrime;
-		value += (byte & 1) + ((byte >> 1) & 1);
+		value = (byte & 1) + ((byte >> 1) & 1);
 		value -= ((byte >> 2) & 1) + ((byte >> 3) & 1);
-		out->c[i + 1] = reduce_once(value);
+		/*  See above. */
+		mask = 0u - (value >> 15);
+		out->c[i + 1] = ((value + kPrime) & mask) | (value & ~mask);
 	}
 }
 
diff --git a/src/lib/libcrypto/mlkem/mlkem768.c b/src/lib/libcrypto/mlkem/mlkem768.c
index daa026e2a3..4f8affaf60 100644
--- a/src/lib/libcrypto/mlkem/mlkem768.c
+++ b/src/lib/libcrypto/mlkem/mlkem768.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: mlkem768.c,v 1.3 2024/12/17 07:13:47 tb Exp $ */
+/* $OpenBSD: mlkem768.c,v 1.4 2024/12/17 17:06:10 beck Exp $ */
 /*
  * Copyright (c) 2024, Google Inc.
  * Copyright (c) 2024, Bob Beck <beck@obtuse.com>
@@ -180,11 +180,18 @@ reduce_once(uint16_t x)
 	assert(x < 2 * kPrime);
 	const uint16_t subtracted = x - kPrime;
 	uint16_t mask = 0u - (subtracted >> 15);
+
 	/*
-	 * On Aarch64, omitting a |value_barrier_u16| results in a 2x speedup of
-	 * ML-KEM overall and Clang still produces constant-time code using
-	 * `csel`. On other platforms & compilers on godbolt that we care about,
-	 * this code also produces constant-time output.
+	 * Although this is a constant-time select, we omit a value barrier here.
+	 * Value barriers impede auto-vectorization (likely because it forces the
+	 * value to transit through a general-purpose register). On AArch64, this
+	 * is a difference of 2x.
+	 *
+	 * We usually add value barriers to selects because Clang turns 
+         * consecutive selects with the same condition into a branch instead of
+	 * CMOV/CSEL. This condition does not occur in ML-KEM, so omitting it
+         * seems to be safe so far  but see
+         * |scalar_centered_binomial_distribution_eta_2_with_prf|.
 	 */
 	return (mask & x) | (~mask & subtracted);
 }
@@ -465,17 +472,26 @@ scalar_centered_binomial_distribution_eta_2_with_prf(scalar *out,
 
 	for (i = 0; i < DEGREE; i += 2) {
 		uint8_t byte = entropy[i / 2];
-		uint16_t value = kPrime;
+		uint16_t mask;
+		uint16_t value = (byte & 1) + ((byte >> 1) & 1);
 
-		value += (byte & 1) + ((byte >> 1) & 1);
 		value -= ((byte >> 2) & 1) + ((byte >> 3) & 1);
-		out->c[i] = reduce_once(value);
+		/*
+		 * Add |kPrime| if |value| underflowed. See |reduce_once| for a
+		 * discussion on why the value barrier is omitted. While this
+		 * could have been written reduce_once(value + kPrime), this is
+		 * one extra addition and small range of |value| tempts some
+		 * versions of Clang to emit a branch.
+		 */
+		mask = 0u - (value >> 15);
+		out->c[i] = ((value + kPrime) & mask) | (value & ~mask);
 
 		byte >>= 4;
-		value = kPrime;
-		value += (byte & 1) + ((byte >> 1) & 1);
+		value = (byte & 1) + ((byte >> 1) & 1);
 		value -= ((byte >> 2) & 1) + ((byte >> 3) & 1);
-		out->c[i + 1] = reduce_once(value);
+		/*  See above. */
+		mask = 0u - (value >> 15);
+		out->c[i + 1] = ((value + kPrime) & mask) | (value & ~mask);
 	}
 }
 
-- 
cgit v1.2.3-55-g6feb