From 7c551e383e59324312bfdf0f615ab2da73ce1217 Mon Sep 17 00:00:00 2001 From: beck <> Date: Tue, 17 Dec 2024 17:06:10 +0000 Subject: Avoid a reduce once that can cause Clang misoptomization. Some versions of Clang compile this to non-constant time code. The fix is adapted from boring. For full details see: https://boringssl-review.googlesource.com/c/boringssl/+/74447 ok tb@ --- src/lib/libcrypto/mlkem/mlkem1024.c | 38 ++++++++++++++++++++++++++----------- src/lib/libcrypto/mlkem/mlkem768.c | 38 ++++++++++++++++++++++++++----------- 2 files changed, 54 insertions(+), 22 deletions(-) (limited to 'src') diff --git a/src/lib/libcrypto/mlkem/mlkem1024.c b/src/lib/libcrypto/mlkem/mlkem1024.c index a6256ece83..d016a1de15 100644 --- a/src/lib/libcrypto/mlkem/mlkem1024.c +++ b/src/lib/libcrypto/mlkem/mlkem1024.c @@ -1,4 +1,4 @@ -/* $OpenBSD: mlkem1024.c,v 1.2 2024/12/17 07:13:47 tb Exp $ */ +/* $OpenBSD: mlkem1024.c,v 1.3 2024/12/17 17:06:10 beck Exp $ */ /* * Copyright (c) 2024, Google Inc. * Copyright (c) 2024, Bob Beck @@ -181,11 +181,18 @@ reduce_once(uint16_t x) assert(x < 2 * kPrime); const uint16_t subtracted = x - kPrime; uint16_t mask = 0u - (subtracted >> 15); + /* - * On Aarch64, omitting a |value_barrier_u16| results in a 2x speedup of - * ML-KEM overall and Clang still produces constant-time code using - * `csel`. On other platforms & compilers on godbolt that we care about, - * this code also produces constant-time output. + * Although this is a constant-time select, we omit a value barrier here. + * Value barriers impede auto-vectorization (likely because it forces the + * value to transit through a general-purpose register). On AArch64, this + * is a difference of 2x. + * + * We usually add value barriers to selects because Clang turns + * consecutive selects with the same condition into a branch instead of + * CMOV/CSEL. This condition does not occur in ML-KEM, so omitting it + * seems to be safe so far but see + * |scalar_centered_binomial_distribution_eta_2_with_prf|. */ return (mask & x) | (~mask & subtracted); } @@ -466,17 +473,26 @@ scalar_centered_binomial_distribution_eta_2_with_prf(scalar *out, for (i = 0; i < DEGREE; i += 2) { uint8_t byte = entropy[i / 2]; - uint16_t value = kPrime; + uint16_t mask; + uint16_t value = (byte & 1) + ((byte >> 1) & 1); - value += (byte & 1) + ((byte >> 1) & 1); value -= ((byte >> 2) & 1) + ((byte >> 3) & 1); - out->c[i] = reduce_once(value); + /* + * Add |kPrime| if |value| underflowed. See |reduce_once| for a + * discussion on why the value barrier is omitted. While this + * could have been written reduce_once(value + kPrime), this is + * one extra addition and small range of |value| tempts some + * versions of Clang to emit a branch. + */ + mask = 0u - (value >> 15); + out->c[i] = ((value + kPrime) & mask) | (value & ~mask); byte >>= 4; - value = kPrime; - value += (byte & 1) + ((byte >> 1) & 1); + value = (byte & 1) + ((byte >> 1) & 1); value -= ((byte >> 2) & 1) + ((byte >> 3) & 1); - out->c[i + 1] = reduce_once(value); + /* See above. */ + mask = 0u - (value >> 15); + out->c[i + 1] = ((value + kPrime) & mask) | (value & ~mask); } } diff --git a/src/lib/libcrypto/mlkem/mlkem768.c b/src/lib/libcrypto/mlkem/mlkem768.c index daa026e2a3..4f8affaf60 100644 --- a/src/lib/libcrypto/mlkem/mlkem768.c +++ b/src/lib/libcrypto/mlkem/mlkem768.c @@ -1,4 +1,4 @@ -/* $OpenBSD: mlkem768.c,v 1.3 2024/12/17 07:13:47 tb Exp $ */ +/* $OpenBSD: mlkem768.c,v 1.4 2024/12/17 17:06:10 beck Exp $ */ /* * Copyright (c) 2024, Google Inc. * Copyright (c) 2024, Bob Beck @@ -180,11 +180,18 @@ reduce_once(uint16_t x) assert(x < 2 * kPrime); const uint16_t subtracted = x - kPrime; uint16_t mask = 0u - (subtracted >> 15); + /* - * On Aarch64, omitting a |value_barrier_u16| results in a 2x speedup of - * ML-KEM overall and Clang still produces constant-time code using - * `csel`. On other platforms & compilers on godbolt that we care about, - * this code also produces constant-time output. + * Although this is a constant-time select, we omit a value barrier here. + * Value barriers impede auto-vectorization (likely because it forces the + * value to transit through a general-purpose register). On AArch64, this + * is a difference of 2x. + * + * We usually add value barriers to selects because Clang turns + * consecutive selects with the same condition into a branch instead of + * CMOV/CSEL. This condition does not occur in ML-KEM, so omitting it + * seems to be safe so far but see + * |scalar_centered_binomial_distribution_eta_2_with_prf|. */ return (mask & x) | (~mask & subtracted); } @@ -465,17 +472,26 @@ scalar_centered_binomial_distribution_eta_2_with_prf(scalar *out, for (i = 0; i < DEGREE; i += 2) { uint8_t byte = entropy[i / 2]; - uint16_t value = kPrime; + uint16_t mask; + uint16_t value = (byte & 1) + ((byte >> 1) & 1); - value += (byte & 1) + ((byte >> 1) & 1); value -= ((byte >> 2) & 1) + ((byte >> 3) & 1); - out->c[i] = reduce_once(value); + /* + * Add |kPrime| if |value| underflowed. See |reduce_once| for a + * discussion on why the value barrier is omitted. While this + * could have been written reduce_once(value + kPrime), this is + * one extra addition and small range of |value| tempts some + * versions of Clang to emit a branch. + */ + mask = 0u - (value >> 15); + out->c[i] = ((value + kPrime) & mask) | (value & ~mask); byte >>= 4; - value = kPrime; - value += (byte & 1) + ((byte >> 1) & 1); + value = (byte & 1) + ((byte >> 1) & 1); value -= ((byte >> 2) & 1) + ((byte >> 3) & 1); - out->c[i + 1] = reduce_once(value); + /* See above. */ + mask = 0u - (value >> 15); + out->c[i + 1] = ((value + kPrime) & mask) | (value & ~mask); } } -- cgit v1.2.3-55-g6feb