summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorbeck <>2024-12-17 17:06:10 +0000
committerbeck <>2024-12-17 17:06:10 +0000
commit7c551e383e59324312bfdf0f615ab2da73ce1217 (patch)
tree0fd6a01051f3ab0f76cbc4c2cfb8bb3db5aa8e3a /src
parente14f6b6a614c7fef4d056c57a6b22ae998c59312 (diff)
downloadopenbsd-7c551e383e59324312bfdf0f615ab2da73ce1217.tar.gz
openbsd-7c551e383e59324312bfdf0f615ab2da73ce1217.tar.bz2
openbsd-7c551e383e59324312bfdf0f615ab2da73ce1217.zip
Avoid a reduce once that can cause Clang misoptomization.
Some versions of Clang compile this to non-constant time code. The fix is adapted from boring. For full details see: https://boringssl-review.googlesource.com/c/boringssl/+/74447 ok tb@
Diffstat (limited to 'src')
-rw-r--r--src/lib/libcrypto/mlkem/mlkem1024.c38
-rw-r--r--src/lib/libcrypto/mlkem/mlkem768.c38
2 files changed, 54 insertions, 22 deletions
diff --git a/src/lib/libcrypto/mlkem/mlkem1024.c b/src/lib/libcrypto/mlkem/mlkem1024.c
index a6256ece83..d016a1de15 100644
--- a/src/lib/libcrypto/mlkem/mlkem1024.c
+++ b/src/lib/libcrypto/mlkem/mlkem1024.c
@@ -1,4 +1,4 @@
1/* $OpenBSD: mlkem1024.c,v 1.2 2024/12/17 07:13:47 tb Exp $ */ 1/* $OpenBSD: mlkem1024.c,v 1.3 2024/12/17 17:06:10 beck Exp $ */
2/* 2/*
3 * Copyright (c) 2024, Google Inc. 3 * Copyright (c) 2024, Google Inc.
4 * Copyright (c) 2024, Bob Beck <beck@obtuse.com> 4 * Copyright (c) 2024, Bob Beck <beck@obtuse.com>
@@ -181,11 +181,18 @@ reduce_once(uint16_t x)
181 assert(x < 2 * kPrime); 181 assert(x < 2 * kPrime);
182 const uint16_t subtracted = x - kPrime; 182 const uint16_t subtracted = x - kPrime;
183 uint16_t mask = 0u - (subtracted >> 15); 183 uint16_t mask = 0u - (subtracted >> 15);
184
184 /* 185 /*
185 * On Aarch64, omitting a |value_barrier_u16| results in a 2x speedup of 186 * Although this is a constant-time select, we omit a value barrier here.
186 * ML-KEM overall and Clang still produces constant-time code using 187 * Value barriers impede auto-vectorization (likely because it forces the
187 * `csel`. On other platforms & compilers on godbolt that we care about, 188 * value to transit through a general-purpose register). On AArch64, this
188 * this code also produces constant-time output. 189 * is a difference of 2x.
190 *
191 * We usually add value barriers to selects because Clang turns
192 * consecutive selects with the same condition into a branch instead of
193 * CMOV/CSEL. This condition does not occur in ML-KEM, so omitting it
194 * seems to be safe so far but see
195 * |scalar_centered_binomial_distribution_eta_2_with_prf|.
189 */ 196 */
190 return (mask & x) | (~mask & subtracted); 197 return (mask & x) | (~mask & subtracted);
191} 198}
@@ -466,17 +473,26 @@ scalar_centered_binomial_distribution_eta_2_with_prf(scalar *out,
466 473
467 for (i = 0; i < DEGREE; i += 2) { 474 for (i = 0; i < DEGREE; i += 2) {
468 uint8_t byte = entropy[i / 2]; 475 uint8_t byte = entropy[i / 2];
469 uint16_t value = kPrime; 476 uint16_t mask;
477 uint16_t value = (byte & 1) + ((byte >> 1) & 1);
470 478
471 value += (byte & 1) + ((byte >> 1) & 1);
472 value -= ((byte >> 2) & 1) + ((byte >> 3) & 1); 479 value -= ((byte >> 2) & 1) + ((byte >> 3) & 1);
473 out->c[i] = reduce_once(value); 480 /*
481 * Add |kPrime| if |value| underflowed. See |reduce_once| for a
482 * discussion on why the value barrier is omitted. While this
483 * could have been written reduce_once(value + kPrime), this is
484 * one extra addition and small range of |value| tempts some
485 * versions of Clang to emit a branch.
486 */
487 mask = 0u - (value >> 15);
488 out->c[i] = ((value + kPrime) & mask) | (value & ~mask);
474 489
475 byte >>= 4; 490 byte >>= 4;
476 value = kPrime; 491 value = (byte & 1) + ((byte >> 1) & 1);
477 value += (byte & 1) + ((byte >> 1) & 1);
478 value -= ((byte >> 2) & 1) + ((byte >> 3) & 1); 492 value -= ((byte >> 2) & 1) + ((byte >> 3) & 1);
479 out->c[i + 1] = reduce_once(value); 493 /* See above. */
494 mask = 0u - (value >> 15);
495 out->c[i + 1] = ((value + kPrime) & mask) | (value & ~mask);
480 } 496 }
481} 497}
482 498
diff --git a/src/lib/libcrypto/mlkem/mlkem768.c b/src/lib/libcrypto/mlkem/mlkem768.c
index daa026e2a3..4f8affaf60 100644
--- a/src/lib/libcrypto/mlkem/mlkem768.c
+++ b/src/lib/libcrypto/mlkem/mlkem768.c
@@ -1,4 +1,4 @@
1/* $OpenBSD: mlkem768.c,v 1.3 2024/12/17 07:13:47 tb Exp $ */ 1/* $OpenBSD: mlkem768.c,v 1.4 2024/12/17 17:06:10 beck Exp $ */
2/* 2/*
3 * Copyright (c) 2024, Google Inc. 3 * Copyright (c) 2024, Google Inc.
4 * Copyright (c) 2024, Bob Beck <beck@obtuse.com> 4 * Copyright (c) 2024, Bob Beck <beck@obtuse.com>
@@ -180,11 +180,18 @@ reduce_once(uint16_t x)
180 assert(x < 2 * kPrime); 180 assert(x < 2 * kPrime);
181 const uint16_t subtracted = x - kPrime; 181 const uint16_t subtracted = x - kPrime;
182 uint16_t mask = 0u - (subtracted >> 15); 182 uint16_t mask = 0u - (subtracted >> 15);
183
183 /* 184 /*
184 * On Aarch64, omitting a |value_barrier_u16| results in a 2x speedup of 185 * Although this is a constant-time select, we omit a value barrier here.
185 * ML-KEM overall and Clang still produces constant-time code using 186 * Value barriers impede auto-vectorization (likely because it forces the
186 * `csel`. On other platforms & compilers on godbolt that we care about, 187 * value to transit through a general-purpose register). On AArch64, this
187 * this code also produces constant-time output. 188 * is a difference of 2x.
189 *
190 * We usually add value barriers to selects because Clang turns
191 * consecutive selects with the same condition into a branch instead of
192 * CMOV/CSEL. This condition does not occur in ML-KEM, so omitting it
193 * seems to be safe so far but see
194 * |scalar_centered_binomial_distribution_eta_2_with_prf|.
188 */ 195 */
189 return (mask & x) | (~mask & subtracted); 196 return (mask & x) | (~mask & subtracted);
190} 197}
@@ -465,17 +472,26 @@ scalar_centered_binomial_distribution_eta_2_with_prf(scalar *out,
465 472
466 for (i = 0; i < DEGREE; i += 2) { 473 for (i = 0; i < DEGREE; i += 2) {
467 uint8_t byte = entropy[i / 2]; 474 uint8_t byte = entropy[i / 2];
468 uint16_t value = kPrime; 475 uint16_t mask;
476 uint16_t value = (byte & 1) + ((byte >> 1) & 1);
469 477
470 value += (byte & 1) + ((byte >> 1) & 1);
471 value -= ((byte >> 2) & 1) + ((byte >> 3) & 1); 478 value -= ((byte >> 2) & 1) + ((byte >> 3) & 1);
472 out->c[i] = reduce_once(value); 479 /*
480 * Add |kPrime| if |value| underflowed. See |reduce_once| for a
481 * discussion on why the value barrier is omitted. While this
482 * could have been written reduce_once(value + kPrime), this is
483 * one extra addition and small range of |value| tempts some
484 * versions of Clang to emit a branch.
485 */
486 mask = 0u - (value >> 15);
487 out->c[i] = ((value + kPrime) & mask) | (value & ~mask);
473 488
474 byte >>= 4; 489 byte >>= 4;
475 value = kPrime; 490 value = (byte & 1) + ((byte >> 1) & 1);
476 value += (byte & 1) + ((byte >> 1) & 1);
477 value -= ((byte >> 2) & 1) + ((byte >> 3) & 1); 491 value -= ((byte >> 2) & 1) + ((byte >> 3) & 1);
478 out->c[i + 1] = reduce_once(value); 492 /* See above. */
493 mask = 0u - (value >> 15);
494 out->c[i + 1] = ((value + kPrime) & mask) | (value & ~mask);
479 } 495 }
480} 496}
481 497