diff options
author | beck <> | 2024-12-17 17:06:10 +0000 |
---|---|---|
committer | beck <> | 2024-12-17 17:06:10 +0000 |
commit | 7c551e383e59324312bfdf0f615ab2da73ce1217 (patch) | |
tree | 0fd6a01051f3ab0f76cbc4c2cfb8bb3db5aa8e3a /src | |
parent | e14f6b6a614c7fef4d056c57a6b22ae998c59312 (diff) | |
download | openbsd-7c551e383e59324312bfdf0f615ab2da73ce1217.tar.gz openbsd-7c551e383e59324312bfdf0f615ab2da73ce1217.tar.bz2 openbsd-7c551e383e59324312bfdf0f615ab2da73ce1217.zip |
Avoid a reduce once that can cause Clang misoptomization.
Some versions of Clang compile this to non-constant time
code. The fix is adapted from boring. For full details see:
https://boringssl-review.googlesource.com/c/boringssl/+/74447
ok tb@
Diffstat (limited to 'src')
-rw-r--r-- | src/lib/libcrypto/mlkem/mlkem1024.c | 38 | ||||
-rw-r--r-- | src/lib/libcrypto/mlkem/mlkem768.c | 38 |
2 files changed, 54 insertions, 22 deletions
diff --git a/src/lib/libcrypto/mlkem/mlkem1024.c b/src/lib/libcrypto/mlkem/mlkem1024.c index a6256ece83..d016a1de15 100644 --- a/src/lib/libcrypto/mlkem/mlkem1024.c +++ b/src/lib/libcrypto/mlkem/mlkem1024.c | |||
@@ -1,4 +1,4 @@ | |||
1 | /* $OpenBSD: mlkem1024.c,v 1.2 2024/12/17 07:13:47 tb Exp $ */ | 1 | /* $OpenBSD: mlkem1024.c,v 1.3 2024/12/17 17:06:10 beck Exp $ */ |
2 | /* | 2 | /* |
3 | * Copyright (c) 2024, Google Inc. | 3 | * Copyright (c) 2024, Google Inc. |
4 | * Copyright (c) 2024, Bob Beck <beck@obtuse.com> | 4 | * Copyright (c) 2024, Bob Beck <beck@obtuse.com> |
@@ -181,11 +181,18 @@ reduce_once(uint16_t x) | |||
181 | assert(x < 2 * kPrime); | 181 | assert(x < 2 * kPrime); |
182 | const uint16_t subtracted = x - kPrime; | 182 | const uint16_t subtracted = x - kPrime; |
183 | uint16_t mask = 0u - (subtracted >> 15); | 183 | uint16_t mask = 0u - (subtracted >> 15); |
184 | |||
184 | /* | 185 | /* |
185 | * On Aarch64, omitting a |value_barrier_u16| results in a 2x speedup of | 186 | * Although this is a constant-time select, we omit a value barrier here. |
186 | * ML-KEM overall and Clang still produces constant-time code using | 187 | * Value barriers impede auto-vectorization (likely because it forces the |
187 | * `csel`. On other platforms & compilers on godbolt that we care about, | 188 | * value to transit through a general-purpose register). On AArch64, this |
188 | * this code also produces constant-time output. | 189 | * is a difference of 2x. |
190 | * | ||
191 | * We usually add value barriers to selects because Clang turns | ||
192 | * consecutive selects with the same condition into a branch instead of | ||
193 | * CMOV/CSEL. This condition does not occur in ML-KEM, so omitting it | ||
194 | * seems to be safe so far but see | ||
195 | * |scalar_centered_binomial_distribution_eta_2_with_prf|. | ||
189 | */ | 196 | */ |
190 | return (mask & x) | (~mask & subtracted); | 197 | return (mask & x) | (~mask & subtracted); |
191 | } | 198 | } |
@@ -466,17 +473,26 @@ scalar_centered_binomial_distribution_eta_2_with_prf(scalar *out, | |||
466 | 473 | ||
467 | for (i = 0; i < DEGREE; i += 2) { | 474 | for (i = 0; i < DEGREE; i += 2) { |
468 | uint8_t byte = entropy[i / 2]; | 475 | uint8_t byte = entropy[i / 2]; |
469 | uint16_t value = kPrime; | 476 | uint16_t mask; |
477 | uint16_t value = (byte & 1) + ((byte >> 1) & 1); | ||
470 | 478 | ||
471 | value += (byte & 1) + ((byte >> 1) & 1); | ||
472 | value -= ((byte >> 2) & 1) + ((byte >> 3) & 1); | 479 | value -= ((byte >> 2) & 1) + ((byte >> 3) & 1); |
473 | out->c[i] = reduce_once(value); | 480 | /* |
481 | * Add |kPrime| if |value| underflowed. See |reduce_once| for a | ||
482 | * discussion on why the value barrier is omitted. While this | ||
483 | * could have been written reduce_once(value + kPrime), this is | ||
484 | * one extra addition and small range of |value| tempts some | ||
485 | * versions of Clang to emit a branch. | ||
486 | */ | ||
487 | mask = 0u - (value >> 15); | ||
488 | out->c[i] = ((value + kPrime) & mask) | (value & ~mask); | ||
474 | 489 | ||
475 | byte >>= 4; | 490 | byte >>= 4; |
476 | value = kPrime; | 491 | value = (byte & 1) + ((byte >> 1) & 1); |
477 | value += (byte & 1) + ((byte >> 1) & 1); | ||
478 | value -= ((byte >> 2) & 1) + ((byte >> 3) & 1); | 492 | value -= ((byte >> 2) & 1) + ((byte >> 3) & 1); |
479 | out->c[i + 1] = reduce_once(value); | 493 | /* See above. */ |
494 | mask = 0u - (value >> 15); | ||
495 | out->c[i + 1] = ((value + kPrime) & mask) | (value & ~mask); | ||
480 | } | 496 | } |
481 | } | 497 | } |
482 | 498 | ||
diff --git a/src/lib/libcrypto/mlkem/mlkem768.c b/src/lib/libcrypto/mlkem/mlkem768.c index daa026e2a3..4f8affaf60 100644 --- a/src/lib/libcrypto/mlkem/mlkem768.c +++ b/src/lib/libcrypto/mlkem/mlkem768.c | |||
@@ -1,4 +1,4 @@ | |||
1 | /* $OpenBSD: mlkem768.c,v 1.3 2024/12/17 07:13:47 tb Exp $ */ | 1 | /* $OpenBSD: mlkem768.c,v 1.4 2024/12/17 17:06:10 beck Exp $ */ |
2 | /* | 2 | /* |
3 | * Copyright (c) 2024, Google Inc. | 3 | * Copyright (c) 2024, Google Inc. |
4 | * Copyright (c) 2024, Bob Beck <beck@obtuse.com> | 4 | * Copyright (c) 2024, Bob Beck <beck@obtuse.com> |
@@ -180,11 +180,18 @@ reduce_once(uint16_t x) | |||
180 | assert(x < 2 * kPrime); | 180 | assert(x < 2 * kPrime); |
181 | const uint16_t subtracted = x - kPrime; | 181 | const uint16_t subtracted = x - kPrime; |
182 | uint16_t mask = 0u - (subtracted >> 15); | 182 | uint16_t mask = 0u - (subtracted >> 15); |
183 | |||
183 | /* | 184 | /* |
184 | * On Aarch64, omitting a |value_barrier_u16| results in a 2x speedup of | 185 | * Although this is a constant-time select, we omit a value barrier here. |
185 | * ML-KEM overall and Clang still produces constant-time code using | 186 | * Value barriers impede auto-vectorization (likely because it forces the |
186 | * `csel`. On other platforms & compilers on godbolt that we care about, | 187 | * value to transit through a general-purpose register). On AArch64, this |
187 | * this code also produces constant-time output. | 188 | * is a difference of 2x. |
189 | * | ||
190 | * We usually add value barriers to selects because Clang turns | ||
191 | * consecutive selects with the same condition into a branch instead of | ||
192 | * CMOV/CSEL. This condition does not occur in ML-KEM, so omitting it | ||
193 | * seems to be safe so far but see | ||
194 | * |scalar_centered_binomial_distribution_eta_2_with_prf|. | ||
188 | */ | 195 | */ |
189 | return (mask & x) | (~mask & subtracted); | 196 | return (mask & x) | (~mask & subtracted); |
190 | } | 197 | } |
@@ -465,17 +472,26 @@ scalar_centered_binomial_distribution_eta_2_with_prf(scalar *out, | |||
465 | 472 | ||
466 | for (i = 0; i < DEGREE; i += 2) { | 473 | for (i = 0; i < DEGREE; i += 2) { |
467 | uint8_t byte = entropy[i / 2]; | 474 | uint8_t byte = entropy[i / 2]; |
468 | uint16_t value = kPrime; | 475 | uint16_t mask; |
476 | uint16_t value = (byte & 1) + ((byte >> 1) & 1); | ||
469 | 477 | ||
470 | value += (byte & 1) + ((byte >> 1) & 1); | ||
471 | value -= ((byte >> 2) & 1) + ((byte >> 3) & 1); | 478 | value -= ((byte >> 2) & 1) + ((byte >> 3) & 1); |
472 | out->c[i] = reduce_once(value); | 479 | /* |
480 | * Add |kPrime| if |value| underflowed. See |reduce_once| for a | ||
481 | * discussion on why the value barrier is omitted. While this | ||
482 | * could have been written reduce_once(value + kPrime), this is | ||
483 | * one extra addition and small range of |value| tempts some | ||
484 | * versions of Clang to emit a branch. | ||
485 | */ | ||
486 | mask = 0u - (value >> 15); | ||
487 | out->c[i] = ((value + kPrime) & mask) | (value & ~mask); | ||
473 | 488 | ||
474 | byte >>= 4; | 489 | byte >>= 4; |
475 | value = kPrime; | 490 | value = (byte & 1) + ((byte >> 1) & 1); |
476 | value += (byte & 1) + ((byte >> 1) & 1); | ||
477 | value -= ((byte >> 2) & 1) + ((byte >> 3) & 1); | 491 | value -= ((byte >> 2) & 1) + ((byte >> 3) & 1); |
478 | out->c[i + 1] = reduce_once(value); | 492 | /* See above. */ |
493 | mask = 0u - (value >> 15); | ||
494 | out->c[i + 1] = ((value + kPrime) & mask) | (value & ~mask); | ||
479 | } | 495 | } |
480 | } | 496 | } |
481 | 497 | ||