summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorjsing <>2023-06-17 14:43:50 +0000
committerjsing <>2023-06-17 14:43:50 +0000
commit1e9f6df5ef913da2d33a3fe78ba8621500d23a25 (patch)
tree08bd0fb55aab5260bd57e742a1e2602ffb35ee9e
parent3c634e1940d0eae384ff7716583caec4334afc3c (diff)
downloadopenbsd-1e9f6df5ef913da2d33a3fe78ba8621500d23a25.tar.gz
openbsd-1e9f6df5ef913da2d33a3fe78ba8621500d23a25.tar.bz2
openbsd-1e9f6df5ef913da2d33a3fe78ba8621500d23a25.zip
Speed up Montgomery multiplication.
Factor out and optimise the inner loop for Montgomery multiplication, making use of bn_qwmulw_addqw_addw() to perform Montgomery multiplication by one word in larger steps. This provides a significant performance gain, especially on platforms where bn_qwmulw_addqw_addw() is (or can be) optimised. ok tb@
-rw-r--r--src/lib/libcrypto/bn/bn_mont.c47
1 files changed, 37 insertions, 10 deletions
diff --git a/src/lib/libcrypto/bn/bn_mont.c b/src/lib/libcrypto/bn/bn_mont.c
index 6194e09953..b44246e070 100644
--- a/src/lib/libcrypto/bn/bn_mont.c
+++ b/src/lib/libcrypto/bn/bn_mont.c
@@ -1,4 +1,4 @@
1/* $OpenBSD: bn_mont.c,v 1.59 2023/04/30 05:21:20 tb Exp $ */ 1/* $OpenBSD: bn_mont.c,v 1.60 2023/06/17 14:43:50 jsing Exp $ */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) 2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved. 3 * All rights reserved.
4 * 4 *
@@ -327,6 +327,36 @@ bn_mod_mul_montgomery_simple(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
327 return ret; 327 return ret;
328} 328}
329 329
330static void
331bn_montgomery_multiply_word(const BN_ULONG *ap, BN_ULONG b, const BN_ULONG *np,
332 BN_ULONG *tp, BN_ULONG w, BN_ULONG *carry_a, BN_ULONG *carry_n, int n_len)
333{
334 BN_ULONG x3, x2, x1, x0;
335
336 *carry_a = *carry_n = 0;
337
338 while (n_len & ~3) {
339 bn_qwmulw_addqw_addw(ap[3], ap[2], ap[1], ap[0], b,
340 tp[3], tp[2], tp[1], tp[0], *carry_a, carry_a,
341 &x3, &x2, &x1, &x0);
342 bn_qwmulw_addqw_addw(np[3], np[2], np[1], np[0], w,
343 x3, x2, x1, x0, *carry_n, carry_n,
344 &tp[3], &tp[2], &tp[1], &tp[0]);
345 ap += 4;
346 np += 4;
347 tp += 4;
348 n_len -= 4;
349 }
350 while (n_len > 0) {
351 bn_mulw_addw_addw(ap[0], b, tp[0], *carry_a, carry_a, &x0);
352 bn_mulw_addw_addw(np[0], w, x0, *carry_n, carry_n, &tp[0]);
353 ap++;
354 np++;
355 tp++;
356 n_len--;
357 }
358}
359
330/* 360/*
331 * bn_montgomery_multiply_words() computes r = aR * bR * R^-1 = abR for the 361 * bn_montgomery_multiply_words() computes r = aR * bR * R^-1 = abR for the
332 * given word arrays. The caller must ensure that rp, ap, bp and np are all 362 * given word arrays. The caller must ensure that rp, ap, bp and np are all
@@ -336,10 +366,10 @@ void
336bn_montgomery_multiply_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, 366bn_montgomery_multiply_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
337 const BN_ULONG *np, BN_ULONG *tp, BN_ULONG n0, int n_len) 367 const BN_ULONG *np, BN_ULONG *tp, BN_ULONG n0, int n_len)
338{ 368{
339 BN_ULONG a0, b, carry_a, carry_n, carry, mask, w, x; 369 BN_ULONG a0, b, carry_a, carry_n, carry, mask, w;
340 int i, j; 370 int i;
341 371
342 carry_a = carry_n = carry = 0; 372 carry = 0;
343 373
344 for (i = 0; i < n_len; i++) 374 for (i = 0; i < n_len; i++)
345 tp[i] = 0; 375 tp[i] = 0;
@@ -349,15 +379,12 @@ bn_montgomery_multiply_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *b
349 for (i = 0; i < n_len; i++) { 379 for (i = 0; i < n_len; i++) {
350 b = bp[i]; 380 b = bp[i];
351 381
352 /* Compute new t[0] * n0, as we need it inside the loop. */ 382 /* Compute new t[0] * n0, as we need it for this iteration. */
353 w = (a0 * b + tp[0]) * n0; 383 w = (a0 * b + tp[0]) * n0;
354 384
355 for (j = 0; j < n_len; j++) { 385 bn_montgomery_multiply_word(ap, b, np, tp, w, &carry_a,
356 bn_mulw_addw_addw(ap[j], b, tp[j], carry_a, &carry_a, &x); 386 &carry_n, n_len);
357 bn_mulw_addw_addw(np[j], w, x, carry_n, &carry_n, &tp[j]);
358 }
359 bn_addw_addw(carry_a, carry_n, carry, &carry, &tp[n_len]); 387 bn_addw_addw(carry_a, carry_n, carry, &carry, &tp[n_len]);
360 carry_a = carry_n = 0;
361 388
362 tp++; 389 tp++;
363 } 390 }