From e074f5c2c19e5324112455428b2f4b66b012e9a6 Mon Sep 17 00:00:00 2001
From: jsing <>
Date: Sun, 28 May 2023 17:42:30 +0000
Subject: Provide optimised bn_mulw_{addw,addw_addw,addtw}() for aarch64.

This results in bn_mul_comba4() and bn_mul_comba8() requiring ~30% less
instructions than they did previously.
---
 src/lib/libcrypto/bn/arch/aarch64/bn_arch.h | 69 ++++++++++++++++++++++++++++-
 1 file changed, 68 insertions(+), 1 deletion(-)

(limited to 'src/lib')

diff --git a/src/lib/libcrypto/bn/arch/aarch64/bn_arch.h b/src/lib/libcrypto/bn/arch/aarch64/bn_arch.h
index 1b9358e710..708083aaf2 100644
--- a/src/lib/libcrypto/bn/arch/aarch64/bn_arch.h
+++ b/src/lib/libcrypto/bn/arch/aarch64/bn_arch.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: bn_arch.h,v 1.8 2023/05/28 17:22:04 jsing Exp $ */
+/* $OpenBSD: bn_arch.h,v 1.9 2023/05/28 17:42:30 jsing Exp $ */
 /*
  * Copyright (c) 2023 Joel Sing
  *
@@ -81,6 +81,73 @@ bn_mulw(BN_ULONG a, BN_ULONG b, BN_ULONG *out_r1, BN_ULONG *out_r0)
 	*out_r0 = r0;
 }
 
+#define HAVE_BN_MULW_ADDW
+
+static inline void
+bn_mulw_addw(BN_ULONG a, BN_ULONG b, BN_ULONG c, BN_ULONG *out_r1,
+    BN_ULONG *out_r0)
+{
+	BN_ULONG r1, r0;
+	// Compute the double word r1:r0 = a * b + c and store it via out_r1/out_r0.
+	__asm__ (
+	    "umulh %[r1], %[a], %[b] \n"	// r1 = high word of a * b
+	    "mul %[r0], %[a], %[b] \n"	// r0 = low word of a * b
+	    "adds %[r0], %[r0], %[c] \n"	// r0 += c, setting the carry flag
+	    "adc %[r1], %[r1], xzr \n"	// r1 += carry (xzr is the zero register)
+	    : [r1]"=&r"(r1), [r0]"=&r"(r0)	// early clobber: written while inputs are still needed
+	    : [a]"r"(a), [b]"r"(b), [c]"r"(c)
+	    : "cc");	// adds modifies the condition flags
+
+	*out_r1 = r1;
+	*out_r0 = r0;
+}
+
+#define HAVE_BN_MULW_ADDW_ADDW
+
+static inline void
+bn_mulw_addw_addw(BN_ULONG a, BN_ULONG b, BN_ULONG c, BN_ULONG d,
+    BN_ULONG *out_r1, BN_ULONG *out_r0)
+{
+	BN_ULONG r1, r0;
+	// Compute the double word r1:r0 = a * b + c + d and store it via out_r1/out_r0.
+	__asm__ (
+	    "umulh %[r1], %[a], %[b] \n"	// r1 = high word of a * b
+	    "mul %[r0], %[a], %[b] \n"	// r0 = low word of a * b
+	    "adds %[r0], %[r0], %[c] \n"	// r0 += c, setting the carry flag
+	    "adc %[r1], %[r1], xzr \n"	// r1 += carry from adding c
+	    "adds %[r0], %[r0], %[d] \n"	// r0 += d, setting the carry flag
+	    "adc %[r1], %[r1], xzr \n"	// r1 += carry from adding d
+	    : [r1]"=&r"(r1), [r0]"=&r"(r0)	// early clobber: written while inputs are still needed
+	    : [a]"r"(a), [b]"r"(b), [c]"r"(c), [d]"r"(d)
+	    : "cc");	// adds modifies the condition flags
+
+	*out_r1 = r1;
+	*out_r0 = r0;
+}
+
+#define HAVE_BN_MULW_ADDTW
+
+static inline void
+bn_mulw_addtw(BN_ULONG a, BN_ULONG b, BN_ULONG c2, BN_ULONG c1, BN_ULONG c0,
+    BN_ULONG *out_r2, BN_ULONG *out_r1, BN_ULONG *out_r0)
+{
+	BN_ULONG r2, r1, r0;
+	// Compute the triple word r2:r1:r0 = a * b + c2:c1:c0; a carry out of r2 is not kept.
+	__asm__ (
+	    "umulh %[r1], %[a], %[b] \n"	// r1 = high word of a * b
+	    "mul %[r0], %[a], %[b] \n"	// r0 = low word of a * b
+	    "adds %[r0], %[r0], %[c0] \n"	// r0 += c0, setting the carry flag
+	    "adcs %[r1], %[r1], %[c1] \n"	// r1 += c1 + carry, setting the carry flag
+	    "adc %[r2], xzr, %[c2] \n"	// r2 = c2 + carry
+	    : [r2]"=&r"(r2), [r1]"=&r"(r1), [r0]"=&r"(r0)	// early clobber: written while inputs are still needed
+	    : [a]"r"(a), [b]"r"(b), [c2]"r"(c2), [c1]"r"(c1), [c0]"r"(c0)
+	    : "cc");	// adds/adcs modify the condition flags
+
+	*out_r2 = r2;
+	*out_r1 = r1;
+	*out_r0 = r0;
+}
+
 #define HAVE_BN_SUBW
 
 static inline void
-- 
cgit v1.2.3-55-g6feb