From fccd42a0ddbc572e85ac564dc08f0da4315d8275 Mon Sep 17 00:00:00 2001 From: jsing <> Date: Sat, 28 Jun 2025 12:51:08 +0000 Subject: Provide accelerated SHA-1 for aarch64. Provide an assembly implementation of SHA-1 for aarch64 using the ARM Cryptographic Extension (CE). This results in around a 2x speed up for larger block sizes. ok tb@ --- src/lib/libcrypto/arch/aarch64/Makefile.inc | 4 +- src/lib/libcrypto/arch/aarch64/crypto_arch.h | 3 +- src/lib/libcrypto/sha/sha1_aarch64.c | 34 +++++ src/lib/libcrypto/sha/sha1_aarch64_ce.S | 214 +++++++++++++++++++++++++++ 4 files changed, 253 insertions(+), 2 deletions(-) create mode 100644 src/lib/libcrypto/sha/sha1_aarch64.c create mode 100644 src/lib/libcrypto/sha/sha1_aarch64_ce.S (limited to 'src') diff --git a/src/lib/libcrypto/arch/aarch64/Makefile.inc b/src/lib/libcrypto/arch/aarch64/Makefile.inc index d93cb815ef..d1f22d87cd 100644 --- a/src/lib/libcrypto/arch/aarch64/Makefile.inc +++ b/src/lib/libcrypto/arch/aarch64/Makefile.inc @@ -1,9 +1,11 @@ -# $OpenBSD: Makefile.inc,v 1.16 2025/03/12 14:13:41 jsing Exp $ +# $OpenBSD: Makefile.inc,v 1.17 2025/06/28 12:51:08 jsing Exp $ # aarch64-specific libcrypto build rules SRCS += crypto_cpu_caps.c +SRCS += sha1_aarch64.c +SRCS += sha1_aarch64_ce.S SRCS += sha256_aarch64.c SRCS += sha256_aarch64_ce.S SRCS += sha512_aarch64.c diff --git a/src/lib/libcrypto/arch/aarch64/crypto_arch.h b/src/lib/libcrypto/arch/aarch64/crypto_arch.h index 35ecba9394..51c8d79e2d 100644 --- a/src/lib/libcrypto/arch/aarch64/crypto_arch.h +++ b/src/lib/libcrypto/arch/aarch64/crypto_arch.h @@ -1,4 +1,4 @@ -/* $OpenBSD: crypto_arch.h,v 1.4 2025/03/12 14:13:41 jsing Exp $ */ +/* $OpenBSD: crypto_arch.h,v 1.5 2025/06/28 12:51:08 jsing Exp $ */ /* * Copyright (c) 2024 Joel Sing * @@ -35,6 +35,7 @@ extern uint64_t crypto_cpu_caps_aarch64; #ifndef OPENSSL_NO_ASM +#define HAVE_SHA1_BLOCK_DATA_ORDER #define HAVE_SHA256_BLOCK_DATA_ORDER #define HAVE_SHA512_BLOCK_DATA_ORDER diff --git 
a/src/lib/libcrypto/sha/sha1_aarch64.c b/src/lib/libcrypto/sha/sha1_aarch64.c new file mode 100644 index 0000000000..04c87761e0 --- /dev/null +++ b/src/lib/libcrypto/sha/sha1_aarch64.c @@ -0,0 +1,34 @@ +/* $OpenBSD: sha1_aarch64.c,v 1.1 2025/06/28 12:51:08 jsing Exp $ */ +/* + * Copyright (c) 2025 Joel Sing + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */
+
+#include <openssl/sha.h>
+
+#include "crypto_arch.h"
+
+void sha1_block_ce(SHA_CTX *ctx, const void *in, size_t num);
+void sha1_block_generic(SHA_CTX *ctx, const void *in, size_t num);
+/* Dispatch to the CE code if the CPU has SHA-1 support, else the generic code. */
+void
+sha1_block_data_order(SHA_CTX *ctx, const void *in, size_t num)
+{
+	if ((crypto_cpu_caps_aarch64 & CRYPTO_CPU_CAPS_AARCH64_SHA1) != 0) {
+		sha1_block_ce(ctx, in, num);
+		return;
+	}
+
+	sha1_block_generic(ctx, in, num);
+}
diff --git a/src/lib/libcrypto/sha/sha1_aarch64_ce.S b/src/lib/libcrypto/sha/sha1_aarch64_ce.S
new file mode 100644
index 0000000000..8ccf230298
--- /dev/null
+++ b/src/lib/libcrypto/sha/sha1_aarch64_ce.S
@@ -0,0 +1,214 @@
+/* $OpenBSD: sha1_aarch64_ce.S,v 1.1 2025/06/28 12:51:08 jsing Exp $ */
+/*
+ * Copyright (c) 2023,2025 Joel Sing
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/*
+ * SHA-1 implementation using the ARM Cryptographic Extension (CE).
+ *
+ * There are six instructions for hardware acceleration of SHA-1 - the
+ * documentation for these instructions is woefully inadequate:
+ *
+ *  sha1c:   hash update (choose)
+ *  sha1h:   fixed rotate
+ *  sha1m:   hash update (majority)
+ *  sha1p:   hash update (parity)
+ *  sha1su0: message schedule update, part 0 (XOR step) for four rounds
+ *  sha1su1: message schedule update, part 1 (XOR and rotate) for four rounds
+ */
+
+#define ctx		x0
+#define in		x1
+#define num		x2
+
+/* Note: the lower 64 bits of v8 through v15 are callee saved. */
+
+#define hc0		v16
+#define hc1		v17
+#define hc1s		s17
+
+#define hs0		v18
+#define hs1		v19
+#define hs1s		s19
+
+#define w0		v20
+#define w1		v21
+#define w2		v22
+#define w3		v23
+
+#define k0		v24
+#define k1		v25
+#define k2		v26
+#define k3		v27
+
+#define tmp0		v28
+#define tmp1		s29
+
+#define tmp2		w11
+
+/*
+ * Update message schedule for m0 (W0:W1:W2:W3), using m1 (W4:W5:W6:W7),
+ * m2 (W8:W9:W10:W11) and m3 (W12:W13:W14:W15). The sha1su0 instruction computes
+ * W0 = W8 ^ W2 ^ W0, while sha1su1 computes rol(W0 ^ W13, 1).
+ */
+#define sha1_message_schedule_update(m0, m1, m2, m3) \
+	sha1su0 m0.4s, m1.4s, m2.4s;					\
+	sha1su1 m0.4s, m3.4s;
+
+/*
+ * Compute four SHA-1 rounds by adding W0:W1:W2:W3 + K0:K1:K2:K3, then
+ * computing the remainder of each round (including the shuffle) via
+ * sha1{c,p,m}/sha1h.
+ */
+
+#define sha1_round1(h0, h1, w, k) \
+	add	tmp0.4s, w.4s, k.4s;		/* Tt = Wt + Kt */	\
+	mov	tmp1, h0.s[0];			/* save a for sha1h */	\
+	sha1c	h0, h1, tmp0.4s;		/* four choose rounds */ \
+	sha1h	h1, tmp1;			/* e = fixed rotate of saved a */
+
+#define sha1_round2(h0, h1, w, k) \
+	add	tmp0.4s, w.4s, k.4s;		/* Tt = Wt + Kt */	\
+	mov	tmp1, h0.s[0];			/* save a for sha1h */	\
+	sha1p	h0, h1, tmp0.4s;		/* four parity rounds */ \
+	sha1h	h1, tmp1;			/* e = fixed rotate of saved a */
+
+#define sha1_round3(h0, h1, w, k) \
+	add	tmp0.4s, w.4s, k.4s;		/* Tt = Wt + Kt */	\
+	mov	tmp1, h0.s[0];			/* save a for sha1h */	\
+	sha1m	h0, h1, tmp0.4s;		/* four majority rounds */ \
+	sha1h	h1, tmp1;			/* e = fixed rotate of saved a */
+
+#define sha1_round4(h0, h1, w, k) \
+	add	tmp0.4s, w.4s, k.4s;		/* Tt = Wt + Kt */	\
+	mov	tmp1, h0.s[0];			/* save a for sha1h */	\
+	sha1p	h0, h1, tmp0.4s;		/* parity again; same body as sha1_round2 */ \
+	sha1h	h1, tmp1;			/* e = fixed rotate of saved a */
+
+.arch	armv8-a+sha2	/* NOTE(review): sha1* is assembled under +sha2 here - confirm */
+
+.text
+
+/*
+ * void sha1_block_ce(SHA_CTX *ctx, const void *in, size_t num);
+ * NOTE(review): was SHA256_CTX; only a:b:c:d:e (5 words) are used - confirm.
+ * Standard ARM ABI: x0 = ctx, x1 = in, x2 = num (64-byte blocks, non-zero)
+ */
+.globl	sha1_block_ce
+.type   sha1_block_ce,@function
+sha1_block_ce:
+
+	/*
+	 * Load SHA-1 round constants, one per group of 20 rounds.
+	 */
+
+	/* Round 1 - 0x5a827999 */
+	movz	tmp2, #0x5a82, lsl #16
+	movk	tmp2, #0x7999
+	dup	k0.4s, tmp2
+
+	/* Round 2 - 0x6ed9eba1 */
+	movz	tmp2, #0x6ed9, lsl #16
+	movk	tmp2, #0xeba1
+	dup	k1.4s, tmp2
+
+	/* Round 3 - 0x8f1bbcdc */
+	movz	tmp2, #0x8f1b, lsl #16
+	movk	tmp2, #0xbcdc
+	dup	k2.4s, tmp2
+
+	/* Round 4 - 0xca62c1d6 */
+	movz	tmp2, #0xca62, lsl #16
+	movk	tmp2, #0xc1d6
+	dup	k3.4s, tmp2
+
+	/* Load current hash state from context (hc0 = a:b:c:d, hc1 = e). */
+	ld1	{hc0.4s}, [ctx]
+	ldr	hc1s, [ctx, #(4*4)]
+
+block_loop:
+	/* Copy current hash state into the working registers (hs0, hs1). */
+	mov	hs0.4s, hc0.4s
+	mov	hs1s, hc1.s[0]
+
+	/* Load and byte swap message schedule; in advances by 64 bytes. */
+	ld1	{w0.16b, w1.16b, w2.16b, w3.16b}, [in], #64
+	rev32	w0.16b, w0.16b
+	rev32	w1.16b, w1.16b
+	rev32	w2.16b, w2.16b
+	rev32	w3.16b, w3.16b
+
+	/* Rounds 0 through 15 (four rounds at a time). */
+	sha1_round1(hs0, hs1s, w0, k0)
+	sha1_round1(hs0, hs1s, w1, k0)
+	sha1_round1(hs0, hs1s, w2, k0)
+	sha1_round1(hs0, hs1s, w3, k0)
+
+	/* Rounds 16 through 31 (16-19 still use choose/k0, then parity/k1). */
+	sha1_message_schedule_update(w0, w1, w2, w3)
+	sha1_message_schedule_update(w1, w2, w3, w0)
+	sha1_message_schedule_update(w2, w3, w0, w1)
+	sha1_message_schedule_update(w3, w0, w1, w2)
+
+	sha1_round1(hs0, hs1s, w0, k0)
+	sha1_round2(hs0, hs1s, w1, k1)
+	sha1_round2(hs0, hs1s, w2, k1)
+	sha1_round2(hs0, hs1s, w3, k1)
+
+	/* Rounds 32 through 47 (parity/k1 up to 39, then majority/k2). */
+	sha1_message_schedule_update(w0, w1, w2, w3)
+	sha1_message_schedule_update(w1, w2, w3, w0)
+	sha1_message_schedule_update(w2, w3, w0, w1)
+	sha1_message_schedule_update(w3, w0, w1, w2)
+
+	sha1_round2(hs0, hs1s, w0, k1)
+	sha1_round2(hs0, hs1s, w1, k1)
+	sha1_round3(hs0, hs1s, w2, k2)
+	sha1_round3(hs0, hs1s, w3, k2)
+
+	/* Rounds 48 through 63 (majority/k2 up to 59, then parity/k3). */
+	sha1_message_schedule_update(w0, w1, w2, w3)
+	sha1_message_schedule_update(w1, w2, w3, w0)
+	sha1_message_schedule_update(w2, w3, w0, w1)
+	sha1_message_schedule_update(w3, w0, w1, w2)
+
+	sha1_round3(hs0, hs1s, w0, k2)
+	sha1_round3(hs0, hs1s, w1, k2)
+	sha1_round3(hs0, hs1s, w2, k2)
+	sha1_round4(hs0, hs1s, w3, k3)
+
+	/* Rounds 64 through 79 (four rounds at a time). */
+	sha1_message_schedule_update(w0, w1, w2, w3)
+	sha1_message_schedule_update(w1, w2, w3, w0)
+	sha1_message_schedule_update(w2, w3, w0, w1)
+	sha1_message_schedule_update(w3, w0, w1, w2)
+
+	sha1_round4(hs0, hs1s, w0, k3)
+	sha1_round4(hs0, hs1s, w1, k3)
+	sha1_round4(hs0, hs1s, w2, k3)
+	sha1_round4(hs0, hs1s, w3, k3)
+
+	/* Add intermediate state to hash state. */
+	add	hc0.4s, hc0.4s, hs0.4s
+	add	hc1.4s, hc1.4s, hs1.4s
+
+	sub	num, num, #1		/* tested only after a block: num must be >= 1 on entry */
+	cbnz	num, block_loop
+
+	/* Store hash state to context. */
+	st1	{hc0.4s}, [ctx]
+	str	hc1s, [ctx, #(4*4)]
+
+	ret
-- 
cgit v1.2.3-55-g6feb