From fccd42a0ddbc572e85ac564dc08f0da4315d8275 Mon Sep 17 00:00:00 2001
From: jsing <>
Date: Sat, 28 Jun 2025 12:51:08 +0000
Subject: Provide accelerated SHA-1 for aarch64.

Provide an assembly implementation of SHA-1 for aarch64 using the ARM
Cryptographic Extension (CE). This results in around a 2x speed up for
larger block sizes.

ok tb@
---
 src/lib/libcrypto/arch/aarch64/Makefile.inc  |   4 +-
 src/lib/libcrypto/arch/aarch64/crypto_arch.h |   3 +-
 src/lib/libcrypto/sha/sha1_aarch64.c         |  34 +++++
 src/lib/libcrypto/sha/sha1_aarch64_ce.S      | 214 +++++++++++++++++++++++++++
 4 files changed, 253 insertions(+), 2 deletions(-)
 create mode 100644 src/lib/libcrypto/sha/sha1_aarch64.c
 create mode 100644 src/lib/libcrypto/sha/sha1_aarch64_ce.S

(limited to 'src')

diff --git a/src/lib/libcrypto/arch/aarch64/Makefile.inc b/src/lib/libcrypto/arch/aarch64/Makefile.inc
index d93cb815ef..d1f22d87cd 100644
--- a/src/lib/libcrypto/arch/aarch64/Makefile.inc
+++ b/src/lib/libcrypto/arch/aarch64/Makefile.inc
@@ -1,9 +1,11 @@
-# $OpenBSD: Makefile.inc,v 1.16 2025/03/12 14:13:41 jsing Exp $
+# $OpenBSD: Makefile.inc,v 1.17 2025/06/28 12:51:08 jsing Exp $
 
 # aarch64-specific libcrypto build rules
 
 SRCS += crypto_cpu_caps.c
 
+SRCS += sha1_aarch64.c
+SRCS += sha1_aarch64_ce.S
 SRCS += sha256_aarch64.c
 SRCS += sha256_aarch64_ce.S
 SRCS += sha512_aarch64.c
diff --git a/src/lib/libcrypto/arch/aarch64/crypto_arch.h b/src/lib/libcrypto/arch/aarch64/crypto_arch.h
index 35ecba9394..51c8d79e2d 100644
--- a/src/lib/libcrypto/arch/aarch64/crypto_arch.h
+++ b/src/lib/libcrypto/arch/aarch64/crypto_arch.h
@@ -1,4 +1,4 @@
-/*	$OpenBSD: crypto_arch.h,v 1.4 2025/03/12 14:13:41 jsing Exp $ */
+/*	$OpenBSD: crypto_arch.h,v 1.5 2025/06/28 12:51:08 jsing Exp $ */
 /*
  * Copyright (c) 2024 Joel Sing <jsing@openbsd.org>
  *
@@ -35,6 +35,7 @@ extern uint64_t crypto_cpu_caps_aarch64;
 
 #ifndef OPENSSL_NO_ASM
 
+#define HAVE_SHA1_BLOCK_DATA_ORDER
 #define HAVE_SHA256_BLOCK_DATA_ORDER
 #define HAVE_SHA512_BLOCK_DATA_ORDER
 
diff --git a/src/lib/libcrypto/sha/sha1_aarch64.c b/src/lib/libcrypto/sha/sha1_aarch64.c
new file mode 100644
index 0000000000..04c87761e0
--- /dev/null
+++ b/src/lib/libcrypto/sha/sha1_aarch64.c
@@ -0,0 +1,34 @@
+/* $OpenBSD: sha1_aarch64.c,v 1.1 2025/06/28 12:51:08 jsing Exp $ */
+/*
+ * Copyright (c) 2025 Joel Sing <jsing@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <openssl/sha.h>
+
+#include "crypto_arch.h"
+
+void sha1_block_ce(SHA256_CTX *ctx, const void *in, size_t num);
+void sha1_block_generic(SHA256_CTX *ctx, const void *in, size_t num);
+
+void
+sha1_block_data_order(SHA256_CTX *ctx, const void *in, size_t num)
+{
+	if ((crypto_cpu_caps_aarch64 & CRYPTO_CPU_CAPS_AARCH64_SHA1) != 0) {
+		sha1_block_ce(ctx, in, num);
+		return;
+	}
+
+	sha1_block_generic(ctx, in, num);
+}
diff --git a/src/lib/libcrypto/sha/sha1_aarch64_ce.S b/src/lib/libcrypto/sha/sha1_aarch64_ce.S
new file mode 100644
index 0000000000..8ccf230298
--- /dev/null
+++ b/src/lib/libcrypto/sha/sha1_aarch64_ce.S
@@ -0,0 +1,214 @@
+/* $OpenBSD: sha1_aarch64_ce.S,v 1.1 2025/06/28 12:51:08 jsing Exp $ */
+/*
+ * Copyright (c) 2023,2025 Joel Sing <jsing@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/*
+ * SHA-1 implementation using the ARM Cryptographic Extension (CE).
+ *
+ * There are six instructions for hardware acceleration of SHA-1 - the
+ * documentation for these instructions is woefully inadequate:
+ *
+ *  sha1c:   hash update (choose)
+ *  sha1h:   fixed rotate
+ *  sha1m:   hash update (majority)
+ *  sha1p:   hash update (parity)
+ *  sha1su0: message schedule update with sigma0 for four rounds
+ *  sha1su1: message schedule update with sigma1 for four rounds
+ */
+
+#define ctx		x0
+#define in		x1
+#define num		x2
+
+/* Note: the lower 64 bits of v8 through v15 are callee saved. */
+
+#define hc0		v16
+#define hc1		v17
+#define hc1s		s17
+
+#define hs0		v18
+#define hs1		v19
+#define hs1s		s19
+
+#define w0		v20
+#define w1		v21
+#define w2		v22
+#define w3		v23
+
+#define k0		v24
+#define k1		v25
+#define k2		v26
+#define k3		v27
+
+#define tmp0		v28
+#define tmp1		s29
+
+#define tmp2		w11
+
+/*
+ * Update message schedule for m0 (W0:W1:W2:W3), using m1 (W4:W5:W6:W7),
+ * m2 (W8:W9:W10:11) and m3 (W12:W13:W14:W15). The sha1su0 instruction computes
+ * W0 = W8 ^ W2 ^ W0, while sha1su1 computes rol(W0 ^ W13, 1).
+ */
+#define sha1_message_schedule_update(m0, m1, m2, m3) \
+	sha1su0 m0.4s, m1.4s, m2.4s;					\
+	sha1su1 m0.4s, m3.4s;
+
+/*
+ * Compute four SHA-1 rounds by adding W0:W1:W2:W3 + K0:K1:K2:K3, then
+ * computing the remainder of each round (including the shuffle) via
+ * sha1{c,p,m}/sha1h.
+ */
+
+#define sha1_round1(h0, h1, w, k) \
+	add	tmp0.4s, w.4s, k.4s;		/* Tt = Wt + Kt */	\
+	mov	tmp1, h0.s[0];						\
+	sha1c	h0, h1, tmp0.4s;					\
+	sha1h	h1, tmp1;
+
+#define sha1_round2(h0, h1, w, k) \
+	add	tmp0.4s, w.4s, k.4s;		/* Tt = Wt + Kt */	\
+	mov	tmp1, h0.s[0];						\
+	sha1p	h0, h1, tmp0.4s;					\
+	sha1h	h1, tmp1;
+
+#define sha1_round3(h0, h1, w, k) \
+	add	tmp0.4s, w.4s, k.4s;		/* Tt = Wt + Kt */	\
+	mov	tmp1, h0.s[0];						\
+	sha1m	h0, h1, tmp0.4s;					\
+	sha1h	h1, tmp1;
+
+#define sha1_round4(h0, h1, w, k) \
+	add	tmp0.4s, w.4s, k.4s;		/* Tt = Wt + Kt */	\
+	mov	tmp1, h0.s[0];						\
+	sha1p	h0, h1, tmp0.4s;					\
+	sha1h	h1, tmp1;
+
+.arch	armv8-a+sha2
+
+.text
+
+/*
+ * void sha1_block_ce(SHA256_CTX *ctx, const void *in, size_t num);
+ *
+ * Standard ARM ABI: x0 = ctx, x1 = in, x2 = num
+ */
+.globl	sha1_block_ce
+.type   sha1_block_ce,@function
+sha1_block_ce:
+
+	/*
+	 * Load SHA-1 round constants.
+	 */
+
+	/* Round 1 - 0x5a827999 */
+	movz	tmp2, #0x5a82, lsl #16
+	movk	tmp2, #0x7999
+	dup	k0.4s, tmp2
+
+	/* Round 2 - 0x6ed9eba1 */
+	movz	tmp2, #0x6ed9, lsl #16
+	movk	tmp2, #0xeba1
+	dup	k1.4s, tmp2
+
+	/* Round 3 - 0x8f1bbcdc */
+	movz	tmp2, #0x8f1b, lsl #16
+	movk	tmp2, #0xbcdc
+	dup	k2.4s, tmp2
+
+	/* Round 4 - 0xca62c1d6 */
+	movz	tmp2, #0xca62, lsl #16
+	movk	tmp2, #0xc1d6
+	dup	k3.4s, tmp2
+
+	/* Load current hash state from context (hc0 = a:b:c:d, hc1 = e). */
+	ld1	{hc0.4s}, [ctx]
+	ldr	hc1s, [ctx, #(4*4)]
+
+block_loop:
+	/* Copy current hash state. */
+	mov	hs0.4s, hc0.4s
+	mov	hs1s, hc1.s[0]
+
+	/* Load and byte swap message schedule. */
+	ld1	{w0.16b, w1.16b, w2.16b, w3.16b}, [in], #64
+	rev32	w0.16b, w0.16b
+	rev32	w1.16b, w1.16b
+	rev32	w2.16b, w2.16b
+	rev32	w3.16b, w3.16b
+
+	/* Rounds 0 through 15 (four rounds at a time). */
+	sha1_round1(hs0, hs1s, w0, k0)
+	sha1_round1(hs0, hs1s, w1, k0)
+	sha1_round1(hs0, hs1s, w2, k0)
+	sha1_round1(hs0, hs1s, w3, k0)
+
+	/* Rounds 16 through 31 (four rounds at a time). */
+	sha1_message_schedule_update(w0, w1, w2, w3)
+	sha1_message_schedule_update(w1, w2, w3, w0)
+	sha1_message_schedule_update(w2, w3, w0, w1)
+	sha1_message_schedule_update(w3, w0, w1, w2)
+
+	sha1_round1(hs0, hs1s, w0, k0)
+	sha1_round2(hs0, hs1s, w1, k1)
+	sha1_round2(hs0, hs1s, w2, k1)
+	sha1_round2(hs0, hs1s, w3, k1)
+
+	/* Rounds 32 through 47 (four rounds at a time). */
+	sha1_message_schedule_update(w0, w1, w2, w3)
+	sha1_message_schedule_update(w1, w2, w3, w0)
+	sha1_message_schedule_update(w2, w3, w0, w1)
+	sha1_message_schedule_update(w3, w0, w1, w2)
+
+	sha1_round2(hs0, hs1s, w0, k1)
+	sha1_round2(hs0, hs1s, w1, k1)
+	sha1_round3(hs0, hs1s, w2, k2)
+	sha1_round3(hs0, hs1s, w3, k2)
+
+	/* Rounds 48 through 63 (four rounds at a time). */
+	sha1_message_schedule_update(w0, w1, w2, w3)
+	sha1_message_schedule_update(w1, w2, w3, w0)
+	sha1_message_schedule_update(w2, w3, w0, w1)
+	sha1_message_schedule_update(w3, w0, w1, w2)
+
+	sha1_round3(hs0, hs1s, w0, k2)
+	sha1_round3(hs0, hs1s, w1, k2)
+	sha1_round3(hs0, hs1s, w2, k2)
+	sha1_round4(hs0, hs1s, w3, k3)
+
+	/* Rounds 64 through 79 (four rounds at a time). */
+	sha1_message_schedule_update(w0, w1, w2, w3)
+	sha1_message_schedule_update(w1, w2, w3, w0)
+	sha1_message_schedule_update(w2, w3, w0, w1)
+	sha1_message_schedule_update(w3, w0, w1, w2)
+
+	sha1_round4(hs0, hs1s, w0, k3)
+	sha1_round4(hs0, hs1s, w1, k3)
+	sha1_round4(hs0, hs1s, w2, k3)
+	sha1_round4(hs0, hs1s, w3, k3)
+
+	/* Add intermediate state to hash state. */
+	add	hc0.4s, hc0.4s, hs0.4s
+	add	hc1.4s, hc1.4s, hs1.4s
+
+	sub	num, num, #1
+	cbnz	num, block_loop
+
+	/* Store hash state to context. */
+	st1	{hc0.4s}, [ctx]
+	str	hc1s, [ctx, #(4*4)]
+
+	ret
-- 
cgit v1.2.3-55-g6feb