From d8f769ca48f14cb8455dfa8f2334c3c683502fe4 Mon Sep 17 00:00:00 2001
From: jsing <>
Date: Fri, 6 Dec 2024 11:57:18 +0000
Subject: Provide a SHA-1 assembly implementation for amd64 using SHA-NI.

This provides a SHA-1 assembly implementation for amd64, which uses
the Intel SHA Extensions (aka SHA New Instructions or SHA-NI). This
provides a 2-2.5x performance gain on some Intel CPUs and many AMD CPUs.

ok tb@
---
 src/lib/libcrypto/arch/amd64/Makefile.inc |   3 +-
 src/lib/libcrypto/sha/sha1_amd64.c        |   8 +-
 src/lib/libcrypto/sha/sha1_amd64_shani.S  | 170 ++++++++++++++++++++++++++++++
 3 files changed, 179 insertions(+), 2 deletions(-)
 create mode 100644 src/lib/libcrypto/sha/sha1_amd64_shani.S

(limited to 'src')

diff --git a/src/lib/libcrypto/arch/amd64/Makefile.inc b/src/lib/libcrypto/arch/amd64/Makefile.inc
index 33c7dbba26..f8f829cca1 100644
--- a/src/lib/libcrypto/arch/amd64/Makefile.inc
+++ b/src/lib/libcrypto/arch/amd64/Makefile.inc
@@ -1,4 +1,4 @@
-# $OpenBSD: Makefile.inc,v 1.34 2024/12/04 13:13:33 jsing Exp $
+# $OpenBSD: Makefile.inc,v 1.35 2024/12/06 11:57:17 jsing Exp $
 
 # amd64-specific libcrypto build rules
 
@@ -51,6 +51,7 @@ SSLASM+= rc4 rc4-x86_64
 CFLAGS+= -DSHA1_ASM
 SRCS+= sha1_amd64.c
 SRCS+= sha1_amd64_generic.S
+SRCS+= sha1_amd64_shani.S
 CFLAGS+= -DSHA256_ASM
 SRCS+= sha256_amd64.c
 SRCS+= sha256_amd64_generic.S
diff --git a/src/lib/libcrypto/sha/sha1_amd64.c b/src/lib/libcrypto/sha/sha1_amd64.c
index b3d4ab1263..2976cc7e6e 100644
--- a/src/lib/libcrypto/sha/sha1_amd64.c
+++ b/src/lib/libcrypto/sha/sha1_amd64.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: sha1_amd64.c,v 1.1 2024/12/04 13:13:33 jsing Exp $ */
+/* $OpenBSD: sha1_amd64.c,v 1.2 2024/12/06 11:57:18 jsing Exp $ */
 /*
  * Copyright (c) 2024 Joel Sing <jsing@openbsd.org>
  *
@@ -20,9 +20,15 @@
 #include "crypto_arch.h"
 
 void sha1_block_generic(SHA_CTX *ctx, const void *in, size_t num);
+void sha1_block_shani(SHA_CTX *ctx, const void *in, size_t num);
 
 void
 sha1_block_data_order(SHA_CTX *ctx, const void *in, size_t num)
 {
+	if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_SHA) != 0) {
+		sha1_block_shani(ctx, in, num);
+		return;
+	}
+
 	sha1_block_generic(ctx, in, num);
 }
diff --git a/src/lib/libcrypto/sha/sha1_amd64_shani.S b/src/lib/libcrypto/sha/sha1_amd64_shani.S
new file mode 100644
index 0000000000..d7699d10f1
--- /dev/null
+++ b/src/lib/libcrypto/sha/sha1_amd64_shani.S
@@ -0,0 +1,170 @@
+/* $OpenBSD: sha1_amd64_shani.S,v 1.1 2024/12/06 11:57:18 jsing Exp $ */
+/*
+ * Copyright (c) 2024 Joel Sing <jsing@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifdef __CET__
+#include <cet.h>
+#else
+#define _CET_ENDBR
+#endif
+
+/*
+ * SHA-1 implementation using the Intel SHA extensions:
+ *
+ * https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html
+ */
+
+#define	ctx		%rdi
+#define	in		%rsi
+#define	num		%rdx
+
+#define	end		%rbx
+
+#define	xabcd_save	%xmm0
+#define	xe_save		%xmm1
+
+#define	xabcd		%xmm2
+#define	xe0		%xmm3
+#define	xe1		%xmm4
+
+#define	xmsg0		%xmm5
+#define	xmsg1		%xmm6
+#define	xmsg2		%xmm7
+#define	xmsg3		%xmm8
+
+#define	xshufmask	%xmm9
+
+
+#define sha1_message_schedule_load(idx, m, xmsg) \
+	movdqu	(idx*16)(m), xmsg;					\
+	pshufb	xshufmask, xmsg;
+
+#define sha1_message_schedule_update(xm0, xm1, xm2, xm3) \
+	sha1msg1 xm1, xm0;						\
+	pxor	xm2, xm0;						\
+	sha1msg2 xm3, xm0;
+
+#define sha1_shani_round(fn, xmsg, xe, xe_next) \
+	sha1nexte xmsg, xe;						\
+	movdqa	xabcd, xe_next;						\
+	sha1rnds4 fn, xe, xabcd;
+
+#define sha1_shani_round_load(fn, idx, m, xmsg, xe, xe_next) \
+	sha1_message_schedule_load(idx, m, xmsg);			\
+	sha1_shani_round(fn, xmsg, xe, xe_next);
+
+#define sha1_shani_round_update(fn, xm0, xm1, xm2, xm3, xe, xe_next) \
+	sha1_message_schedule_update(xm0, xm1, xm2, xm3);		\
+	sha1_shani_round(fn, xm0, xe, xe_next);
+
+
+.text
+
+/*
+ * void sha1_block_shani(SHA_CTX *ctx, const void *in, size_t num);
+ *
+ * Standard x86-64 ABI: rdi = ctx, rsi = in, rdx = num
+ */
+.align 16
+.globl	sha1_block_shani
+.type	sha1_block_shani,@function
+sha1_block_shani:
+	_CET_ENDBR
+
+	/* Save callee save registers. */
+	pushq	%rbx
+
+	/* Compute end of message. */
+	shlq	$6, num
+	leaq	(in, num, 1), end
+
+	/* Load endian shuffle mask. */
+	movdqa	shufmask(%rip), xshufmask
+
+	/* Load current hash state from context. */
+	movdqu	(0*16)(ctx), xabcd
+	pshufd	$0x1b, xabcd, xabcd	/* dcba -> abcd */
+	pxor	xe0, xe0
+	pinsrd	$3, (1*16)(ctx), xe0	/* e */
+
+	jmp	.Lshani_block_loop
+
+.align 16
+.Lshani_block_loop:
+	/* Save state for accumulation. */
+	movdqa	xabcd, xabcd_save
+	movdqa	xe0, xe_save
+
+	/* Rounds 0 through 15 (four rounds at a time). */
+	sha1_message_schedule_load(0, in, xmsg0);
+	paddd	xmsg0, xe0
+	movdqa	xabcd, xe1
+	sha1rnds4 $0, xe0, xabcd
+
+	sha1_shani_round_load($0, 1, in, xmsg1, xe1, xe0);
+	sha1_shani_round_load($0, 2, in, xmsg2, xe0, xe1);
+	sha1_shani_round_load($0, 3, in, xmsg3, xe1, xe0);
+
+	/* Rounds 16 through 79 (four rounds at a time). */
+	sha1_shani_round_update($0, xmsg0, xmsg1, xmsg2, xmsg3, xe0, xe1)
+	sha1_shani_round_update($1, xmsg1, xmsg2, xmsg3, xmsg0, xe1, xe0)
+	sha1_shani_round_update($1, xmsg2, xmsg3, xmsg0, xmsg1, xe0, xe1)
+	sha1_shani_round_update($1, xmsg3, xmsg0, xmsg1, xmsg2, xe1, xe0)
+
+	sha1_shani_round_update($1, xmsg0, xmsg1, xmsg2, xmsg3, xe0, xe1)
+	sha1_shani_round_update($1, xmsg1, xmsg2, xmsg3, xmsg0, xe1, xe0)
+	sha1_shani_round_update($2, xmsg2, xmsg3, xmsg0, xmsg1, xe0, xe1)
+	sha1_shani_round_update($2, xmsg3, xmsg0, xmsg1, xmsg2, xe1, xe0)
+
+	sha1_shani_round_update($2, xmsg0, xmsg1, xmsg2, xmsg3, xe0, xe1)
+	sha1_shani_round_update($2, xmsg1, xmsg2, xmsg3, xmsg0, xe1, xe0)
+	sha1_shani_round_update($2, xmsg2, xmsg3, xmsg0, xmsg1, xe0, xe1)
+	sha1_shani_round_update($3, xmsg3, xmsg0, xmsg1, xmsg2, xe1, xe0)
+
+	sha1_shani_round_update($3, xmsg0, xmsg1, xmsg2, xmsg3, xe0, xe1)
+	sha1_shani_round_update($3, xmsg1, xmsg2, xmsg3, xmsg0, xe1, xe0)
+	sha1_shani_round_update($3, xmsg2, xmsg3, xmsg0, xmsg1, xe0, xe1)
+	sha1_shani_round_update($3, xmsg3, xmsg0, xmsg1, xmsg2, xe1, xe0)
+
+	/* Accumulate hash state. */
+	paddd	xabcd_save, xabcd
+	sha1nexte xe_save, xe0
+
+	addq	$64, in
+	cmpq	end, in
+	jb	.Lshani_block_loop
+
+	/* Update stored hash context. */
+	pshufd	$0x1b, xabcd, xabcd	/* abcd -> dcba */
+	movdqu	xabcd, (0*16)(ctx)
+	pextrd	$3, xe0, (1*16)(ctx)	/* e */
+
+	/* Restore callee save registers. */
+	popq	%rbx
+
+	ret
+
+.rodata
+
+/*
+ * Shuffle mask - byte reversal for little endian to big endian word conversion,
+ * and reordering to abcd.
+ */
+.align	16
+.type	shufmask,@object
+shufmask:
+.octa	0x000102030405060708090a0b0c0d0e0f
+.size	shufmask,.-shufmask
-- 
cgit v1.2.3-55-g6feb