Provide a replacement assembly implementation for SHA-1 on amd64.

As already done for SHA-256 and SHA-512, replace the perlasm-generated SHA-1 assembly implementation with one that is actually readable. Call the assembly implementation from a C wrapper that can, in the future, dispatch to alternate implementations. On a modern CPU this is around 5% faster than the base implementation generated by sha1-x86_64.pl; however, it is around 15% slower than the excessively complex SSSE3/AVX version that is also generated by the same script (a SHA-NI version will greatly outperform this and is much cleaner/simpler). ok tb@
author: jsing <> 2024-12-04 13:13:33 +0000
committer: jsing <> 2024-12-04 13:13:33 +0000
commit: 1c3ce6cc8e538cecc33ed58f89d969af28952dea (patch)
tree: 6adf1634c082704fca00fea488f843d1345662b2 /src
parent: 54b7e03a99e6dbd79315380653c1bf578c8444b0 (diff)
download: openbsd-1c3ce6cc8e538cecc33ed58f89d969af28952dea.tar.gz
openbsd-1c3ce6cc8e538cecc33ed58f89d969af28952dea.tar.bz2
openbsd-1c3ce6cc8e538cecc33ed58f89d969af28952dea.zip
3 files changed, 345 insertions, 2 deletions
diff --git a/src/lib/libcrypto/arch/amd64/Makefile.inc b/src/lib/libcrypto/arch/amd64/Makefile.inc
index fe22385633..33c7dbba26 100644
--- a/src/lib/libcrypto/arch/amd64/Makefile.inc
+++ b/src/lib/libcrypto/arch/amd64/Makefile.inc
@@ -1,4 +1,4 @@
-# $OpenBSD: Makefile.inc,v 1.33 2024/11/16 15:31:36 jsing Exp $
+# $OpenBSD: Makefile.inc,v 1.34 2024/12/04 13:13:33 jsing Exp $
 # amd64-specific libcrypto build rules
@@ -49,7 +49,8 @@ SSLASM+= rc4 rc4-x86_64
 # ripemd
 # sha
 CFLAGS+= -DSHA1_ASM
-SSLASM+= sha sha1-x86_64
+SRCS+= sha1_amd64.c
+SRCS+= sha1_amd64_generic.S
 CFLAGS+= -DSHA256_ASM
 SRCS+= sha256_amd64.c
 SRCS+= sha256_amd64_generic.S
diff --git a/src/lib/libcrypto/sha/sha1_amd64.c b/src/lib/libcrypto/sha/sha1_amd64.c
new file mode 100644
index 0000000000..b3d4ab1263
--- /dev/null
+++ b/src/lib/libcrypto/sha/sha1_amd64.c
@@ -0,0 +1,28 @@
+/* $OpenBSD: sha1_amd64.c,v 1.1 2024/12/04 13:13:33 jsing Exp $ */
+/*
+ * Copyright (c) 2024 Joel Sing <jsing@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+#include <openssl/sha.h>
+#include "crypto_arch.h"
+void sha1_block_generic(SHA_CTX *ctx, const void *in, size_t num);
+void
+sha1_block_data_order(SHA_CTX *ctx, const void *in, size_t num)
+{
+        sha1_block_generic(ctx, in, num);
+}
diff --git a/src/lib/libcrypto/sha/sha1_amd64_generic.S b/src/lib/libcrypto/sha/sha1_amd64_generic.S
new file mode 100644
index 0000000000..d3e184dbca
--- /dev/null
+++ b/src/lib/libcrypto/sha/sha1_amd64_generic.S
@@ -0,0 +1,314 @@
+/* $OpenBSD: sha1_amd64_generic.S,v 1.1 2024/12/04 13:13:33 jsing Exp $ */
+/*
+ * Copyright (c) 2024 Joel Sing <jsing@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+#ifdef __CET__
+#include <cet.h>
+#else
+#define _CET_ENDBR
+#endif
+#define ctx             %rdi
+#define in              %rsi
+#define num             %rdx
+#define end             %rbp
+#define hs0             %r8d
+#define hs1             %r9d
+#define hs2             %r10d
+#define hs3             %r11d
+#define hs4             %r12d
+#define tmp0            %eax
+#define tmp1            %ebx
+#define tmp2            %ecx
+#define tmp3            %edx
+/*
+ * Load message into wt, storing a copy in the message schedule:
+ *
+ *  Wt = Mt
+ */
+#define sha1_message_schedule_load(idx, m, w, wt) \
+        movl    ((idx&0xf)*4)(m), wt;                           \
+        bswapl  wt;                                             \
+        movl    wt, ((idx&0xf)*4)(w);
+/*
+ * Update message schedule and return current value in wt:
+ *
+ *  W0 = rol(W13 ^ W8 ^ W2 ^ W0, 1)
+ */
+#define sha1_message_schedule_update(idx, w, wt) \
+        movl    (((idx-3)&0xf)*4)(w), wt;       /* W13 */       \
+        xorl    (((idx-8)&0xf)*4)(w), wt;       /* W8 */        \
+        xorl    (((idx-14)&0xf)*4)(w), wt;      /* W2 */        \
+        xorl    (((idx)&0xf)*4)(w), wt;         /* W0 */        \
+        roll    $1, wt;                                         \
+        \
+        movl    wt, ((idx&0xf)*4)(w);
+/*
+ * Compute a SHA-1 round without logic function:
+ *
+ *  T = rol(a, 5) + e + Kt + Wt
+ *
+ * The caller is required to compute the appropriate logic function
+ * (Ch, Maj, Parity) and add it to e.
+ *
+ * Upon completion b = rol(b, 30), e = T, pending rotation.
+ */
+#define sha1_round(a, b, c, d, e, kt, wt) \
+        leal    kt(wt, e, 1), e;                /* Kt + Wt */   \
+        \
+        movl    a, tmp1;                        /* rol(a, 5) */ \
+        roll    $5, tmp1;                                       \
+        addl    tmp1, e;                                        \
+        \
+        roll    $30, b;                         /* rol(b, 30) */
+/*
+ * Compute a SHA-1 round with Ch:
+ *
+ *  T = rol(a, 5) + Ch(b, c, d) + e + Kt + Wt
+ *
+ *  Ch(x, y, z) = (x & y) ^ (~x & z) = ((y ^ z) & x) ^ z
+ *
+ * Upon completion b = rol(b, 30), e = T, pending rotation.
+ */
+#define sha1_round_ch(a, b, c, d, e, kt, wt) \
+        movl    c, tmp2;                        /* Ch */        \
+        xorl    d, tmp2;                        /* Ch */        \
+        andl    b, tmp2;                        /* Ch */        \
+        xorl    d, tmp2;                        /* Ch */        \
+        addl    tmp2, e;                        /* Ch */        \
+        \
+        sha1_round(a, b, c, d, e, kt, wt);
+/*
+ * Compute a SHA-1 round with Parity:
+ *
+ *  T = rol(a, 5) + Parity(b, c, d) + e + Kt + Wt
+ *
+ *  Parity(x, y, z) = x ^ y ^ z
+ *
+ * Upon completion b = rol(b, 30), e = T, pending rotation.
+ */
+#define sha1_round_parity(a, b, c, d, e, kt, wt) \
+        movl    b, tmp2;                        /* Parity */    \
+        xorl    c, tmp2;                        /* Parity */    \
+        xorl    d, tmp2;                        /* Parity */    \
+        addl    tmp2, e;                        /* Parity */    \
+        \
+        sha1_round(a, b, c, d, e, kt, wt);
+/*
+ * Compute a SHA-1 round with Maj:
+ *
+ *  T = rol(a, 5) + Maj(b, c, d) + e + Kt + Wt
+ *
+ *  Maj(x, y, z) = (x & y) ^ (x & z) ^ (y & z) = ((y ^ z) & x) ^ (y & z)
+ *
+ * Upon completion b = rol(b, 30), e = T, pending rotation.
+ */
+#define sha1_round_maj(a, b, c, d, e, kt, wt) \
+        movl    c, tmp2;                        /* Maj */       \
+        xorl    d, tmp2;                        /* Maj */       \
+        andl    b, tmp2;                        /* Maj */       \
+        movl    c, tmp3;                        /* Maj */       \
+        andl    d, tmp3;                        /* Maj */       \
+        xorl    tmp2, tmp3;                     /* Maj */       \
+        addl    tmp3, e;                        /* Maj */       \
+        \
+        sha1_round(a, b, c, d, e, kt, wt);
+#define sha1_round1_load(idx, a, b, c, d, e) \
+        sha1_message_schedule_load(idx, in, %rsp, tmp0) \
+        sha1_round_ch(a, b, c, d, e, 0x5a827999, tmp0)
+#define sha1_round1_update(idx, a, b, c, d, e) \
+        sha1_message_schedule_update(idx, %rsp, tmp0) \
+        sha1_round_ch(a, b, c, d, e, 0x5a827999, tmp0)
+#define sha1_round2_update(idx, a, b, c, d, e) \
+        sha1_message_schedule_update(idx, %rsp, tmp0) \
+        sha1_round_parity(a, b, c, d, e, 0x6ed9eba1, tmp0)
+#define sha1_round3_update(idx, a, b, c, d, e) \
+        sha1_message_schedule_update(idx, %rsp, tmp0) \
+        sha1_round_maj(a, b, c, d, e, 0x8f1bbcdc, tmp0)
+#define sha1_round4_update(idx, a, b, c, d, e) \
+        sha1_message_schedule_update(idx, %rsp, tmp0) \
+        sha1_round_parity(a, b, c, d, e, 0xca62c1d6, tmp0)
+.text
+/*
+ * void sha1_block_generic(SHA1_CTX *ctx, const void *in, size_t num);
+ *
+ * Standard x86-64 ABI: rdi = ctx, rsi = in, rdx = num
+ */
+.align 16
+.globl  sha1_block_generic
+.type   sha1_block_generic,@function
+sha1_block_generic:
+        _CET_ENDBR
+        /* Save callee save registers. */
+        pushq   %rbx
+        pushq   %rbp
+        pushq   %r12
+        /* Allocate space for message schedule. */
+        movq    %rsp, %rax
+        subq    $(64+1*8), %rsp
+        andq    $~63, %rsp
+        movq    %rax, (64+0*8)(%rsp)
+        /* Compute and store end of message. */
+        shlq    $6, num
+        leaq    (in, num, 1), %rbp
+        /* Load current hash state from context. */
+        movl    (0*4)(ctx), hs0
+        movl    (1*4)(ctx), hs1
+        movl    (2*4)(ctx), hs2
+        movl    (3*4)(ctx), hs3
+        movl    (4*4)(ctx), hs4
+        jmp     .Lblock_loop
+.align 16
+.Lblock_loop:
+        /* Round 0 through 15. */
+        sha1_round1_load(0, hs0, hs1, hs2, hs3, hs4)
+        sha1_round1_load(1, hs4, hs0, hs1, hs2, hs3)
+        sha1_round1_load(2, hs3, hs4, hs0, hs1, hs2)
+        sha1_round1_load(3, hs2, hs3, hs4, hs0, hs1)
+        sha1_round1_load(4, hs1, hs2, hs3, hs4, hs0)
+        sha1_round1_load(5, hs0, hs1, hs2, hs3, hs4)
+        sha1_round1_load(6, hs4, hs0, hs1, hs2, hs3)
+        sha1_round1_load(7, hs3, hs4, hs0, hs1, hs2)
+        sha1_round1_load(8, hs2, hs3, hs4, hs0, hs1)
+        sha1_round1_load(9, hs1, hs2, hs3, hs4, hs0)
+        sha1_round1_load(10, hs0, hs1, hs2, hs3, hs4)
+        sha1_round1_load(11, hs4, hs0, hs1, hs2, hs3)
+        sha1_round1_load(12, hs3, hs4, hs0, hs1, hs2)
+        sha1_round1_load(13, hs2, hs3, hs4, hs0, hs1)
+        sha1_round1_load(14, hs1, hs2, hs3, hs4, hs0)
+        sha1_round1_load(15, hs0, hs1, hs2, hs3, hs4)
+        /* Round 16 through 31. */
+        sha1_round1_update(16, hs4, hs0, hs1, hs2, hs3);
+        sha1_round1_update(17, hs3, hs4, hs0, hs1, hs2);
+        sha1_round1_update(18, hs2, hs3, hs4, hs0, hs1);
+        sha1_round1_update(19, hs1, hs2, hs3, hs4, hs0);
+        sha1_round2_update(20, hs0, hs1, hs2, hs3, hs4);
+        sha1_round2_update(21, hs4, hs0, hs1, hs2, hs3);
+        sha1_round2_update(22, hs3, hs4, hs0, hs1, hs2);
+        sha1_round2_update(23, hs2, hs3, hs4, hs0, hs1);
+        sha1_round2_update(24, hs1, hs2, hs3, hs4, hs0);
+        sha1_round2_update(25, hs0, hs1, hs2, hs3, hs4);
+        sha1_round2_update(26, hs4, hs0, hs1, hs2, hs3);
+        sha1_round2_update(27, hs3, hs4, hs0, hs1, hs2);
+        sha1_round2_update(28, hs2, hs3, hs4, hs0, hs1);
+        sha1_round2_update(29, hs1, hs2, hs3, hs4, hs0);
+        sha1_round2_update(30, hs0, hs1, hs2, hs3, hs4);
+        sha1_round2_update(31, hs4, hs0, hs1, hs2, hs3);
+        /* Round 32 through 47. */
+        sha1_round2_update(32, hs3, hs4, hs0, hs1, hs2);
+        sha1_round2_update(33, hs2, hs3, hs4, hs0, hs1);
+        sha1_round2_update(34, hs1, hs2, hs3, hs4, hs0);
+        sha1_round2_update(35, hs0, hs1, hs2, hs3, hs4);
+        sha1_round2_update(36, hs4, hs0, hs1, hs2, hs3);
+        sha1_round2_update(37, hs3, hs4, hs0, hs1, hs2);
+        sha1_round2_update(38, hs2, hs3, hs4, hs0, hs1);
+        sha1_round2_update(39, hs1, hs2, hs3, hs4, hs0);
+        sha1_round3_update(40, hs0, hs1, hs2, hs3, hs4);
+        sha1_round3_update(41, hs4, hs0, hs1, hs2, hs3);
+        sha1_round3_update(42, hs3, hs4, hs0, hs1, hs2);
+        sha1_round3_update(43, hs2, hs3, hs4, hs0, hs1);
+        sha1_round3_update(44, hs1, hs2, hs3, hs4, hs0);
+        sha1_round3_update(45, hs0, hs1, hs2, hs3, hs4);
+        sha1_round3_update(46, hs4, hs0, hs1, hs2, hs3);
+        sha1_round3_update(47, hs3, hs4, hs0, hs1, hs2);
+        /* Round 48 through 63. */
+        sha1_round3_update(48, hs2, hs3, hs4, hs0, hs1);
+        sha1_round3_update(49, hs1, hs2, hs3, hs4, hs0);
+        sha1_round3_update(50, hs0, hs1, hs2, hs3, hs4);
+        sha1_round3_update(51, hs4, hs0, hs1, hs2, hs3);
+        sha1_round3_update(52, hs3, hs4, hs0, hs1, hs2);
+        sha1_round3_update(53, hs2, hs3, hs4, hs0, hs1);
+        sha1_round3_update(54, hs1, hs2, hs3, hs4, hs0);
+        sha1_round3_update(55, hs0, hs1, hs2, hs3, hs4);
+        sha1_round3_update(56, hs4, hs0, hs1, hs2, hs3);
+        sha1_round3_update(57, hs3, hs4, hs0, hs1, hs2);
+        sha1_round3_update(58, hs2, hs3, hs4, hs0, hs1);
+        sha1_round3_update(59, hs1, hs2, hs3, hs4, hs0);
+        sha1_round4_update(60, hs0, hs1, hs2, hs3, hs4);
+        sha1_round4_update(61, hs4, hs0, hs1, hs2, hs3);
+        sha1_round4_update(62, hs3, hs4, hs0, hs1, hs2);
+        sha1_round4_update(63, hs2, hs3, hs4, hs0, hs1);
+        /* Round 64 through 79. */
+        sha1_round4_update(64, hs1, hs2, hs3, hs4, hs0);
+        sha1_round4_update(65, hs0, hs1, hs2, hs3, hs4);
+        sha1_round4_update(66, hs4, hs0, hs1, hs2, hs3);
+        sha1_round4_update(67, hs3, hs4, hs0, hs1, hs2);
+        sha1_round4_update(68, hs2, hs3, hs4, hs0, hs1);
+        sha1_round4_update(69, hs1, hs2, hs3, hs4, hs0);
+        sha1_round4_update(70, hs0, hs1, hs2, hs3, hs4);
+        sha1_round4_update(71, hs4, hs0, hs1, hs2, hs3);
+        sha1_round4_update(72, hs3, hs4, hs0, hs1, hs2);
+        sha1_round4_update(73, hs2, hs3, hs4, hs0, hs1);
+        sha1_round4_update(74, hs1, hs2, hs3, hs4, hs0);
+        sha1_round4_update(75, hs0, hs1, hs2, hs3, hs4);
+        sha1_round4_update(76, hs4, hs0, hs1, hs2, hs3);
+        sha1_round4_update(77, hs3, hs4, hs0, hs1, hs2);
+        sha1_round4_update(78, hs2, hs3, hs4, hs0, hs1);
+        sha1_round4_update(79, hs1, hs2, hs3, hs4, hs0);
+        /* Add intermediate state to hash state. */
+        addl    (0*4)(ctx), hs0
+        addl    (1*4)(ctx), hs1
+        addl    (2*4)(ctx), hs2
+        addl    (3*4)(ctx), hs3
+        addl    (4*4)(ctx), hs4
+        /* Store new hash state to context. */
+        movl    hs0, (0*4)(ctx)
+        movl    hs1, (1*4)(ctx)
+        movl    hs2, (2*4)(ctx)
+        movl    hs3, (3*4)(ctx)
+        movl    hs4, (4*4)(ctx)
+        addq    $64, in
+        cmpq    end, in
+        jb      .Lblock_loop
+        movq    (64+0*8)(%rsp), %rsp
+        /* Restore callee save registers. */
+        popq    %r12
+        popq    %rbp
+        popq    %rbx
+        ret
author	jsing <>	2024-12-04 13:13:33 +0000
committer	jsing <>	2024-12-04 13:13:33 +0000
commit	1c3ce6cc8e538cecc33ed58f89d969af28952dea (patch)
tree	6adf1634c082704fca00fea488f843d1345662b2 /src
parent	54b7e03a99e6dbd79315380653c1bf578c8444b0 (diff)
download	openbsd-1c3ce6cc8e538cecc33ed58f89d969af28952dea.tar.gz openbsd-1c3ce6cc8e538cecc33ed58f89d969af28952dea.tar.bz2 openbsd-1c3ce6cc8e538cecc33ed58f89d969af28952dea.zip

diff --git a/src/lib/libcrypto/arch/amd64/Makefile.inc b/src/lib/libcrypto/arch/amd64/Makefile.inc index fe22385633..33c7dbba26 100644 --- a/src/lib/libcrypto/arch/amd64/Makefile.inc +++ b/src/lib/libcrypto/arch/amd64/Makefile.inc
@@ -1,4 +1,4 @@
1	# $OpenBSD: Makefile.inc,v 1.33 2024/11/16 15:31:36 jsing Exp $	1	# $OpenBSD: Makefile.inc,v 1.34 2024/12/04 13:13:33 jsing Exp $
2		2
3	# amd64-specific libcrypto build rules	3	# amd64-specific libcrypto build rules
4		4
@@ -49,7 +49,8 @@ SSLASM+= rc4 rc4-x86_64
49	# ripemd	49	# ripemd
50	# sha	50	# sha
51	CFLAGS+= -DSHA1_ASM	51	CFLAGS+= -DSHA1_ASM
52	SSLASM+= sha sha1-x86_64	52	SRCS+= sha1_amd64.c
		53	SRCS+= sha1_amd64_generic.S
53	CFLAGS+= -DSHA256_ASM	54	CFLAGS+= -DSHA256_ASM
54	SRCS+= sha256_amd64.c	55	SRCS+= sha256_amd64.c
55	SRCS+= sha256_amd64_generic.S	56	SRCS+= sha256_amd64_generic.S


diff --git a/src/lib/libcrypto/sha/sha1_amd64.c b/src/lib/libcrypto/sha/sha1_amd64.c new file mode 100644 index 0000000000..b3d4ab1263 --- /dev/null +++ b/src/lib/libcrypto/sha/sha1_amd64.c
@@ -0,0 +1,28 @@
		1	/* $OpenBSD: sha1_amd64.c,v 1.1 2024/12/04 13:13:33 jsing Exp $ */
		2	/*
		3	* Copyright (c) 2024 Joel Sing <jsing@openbsd.org>
		4	*
		5	* Permission to use, copy, modify, and distribute this software for any
		6	* purpose with or without fee is hereby granted, provided that the above
		7	* copyright notice and this permission notice appear in all copies.
		8	*
		9	* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
		10	* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
		11	* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
		12	* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
		13	* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
		14	* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
		15	* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
		16	*/
		17
		18	#include <openssl/sha.h>
		19
		20	#include "crypto_arch.h"
		21
		22	void sha1_block_generic(SHA_CTX *ctx, const void *in, size_t num);
		23
		24	void
		25	sha1_block_data_order(SHA_CTX *ctx, const void *in, size_t num)
		26	{
		27	sha1_block_generic(ctx, in, num);
		28	}


diff --git a/src/lib/libcrypto/sha/sha1_amd64_generic.S b/src/lib/libcrypto/sha/sha1_amd64_generic.S new file mode 100644 index 0000000000..d3e184dbca --- /dev/null +++ b/src/lib/libcrypto/sha/sha1_amd64_generic.S
@@ -0,0 +1,314 @@
		1	/* $OpenBSD: sha1_amd64_generic.S,v 1.1 2024/12/04 13:13:33 jsing Exp $ */
		2	/*
		3	* Copyright (c) 2024 Joel Sing <jsing@openbsd.org>
		4	*
		5	* Permission to use, copy, modify, and distribute this software for any
		6	* purpose with or without fee is hereby granted, provided that the above
		7	* copyright notice and this permission notice appear in all copies.
		8	*
		9	* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
		10	* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
		11	* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
		12	* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
		13	* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
		14	* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
		15	* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
		16	*/
		17
		18	#ifdef __CET__
		19	#include <cet.h>
		20	#else
		21	#define _CET_ENDBR
		22	#endif
		23
		24	#define ctx %rdi
		25	#define in %rsi
		26	#define num %rdx
		27
		28	#define end %rbp
		29
		30	#define hs0 %r8d
		31	#define hs1 %r9d
		32	#define hs2 %r10d
		33	#define hs3 %r11d
		34	#define hs4 %r12d
		35
		36	#define tmp0 %eax
		37	#define tmp1 %ebx
		38	#define tmp2 %ecx
		39	#define tmp3 %edx
		40
		41	/*
		42	* Load message into wt, storing a copy in the message schedule:
		43	*
		44	* Wt = Mt
		45	*/
		46	#define sha1_message_schedule_load(idx, m, w, wt) \
		47	movl ((idx&0xf)*4)(m), wt; \
		48	bswapl wt; \
		49	movl wt, ((idx&0xf)*4)(w);
		50
		51	/*
		52	* Update message schedule and return current value in wt:
		53	*
		54	* W0 = rol(W13 ^ W8 ^ W2 ^ W0, 1)
		55	*/
		56	#define sha1_message_schedule_update(idx, w, wt) \
		57	movl (((idx-3)&0xf)*4)(w), wt; /* W13 */ \
		58	xorl (((idx-8)&0xf)*4)(w), wt; /* W8 */ \
		59	xorl (((idx-14)&0xf)*4)(w), wt; /* W2 */ \
		60	xorl (((idx)&0xf)*4)(w), wt; /* W0 */ \
		61	roll $1, wt; \
		62	\
		63	movl wt, ((idx&0xf)*4)(w);
		64
		65	/*
		66	* Compute a SHA-1 round without logic function:
		67	*
		68	* T = rol(a, 5) + e + Kt + Wt
		69	*
		70	* The caller is required to compute the appropriate logic function
		71	* (Ch, Maj, Parity) and add it to e.
		72	*
		73	* Upon completion b = rol(b, 30), e = T, pending rotation.
		74	*/
		75	#define sha1_round(a, b, c, d, e, kt, wt) \
		76	leal kt(wt, e, 1), e; /* Kt + Wt */ \
		77	\
		78	movl a, tmp1; /* rol(a, 5) */ \
		79	roll $5, tmp1; \
		80	addl tmp1, e; \
		81	\
		82	roll $30, b; /* rol(b, 30) */
		83
		84	/*
		85	* Compute a SHA-1 round with Ch:
		86	*
		87	* T = rol(a, 5) + Ch(b, c, d) + e + Kt + Wt
		88	*
		89	* Ch(x, y, z) = (x & y) ^ (~x & z) = ((y ^ z) & x) ^ z
		90	*
		91	* Upon completion b = rol(b, 30), e = T, pending rotation.
		92	*/
		93	#define sha1_round_ch(a, b, c, d, e, kt, wt) \
		94	movl c, tmp2; /* Ch */ \
		95	xorl d, tmp2; /* Ch */ \
		96	andl b, tmp2; /* Ch */ \
		97	xorl d, tmp2; /* Ch */ \
		98	addl tmp2, e; /* Ch */ \
		99	\
		100	sha1_round(a, b, c, d, e, kt, wt);
		101
		102	/*
		103	* Compute a SHA-1 round with Parity:
		104	*
		105	* T = rol(a, 5) + Parity(b, c, d) + e + Kt + Wt
		106	*
		107	* Parity(x, y, z) = x ^ y ^ z
		108	*
		109	* Upon completion b = rol(b, 30), e = T, pending rotation.
		110	*/
		111	#define sha1_round_parity(a, b, c, d, e, kt, wt) \
		112	movl b, tmp2; /* Parity */ \
		113	xorl c, tmp2; /* Parity */ \
		114	xorl d, tmp2; /* Parity */ \
		115	addl tmp2, e; /* Parity */ \
		116	\
		117	sha1_round(a, b, c, d, e, kt, wt);
		118
		119	/*
		120	* Compute a SHA-1 round with Maj:
		121	*
		122	* T = rol(a, 5) + Maj(b, c, d) + e + Kt + Wt
		123	*
		124	* Maj(x, y, z) = (x & y) ^ (x & z) ^ (y & z) = ((y ^ z) & x) ^ (y & z)
		125	*
		126	* Upon completion b = rol(b, 30), e = T, pending rotation.
		127	*/
		128	#define sha1_round_maj(a, b, c, d, e, kt, wt) \
		129	movl c, tmp2; /* Maj */ \
		130	xorl d, tmp2; /* Maj */ \
		131	andl b, tmp2; /* Maj */ \
		132	movl c, tmp3; /* Maj */ \
		133	andl d, tmp3; /* Maj */ \
		134	xorl tmp2, tmp3; /* Maj */ \
		135	addl tmp3, e; /* Maj */ \
		136	\
		137	sha1_round(a, b, c, d, e, kt, wt);
		138
		139	#define sha1_round1_load(idx, a, b, c, d, e) \
		140	sha1_message_schedule_load(idx, in, %rsp, tmp0) \
		141	sha1_round_ch(a, b, c, d, e, 0x5a827999, tmp0)
		142
		143	#define sha1_round1_update(idx, a, b, c, d, e) \
		144	sha1_message_schedule_update(idx, %rsp, tmp0) \
		145	sha1_round_ch(a, b, c, d, e, 0x5a827999, tmp0)
		146
		147	#define sha1_round2_update(idx, a, b, c, d, e) \
		148	sha1_message_schedule_update(idx, %rsp, tmp0) \
		149	sha1_round_parity(a, b, c, d, e, 0x6ed9eba1, tmp0)
		150
		151	#define sha1_round3_update(idx, a, b, c, d, e) \
		152	sha1_message_schedule_update(idx, %rsp, tmp0) \
		153	sha1_round_maj(a, b, c, d, e, 0x8f1bbcdc, tmp0)
		154
		155	#define sha1_round4_update(idx, a, b, c, d, e) \
		156	sha1_message_schedule_update(idx, %rsp, tmp0) \
		157	sha1_round_parity(a, b, c, d, e, 0xca62c1d6, tmp0)
		158
		159	.text
		160
		161	/*
		162	* void sha1_block_generic(SHA1_CTX *ctx, const void *in, size_t num);
		163	*
		164	* Standard x86-64 ABI: rdi = ctx, rsi = in, rdx = num
		165	*/
		166	.align 16
		167	.globl sha1_block_generic
		168	.type sha1_block_generic,@function
		169	sha1_block_generic:
		170	_CET_ENDBR
		171
		172	/* Save callee save registers. */
		173	pushq %rbx
		174	pushq %rbp
		175	pushq %r12
		176
		177	/* Allocate space for message schedule. */
		178	movq %rsp, %rax
		179	subq $(64+1*8), %rsp
		180	andq $~63, %rsp
		181	movq %rax, (64+0*8)(%rsp)
		182
		183	/* Compute and store end of message. */
		184	shlq $6, num
		185	leaq (in, num, 1), %rbp
		186
		187	/* Load current hash state from context. */
		188	movl (0*4)(ctx), hs0
		189	movl (1*4)(ctx), hs1
		190	movl (2*4)(ctx), hs2
		191	movl (3*4)(ctx), hs3
		192	movl (4*4)(ctx), hs4
		193
		194	jmp .Lblock_loop
		195
		196	.align 16
		197	.Lblock_loop:
		198
		199	/* Round 0 through 15. */
		200	sha1_round1_load(0, hs0, hs1, hs2, hs3, hs4)
		201	sha1_round1_load(1, hs4, hs0, hs1, hs2, hs3)
		202	sha1_round1_load(2, hs3, hs4, hs0, hs1, hs2)
		203	sha1_round1_load(3, hs2, hs3, hs4, hs0, hs1)
		204	sha1_round1_load(4, hs1, hs2, hs3, hs4, hs0)
		205	sha1_round1_load(5, hs0, hs1, hs2, hs3, hs4)
		206	sha1_round1_load(6, hs4, hs0, hs1, hs2, hs3)
		207	sha1_round1_load(7, hs3, hs4, hs0, hs1, hs2)
		208	sha1_round1_load(8, hs2, hs3, hs4, hs0, hs1)
		209	sha1_round1_load(9, hs1, hs2, hs3, hs4, hs0)
		210	sha1_round1_load(10, hs0, hs1, hs2, hs3, hs4)
		211	sha1_round1_load(11, hs4, hs0, hs1, hs2, hs3)
		212	sha1_round1_load(12, hs3, hs4, hs0, hs1, hs2)
		213	sha1_round1_load(13, hs2, hs3, hs4, hs0, hs1)
		214	sha1_round1_load(14, hs1, hs2, hs3, hs4, hs0)
		215	sha1_round1_load(15, hs0, hs1, hs2, hs3, hs4)
		216
		217	/* Round 16 through 31. */
		218	sha1_round1_update(16, hs4, hs0, hs1, hs2, hs3);
		219	sha1_round1_update(17, hs3, hs4, hs0, hs1, hs2);
		220	sha1_round1_update(18, hs2, hs3, hs4, hs0, hs1);
		221	sha1_round1_update(19, hs1, hs2, hs3, hs4, hs0);
		222	sha1_round2_update(20, hs0, hs1, hs2, hs3, hs4);
		223	sha1_round2_update(21, hs4, hs0, hs1, hs2, hs3);
		224	sha1_round2_update(22, hs3, hs4, hs0, hs1, hs2);
		225	sha1_round2_update(23, hs2, hs3, hs4, hs0, hs1);
		226	sha1_round2_update(24, hs1, hs2, hs3, hs4, hs0);
		227	sha1_round2_update(25, hs0, hs1, hs2, hs3, hs4);
		228	sha1_round2_update(26, hs4, hs0, hs1, hs2, hs3);
		229	sha1_round2_update(27, hs3, hs4, hs0, hs1, hs2);
		230	sha1_round2_update(28, hs2, hs3, hs4, hs0, hs1);
		231	sha1_round2_update(29, hs1, hs2, hs3, hs4, hs0);
		232	sha1_round2_update(30, hs0, hs1, hs2, hs3, hs4);
		233	sha1_round2_update(31, hs4, hs0, hs1, hs2, hs3);
		234
		235	/* Round 32 through 47. */
		236	sha1_round2_update(32, hs3, hs4, hs0, hs1, hs2);
		237	sha1_round2_update(33, hs2, hs3, hs4, hs0, hs1);
		238	sha1_round2_update(34, hs1, hs2, hs3, hs4, hs0);
		239	sha1_round2_update(35, hs0, hs1, hs2, hs3, hs4);
		240	sha1_round2_update(36, hs4, hs0, hs1, hs2, hs3);
		241	sha1_round2_update(37, hs3, hs4, hs0, hs1, hs2);
		242	sha1_round2_update(38, hs2, hs3, hs4, hs0, hs1);
		243	sha1_round2_update(39, hs1, hs2, hs3, hs4, hs0);
		244	sha1_round3_update(40, hs0, hs1, hs2, hs3, hs4);
		245	sha1_round3_update(41, hs4, hs0, hs1, hs2, hs3);
		246	sha1_round3_update(42, hs3, hs4, hs0, hs1, hs2);
		247	sha1_round3_update(43, hs2, hs3, hs4, hs0, hs1);
		248	sha1_round3_update(44, hs1, hs2, hs3, hs4, hs0);
		249	sha1_round3_update(45, hs0, hs1, hs2, hs3, hs4);
		250	sha1_round3_update(46, hs4, hs0, hs1, hs2, hs3);
		251	sha1_round3_update(47, hs3, hs4, hs0, hs1, hs2);
		252
		253	/* Round 48 through 63. */
		254	sha1_round3_update(48, hs2, hs3, hs4, hs0, hs1);
		255	sha1_round3_update(49, hs1, hs2, hs3, hs4, hs0);
		256	sha1_round3_update(50, hs0, hs1, hs2, hs3, hs4);
		257	sha1_round3_update(51, hs4, hs0, hs1, hs2, hs3);
		258	sha1_round3_update(52, hs3, hs4, hs0, hs1, hs2);
		259	sha1_round3_update(53, hs2, hs3, hs4, hs0, hs1);
		260	sha1_round3_update(54, hs1, hs2, hs3, hs4, hs0);
		261	sha1_round3_update(55, hs0, hs1, hs2, hs3, hs4);
		262	sha1_round3_update(56, hs4, hs0, hs1, hs2, hs3);
		263	sha1_round3_update(57, hs3, hs4, hs0, hs1, hs2);
		264	sha1_round3_update(58, hs2, hs3, hs4, hs0, hs1);
		265	sha1_round3_update(59, hs1, hs2, hs3, hs4, hs0);
		266	sha1_round4_update(60, hs0, hs1, hs2, hs3, hs4);
		267	sha1_round4_update(61, hs4, hs0, hs1, hs2, hs3);
		268	sha1_round4_update(62, hs3, hs4, hs0, hs1, hs2);
		269	sha1_round4_update(63, hs2, hs3, hs4, hs0, hs1);
		270
		271	/* Round 64 through 79. */
		272	sha1_round4_update(64, hs1, hs2, hs3, hs4, hs0);
		273	sha1_round4_update(65, hs0, hs1, hs2, hs3, hs4);
		274	sha1_round4_update(66, hs4, hs0, hs1, hs2, hs3);
		275	sha1_round4_update(67, hs3, hs4, hs0, hs1, hs2);
		276	sha1_round4_update(68, hs2, hs3, hs4, hs0, hs1);
		277	sha1_round4_update(69, hs1, hs2, hs3, hs4, hs0);
		278	sha1_round4_update(70, hs0, hs1, hs2, hs3, hs4);
		279	sha1_round4_update(71, hs4, hs0, hs1, hs2, hs3);
		280	sha1_round4_update(72, hs3, hs4, hs0, hs1, hs2);
		281	sha1_round4_update(73, hs2, hs3, hs4, hs0, hs1);
		282	sha1_round4_update(74, hs1, hs2, hs3, hs4, hs0);
		283	sha1_round4_update(75, hs0, hs1, hs2, hs3, hs4);
		284	sha1_round4_update(76, hs4, hs0, hs1, hs2, hs3);
		285	sha1_round4_update(77, hs3, hs4, hs0, hs1, hs2);
		286	sha1_round4_update(78, hs2, hs3, hs4, hs0, hs1);
		287	sha1_round4_update(79, hs1, hs2, hs3, hs4, hs0);
		288
		289	/* Add intermediate state to hash state. */
		290	addl (0*4)(ctx), hs0
		291	addl (1*4)(ctx), hs1
		292	addl (2*4)(ctx), hs2
		293	addl (3*4)(ctx), hs3
		294	addl (4*4)(ctx), hs4
		295
		296	/* Store new hash state to context. */
		297	movl hs0, (0*4)(ctx)
		298	movl hs1, (1*4)(ctx)
		299	movl hs2, (2*4)(ctx)
		300	movl hs3, (3*4)(ctx)
		301	movl hs4, (4*4)(ctx)
		302
		303	addq $64, in
		304	cmpq end, in
		305	jb .Lblock_loop
		306
		307	movq (64+0*8)(%rsp), %rsp
		308
		309	/* Restore callee save registers. */
		310	popq %r12
		311	popq %rbp
		312	popq %rbx
		313
		314	ret