libbb/sha1: optional x86-64 hardware accelerated hashing

function                                             old     new   delta
sha1_process_block64_shaNI                             -     510    +510
sha1_begin                                            52     107     +55
.rodata                                           108285  108301     +16
static.shaNI                                           -       1      +1
------------------------------------------------------------------------------
(add/remove: 4/0 grow/shrink: 2/0 up/down: 582/0)             Total: 582 bytes

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
author: Denys Vlasenko <vda.linux@googlemail.com> 2022-01-07 00:43:59 +0100
committer: Denys Vlasenko <vda.linux@googlemail.com> 2022-01-07 00:43:59 +0100
commit: 711e20ecb85d13f98ba3e2bdcb344ee7534829c4 (patch)
tree: 170fa55e39133e3ba7182fa56d1643e25b55010a
parent: a93668cc4277b14eaff07fcfdef9693c990ec824 (diff)
download: busybox-w32-711e20ecb85d13f98ba3e2bdcb344ee7534829c4.tar.gz
busybox-w32-711e20ecb85d13f98ba3e2bdcb344ee7534829c4.tar.bz2
busybox-w32-711e20ecb85d13f98ba3e2bdcb344ee7534829c4.zip
4 files changed, 270 insertions, 1 deletions
diff --git a/libbb/Config.src b/libbb/Config.src
index c80bee286..708d3b0c8 100644
--- a/libbb/Config.src
+++ b/libbb/Config.src
@@ -63,6 +63,13 @@ config SHA1_SMALL
        1               224  229           654   732
        2,3             200  195           358   380
+config SHA1_HWACCEL
+        bool "SHA1: Use hardware accelerated instructions if possible"
+        default y
+        help
+        On x86, this adds ~590 bytes of code. Throughput
+        is about twice as fast as fully-unrolled generic code.
 config SHA3_SMALL
        int "SHA3: Trade bytes for speed (0:fast, 1:slow)"
        default 1  # all "fast or small" options default to small
diff --git a/libbb/Kbuild.src b/libbb/Kbuild.src
index 19b8aad60..a3db02b6f 100644
--- a/libbb/Kbuild.src
+++ b/libbb/Kbuild.src
@@ -57,6 +57,7 @@ lib-y += make_directory.o
 lib-y += makedev.o
 lib-y += hash_md5_sha.o
 lib-y += hash_md5_sha_x86-64.o
+lib-y += hash_md5_sha_x86-64_shaNI.o
 # Alternative (disabled) MD5 implementation
 #lib-y += hash_md5prime.o
 lib-y += messages.o
diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c
index ee19c1cb7..4c6904b48 100644
--- a/libbb/hash_md5_sha.c
+++ b/libbb/hash_md5_sha.c
@@ -699,7 +699,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM)
 /* in hash_md5_sha_x86-64.S */
 struct ASM_expects_80 { char t[1 - 2*(offsetof(sha1_ctx_t, hash) != 80)]; };
-void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM);
+void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx);
 # else
 /* Fast, fully-unrolled SHA1. +3800 bytes of code on x86.
@@ -1142,6 +1142,28 @@ static void FAST_FUNC sha512_process_block128(sha512_ctx_t *ctx)
 }
 #endif /* NEED_SHA512 */
+#if ENABLE_SHA1_HWACCEL
+# if defined(__GNUC__) && defined(__x86_64__)
+static void cpuid(unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) /* execute CPUID; all four arguments are in/out */
+{
+        asm (
+                "cpuid\n"
+                : "=a"(*eax), /* Output: the four registers are stored back through the pointers */
+                  "=b"(*ebx),
+                  "=c"(*ecx),
+                  "=d"(*edx)
+                : "0"(*eax),  /* Input: matching constraints preload the same four registers from *eax..*edx */
+                  "1"(*ebx),
+                  "2"(*ecx),
+                  "3"(*edx)
+                /* No clobbered registers: every register named here is already declared as an output */
+        );
+}
+struct ASM_expects_80_shaNI { char t[1 - 2*(offsetof(sha1_ctx_t, hash) != 80)]; }; /* compile-time check: array size becomes -1 (an error) unless hash sits at offset 80, which the asm hard-codes */
+void FAST_FUNC sha1_process_block64_shaNI(sha1_ctx_t *ctx); /* defined in hash_md5_sha_x86-64_shaNI.S */
+# endif
+#endif
 void FAST_FUNC sha1_begin(sha1_ctx_t *ctx)
 {
        ctx->hash[0] = 0x67452301;
@@ -1151,6 +1173,20 @@ void FAST_FUNC sha1_begin(sha1_ctx_t *ctx)
        ctx->hash[4] = 0xc3d2e1f0;
        ctx->total64 = 0;
        ctx->process_block = sha1_process_block64;
+#if ENABLE_SHA1_HWACCEL
+# if defined(__GNUC__) && defined(__x86_64__)
+        {
+                static smallint shaNI;
+                if (!shaNI) {
+                        unsigned eax = 7, ebx = ebx, ecx = 0, edx = edx;
+                        cpuid(&eax, &ebx, &ecx, &edx);
+                        shaNI = ((ebx >> 28) & 2) - 1;
+                }
+                if (shaNI > 0)
+                        ctx->process_block = sha1_process_block64_shaNI;
+        }
+# endif
+#endif
 }
 static const uint32_t init256[] ALIGN4 = {
diff --git a/libbb/hash_md5_sha_x86-64_shaNI.S b/libbb/hash_md5_sha_x86-64_shaNI.S
new file mode 100644
index 000000000..473b472f1
--- /dev/null
+++ b/libbb/hash_md5_sha_x86-64_shaNI.S
@@ -0,0 +1,225 @@
+#if ENABLE_SHA1_HWACCEL && defined(__GNUC__) && defined(__x86_64__)
+/* The code is adapted from Linux kernel's source */
+// We use shorter insns, even though they are for "wrong"
+// data type (fp, not int).
+// For Intel, there is no penalty for doing it at all
+// (CPUs which do have such penalty do not support SHA1 insns).
+// For AMD, the penalty is one extra cycle
+// (allegedly: I failed to find measurable difference).
+//#define mova128 movdqa
+#define mova128 movaps
+//#define movu128 movdqu
+#define movu128 movups
+//#define xor128 pxor
+#define xor128 xorps
+//#define shuf128_32 pshufd
+#define shuf128_32 shufps
+#define extr128_32 pextrd
+//#define extr128_32 extractps  # not shorter
+        .section        .text.sha1_process_block64_shaNI,"ax",@progbits
+        .globl  sha1_process_block64_shaNI
+        .hidden sha1_process_block64_shaNI
+        .type   sha1_process_block64_shaNI, @function
+#define ABCD            %xmm0
+#define E0              %xmm1   /* Need two E's b/c they ping pong */
+#define E1              %xmm2
+#define MSG0            %xmm3
+#define MSG1            %xmm4
+#define MSG2            %xmm5
+#define MSG3            %xmm6
+#define SHUF_MASK       %xmm7
+        .balign 8       # allow decoders to fetch at least 2 first insns
+sha1_process_block64_shaNI:
+        /* In: %rdi = ctx (64-byte block at 0(%rdi), hash[0..4] at 80(%rdi)); uses %xmm0-%xmm9. Load initial hash values */
+        xor128          E0, E0
+        movu128         80(%rdi), ABCD
+        pinsrd          $3, 80+4*4(%rdi), E0    # load to upper 32-bit word
+        shuf128_32      $0x1B, ABCD, ABCD       # 00011011: bswap
+        mova128         PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK /* pshufb control that reverses all 16 bytes */
+        /* Save hash values for addition after rounds */
+        mova128         E0, %xmm9
+        mova128         ABCD, %xmm8
+        /* Rounds 0-3 */
+        movu128         0*16(%rdi), MSG0
+        pshufb          SHUF_MASK, MSG0
+                paddd           MSG0, E0
+                mova128         ABCD, E1
+                sha1rnds4       $0, E0, ABCD
+        /* Rounds 4-7 */
+        movu128         1*16(%rdi), MSG1
+        pshufb          SHUF_MASK, MSG1
+                sha1nexte       MSG1, E1
+                mova128         ABCD, E0
+                sha1rnds4       $0, E1, ABCD
+        sha1msg1        MSG1, MSG0
+        /* Rounds 8-11 */
+        movu128         2*16(%rdi), MSG2
+        pshufb          SHUF_MASK, MSG2
+                sha1nexte       MSG2, E0
+                mova128         ABCD, E1
+                sha1rnds4       $0, E0, ABCD
+        sha1msg1        MSG2, MSG1
+        xor128          MSG2, MSG0
+        /* Rounds 12-15 */
+        movu128         3*16(%rdi), MSG3
+        pshufb          SHUF_MASK, MSG3
+                sha1nexte       MSG3, E1
+                mova128         ABCD, E0
+        sha1msg2        MSG3, MSG0
+                sha1rnds4       $0, E1, ABCD
+        sha1msg1        MSG3, MSG2
+        xor128          MSG3, MSG1
+        /* Rounds 16-19 (from here on the message words come from the MSG0..MSG3 schedule, no more loads) */
+                sha1nexte       MSG0, E0
+                mova128         ABCD, E1
+        sha1msg2        MSG0, MSG1
+                sha1rnds4       $0, E0, ABCD
+        sha1msg1        MSG0, MSG3
+        xor128          MSG0, MSG2
+        /* Rounds 20-23 (sha1rnds4 immediate switches to $1) */
+                sha1nexte       MSG1, E1
+                mova128         ABCD, E0
+        sha1msg2        MSG1, MSG2
+                sha1rnds4       $1, E1, ABCD
+        sha1msg1        MSG1, MSG0
+        xor128          MSG1, MSG3
+        /* Rounds 24-27 */
+                sha1nexte       MSG2, E0
+                mova128         ABCD, E1
+        sha1msg2        MSG2, MSG3
+                sha1rnds4       $1, E0, ABCD
+        sha1msg1        MSG2, MSG1
+        xor128          MSG2, MSG0
+        /* Rounds 28-31 */
+                sha1nexte       MSG3, E1
+                mova128         ABCD, E0
+        sha1msg2        MSG3, MSG0
+                sha1rnds4       $1, E1, ABCD
+        sha1msg1        MSG3, MSG2
+        xor128          MSG3, MSG1
+        /* Rounds 32-35 */
+                sha1nexte       MSG0, E0
+                mova128         ABCD, E1
+        sha1msg2        MSG0, MSG1
+                sha1rnds4       $1, E0, ABCD
+        sha1msg1        MSG0, MSG3
+        xor128          MSG0, MSG2
+        /* Rounds 36-39 */
+                sha1nexte       MSG1, E1
+                mova128         ABCD, E0
+        sha1msg2        MSG1, MSG2
+                sha1rnds4       $1, E1, ABCD
+        sha1msg1        MSG1, MSG0
+        xor128          MSG1, MSG3
+        /* Rounds 40-43 (sha1rnds4 immediate switches to $2) */
+                sha1nexte       MSG2, E0
+                mova128         ABCD, E1
+        sha1msg2        MSG2, MSG3
+                sha1rnds4       $2, E0, ABCD
+        sha1msg1        MSG2, MSG1
+        xor128          MSG2, MSG0
+        /* Rounds 44-47 */
+                sha1nexte       MSG3, E1
+                mova128         ABCD, E0
+        sha1msg2        MSG3, MSG0
+                sha1rnds4       $2, E1, ABCD
+        sha1msg1        MSG3, MSG2
+        xor128          MSG3, MSG1
+        /* Rounds 48-51 */
+                sha1nexte       MSG0, E0
+                mova128         ABCD, E1
+        sha1msg2        MSG0, MSG1
+                sha1rnds4       $2, E0, ABCD
+        sha1msg1        MSG0, MSG3
+        xor128          MSG0, MSG2
+        /* Rounds 52-55 */
+                sha1nexte       MSG1, E1
+                mova128         ABCD, E0
+        sha1msg2        MSG1, MSG2
+                sha1rnds4       $2, E1, ABCD
+        sha1msg1        MSG1, MSG0
+        xor128          MSG1, MSG3
+        /* Rounds 56-59 */
+                sha1nexte       MSG2, E0
+                mova128         ABCD, E1
+        sha1msg2        MSG2, MSG3
+                sha1rnds4       $2, E0, ABCD
+        sha1msg1        MSG2, MSG1
+        xor128          MSG2, MSG0
+        /* Rounds 60-63 (sha1rnds4 immediate switches to $3) */
+                sha1nexte       MSG3, E1
+                mova128         ABCD, E0
+        sha1msg2        MSG3, MSG0
+                sha1rnds4       $3, E1, ABCD
+        sha1msg1        MSG3, MSG2
+        xor128          MSG3, MSG1
+        /* Rounds 64-67 */
+                sha1nexte       MSG0, E0
+                mova128         ABCD, E1
+        sha1msg2        MSG0, MSG1
+                sha1rnds4       $3, E0, ABCD
+        sha1msg1        MSG0, MSG3
+        xor128          MSG0, MSG2
+        /* Rounds 68-71 (no sha1msg1 from here: no further schedule words are started) */
+                sha1nexte       MSG1, E1
+                mova128         ABCD, E0
+        sha1msg2        MSG1, MSG2
+                sha1rnds4       $3, E1, ABCD
+        xor128          MSG1, MSG3
+        /* Rounds 72-75 */
+                sha1nexte       MSG2, E0
+                mova128         ABCD, E1
+        sha1msg2        MSG2, MSG3
+                sha1rnds4       $3, E0, ABCD
+        /* Rounds 76-79 */
+                sha1nexte       MSG3, E1
+                mova128         ABCD, E0
+                sha1rnds4       $3, E1, ABCD
+        /* Add current hash values with previously saved (%xmm9 = E, %xmm8 = ABCD copied at entry) */
+        sha1nexte       %xmm9, E0
+        paddd           %xmm8, ABCD
+        /* Write hash values back in the correct order (same $0x1B shuffle as at entry, applied again) */
+        shuf128_32      $0x1B, ABCD, ABCD
+        movu128         ABCD, 80(%rdi)
+        extr128_32      $3, E0, 80+4*4(%rdi)
+        ret
+        .size   sha1_process_block64_shaNI, .-sha1_process_block64_shaNI
+.section        .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
+.align 16
+PSHUFFLE_BYTE_FLIP_MASK:
+        .octa 0x000102030405060708090a0b0c0d0e0f
+#endif
author	Denys Vlasenko <vda.linux@googlemail.com>	2022-01-07 00:43:59 +0100
committer	Denys Vlasenko <vda.linux@googlemail.com>	2022-01-07 00:43:59 +0100
commit	711e20ecb85d13f98ba3e2bdcb344ee7534829c4 (patch)
tree	170fa55e39133e3ba7182fa56d1643e25b55010a
parent	a93668cc4277b14eaff07fcfdef9693c990ec824 (diff)
download	busybox-w32-711e20ecb85d13f98ba3e2bdcb344ee7534829c4.tar.gz busybox-w32-711e20ecb85d13f98ba3e2bdcb344ee7534829c4.tar.bz2 busybox-w32-711e20ecb85d13f98ba3e2bdcb344ee7534829c4.zip

diff --git a/libbb/Config.src b/libbb/Config.src index c80bee286..708d3b0c8 100644 --- a/libbb/Config.src +++ b/libbb/Config.src
@@ -63,6 +63,13 @@ config SHA1_SMALL
63	1 224 229 654 732	63	1 224 229 654 732
64	2,3 200 195 358 380	64	2,3 200 195 358 380
65		65
		66	config SHA1_HWACCEL
		67	bool "SHA1: Use hardware accelerated instructions if possible"
		68	default y
		69	help
		70	On x86, this adds ~590 bytes of code. Throughput
		71	is about twice as fast as fully-unrolled generic code.
		72
66	config SHA3_SMALL	73	config SHA3_SMALL
67	int "SHA3: Trade bytes for speed (0:fast, 1:slow)"	74	int "SHA3: Trade bytes for speed (0:fast, 1:slow)"
68	default 1 # all "fast or small" options default to small	75	default 1 # all "fast or small" options default to small


diff --git a/libbb/Kbuild.src b/libbb/Kbuild.src index 19b8aad60..a3db02b6f 100644 --- a/libbb/Kbuild.src +++ b/libbb/Kbuild.src
@@ -57,6 +57,7 @@ lib-y += make_directory.o
57	lib-y += makedev.o	57	lib-y += makedev.o
58	lib-y += hash_md5_sha.o	58	lib-y += hash_md5_sha.o
59	lib-y += hash_md5_sha_x86-64.o	59	lib-y += hash_md5_sha_x86-64.o
		60	lib-y += hash_md5_sha_x86-64_shaNI.o
60	# Alternative (disabled) MD5 implementation	61	# Alternative (disabled) MD5 implementation
61	#lib-y += hash_md5prime.o	62	#lib-y += hash_md5prime.o
62	lib-y += messages.o	63	lib-y += messages.o


diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c index ee19c1cb7..4c6904b48 100644 --- a/libbb/hash_md5_sha.c +++ b/libbb/hash_md5_sha.c
@@ -699,7 +699,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM)
699		699
700	/* in hash_md5_sha_x86-64.S */	700	/* in hash_md5_sha_x86-64.S */
701	struct ASM_expects_80 { char t[1 - 2*(offsetof(sha1_ctx_t, hash) != 80)]; };	701	struct ASM_expects_80 { char t[1 - 2*(offsetof(sha1_ctx_t, hash) != 80)]; };
702	void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM);	702	void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx);
703		703
704	# else	704	# else
705	/* Fast, fully-unrolled SHA1. +3800 bytes of code on x86.	705	/* Fast, fully-unrolled SHA1. +3800 bytes of code on x86.
@@ -1142,6 +1142,28 @@ static void FAST_FUNC sha512_process_block128(sha512_ctx_t *ctx)
1142	}	1142	}
1143	#endif /* NEED_SHA512 */	1143	#endif /* NEED_SHA512 */
1144		1144
		1145	#if ENABLE_SHA1_HWACCEL
		1146	# if defined(__GNUC__) && defined(__x86_64__)
		1147	static void cpuid(unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx)
		1148	{
		1149	asm (
		1150	"cpuid\n"
		1151	: "=a"(*eax), /* Output */
		1152	"=b"(*ebx),
		1153	"=c"(*ecx),
		1154	"=d"(*edx)
		1155	: "0"(*eax), /* Input */
		1156	"1"(*ebx),
		1157	"2"(*ecx),
		1158	"3"(*edx)
		1159	/* No clobbered registers */
		1160	);
		1161	}
		1162	struct ASM_expects_80_shaNI { char t[1 - 2*(offsetof(sha1_ctx_t, hash) != 80)]; };
		1163	void FAST_FUNC sha1_process_block64_shaNI(sha1_ctx_t *ctx);
		1164	# endif
		1165	#endif
		1166
1145	void FAST_FUNC sha1_begin(sha1_ctx_t *ctx)	1167	void FAST_FUNC sha1_begin(sha1_ctx_t *ctx)
1146	{	1168	{
1147	ctx->hash[0] = 0x67452301;	1169	ctx->hash[0] = 0x67452301;
@@ -1151,6 +1173,20 @@ void FAST_FUNC sha1_begin(sha1_ctx_t *ctx)
1151	ctx->hash[4] = 0xc3d2e1f0;	1173	ctx->hash[4] = 0xc3d2e1f0;
1152	ctx->total64 = 0;	1174	ctx->total64 = 0;
1153	ctx->process_block = sha1_process_block64;	1175	ctx->process_block = sha1_process_block64;
		1176	#if ENABLE_SHA1_HWACCEL
		1177	# if defined(__GNUC__) && defined(__x86_64__)
		1178	{
		1179	static smallint shaNI;
		1180	if (!shaNI) {
		1181	unsigned eax = 7, ebx = ebx, ecx = 0, edx = edx;
		1182	cpuid(&eax, &ebx, &ecx, &edx);
		1183	shaNI = ((ebx >> 28) & 2) - 1;
		1184	}
		1185	if (shaNI > 0)
		1186	ctx->process_block = sha1_process_block64_shaNI;
		1187	}
		1188	# endif
		1189	#endif
1154	}	1190	}
1155		1191
1156	static const uint32_t init256[] ALIGN4 = {	1192	static const uint32_t init256[] ALIGN4 = {


diff --git a/libbb/hash_md5_sha_x86-64_shaNI.S b/libbb/hash_md5_sha_x86-64_shaNI.S new file mode 100644 index 000000000..473b472f1 --- /dev/null +++ b/libbb/hash_md5_sha_x86-64_shaNI.S
@@ -0,0 +1,225 @@
		1	#if ENABLE_SHA1_HWACCEL && defined(__GNUC__) && defined(__x86_64__)
		2	/* The code is adapted from Linux kernel's source */
		3
		4	// We use shorter insns, even though they are for "wrong"
		5	// data type (fp, not int).
		6	// For Intel, there is no penalty for doing it at all
		7	// (CPUs which do have such penalty do not support SHA1 insns).
		8	// For AMD, the penalty is one extra cycle
		9	// (allegedly: I failed to find measurable difference).
		10
		11	//#define mova128 movdqa
		12	#define mova128 movaps
		13	//#define movu128 movdqu
		14	#define movu128 movups
		15	//#define xor128 pxor
		16	#define xor128 xorps
		17	//#define shuf128_32 pshufd
		18	#define shuf128_32 shufps
		19
		20	#define extr128_32 pextrd
		21	//#define extr128_32 extractps # not shorter
		22
		23	.section .text.sha1_process_block64_shaNI,"ax",@progbits
		24	.globl sha1_process_block64_shaNI
		25	.hidden sha1_process_block64_shaNI
		26	.type sha1_process_block64_shaNI, @function
		27
		28	#define ABCD %xmm0
		29	#define E0 %xmm1 /* Need two E's b/c they ping pong */
		30	#define E1 %xmm2
		31	#define MSG0 %xmm3
		32	#define MSG1 %xmm4
		33	#define MSG2 %xmm5
		34	#define MSG3 %xmm6
		35	#define SHUF_MASK %xmm7
		36
		37	.balign 8 # allow decoders to fetch at least 2 first insns
		38	sha1_process_block64_shaNI:
		39	/* load initial hash values */
		40
		41	xor128 E0, E0
		42	movu128 80(%rdi), ABCD
		43	pinsrd $3, 80+4*4(%rdi), E0 # load to upper 32-bit word
		44	shuf128_32 $0x1B, ABCD, ABCD # 00011011: bswap
		45
		46	mova128 PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
		47
		48	/* Save hash values for addition after rounds */
		49	mova128 E0, %xmm9
		50	mova128 ABCD, %xmm8
		51
		52	/* Rounds 0-3 */
		53	movu128 0*16(%rdi), MSG0
		54	pshufb SHUF_MASK, MSG0
		55	paddd MSG0, E0
		56	mova128 ABCD, E1
		57	sha1rnds4 $0, E0, ABCD
		58
		59	/* Rounds 4-7 */
		60	movu128 1*16(%rdi), MSG1
		61	pshufb SHUF_MASK, MSG1
		62	sha1nexte MSG1, E1
		63	mova128 ABCD, E0
		64	sha1rnds4 $0, E1, ABCD
		65	sha1msg1 MSG1, MSG0
		66
		67	/* Rounds 8-11 */
		68	movu128 2*16(%rdi), MSG2
		69	pshufb SHUF_MASK, MSG2
		70	sha1nexte MSG2, E0
		71	mova128 ABCD, E1
		72	sha1rnds4 $0, E0, ABCD
		73	sha1msg1 MSG2, MSG1
		74	xor128 MSG2, MSG0
		75
		76	/* Rounds 12-15 */
		77	movu128 3*16(%rdi), MSG3
		78	pshufb SHUF_MASK, MSG3
		79	sha1nexte MSG3, E1
		80	mova128 ABCD, E0
		81	sha1msg2 MSG3, MSG0
		82	sha1rnds4 $0, E1, ABCD
		83	sha1msg1 MSG3, MSG2
		84	xor128 MSG3, MSG1
		85
		86	/* Rounds 16-19 */
		87	sha1nexte MSG0, E0
		88	mova128 ABCD, E1
		89	sha1msg2 MSG0, MSG1
		90	sha1rnds4 $0, E0, ABCD
		91	sha1msg1 MSG0, MSG3
		92	xor128 MSG0, MSG2
		93
		94	/* Rounds 20-23 */
		95	sha1nexte MSG1, E1
		96	mova128 ABCD, E0
		97	sha1msg2 MSG1, MSG2
		98	sha1rnds4 $1, E1, ABCD
		99	sha1msg1 MSG1, MSG0
		100	xor128 MSG1, MSG3
		101
		102	/* Rounds 24-27 */
		103	sha1nexte MSG2, E0
		104	mova128 ABCD, E1
		105	sha1msg2 MSG2, MSG3
		106	sha1rnds4 $1, E0, ABCD
		107	sha1msg1 MSG2, MSG1
		108	xor128 MSG2, MSG0
		109
		110	/* Rounds 28-31 */
		111	sha1nexte MSG3, E1
		112	mova128 ABCD, E0
		113	sha1msg2 MSG3, MSG0
		114	sha1rnds4 $1, E1, ABCD
		115	sha1msg1 MSG3, MSG2
		116	xor128 MSG3, MSG1
		117
		118	/* Rounds 32-35 */
		119	sha1nexte MSG0, E0
		120	mova128 ABCD, E1
		121	sha1msg2 MSG0, MSG1
		122	sha1rnds4 $1, E0, ABCD
		123	sha1msg1 MSG0, MSG3
		124	xor128 MSG0, MSG2
		125
		126	/* Rounds 36-39 */
		127	sha1nexte MSG1, E1
		128	mova128 ABCD, E0
		129	sha1msg2 MSG1, MSG2
		130	sha1rnds4 $1, E1, ABCD
		131	sha1msg1 MSG1, MSG0
		132	xor128 MSG1, MSG3
		133
		134	/* Rounds 40-43 */
		135	sha1nexte MSG2, E0
		136	mova128 ABCD, E1
		137	sha1msg2 MSG2, MSG3
		138	sha1rnds4 $2, E0, ABCD
		139	sha1msg1 MSG2, MSG1
		140	xor128 MSG2, MSG0
		141
		142	/* Rounds 44-47 */
		143	sha1nexte MSG3, E1
		144	mova128 ABCD, E0
		145	sha1msg2 MSG3, MSG0
		146	sha1rnds4 $2, E1, ABCD
		147	sha1msg1 MSG3, MSG2
		148	xor128 MSG3, MSG1
		149
		150	/* Rounds 48-51 */
		151	sha1nexte MSG0, E0
		152	mova128 ABCD, E1
		153	sha1msg2 MSG0, MSG1
		154	sha1rnds4 $2, E0, ABCD
		155	sha1msg1 MSG0, MSG3
		156	xor128 MSG0, MSG2
		157
		158	/* Rounds 52-55 */
		159	sha1nexte MSG1, E1
		160	mova128 ABCD, E0
		161	sha1msg2 MSG1, MSG2
		162	sha1rnds4 $2, E1, ABCD
		163	sha1msg1 MSG1, MSG0
		164	xor128 MSG1, MSG3
		165
		166	/* Rounds 56-59 */
		167	sha1nexte MSG2, E0
		168	mova128 ABCD, E1
		169	sha1msg2 MSG2, MSG3
		170	sha1rnds4 $2, E0, ABCD
		171	sha1msg1 MSG2, MSG1
		172	xor128 MSG2, MSG0
		173
		174	/* Rounds 60-63 */
		175	sha1nexte MSG3, E1
		176	mova128 ABCD, E0
		177	sha1msg2 MSG3, MSG0
		178	sha1rnds4 $3, E1, ABCD
		179	sha1msg1 MSG3, MSG2
		180	xor128 MSG3, MSG1
		181
		182	/* Rounds 64-67 */
		183	sha1nexte MSG0, E0
		184	mova128 ABCD, E1
		185	sha1msg2 MSG0, MSG1
		186	sha1rnds4 $3, E0, ABCD
		187	sha1msg1 MSG0, MSG3
		188	xor128 MSG0, MSG2
		189
		190	/* Rounds 68-71 */
		191	sha1nexte MSG1, E1
		192	mova128 ABCD, E0
		193	sha1msg2 MSG1, MSG2
		194	sha1rnds4 $3, E1, ABCD
		195	xor128 MSG1, MSG3
		196
		197	/* Rounds 72-75 */
		198	sha1nexte MSG2, E0
		199	mova128 ABCD, E1
		200	sha1msg2 MSG2, MSG3
		201	sha1rnds4 $3, E0, ABCD
		202
		203	/* Rounds 76-79 */
		204	sha1nexte MSG3, E1
		205	mova128 ABCD, E0
		206	sha1rnds4 $3, E1, ABCD
		207
		208	/* Add current hash values with previously saved */
		209	sha1nexte %xmm9, E0
		210	paddd %xmm8, ABCD
		211
		212	/* Write hash values back in the correct order */
		213	shuf128_32 $0x1B, ABCD, ABCD
		214	movu128 ABCD, 80(%rdi)
		215	extr128_32 $3, E0, 80+4*4(%rdi)
		216
		217	ret
		218	.size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI
		219
		220	.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
		221	.align 16
		222	PSHUFFLE_BYTE_FLIP_MASK:
		223	.octa 0x000102030405060708090a0b0c0d0e0f
		224
		225	#endif