libbb/sha1: add config-selectable fully unrolled version, closes 14391

function old new delta sha1_process_block64 364 4167 +3803 static.rconsts 16 - -16 ------------------------------------------------------------------------------ (add/remove: 0/1 grow/shrink: 1/0 up/down: 3803/-16) Total: 3787 bytes Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
author: Denys Vlasenko <vda.linux@googlemail.com> 2021-12-30 13:07:12 +0100
committer: Denys Vlasenko <vda.linux@googlemail.com> 2021-12-30 13:17:16 +0100
commit: 25aadc893d21b35f7d34a9d1edc843632e7abd8f (patch)
tree: f8d2d860f06eeccb254e071e39213bc2a3f723ef
parent: 9173c9cce48dc4c867fb06bb72e8c762740c5c86 (diff)
download: busybox-w32-25aadc893d21b35f7d34a9d1edc843632e7abd8f.tar.gz
busybox-w32-25aadc893d21b35f7d34a9d1edc843632e7abd8f.tar.bz2
busybox-w32-25aadc893d21b35f7d34a9d1edc843632e7abd8f.zip
2 files changed, 95 insertions, 14 deletions
diff --git a/libbb/Config.src b/libbb/Config.src
index 24b31fad9..13188ef03 100644
--- a/libbb/Config.src
+++ b/libbb/Config.src
@@ -42,21 +42,32 @@ config MD5_SMALL
        default 1  # all "fast or small" options default to small
        range 0 3
        help
-        Trade binary size versus speed for the md5sum algorithm.
+        Trade binary size versus speed for the md5 algorithm.
        Approximate values running uClibc and hashing
        linux-2.4.4.tar.bz2 were:
-        value               user times (sec)  text size (386)
+        value           user times (sec)  text size (386)
-        0 (fastest)         1.1                6144
+        0 (fastest)     1.1               6144
-        1                   1.4                5392
+        1               1.4               5392
-        2                   3.0                5088
+        2               3.0               5088
-        3 (smallest)        5.1                4912
+        3 (smallest)    5.1               4912
+config SHA1_SMALL
+        int "SHA1: Trade bytes for speed (0:fast, 3:slow)"
+        default 3  # all "fast or small" options default to small
+        range 0 3
+        help
+        Trade binary size versus speed for the sha1 algorithm.
+                        throughput MB/s   size of sha1_process_block64
+        value           486  x86-64       486   x86-64
+        0               339  374          4149  4167
+        1,2,3           200  195           358   380
 config SHA3_SMALL
        int "SHA3: Trade bytes for speed (0:fast, 1:slow)"
        default 1  # all "fast or small" options default to small
        range 0 1
        help
-        Trade binary size versus speed for the sha3sum algorithm.
+        Trade binary size versus speed for the sha3 algorithm.
        SHA3_SMALL=0 compared to SHA3_SMALL=1 (approximate):
        64-bit x86: +270 bytes of code, 45% faster
        32-bit x86: +450 bytes of code, 75% faster
diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c
index a468397e3..75673e334 100644
--- a/libbb/hash_md5_sha.c
+++ b/libbb/hash_md5_sha.c
@@ -390,7 +390,6 @@ static void FAST_FUNC md5_process_block64(md5_ctx_t *ctx)
        OP(FI, D, A, B, C, 11, 10, 0xbd3af235);
        OP(FI, C, D, A, B, 2, 15, 0x2ad7d2bb);
        OP(FI, B, C, D, A, 9, 21, 0xeb86d391);
-# undef OP
 # endif
        /* Add checksum to the starting values */
        ctx->hash[0] += A;
@@ -399,6 +398,7 @@ static void FAST_FUNC md5_process_block64(md5_ctx_t *ctx)
        ctx->hash[3] += D;
 #endif
 }
+#undef OP
 #undef FF
 #undef FG
 #undef FH
@@ -490,18 +490,87 @@ unsigned FAST_FUNC md5_end(md5_ctx_t *ctx, void *resbuf)
 * then rebuild and compare "shaNNNsum bigfile" results.
 */
+#if CONFIG_SHA1_SMALL == 0
+/* Fast, fully-unrolled SHA1. +3800 bytes of code on x86.
+ * It seems further speedup can be achieved by handling more than
+ * 64 bytes per one function call (coreutils does that).
+ */
+static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
+{
+        static const uint32_t rconsts[] ALIGN4 = {
+                0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
+        };
+        uint32_t W[16];
+        uint32_t a, b, c, d, e;
+        a = ctx->hash[0];
+        b = ctx->hash[1];
+        c = ctx->hash[2];
+        d = ctx->hash[3];
+        e = ctx->hash[4];
+#undef OP
+#define OP(A,B,C,D,E, n) \
+        do { \
+                uint32_t work = EXPR(B, C, D); \
+                if (n <= 15) \
+                        work += W[n & 0xf] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[n]); \
+                if (n >= 16) \
+                        work += W[n & 0xf] = rotl32(W[(n+13) & 0xf] ^ W[(n+8) & 0xf] ^ W[(n+2) & 0xf] ^ W[n & 0xf], 1); \
+                E += work + rotl32(A, 5) + rconsts[n / 20]; \
+                B = rotl32(B, 30); \
+        } while (0)
+#define OP20(n) \
+        OP(a,b,c,d,e, (n+ 0)); OP(e,a,b,c,d, (n+ 1)); OP(d,e,a,b,c, (n+ 2)); OP(c,d,e,a,b, (n+ 3)); OP(b,c,d,e,a, (n+ 4)); \
+        OP(a,b,c,d,e, (n+ 5)); OP(e,a,b,c,d, (n+ 6)); OP(d,e,a,b,c, (n+ 7)); OP(c,d,e,a,b, (n+ 8)); OP(b,c,d,e,a, (n+ 9)); \
+        OP(a,b,c,d,e, (n+10)); OP(e,a,b,c,d, (n+11)); OP(d,e,a,b,c, (n+12)); OP(c,d,e,a,b, (n+13)); OP(b,c,d,e,a, (n+14)); \
+        OP(a,b,c,d,e, (n+15)); OP(e,a,b,c,d, (n+16)); OP(d,e,a,b,c, (n+17)); OP(c,d,e,a,b, (n+18)); OP(b,c,d,e,a, (n+19))
+        /* 4 rounds of 20 operations each */
+#define EXPR(b,c,d) (((c ^ d) & b) ^ d)
+        OP20(0);
+#undef EXPR
+#define EXPR(b,c,d) (c ^ d ^ b)
+        OP20(20);
+#undef EXPR
+#define EXPR(b,c,d) (((b | c) & d) | (b & c))
+        OP20(40);
+#undef EXPR
+#define EXPR(b,c,d) (c ^ d ^ b)
+        OP20(60);
+#undef EXPR
+#undef OP
+#undef OP20
+        ctx->hash[0] += a;
+        ctx->hash[1] += b;
+        ctx->hash[2] += c;
+        ctx->hash[3] += d;
+        ctx->hash[4] += e;
+}
+#else
+/* TODO: for CONFIG_SHA1_SMALL == 1, have a partially unrolled version? */
+/* Compact version, almost twice as slow as fully unrolled */
 static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
 {
        static const uint32_t rconsts[] ALIGN4 = {
                0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
        };
        int i, j;
-        int cnt;
+        int n;
        uint32_t W[16+16];
        uint32_t a, b, c, d, e;
        /* On-stack work buffer frees up one register in the main loop
-         * which otherwise will be needed to hold ctx pointer */
+         * which otherwise will be needed to hold ctx pointer.
+         *
+         * The compiler is not smart enough to realize it, though. :(
+         * If __attribute__((optimize("2"))) is added to the function,
+         * only then gcc-9.3.1 spills "ctx" to stack and uses the freed
+         * register (making code 6 bytes smaller, not just faster).
+         */
        for (i = 0; i < 16; i++)
                W[i] = W[i+16] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[i]);
@@ -512,7 +581,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
        e = ctx->hash[4];
        /* 4 rounds of 20 operations each */
-        cnt = 0;
+        n = 0;
        for (i = 0; i < 4; i++) {
                j = 19;
                do {
@@ -529,9 +598,9 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
                                else /* i = 1 or 3 */
                                        work ^= b;
 ge16:
-                                W[cnt] = W[cnt+16] = rotl32(W[cnt+13] ^ W[cnt+8] ^ W[cnt+2] ^ W[cnt], 1);
+                                W[n] = W[n+16] = rotl32(W[n+13] ^ W[n+8] ^ W[n+2] ^ W[n], 1);
                        }
-                        work += W[cnt];
+                        work += W[n];
                        work += e + rotl32(a, 5) + rconsts[i];
                        /* Rotate by one for next time */
@@ -540,7 +609,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
                        c = rotl32(b, 30);
                        b = a;
                        a = work;
-                        cnt = (cnt + 1) & 15;
+                        n = (n + 1) & 15;
                } while (--j >= 0);
        }
@@ -550,6 +619,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
        ctx->hash[3] += d;
        ctx->hash[4] += e;
 }
+#endif
 /* Constants for SHA512 from FIPS 180-2:4.2.3.
 * SHA256 constants from FIPS 180-2:4.2.2
author	Denys Vlasenko <vda.linux@googlemail.com>	2021-12-30 13:07:12 +0100
committer	Denys Vlasenko <vda.linux@googlemail.com>	2021-12-30 13:17:16 +0100
commit	25aadc893d21b35f7d34a9d1edc843632e7abd8f (patch)
tree	f8d2d860f06eeccb254e071e39213bc2a3f723ef
parent	9173c9cce48dc4c867fb06bb72e8c762740c5c86 (diff)
download	busybox-w32-25aadc893d21b35f7d34a9d1edc843632e7abd8f.tar.gz busybox-w32-25aadc893d21b35f7d34a9d1edc843632e7abd8f.tar.bz2 busybox-w32-25aadc893d21b35f7d34a9d1edc843632e7abd8f.zip

diff --git a/libbb/Config.src b/libbb/Config.src index 24b31fad9..13188ef03 100644 --- a/libbb/Config.src +++ b/libbb/Config.src
@@ -42,21 +42,32 @@ config MD5_SMALL
42	default 1 # all "fast or small" options default to small	42	default 1 # all "fast or small" options default to small
43	range 0 3	43	range 0 3
44	help	44	help
45	Trade binary size versus speed for the md5sum algorithm.	45	Trade binary size versus speed for the md5 algorithm.
46	Approximate values running uClibc and hashing	46	Approximate values running uClibc and hashing
47	linux-2.4.4.tar.bz2 were:	47	linux-2.4.4.tar.bz2 were:
48	value user times (sec) text size (386)	48	value user times (sec) text size (386)
49	0 (fastest) 1.1 6144	49	0 (fastest) 1.1 6144
50	1 1.4 5392	50	1 1.4 5392
51	2 3.0 5088	51	2 3.0 5088
52	3 (smallest) 5.1 4912	52	3 (smallest) 5.1 4912
		53
		54	config SHA1_SMALL
		55	int "SHA1: Trade bytes for speed (0:fast, 3:slow)"
		56	default 3 # all "fast or small" options default to small
		57	range 0 3
		58	help
		59	Trade binary size versus speed for the sha1 algorithm.
		60	throughput MB/s size of sha1_process_block64
		61	value 486 x86-64 486 x86-64
		62	0 339 374 4149 4167
		63	1,2,3 200 195 358 380
53		64
54	config SHA3_SMALL	65	config SHA3_SMALL
55	int "SHA3: Trade bytes for speed (0:fast, 1:slow)"	66	int "SHA3: Trade bytes for speed (0:fast, 1:slow)"
56	default 1 # all "fast or small" options default to small	67	default 1 # all "fast or small" options default to small
57	range 0 1	68	range 0 1
58	help	69	help
59	Trade binary size versus speed for the sha3sum algorithm.	70	Trade binary size versus speed for the sha3 algorithm.
60	SHA3_SMALL=0 compared to SHA3_SMALL=1 (approximate):	71	SHA3_SMALL=0 compared to SHA3_SMALL=1 (approximate):
61	64-bit x86: +270 bytes of code, 45% faster	72	64-bit x86: +270 bytes of code, 45% faster
62	32-bit x86: +450 bytes of code, 75% faster	73	32-bit x86: +450 bytes of code, 75% faster


diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c index a468397e3..75673e334 100644 --- a/libbb/hash_md5_sha.c +++ b/libbb/hash_md5_sha.c
@@ -390,7 +390,6 @@ static void FAST_FUNC md5_process_block64(md5_ctx_t *ctx)
390	OP(FI, D, A, B, C, 11, 10, 0xbd3af235);	390	OP(FI, D, A, B, C, 11, 10, 0xbd3af235);
391	OP(FI, C, D, A, B, 2, 15, 0x2ad7d2bb);	391	OP(FI, C, D, A, B, 2, 15, 0x2ad7d2bb);
392	OP(FI, B, C, D, A, 9, 21, 0xeb86d391);	392	OP(FI, B, C, D, A, 9, 21, 0xeb86d391);
393	# undef OP
394	# endif	393	# endif
395	/* Add checksum to the starting values */	394	/* Add checksum to the starting values */
396	ctx->hash[0] += A;	395	ctx->hash[0] += A;
@@ -399,6 +398,7 @@ static void FAST_FUNC md5_process_block64(md5_ctx_t *ctx)
399	ctx->hash[3] += D;	398	ctx->hash[3] += D;
400	#endif	399	#endif
401	}	400	}
		401	#undef OP
402	#undef FF	402	#undef FF
403	#undef FG	403	#undef FG
404	#undef FH	404	#undef FH
@@ -490,18 +490,87 @@ unsigned FAST_FUNC md5_end(md5_ctx_t ctx, void resbuf)
490	* then rebuild and compare "shaNNNsum bigfile" results.	490	* then rebuild and compare "shaNNNsum bigfile" results.
491	*/	491	*/
492		492
		493	#if CONFIG_SHA1_SMALL == 0
		494	/* Fast, fully-unrolled SHA1. +3800 bytes of code on x86.
		495	* It seems further speedup can be achieved by handling more than
		496	* 64 bytes per one function call (coreutils does that).
		497	*/
		498	static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
		499	{
		500	static const uint32_t rconsts[] ALIGN4 = {
		501	0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
		502	};
		503	uint32_t W[16];
		504	uint32_t a, b, c, d, e;
		505
		506	a = ctx->hash[0];
		507	b = ctx->hash[1];
		508	c = ctx->hash[2];
		509	d = ctx->hash[3];
		510	e = ctx->hash[4];
		511
		512	#undef OP
		513	#define OP(A,B,C,D,E, n) \
		514	do { \
		515	uint32_t work = EXPR(B, C, D); \
		516	if (n <= 15) \
		517	work += W[n & 0xf] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[n]); \
		518	if (n >= 16) \
		519	work += W[n & 0xf] = rotl32(W[(n+13) & 0xf] ^ W[(n+8) & 0xf] ^ W[(n+2) & 0xf] ^ W[n & 0xf], 1); \
		520	E += work + rotl32(A, 5) + rconsts[n / 20]; \
		521	B = rotl32(B, 30); \
		522	} while (0)
		523	#define OP20(n) \
		524	OP(a,b,c,d,e, (n+ 0)); OP(e,a,b,c,d, (n+ 1)); OP(d,e,a,b,c, (n+ 2)); OP(c,d,e,a,b, (n+ 3)); OP(b,c,d,e,a, (n+ 4)); \
		525	OP(a,b,c,d,e, (n+ 5)); OP(e,a,b,c,d, (n+ 6)); OP(d,e,a,b,c, (n+ 7)); OP(c,d,e,a,b, (n+ 8)); OP(b,c,d,e,a, (n+ 9)); \
		526	OP(a,b,c,d,e, (n+10)); OP(e,a,b,c,d, (n+11)); OP(d,e,a,b,c, (n+12)); OP(c,d,e,a,b, (n+13)); OP(b,c,d,e,a, (n+14)); \
		527	OP(a,b,c,d,e, (n+15)); OP(e,a,b,c,d, (n+16)); OP(d,e,a,b,c, (n+17)); OP(c,d,e,a,b, (n+18)); OP(b,c,d,e,a, (n+19))
		528
		529	/* 4 rounds of 20 operations each */
		530	#define EXPR(b,c,d) (((c ^ d) & b) ^ d)
		531	OP20(0);
		532	#undef EXPR
		533	#define EXPR(b,c,d) (c ^ d ^ b)
		534	OP20(20);
		535	#undef EXPR
		536	#define EXPR(b,c,d) (((b \| c) & d) \| (b & c))
		537	OP20(40);
		538	#undef EXPR
		539	#define EXPR(b,c,d) (c ^ d ^ b)
		540	OP20(60);
		541
		542	#undef EXPR
		543	#undef OP
		544	#undef OP20
		545
		546	ctx->hash[0] += a;
		547	ctx->hash[1] += b;
		548	ctx->hash[2] += c;
		549	ctx->hash[3] += d;
		550	ctx->hash[4] += e;
		551	}
		552	#else
		553	/* TODO: for CONFIG_SHA1_SMALL == 1, have a partially unrolled version? */
		554
		555	/* Compact version, almost twice as slow as fully unrolled */
493	static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)	556	static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
494	{	557	{
495	static const uint32_t rconsts[] ALIGN4 = {	558	static const uint32_t rconsts[] ALIGN4 = {
496	0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6	559	0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
497	};	560	};
498	int i, j;	561	int i, j;
499	int cnt;	562	int n;
500	uint32_t W[16+16];	563	uint32_t W[16+16];
501	uint32_t a, b, c, d, e;	564	uint32_t a, b, c, d, e;
502		565
503	/* On-stack work buffer frees up one register in the main loop	566	/* On-stack work buffer frees up one register in the main loop
504	* which otherwise will be needed to hold ctx pointer */	567	* which otherwise will be needed to hold ctx pointer.
		568	*
		569	* The compiler is not smart enough to realize it, though. :(
		570	* If __attribute__((optimize("2"))) is added to the function,
		571	* only then gcc-9.3.1 spills "ctx" to stack and uses the freed
		572	* register (making code 6 bytes smaller, not just faster).
		573	*/
505	for (i = 0; i < 16; i++)	574	for (i = 0; i < 16; i++)
506	W[i] = W[i+16] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[i]);	575	W[i] = W[i+16] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[i]);
507		576
@@ -512,7 +581,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
512	e = ctx->hash[4];	581	e = ctx->hash[4];
513		582
514	/* 4 rounds of 20 operations each */	583	/* 4 rounds of 20 operations each */
515	cnt = 0;	584	n = 0;
516	for (i = 0; i < 4; i++) {	585	for (i = 0; i < 4; i++) {
517	j = 19;	586	j = 19;
518	do {	587	do {
@@ -529,9 +598,9 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
529	else /* i = 1 or 3 */	598	else /* i = 1 or 3 */
530	work ^= b;	599	work ^= b;
531	ge16:	600	ge16:
532	W[cnt] = W[cnt+16] = rotl32(W[cnt+13] ^ W[cnt+8] ^ W[cnt+2] ^ W[cnt], 1);	601	W[n] = W[n+16] = rotl32(W[n+13] ^ W[n+8] ^ W[n+2] ^ W[n], 1);
533	}	602	}
534	work += W[cnt];	603	work += W[n];
535	work += e + rotl32(a, 5) + rconsts[i];	604	work += e + rotl32(a, 5) + rconsts[i];
536		605
537	/* Rotate by one for next time */	606	/* Rotate by one for next time */
@@ -540,7 +609,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
540	c = rotl32(b, 30);	609	c = rotl32(b, 30);
541	b = a;	610	b = a;
542	a = work;	611	a = work;
543	cnt = (cnt + 1) & 15;	612	n = (n + 1) & 15;
544	} while (--j >= 0);	613	} while (--j >= 0);
545	}	614	}
546		615
@@ -550,6 +619,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
550	ctx->hash[3] += d;	619	ctx->hash[3] += d;
551	ctx->hash[4] += e;	620	ctx->hash[4] += e;
552	}	621	}
		622	#endif
553		623
554	/* Constants for SHA512 from FIPS 180-2:4.2.3.	624	/* Constants for SHA512 from FIPS 180-2:4.2.3.
555	* SHA256 constants from FIPS 180-2:4.2.2	625	* SHA256 constants from FIPS 180-2:4.2.2