author     Denys Vlasenko <vda.linux@googlemail.com>    2021-12-31 17:06:00 +0100
committer  Denys Vlasenko <vda.linux@googlemail.com>    2021-12-31 17:07:47 +0100
commit     f09d088fdf6eeeba902fb5627930145a3058a5f0
tree       6c3b17c675f4860babf27dd7f4056921fbad9896
parent     0b62a08777e29c34f947c791a1eded5b97e05699
libbb/sha1: shrink and speed up fully unrolled version
function                                             old     new   delta
sha1_process_block64                                4149    3950    -199
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
 libbb/Config.src     |  2 +-
 libbb/hash_md5_sha.c | 22 ++++++++++++++++++++++
 2 files changed, 23 insertions(+), 1 deletion(-)
diff --git a/libbb/Config.src b/libbb/Config.src
index c793f5939..d2054dc63 100644
--- a/libbb/Config.src
+++ b/libbb/Config.src
@@ -59,7 +59,7 @@ config SHA1_SMALL
 	Trade binary size versus speed for the sha1 algorithm.
 	throughput MB/s    size of sha1_process_block64
 	value 486  x86-64  486   x86-64
-	0     339  374     4149  4167
+	0     360  374     3950  4167
 	1     224  229     654   732
 	2,3   200  195     358   380
 
diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c
index 053ebe291..faf485df5 100644
--- a/libbb/hash_md5_sha.c
+++ b/libbb/hash_md5_sha.c
@@ -509,6 +509,27 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
 	d = ctx->hash[3];
 	e = ctx->hash[4];
 
+	/* From kernel source comments:
+	 * """
+	 * If you have 32 registers or more, the compiler can (and should)
+	 * try to change the array[] accesses into registers. However, on
+	 * machines with less than ~25 registers, that won't really work,
+	 * and at least gcc will make an unholy mess of it.
+	 *
+	 * So to avoid that mess which just slows things down, we force
+	 * the stores to memory to actually happen (we might be better off
+	 * with a 'W(t)=(val);asm("":"+m" (W(t))' there instead, as
+	 * suggested by Artur Skawina - that will also make gcc unable to
+	 * try to do the silly "optimize away loads" part because it won't
+	 * see what the value will be).
+	 * """
+	 */
+#if defined(__i386__)
+# define DO_NOT_TRY_PROPAGATING(m) asm("":"+m"(m))
+#else
+# define DO_NOT_TRY_PROPAGATING(m) ((void)0)
+#endif
+
 #undef OP
 #define OP(A,B,C,D,E, n) \
 	do { \
@@ -517,6 +538,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
 		work += W[n & 15] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[n]); \
 		if (n >= 16) \
 			work += W[n & 15] = rotl32(W[(n+13) & 15] ^ W[(n+8) & 15] ^ W[(n+2) & 15] ^ W[n & 15], 1); \
+		DO_NOT_TRY_PROPAGATING(W[n & 15]); \
 		E += work + rotl32(A, 5) + rconsts[n / 20]; \
 		B = rotl32(B, 30); \
 	} while (0)
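
For readers who have not seen the idiom before: an asm statement with a "+m" operand tells gcc that the named memory location is both read and written by the (empty) instruction sequence, so the store into W[n & 15] must actually be issued before the barrier and the value cannot be constant-propagated or kept live in a register across it. Below is a minimal standalone sketch of the same trick, not part of this commit (FORCE_TO_MEMORY is an illustrative name; a GNU-compatible compiler such as gcc or clang is assumed):

	#include <inttypes.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Same idiom as DO_NOT_TRY_PROPAGATING above: an empty asm whose
	 * "+m" operand claims to read and modify x in memory. It emits no
	 * instructions, but the compiler must commit the preceding store
	 * and reload the value afterwards, instead of caching it in a
	 * register or folding it away at compile time. */
	#define FORCE_TO_MEMORY(x) asm("" : "+m"(x))

	int main(void)
	{
		uint32_t w[16] = { 0 };

		w[0] = 0xdeadbeef;
		FORCE_TO_MEMORY(w[0]);             /* w[0] now lives in memory */
		printf("%08" PRIx32 "\n", w[0]);   /* this load comes from memory */
		return 0;
	}

On i386, with only about seven usable general-purpose registers, pinning the 16-word W[] window in memory this way avoids the register-spill churn the quoted kernel comment describes; on targets with more registers the macro expands to ((void)0), leaving the compiler free to keep W[] in registers.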