author     Denys Vlasenko <vda.linux@googlemail.com>  2021-12-31 17:06:00 +0100
committer  Denys Vlasenko <vda.linux@googlemail.com>  2021-12-31 17:07:47 +0100
commit     f09d088fdf6eeeba902fb5627930145a3058a5f0 (patch)
tree       6c3b17c675f4860babf27dd7f4056921fbad9896
parent     0b62a08777e29c34f947c791a1eded5b97e05699 (diff)
download   busybox-w32-f09d088fdf6eeeba902fb5627930145a3058a5f0.tar.gz
           busybox-w32-f09d088fdf6eeeba902fb5627930145a3058a5f0.tar.bz2
           busybox-w32-f09d088fdf6eeeba902fb5627930145a3058a5f0.zip
libbb/sha1: shrink and speed up fully unrolled version
function                                             old     new   delta
sha1_process_block64                                4149    3950    -199

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
-rw-r--r--  libbb/Config.src      |  2
-rw-r--r--  libbb/hash_md5_sha.c  | 22
2 files changed, 23 insertions(+), 1 deletion(-)
diff --git a/libbb/Config.src b/libbb/Config.src
index c793f5939..d2054dc63 100644
--- a/libbb/Config.src
+++ b/libbb/Config.src
@@ -59,7 +59,7 @@ config SHA1_SMALL
 	  Trade binary size versus speed for the sha1 algorithm.
 	  throughput MB/s size of sha1_process_block64
 	  value 486 x86-64 486 x86-64
-	  0    339 374     4149     4167
+	  0    360 374     3950     4167
 	  1    224 229      654      732
 	  2,3  200 195      358      380
 
diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c
index 053ebe291..faf485df5 100644
--- a/libbb/hash_md5_sha.c
+++ b/libbb/hash_md5_sha.c
@@ -509,6 +509,27 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
 	d = ctx->hash[3];
 	e = ctx->hash[4];
 
+/* From kernel source comments:
+ * """
+ * If you have 32 registers or more, the compiler can (and should)
+ * try to change the array[] accesses into registers. However, on
+ * machines with less than ~25 registers, that won't really work,
+ * and at least gcc will make an unholy mess of it.
+ *
+ * So to avoid that mess which just slows things down, we force
+ * the stores to memory to actually happen (we might be better off
+ * with a 'W(t)=(val);asm("":"+m" (W(t))' there instead, as
+ * suggested by Artur Skawina - that will also make gcc unable to
+ * try to do the silly "optimize away loads" part because it won't
+ * see what the value will be).
+ * """
+ */
+#if defined(__i386__)
+# define DO_NOT_TRY_PROPAGATING(m) asm("":"+m"(m))
+#else
+# define DO_NOT_TRY_PROPAGATING(m) ((void)0)
+#endif
+
 #undef OP
 #define OP(A,B,C,D,E, n) \
 	do { \
@@ -517,6 +538,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
 			work += W[n & 15] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[n]); \
 		if (n >= 16) \
 			work += W[n & 15] = rotl32(W[(n+13) & 15] ^ W[(n+8) & 15] ^ W[(n+2) & 15] ^ W[n & 15], 1); \
+		DO_NOT_TRY_PROPAGATING(W[n & 15]); \
 		E += work + rotl32(A, 5) + rconsts[n / 20]; \
 		B = rotl32(B, 30); \
 	} while (0)
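
The quoted kernel comment is the whole idea behind DO_NOT_TRY_PROPAGATING(): on register-starved i386, gcc otherwise tries to keep the 16-word W[] schedule live in registers and makes a mess of the spills. Below is a minimal standalone sketch of that "+m" barrier technique in isolation; it is illustrative only (main(), the W[] values and the surrounding loop are made up for this example, not busybox code) and assumes a GNU-compatible compiler so the empty asm statement with a read-write memory operand is available.

/* Illustrative sketch only, not busybox code: an empty asm with a
 * "+m" (read-write memory) operand forces the compiler to commit W[n]
 * to its memory slot and to forget the value, so it cannot try to keep
 * the whole array live in registers on register-starved targets.
 */
#include <stdint.h>
#include <inttypes.h>
#include <stdio.h>

static uint32_t W[16];	/* stand-in for the SHA-1 message schedule */

int main(void)
{
	unsigned n;

	for (n = 0; n < 16; n++) {
		W[n] = 0x5A827999u * (n + 1);	/* any computed value */
#if defined(__GNUC__) && defined(__i386__)
		/* same trick as DO_NOT_TRY_PROPAGATING(W[n & 15]) in the patch above */
		__asm__("" : "+m" (W[n]));
#endif
	}
	printf("W[15] = %08" PRIx32 "\n", W[15]);
	return 0;
}

On i386 this forces each W[n] store to actually happen, which is where the smaller SHA1_SMALL=0 code size in the Config.src table above (4149 -> 3950 bytes on 486) comes from; on other architectures the macro expands to a no-op and code generation is unchanged.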