author     Denys Vlasenko <vda.linux@googlemail.com>    2021-12-31 17:06:00 +0100
committer  Denys Vlasenko <vda.linux@googlemail.com>    2021-12-31 17:07:47 +0100
commit     f09d088fdf6eeeba902fb5627930145a3058a5f0
tree       6c3b17c675f4860babf27dd7f4056921fbad9896
parent     0b62a08777e29c34f947c791a1eded5b97e05699
libbb/sha1: shrink and speed up fully unrolled version
function                                             old     new   delta
sha1_process_block64                                4149    3950    -199
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
 libbb/Config.src     |  2 +-
 libbb/hash_md5_sha.c | 22 ++++++++++++++++++++++
 2 files changed, 23 insertions(+), 1 deletion(-)
diff --git a/libbb/Config.src b/libbb/Config.src
index c793f5939..d2054dc63 100644
--- a/libbb/Config.src
+++ b/libbb/Config.src
@@ -59,7 +59,7 @@ config SHA1_SMALL
 	Trade binary size versus speed for the sha1 algorithm.
 	throughput MB/s    size of sha1_process_block64
 	value 486  x86-64  486   x86-64
-	0     339  374     4149  4167
+	0     360  374     3950  4167
 	1     224  229     654   732
 	2,3   200  195     358   380
 
diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c
index 053ebe291..faf485df5 100644
--- a/libbb/hash_md5_sha.c
+++ b/libbb/hash_md5_sha.c
@@ -509,6 +509,27 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
 	d = ctx->hash[3];
 	e = ctx->hash[4];
 
+	/* From kernel source comments:
+	 * """
+	 * If you have 32 registers or more, the compiler can (and should)
+	 * try to change the array[] accesses into registers. However, on
+	 * machines with less than ~25 registers, that won't really work,
+	 * and at least gcc will make an unholy mess of it.
+	 *
+	 * So to avoid that mess which just slows things down, we force
+	 * the stores to memory to actually happen (we might be better off
+	 * with a 'W(t)=(val);asm("":"+m" (W(t))' there instead, as
+	 * suggested by Artur Skawina - that will also make gcc unable to
+	 * try to do the silly "optimize away loads" part because it won't
+	 * see what the value will be).
+	 * """
+	 */
+#if defined(__i386__)
+# define DO_NOT_TRY_PROPAGATING(m) asm("":"+m"(m))
+#else
+# define DO_NOT_TRY_PROPAGATING(m) ((void)0)
+#endif
+
 #undef OP
 #define OP(A,B,C,D,E, n) \
 	do { \
@@ -517,6 +538,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
 		work += W[n & 15] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[n]); \
 		if (n >= 16) \
 			work += W[n & 15] = rotl32(W[(n+13) & 15] ^ W[(n+8) & 15] ^ W[(n+2) & 15] ^ W[n & 15], 1); \
+		DO_NOT_TRY_PROPAGATING(W[n & 15]); \
 		E += work + rotl32(A, 5) + rconsts[n / 20]; \
 		B = rotl32(B, 30); \
 	} while (0)
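
For readers who have not seen the idiom before: an asm statement with a "+m" operand tells gcc that the named memory location is both read and written by the (empty) instruction sequence, so the store into W[n & 15] must actually be issued before the barrier and the value cannot be constant-propagated or kept live in a register across it. Below is a minimal standalone sketch of the same trick, not part of this commit (FORCE_TO_MEMORY is an illustrative name; a GNU-compatible compiler such as gcc or clang is assumed):

	#include <inttypes.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Same idiom as DO_NOT_TRY_PROPAGATING above: an empty asm whose
	 * "+m" operand claims to read and modify x in memory. It emits no
	 * instructions, but the compiler must commit the preceding store
	 * and reload the value afterwards, instead of caching it in a
	 * register or folding it away at compile time. */
	#define FORCE_TO_MEMORY(x) asm("" : "+m"(x))

	int main(void)
	{
		uint32_t w[16] = { 0 };

		w[0] = 0xdeadbeef;
		FORCE_TO_MEMORY(w[0]);             /* w[0] now lives in memory */
		printf("%08" PRIx32 "\n", w[0]);   /* this load comes from memory */
		return 0;
	}

On i386, with only about seven usable general-purpose registers, pinning the 16-word W[] window in memory this way avoids the register-spill churn the quoted kernel comment describes; on targets with more registers the macro expands to ((void)0), leaving the compiler free to keep W[] in registers.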