diff options
author | Denys Vlasenko <vda.linux@googlemail.com> | 2021-12-30 13:07:12 +0100 |
---|---|---|
committer | Denys Vlasenko <vda.linux@googlemail.com> | 2021-12-30 13:17:16 +0100 |
commit | 25aadc893d21b35f7d34a9d1edc843632e7abd8f (patch) | |
tree | f8d2d860f06eeccb254e071e39213bc2a3f723ef | |
parent | 9173c9cce48dc4c867fb06bb72e8c762740c5c86 (diff) | |
download | busybox-w32-25aadc893d21b35f7d34a9d1edc843632e7abd8f.tar.gz busybox-w32-25aadc893d21b35f7d34a9d1edc843632e7abd8f.tar.bz2 busybox-w32-25aadc893d21b35f7d34a9d1edc843632e7abd8f.zip |
libbb/sha1: add config-selectable fully unrolled version, closes 14391
function old new delta
sha1_process_block64 364 4167 +3803
static.rconsts 16 - -16
------------------------------------------------------------------------------
(add/remove: 0/1 grow/shrink: 1/0 up/down: 3803/-16) Total: 3787 bytes
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
-rw-r--r-- | libbb/Config.src | 25 | ||||
-rw-r--r-- | libbb/hash_md5_sha.c | 84 |
2 files changed, 95 insertions, 14 deletions
diff --git a/libbb/Config.src b/libbb/Config.src index 24b31fad9..13188ef03 100644 --- a/libbb/Config.src +++ b/libbb/Config.src | |||
@@ -42,21 +42,32 @@ config MD5_SMALL | |||
42 | default 1 # all "fast or small" options default to small | 42 | default 1 # all "fast or small" options default to small |
43 | range 0 3 | 43 | range 0 3 |
44 | help | 44 | help |
45 | Trade binary size versus speed for the md5sum algorithm. | 45 | Trade binary size versus speed for the md5 algorithm. |
46 | Approximate values running uClibc and hashing | 46 | Approximate values running uClibc and hashing |
47 | linux-2.4.4.tar.bz2 were: | 47 | linux-2.4.4.tar.bz2 were: |
48 | value user times (sec) text size (386) | 48 | value user times (sec) text size (386) |
49 | 0 (fastest) 1.1 6144 | 49 | 0 (fastest) 1.1 6144 |
50 | 1 1.4 5392 | 50 | 1 1.4 5392 |
51 | 2 3.0 5088 | 51 | 2 3.0 5088 |
52 | 3 (smallest) 5.1 4912 | 52 | 3 (smallest) 5.1 4912 |
53 | |||
54 | config SHA1_SMALL | ||
55 | int "SHA1: Trade bytes for speed (0:fast, 3:slow)" | ||
56 | default 3 # all "fast or small" options default to small | ||
57 | range 0 3 | ||
58 | help | ||
59 | Trade binary size versus speed for the sha1 algorithm. | ||
60 | throughput MB/s size of sha1_process_block64 | ||
61 | value 486 x86-64 486 x86-64 | ||
62 | 0 339 374 4149 4167 | ||
63 | 1,2,3 200 195 358 380 | ||
53 | 64 | ||
54 | config SHA3_SMALL | 65 | config SHA3_SMALL |
55 | int "SHA3: Trade bytes for speed (0:fast, 1:slow)" | 66 | int "SHA3: Trade bytes for speed (0:fast, 1:slow)" |
56 | default 1 # all "fast or small" options default to small | 67 | default 1 # all "fast or small" options default to small |
57 | range 0 1 | 68 | range 0 1 |
58 | help | 69 | help |
59 | Trade binary size versus speed for the sha3sum algorithm. | 70 | Trade binary size versus speed for the sha3 algorithm. |
60 | SHA3_SMALL=0 compared to SHA3_SMALL=1 (approximate): | 71 | SHA3_SMALL=0 compared to SHA3_SMALL=1 (approximate): |
61 | 64-bit x86: +270 bytes of code, 45% faster | 72 | 64-bit x86: +270 bytes of code, 45% faster |
62 | 32-bit x86: +450 bytes of code, 75% faster | 73 | 32-bit x86: +450 bytes of code, 75% faster |
diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c index a468397e3..75673e334 100644 --- a/libbb/hash_md5_sha.c +++ b/libbb/hash_md5_sha.c | |||
@@ -390,7 +390,6 @@ static void FAST_FUNC md5_process_block64(md5_ctx_t *ctx) | |||
390 | OP(FI, D, A, B, C, 11, 10, 0xbd3af235); | 390 | OP(FI, D, A, B, C, 11, 10, 0xbd3af235); |
391 | OP(FI, C, D, A, B, 2, 15, 0x2ad7d2bb); | 391 | OP(FI, C, D, A, B, 2, 15, 0x2ad7d2bb); |
392 | OP(FI, B, C, D, A, 9, 21, 0xeb86d391); | 392 | OP(FI, B, C, D, A, 9, 21, 0xeb86d391); |
393 | # undef OP | ||
394 | # endif | 393 | # endif |
395 | /* Add checksum to the starting values */ | 394 | /* Add checksum to the starting values */ |
396 | ctx->hash[0] += A; | 395 | ctx->hash[0] += A; |
@@ -399,6 +398,7 @@ static void FAST_FUNC md5_process_block64(md5_ctx_t *ctx) | |||
399 | ctx->hash[3] += D; | 398 | ctx->hash[3] += D; |
400 | #endif | 399 | #endif |
401 | } | 400 | } |
401 | #undef OP | ||
402 | #undef FF | 402 | #undef FF |
403 | #undef FG | 403 | #undef FG |
404 | #undef FH | 404 | #undef FH |
@@ -490,18 +490,87 @@ unsigned FAST_FUNC md5_end(md5_ctx_t *ctx, void *resbuf) | |||
490 | * then rebuild and compare "shaNNNsum bigfile" results. | 490 | * then rebuild and compare "shaNNNsum bigfile" results. |
491 | */ | 491 | */ |
492 | 492 | ||
493 | #if CONFIG_SHA1_SMALL == 0 | ||
494 | /* Fast, fully-unrolled SHA1. +3800 bytes of code on x86. | ||
495 | * It seems further speedup can be achieved by handling more than | ||
496 | * 64 bytes per one function call (coreutils does that). | ||
497 | */ | ||
498 | static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx) | ||
499 | { | ||
500 | static const uint32_t rconsts[] ALIGN4 = { | ||
501 | 0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6 | ||
502 | }; | ||
503 | uint32_t W[16]; | ||
504 | uint32_t a, b, c, d, e; | ||
505 | |||
506 | a = ctx->hash[0]; | ||
507 | b = ctx->hash[1]; | ||
508 | c = ctx->hash[2]; | ||
509 | d = ctx->hash[3]; | ||
510 | e = ctx->hash[4]; | ||
511 | |||
512 | #undef OP | ||
513 | #define OP(A,B,C,D,E, n) \ | ||
514 | do { \ | ||
515 | uint32_t work = EXPR(B, C, D); \ | ||
516 | if (n <= 15) \ | ||
517 | work += W[n & 0xf] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[n]); \ | ||
518 | if (n >= 16) \ | ||
519 | work += W[n & 0xf] = rotl32(W[(n+13) & 0xf] ^ W[(n+8) & 0xf] ^ W[(n+2) & 0xf] ^ W[n & 0xf], 1); \ | ||
520 | E += work + rotl32(A, 5) + rconsts[n / 20]; \ | ||
521 | B = rotl32(B, 30); \ | ||
522 | } while (0) | ||
523 | #define OP20(n) \ | ||
524 | OP(a,b,c,d,e, (n+ 0)); OP(e,a,b,c,d, (n+ 1)); OP(d,e,a,b,c, (n+ 2)); OP(c,d,e,a,b, (n+ 3)); OP(b,c,d,e,a, (n+ 4)); \ | ||
525 | OP(a,b,c,d,e, (n+ 5)); OP(e,a,b,c,d, (n+ 6)); OP(d,e,a,b,c, (n+ 7)); OP(c,d,e,a,b, (n+ 8)); OP(b,c,d,e,a, (n+ 9)); \ | ||
526 | OP(a,b,c,d,e, (n+10)); OP(e,a,b,c,d, (n+11)); OP(d,e,a,b,c, (n+12)); OP(c,d,e,a,b, (n+13)); OP(b,c,d,e,a, (n+14)); \ | ||
527 | OP(a,b,c,d,e, (n+15)); OP(e,a,b,c,d, (n+16)); OP(d,e,a,b,c, (n+17)); OP(c,d,e,a,b, (n+18)); OP(b,c,d,e,a, (n+19)) | ||
528 | |||
529 | /* 4 rounds of 20 operations each */ | ||
530 | #define EXPR(b,c,d) (((c ^ d) & b) ^ d) | ||
531 | OP20(0); | ||
532 | #undef EXPR | ||
533 | #define EXPR(b,c,d) (c ^ d ^ b) | ||
534 | OP20(20); | ||
535 | #undef EXPR | ||
536 | #define EXPR(b,c,d) (((b | c) & d) | (b & c)) | ||
537 | OP20(40); | ||
538 | #undef EXPR | ||
539 | #define EXPR(b,c,d) (c ^ d ^ b) | ||
540 | OP20(60); | ||
541 | |||
542 | #undef EXPR | ||
543 | #undef OP | ||
544 | #undef OP20 | ||
545 | |||
546 | ctx->hash[0] += a; | ||
547 | ctx->hash[1] += b; | ||
548 | ctx->hash[2] += c; | ||
549 | ctx->hash[3] += d; | ||
550 | ctx->hash[4] += e; | ||
551 | } | ||
552 | #else | ||
553 | /* TODO: for CONFIG_SHA1_SMALL == 1, have a partially unrolled version? */ | ||
554 | |||
555 | /* Compact version, almost twice as slow as fully unrolled */ | ||
493 | static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx) | 556 | static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx) |
494 | { | 557 | { |
495 | static const uint32_t rconsts[] ALIGN4 = { | 558 | static const uint32_t rconsts[] ALIGN4 = { |
496 | 0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6 | 559 | 0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6 |
497 | }; | 560 | }; |
498 | int i, j; | 561 | int i, j; |
499 | int cnt; | 562 | int n; |
500 | uint32_t W[16+16]; | 563 | uint32_t W[16+16]; |
501 | uint32_t a, b, c, d, e; | 564 | uint32_t a, b, c, d, e; |
502 | 565 | ||
503 | /* On-stack work buffer frees up one register in the main loop | 566 | /* On-stack work buffer frees up one register in the main loop |
504 | * which otherwise will be needed to hold ctx pointer */ | 567 | * which otherwise will be needed to hold ctx pointer. |
568 | * | ||
569 | * The compiler is not smart enough to realize it, though. :( | ||
570 | * If __attribute__((optimize("2"))) is added to the function, | ||
571 | * only then gcc-9.3.1 spills "ctx" to stack and uses the freed | ||
572 | * register (making code 6 bytes smaller, not just faster). | ||
573 | */ | ||
505 | for (i = 0; i < 16; i++) | 574 | for (i = 0; i < 16; i++) |
506 | W[i] = W[i+16] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[i]); | 575 | W[i] = W[i+16] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[i]); |
507 | 576 | ||
@@ -512,7 +581,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx) | |||
512 | e = ctx->hash[4]; | 581 | e = ctx->hash[4]; |
513 | 582 | ||
514 | /* 4 rounds of 20 operations each */ | 583 | /* 4 rounds of 20 operations each */ |
515 | cnt = 0; | 584 | n = 0; |
516 | for (i = 0; i < 4; i++) { | 585 | for (i = 0; i < 4; i++) { |
517 | j = 19; | 586 | j = 19; |
518 | do { | 587 | do { |
@@ -529,9 +598,9 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx) | |||
529 | else /* i = 1 or 3 */ | 598 | else /* i = 1 or 3 */ |
530 | work ^= b; | 599 | work ^= b; |
531 | ge16: | 600 | ge16: |
532 | W[cnt] = W[cnt+16] = rotl32(W[cnt+13] ^ W[cnt+8] ^ W[cnt+2] ^ W[cnt], 1); | 601 | W[n] = W[n+16] = rotl32(W[n+13] ^ W[n+8] ^ W[n+2] ^ W[n], 1); |
533 | } | 602 | } |
534 | work += W[cnt]; | 603 | work += W[n]; |
535 | work += e + rotl32(a, 5) + rconsts[i]; | 604 | work += e + rotl32(a, 5) + rconsts[i]; |
536 | 605 | ||
537 | /* Rotate by one for next time */ | 606 | /* Rotate by one for next time */ |
@@ -540,7 +609,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx) | |||
540 | c = rotl32(b, 30); | 609 | c = rotl32(b, 30); |
541 | b = a; | 610 | b = a; |
542 | a = work; | 611 | a = work; |
543 | cnt = (cnt + 1) & 15; | 612 | n = (n + 1) & 15; |
544 | } while (--j >= 0); | 613 | } while (--j >= 0); |
545 | } | 614 | } |
546 | 615 | ||
@@ -550,6 +619,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx) | |||
550 | ctx->hash[3] += d; | 619 | ctx->hash[3] += d; |
551 | ctx->hash[4] += e; | 620 | ctx->hash[4] += e; |
552 | } | 621 | } |
622 | #endif | ||
553 | 623 | ||
554 | /* Constants for SHA512 from FIPS 180-2:4.2.3. | 624 | /* Constants for SHA512 from FIPS 180-2:4.2.3. |
555 | * SHA256 constants from FIPS 180-2:4.2.2 | 625 | * SHA256 constants from FIPS 180-2:4.2.2 |