aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDenys Vlasenko <vda.linux@googlemail.com>2021-12-30 13:07:12 +0100
committerDenys Vlasenko <vda.linux@googlemail.com>2021-12-30 13:17:16 +0100
commit25aadc893d21b35f7d34a9d1edc843632e7abd8f (patch)
treef8d2d860f06eeccb254e071e39213bc2a3f723ef
parent9173c9cce48dc4c867fb06bb72e8c762740c5c86 (diff)
downloadbusybox-w32-25aadc893d21b35f7d34a9d1edc843632e7abd8f.tar.gz
busybox-w32-25aadc893d21b35f7d34a9d1edc843632e7abd8f.tar.bz2
busybox-w32-25aadc893d21b35f7d34a9d1edc843632e7abd8f.zip
libbb/sha1: add config-selectable fully unrolled version, closes 14391
function old new delta sha1_process_block64 364 4167 +3803 static.rconsts 16 - -16 ------------------------------------------------------------------------------ (add/remove: 0/1 grow/shrink: 1/0 up/down: 3803/-16) Total: 3787 bytes Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
-rw-r--r--libbb/Config.src25
-rw-r--r--libbb/hash_md5_sha.c84
2 files changed, 95 insertions, 14 deletions
diff --git a/libbb/Config.src b/libbb/Config.src
index 24b31fad9..13188ef03 100644
--- a/libbb/Config.src
+++ b/libbb/Config.src
@@ -42,21 +42,32 @@ config MD5_SMALL
42 default 1 # all "fast or small" options default to small 42 default 1 # all "fast or small" options default to small
43 range 0 3 43 range 0 3
44 help 44 help
45 Trade binary size versus speed for the md5sum algorithm. 45 Trade binary size versus speed for the md5 algorithm.
46 Approximate values running uClibc and hashing 46 Approximate values running uClibc and hashing
47 linux-2.4.4.tar.bz2 were: 47 linux-2.4.4.tar.bz2 were:
48 value user times (sec) text size (386) 48 value user times (sec) text size (386)
49 0 (fastest) 1.1 6144 49 0 (fastest) 1.1 6144
50 1 1.4 5392 50 1 1.4 5392
51 2 3.0 5088 51 2 3.0 5088
52 3 (smallest) 5.1 4912 52 3 (smallest) 5.1 4912
53
54config SHA1_SMALL
55 int "SHA1: Trade bytes for speed (0:fast, 3:slow)"
56 default 3 # all "fast or small" options default to small
57 range 0 3
58 help
59 Trade binary size versus speed for the sha1 algorithm.
60 throughput MB/s size of sha1_process_block64
61 value 486 x86-64 486 x86-64
62 0 339 374 4149 4167
63 1,2,3 200 195 358 380
53 64
54config SHA3_SMALL 65config SHA3_SMALL
55 int "SHA3: Trade bytes for speed (0:fast, 1:slow)" 66 int "SHA3: Trade bytes for speed (0:fast, 1:slow)"
56 default 1 # all "fast or small" options default to small 67 default 1 # all "fast or small" options default to small
57 range 0 1 68 range 0 1
58 help 69 help
59 Trade binary size versus speed for the sha3sum algorithm. 70 Trade binary size versus speed for the sha3 algorithm.
60 SHA3_SMALL=0 compared to SHA3_SMALL=1 (approximate): 71 SHA3_SMALL=0 compared to SHA3_SMALL=1 (approximate):
61 64-bit x86: +270 bytes of code, 45% faster 72 64-bit x86: +270 bytes of code, 45% faster
62 32-bit x86: +450 bytes of code, 75% faster 73 32-bit x86: +450 bytes of code, 75% faster
diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c
index a468397e3..75673e334 100644
--- a/libbb/hash_md5_sha.c
+++ b/libbb/hash_md5_sha.c
@@ -390,7 +390,6 @@ static void FAST_FUNC md5_process_block64(md5_ctx_t *ctx)
390 OP(FI, D, A, B, C, 11, 10, 0xbd3af235); 390 OP(FI, D, A, B, C, 11, 10, 0xbd3af235);
391 OP(FI, C, D, A, B, 2, 15, 0x2ad7d2bb); 391 OP(FI, C, D, A, B, 2, 15, 0x2ad7d2bb);
392 OP(FI, B, C, D, A, 9, 21, 0xeb86d391); 392 OP(FI, B, C, D, A, 9, 21, 0xeb86d391);
393# undef OP
394# endif 393# endif
395 /* Add checksum to the starting values */ 394 /* Add checksum to the starting values */
396 ctx->hash[0] += A; 395 ctx->hash[0] += A;
@@ -399,6 +398,7 @@ static void FAST_FUNC md5_process_block64(md5_ctx_t *ctx)
399 ctx->hash[3] += D; 398 ctx->hash[3] += D;
400#endif 399#endif
401} 400}
401#undef OP
402#undef FF 402#undef FF
403#undef FG 403#undef FG
404#undef FH 404#undef FH
@@ -490,18 +490,87 @@ unsigned FAST_FUNC md5_end(md5_ctx_t *ctx, void *resbuf)
490 * then rebuild and compare "shaNNNsum bigfile" results. 490 * then rebuild and compare "shaNNNsum bigfile" results.
491 */ 491 */
492 492
493#if CONFIG_SHA1_SMALL == 0
494/* Fast, fully-unrolled SHA1. +3800 bytes of code on x86.
495 * It seems further speedup can be achieved by handling more than
496 * 64 bytes per one function call (coreutils does that).
497 */
498static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
499{
500 static const uint32_t rconsts[] ALIGN4 = {
501 0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
502 };
503 uint32_t W[16];
504 uint32_t a, b, c, d, e;
505
506 a = ctx->hash[0];
507 b = ctx->hash[1];
508 c = ctx->hash[2];
509 d = ctx->hash[3];
510 e = ctx->hash[4];
511
512#undef OP
513#define OP(A,B,C,D,E, n) \
514 do { \
515 uint32_t work = EXPR(B, C, D); \
516 if (n <= 15) \
517 work += W[n & 0xf] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[n]); \
518 if (n >= 16) \
519 work += W[n & 0xf] = rotl32(W[(n+13) & 0xf] ^ W[(n+8) & 0xf] ^ W[(n+2) & 0xf] ^ W[n & 0xf], 1); \
520 E += work + rotl32(A, 5) + rconsts[n / 20]; \
521 B = rotl32(B, 30); \
522 } while (0)
523#define OP20(n) \
524 OP(a,b,c,d,e, (n+ 0)); OP(e,a,b,c,d, (n+ 1)); OP(d,e,a,b,c, (n+ 2)); OP(c,d,e,a,b, (n+ 3)); OP(b,c,d,e,a, (n+ 4)); \
525 OP(a,b,c,d,e, (n+ 5)); OP(e,a,b,c,d, (n+ 6)); OP(d,e,a,b,c, (n+ 7)); OP(c,d,e,a,b, (n+ 8)); OP(b,c,d,e,a, (n+ 9)); \
526 OP(a,b,c,d,e, (n+10)); OP(e,a,b,c,d, (n+11)); OP(d,e,a,b,c, (n+12)); OP(c,d,e,a,b, (n+13)); OP(b,c,d,e,a, (n+14)); \
527 OP(a,b,c,d,e, (n+15)); OP(e,a,b,c,d, (n+16)); OP(d,e,a,b,c, (n+17)); OP(c,d,e,a,b, (n+18)); OP(b,c,d,e,a, (n+19))
528
529 /* 4 rounds of 20 operations each */
530#define EXPR(b,c,d) (((c ^ d) & b) ^ d)
531 OP20(0);
532#undef EXPR
533#define EXPR(b,c,d) (c ^ d ^ b)
534 OP20(20);
535#undef EXPR
536#define EXPR(b,c,d) (((b | c) & d) | (b & c))
537 OP20(40);
538#undef EXPR
539#define EXPR(b,c,d) (c ^ d ^ b)
540 OP20(60);
541
542#undef EXPR
543#undef OP
544#undef OP20
545
546 ctx->hash[0] += a;
547 ctx->hash[1] += b;
548 ctx->hash[2] += c;
549 ctx->hash[3] += d;
550 ctx->hash[4] += e;
551}
552#else
553/* TODO: for CONFIG_SHA1_SMALL == 1, have a partially unrolled version? */
554
555/* Compact version, almost twice as slow as fully unrolled */
493static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx) 556static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
494{ 557{
495 static const uint32_t rconsts[] ALIGN4 = { 558 static const uint32_t rconsts[] ALIGN4 = {
496 0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6 559 0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
497 }; 560 };
498 int i, j; 561 int i, j;
499 int cnt; 562 int n;
500 uint32_t W[16+16]; 563 uint32_t W[16+16];
501 uint32_t a, b, c, d, e; 564 uint32_t a, b, c, d, e;
502 565
503 /* On-stack work buffer frees up one register in the main loop 566 /* On-stack work buffer frees up one register in the main loop
504 * which otherwise will be needed to hold ctx pointer */ 567 * which otherwise will be needed to hold ctx pointer.
568 *
569 * The compiler is not smart enough to realize it, though. :(
570 * If __attribute__((optimize("2"))) is added to the function,
571 * only then gcc-9.3.1 spills "ctx" to stack and uses the freed
572 * register (making code 6 bytes smaller, not just faster).
573 */
505 for (i = 0; i < 16; i++) 574 for (i = 0; i < 16; i++)
506 W[i] = W[i+16] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[i]); 575 W[i] = W[i+16] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[i]);
507 576
@@ -512,7 +581,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
512 e = ctx->hash[4]; 581 e = ctx->hash[4];
513 582
514 /* 4 rounds of 20 operations each */ 583 /* 4 rounds of 20 operations each */
515 cnt = 0; 584 n = 0;
516 for (i = 0; i < 4; i++) { 585 for (i = 0; i < 4; i++) {
517 j = 19; 586 j = 19;
518 do { 587 do {
@@ -529,9 +598,9 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
529 else /* i = 1 or 3 */ 598 else /* i = 1 or 3 */
530 work ^= b; 599 work ^= b;
531 ge16: 600 ge16:
532 W[cnt] = W[cnt+16] = rotl32(W[cnt+13] ^ W[cnt+8] ^ W[cnt+2] ^ W[cnt], 1); 601 W[n] = W[n+16] = rotl32(W[n+13] ^ W[n+8] ^ W[n+2] ^ W[n], 1);
533 } 602 }
534 work += W[cnt]; 603 work += W[n];
535 work += e + rotl32(a, 5) + rconsts[i]; 604 work += e + rotl32(a, 5) + rconsts[i];
536 605
537 /* Rotate by one for next time */ 606 /* Rotate by one for next time */
@@ -540,7 +609,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
540 c = rotl32(b, 30); 609 c = rotl32(b, 30);
541 b = a; 610 b = a;
542 a = work; 611 a = work;
543 cnt = (cnt + 1) & 15; 612 n = (n + 1) & 15;
544 } while (--j >= 0); 613 } while (--j >= 0);
545 } 614 }
546 615
@@ -550,6 +619,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
550 ctx->hash[3] += d; 619 ctx->hash[3] += d;
551 ctx->hash[4] += e; 620 ctx->hash[4] += e;
552} 621}
622#endif
553 623
554/* Constants for SHA512 from FIPS 180-2:4.2.3. 624/* Constants for SHA512 from FIPS 180-2:4.2.3.
555 * SHA256 constants from FIPS 180-2:4.2.2 625 * SHA256 constants from FIPS 180-2:4.2.2