about summary refs log tree commit diff
path: root/libbb/hash_md5_sha.c
diff options
context:
space:
mode:
authorDenys Vlasenko <vda.linux@googlemail.com>2021-12-30 13:07:12 +0100
committerDenys Vlasenko <vda.linux@googlemail.com>2021-12-30 13:17:16 +0100
commit25aadc893d21b35f7d34a9d1edc843632e7abd8f (patch)
treef8d2d860f06eeccb254e071e39213bc2a3f723ef /libbb/hash_md5_sha.c
parent9173c9cce48dc4c867fb06bb72e8c762740c5c86 (diff)
downloadbusybox-w32-25aadc893d21b35f7d34a9d1edc843632e7abd8f.tar.gz
busybox-w32-25aadc893d21b35f7d34a9d1edc843632e7abd8f.tar.bz2
busybox-w32-25aadc893d21b35f7d34a9d1edc843632e7abd8f.zip
libbb/sha1: add config-selectable fully unrolled version, closes 14391
function                                             old     new   delta
sha1_process_block64                                 364    4167   +3803
static.rconsts                                        16       -     -16
------------------------------------------------------------------------------
(add/remove: 0/1 grow/shrink: 1/0 up/down: 3803/-16)         Total: 3787 bytes

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
Diffstat (limited to 'libbb/hash_md5_sha.c')
-rw-r--r--libbb/hash_md5_sha.c84
1 files changed, 77 insertions, 7 deletions
diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c
index a468397e3..75673e334 100644
--- a/libbb/hash_md5_sha.c
+++ b/libbb/hash_md5_sha.c
@@ -390,7 +390,6 @@ static void FAST_FUNC md5_process_block64(md5_ctx_t *ctx)
390 OP(FI, D, A, B, C, 11, 10, 0xbd3af235); 390 OP(FI, D, A, B, C, 11, 10, 0xbd3af235);
391 OP(FI, C, D, A, B, 2, 15, 0x2ad7d2bb); 391 OP(FI, C, D, A, B, 2, 15, 0x2ad7d2bb);
392 OP(FI, B, C, D, A, 9, 21, 0xeb86d391); 392 OP(FI, B, C, D, A, 9, 21, 0xeb86d391);
393# undef OP
394# endif 393# endif
395 /* Add checksum to the starting values */ 394 /* Add checksum to the starting values */
396 ctx->hash[0] += A; 395 ctx->hash[0] += A;
@@ -399,6 +398,7 @@ static void FAST_FUNC md5_process_block64(md5_ctx_t *ctx)
399 ctx->hash[3] += D; 398 ctx->hash[3] += D;
400#endif 399#endif
401} 400}
401#undef OP
402#undef FF 402#undef FF
403#undef FG 403#undef FG
404#undef FH 404#undef FH
@@ -490,18 +490,87 @@ unsigned FAST_FUNC md5_end(md5_ctx_t *ctx, void *resbuf)
490 * then rebuild and compare "shaNNNsum bigfile" results. 490 * then rebuild and compare "shaNNNsum bigfile" results.
491 */ 491 */
492 492
493#if CONFIG_SHA1_SMALL == 0
494/* Fast, fully-unrolled SHA1. +3800 bytes of code on x86.
495 * It seems further speedup can be achieved by handling more than
496 * 64 bytes per one function call (coreutils does that).
497 */
498static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
499{
500 static const uint32_t rconsts[] ALIGN4 = {
501 0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
502 };
503 uint32_t W[16];
504 uint32_t a, b, c, d, e;
505
506 a = ctx->hash[0];
507 b = ctx->hash[1];
508 c = ctx->hash[2];
509 d = ctx->hash[3];
510 e = ctx->hash[4];
511
512#undef OP
513#define OP(A,B,C,D,E, n) \
514 do { \
515 uint32_t work = EXPR(B, C, D); \
516 if (n <= 15) \
517 work += W[n & 0xf] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[n]); \
518 if (n >= 16) \
519 work += W[n & 0xf] = rotl32(W[(n+13) & 0xf] ^ W[(n+8) & 0xf] ^ W[(n+2) & 0xf] ^ W[n & 0xf], 1); \
520 E += work + rotl32(A, 5) + rconsts[n / 20]; \
521 B = rotl32(B, 30); \
522 } while (0)
523#define OP20(n) \
524 OP(a,b,c,d,e, (n+ 0)); OP(e,a,b,c,d, (n+ 1)); OP(d,e,a,b,c, (n+ 2)); OP(c,d,e,a,b, (n+ 3)); OP(b,c,d,e,a, (n+ 4)); \
525 OP(a,b,c,d,e, (n+ 5)); OP(e,a,b,c,d, (n+ 6)); OP(d,e,a,b,c, (n+ 7)); OP(c,d,e,a,b, (n+ 8)); OP(b,c,d,e,a, (n+ 9)); \
526 OP(a,b,c,d,e, (n+10)); OP(e,a,b,c,d, (n+11)); OP(d,e,a,b,c, (n+12)); OP(c,d,e,a,b, (n+13)); OP(b,c,d,e,a, (n+14)); \
527 OP(a,b,c,d,e, (n+15)); OP(e,a,b,c,d, (n+16)); OP(d,e,a,b,c, (n+17)); OP(c,d,e,a,b, (n+18)); OP(b,c,d,e,a, (n+19))
528
529 /* 4 rounds of 20 operations each */
530#define EXPR(b,c,d) (((c ^ d) & b) ^ d)
531 OP20(0);
532#undef EXPR
533#define EXPR(b,c,d) (c ^ d ^ b)
534 OP20(20);
535#undef EXPR
536#define EXPR(b,c,d) (((b | c) & d) | (b & c))
537 OP20(40);
538#undef EXPR
539#define EXPR(b,c,d) (c ^ d ^ b)
540 OP20(60);
541
542#undef EXPR
543#undef OP
544#undef OP20
545
546 ctx->hash[0] += a;
547 ctx->hash[1] += b;
548 ctx->hash[2] += c;
549 ctx->hash[3] += d;
550 ctx->hash[4] += e;
551}
552#else
553/* TODO: for CONFIG_SHA1_SMALL == 1, have a partially unrolled version? */
554
555/* Compact version, almost twice as slow as fully unrolled */
493static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx) 556static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
494{ 557{
495 static const uint32_t rconsts[] ALIGN4 = { 558 static const uint32_t rconsts[] ALIGN4 = {
496 0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6 559 0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
497 }; 560 };
498 int i, j; 561 int i, j;
499 int cnt; 562 int n;
500 uint32_t W[16+16]; 563 uint32_t W[16+16];
501 uint32_t a, b, c, d, e; 564 uint32_t a, b, c, d, e;
502 565
503 /* On-stack work buffer frees up one register in the main loop 566 /* On-stack work buffer frees up one register in the main loop
504 * which otherwise will be needed to hold ctx pointer */ 567 * which otherwise will be needed to hold ctx pointer.
568 *
569 * The compiler is not smart enough to realize it, though. :(
570 * If __attribute__((optimize("2"))) is added to the function,
571 * only then gcc-9.3.1 spills "ctx" to stack and uses the freed
572 * register (making code 6 bytes smaller, not just faster).
573 */
505 for (i = 0; i < 16; i++) 574 for (i = 0; i < 16; i++)
506 W[i] = W[i+16] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[i]); 575 W[i] = W[i+16] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[i]);
507 576
@@ -512,7 +581,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
512 e = ctx->hash[4]; 581 e = ctx->hash[4];
513 582
514 /* 4 rounds of 20 operations each */ 583 /* 4 rounds of 20 operations each */
515 cnt = 0; 584 n = 0;
516 for (i = 0; i < 4; i++) { 585 for (i = 0; i < 4; i++) {
517 j = 19; 586 j = 19;
518 do { 587 do {
@@ -529,9 +598,9 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
529 else /* i = 1 or 3 */ 598 else /* i = 1 or 3 */
530 work ^= b; 599 work ^= b;
531 ge16: 600 ge16:
532 W[cnt] = W[cnt+16] = rotl32(W[cnt+13] ^ W[cnt+8] ^ W[cnt+2] ^ W[cnt], 1); 601 W[n] = W[n+16] = rotl32(W[n+13] ^ W[n+8] ^ W[n+2] ^ W[n], 1);
533 } 602 }
534 work += W[cnt]; 603 work += W[n];
535 work += e + rotl32(a, 5) + rconsts[i]; 604 work += e + rotl32(a, 5) + rconsts[i];
536 605
537 /* Rotate by one for next time */ 606 /* Rotate by one for next time */
@@ -540,7 +609,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
540 c = rotl32(b, 30); 609 c = rotl32(b, 30);
541 b = a; 610 b = a;
542 a = work; 611 a = work;
543 cnt = (cnt + 1) & 15; 612 n = (n + 1) & 15;
544 } while (--j >= 0); 613 } while (--j >= 0);
545 } 614 }
546 615
@@ -550,6 +619,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
550 ctx->hash[3] += d; 619 ctx->hash[3] += d;
551 ctx->hash[4] += e; 620 ctx->hash[4] += e;
552} 621}
622#endif
553 623
554/* Constants for SHA512 from FIPS 180-2:4.2.3. 624/* Constants for SHA512 from FIPS 180-2:4.2.3.
555 * SHA256 constants from FIPS 180-2:4.2.2 625 * SHA256 constants from FIPS 180-2:4.2.2