Diffstat (limited to 'libbb/hash_md5_sha.c')
-rw-r--r--  libbb/hash_md5_sha.c  415
1 file changed, 404 insertions, 11 deletions
diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c
index e0db8ce67..ee19c1cb7 100644
--- a/libbb/hash_md5_sha.c
+++ b/libbb/hash_md5_sha.c
@@ -8,6 +8,9 @@
  */
 #include "libbb.h"
 
+#define STR1(s) #s
+#define STR(s) STR1(s)
+
 #define NEED_SHA512 (ENABLE_SHA512SUM || ENABLE_USE_BB_CRYPT_SHA)
 
 /* gcc 4.2.1 optimizes rotr64 better with inline than with macro
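(Aside: the STR1()/STR() pair added in the hunk above is the usual two-level C stringification idiom; the extra level forces the argument to be macro-expanded before it is stringified. The RD1*/RD2/RD3/RD4* round macros added further down rely on this to paste the numeric RCONST value and the computed W[] stack offsets into the inline-asm strings. A minimal standalone sketch of the idiom, for illustration only and not part of this diff:

	#include <stdio.h>

	#define STR1(s) #s        /* stringifies the token exactly as written */
	#define STR(s)  STR1(s)   /* expands the argument first, then stringifies */

	#define RCONST 0x5A827999

	int main(void)
	{
		/* STR1() sees the macro name; STR() sees its expansion. */
		puts(STR1(RCONST));   /* prints: RCONST */
		puts(STR(RCONST));    /* prints: 0x5A827999 */
		return 0;
	}
)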
@@ -390,7 +393,6 @@ static void FAST_FUNC md5_process_block64(md5_ctx_t *ctx)
 	OP(FI, D, A, B, C, 11, 10, 0xbd3af235);
 	OP(FI, C, D, A, B, 2, 15, 0x2ad7d2bb);
 	OP(FI, B, C, D, A, 9, 21, 0xeb86d391);
-# undef OP
 # endif
 	/* Add checksum to the starting values */
 	ctx->hash[0] += A;
@@ -399,6 +401,7 @@ static void FAST_FUNC md5_process_block64(md5_ctx_t *ctx)
 	ctx->hash[3] += D;
 #endif
 }
+#undef OP
 #undef FF
 #undef FG
 #undef FH
@@ -490,18 +493,410 @@ unsigned FAST_FUNC md5_end(md5_ctx_t *ctx, void *resbuf)
  * then rebuild and compare "shaNNNsum bigfile" results.
  */
 
+#if CONFIG_SHA1_SMALL == 0
+# if defined(__GNUC__) && defined(__i386__)
+static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM)
+{
+	BUILD_BUG_ON(offsetof(sha1_ctx_t, hash) != 76);
+	asm(
+"\n\
+	pushl %ebp # \n\
+	pushl %edi # \n\
+	pushl %esi # \n\
+	pushl %ebx # \n\
+	pushl %eax \n\
+	movl $15, %edi \n\
+1: \n\
+	movl (%eax,%edi,4), %esi \n\
+	bswap %esi \n\
+	pushl %esi \n\
+	decl %edi \n\
+	jns 1b \n\
+	movl 80(%eax), %ebx # b = ctx->hash[1] \n\
+	movl 84(%eax), %ecx # c = ctx->hash[2] \n\
+	movl 88(%eax), %edx # d = ctx->hash[3] \n\
+	movl 92(%eax), %ebp # e = ctx->hash[4] \n\
+	movl 76(%eax), %eax # a = ctx->hash[0] \n\
+#Register and stack use: \n\
+# eax..edx: a..d \n\
+# ebp: e \n\
+# esi,edi: temps \n\
+# 4*n(%esp): W[n] \n\
+"
+#define RD1As(a,b,c,d,e, n, RCONST) \
+"\n\
+	##movl 4*"n"(%esp), %esi # n=0, W[0] already in %esi \n\
+	movl "c", %edi # c \n\
+	xorl "d", %edi # ^d \n\
+	andl "b", %edi # &b \n\
+	xorl "d", %edi # (((c ^ d) & b) ^ d) \n\
+	leal "RCONST"("e",%esi), "e" # e += RCONST + W[n] \n\
+	addl %edi, "e" # e += (((c ^ d) & b) ^ d) \n\
+	movl "a", %esi # \n\
+	roll $5, %esi # rotl32(a,5) \n\
+	addl %esi, "e" # e += rotl32(a,5) \n\
+	rorl $2, "b" # b = rotl32(b,30) \n\
+"
+#define RD1Bs(a,b,c,d,e, n, RCONST) \
+"\n\
+	movl 4*"n"(%esp), %esi # W[n] \n\
+	movl "c", %edi # c \n\
+	xorl "d", %edi # ^d \n\
+	andl "b", %edi # &b \n\
+	xorl "d", %edi # (((c ^ d) & b) ^ d) \n\
+	leal "RCONST"("e",%esi), "e" # e += RCONST + W[n] \n\
+	addl %edi, "e" # e += (((c ^ d) & b) ^ d) \n\
+	movl "a", %esi # \n\
+	roll $5, %esi # rotl32(a,5) \n\
+	addl %esi, "e" # e += rotl32(a,5) \n\
+	rorl $2, "b" # b = rotl32(b,30) \n\
+"
+#define RD1Cs(a,b,c,d,e, n13,n8,n2,n, RCONST) \
+"\n\
+	movl 4*"n13"(%esp), %esi # W[(n+13) & 15] \n\
+	xorl 4*"n8"(%esp), %esi # ^W[(n+8) & 15] \n\
+	xorl 4*"n2"(%esp), %esi # ^W[(n+2) & 15] \n\
+	xorl 4*"n"(%esp), %esi # ^W[n & 15] \n\
+	roll %esi # \n\
+	movl %esi, 4*"n"(%esp) # store to W[n & 15] \n\
+	movl "c", %edi # c \n\
+	xorl "d", %edi # ^d \n\
+	andl "b", %edi # &b \n\
+	xorl "d", %edi # (((c ^ d) & b) ^ d) \n\
+	leal "RCONST"("e",%esi), "e" # e += RCONST + mixed_W \n\
+	addl %edi, "e" # e += (((c ^ d) & b) ^ d) \n\
+	movl "a", %esi # \n\
+	roll $5, %esi # rotl32(a,5) \n\
+	addl %esi, "e" # e += rotl32(a,5) \n\
+	rorl $2, "b" # b = rotl32(b,30) \n\
+"
+#define RD1A(a,b,c,d,e, n) RD1As("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR((n)), STR(RCONST))
+#define RD1B(a,b,c,d,e, n) RD1Bs("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR((n)), STR(RCONST))
+#define RD1C(a,b,c,d,e, n) RD1Cs("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR(((n+13)&15)), STR(((n+8)&15)), STR(((n+2)&15)), STR(((n)&15)), STR(RCONST))
+#undef RCONST
+#define RCONST 0x5A827999
+	RD1A(ax,bx,cx,dx,bp, 0) RD1B(bp,ax,bx,cx,dx, 1) RD1B(dx,bp,ax,bx,cx, 2) RD1B(cx,dx,bp,ax,bx, 3) RD1B(bx,cx,dx,bp,ax, 4)
+	RD1B(ax,bx,cx,dx,bp, 5) RD1B(bp,ax,bx,cx,dx, 6) RD1B(dx,bp,ax,bx,cx, 7) RD1B(cx,dx,bp,ax,bx, 8) RD1B(bx,cx,dx,bp,ax, 9)
+	RD1B(ax,bx,cx,dx,bp,10) RD1B(bp,ax,bx,cx,dx,11) RD1B(dx,bp,ax,bx,cx,12) RD1B(cx,dx,bp,ax,bx,13) RD1B(bx,cx,dx,bp,ax,14)
+	RD1B(ax,bx,cx,dx,bp,15) RD1C(bp,ax,bx,cx,dx,16) RD1C(dx,bp,ax,bx,cx,17) RD1C(cx,dx,bp,ax,bx,18) RD1C(bx,cx,dx,bp,ax,19)
+#define RD2s(a,b,c,d,e, n13,n8,n2,n, RCONST) \
+"\n\
+	movl 4*"n13"(%esp), %esi # W[(n+13) & 15] \n\
+	xorl 4*"n8"(%esp), %esi # ^W[(n+8) & 15] \n\
+	xorl 4*"n2"(%esp), %esi # ^W[(n+2) & 15] \n\
+	xorl 4*"n"(%esp), %esi # ^W[n & 15] \n\
+	roll %esi # \n\
+	movl %esi, 4*"n"(%esp) # store to W[n & 15] \n\
+	movl "c", %edi # c \n\
+	xorl "d", %edi # ^d \n\
+	xorl "b", %edi # ^b \n\
+	leal "RCONST"("e",%esi), "e" # e += RCONST + mixed_W \n\
+	addl %edi, "e" # e += (c ^ d ^ b) \n\
+	movl "a", %esi # \n\
+	roll $5, %esi # rotl32(a,5) \n\
+	addl %esi, "e" # e += rotl32(a,5) \n\
+	rorl $2, "b" # b = rotl32(b,30) \n\
+"
+#define RD2(a,b,c,d,e, n) RD2s("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR(((20+n+13)&15)), STR(((20+n+8)&15)), STR(((20+n+2)&15)), STR(((20+n)&15)), STR(RCONST))
+#undef RCONST
+#define RCONST 0x6ED9EBA1
+	RD2(ax,bx,cx,dx,bp, 0) RD2(bp,ax,bx,cx,dx, 1) RD2(dx,bp,ax,bx,cx, 2) RD2(cx,dx,bp,ax,bx, 3) RD2(bx,cx,dx,bp,ax, 4)
+	RD2(ax,bx,cx,dx,bp, 5) RD2(bp,ax,bx,cx,dx, 6) RD2(dx,bp,ax,bx,cx, 7) RD2(cx,dx,bp,ax,bx, 8) RD2(bx,cx,dx,bp,ax, 9)
+	RD2(ax,bx,cx,dx,bp,10) RD2(bp,ax,bx,cx,dx,11) RD2(dx,bp,ax,bx,cx,12) RD2(cx,dx,bp,ax,bx,13) RD2(bx,cx,dx,bp,ax,14)
+	RD2(ax,bx,cx,dx,bp,15) RD2(bp,ax,bx,cx,dx,16) RD2(dx,bp,ax,bx,cx,17) RD2(cx,dx,bp,ax,bx,18) RD2(bx,cx,dx,bp,ax,19)
+
+#define RD3s(a,b,c,d,e, n13,n8,n2,n, RCONST) \
+"\n\
+	movl "b", %edi # di: b \n\
+	movl "b", %esi # si: b \n\
+	orl "c", %edi # di: b | c \n\
+	andl "c", %esi # si: b & c \n\
+	andl "d", %edi # di: (b | c) & d \n\
+	orl %esi, %edi # ((b | c) & d) | (b & c) \n\
+	movl 4*"n13"(%esp), %esi # W[(n+13) & 15] \n\
+	xorl 4*"n8"(%esp), %esi # ^W[(n+8) & 15] \n\
+	xorl 4*"n2"(%esp), %esi # ^W[(n+2) & 15] \n\
+	xorl 4*"n"(%esp), %esi # ^W[n & 15] \n\
+	roll %esi # \n\
+	movl %esi, 4*"n"(%esp) # store to W[n & 15] \n\
+	addl %edi, "e" # += ((b | c) & d) | (b & c) \n\
+	leal "RCONST"("e",%esi), "e" # e += RCONST + mixed_W \n\
+	movl "a", %esi # \n\
+	roll $5, %esi # rotl32(a,5) \n\
+	addl %esi, "e" # e += rotl32(a,5) \n\
+	rorl $2, "b" # b = rotl32(b,30) \n\
+"
+#define RD3(a,b,c,d,e, n) RD3s("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR(((40+n+13)&15)), STR(((40+n+8)&15)), STR(((40+n+2)&15)), STR(((40+n)&15)), STR(RCONST))
+#undef RCONST
+#define RCONST 0x8F1BBCDC
+	RD3(ax,bx,cx,dx,bp, 0) RD3(bp,ax,bx,cx,dx, 1) RD3(dx,bp,ax,bx,cx, 2) RD3(cx,dx,bp,ax,bx, 3) RD3(bx,cx,dx,bp,ax, 4)
+	RD3(ax,bx,cx,dx,bp, 5) RD3(bp,ax,bx,cx,dx, 6) RD3(dx,bp,ax,bx,cx, 7) RD3(cx,dx,bp,ax,bx, 8) RD3(bx,cx,dx,bp,ax, 9)
+	RD3(ax,bx,cx,dx,bp,10) RD3(bp,ax,bx,cx,dx,11) RD3(dx,bp,ax,bx,cx,12) RD3(cx,dx,bp,ax,bx,13) RD3(bx,cx,dx,bp,ax,14)
+	RD3(ax,bx,cx,dx,bp,15) RD3(bp,ax,bx,cx,dx,16) RD3(dx,bp,ax,bx,cx,17) RD3(cx,dx,bp,ax,bx,18) RD3(bx,cx,dx,bp,ax,19)
+
+#define RD4As(a,b,c,d,e, n13,n8,n2,n, RCONST) \
+"\n\
+	movl 4*"n13"(%esp), %esi # W[(n+13) & 15] \n\
+	xorl 4*"n8"(%esp), %esi # ^W[(n+8) & 15] \n\
+	xorl 4*"n2"(%esp), %esi # ^W[(n+2) & 15] \n\
+	xorl 4*"n"(%esp), %esi # ^W[n & 15] \n\
+	roll %esi # \n\
+	movl %esi, 4*"n"(%esp) # store to W[n & 15] \n\
+	movl "c", %edi # c \n\
+	xorl "d", %edi # ^d \n\
+	xorl "b", %edi # ^b \n\
+	leal "RCONST"("e",%esi), "e" # e += RCONST + mixed_W \n\
+	addl %edi, "e" # e += (c ^ d ^ b) \n\
+	movl "a", %esi # \n\
+	roll $5, %esi # rotl32(a,5) \n\
+	addl %esi, "e" # e += rotl32(a,5) \n\
+	rorl $2, "b" # b = rotl32(b,30) \n\
+"
+#define RD4Bs(a,b,c,d,e, n13,n8,n2,n, RCONST) \
+"\n\
+	movl 4*"n13"(%esp), %esi # W[(n+13) & 15] \n\
+	xorl 4*"n8"(%esp), %esi # ^W[(n+8) & 15] \n\
+	xorl 4*"n2"(%esp), %esi # ^W[(n+2) & 15] \n\
+	xorl 4*"n"(%esp), %esi # ^W[n & 15] \n\
+	roll %esi # \n\
+	##movl %esi, 4*"n"(%esp) # store to W[n & 15] elided \n\
+	movl "c", %edi # c \n\
+	xorl "d", %edi # ^d \n\
+	xorl "b", %edi # ^b \n\
+	leal "RCONST"("e",%esi), "e" # e += RCONST + mixed_W \n\
+	addl %edi, "e" # e += (c ^ d ^ b) \n\
+	movl "a", %esi # \n\
+	roll $5, %esi # rotl32(a,5) \n\
+	addl %esi, "e" # e += rotl32(a,5) \n\
+	rorl $2, "b" # b = rotl32(b,30) \n\
+"
+#define RD4A(a,b,c,d,e, n) RD4As("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR(((60+n+13)&15)), STR(((60+n+8)&15)), STR(((60+n+2)&15)), STR(((60+n)&15)), STR(RCONST))
+#define RD4B(a,b,c,d,e, n) RD4Bs("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR(((60+n+13)&15)), STR(((60+n+8)&15)), STR(((60+n+2)&15)), STR(((60+n)&15)), STR(RCONST))
+#undef RCONST
+#define RCONST 0xCA62C1D6
+	RD4A(ax,bx,cx,dx,bp, 0) RD4A(bp,ax,bx,cx,dx, 1) RD4A(dx,bp,ax,bx,cx, 2) RD4A(cx,dx,bp,ax,bx, 3) RD4A(bx,cx,dx,bp,ax, 4)
+	RD4A(ax,bx,cx,dx,bp, 5) RD4A(bp,ax,bx,cx,dx, 6) RD4A(dx,bp,ax,bx,cx, 7) RD4A(cx,dx,bp,ax,bx, 8) RD4A(bx,cx,dx,bp,ax, 9)
+	RD4A(ax,bx,cx,dx,bp,10) RD4A(bp,ax,bx,cx,dx,11) RD4A(dx,bp,ax,bx,cx,12) RD4A(cx,dx,bp,ax,bx,13) RD4A(bx,cx,dx,bp,ax,14)
+	RD4A(ax,bx,cx,dx,bp,15) RD4A(bp,ax,bx,cx,dx,16) RD4B(dx,bp,ax,bx,cx,17) RD4B(cx,dx,bp,ax,bx,18) RD4B(bx,cx,dx,bp,ax,19)
+
+"\n\
+	movl 4*16(%esp), %esi # \n\
+	addl $4*(16+1), %esp # \n\
+	addl %eax, 76(%esi) # ctx->hash[0] += a \n\
+	addl %ebx, 80(%esi) # ctx->hash[1] += b \n\
+	addl %ecx, 84(%esi) # ctx->hash[2] += c \n\
+	addl %edx, 88(%esi) # ctx->hash[3] += d \n\
+	addl %ebp, 92(%esi) # ctx->hash[4] += e \n\
+	popl %ebx # \n\
+	popl %esi # \n\
+	popl %edi # \n\
+	popl %ebp # \n\
+"
+	); /* asm */
+#undef RCONST
+}
+# elif defined(__GNUC__) && defined(__x86_64__)
+
+/* in hash_md5_sha_x86-64.S */
+struct ASM_expects_80 { char t[1 - 2*(offsetof(sha1_ctx_t, hash) != 80)]; };
+void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM);
+
+# else
+/* Fast, fully-unrolled SHA1. +3800 bytes of code on x86.
+ * It seems further speedup can be achieved by handling more than
+ * 64 bytes per one function call (coreutils does that).
+ */
+static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
+{
+	static const uint32_t rconsts[] ALIGN4 = {
+		0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
+	};
+	uint32_t W[16];
+	uint32_t a, b, c, d, e;
+
+	a = ctx->hash[0];
+	b = ctx->hash[1];
+	c = ctx->hash[2];
+	d = ctx->hash[3];
+	e = ctx->hash[4];
+
+	/* From kernel source comments:
+	 * """
+	 * If you have 32 registers or more, the compiler can (and should)
+	 * try to change the array[] accesses into registers. However, on
+	 * machines with less than ~25 registers, that won't really work,
+	 * and at least gcc will make an unholy mess of it.
+	 *
+	 * So to avoid that mess which just slows things down, we force
+	 * the stores to memory to actually happen (we might be better off
+	 * with a 'W(t)=(val);asm("":"+m" (W(t))' there instead, as
+	 * suggested by Artur Skawina - that will also make gcc unable to
+	 * try to do the silly "optimize away loads" part because it won't
+	 * see what the value will be).
+	 * """
+	 */
+#if defined(__GNUC__) && defined(__i386__)
+# define DO_NOT_TRY_PROPAGATING(m) asm("":"+m"(m))
+#else
+# define DO_NOT_TRY_PROPAGATING(m) ((void)0)
+#endif
+
+#undef OP
+#define OP(A,B,C,D,E, n) \
+	do { \
+		uint32_t work = EXPR(B, C, D); \
+		if (n <= 15) \
+			work += W[n & 15] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[n]); \
+		if (n >= 16) \
+			work += W[n & 15] = rotl32(W[(n+13) & 15] ^ W[(n+8) & 15] ^ W[(n+2) & 15] ^ W[n & 15], 1); \
+		DO_NOT_TRY_PROPAGATING(W[n & 15]); \
+		E += work + rotl32(A, 5) + rconsts[n / 20]; \
+		B = rotl32(B, 30); \
+	} while (0)
+#define OP20(n) \
+	OP(a,b,c,d,e, (n+ 0)); OP(e,a,b,c,d, (n+ 1)); OP(d,e,a,b,c, (n+ 2)); OP(c,d,e,a,b, (n+ 3)); OP(b,c,d,e,a, (n+ 4)); \
+	OP(a,b,c,d,e, (n+ 5)); OP(e,a,b,c,d, (n+ 6)); OP(d,e,a,b,c, (n+ 7)); OP(c,d,e,a,b, (n+ 8)); OP(b,c,d,e,a, (n+ 9)); \
+	OP(a,b,c,d,e, (n+10)); OP(e,a,b,c,d, (n+11)); OP(d,e,a,b,c, (n+12)); OP(c,d,e,a,b, (n+13)); OP(b,c,d,e,a, (n+14)); \
+	OP(a,b,c,d,e, (n+15)); OP(e,a,b,c,d, (n+16)); OP(d,e,a,b,c, (n+17)); OP(c,d,e,a,b, (n+18)); OP(b,c,d,e,a, (n+19))
+
+	/* 4 rounds of 20 operations each */
+#define EXPR(b,c,d) (((c ^ d) & b) ^ d)
+	OP20(0);
+#undef EXPR
+#define EXPR(b,c,d) (c ^ d ^ b)
+	OP20(20);
+#undef EXPR
+#define EXPR(b,c,d) (((b | c) & d) | (b & c))
+	OP20(40);
+#undef EXPR
+#define EXPR(b,c,d) (c ^ d ^ b)
+	OP20(60);
+
+#undef EXPR
+#undef OP
+#undef OP20
+
+	ctx->hash[0] += a;
+	ctx->hash[1] += b;
+	ctx->hash[2] += c;
+	ctx->hash[3] += d;
+	ctx->hash[4] += e;
+}
+# endif
+#elif CONFIG_SHA1_SMALL == 1
+/* Middle-sized version, +300 bytes of code on x86. */
+static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
+{
+	static const uint32_t rconsts[] ALIGN4 = {
+		0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
+	};
+	int j;
+	int n;
+	uint32_t W[16+16];
+	uint32_t a, b, c, d, e;
+
+	a = ctx->hash[0];
+	b = ctx->hash[1];
+	c = ctx->hash[2];
+	d = ctx->hash[3];
+	e = ctx->hash[4];
+
+	/* 1st round of 20 operations */
+	n = 0;
+	do {
+		uint32_t work = ((c ^ d) & b) ^ d;
+		W[n] = W[n+16] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[n]);
+		work += W[n];
+		work += e + rotl32(a, 5) + rconsts[0];
+		/* Rotate by one for next time */
+		e = d;
+		d = c;
+		c = rotl32(b, 30);
+		b = a;
+		a = work;
+		n = (n + 1) & 15;
+	} while (n != 0);
+	do {
+		uint32_t work = ((c ^ d) & b) ^ d;
+		W[n] = W[n+16] = rotl32(W[n+13] ^ W[n+8] ^ W[n+2] ^ W[n], 1);
+		work += W[n];
+		work += e + rotl32(a, 5) + rconsts[0];
+		e = d;
+		d = c;
+		c = rotl32(b, 30);
+		b = a;
+		a = work;
+		n = (n + 1) /* & 15*/;
+	} while (n != 4);
+	/* 2nd round of 20 operations */
+	j = 19;
+	do {
+		uint32_t work = c ^ d ^ b;
+		W[n] = W[n+16] = rotl32(W[n+13] ^ W[n+8] ^ W[n+2] ^ W[n], 1);
+		work += W[n];
+		work += e + rotl32(a, 5) + rconsts[1];
+		e = d;
+		d = c;
+		c = rotl32(b, 30);
+		b = a;
+		a = work;
+		n = (n + 1) & 15;
+	} while (--j >= 0);
+	/* 3rd round */
+	j = 19;
+	do {
+		uint32_t work = ((b | c) & d) | (b & c);
+		W[n] = W[n+16] = rotl32(W[n+13] ^ W[n+8] ^ W[n+2] ^ W[n], 1);
+		work += W[n];
+		work += e + rotl32(a, 5) + rconsts[2];
+		e = d;
+		d = c;
+		c = rotl32(b, 30);
+		b = a;
+		a = work;
+		n = (n + 1) & 15;
+	} while (--j >= 0);
+	/* 4th round */
+	j = 19;
+	do {
+		uint32_t work = c ^ d ^ b;
+		W[n] = W[n+16] = rotl32(W[n+13] ^ W[n+8] ^ W[n+2] ^ W[n], 1);
+		work += W[n];
+		work += e + rotl32(a, 5) + rconsts[3];
+		e = d;
+		d = c;
+		c = rotl32(b, 30);
+		b = a;
+		a = work;
+		n = (n + 1) & 15;
+	} while (--j >= 0);
+
+	ctx->hash[0] += a;
+	ctx->hash[1] += b;
+	ctx->hash[2] += c;
+	ctx->hash[3] += d;
+	ctx->hash[4] += e;
+}
+#else
+/* Compact version, almost twice as slow as fully unrolled */
 static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
 {
 	static const uint32_t rconsts[] ALIGN4 = {
 		0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
 	};
 	int i, j;
-	int cnt;
+	int n;
 	uint32_t W[16+16];
 	uint32_t a, b, c, d, e;
 
 	/* On-stack work buffer frees up one register in the main loop
-	 * which otherwise will be needed to hold ctx pointer */
+	 * which otherwise will be needed to hold ctx pointer.
+	 *
+	 * The compiler is not smart enough to realize it, though. :(
+	 * If __attribute__((optimize("2"))) is added to the function,
+	 * only then gcc-9.3.1 spills "ctx" to stack and uses the freed
+	 * register (making code 6 bytes smaller, not just faster).
+	 */
 	for (i = 0; i < 16; i++)
 		W[i] = W[i+16] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[i]);
 
@@ -512,7 +907,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
 	e = ctx->hash[4];
 
 	/* 4 rounds of 20 operations each */
-	cnt = 0;
+	n = 0;
 	for (i = 0; i < 4; i++) {
 		j = 19;
 		do {
@@ -523,27 +918,24 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
 				work = (work & b) ^ d;
 			if (j <= 3)
 					goto ge16;
-			/* Used to do SWAP_BE32 here, but this
-			 * requires ctx (see comment above) */
-			work += W[cnt];
 		} else {
 			if (i == 2)
 					work = ((b | c) & d) | (b & c);
 			else /* i = 1 or 3 */
 					work ^= b;
  ge16:
-			W[cnt] = W[cnt+16] = rotl32(W[cnt+13] ^ W[cnt+8] ^ W[cnt+2] ^ W[cnt], 1);
-			work += W[cnt];
+			W[n] = W[n+16] = rotl32(W[n+13] ^ W[n+8] ^ W[n+2] ^ W[n], 1);
 		}
+		work += W[n];
 		work += e + rotl32(a, 5) + rconsts[i];
 
 		/* Rotate by one for next time */
 		e = d;
 		d = c;
-		c = /* b = */ rotl32(b, 30);
+		c = rotl32(b, 30);
 		b = a;
 		a = work;
-		cnt = (cnt + 1) & 15;
+		n = (n + 1) & 15;
 		} while (--j >= 0);
 	}
 
@@ -553,6 +945,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
 	ctx->hash[3] += d;
 	ctx->hash[4] += e;
 }
+#endif
 
 /* Constants for SHA512 from FIPS 180-2:4.2.3.
  * SHA256 constants from FIPS 180-2:4.2.2