aboutsummaryrefslogtreecommitdiff
path: root/libbb/hash_md5_sha.c
diff options
context:
space:
mode:
Diffstat (limited to 'libbb/hash_md5_sha.c')
-rw-r--r--libbb/hash_md5_sha.c415
1 files changed, 404 insertions, 11 deletions
diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c
index e0db8ce67..ee19c1cb7 100644
--- a/libbb/hash_md5_sha.c
+++ b/libbb/hash_md5_sha.c
@@ -8,6 +8,9 @@
8 */ 8 */
9#include "libbb.h" 9#include "libbb.h"
10 10
11#define STR1(s) #s
12#define STR(s) STR1(s)
13
11#define NEED_SHA512 (ENABLE_SHA512SUM || ENABLE_USE_BB_CRYPT_SHA) 14#define NEED_SHA512 (ENABLE_SHA512SUM || ENABLE_USE_BB_CRYPT_SHA)
12 15
13/* gcc 4.2.1 optimizes rotr64 better with inline than with macro 16/* gcc 4.2.1 optimizes rotr64 better with inline than with macro
@@ -390,7 +393,6 @@ static void FAST_FUNC md5_process_block64(md5_ctx_t *ctx)
390 OP(FI, D, A, B, C, 11, 10, 0xbd3af235); 393 OP(FI, D, A, B, C, 11, 10, 0xbd3af235);
391 OP(FI, C, D, A, B, 2, 15, 0x2ad7d2bb); 394 OP(FI, C, D, A, B, 2, 15, 0x2ad7d2bb);
392 OP(FI, B, C, D, A, 9, 21, 0xeb86d391); 395 OP(FI, B, C, D, A, 9, 21, 0xeb86d391);
393# undef OP
394# endif 396# endif
395 /* Add checksum to the starting values */ 397 /* Add checksum to the starting values */
396 ctx->hash[0] += A; 398 ctx->hash[0] += A;
@@ -399,6 +401,7 @@ static void FAST_FUNC md5_process_block64(md5_ctx_t *ctx)
399 ctx->hash[3] += D; 401 ctx->hash[3] += D;
400#endif 402#endif
401} 403}
404#undef OP
402#undef FF 405#undef FF
403#undef FG 406#undef FG
404#undef FH 407#undef FH
@@ -490,18 +493,410 @@ unsigned FAST_FUNC md5_end(md5_ctx_t *ctx, void *resbuf)
490 * then rebuild and compare "shaNNNsum bigfile" results. 493 * then rebuild and compare "shaNNNsum bigfile" results.
491 */ 494 */
492 495
496#if CONFIG_SHA1_SMALL == 0
497# if defined(__GNUC__) && defined(__i386__)
/*
 * Hand-written i386 assembly version of the SHA1 block transform
 * (this branch is taken when CONFIG_SHA1_SMALL == 0 and the compiler
 * defines both __GNUC__ and __i386__).
 *
 * What the asm body does, in order:
 *  - pushes %ebp, %edi, %esi, %ebx and the incoming %eax;
 *  - reads the sixteen 32-bit words at offsets 0..60 from %eax, byte-swaps
 *    each one with bswap and pushes it, highest index first, so that
 *    W[n] ends up at 4*n(%esp);
 *  - loads the five hash words from offsets 76..92 into
 *    %eax, %ebx, %ecx, %edx, %ebp (a, b, c, d, e);
 *  - runs 80 fully unrolled steps, 20 per round constant;
 *  - reloads the saved pointer from 4*16(%esp), drops W[] and the saved
 *    pointer from the stack, and adds a..e into the five hash words;
 *  - pops %ebx, %esi, %edi, %ebp. %eax, %ecx and %edx are left modified.
 *
 * NOTE(review): this is an asm statement without operands or a clobber
 * list, and ctx is marked UNUSED_PARAM. The code therefore relies on ctx
 * already being in %eax when the asm starts - presumably guaranteed by the
 * calling convention that FAST_FUNC selects on i386; confirm against its
 * definition before changing compiler flags or the function signature.
 */
498static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM)
499{
/* The asm below hard-codes 76 as the byte offset of ctx->hash[0]; fail the build if the layout differs. */
500 BUILD_BUG_ON(offsetof(sha1_ctx_t, hash) != 76);
501 asm(
502"\n\
503 pushl %ebp # \n\
504 pushl %edi # \n\
505 pushl %esi # \n\
506 pushl %ebx # \n\
507 pushl %eax \n\
508 movl $15, %edi \n\
5091: \n\
510 movl (%eax,%edi,4), %esi \n\
511 bswap %esi \n\
512 pushl %esi \n\
513 decl %edi \n\
514 jns 1b \n\
515 movl 80(%eax), %ebx # b = ctx->hash[1] \n\
516 movl 84(%eax), %ecx # c = ctx->hash[2] \n\
517 movl 88(%eax), %edx # d = ctx->hash[3] \n\
518 movl 92(%eax), %ebp # e = ctx->hash[4] \n\
519 movl 76(%eax), %eax # a = ctx->hash[0] \n\
520#Register and stack use: \n\
521# eax..edx: a..d \n\
522# ebp: e \n\
523# esi,edi: temps \n\
524# 4*n(%esp): W[n] \n\
525"
/*
 * Step 0 only. Same as RD1Bs except that the load of W[n] is commented
 * out: the last iteration of the load loop above (index 0) leaves W[0]
 * in %esi.
 */
526#define RD1As(a,b,c,d,e, n, RCONST) \
527"\n\
528 ##movl 4*"n"(%esp), %esi # n=0, W[0] already in %esi \n\
529 movl "c", %edi # c \n\
530 xorl "d", %edi # ^d \n\
531 andl "b", %edi # &b \n\
532 xorl "d", %edi # (((c ^ d) & b) ^ d) \n\
533 leal "RCONST"("e",%esi), "e" # e += RCONST + W[n] \n\
534 addl %edi, "e" # e += (((c ^ d) & b) ^ d) \n\
535 movl "a", %esi # \n\
536 roll $5, %esi # rotl32(a,5) \n\
537 addl %esi, "e" # e += rotl32(a,5) \n\
538 rorl $2, "b" # b = rotl32(b,30) \n\
539"
/* Steps 1..15: W[n] is read unchanged from its stack slot. */
540#define RD1Bs(a,b,c,d,e, n, RCONST) \
541"\n\
542 movl 4*"n"(%esp), %esi # W[n] \n\
543 movl "c", %edi # c \n\
544 xorl "d", %edi # ^d \n\
545 andl "b", %edi # &b \n\
546 xorl "d", %edi # (((c ^ d) & b) ^ d) \n\
547 leal "RCONST"("e",%esi), "e" # e += RCONST + W[n] \n\
548 addl %edi, "e" # e += (((c ^ d) & b) ^ d) \n\
549 movl "a", %esi # \n\
550 roll $5, %esi # rotl32(a,5) \n\
551 addl %esi, "e" # e += rotl32(a,5) \n\
552 rorl $2, "b" # b = rotl32(b,30) \n\
553"
/*
 * Steps 16..19: still the first boolean function, but W is now produced
 * by xoring four earlier slots, rotating left by one, and storing the
 * result back into slot (n & 15).
 */
554#define RD1Cs(a,b,c,d,e, n13,n8,n2,n, RCONST) \
555"\n\
556 movl 4*"n13"(%esp), %esi # W[(n+13) & 15] \n\
557 xorl 4*"n8"(%esp), %esi # ^W[(n+8) & 15] \n\
558 xorl 4*"n2"(%esp), %esi # ^W[(n+2) & 15] \n\
559 xorl 4*"n"(%esp), %esi # ^W[n & 15] \n\
560 roll %esi # \n\
561 movl %esi, 4*"n"(%esp) # store to W[n & 15] \n\
562 movl "c", %edi # c \n\
563 xorl "d", %edi # ^d \n\
564 andl "b", %edi # &b \n\
565 xorl "d", %edi # (((c ^ d) & b) ^ d) \n\
566 leal "RCONST"("e",%esi), "e" # e += RCONST + mixed_W \n\
567 addl %edi, "e" # e += (((c ^ d) & b) ^ d) \n\
568 movl "a", %esi # \n\
569 roll $5, %esi # rotl32(a,5) \n\
570 addl %esi, "e" # e += rotl32(a,5) \n\
571 rorl $2, "b" # b = rotl32(b,30) \n\
572"
/*
 * Wrappers that turn register suffixes (ax, bx, ...) and the step number
 * into the string pieces pasted into the templates above. RCONST is
 * stringified at the point of use, so each group redefines it first.
 * The a..e roles rotate through the five registers from one step to the
 * next, so no register-to-register moves are emitted between steps.
 */
573#define RD1A(a,b,c,d,e, n) RD1As("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR((n)), STR(RCONST))
574#define RD1B(a,b,c,d,e, n) RD1Bs("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR((n)), STR(RCONST))
575#define RD1C(a,b,c,d,e, n) RD1Cs("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR(((n+13)&15)), STR(((n+8)&15)), STR(((n+2)&15)), STR(((n)&15)), STR(RCONST))
576#undef RCONST
577#define RCONST 0x5A827999
578 RD1A(ax,bx,cx,dx,bp, 0) RD1B(bp,ax,bx,cx,dx, 1) RD1B(dx,bp,ax,bx,cx, 2) RD1B(cx,dx,bp,ax,bx, 3) RD1B(bx,cx,dx,bp,ax, 4)
579 RD1B(ax,bx,cx,dx,bp, 5) RD1B(bp,ax,bx,cx,dx, 6) RD1B(dx,bp,ax,bx,cx, 7) RD1B(cx,dx,bp,ax,bx, 8) RD1B(bx,cx,dx,bp,ax, 9)
580 RD1B(ax,bx,cx,dx,bp,10) RD1B(bp,ax,bx,cx,dx,11) RD1B(dx,bp,ax,bx,cx,12) RD1B(cx,dx,bp,ax,bx,13) RD1B(bx,cx,dx,bp,ax,14)
581 RD1B(ax,bx,cx,dx,bp,15) RD1C(bp,ax,bx,cx,dx,16) RD1C(dx,bp,ax,bx,cx,17) RD1C(cx,dx,bp,ax,bx,18) RD1C(bx,cx,dx,bp,ax,19)
/* Second group of 20 steps: boolean function is c ^ d ^ b. RD2 adds 20 to the step number before masking. */
582#define RD2s(a,b,c,d,e, n13,n8,n2,n, RCONST) \
583"\n\
584 movl 4*"n13"(%esp), %esi # W[(n+13) & 15] \n\
585 xorl 4*"n8"(%esp), %esi # ^W[(n+8) & 15] \n\
586 xorl 4*"n2"(%esp), %esi # ^W[(n+2) & 15] \n\
587 xorl 4*"n"(%esp), %esi # ^W[n & 15] \n\
588 roll %esi # \n\
589 movl %esi, 4*"n"(%esp) # store to W[n & 15] \n\
590 movl "c", %edi # c \n\
591 xorl "d", %edi # ^d \n\
592 xorl "b", %edi # ^b \n\
593 leal "RCONST"("e",%esi), "e" # e += RCONST + mixed_W \n\
594 addl %edi, "e" # e += (c ^ d ^ b) \n\
595 movl "a", %esi # \n\
596 roll $5, %esi # rotl32(a,5) \n\
597 addl %esi, "e" # e += rotl32(a,5) \n\
598 rorl $2, "b" # b = rotl32(b,30) \n\
599"
600#define RD2(a,b,c,d,e, n) RD2s("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR(((20+n+13)&15)), STR(((20+n+8)&15)), STR(((20+n+2)&15)), STR(((20+n)&15)), STR(RCONST))
601#undef RCONST
602#define RCONST 0x6ED9EBA1
603 RD2(ax,bx,cx,dx,bp, 0) RD2(bp,ax,bx,cx,dx, 1) RD2(dx,bp,ax,bx,cx, 2) RD2(cx,dx,bp,ax,bx, 3) RD2(bx,cx,dx,bp,ax, 4)
604 RD2(ax,bx,cx,dx,bp, 5) RD2(bp,ax,bx,cx,dx, 6) RD2(dx,bp,ax,bx,cx, 7) RD2(cx,dx,bp,ax,bx, 8) RD2(bx,cx,dx,bp,ax, 9)
605 RD2(ax,bx,cx,dx,bp,10) RD2(bp,ax,bx,cx,dx,11) RD2(dx,bp,ax,bx,cx,12) RD2(cx,dx,bp,ax,bx,13) RD2(bx,cx,dx,bp,ax,14)
606 RD2(ax,bx,cx,dx,bp,15) RD2(bp,ax,bx,cx,dx,16) RD2(dx,bp,ax,bx,cx,17) RD2(cx,dx,bp,ax,bx,18) RD2(bx,cx,dx,bp,ax,19)
607
/*
 * Third group of 20 steps: boolean function is ((b | c) & d) | (b & c).
 * It needs both temporaries, so it is computed into %edi first and only
 * then is %esi reused for the message-schedule word.
 */
608#define RD3s(a,b,c,d,e, n13,n8,n2,n, RCONST) \
609"\n\
610 movl "b", %edi # di: b \n\
611 movl "b", %esi # si: b \n\
612 orl "c", %edi # di: b | c \n\
613 andl "c", %esi # si: b & c \n\
614 andl "d", %edi # di: (b | c) & d \n\
615 orl %esi, %edi # ((b | c) & d) | (b & c) \n\
616 movl 4*"n13"(%esp), %esi # W[(n+13) & 15] \n\
617 xorl 4*"n8"(%esp), %esi # ^W[(n+8) & 15] \n\
618 xorl 4*"n2"(%esp), %esi # ^W[(n+2) & 15] \n\
619 xorl 4*"n"(%esp), %esi # ^W[n & 15] \n\
620 roll %esi # \n\
621 movl %esi, 4*"n"(%esp) # store to W[n & 15] \n\
622 addl %edi, "e" # += ((b | c) & d) | (b & c)\n\
623 leal "RCONST"("e",%esi), "e" # e += RCONST + mixed_W \n\
624 movl "a", %esi # \n\
625 roll $5, %esi # rotl32(a,5) \n\
626 addl %esi, "e" # e += rotl32(a,5) \n\
627 rorl $2, "b" # b = rotl32(b,30) \n\
628"
629#define RD3(a,b,c,d,e, n) RD3s("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR(((40+n+13)&15)), STR(((40+n+8)&15)), STR(((40+n+2)&15)), STR(((40+n)&15)), STR(RCONST))
630#undef RCONST
631#define RCONST 0x8F1BBCDC
632 RD3(ax,bx,cx,dx,bp, 0) RD3(bp,ax,bx,cx,dx, 1) RD3(dx,bp,ax,bx,cx, 2) RD3(cx,dx,bp,ax,bx, 3) RD3(bx,cx,dx,bp,ax, 4)
633 RD3(ax,bx,cx,dx,bp, 5) RD3(bp,ax,bx,cx,dx, 6) RD3(dx,bp,ax,bx,cx, 7) RD3(cx,dx,bp,ax,bx, 8) RD3(bx,cx,dx,bp,ax, 9)
634 RD3(ax,bx,cx,dx,bp,10) RD3(bp,ax,bx,cx,dx,11) RD3(dx,bp,ax,bx,cx,12) RD3(cx,dx,bp,ax,bx,13) RD3(bx,cx,dx,bp,ax,14)
635 RD3(ax,bx,cx,dx,bp,15) RD3(bp,ax,bx,cx,dx,16) RD3(dx,bp,ax,bx,cx,17) RD3(cx,dx,bp,ax,bx,18) RD3(bx,cx,dx,bp,ax,19)
636
/* Fourth group, steps 60..76: same instruction sequence as the second group, with the fourth constant. */
637#define RD4As(a,b,c,d,e, n13,n8,n2,n, RCONST) \
638"\n\
639 movl 4*"n13"(%esp), %esi # W[(n+13) & 15] \n\
640 xorl 4*"n8"(%esp), %esi # ^W[(n+8) & 15] \n\
641 xorl 4*"n2"(%esp), %esi # ^W[(n+2) & 15] \n\
642 xorl 4*"n"(%esp), %esi # ^W[n & 15] \n\
643 roll %esi # \n\
644 movl %esi, 4*"n"(%esp) # store to W[n & 15] \n\
645 movl "c", %edi # c \n\
646 xorl "d", %edi # ^d \n\
647 xorl "b", %edi # ^b \n\
648 leal "RCONST"("e",%esi), "e" # e += RCONST + mixed_W \n\
649 addl %edi, "e" # e += (c ^ d ^ b) \n\
650 movl "a", %esi # \n\
651 roll $5, %esi # rotl32(a,5) \n\
652 addl %esi, "e" # e += rotl32(a,5) \n\
653 rorl $2, "b" # b = rotl32(b,30) \n\
654"
/*
 * Fourth group, steps 77..79: as RD4As but the store back to W[] is
 * commented out. The schedule reads slots (n+13), (n+8), (n+2) and n
 * (mod 16), so the values produced by the last three steps are never
 * read again before the function ends.
 */
655#define RD4Bs(a,b,c,d,e, n13,n8,n2,n, RCONST) \
656"\n\
657 movl 4*"n13"(%esp), %esi # W[(n+13) & 15] \n\
658 xorl 4*"n8"(%esp), %esi # ^W[(n+8) & 15] \n\
659 xorl 4*"n2"(%esp), %esi # ^W[(n+2) & 15] \n\
660 xorl 4*"n"(%esp), %esi # ^W[n & 15] \n\
661 roll %esi # \n\
662 ##movl %esi, 4*"n"(%esp) # store to W[n & 15] elided \n\
663 movl "c", %edi # c \n\
664 xorl "d", %edi # ^d \n\
665 xorl "b", %edi # ^b \n\
666 leal "RCONST"("e",%esi), "e" # e += RCONST + mixed_W \n\
667 addl %edi, "e" # e += (c ^ d ^ b) \n\
668 movl "a", %esi # \n\
669 roll $5, %esi # rotl32(a,5) \n\
670 addl %esi, "e" # e += rotl32(a,5) \n\
671 rorl $2, "b" # b = rotl32(b,30) \n\
672"
673#define RD4A(a,b,c,d,e, n) RD4As("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR(((60+n+13)&15)), STR(((60+n+8)&15)), STR(((60+n+2)&15)), STR(((60+n)&15)), STR(RCONST))
674#define RD4B(a,b,c,d,e, n) RD4Bs("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR(((60+n+13)&15)), STR(((60+n+8)&15)), STR(((60+n+2)&15)), STR(((60+n)&15)), STR(RCONST))
675#undef RCONST
676#define RCONST 0xCA62C1D6
677 RD4A(ax,bx,cx,dx,bp, 0) RD4A(bp,ax,bx,cx,dx, 1) RD4A(dx,bp,ax,bx,cx, 2) RD4A(cx,dx,bp,ax,bx, 3) RD4A(bx,cx,dx,bp,ax, 4)
678 RD4A(ax,bx,cx,dx,bp, 5) RD4A(bp,ax,bx,cx,dx, 6) RD4A(dx,bp,ax,bx,cx, 7) RD4A(cx,dx,bp,ax,bx, 8) RD4A(bx,cx,dx,bp,ax, 9)
679 RD4A(ax,bx,cx,dx,bp,10) RD4A(bp,ax,bx,cx,dx,11) RD4A(dx,bp,ax,bx,cx,12) RD4A(cx,dx,bp,ax,bx,13) RD4A(bx,cx,dx,bp,ax,14)
680 RD4A(ax,bx,cx,dx,bp,15) RD4A(bp,ax,bx,cx,dx,16) RD4B(dx,bp,ax,bx,cx,17) RD4B(cx,dx,bp,ax,bx,18) RD4B(bx,cx,dx,bp,ax,19)
681
/*
 * Epilogue: slot 16 above W[0..15] holds the pointer pushed at entry.
 * Reload it, release those 17 stack slots, fold a..e into the hash words
 * and restore the four saved registers in reverse push order.
 */
682"\n\
683 movl 4*16(%esp), %esi # \n\
684 addl $4*(16+1), %esp # \n\
685 addl %eax, 76(%esi) # ctx->hash[0] += a \n\
686 addl %ebx, 80(%esi) # ctx->hash[1] += b \n\
687 addl %ecx, 84(%esi) # ctx->hash[2] += c \n\
688 addl %edx, 88(%esi) # ctx->hash[3] += d \n\
689 addl %ebp, 92(%esi) # ctx->hash[4] += e \n\
690 popl %ebx # \n\
691 popl %esi # \n\
692 popl %edi # \n\
693 popl %ebp # \n\
694"
695 ); /* asm */
696#undef RCONST
697}
698# elif defined(__GNUC__) && defined(__x86_64__)
699
700/* in hash_md5_sha_x86-64.S */
/*
 * Compile-time layout check: if offsetof(sha1_ctx_t, hash) is not 80, the
 * array size below evaluates to -1 and the struct fails to compile.
 */
701struct ASM_expects_80 { char t[1 - 2*(offsetof(sha1_ctx_t, hash) != 80)]; };
/* Declaration only (not static): this branch provides no C body for the function. */
702void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM);
703
704# else
705/* Fast, fully-unrolled SHA1. +3800 bytes of code on x86.
706 * It seems further speedup can be achieved by handling more than
707 * 64 bytes per one function call (coreutils does that).
708 */
709static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
710{
711 static const uint32_t rconsts[] ALIGN4 = {
712 0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
713 };
714 uint32_t W[16];
715 uint32_t a, b, c, d, e;
716
717 a = ctx->hash[0];
718 b = ctx->hash[1];
719 c = ctx->hash[2];
720 d = ctx->hash[3];
721 e = ctx->hash[4];
722
723/* From kernel source comments:
724 * """
725 * If you have 32 registers or more, the compiler can (and should)
726 * try to change the array[] accesses into registers. However, on
727 * machines with less than ~25 registers, that won't really work,
728 * and at least gcc will make an unholy mess of it.
729 *
730 * So to avoid that mess which just slows things down, we force
731 * the stores to memory to actually happen (we might be better off
732 * with a 'W(t)=(val);asm("":"+m" (W(t))' there instead, as
733 * suggested by Artur Skawina - that will also make gcc unable to
734 * try to do the silly "optimize away loads" part because it won't
735 * see what the value will be).
736 * """
737 */
738#if defined(__GNUC__) && defined(__i386__)
739# define DO_NOT_TRY_PROPAGATING(m) asm("":"+m"(m))
740#else
741# define DO_NOT_TRY_PROPAGATING(m) ((void)0)
742#endif
743
744#undef OP
745#define OP(A,B,C,D,E, n) \
746 do { \
747 uint32_t work = EXPR(B, C, D); \
748 if (n <= 15) \
749 work += W[n & 15] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[n]); \
750 if (n >= 16) \
751 work += W[n & 15] = rotl32(W[(n+13) & 15] ^ W[(n+8) & 15] ^ W[(n+2) & 15] ^ W[n & 15], 1); \
752 DO_NOT_TRY_PROPAGATING(W[n & 15]); \
753 E += work + rotl32(A, 5) + rconsts[n / 20]; \
754 B = rotl32(B, 30); \
755 } while (0)
756#define OP20(n) \
757 OP(a,b,c,d,e, (n+ 0)); OP(e,a,b,c,d, (n+ 1)); OP(d,e,a,b,c, (n+ 2)); OP(c,d,e,a,b, (n+ 3)); OP(b,c,d,e,a, (n+ 4)); \
758 OP(a,b,c,d,e, (n+ 5)); OP(e,a,b,c,d, (n+ 6)); OP(d,e,a,b,c, (n+ 7)); OP(c,d,e,a,b, (n+ 8)); OP(b,c,d,e,a, (n+ 9)); \
759 OP(a,b,c,d,e, (n+10)); OP(e,a,b,c,d, (n+11)); OP(d,e,a,b,c, (n+12)); OP(c,d,e,a,b, (n+13)); OP(b,c,d,e,a, (n+14)); \
760 OP(a,b,c,d,e, (n+15)); OP(e,a,b,c,d, (n+16)); OP(d,e,a,b,c, (n+17)); OP(c,d,e,a,b, (n+18)); OP(b,c,d,e,a, (n+19))
761
762 /* 4 rounds of 20 operations each */
763#define EXPR(b,c,d) (((c ^ d) & b) ^ d)
764 OP20(0);
765#undef EXPR
766#define EXPR(b,c,d) (c ^ d ^ b)
767 OP20(20);
768#undef EXPR
769#define EXPR(b,c,d) (((b | c) & d) | (b & c))
770 OP20(40);
771#undef EXPR
772#define EXPR(b,c,d) (c ^ d ^ b)
773 OP20(60);
774
775#undef EXPR
776#undef OP
777#undef OP20
778
779 ctx->hash[0] += a;
780 ctx->hash[1] += b;
781 ctx->hash[2] += c;
782 ctx->hash[3] += d;
783 ctx->hash[4] += e;
784}
785# endif
786#elif CONFIG_SHA1_SMALL == 1
787/* Middle-sized version, +300 bytes of code on x86. */
788static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
789{
790 static const uint32_t rconsts[] ALIGN4 = {
791 0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
792 };
793 int j;
794 int n;
795 uint32_t W[16+16];
796 uint32_t a, b, c, d, e;
797
798 a = ctx->hash[0];
799 b = ctx->hash[1];
800 c = ctx->hash[2];
801 d = ctx->hash[3];
802 e = ctx->hash[4];
803
804 /* 1st round of 20 operations */
805 n = 0;
806 do {
807 uint32_t work = ((c ^ d) & b) ^ d;
808 W[n] = W[n+16] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[n]);
809 work += W[n];
810 work += e + rotl32(a, 5) + rconsts[0];
811 /* Rotate by one for next time */
812 e = d;
813 d = c;
814 c = rotl32(b, 30);
815 b = a;
816 a = work;
817 n = (n + 1) & 15;
818 } while (n != 0);
819 do {
820 uint32_t work = ((c ^ d) & b) ^ d;
821 W[n] = W[n+16] = rotl32(W[n+13] ^ W[n+8] ^ W[n+2] ^ W[n], 1);
822 work += W[n];
823 work += e + rotl32(a, 5) + rconsts[0];
824 e = d;
825 d = c;
826 c = rotl32(b, 30);
827 b = a;
828 a = work;
829 n = (n + 1) /* & 15*/;
830 } while (n != 4);
831 /* 2nd round of 20 operations */
832 j = 19;
833 do {
834 uint32_t work = c ^ d ^ b;
835 W[n] = W[n+16] = rotl32(W[n+13] ^ W[n+8] ^ W[n+2] ^ W[n], 1);
836 work += W[n];
837 work += e + rotl32(a, 5) + rconsts[1];
838 e = d;
839 d = c;
840 c = rotl32(b, 30);
841 b = a;
842 a = work;
843 n = (n + 1) & 15;
844 } while (--j >= 0);
845 /* 3rd round */
846 j = 19;
847 do {
848 uint32_t work = ((b | c) & d) | (b & c);
849 W[n] = W[n+16] = rotl32(W[n+13] ^ W[n+8] ^ W[n+2] ^ W[n], 1);
850 work += W[n];
851 work += e + rotl32(a, 5) + rconsts[2];
852 e = d;
853 d = c;
854 c = rotl32(b, 30);
855 b = a;
856 a = work;
857 n = (n + 1) & 15;
858 } while (--j >= 0);
859 /* 4th round */
860 j = 19;
861 do {
862 uint32_t work = c ^ d ^ b;
863 W[n] = W[n+16] = rotl32(W[n+13] ^ W[n+8] ^ W[n+2] ^ W[n], 1);
864 work += W[n];
865 work += e + rotl32(a, 5) + rconsts[3];
866 e = d;
867 d = c;
868 c = rotl32(b, 30);
869 b = a;
870 a = work;
871 n = (n + 1) & 15;
872 } while (--j >= 0);
873
874 ctx->hash[0] += a;
875 ctx->hash[1] += b;
876 ctx->hash[2] += c;
877 ctx->hash[3] += d;
878 ctx->hash[4] += e;
879}
880#else
881/* Compact version, almost twice as slow as fully unrolled */
493static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx) 882static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
494{ 883{
495 static const uint32_t rconsts[] ALIGN4 = { 884 static const uint32_t rconsts[] ALIGN4 = {
496 0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6 885 0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
497 }; 886 };
498 int i, j; 887 int i, j;
499 int cnt; 888 int n;
500 uint32_t W[16+16]; 889 uint32_t W[16+16];
501 uint32_t a, b, c, d, e; 890 uint32_t a, b, c, d, e;
502 891
503 /* On-stack work buffer frees up one register in the main loop 892 /* On-stack work buffer frees up one register in the main loop
504 * which otherwise will be needed to hold ctx pointer */ 893 * which otherwise will be needed to hold ctx pointer.
894 *
895 * The compiler is not smart enough to realize it, though. :(
896 * If __attribute__((optimize("2"))) is added to the function,
897 * only then gcc-9.3.1 spills "ctx" to stack and uses the freed
898 * register (making code 6 bytes smaller, not just faster).
899 */
505 for (i = 0; i < 16; i++) 900 for (i = 0; i < 16; i++)
506 W[i] = W[i+16] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[i]); 901 W[i] = W[i+16] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[i]);
507 902
@@ -512,7 +907,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
512 e = ctx->hash[4]; 907 e = ctx->hash[4];
513 908
514 /* 4 rounds of 20 operations each */ 909 /* 4 rounds of 20 operations each */
515 cnt = 0; 910 n = 0;
516 for (i = 0; i < 4; i++) { 911 for (i = 0; i < 4; i++) {
517 j = 19; 912 j = 19;
518 do { 913 do {
@@ -523,27 +918,24 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
523 work = (work & b) ^ d; 918 work = (work & b) ^ d;
524 if (j <= 3) 919 if (j <= 3)
525 goto ge16; 920 goto ge16;
526 /* Used to do SWAP_BE32 here, but this
527 * requires ctx (see comment above) */
528 work += W[cnt];
529 } else { 921 } else {
530 if (i == 2) 922 if (i == 2)
531 work = ((b | c) & d) | (b & c); 923 work = ((b | c) & d) | (b & c);
532 else /* i = 1 or 3 */ 924 else /* i = 1 or 3 */
533 work ^= b; 925 work ^= b;
534 ge16: 926 ge16:
535 W[cnt] = W[cnt+16] = rotl32(W[cnt+13] ^ W[cnt+8] ^ W[cnt+2] ^ W[cnt], 1); 927 W[n] = W[n+16] = rotl32(W[n+13] ^ W[n+8] ^ W[n+2] ^ W[n], 1);
536 work += W[cnt];
537 } 928 }
929 work += W[n];
538 work += e + rotl32(a, 5) + rconsts[i]; 930 work += e + rotl32(a, 5) + rconsts[i];
539 931
540 /* Rotate by one for next time */ 932 /* Rotate by one for next time */
541 e = d; 933 e = d;
542 d = c; 934 d = c;
543 c = /* b = */ rotl32(b, 30); 935 c = rotl32(b, 30);
544 b = a; 936 b = a;
545 a = work; 937 a = work;
546 cnt = (cnt + 1) & 15; 938 n = (n + 1) & 15;
547 } while (--j >= 0); 939 } while (--j >= 0);
548 } 940 }
549 941
@@ -553,6 +945,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
553 ctx->hash[3] += d; 945 ctx->hash[3] += d;
554 ctx->hash[4] += e; 946 ctx->hash[4] += e;
555} 947}
948#endif
556 949
557/* Constants for SHA512 from FIPS 180-2:4.2.3. 950/* Constants for SHA512 from FIPS 180-2:4.2.3.
558 * SHA256 constants from FIPS 180-2:4.2.2 951 * SHA256 constants from FIPS 180-2:4.2.2