1 files changed, 404 insertions, 11 deletions
diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c
index e0db8ce67..ee19c1cb7 100644
--- a/libbb/hash_md5_sha.c
+++ b/libbb/hash_md5_sha.c
@@ -8,6 +8,9 @@
 */
 #include "libbb.h"
+#define STR1(s) #s
+#define STR(s) STR1(s)
 #define NEED_SHA512 (ENABLE_SHA512SUM || ENABLE_USE_BB_CRYPT_SHA)
 /* gcc 4.2.1 optimizes rotr64 better with inline than with macro
@@ -390,7 +393,6 @@ static void FAST_FUNC md5_process_block64(md5_ctx_t *ctx)
        OP(FI, D, A, B, C, 11, 10, 0xbd3af235);
        OP(FI, C, D, A, B, 2, 15, 0x2ad7d2bb);
        OP(FI, B, C, D, A, 9, 21, 0xeb86d391);
-# undef OP
 # endif
        /* Add checksum to the starting values */
        ctx->hash[0] += A;
@@ -399,6 +401,7 @@ static void FAST_FUNC md5_process_block64(md5_ctx_t *ctx)
        ctx->hash[3] += D;
 #endif
 }
+#undef OP
 #undef FF
 #undef FG
 #undef FH
@@ -490,18 +493,410 @@ unsigned FAST_FUNC md5_end(md5_ctx_t *ctx, void *resbuf)
 * then rebuild and compare "shaNNNsum bigfile" results.
 */
+#if CONFIG_SHA1_SMALL == 0
+# if defined(__GNUC__) && defined(__i386__)
+static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM) /* i386 + GNU C variant: the whole block transform is the single asm statement below */
+{
+        BUILD_BUG_ON(offsetof(sha1_ctx_t, hash) != 76); /* the asm hard-codes 76..92(%eax) as ctx->hash[0..4] */
+        asm( /* NOTE(review): no operands or clobbers are declared and ctx is read from %eax - presumably FAST_FUNC passes the first argument there; confirm */
+"\n\
+        pushl   %ebp    #                                           \n\
+        pushl   %edi    #                                           \n\
+        pushl   %esi    #                                           \n\
+        pushl   %ebx    #                                           \n\
+        pushl   %eax                                                \n\
+        movl    $15, %edi                                           \n\
+1:                                                                  \n\
+        movl    (%eax,%edi,4), %esi                                 \n\
+        bswap   %esi                                                \n\
+        pushl   %esi                                                \n\
+        decl    %edi                                                \n\
+        jns     1b                                                  \n\
+        movl    80(%eax), %ebx  # b = ctx->hash[1]                  \n\
+        movl    84(%eax), %ecx  # c = ctx->hash[2]                  \n\
+        movl    88(%eax), %edx  # d = ctx->hash[3]                  \n\
+        movl    92(%eax), %ebp  # e = ctx->hash[4]                  \n\
+        movl    76(%eax), %eax  # a = ctx->hash[0]                  \n\
+#Register and stack use:                                            \n\
+# eax..edx: a..d                                                    \n\
+# ebp: e                                                            \n\
+# esi,edi: temps                                                    \n\
+# 4*n(%esp): W[n]                                                   \n\
+"
+#define RD1As(a,b,c,d,e, n, RCONST) /* operation 0 only: %esi still holds W[0] from the last pass of the byte-swap loop, so the load stays commented out */ \
+"\n\
+        ##movl  4*"n"(%esp), %esi       # n=0, W[0] already in %esi \n\
+        movl    "c", %edi               # c                         \n\
+        xorl    "d", %edi               # ^d                        \n\
+        andl    "b", %edi               # &b                        \n\
+        xorl    "d", %edi               # (((c ^ d) & b) ^ d)       \n\
+        leal    "RCONST"("e",%esi), "e" # e += RCONST + W[n]        \n\
+        addl    %edi, "e"               # e += (((c ^ d) & b) ^ d)  \n\
+        movl    "a", %esi               #                           \n\
+        roll    $5, %esi                # rotl32(a,5)               \n\
+        addl    %esi, "e"               # e += rotl32(a,5)          \n\
+        rorl    $2, "b"                 # b = rotl32(b,30)          \n\
+"
+#define RD1Bs(a,b,c,d,e, n, RCONST) /* operations 1..15: W[n] is loaded from the byte-swapped copy of the block pushed on the stack */ \
+"\n\
+        movl    4*"n"(%esp), %esi       # W[n]                      \n\
+        movl    "c", %edi               # c                         \n\
+        xorl    "d", %edi               # ^d                        \n\
+        andl    "b", %edi               # &b                        \n\
+        xorl    "d", %edi               # (((c ^ d) & b) ^ d)       \n\
+        leal    "RCONST"("e",%esi), "e" # e += RCONST + W[n]        \n\
+        addl    %edi, "e"               # e += (((c ^ d) & b) ^ d)  \n\
+        movl    "a", %esi               #                           \n\
+        roll    $5, %esi                # rotl32(a,5)               \n\
+        addl    %esi, "e"               # e += rotl32(a,5)          \n\
+        rorl    $2, "b"                 # b = rotl32(b,30)          \n\
+"
+#define RD1Cs(a,b,c,d,e, n13,n8,n2,n, RCONST) /* operations 16..19: W[n & 15] is rebuilt as the xor of four W slots rotated left by one bit, then stored back */ \
+"\n\
+        movl    4*"n13"(%esp), %esi     # W[(n+13) & 15]            \n\
+        xorl    4*"n8"(%esp), %esi      # ^W[(n+8) & 15]            \n\
+        xorl    4*"n2"(%esp), %esi      # ^W[(n+2) & 15]            \n\
+        xorl    4*"n"(%esp), %esi       # ^W[n & 15]                \n\
+        roll    %esi                    #                           \n\
+        movl    %esi, 4*"n"(%esp)       # store to W[n & 15]        \n\
+        movl    "c", %edi               # c                         \n\
+        xorl    "d", %edi               # ^d                        \n\
+        andl    "b", %edi               # &b                        \n\
+        xorl    "d", %edi               # (((c ^ d) & b) ^ d)       \n\
+        leal    "RCONST"("e",%esi), "e" # e += RCONST + mixed_W     \n\
+        addl    %edi, "e"               # e += (((c ^ d) & b) ^ d)  \n\
+        movl    "a", %esi               #                           \n\
+        roll    $5, %esi                # rotl32(a,5)               \n\
+        addl    %esi, "e"               # e += rotl32(a,5)          \n\
+        rorl    $2, "b"                 # b = rotl32(b,30)          \n\
+"
+#define RD1A(a,b,c,d,e, n) RD1As("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR((n)), STR(RCONST)) /* wrappers: glue "%e" onto each register suffix, stringify n and the current RCONST */
+#define RD1B(a,b,c,d,e, n) RD1Bs("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR((n)), STR(RCONST))
+#define RD1C(a,b,c,d,e, n) RD1Cs("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR(((n+13)&15)), STR(((n+8)&15)), STR(((n+2)&15)), STR(((n)&15)), STR(RCONST))
+#undef  RCONST
+#define RCONST 0x5A827999 /* additive constant for operations 0..19 */
+        RD1A(ax,bx,cx,dx,bp, 0) RD1B(bp,ax,bx,cx,dx, 1) RD1B(dx,bp,ax,bx,cx, 2) RD1B(cx,dx,bp,ax,bx, 3) RD1B(bx,cx,dx,bp,ax, 4) /* the a..e register roles shift by one position per operation */
+        RD1B(ax,bx,cx,dx,bp, 5) RD1B(bp,ax,bx,cx,dx, 6) RD1B(dx,bp,ax,bx,cx, 7) RD1B(cx,dx,bp,ax,bx, 8) RD1B(bx,cx,dx,bp,ax, 9)
+        RD1B(ax,bx,cx,dx,bp,10) RD1B(bp,ax,bx,cx,dx,11) RD1B(dx,bp,ax,bx,cx,12) RD1B(cx,dx,bp,ax,bx,13) RD1B(bx,cx,dx,bp,ax,14)
+        RD1B(ax,bx,cx,dx,bp,15) RD1C(bp,ax,bx,cx,dx,16) RD1C(dx,bp,ax,bx,cx,17) RD1C(cx,dx,bp,ax,bx,18) RD1C(bx,cx,dx,bp,ax,19)
+#define RD2s(a,b,c,d,e, n13,n8,n2,n, RCONST) /* operations 20..39: same W update as RD1Cs, mixing function is c ^ d ^ b */ \
+"\n\
+        movl    4*"n13"(%esp), %esi     # W[(n+13) & 15]            \n\
+        xorl    4*"n8"(%esp), %esi      # ^W[(n+8) & 15]            \n\
+        xorl    4*"n2"(%esp), %esi      # ^W[(n+2) & 15]            \n\
+        xorl    4*"n"(%esp), %esi       # ^W[n & 15]                \n\
+        roll    %esi                    #                           \n\
+        movl    %esi, 4*"n"(%esp)       # store to W[n & 15]        \n\
+        movl    "c", %edi               # c                         \n\
+        xorl    "d", %edi               # ^d                        \n\
+        xorl    "b", %edi               # ^b                        \n\
+        leal    "RCONST"("e",%esi), "e" # e += RCONST + mixed_W     \n\
+        addl    %edi, "e"               # e += (c ^ d ^ b)          \n\
+        movl    "a", %esi               #                           \n\
+        roll    $5, %esi                # rotl32(a,5)               \n\
+        addl    %esi, "e"               # e += rotl32(a,5)          \n\
+        rorl    $2, "b"                 # b = rotl32(b,30)          \n\
+"
+#define RD2(a,b,c,d,e, n) RD2s("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR(((20+n+13)&15)), STR(((20+n+8)&15)), STR(((20+n+2)&15)), STR(((20+n)&15)), STR(RCONST)) /* n is relative to the round start; 20 is added before masking */
+#undef  RCONST
+#define RCONST 0x6ED9EBA1 /* additive constant for operations 20..39 */
+        RD2(ax,bx,cx,dx,bp, 0) RD2(bp,ax,bx,cx,dx, 1) RD2(dx,bp,ax,bx,cx, 2) RD2(cx,dx,bp,ax,bx, 3) RD2(bx,cx,dx,bp,ax, 4)
+        RD2(ax,bx,cx,dx,bp, 5) RD2(bp,ax,bx,cx,dx, 6) RD2(dx,bp,ax,bx,cx, 7) RD2(cx,dx,bp,ax,bx, 8) RD2(bx,cx,dx,bp,ax, 9)
+        RD2(ax,bx,cx,dx,bp,10) RD2(bp,ax,bx,cx,dx,11) RD2(dx,bp,ax,bx,cx,12) RD2(cx,dx,bp,ax,bx,13) RD2(bx,cx,dx,bp,ax,14)
+        RD2(ax,bx,cx,dx,bp,15) RD2(bp,ax,bx,cx,dx,16) RD2(dx,bp,ax,bx,cx,17) RD2(cx,dx,bp,ax,bx,18) RD2(bx,cx,dx,bp,ax,19)
+#define RD3s(a,b,c,d,e, n13,n8,n2,n, RCONST) /* operations 40..59: ((b | c) & d) | (b & c) is built first in %edi, using %esi as scratch before %esi is reused for W */ \
+"\n\
+        movl    "b", %edi               # di: b                     \n\
+        movl    "b", %esi               # si: b                     \n\
+        orl     "c", %edi               # di: b | c                 \n\
+        andl    "c", %esi               # si: b & c                 \n\
+        andl    "d", %edi               # di: (b | c) & d           \n\
+        orl     %esi, %edi              # ((b | c) & d) | (b & c)   \n\
+        movl    4*"n13"(%esp), %esi     # W[(n+13) & 15]            \n\
+        xorl    4*"n8"(%esp), %esi      # ^W[(n+8) & 15]            \n\
+        xorl    4*"n2"(%esp), %esi      # ^W[(n+2) & 15]            \n\
+        xorl    4*"n"(%esp), %esi       # ^W[n & 15]                \n\
+        roll    %esi                    #                           \n\
+        movl    %esi, 4*"n"(%esp)       # store to W[n & 15]        \n\
+        addl    %edi, "e"               # += ((b | c) & d) | (b & c)\n\
+        leal    "RCONST"("e",%esi), "e" # e += RCONST + mixed_W     \n\
+        movl    "a", %esi               #                           \n\
+        roll    $5, %esi                # rotl32(a,5)               \n\
+        addl    %esi, "e"               # e += rotl32(a,5)          \n\
+        rorl    $2, "b"                 # b = rotl32(b,30)          \n\
+"
+#define RD3(a,b,c,d,e, n) RD3s("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR(((40+n+13)&15)), STR(((40+n+8)&15)), STR(((40+n+2)&15)), STR(((40+n)&15)), STR(RCONST))
+#undef  RCONST
+#define RCONST 0x8F1BBCDC /* additive constant for operations 40..59 */
+        RD3(ax,bx,cx,dx,bp, 0) RD3(bp,ax,bx,cx,dx, 1) RD3(dx,bp,ax,bx,cx, 2) RD3(cx,dx,bp,ax,bx, 3) RD3(bx,cx,dx,bp,ax, 4)
+        RD3(ax,bx,cx,dx,bp, 5) RD3(bp,ax,bx,cx,dx, 6) RD3(dx,bp,ax,bx,cx, 7) RD3(cx,dx,bp,ax,bx, 8) RD3(bx,cx,dx,bp,ax, 9)
+        RD3(ax,bx,cx,dx,bp,10) RD3(bp,ax,bx,cx,dx,11) RD3(dx,bp,ax,bx,cx,12) RD3(cx,dx,bp,ax,bx,13) RD3(bx,cx,dx,bp,ax,14)
+        RD3(ax,bx,cx,dx,bp,15) RD3(bp,ax,bx,cx,dx,16) RD3(dx,bp,ax,bx,cx,17) RD3(cx,dx,bp,ax,bx,18) RD3(bx,cx,dx,bp,ax,19)
+#define RD4As(a,b,c,d,e, n13,n8,n2,n, RCONST) /* operations 60..76: instruction sequence is the same as RD2s */ \
+"\n\
+        movl    4*"n13"(%esp), %esi     # W[(n+13) & 15]            \n\
+        xorl    4*"n8"(%esp), %esi      # ^W[(n+8) & 15]            \n\
+        xorl    4*"n2"(%esp), %esi      # ^W[(n+2) & 15]            \n\
+        xorl    4*"n"(%esp), %esi       # ^W[n & 15]                \n\
+        roll    %esi                    #                           \n\
+        movl    %esi, 4*"n"(%esp)       # store to W[n & 15]        \n\
+        movl    "c", %edi               # c                         \n\
+        xorl    "d", %edi               # ^d                        \n\
+        xorl    "b", %edi               # ^b                        \n\
+        leal    "RCONST"("e",%esi), "e" # e += RCONST + mixed_W     \n\
+        addl    %edi, "e"               # e += (c ^ d ^ b)          \n\
+        movl    "a", %esi               #                           \n\
+        roll    $5, %esi                # rotl32(a,5)               \n\
+        addl    %esi, "e"               # e += rotl32(a,5)          \n\
+        rorl    $2, "b"                 # b = rotl32(b,30)          \n\
+"
+#define RD4Bs(a,b,c,d,e, n13,n8,n2,n, RCONST) /* operations 77..79: as RD4As, but the store to W[] is commented out - no later operation reads slots 13..15 again */ \
+"\n\
+        movl    4*"n13"(%esp), %esi     # W[(n+13) & 15]            \n\
+        xorl    4*"n8"(%esp), %esi      # ^W[(n+8) & 15]            \n\
+        xorl    4*"n2"(%esp), %esi      # ^W[(n+2) & 15]            \n\
+        xorl    4*"n"(%esp), %esi       # ^W[n & 15]                \n\
+        roll    %esi                    #                           \n\
+        ##movl  %esi, 4*"n"(%esp)       # store to W[n & 15] elided \n\
+        movl    "c", %edi               # c                         \n\
+        xorl    "d", %edi               # ^d                        \n\
+        xorl    "b", %edi               # ^b                        \n\
+        leal    "RCONST"("e",%esi), "e" # e += RCONST + mixed_W     \n\
+        addl    %edi, "e"               # e += (c ^ d ^ b)          \n\
+        movl    "a", %esi               #                           \n\
+        roll    $5, %esi                # rotl32(a,5)               \n\
+        addl    %esi, "e"               # e += rotl32(a,5)          \n\
+        rorl    $2, "b"                 # b = rotl32(b,30)          \n\
+"
+#define RD4A(a,b,c,d,e, n) RD4As("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR(((60+n+13)&15)), STR(((60+n+8)&15)), STR(((60+n+2)&15)), STR(((60+n)&15)), STR(RCONST))
+#define RD4B(a,b,c,d,e, n) RD4Bs("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR(((60+n+13)&15)), STR(((60+n+8)&15)), STR(((60+n+2)&15)), STR(((60+n)&15)), STR(RCONST))
+#undef  RCONST
+#define RCONST 0xCA62C1D6 /* additive constant for operations 60..79 */
+        RD4A(ax,bx,cx,dx,bp, 0) RD4A(bp,ax,bx,cx,dx, 1) RD4A(dx,bp,ax,bx,cx, 2) RD4A(cx,dx,bp,ax,bx, 3) RD4A(bx,cx,dx,bp,ax, 4)
+        RD4A(ax,bx,cx,dx,bp, 5) RD4A(bp,ax,bx,cx,dx, 6) RD4A(dx,bp,ax,bx,cx, 7) RD4A(cx,dx,bp,ax,bx, 8) RD4A(bx,cx,dx,bp,ax, 9)
+        RD4A(ax,bx,cx,dx,bp,10) RD4A(bp,ax,bx,cx,dx,11) RD4A(dx,bp,ax,bx,cx,12) RD4A(cx,dx,bp,ax,bx,13) RD4A(bx,cx,dx,bp,ax,14)
+        RD4A(ax,bx,cx,dx,bp,15) RD4A(bp,ax,bx,cx,dx,16) RD4B(dx,bp,ax,bx,cx,17) RD4B(cx,dx,bp,ax,bx,18) RD4B(bx,cx,dx,bp,ax,19)
+"\n\
+        movl    4*16(%esp), %esi        #                           \n\
+        addl    $4*(16+1), %esp         #                           \n\
+        addl    %eax, 76(%esi)          # ctx->hash[0] += a         \n\
+        addl    %ebx, 80(%esi)          # ctx->hash[1] += b         \n\
+        addl    %ecx, 84(%esi)          # ctx->hash[2] += c         \n\
+        addl    %edx, 88(%esi)          # ctx->hash[3] += d         \n\
+        addl    %ebp, 92(%esi)          # ctx->hash[4] += e         \n\
+        popl    %ebx                    #                           \n\
+        popl    %esi                    #                           \n\
+        popl    %edi                    #                           \n\
+        popl    %ebp                    #                           \n\
+"
+        ); /* asm: the epilogue reloads the saved ctx pointer from above W[0..15], drops those 17 words, then restores ebx/esi/edi/ebp */
+#undef RCONST /* NOTE(review): the RD1*..RD4* helper macros are not #undef'ed anywhere in the visible code */
+}
+# elif defined(__GNUC__) && defined(__x86_64__)
+/* Implemented in hash_md5_sha_x86-64.S; this file only carries the layout check and the prototype */
+struct ASM_expects_80 { char t[1 - 2*(offsetof(sha1_ctx_t, hash) != 80)]; }; /* build-time check: the array size becomes -1 (a compile error) unless hash sits at offset 80 */
+void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM);
+# else
+/* Fast, fully-unrolled SHA1 block step: mixes the 16 words of ctx->wbuffer into ctx->hash[0..4]. +3800 bytes of code on x86.
+ * It seems further speedup can be achieved by handling more than
+ * 64 bytes per one function call (coreutils does that).
+ */
+static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
+{
+        static const uint32_t rconsts[] ALIGN4 = { /* indexed by n / 20: one additive constant per round of 20 operations */
+                0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
+        };
+        uint32_t W[16]; /* 16-word circular buffer of W values; every access masks its index with & 15 */
+        uint32_t a, b, c, d, e; /* working copies of ctx->hash[0..4] */
+        a = ctx->hash[0];
+        b = ctx->hash[1];
+        c = ctx->hash[2];
+        d = ctx->hash[3];
+        e = ctx->hash[4];
+/* From kernel source comments:
+ * """
+ * If you have 32 registers or more, the compiler can (and should)
+ * try to change the array[] accesses into registers. However, on
+ * machines with less than ~25 registers, that won't really work,
+ * and at least gcc will make an unholy mess of it.
+ *
+ * So to avoid that mess which just slows things down, we force
+ * the stores to memory to actually happen (we might be better off
+ * with a 'W(t)=(val);asm("":"+m" (W(t))' there instead, as
+ * suggested by Artur Skawina - that will also make gcc unable to
+ * try to do the silly "optimize away loads" part because it won't
+ * see what the value will be).
+ * """
+ */
+#if defined(__GNUC__) && defined(__i386__) /* NOTE(review): this code sits in the "# else" arm of the same __GNUC__ && __i386__ test, so it looks like only the #else definition below can ever be selected - confirm */
+# define DO_NOT_TRY_PROPAGATING(m) asm("":"+m"(m)) /* empty asm with a read-write memory operand, as in the quote above */
+#else
+# define DO_NOT_TRY_PROPAGATING(m) ((void)0)
+#endif
+#undef OP
+#define OP(A,B,C,D,E, n) /* one SHA-1 operation; for any given n exactly one of the two ifs applies: load W from the input (n <= 15) or derive it from earlier W values (n >= 16) */ \
+        do { \
+                uint32_t work = EXPR(B, C, D); \
+                if (n <= 15) \
+                        work += W[n & 15] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[n]); \
+                if (n >= 16) \
+                        work += W[n & 15] = rotl32(W[(n+13) & 15] ^ W[(n+8) & 15] ^ W[(n+2) & 15] ^ W[n & 15], 1); \
+                DO_NOT_TRY_PROPAGATING(W[n & 15]); \
+                E += work + rotl32(A, 5) + rconsts[n / 20]; \
+                B = rotl32(B, 30); \
+        } while (0)
+#define OP20(n) /* 20 consecutive operations starting at n; the a..e roles shift by one position per operation instead of the values being moved */ \
+        OP(a,b,c,d,e, (n+ 0)); OP(e,a,b,c,d, (n+ 1)); OP(d,e,a,b,c, (n+ 2)); OP(c,d,e,a,b, (n+ 3)); OP(b,c,d,e,a, (n+ 4)); \
+        OP(a,b,c,d,e, (n+ 5)); OP(e,a,b,c,d, (n+ 6)); OP(d,e,a,b,c, (n+ 7)); OP(c,d,e,a,b, (n+ 8)); OP(b,c,d,e,a, (n+ 9)); \
+        OP(a,b,c,d,e, (n+10)); OP(e,a,b,c,d, (n+11)); OP(d,e,a,b,c, (n+12)); OP(c,d,e,a,b, (n+13)); OP(b,c,d,e,a, (n+14)); \
+        OP(a,b,c,d,e, (n+15)); OP(e,a,b,c,d, (n+16)); OP(d,e,a,b,c, (n+17)); OP(c,d,e,a,b, (n+18)); OP(b,c,d,e,a, (n+19))
+        /* 4 rounds of 20 operations each; EXPR is redefined before each round to select that round's mixing function */
+#define EXPR(b,c,d) (((c ^ d) & b) ^ d)
+        OP20(0);
+#undef EXPR
+#define EXPR(b,c,d) (c ^ d ^ b)
+        OP20(20);
+#undef EXPR
+#define EXPR(b,c,d) (((b | c) & d) | (b & c))
+        OP20(40);
+#undef EXPR
+#define EXPR(b,c,d) (c ^ d ^ b)
+        OP20(60);
+#undef EXPR
+#undef OP
+#undef OP20
+        ctx->hash[0] += a; /* add the working values back into the running hash */
+        ctx->hash[1] += b;
+        ctx->hash[2] += c;
+        ctx->hash[3] += d;
+        ctx->hash[4] += e;
+}
+# endif
+#elif CONFIG_SHA1_SMALL == 1
+/* Middle-sized version, +300 bytes of code on x86: the 80 operations run as five short loops (16 + 4 + 20 + 20 + 20). */
+static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
+{
+        static const uint32_t rconsts[] ALIGN4 = { /* one additive constant per round */
+                0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
+        };
+        int j; /* countdown for rounds 2..4: 19 down to 0 gives 20 iterations */
+        int n; /* index into W[], stays within 0..15 */
+        uint32_t W[16+16]; /* every value is written to both W[n] and W[n+16], so W[n+13] etc. can be read without "& 15" */
+        uint32_t a, b, c, d, e; /* working copies of ctx->hash[0..4] */
+        a = ctx->hash[0];
+        b = ctx->hash[1];
+        c = ctx->hash[2];
+        d = ctx->hash[3];
+        e = ctx->hash[4];
+        /* 1st round of 20 operations: the first 16 take W[n] from ctx->wbuffer, the last 4 derive it from earlier W values */
+        n = 0;
+        do {
+                uint32_t work = ((c ^ d) & b) ^ d;
+                W[n] = W[n+16] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[n]);
+                work += W[n];
+                work += e + rotl32(a, 5) + rconsts[0];
+                /* Rotate by one for next time */
+                e = d;
+                d = c;
+                c = rotl32(b, 30);
+                b = a;
+                a = work;
+                n = (n + 1) & 15;
+        } while (n != 0); /* 16 iterations: the loop ends when n wraps back to 0 */
+        do {
+                uint32_t work = ((c ^ d) & b) ^ d;
+                W[n] = W[n+16] = rotl32(W[n+13] ^ W[n+8] ^ W[n+2] ^ W[n], 1);
+                work += W[n];
+                work += e + rotl32(a, 5) + rconsts[0];
+                e = d;
+                d = c;
+                c = rotl32(b, 30);
+                b = a;
+                a = work;
+                n = (n + 1) /* & 15*/; /* the mask is not needed: n only runs 0..4 in this loop */
+        } while (n != 4); /* 4 iterations, completing the 1st round */
+        /* 2nd round of 20 operations; n continues from 4 and wraps via & 15 */
+        j = 19;
+        do {
+                uint32_t work = c ^ d ^ b;
+                W[n] = W[n+16] = rotl32(W[n+13] ^ W[n+8] ^ W[n+2] ^ W[n], 1);
+                work += W[n];
+                work += e + rotl32(a, 5) + rconsts[1];
+                e = d;
+                d = c;
+                c = rotl32(b, 30);
+                b = a;
+                a = work;
+                n = (n + 1) & 15;
+        } while (--j >= 0);
+        /* 3rd round of 20 operations */
+        j = 19;
+        do {
+                uint32_t work = ((b | c) & d) | (b & c);
+                W[n] = W[n+16] = rotl32(W[n+13] ^ W[n+8] ^ W[n+2] ^ W[n], 1);
+                work += W[n];
+                work += e + rotl32(a, 5) + rconsts[2];
+                e = d;
+                d = c;
+                c = rotl32(b, 30);
+                b = a;
+                a = work;
+                n = (n + 1) & 15;
+        } while (--j >= 0);
+        /* 4th round of 20 operations; same mixing function as the 2nd round, different constant */
+        j = 19;
+        do {
+                uint32_t work = c ^ d ^ b;
+                W[n] = W[n+16] = rotl32(W[n+13] ^ W[n+8] ^ W[n+2] ^ W[n], 1);
+                work += W[n];
+                work += e + rotl32(a, 5) + rconsts[3];
+                e = d;
+                d = c;
+                c = rotl32(b, 30);
+                b = a;
+                a = work;
+                n = (n + 1) & 15;
+        } while (--j >= 0);
+        ctx->hash[0] += a; /* add the working values back into the running hash */
+        ctx->hash[1] += b;
+        ctx->hash[2] += c;
+        ctx->hash[3] += d;
+        ctx->hash[4] += e;
+}
+#else
+/* Compact version, almost twice as slow as fully unrolled */
 static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
 {
        static const uint32_t rconsts[] ALIGN4 = {
                0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
        };
        int i, j;
-        int cnt;
+        int n;
        uint32_t W[16+16];
        uint32_t a, b, c, d, e;
        /* On-stack work buffer frees up one register in the main loop
-         * which otherwise will be needed to hold ctx pointer */
+         * which otherwise will be needed to hold ctx pointer.
+         *
+         * The compiler is not smart enough to realize it, though. :(
+         * If __attribute__((optimize("2"))) is added to the function,
+         * only then gcc-9.3.1 spills "ctx" to stack and uses the freed
+         * register (making code 6 bytes smaller, not just faster).
+         */
        for (i = 0; i < 16; i++)
                W[i] = W[i+16] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[i]);
@@ -512,7 +907,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
        e = ctx->hash[4];
        /* 4 rounds of 20 operations each */
-        cnt = 0;
+        n = 0;
        for (i = 0; i < 4; i++) {
                j = 19;
                do {
@@ -523,27 +918,24 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
                                work = (work & b) ^ d;
                                if (j <= 3)
                                        goto ge16;
-                                /* Used to do SWAP_BE32 here, but this
-                                 * requires ctx (see comment above) */
-                                work += W[cnt];
                        } else {
                                if (i == 2)
                                        work = ((b | c) & d) | (b & c);
                                else /* i = 1 or 3 */
                                        work ^= b;
 ge16:
-                                W[cnt] = W[cnt+16] = rotl32(W[cnt+13] ^ W[cnt+8] ^ W[cnt+2] ^ W[cnt], 1);
+                                W[n] = W[n+16] = rotl32(W[n+13] ^ W[n+8] ^ W[n+2] ^ W[n], 1);
-                                work += W[cnt];
                        }
+                        work += W[n];
                        work += e + rotl32(a, 5) + rconsts[i];
                        /* Rotate by one for next time */
                        e = d;
                        d = c;
-                        c = /* b = */ rotl32(b, 30);
+                        c = rotl32(b, 30);
                        b = a;
                        a = work;
-                        cnt = (cnt + 1) & 15;
+                        n = (n + 1) & 15;
                } while (--j >= 0);
        }
@@ -553,6 +945,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
        ctx->hash[3] += d;
        ctx->hash[4] += e;
 }
+#endif
 /* Constants for SHA512 from FIPS 180-2:4.2.3.
 * SHA256 constants from FIPS 180-2:4.2.2

diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c index e0db8ce67..ee19c1cb7 100644 --- a/libbb/hash_md5_sha.c +++ b/libbb/hash_md5_sha.c
@@ -8,6 +8,9 @@
8	*/	8	*/
9	#include "libbb.h"	9	#include "libbb.h"
10		10
		11	#define STR1(s) #s
		12	#define STR(s) STR1(s)
		13
11	#define NEED_SHA512 (ENABLE_SHA512SUM || ENABLE_USE_BB_CRYPT_SHA)	14	#define NEED_SHA512 (ENABLE_SHA512SUM || ENABLE_USE_BB_CRYPT_SHA)
12		15
13	/* gcc 4.2.1 optimizes rotr64 better with inline than with macro	16	/* gcc 4.2.1 optimizes rotr64 better with inline than with macro
@@ -390,7 +393,6 @@ static void FAST_FUNC md5_process_block64(md5_ctx_t *ctx)
390	OP(FI, D, A, B, C, 11, 10, 0xbd3af235);	393	OP(FI, D, A, B, C, 11, 10, 0xbd3af235);
391	OP(FI, C, D, A, B, 2, 15, 0x2ad7d2bb);	394	OP(FI, C, D, A, B, 2, 15, 0x2ad7d2bb);
392	OP(FI, B, C, D, A, 9, 21, 0xeb86d391);	395	OP(FI, B, C, D, A, 9, 21, 0xeb86d391);
393	# undef OP
394	# endif	396	# endif
395	/* Add checksum to the starting values */	397	/* Add checksum to the starting values */
396	ctx->hash[0] += A;	398	ctx->hash[0] += A;
@@ -399,6 +401,7 @@ static void FAST_FUNC md5_process_block64(md5_ctx_t *ctx)
399	ctx->hash[3] += D;	401	ctx->hash[3] += D;
400	#endif	402	#endif
401	}	403	}
		404	#undef OP
402	#undef FF	405	#undef FF
403	#undef FG	406	#undef FG
404	#undef FH	407	#undef FH
@@ -490,18 +493,410 @@ unsigned FAST_FUNC md5_end(md5_ctx_t *ctx, void *resbuf)
490	* then rebuild and compare "shaNNNsum bigfile" results.	493	* then rebuild and compare "shaNNNsum bigfile" results.
491	*/	494	*/
492		495
		496	#if CONFIG_SHA1_SMALL == 0
		497	# if defined(__GNUC__) && defined(__i386__)
		498	static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM)
		499	{
		500	BUILD_BUG_ON(offsetof(sha1_ctx_t, hash) != 76);
		501	asm(
		502	"\n\
		503	pushl %ebp # \n\
		504	pushl %edi # \n\
		505	pushl %esi # \n\
		506	pushl %ebx # \n\
		507	pushl %eax \n\
		508	movl $15, %edi \n\
		509	1: \n\
		510	movl (%eax,%edi,4), %esi \n\
		511	bswap %esi \n\
		512	pushl %esi \n\
		513	decl %edi \n\
		514	jns 1b \n\
		515	movl 80(%eax), %ebx # b = ctx->hash[1] \n\
		516	movl 84(%eax), %ecx # c = ctx->hash[2] \n\
		517	movl 88(%eax), %edx # d = ctx->hash[3] \n\
		518	movl 92(%eax), %ebp # e = ctx->hash[4] \n\
		519	movl 76(%eax), %eax # a = ctx->hash[0] \n\
		520	#Register and stack use: \n\
		521	# eax..edx: a..d \n\
		522	# ebp: e \n\
		523	# esi,edi: temps \n\
		524	# 4*n(%esp): W[n] \n\
		525	"
		526	#define RD1As(a,b,c,d,e, n, RCONST) \
		527	"\n\
		528	##movl 4*"n"(%esp), %esi # n=0, W[0] already in %esi \n\
		529	movl "c", %edi # c \n\
		530	xorl "d", %edi # ^d \n\
		531	andl "b", %edi # &b \n\
		532	xorl "d", %edi # (((c ^ d) & b) ^ d) \n\
		533	leal "RCONST"("e",%esi), "e" # e += RCONST + W[n] \n\
		534	addl %edi, "e" # e += (((c ^ d) & b) ^ d) \n\
		535	movl "a", %esi # \n\
		536	roll $5, %esi # rotl32(a,5) \n\
		537	addl %esi, "e" # e += rotl32(a,5) \n\
		538	rorl $2, "b" # b = rotl32(b,30) \n\
		539	"
		540	#define RD1Bs(a,b,c,d,e, n, RCONST) \
		541	"\n\
		542	movl 4*"n"(%esp), %esi # W[n] \n\
		543	movl "c", %edi # c \n\
		544	xorl "d", %edi # ^d \n\
		545	andl "b", %edi # &b \n\
		546	xorl "d", %edi # (((c ^ d) & b) ^ d) \n\
		547	leal "RCONST"("e",%esi), "e" # e += RCONST + W[n] \n\
		548	addl %edi, "e" # e += (((c ^ d) & b) ^ d) \n\
		549	movl "a", %esi # \n\
		550	roll $5, %esi # rotl32(a,5) \n\
		551	addl %esi, "e" # e += rotl32(a,5) \n\
		552	rorl $2, "b" # b = rotl32(b,30) \n\
		553	"
		554	#define RD1Cs(a,b,c,d,e, n13,n8,n2,n, RCONST) \
		555	"\n\
		556	movl 4*"n13"(%esp), %esi # W[(n+13) & 15] \n\
		557	xorl 4*"n8"(%esp), %esi # ^W[(n+8) & 15] \n\
		558	xorl 4*"n2"(%esp), %esi # ^W[(n+2) & 15] \n\
		559	xorl 4*"n"(%esp), %esi # ^W[n & 15] \n\
		560	roll %esi # \n\
		561	movl %esi, 4*"n"(%esp) # store to W[n & 15] \n\
		562	movl "c", %edi # c \n\
		563	xorl "d", %edi # ^d \n\
		564	andl "b", %edi # &b \n\
		565	xorl "d", %edi # (((c ^ d) & b) ^ d) \n\
		566	leal "RCONST"("e",%esi), "e" # e += RCONST + mixed_W \n\
		567	addl %edi, "e" # e += (((c ^ d) & b) ^ d) \n\
		568	movl "a", %esi # \n\
		569	roll $5, %esi # rotl32(a,5) \n\
		570	addl %esi, "e" # e += rotl32(a,5) \n\
		571	rorl $2, "b" # b = rotl32(b,30) \n\
		572	"
		573	#define RD1A(a,b,c,d,e, n) RD1As("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR((n)), STR(RCONST))
		574	#define RD1B(a,b,c,d,e, n) RD1Bs("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR((n)), STR(RCONST))
		575	#define RD1C(a,b,c,d,e, n) RD1Cs("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR(((n+13)&15)), STR(((n+8)&15)), STR(((n+2)&15)), STR(((n)&15)), STR(RCONST))
		576	#undef RCONST
		577	#define RCONST 0x5A827999
		578	RD1A(ax,bx,cx,dx,bp, 0) RD1B(bp,ax,bx,cx,dx, 1) RD1B(dx,bp,ax,bx,cx, 2) RD1B(cx,dx,bp,ax,bx, 3) RD1B(bx,cx,dx,bp,ax, 4)
		579	RD1B(ax,bx,cx,dx,bp, 5) RD1B(bp,ax,bx,cx,dx, 6) RD1B(dx,bp,ax,bx,cx, 7) RD1B(cx,dx,bp,ax,bx, 8) RD1B(bx,cx,dx,bp,ax, 9)
		580	RD1B(ax,bx,cx,dx,bp,10) RD1B(bp,ax,bx,cx,dx,11) RD1B(dx,bp,ax,bx,cx,12) RD1B(cx,dx,bp,ax,bx,13) RD1B(bx,cx,dx,bp,ax,14)
		581	RD1B(ax,bx,cx,dx,bp,15) RD1C(bp,ax,bx,cx,dx,16) RD1C(dx,bp,ax,bx,cx,17) RD1C(cx,dx,bp,ax,bx,18) RD1C(bx,cx,dx,bp,ax,19)
		582	#define RD2s(a,b,c,d,e, n13,n8,n2,n, RCONST) \
		583	"\n\
		584	movl 4*"n13"(%esp), %esi # W[(n+13) & 15] \n\
		585	xorl 4*"n8"(%esp), %esi # ^W[(n+8) & 15] \n\
		586	xorl 4*"n2"(%esp), %esi # ^W[(n+2) & 15] \n\
		587	xorl 4*"n"(%esp), %esi # ^W[n & 15] \n\
		588	roll %esi # \n\
		589	movl %esi, 4*"n"(%esp) # store to W[n & 15] \n\
		590	movl "c", %edi # c \n\
		591	xorl "d", %edi # ^d \n\
		592	xorl "b", %edi # ^b \n\
		593	leal "RCONST"("e",%esi), "e" # e += RCONST + mixed_W \n\
		594	addl %edi, "e" # e += (c ^ d ^ b) \n\
		595	movl "a", %esi # \n\
		596	roll $5, %esi # rotl32(a,5) \n\
		597	addl %esi, "e" # e += rotl32(a,5) \n\
		598	rorl $2, "b" # b = rotl32(b,30) \n\
		599	"
		600	#define RD2(a,b,c,d,e, n) RD2s("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR(((20+n+13)&15)), STR(((20+n+8)&15)), STR(((20+n+2)&15)), STR(((20+n)&15)), STR(RCONST))
		601	#undef RCONST
		602	#define RCONST 0x6ED9EBA1
		603	RD2(ax,bx,cx,dx,bp, 0) RD2(bp,ax,bx,cx,dx, 1) RD2(dx,bp,ax,bx,cx, 2) RD2(cx,dx,bp,ax,bx, 3) RD2(bx,cx,dx,bp,ax, 4)
		604	RD2(ax,bx,cx,dx,bp, 5) RD2(bp,ax,bx,cx,dx, 6) RD2(dx,bp,ax,bx,cx, 7) RD2(cx,dx,bp,ax,bx, 8) RD2(bx,cx,dx,bp,ax, 9)
		605	RD2(ax,bx,cx,dx,bp,10) RD2(bp,ax,bx,cx,dx,11) RD2(dx,bp,ax,bx,cx,12) RD2(cx,dx,bp,ax,bx,13) RD2(bx,cx,dx,bp,ax,14)
		606	RD2(ax,bx,cx,dx,bp,15) RD2(bp,ax,bx,cx,dx,16) RD2(dx,bp,ax,bx,cx,17) RD2(cx,dx,bp,ax,bx,18) RD2(bx,cx,dx,bp,ax,19)
		607
		608	#define RD3s(a,b,c,d,e, n13,n8,n2,n, RCONST) \
		609	"\n\
		610	movl "b", %edi # di: b \n\
		611	movl "b", %esi # si: b \n\
		612	orl "c", %edi # di: b | c \n\
		613	andl "c", %esi # si: b & c \n\
		614	andl "d", %edi # di: (b | c) & d \n\
		615	orl %esi, %edi # ((b | c) & d) | (b & c) \n\
		616	movl 4*"n13"(%esp), %esi # W[(n+13) & 15] \n\
		617	xorl 4*"n8"(%esp), %esi # ^W[(n+8) & 15] \n\
		618	xorl 4*"n2"(%esp), %esi # ^W[(n+2) & 15] \n\
		619	xorl 4*"n"(%esp), %esi # ^W[n & 15] \n\
		620	roll %esi # \n\
		621	movl %esi, 4*"n"(%esp) # store to W[n & 15] \n\
		622	addl %edi, "e" # += ((b \| c) & d) \| (b & c)\n\
		623	leal "RCONST"("e",%esi), "e" # e += RCONST + mixed_W \n\
		624	movl "a", %esi # \n\
		625	roll $5, %esi # rotl32(a,5) \n\
		626	addl %esi, "e" # e += rotl32(a,5) \n\
		627	rorl $2, "b" # b = rotl32(b,30) \n\
		628	"
		629	#define RD3(a,b,c,d,e, n) RD3s("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR(((40+n+13)&15)), STR(((40+n+8)&15)), STR(((40+n+2)&15)), STR(((40+n)&15)), STR(RCONST))
		630	#undef RCONST
		631	#define RCONST 0x8F1BBCDC
		632	RD3(ax,bx,cx,dx,bp, 0) RD3(bp,ax,bx,cx,dx, 1) RD3(dx,bp,ax,bx,cx, 2) RD3(cx,dx,bp,ax,bx, 3) RD3(bx,cx,dx,bp,ax, 4)
		633	RD3(ax,bx,cx,dx,bp, 5) RD3(bp,ax,bx,cx,dx, 6) RD3(dx,bp,ax,bx,cx, 7) RD3(cx,dx,bp,ax,bx, 8) RD3(bx,cx,dx,bp,ax, 9)
		634	RD3(ax,bx,cx,dx,bp,10) RD3(bp,ax,bx,cx,dx,11) RD3(dx,bp,ax,bx,cx,12) RD3(cx,dx,bp,ax,bx,13) RD3(bx,cx,dx,bp,ax,14)
		635	RD3(ax,bx,cx,dx,bp,15) RD3(bp,ax,bx,cx,dx,16) RD3(dx,bp,ax,bx,cx,17) RD3(cx,dx,bp,ax,bx,18) RD3(bx,cx,dx,bp,ax,19)
		636
		637	#define RD4As(a,b,c,d,e, n13,n8,n2,n, RCONST) \
		638	"\n\
		639	movl 4*"n13"(%esp), %esi # W[(n+13) & 15] \n\
		640	xorl 4*"n8"(%esp), %esi # ^W[(n+8) & 15] \n\
		641	xorl 4*"n2"(%esp), %esi # ^W[(n+2) & 15] \n\
		642	xorl 4*"n"(%esp), %esi # ^W[n & 15] \n\
		643	roll %esi # \n\
		644	movl %esi, 4*"n"(%esp) # store to W[n & 15] \n\
		645	movl "c", %edi # c \n\
		646	xorl "d", %edi # ^d \n\
		647	xorl "b", %edi # ^b \n\
		648	leal "RCONST"("e",%esi), "e" # e += RCONST + mixed_W \n\
		649	addl %edi, "e" # e += (c ^ d ^ b) \n\
		650	movl "a", %esi # \n\
		651	roll $5, %esi # rotl32(a,5) \n\
		652	addl %esi, "e" # e += rotl32(a,5) \n\
		653	rorl $2, "b" # b = rotl32(b,30) \n\
		654	"
		655	#define RD4Bs(a,b,c,d,e, n13,n8,n2,n, RCONST) \
		656	"\n\
		657	movl 4*"n13"(%esp), %esi # W[(n+13) & 15] \n\
		658	xorl 4*"n8"(%esp), %esi # ^W[(n+8) & 15] \n\
		659	xorl 4*"n2"(%esp), %esi # ^W[(n+2) & 15] \n\
		660	xorl 4*"n"(%esp), %esi # ^W[n & 15] \n\
		661	roll %esi # \n\
		662	##movl %esi, 4*"n"(%esp) # store to W[n & 15] elided \n\
		663	movl "c", %edi # c \n\
		664	xorl "d", %edi # ^d \n\
		665	xorl "b", %edi # ^b \n\
		666	leal "RCONST"("e",%esi), "e" # e += RCONST + mixed_W \n\
		667	addl %edi, "e" # e += (c ^ d ^ b) \n\
		668	movl "a", %esi # \n\
		669	roll $5, %esi # rotl32(a,5) \n\
		670	addl %esi, "e" # e += rotl32(a,5) \n\
		671	rorl $2, "b" # b = rotl32(b,30) \n\
		672	"
		673	#define RD4A(a,b,c,d,e, n) RD4As("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR(((60+n+13)&15)), STR(((60+n+8)&15)), STR(((60+n+2)&15)), STR(((60+n)&15)), STR(RCONST))
		674	#define RD4B(a,b,c,d,e, n) RD4Bs("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR(((60+n+13)&15)), STR(((60+n+8)&15)), STR(((60+n+2)&15)), STR(((60+n)&15)), STR(RCONST))
		675	#undef RCONST
		676	#define RCONST 0xCA62C1D6
		677	RD4A(ax,bx,cx,dx,bp, 0) RD4A(bp,ax,bx,cx,dx, 1) RD4A(dx,bp,ax,bx,cx, 2) RD4A(cx,dx,bp,ax,bx, 3) RD4A(bx,cx,dx,bp,ax, 4)
		678	RD4A(ax,bx,cx,dx,bp, 5) RD4A(bp,ax,bx,cx,dx, 6) RD4A(dx,bp,ax,bx,cx, 7) RD4A(cx,dx,bp,ax,bx, 8) RD4A(bx,cx,dx,bp,ax, 9)
		679	RD4A(ax,bx,cx,dx,bp,10) RD4A(bp,ax,bx,cx,dx,11) RD4A(dx,bp,ax,bx,cx,12) RD4A(cx,dx,bp,ax,bx,13) RD4A(bx,cx,dx,bp,ax,14)
		680	RD4A(ax,bx,cx,dx,bp,15) RD4A(bp,ax,bx,cx,dx,16) RD4B(dx,bp,ax,bx,cx,17) RD4B(cx,dx,bp,ax,bx,18) RD4B(bx,cx,dx,bp,ax,19)
		681
		682	"\n\
		683	movl 4*16(%esp), %esi # \n\
		684	addl $4*(16+1), %esp # \n\
		685	addl %eax, 76(%esi) # ctx->hash[0] += a \n\
		686	addl %ebx, 80(%esi) # ctx->hash[1] += b \n\
		687	addl %ecx, 84(%esi) # ctx->hash[2] += c \n\
		688	addl %edx, 88(%esi) # ctx->hash[3] += d \n\
		689	addl %ebp, 92(%esi) # ctx->hash[4] += e \n\
		690	popl %ebx # \n\
		691	popl %esi # \n\
		692	popl %edi # \n\
		693	popl %ebp # \n\
		694	"
		695	); /* asm */
		696	#undef RCONST
		697	}
		698	# elif defined(__GNUC__) && defined(__x86_64__)
		699
		700	/* in hash_md5_sha_x86-64.S */
		701	struct ASM_expects_80 { char t[1 - 2*(offsetof(sha1_ctx_t, hash) != 80)]; };
		702	void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM);
		703
		704	# else
		705	/* Fast, fully-unrolled SHA1. +3800 bytes of code on x86.
		706	* It seems further speedup can be achieved by handling more than
		707	* 64 bytes per one function call (coreutils does that).
		708	*/
		709	static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
		710	{
		711	static const uint32_t rconsts[] ALIGN4 = {
		712	0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
		713	};
		714	uint32_t W[16];
		715	uint32_t a, b, c, d, e;
		716
		717	a = ctx->hash[0];
		718	b = ctx->hash[1];
		719	c = ctx->hash[2];
		720	d = ctx->hash[3];
		721	e = ctx->hash[4];
		722
		723	/* From kernel source comments:
		724	* """
		725	* If you have 32 registers or more, the compiler can (and should)
		726	* try to change the array[] accesses into registers. However, on
		727	* machines with less than ~25 registers, that won't really work,
		728	* and at least gcc will make an unholy mess of it.
		729	*
		730	* So to avoid that mess which just slows things down, we force
		731	* the stores to memory to actually happen (we might be better off
		732	* with a 'W(t)=(val);asm("":"+m" (W(t))' there instead, as
		733	* suggested by Artur Skawina - that will also make gcc unable to
		734	* try to do the silly "optimize away loads" part because it won't
		735	* see what the value will be).
		736	* """
		737	*/
		738	#if defined(__GNUC__) && defined(__i386__)
		739	# define DO_NOT_TRY_PROPAGATING(m) asm("":"+m"(m))
		740	#else
		741	# define DO_NOT_TRY_PROPAGATING(m) ((void)0)
		742	#endif
		743
		744	#undef OP
		745	#define OP(A,B,C,D,E, n) \
		746	do { \
		747	uint32_t work = EXPR(B, C, D); \
		748	if (n <= 15) \
		749	work += W[n & 15] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[n]); \
		750	if (n >= 16) \
		751	work += W[n & 15] = rotl32(W[(n+13) & 15] ^ W[(n+8) & 15] ^ W[(n+2) & 15] ^ W[n & 15], 1); \
		752	DO_NOT_TRY_PROPAGATING(W[n & 15]); \
		753	E += work + rotl32(A, 5) + rconsts[n / 20]; \
		754	B = rotl32(B, 30); \
		755	} while (0)
		756	#define OP20(n) \
		757	OP(a,b,c,d,e, (n+ 0)); OP(e,a,b,c,d, (n+ 1)); OP(d,e,a,b,c, (n+ 2)); OP(c,d,e,a,b, (n+ 3)); OP(b,c,d,e,a, (n+ 4)); \
		758	OP(a,b,c,d,e, (n+ 5)); OP(e,a,b,c,d, (n+ 6)); OP(d,e,a,b,c, (n+ 7)); OP(c,d,e,a,b, (n+ 8)); OP(b,c,d,e,a, (n+ 9)); \
		759	OP(a,b,c,d,e, (n+10)); OP(e,a,b,c,d, (n+11)); OP(d,e,a,b,c, (n+12)); OP(c,d,e,a,b, (n+13)); OP(b,c,d,e,a, (n+14)); \
		760	OP(a,b,c,d,e, (n+15)); OP(e,a,b,c,d, (n+16)); OP(d,e,a,b,c, (n+17)); OP(c,d,e,a,b, (n+18)); OP(b,c,d,e,a, (n+19))
		761
		762	/* 4 rounds of 20 operations each */
		763	#define EXPR(b,c,d) (((c ^ d) & b) ^ d)
		764	OP20(0);
		765	#undef EXPR
		766	#define EXPR(b,c,d) (c ^ d ^ b)
		767	OP20(20);
		768	#undef EXPR
		769	#define EXPR(b,c,d) (((b \| c) & d) \| (b & c))
		770	OP20(40);
		771	#undef EXPR
		772	#define EXPR(b,c,d) (c ^ d ^ b)
		773	OP20(60);
		774
		775	#undef EXPR
		776	#undef OP
		777	#undef OP20
		778
		779	ctx->hash[0] += a;
		780	ctx->hash[1] += b;
		781	ctx->hash[2] += c;
		782	ctx->hash[3] += d;
		783	ctx->hash[4] += e;
		784	}
		785	# endif
		786	#elif CONFIG_SHA1_SMALL == 1
		787	/* Middle-sized version, +300 bytes of code on x86. */
		788	static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
		789	{
		790	static const uint32_t rconsts[] ALIGN4 = {
		791	0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
		792	};
		793	int j;
		794	int n;
		795	uint32_t W[16+16];
		796	uint32_t a, b, c, d, e;
		797
		798	a = ctx->hash[0];
		799	b = ctx->hash[1];
		800	c = ctx->hash[2];
		801	d = ctx->hash[3];
		802	e = ctx->hash[4];
		803
		804	/* 1st round of 20 operations */
		805	n = 0;
		806	do {
		807	uint32_t work = ((c ^ d) & b) ^ d;
		808	W[n] = W[n+16] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[n]);
		809	work += W[n];
		810	work += e + rotl32(a, 5) + rconsts[0];
		811	/* Rotate by one for next time */
		812	e = d;
		813	d = c;
		814	c = rotl32(b, 30);
		815	b = a;
		816	a = work;
		817	n = (n + 1) & 15;
		818	} while (n != 0);
		819	do {
		820	uint32_t work = ((c ^ d) & b) ^ d;
		821	W[n] = W[n+16] = rotl32(W[n+13] ^ W[n+8] ^ W[n+2] ^ W[n], 1);
		822	work += W[n];
		823	work += e + rotl32(a, 5) + rconsts[0];
		824	e = d;
		825	d = c;
		826	c = rotl32(b, 30);
		827	b = a;
		828	a = work;
		829	n = (n + 1) /* & 15*/;
		830	} while (n != 4);
		831	/* 2nd round of 20 operations */
		832	j = 19;
		833	do {
		834	uint32_t work = c ^ d ^ b;
		835	W[n] = W[n+16] = rotl32(W[n+13] ^ W[n+8] ^ W[n+2] ^ W[n], 1);
		836	work += W[n];
		837	work += e + rotl32(a, 5) + rconsts[1];
		838	e = d;
		839	d = c;
		840	c = rotl32(b, 30);
		841	b = a;
		842	a = work;
		843	n = (n + 1) & 15;
		844	} while (--j >= 0);
		845	/* 3rd round */
		846	j = 19;
		847	do {
		848	uint32_t work = ((b \| c) & d) \| (b & c);
		849	W[n] = W[n+16] = rotl32(W[n+13] ^ W[n+8] ^ W[n+2] ^ W[n], 1);
		850	work += W[n];
		851	work += e + rotl32(a, 5) + rconsts[2];
		852	e = d;
		853	d = c;
		854	c = rotl32(b, 30);
		855	b = a;
		856	a = work;
		857	n = (n + 1) & 15;
		858	} while (--j >= 0);
		859	/* 4th round */
		860	j = 19;
		861	do {
		862	uint32_t work = c ^ d ^ b;
		863	W[n] = W[n+16] = rotl32(W[n+13] ^ W[n+8] ^ W[n+2] ^ W[n], 1);
		864	work += W[n];
		865	work += e + rotl32(a, 5) + rconsts[3];
		866	e = d;
		867	d = c;
		868	c = rotl32(b, 30);
		869	b = a;
		870	a = work;
		871	n = (n + 1) & 15;
		872	} while (--j >= 0);
		873
		874	ctx->hash[0] += a;
		875	ctx->hash[1] += b;
		876	ctx->hash[2] += c;
		877	ctx->hash[3] += d;
		878	ctx->hash[4] += e;
		879	}
		880	#else
		881	/* Compact version, almost twice as slow as fully unrolled */
493	static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)	882	static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
494	{	883	{
495	static const uint32_t rconsts[] ALIGN4 = {	884	static const uint32_t rconsts[] ALIGN4 = {
496	0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6	885	0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
497	};	886	};
498	int i, j;	887	int i, j;
499	int cnt;	888	int n;
500	uint32_t W[16+16];	889	uint32_t W[16+16];
501	uint32_t a, b, c, d, e;	890	uint32_t a, b, c, d, e;
502		891
503	/* On-stack work buffer frees up one register in the main loop	892	/* On-stack work buffer frees up one register in the main loop
504	* which otherwise will be needed to hold ctx pointer */	893	* which otherwise will be needed to hold ctx pointer.
		894	*
		895	* The compiler is not smart enough to realize it, though. :(
		896	* If __attribute__((optimize("2"))) is added to the function,
		897	* only then gcc-9.3.1 spills "ctx" to stack and uses the freed
		898	* register (making code 6 bytes smaller, not just faster).
		899	*/
505	for (i = 0; i < 16; i++)	900	for (i = 0; i < 16; i++)
506	W[i] = W[i+16] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[i]);	901	W[i] = W[i+16] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[i]);
507		902
@@ -512,7 +907,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
512	e = ctx->hash[4];	907	e = ctx->hash[4];
513		908
514	/* 4 rounds of 20 operations each */	909	/* 4 rounds of 20 operations each */
515	cnt = 0;	910	n = 0;
516	for (i = 0; i < 4; i++) {	911	for (i = 0; i < 4; i++) {
517	j = 19;	912	j = 19;
518	do {	913	do {
@@ -523,27 +918,24 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
523	work = (work & b) ^ d;	918	work = (work & b) ^ d;
524	if (j <= 3)	919	if (j <= 3)
525	goto ge16;	920	goto ge16;
526	/* Used to do SWAP_BE32 here, but this
527	* requires ctx (see comment above) */
528	work += W[cnt];
529	} else {	921	} else {
530	if (i == 2)	922	if (i == 2)
531	work = ((b \| c) & d) \| (b & c);	923	work = ((b \| c) & d) \| (b & c);
532	else /* i = 1 or 3 */	924	else /* i = 1 or 3 */
533	work ^= b;	925	work ^= b;
534	ge16:	926	ge16:
535	W[cnt] = W[cnt+16] = rotl32(W[cnt+13] ^ W[cnt+8] ^ W[cnt+2] ^ W[cnt], 1);	927	W[n] = W[n+16] = rotl32(W[n+13] ^ W[n+8] ^ W[n+2] ^ W[n], 1);
536	work += W[cnt];
537	}	928	}
		929	work += W[n];
538	work += e + rotl32(a, 5) + rconsts[i];	930	work += e + rotl32(a, 5) + rconsts[i];
539		931
540	/* Rotate by one for next time */	932	/* Rotate by one for next time */
541	e = d;	933	e = d;
542	d = c;	934	d = c;
543	c = /* b = */ rotl32(b, 30);	935	c = rotl32(b, 30);
544	b = a;	936	b = a;
545	a = work;	937	a = work;
546	cnt = (cnt + 1) & 15;	938	n = (n + 1) & 15;
547	} while (--j >= 0);	939	} while (--j >= 0);
548	}	940	}
549		941
@@ -553,6 +945,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
553	ctx->hash[3] += d;	945	ctx->hash[3] += d;
554	ctx->hash[4] += e;	946	ctx->hash[4] += e;
555	}	947	}
		948	#endif
556		949
557	/* Constants for SHA512 from FIPS 180-2:4.2.3.	950	/* Constants for SHA512 from FIPS 180-2:4.2.3.
558	* SHA256 constants from FIPS 180-2:4.2.2	951	* SHA256 constants from FIPS 180-2:4.2.2