From f1d06462e872270f38c497e36f8cd018ee7415bf Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 28 Dec 2021 09:05:12 +0100 Subject: libbb: code shrink in sha1 function old new delta sha1_process_block64 356 342 -14 Signed-off-by: Denys Vlasenko --- libbb/hash_md5_sha.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c index e0db8ce67..a468397e3 100644 --- a/libbb/hash_md5_sha.c +++ b/libbb/hash_md5_sha.c @@ -523,9 +523,6 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx) work = (work & b) ^ d; if (j <= 3) goto ge16; - /* Used to do SWAP_BE32 here, but this - * requires ctx (see comment above) */ - work += W[cnt]; } else { if (i == 2) work = ((b | c) & d) | (b & c); @@ -533,14 +530,14 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx) work ^= b; ge16: W[cnt] = W[cnt+16] = rotl32(W[cnt+13] ^ W[cnt+8] ^ W[cnt+2] ^ W[cnt], 1); - work += W[cnt]; } + work += W[cnt]; work += e + rotl32(a, 5) + rconsts[i]; /* Rotate by one for next time */ e = d; d = c; - c = /* b = */ rotl32(b, 30); + c = rotl32(b, 30); b = a; a = work; cnt = (cnt + 1) & 15; -- cgit v1.2.3-55-g6feb From 0fcc7f5f738e38766cde59ffd193643458c26cba Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 28 Dec 2021 21:05:59 +0100 Subject: scripts/echo.c: fix NUL handling in "abc\0 def" Signed-off-by: Denys Vlasenko --- scripts/echo.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/scripts/echo.c b/scripts/echo.c index 7474ccdd4..e3a07adf0 100644 --- a/scripts/echo.c +++ b/scripts/echo.c @@ -153,25 +153,32 @@ int main(int argc, char **argv) if (!eflag) { /* optimization for very common case */ fputs(arg, stdout); - } else while ((c = *arg++)) { - if (c == eflag) { /* Check for escape seq. 
*/ + } else + while ((c = *arg++) != '\0') { + if (c == eflag) { + /* This is an "\x" sequence */ + if (*arg == 'c') { - /* '\c' means cancel newline and + /* "\c" means cancel newline and * ignore all subsequent chars. */ goto ret; } - { - /* Since SUSv3 mandates a first digit of 0, 4-digit octals - * of the form \0### are accepted. */ - if (*arg == '0') { - /* NB: don't turn "...\0" into "...\" */ - if (arg[1] && ((unsigned char)(arg[1]) - '0') < 8) { - arg++; - } + /* Since SUSv3 mandates a first digit of 0, 4-digit octals + * of the form \0### are accepted. */ + if (*arg == '0') { + if ((unsigned char)(arg[1] - '0') < 8) { + /* 2nd char is 0..7: skip leading '0' */ + arg++; } - /* bb_process_escape_sequence handles NUL correctly - * ("...\" case. */ - c = bb_process_escape_sequence(&arg); + } + /* bb_process_escape_sequence handles NUL correctly + * ("...\" case). */ + { + /* optimization: don't force arg to be on-stack, + * use another variable for that. ~30 bytes win */ + const char *z = arg; + c = bb_process_escape_sequence(&z); + arg = z; } } putchar(c); -- cgit v1.2.3-55-g6feb From 0e2cb6d1e2553675bba2999829bbc29219aea987 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Wed, 29 Dec 2021 06:41:05 +0100 Subject: echo: add FIXME comment Signed-off-by: Denys Vlasenko --- coreutils/echo.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/coreutils/echo.c b/coreutils/echo.c index 82f0358b6..44b2eb5d0 100644 --- a/coreutils/echo.c +++ b/coreutils/echo.c @@ -321,6 +321,8 @@ int echo_main(int argc, char **argv) if (*arg == '0' && (unsigned char)(arg[1] - '0') < 8) { arg++; } +//FIXME? we also accept non-0 starting sequences (see echo-prints-slash_41 test) +// echo -ne '-\41-' prints "-!-". bash 5.0.17 does not (prints "-\41-"). 
/* bb_process_escape_sequence can handle nul correctly */ c = bb_process_escape_sequence( (void*) &arg); } -- cgit v1.2.3-55-g6feb From 9173c9cce48dc4c867fb06bb72e8c762740c5c86 Mon Sep 17 00:00:00 2001 From: Sören Tempel Date: Wed, 29 Dec 2021 16:15:50 +0100 Subject: ed: add support for -s command-line option as mandated by POSIX MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Apart from the -p option, POSIX also mandates an -s option which suppresses the output of byte counts for the e, E, r, and w commands. Of these commands, Busybox ed presently only implements the r and w commands. This commit ensures that these two commands do not output any byte counts when the -s option is passed. The shell escape command, also affected by the -s option, is not implemented by Busybox at the moment. function old new delta packed_usage 34096 34115 +19 doCommands 1887 1900 +13 readLines 388 397 +9 .rodata 104196 104200 +4 ------------------------------------------------------------------------------ (add/remove: 0/0 grow/shrink: 4/0 up/down: 45/0) Total: 45 bytes Signed-off-by: Sören Tempel Signed-off-by: Denys Vlasenko --- editors/ed.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/editors/ed.c b/editors/ed.c index dfe0f1a77..209ce9942 100644 --- a/editors/ed.c +++ b/editors/ed.c @@ -18,7 +18,7 @@ //applet:IF_ED(APPLET(ed, BB_DIR_BIN, BB_SUID_DROP)) -//usage:#define ed_trivial_usage "[-p PROMPT] [FILE]" +//usage:#define ed_trivial_usage "[-p PROMPT] [-s] [FILE]" //usage:#define ed_full_usage "" #include "libbb.h" @@ -71,6 +71,11 @@ struct globals { SET_PTR_TO_GLOBALS(xzalloc(sizeof(G))); \ } while (0) +#define OPTION_STR "sp:" +enum { + OPT_s = (1 << 0), +}; + static int bad_nums(int num1, int num2, const char *for_what) { if ((num1 < 1) || (num2 > lastNum) || (num1 > num2)) { @@ -458,7 +463,8 @@ static int readLines(const char *file, int num) * in the following format: * "%d\n", */ - printf("%u\n", 
charCount); + if (!(option_mask32 & OPT_s)) + printf("%u\n", charCount); return TRUE; } @@ -510,7 +516,8 @@ static int writeLines(const char *file, int num1, int num2) * unless the -s option was specified, in the following format: * "%d\n", */ - printf("%u\n", charCount); + if (!(option_mask32 & OPT_s)) + printf("%u\n", charCount); return TRUE; } @@ -1005,7 +1012,7 @@ int ed_main(int argc UNUSED_PARAM, char **argv) lines.prev = &lines; prompt = ""; /* no prompt by default */ - getopt32(argv, "p:", &prompt); + getopt32(argv, OPTION_STR, &prompt); argv += optind; if (argv[0]) { -- cgit v1.2.3-55-g6feb From 25aadc893d21b35f7d34a9d1edc843632e7abd8f Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 30 Dec 2021 13:07:12 +0100 Subject: libbb/sha1: add config-selectable fully unrolled version, closes 14391 function old new delta sha1_process_block64 364 4167 +3803 static.rconsts 16 - -16 ------------------------------------------------------------------------------ (add/remove: 0/1 grow/shrink: 1/0 up/down: 3803/-16) Total: 3787 bytes Signed-off-by: Denys Vlasenko --- libbb/Config.src | 25 +++++++++++----- libbb/hash_md5_sha.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 95 insertions(+), 14 deletions(-) diff --git a/libbb/Config.src b/libbb/Config.src index 24b31fad9..13188ef03 100644 --- a/libbb/Config.src +++ b/libbb/Config.src @@ -42,21 +42,32 @@ config MD5_SMALL default 1 # all "fast or small" options default to small range 0 3 help - Trade binary size versus speed for the md5sum algorithm. + Trade binary size versus speed for the md5 algorithm. 
Approximate values running uClibc and hashing linux-2.4.4.tar.bz2 were: - value user times (sec) text size (386) - 0 (fastest) 1.1 6144 - 1 1.4 5392 - 2 3.0 5088 - 3 (smallest) 5.1 4912 + value user times (sec) text size (386) + 0 (fastest) 1.1 6144 + 1 1.4 5392 + 2 3.0 5088 + 3 (smallest) 5.1 4912 + +config SHA1_SMALL + int "SHA1: Trade bytes for speed (0:fast, 3:slow)" + default 3 # all "fast or small" options default to small + range 0 3 + help + Trade binary size versus speed for the sha1 algorithm. + throughput MB/s size of sha1_process_block64 + value 486 x86-64 486 x86-64 + 0 339 374 4149 4167 + 1,2,3 200 195 358 380 config SHA3_SMALL int "SHA3: Trade bytes for speed (0:fast, 1:slow)" default 1 # all "fast or small" options default to small range 0 1 help - Trade binary size versus speed for the sha3sum algorithm. + Trade binary size versus speed for the sha3 algorithm. SHA3_SMALL=0 compared to SHA3_SMALL=1 (approximate): 64-bit x86: +270 bytes of code, 45% faster 32-bit x86: +450 bytes of code, 75% faster diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c index a468397e3..75673e334 100644 --- a/libbb/hash_md5_sha.c +++ b/libbb/hash_md5_sha.c @@ -390,7 +390,6 @@ static void FAST_FUNC md5_process_block64(md5_ctx_t *ctx) OP(FI, D, A, B, C, 11, 10, 0xbd3af235); OP(FI, C, D, A, B, 2, 15, 0x2ad7d2bb); OP(FI, B, C, D, A, 9, 21, 0xeb86d391); -# undef OP # endif /* Add checksum to the starting values */ ctx->hash[0] += A; @@ -399,6 +398,7 @@ static void FAST_FUNC md5_process_block64(md5_ctx_t *ctx) ctx->hash[3] += D; #endif } +#undef OP #undef FF #undef FG #undef FH @@ -490,18 +490,87 @@ unsigned FAST_FUNC md5_end(md5_ctx_t *ctx, void *resbuf) * then rebuild and compare "shaNNNsum bigfile" results. */ +#if CONFIG_SHA1_SMALL == 0 +/* Fast, fully-unrolled SHA1. +3800 bytes of code on x86. + * It seems further speedup can be achieved by handling more than + * 64 bytes per one function call (coreutils does that). 
+ */ +static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx) +{ + static const uint32_t rconsts[] ALIGN4 = { + 0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6 + }; + uint32_t W[16]; + uint32_t a, b, c, d, e; + + a = ctx->hash[0]; + b = ctx->hash[1]; + c = ctx->hash[2]; + d = ctx->hash[3]; + e = ctx->hash[4]; + +#undef OP +#define OP(A,B,C,D,E, n) \ + do { \ + uint32_t work = EXPR(B, C, D); \ + if (n <= 15) \ + work += W[n & 0xf] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[n]); \ + if (n >= 16) \ + work += W[n & 0xf] = rotl32(W[(n+13) & 0xf] ^ W[(n+8) & 0xf] ^ W[(n+2) & 0xf] ^ W[n & 0xf], 1); \ + E += work + rotl32(A, 5) + rconsts[n / 20]; \ + B = rotl32(B, 30); \ + } while (0) +#define OP20(n) \ + OP(a,b,c,d,e, (n+ 0)); OP(e,a,b,c,d, (n+ 1)); OP(d,e,a,b,c, (n+ 2)); OP(c,d,e,a,b, (n+ 3)); OP(b,c,d,e,a, (n+ 4)); \ + OP(a,b,c,d,e, (n+ 5)); OP(e,a,b,c,d, (n+ 6)); OP(d,e,a,b,c, (n+ 7)); OP(c,d,e,a,b, (n+ 8)); OP(b,c,d,e,a, (n+ 9)); \ + OP(a,b,c,d,e, (n+10)); OP(e,a,b,c,d, (n+11)); OP(d,e,a,b,c, (n+12)); OP(c,d,e,a,b, (n+13)); OP(b,c,d,e,a, (n+14)); \ + OP(a,b,c,d,e, (n+15)); OP(e,a,b,c,d, (n+16)); OP(d,e,a,b,c, (n+17)); OP(c,d,e,a,b, (n+18)); OP(b,c,d,e,a, (n+19)) + + /* 4 rounds of 20 operations each */ +#define EXPR(b,c,d) (((c ^ d) & b) ^ d) + OP20(0); +#undef EXPR +#define EXPR(b,c,d) (c ^ d ^ b) + OP20(20); +#undef EXPR +#define EXPR(b,c,d) (((b | c) & d) | (b & c)) + OP20(40); +#undef EXPR +#define EXPR(b,c,d) (c ^ d ^ b) + OP20(60); + +#undef EXPR +#undef OP +#undef OP20 + + ctx->hash[0] += a; + ctx->hash[1] += b; + ctx->hash[2] += c; + ctx->hash[3] += d; + ctx->hash[4] += e; +} +#else +/* TODO: for CONFIG_SHA1_SMALL == 1, have a partially unrolled version? 
*/ + +/* Compact version, almost twice as slow as fully unrolled */ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx) { static const uint32_t rconsts[] ALIGN4 = { 0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6 }; int i, j; - int cnt; + int n; uint32_t W[16+16]; uint32_t a, b, c, d, e; /* On-stack work buffer frees up one register in the main loop - * which otherwise will be needed to hold ctx pointer */ + * which otherwise will be needed to hold ctx pointer. + * + * The compiler is not smart enough to realize it, though. :( + * If __attribute__((optimize("2"))) is added to the function, + * only then gcc-9.3.1 spills "ctx" to stack and uses the freed + * register (making code 6 bytes smaller, not just faster). + */ for (i = 0; i < 16; i++) W[i] = W[i+16] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[i]); @@ -512,7 +581,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx) e = ctx->hash[4]; /* 4 rounds of 20 operations each */ - cnt = 0; + n = 0; for (i = 0; i < 4; i++) { j = 19; do { @@ -529,9 +598,9 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx) else /* i = 1 or 3 */ work ^= b; ge16: - W[cnt] = W[cnt+16] = rotl32(W[cnt+13] ^ W[cnt+8] ^ W[cnt+2] ^ W[cnt], 1); + W[n] = W[n+16] = rotl32(W[n+13] ^ W[n+8] ^ W[n+2] ^ W[n], 1); } - work += W[cnt]; + work += W[n]; work += e + rotl32(a, 5) + rconsts[i]; /* Rotate by one for next time */ @@ -540,7 +609,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx) c = rotl32(b, 30); b = a; a = work; - cnt = (cnt + 1) & 15; + n = (n + 1) & 15; } while (--j >= 0); } @@ -550,6 +619,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx) ctx->hash[3] += d; ctx->hash[4] += e; } +#endif /* Constants for SHA512 from FIPS 180-2:4.2.3. 
* SHA256 constants from FIPS 180-2:4.2.2 -- cgit v1.2.3-55-g6feb From 0b62a08777e29c34f947c791a1eded5b97e05699 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 30 Dec 2021 18:54:02 +0100 Subject: libbb/sha1: add config-selectable partially unrolled version function old new delta sha1_process_block64 364 732 +368 static.rconsts 16 - -16 ------------------------------------------------------------------------------ (add/remove: 0/1 grow/shrink: 1/0 up/down: 368/-16) Total: 352 bytes Signed-off-by: Denys Vlasenko --- libbb/Config.src | 3 +- libbb/hash_md5_sha.c | 100 ++++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 98 insertions(+), 5 deletions(-) diff --git a/libbb/Config.src b/libbb/Config.src index 13188ef03..c793f5939 100644 --- a/libbb/Config.src +++ b/libbb/Config.src @@ -60,7 +60,8 @@ config SHA1_SMALL throughput MB/s size of sha1_process_block64 value 486 x86-64 486 x86-64 0 339 374 4149 4167 - 1,2,3 200 195 358 380 + 1 224 229 654 732 + 2,3 200 195 358 380 config SHA3_SMALL int "SHA3: Trade bytes for speed (0:fast, 1:slow)" diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c index 75673e334..053ebe291 100644 --- a/libbb/hash_md5_sha.c +++ b/libbb/hash_md5_sha.c @@ -514,9 +514,9 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx) do { \ uint32_t work = EXPR(B, C, D); \ if (n <= 15) \ - work += W[n & 0xf] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[n]); \ + work += W[n & 15] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[n]); \ if (n >= 16) \ - work += W[n & 0xf] = rotl32(W[(n+13) & 0xf] ^ W[(n+8) & 0xf] ^ W[(n+2) & 0xf] ^ W[n & 0xf], 1); \ + work += W[n & 15] = rotl32(W[(n+13) & 15] ^ W[(n+8) & 15] ^ W[(n+2) & 15] ^ W[n & 15], 1); \ E += work + rotl32(A, 5) + rconsts[n / 20]; \ B = rotl32(B, 30); \ } while (0) @@ -549,9 +549,101 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx) ctx->hash[3] += d; ctx->hash[4] += e; } -#else -/* TODO: for CONFIG_SHA1_SMALL == 1, have a partially unrolled version? 
*/ +#elif CONFIG_SHA1_SMALL == 1 +/* Middle-sized version, +300 bytes of code on x86. */ +static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx) +{ + static const uint32_t rconsts[] ALIGN4 = { + 0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6 + }; + int j; + int n; + uint32_t W[16+16]; + uint32_t a, b, c, d, e; + + a = ctx->hash[0]; + b = ctx->hash[1]; + c = ctx->hash[2]; + d = ctx->hash[3]; + e = ctx->hash[4]; + + /* 1st round of 20 operations */ + n = 0; + do { + uint32_t work = ((c ^ d) & b) ^ d; + W[n] = W[n+16] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[n]); + work += W[n]; + work += e + rotl32(a, 5) + rconsts[0]; + /* Rotate by one for next time */ + e = d; + d = c; + c = rotl32(b, 30); + b = a; + a = work; + n = (n + 1) & 15; + } while (n != 0); + do { + uint32_t work = ((c ^ d) & b) ^ d; + W[n] = W[n+16] = rotl32(W[n+13] ^ W[n+8] ^ W[n+2] ^ W[n], 1); + work += W[n]; + work += e + rotl32(a, 5) + rconsts[0]; + e = d; + d = c; + c = rotl32(b, 30); + b = a; + a = work; + n = (n + 1) & 15; + } while (n != 4); + /* 2nd round of 20 operations */ + j = 19; + do { + uint32_t work = c ^ d ^ b; + W[n] = W[n+16] = rotl32(W[n+13] ^ W[n+8] ^ W[n+2] ^ W[n], 1); + work += W[n]; + work += e + rotl32(a, 5) + rconsts[1]; + e = d; + d = c; + c = rotl32(b, 30); + b = a; + a = work; + n = (n + 1) & 15; + } while (--j >= 0); + /* 3rd round */ + j = 19; + do { + uint32_t work = ((b | c) & d) | (b & c); + W[n] = W[n+16] = rotl32(W[n+13] ^ W[n+8] ^ W[n+2] ^ W[n], 1); + work += W[n]; + work += e + rotl32(a, 5) + rconsts[2]; + e = d; + d = c; + c = rotl32(b, 30); + b = a; + a = work; + n = (n + 1) & 15; + } while (--j >= 0); + /* 4th round */ + j = 19; + do { + uint32_t work = c ^ d ^ b; + W[n] = W[n+16] = rotl32(W[n+13] ^ W[n+8] ^ W[n+2] ^ W[n], 1); + work += W[n]; + work += e + rotl32(a, 5) + rconsts[3]; + e = d; + d = c; + c = rotl32(b, 30); + b = a; + a = work; + n = (n + 1) & 15; + } while (--j >= 0); + ctx->hash[0] += a; + ctx->hash[1] += b; + ctx->hash[2] += c; + ctx->hash[3] += 
d; + ctx->hash[4] += e; +} +#else /* Compact version, almost twice as slow as fully unrolled */ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx) { -- cgit v1.2.3-55-g6feb From f09d088fdf6eeeba902fb5627930145a3058a5f0 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 31 Dec 2021 17:06:00 +0100 Subject: libbb/sha1: shrink and speed up fully unrolled version function old new delta sha1_process_block64 4149 3950 -199 Signed-off-by: Denys Vlasenko --- libbb/Config.src | 2 +- libbb/hash_md5_sha.c | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/libbb/Config.src b/libbb/Config.src index c793f5939..d2054dc63 100644 --- a/libbb/Config.src +++ b/libbb/Config.src @@ -59,7 +59,7 @@ config SHA1_SMALL Trade binary size versus speed for the sha1 algorithm. throughput MB/s size of sha1_process_block64 value 486 x86-64 486 x86-64 - 0 339 374 4149 4167 + 0 360 374 3950 4167 1 224 229 654 732 2,3 200 195 358 380 diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c index 053ebe291..faf485df5 100644 --- a/libbb/hash_md5_sha.c +++ b/libbb/hash_md5_sha.c @@ -509,6 +509,27 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx) d = ctx->hash[3]; e = ctx->hash[4]; +/* From kernel source comments: + * """ + * If you have 32 registers or more, the compiler can (and should) + * try to change the array[] accesses into registers. However, on + * machines with less than ~25 registers, that won't really work, + * and at least gcc will make an unholy mess of it. + * + * So to avoid that mess which just slows things down, we force + * the stores to memory to actually happen (we might be better off + * with a 'W(t)=(val);asm("":"+m" (W(t))' there instead, as + * suggested by Artur Skawina - that will also make gcc unable to + * try to do the silly "optimize away loads" part because it won't + * see what the value will be). 
+ * """ + */ +#if defined(__i386__) +# define DO_NOT_TRY_PROPAGATING(m) asm("":"+m"(m)) +#else +# define DO_NOT_TRY_PROPAGATING(m) ((void)0) +#endif + #undef OP #define OP(A,B,C,D,E, n) \ do { \ @@ -517,6 +538,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx) work += W[n & 15] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[n]); \ if (n >= 16) \ work += W[n & 15] = rotl32(W[(n+13) & 15] ^ W[(n+8) & 15] ^ W[(n+2) & 15] ^ W[n & 15], 1); \ + DO_NOT_TRY_PROPAGATING(W[n & 15]); \ E += work + rotl32(A, 5) + rconsts[n / 20]; \ B = rotl32(B, 30); \ } while (0) -- cgit v1.2.3-55-g6feb From 5f6817020467598868b7d1c9ca477d7ccd66b87d Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sat, 1 Jan 2022 12:21:01 +0100 Subject: libbb/sha1: assembly versions for x86 32 bits: function old new delta sha1_process_block64 3950 3657 -293 64 bits: sha1_process_block64 4167 3683 -484 Signed-off-by: Denys Vlasenko --- libbb/Config.src | 2 +- libbb/hash_md5_sha.c | 417 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 418 insertions(+), 1 deletion(-) diff --git a/libbb/Config.src b/libbb/Config.src index d2054dc63..e027c14a8 100644 --- a/libbb/Config.src +++ b/libbb/Config.src @@ -59,7 +59,7 @@ config SHA1_SMALL Trade binary size versus speed for the sha1 algorithm. 
throughput MB/s size of sha1_process_block64 value 486 x86-64 486 x86-64 - 0 360 374 3950 4167 + 0 367 367 3657 3683 1 224 229 654 732 2,3 200 195 358 380 diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c index faf485df5..9de30dfe6 100644 --- a/libbb/hash_md5_sha.c +++ b/libbb/hash_md5_sha.c @@ -8,6 +8,9 @@ */ #include "libbb.h" +#define STR1(s) #s +#define STR(s) STR1(s) + #define NEED_SHA512 (ENABLE_SHA512SUM || ENABLE_USE_BB_CRYPT_SHA) /* gcc 4.2.1 optimizes rotr64 better with inline than with macro @@ -491,6 +494,419 @@ unsigned FAST_FUNC md5_end(md5_ctx_t *ctx, void *resbuf) */ #if CONFIG_SHA1_SMALL == 0 +# if defined(__GNUC__) && defined(__i386__) +static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM) +{ + BUILD_BUG_ON(offsetof(sha1_ctx_t, hash) != 76); + asm( +"\n\ + pushl %ebp # \n\ + pushl %edi # \n\ + pushl %esi # \n\ + pushl %ebx # \n\ + pushl %eax \n\ + movl $15, %edi \n\ +1: \n\ + movl (%eax,%edi,4), %esi \n\ + bswap %esi \n\ + pushl %esi \n\ + decl %edi \n\ + jns 1b \n\ + movl 80(%eax), %ebx # b = ctx->hash[1] \n\ + movl 84(%eax), %ecx # c = ctx->hash[2] \n\ + movl 88(%eax), %edx # d = ctx->hash[3] \n\ + movl 92(%eax), %ebp # e = ctx->hash[4] \n\ + movl 76(%eax), %eax # a = ctx->hash[0] \n\ +#Register and stack use: \n\ +# eax..edx: a..d \n\ +# ebp: e \n\ +# esi,edi: temps \n\ +# 4*n(%esp): W[n] \n\ +" +#define RD1As(a,b,c,d,e, n, RCONST) \ +"\n\ + ##movl 4*"n"(%esp), %esi # n=0, W[0] already in %esi \n\ + movl "c", %edi # c \n\ + xorl "d", %edi # ^d \n\ + andl "b", %edi # &b \n\ + xorl "d", %edi # (((c ^ d) & b) ^ d) \n\ + leal "RCONST"("e",%esi), "e" # e += RCONST + W[n] \n\ + addl %edi, "e" # e += (((c ^ d) & b) ^ d) \n\ + movl "a", %esi # \n\ + roll $5, %esi # rotl32(a,5) \n\ + addl %esi, "e" # e += rotl32(a,5) \n\ + rorl $2, "b" # b = rotl32(b,30) \n\ +" +#define RD1Bs(a,b,c,d,e, n, RCONST) \ +"\n\ + movl 4*"n"(%esp), %esi # W[n] \n\ + movl "c", %edi # c \n\ + xorl "d", %edi # ^d \n\ + andl "b", %edi # &b \n\ + xorl "d", 
%edi # (((c ^ d) & b) ^ d) \n\ + leal "RCONST"("e",%esi), "e" # e += RCONST + W[n] \n\ + addl %edi, "e" # e += (((c ^ d) & b) ^ d) \n\ + movl "a", %esi # \n\ + roll $5, %esi # rotl32(a,5) \n\ + addl %esi, "e" # e += rotl32(a,5) \n\ + rorl $2, "b" # b = rotl32(b,30) \n\ +" +#define RD1Cs(a,b,c,d,e, n13,n8,n2,n, RCONST) \ +"\n\ + movl 4*"n13"(%esp), %esi # W[(n+13) & 15] \n\ + xorl 4*"n8"(%esp), %esi # ^W[(n+8) & 15] \n\ + xorl 4*"n2"(%esp), %esi # ^W[(n+2) & 15] \n\ + xorl 4*"n"(%esp), %esi # ^W[n & 15] \n\ + roll %esi # \n\ + movl %esi, 4*"n"(%esp) # store to W[n & 15] \n\ + movl "c", %edi # c \n\ + xorl "d", %edi # ^d \n\ + andl "b", %edi # &b \n\ + xorl "d", %edi # (((c ^ d) & b) ^ d) \n\ + leal "RCONST"("e",%esi), "e" # e += RCONST + mixed_W \n\ + addl %edi, "e" # e += (((c ^ d) & b) ^ d) \n\ + movl "a", %esi # \n\ + roll $5, %esi # rotl32(a,5) \n\ + addl %esi, "e" # e += rotl32(a,5) \n\ + rorl $2, "b" # b = rotl32(b,30) \n\ +" +#define RD1A(a,b,c,d,e, n) RD1As("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR((n)), STR(RCONST)) +#define RD1B(a,b,c,d,e, n) RD1Bs("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR((n)), STR(RCONST)) +#define RD1C(a,b,c,d,e, n) RD1Cs("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR(((n+13)&15)), STR(((n+8)&15)), STR(((n+2)&15)), STR(((n)&15)), STR(RCONST)) +#undef RCONST +#define RCONST 0x5A827999 + RD1A(ax,bx,cx,dx,bp, 0) RD1B(bp,ax,bx,cx,dx, 1) RD1B(dx,bp,ax,bx,cx, 2) RD1B(cx,dx,bp,ax,bx, 3) RD1B(bx,cx,dx,bp,ax, 4) + RD1B(ax,bx,cx,dx,bp, 5) RD1B(bp,ax,bx,cx,dx, 6) RD1B(dx,bp,ax,bx,cx, 7) RD1B(cx,dx,bp,ax,bx, 8) RD1B(bx,cx,dx,bp,ax, 9) + RD1B(ax,bx,cx,dx,bp,10) RD1B(bp,ax,bx,cx,dx,11) RD1B(dx,bp,ax,bx,cx,12) RD1B(cx,dx,bp,ax,bx,13) RD1B(bx,cx,dx,bp,ax,14) + RD1B(ax,bx,cx,dx,bp,15) RD1C(bp,ax,bx,cx,dx,16) RD1C(dx,bp,ax,bx,cx,17) RD1C(cx,dx,bp,ax,bx,18) RD1C(bx,cx,dx,bp,ax,19) +#define RD2s(a,b,c,d,e, n13,n8,n2,n, RCONST) \ +"\n\ + movl 4*"n13"(%esp), %esi # W[(n+13) & 15] \n\ + xorl 4*"n8"(%esp), %esi 
# ^W[(n+8) & 15] \n\ + xorl 4*"n2"(%esp), %esi # ^W[(n+2) & 15] \n\ + xorl 4*"n"(%esp), %esi # ^W[n & 15] \n\ + roll %esi # \n\ + movl %esi, 4*"n"(%esp) # store to W[n & 15] \n\ + movl "c", %edi # c \n\ + xorl "d", %edi # ^d \n\ + xorl "b", %edi # ^b \n\ + leal "RCONST"("e",%esi), "e" # e += RCONST + mixed_W \n\ + addl %edi, "e" # e += (c ^ d ^ b) \n\ + movl "a", %esi # \n\ + roll $5, %esi # rotl32(a,5) \n\ + addl %esi, "e" # e += rotl32(a,5) \n\ + rorl $2, "b" # b = rotl32(b,30) \n\ +" +#define RD2(a,b,c,d,e, n) RD2s("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR(((20+n+13)&15)), STR(((20+n+8)&15)), STR(((20+n+2)&15)), STR(((20+n)&15)), STR(RCONST)) +#undef RCONST +#define RCONST 0x6ED9EBA1 + RD2(ax,bx,cx,dx,bp, 0) RD2(bp,ax,bx,cx,dx, 1) RD2(dx,bp,ax,bx,cx, 2) RD2(cx,dx,bp,ax,bx, 3) RD2(bx,cx,dx,bp,ax, 4) + RD2(ax,bx,cx,dx,bp, 5) RD2(bp,ax,bx,cx,dx, 6) RD2(dx,bp,ax,bx,cx, 7) RD2(cx,dx,bp,ax,bx, 8) RD2(bx,cx,dx,bp,ax, 9) + RD2(ax,bx,cx,dx,bp,10) RD2(bp,ax,bx,cx,dx,11) RD2(dx,bp,ax,bx,cx,12) RD2(cx,dx,bp,ax,bx,13) RD2(bx,cx,dx,bp,ax,14) + RD2(ax,bx,cx,dx,bp,15) RD2(bp,ax,bx,cx,dx,16) RD2(dx,bp,ax,bx,cx,17) RD2(cx,dx,bp,ax,bx,18) RD2(bx,cx,dx,bp,ax,19) + +#define RD3s(a,b,c,d,e, n13,n8,n2,n, RCONST) \ +"\n\ + movl "b", %edi # di: b \n\ + movl "b", %esi # si: b \n\ + orl "c", %edi # di: b | c \n\ + andl "c", %esi # si: b & c \n\ + andl "d", %edi # di: (b | c) & d \n\ + orl %esi, %edi # ((b | c) & d) | (b & c) \n\ + movl 4*"n13"(%esp), %esi # W[(n+13) & 15] \n\ + xorl 4*"n8"(%esp), %esi # ^W[(n+8) & 15] \n\ + xorl 4*"n2"(%esp), %esi # ^W[(n+2) & 15] \n\ + xorl 4*"n"(%esp), %esi # ^W[n & 15] \n\ + roll %esi # \n\ + movl %esi, 4*"n"(%esp) # store to W[n & 15] \n\ + addl %edi, "e" # += ((b | c) & d) | (b & c)\n\ + leal "RCONST"("e",%esi), "e" # e += RCONST + mixed_W \n\ + movl "a", %esi # \n\ + roll $5, %esi # rotl32(a,5) \n\ + addl %esi, "e" # e += rotl32(a,5) \n\ + rorl $2, "b" # b = rotl32(b,30) \n\ +" +#define RD3(a,b,c,d,e, n) 
RD3s("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR(((40+n+13)&15)), STR(((40+n+8)&15)), STR(((40+n+2)&15)), STR(((40+n)&15)), STR(RCONST)) +#undef RCONST +#define RCONST 0x8F1BBCDC + RD3(ax,bx,cx,dx,bp, 0) RD3(bp,ax,bx,cx,dx, 1) RD3(dx,bp,ax,bx,cx, 2) RD3(cx,dx,bp,ax,bx, 3) RD3(bx,cx,dx,bp,ax, 4) + RD3(ax,bx,cx,dx,bp, 5) RD3(bp,ax,bx,cx,dx, 6) RD3(dx,bp,ax,bx,cx, 7) RD3(cx,dx,bp,ax,bx, 8) RD3(bx,cx,dx,bp,ax, 9) + RD3(ax,bx,cx,dx,bp,10) RD3(bp,ax,bx,cx,dx,11) RD3(dx,bp,ax,bx,cx,12) RD3(cx,dx,bp,ax,bx,13) RD3(bx,cx,dx,bp,ax,14) + RD3(ax,bx,cx,dx,bp,15) RD3(bp,ax,bx,cx,dx,16) RD3(dx,bp,ax,bx,cx,17) RD3(cx,dx,bp,ax,bx,18) RD3(bx,cx,dx,bp,ax,19) + +#define RD4As(a,b,c,d,e, n13,n8,n2,n, RCONST) \ +"\n\ + movl 4*"n13"(%esp), %esi # W[(n+13) & 15] \n\ + xorl 4*"n8"(%esp), %esi # ^W[(n+8) & 15] \n\ + xorl 4*"n2"(%esp), %esi # ^W[(n+2) & 15] \n\ + xorl 4*"n"(%esp), %esi # ^W[n & 15] \n\ + roll %esi # \n\ + movl %esi, 4*"n"(%esp) # store to W[n & 15] \n\ + movl "c", %edi # c \n\ + xorl "d", %edi # ^d \n\ + xorl "b", %edi # ^b \n\ + leal "RCONST"("e",%esi), "e" # e += RCONST + mixed_W \n\ + addl %edi, "e" # e += (c ^ d ^ b) \n\ + movl "a", %esi # \n\ + roll $5, %esi # rotl32(a,5) \n\ + addl %esi, "e" # e += rotl32(a,5) \n\ + rorl $2, "b" # b = rotl32(b,30) \n\ +" +#define RD4Bs(a,b,c,d,e, n13,n8,n2,n, RCONST) \ +"\n\ + movl 4*"n13"(%esp), %esi # W[(n+13) & 15] \n\ + xorl 4*"n8"(%esp), %esi # ^W[(n+8) & 15] \n\ + xorl 4*"n2"(%esp), %esi # ^W[(n+2) & 15] \n\ + xorl 4*"n"(%esp), %esi # ^W[n & 15] \n\ + roll %esi # \n\ + ##movl %esi, 4*"n"(%esp) # store to W[n & 15] elided \n\ + movl "c", %edi # c \n\ + xorl "d", %edi # ^d \n\ + xorl "b", %edi # ^b \n\ + leal "RCONST"("e",%esi), "e" # e += RCONST + mixed_W \n\ + addl %edi, "e" # e += (c ^ d ^ b) \n\ + movl "a", %esi # \n\ + roll $5, %esi # rotl32(a,5) \n\ + addl %esi, "e" # e += rotl32(a,5) \n\ + rorl $2, "b" # b = rotl32(b,30) \n\ +" +#define RD4A(a,b,c,d,e, n) 
RD4As("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR(((60+n+13)&15)), STR(((60+n+8)&15)), STR(((60+n+2)&15)), STR(((60+n)&15)), STR(RCONST)) +#define RD4B(a,b,c,d,e, n) RD4Bs("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR(((60+n+13)&15)), STR(((60+n+8)&15)), STR(((60+n+2)&15)), STR(((60+n)&15)), STR(RCONST)) +#undef RCONST +#define RCONST 0xCA62C1D6 + RD4A(ax,bx,cx,dx,bp, 0) RD4A(bp,ax,bx,cx,dx, 1) RD4A(dx,bp,ax,bx,cx, 2) RD4A(cx,dx,bp,ax,bx, 3) RD4A(bx,cx,dx,bp,ax, 4) + RD4A(ax,bx,cx,dx,bp, 5) RD4A(bp,ax,bx,cx,dx, 6) RD4A(dx,bp,ax,bx,cx, 7) RD4A(cx,dx,bp,ax,bx, 8) RD4A(bx,cx,dx,bp,ax, 9) + RD4A(ax,bx,cx,dx,bp,10) RD4A(bp,ax,bx,cx,dx,11) RD4A(dx,bp,ax,bx,cx,12) RD4A(cx,dx,bp,ax,bx,13) RD4A(bx,cx,dx,bp,ax,14) + RD4A(ax,bx,cx,dx,bp,15) RD4A(bp,ax,bx,cx,dx,16) RD4B(dx,bp,ax,bx,cx,17) RD4B(cx,dx,bp,ax,bx,18) RD4B(bx,cx,dx,bp,ax,19) + +"\n\ + movl 4*16(%esp), %esi # \n\ + addl $4*(16+1), %esp # \n\ + addl %eax, 76(%esi) # ctx->hash[0] += a \n\ + addl %ebx, 80(%esi) # ctx->hash[1] += b \n\ + addl %ecx, 84(%esi) # ctx->hash[2] += c \n\ + addl %edx, 88(%esi) # ctx->hash[3] += d \n\ + addl %ebp, 92(%esi) # ctx->hash[4] += e \n\ + popl %ebx # \n\ + popl %esi # \n\ + popl %edi # \n\ + popl %ebp # \n\ +" + ); /* asm */ +#undef RCONST +} +# elif defined(__GNUC__) && defined(__x86_64__) +static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM) +{ + BUILD_BUG_ON(offsetof(sha1_ctx_t, hash) != 80); + asm( +// TODO: store W[] in r8..r15? 
(r8..r11 are callee-clobbered, no need to save) +"\n\ + ##pushq %r15 # \n\ + ##pushq %r14 # \n\ + ##pushq %r13 # \n\ + ##pushq %r12 # \n\ + ##pushq %rbp # \n\ + ##pushq %rbx # \n\ + movq %rbp, %r8 # callee-saved \n\ + movq %rbx, %r9 # callee-saved \n\ + movq %rdi, %r10 # we need ctx at the end \n\ + movl $15, %eax \n\ +1: \n\ + movl (%rdi,%rax,4), %esi \n\ + bswap %esi \n\ + movl %esi, -64(%rsp,%rax,4) \n\ + decl %eax \n\ + jns 1b \n\ + movl 80(%rdi), %eax # a = ctx->hash[0] \n\ + movl 84(%rdi), %ebx # b = ctx->hash[1] \n\ + movl 88(%rdi), %ecx # c = ctx->hash[2] \n\ + movl 92(%rdi), %edx # d = ctx->hash[3] \n\ + movl 96(%rdi), %ebp # e = ctx->hash[4] \n\ +#Register and stack use: \n\ +# eax..edx: a..d \n\ +# ebp: e \n\ +# esi,edi: temps \n\ +# -64+4*n(%rsp): W[n] \n\ +" +#define RD1As(a,b,c,d,e, n, RCONST) \ +"\n\ + ##movl -64+4*"n"(%rsp), %esi # n=0, W[0] already in %esi \n\ + movl %e"c", %edi # c \n\ + xorl %e"d", %edi # ^d \n\ + andl %e"b", %edi # &b \n\ + xorl %e"d", %edi # (((c ^ d) & b) ^ d) \n\ + leal "RCONST"(%r"e",%rsi), %e"e" # e += RCONST + W[n] \n\ + addl %edi, %e"e" # e += (((c ^ d) & b) ^ d) \n\ + movl %e"a", %esi # \n\ + roll $5, %esi # rotl32(a,5) \n\ + addl %esi, %e"e" # e += rotl32(a,5) \n\ + rorl $2, %e"b" # b = rotl32(b,30) \n\ +" +#define RD1Bs(a,b,c,d,e, n, RCONST) \ +"\n\ + movl -64+4*"n"(%rsp), %esi # W[n] \n\ + movl %e"c", %edi # c \n\ + xorl %e"d", %edi # ^d \n\ + andl %e"b", %edi # &b \n\ + xorl %e"d", %edi # (((c ^ d) & b) ^ d) \n\ + leal "RCONST"(%r"e",%rsi), %e"e" # e += RCONST + W[n] \n\ + addl %edi, %e"e" # e += (((c ^ d) & b) ^ d) \n\ + movl %e"a", %esi # \n\ + roll $5, %esi # rotl32(a,5) \n\ + addl %esi, %e"e" # e += rotl32(a,5) \n\ + rorl $2, %e"b" # b = rotl32(b,30) \n\ +" +#define RD1Cs(a,b,c,d,e, n13,n8,n2,n, RCONST) \ +"\n\ + movl -64+4*"n13"(%rsp), %esi # W[(n+13) & 15] \n\ + xorl -64+4*"n8"(%rsp), %esi # ^W[(n+8) & 15] \n\ + xorl -64+4*"n2"(%rsp), %esi # ^W[(n+2) & 15] \n\ + xorl -64+4*"n"(%rsp), %esi # ^W[n & 15] \n\ + 
roll %esi # \n\ + movl %esi, -64+4*"n"(%rsp) # store to W[n & 15] \n\ + movl %e"c", %edi # c \n\ + xorl %e"d", %edi # ^d \n\ + andl %e"b", %edi # &b \n\ + xorl %e"d", %edi # (((c ^ d) & b) ^ d) \n\ + leal "RCONST"(%r"e",%rsi), %e"e" # e += RCONST + mixed_W \n\ + addl %edi, %e"e" # e += (((c ^ d) & b) ^ d) \n\ + movl %e"a", %esi # \n\ + roll $5, %esi # rotl32(a,5) \n\ + addl %esi, %e"e" # e += rotl32(a,5) \n\ + rorl $2, %e"b" # b = rotl32(b,30) \n\ +" +#define RD1A(a,b,c,d,e, n) RD1As(STR(a),STR(b),STR(c),STR(d),STR(e), STR((n)), STR(RCONST)) +#define RD1B(a,b,c,d,e, n) RD1Bs(STR(a),STR(b),STR(c),STR(d),STR(e), STR((n)), STR(RCONST)) +#define RD1C(a,b,c,d,e, n) RD1Cs(STR(a),STR(b),STR(c),STR(d),STR(e), STR(((n+13)&15)), STR(((n+8)&15)), STR(((n+2)&15)), STR(((n)&15)), STR(RCONST)) +#undef RCONST +#define RCONST 0x5A827999 + RD1A(ax,bx,cx,dx,bp, 0) RD1B(bp,ax,bx,cx,dx, 1) RD1B(dx,bp,ax,bx,cx, 2) RD1B(cx,dx,bp,ax,bx, 3) RD1B(bx,cx,dx,bp,ax, 4) + RD1B(ax,bx,cx,dx,bp, 5) RD1B(bp,ax,bx,cx,dx, 6) RD1B(dx,bp,ax,bx,cx, 7) RD1B(cx,dx,bp,ax,bx, 8) RD1B(bx,cx,dx,bp,ax, 9) + RD1B(ax,bx,cx,dx,bp,10) RD1B(bp,ax,bx,cx,dx,11) RD1B(dx,bp,ax,bx,cx,12) RD1B(cx,dx,bp,ax,bx,13) RD1B(bx,cx,dx,bp,ax,14) + RD1B(ax,bx,cx,dx,bp,15) RD1C(bp,ax,bx,cx,dx,16) RD1C(dx,bp,ax,bx,cx,17) RD1C(cx,dx,bp,ax,bx,18) RD1C(bx,cx,dx,bp,ax,19) +#define RD2s(a,b,c,d,e, n13,n8,n2,n, RCONST) \ +"\n\ + movl -64+4*"n13"(%rsp), %esi # W[(n+13) & 15] \n\ + xorl -64+4*"n8"(%rsp), %esi # ^W[(n+8) & 15] \n\ + xorl -64+4*"n2"(%rsp), %esi # ^W[(n+2) & 15] \n\ + xorl -64+4*"n"(%rsp), %esi # ^W[n & 15] \n\ + roll %esi # \n\ + movl %esi, -64+4*"n"(%rsp) # store to W[n & 15] \n\ + movl %e"c", %edi # c \n\ + xorl %e"d", %edi # ^d \n\ + xorl %e"b", %edi # ^b \n\ + leal "RCONST"(%r"e",%rsi), %e"e" # e += RCONST + mixed_W \n\ + addl %edi, %e"e" # e += (c ^ d ^ b) \n\ + movl %e"a", %esi # \n\ + roll $5, %esi # rotl32(a,5) \n\ + addl %esi, %e"e" # e += rotl32(a,5) \n\ + rorl $2, %e"b" # b = rotl32(b,30) \n\ +" +#define 
RD2(a,b,c,d,e, n) RD2s(STR(a),STR(b),STR(c),STR(d),STR(e), STR(((20+n+13)&15)), STR(((20+n+8)&15)), STR(((20+n+2)&15)), STR(((20+n)&15)), STR(RCONST)) +#undef RCONST +#define RCONST 0x6ED9EBA1 + RD2(ax,bx,cx,dx,bp, 0) RD2(bp,ax,bx,cx,dx, 1) RD2(dx,bp,ax,bx,cx, 2) RD2(cx,dx,bp,ax,bx, 3) RD2(bx,cx,dx,bp,ax, 4) + RD2(ax,bx,cx,dx,bp, 5) RD2(bp,ax,bx,cx,dx, 6) RD2(dx,bp,ax,bx,cx, 7) RD2(cx,dx,bp,ax,bx, 8) RD2(bx,cx,dx,bp,ax, 9) + RD2(ax,bx,cx,dx,bp,10) RD2(bp,ax,bx,cx,dx,11) RD2(dx,bp,ax,bx,cx,12) RD2(cx,dx,bp,ax,bx,13) RD2(bx,cx,dx,bp,ax,14) + RD2(ax,bx,cx,dx,bp,15) RD2(bp,ax,bx,cx,dx,16) RD2(dx,bp,ax,bx,cx,17) RD2(cx,dx,bp,ax,bx,18) RD2(bx,cx,dx,bp,ax,19) + +#define RD3s(a,b,c,d,e, n13,n8,n2,n, RCONST) \ +"\n\ + movl %e"b", %edi # di: b \n\ + movl %e"b", %esi # si: b \n\ + orl %e"c", %edi # di: b | c \n\ + andl %e"c", %esi # si: b & c \n\ + andl %e"d", %edi # di: (b | c) & d \n\ + orl %esi, %edi # ((b | c) & d) | (b & c) \n\ + movl -64+4*"n13"(%rsp), %esi # W[(n+13) & 15] \n\ + xorl -64+4*"n8"(%rsp), %esi # ^W[(n+8) & 15] \n\ + xorl -64+4*"n2"(%rsp), %esi # ^W[(n+2) & 15] \n\ + xorl -64+4*"n"(%rsp), %esi # ^W[n & 15] \n\ + roll %esi # \n\ + movl %esi, -64+4*"n"(%rsp) # store to W[n & 15] \n\ + addl %edi, %e"e" # += ((b | c) & d) | (b & c)\n\ + leal "RCONST"(%r"e",%rsi), %e"e" # e += RCONST + mixed_W \n\ + movl %e"a", %esi # \n\ + roll $5, %esi # rotl32(a,5) \n\ + addl %esi, %e"e" # e += rotl32(a,5) \n\ + rorl $2, %e"b" # b = rotl32(b,30) \n\ +" +#define RD3(a,b,c,d,e, n) RD3s(STR(a),STR(b),STR(c),STR(d),STR(e), STR(((40+n+13)&15)), STR(((40+n+8)&15)), STR(((40+n+2)&15)), STR(((40+n)&15)), STR(RCONST)) +#undef RCONST +//#define RCONST 0x8F1BBCDC "out of range for signed 32bit displacement" +#define RCONST -0x70e44324 + RD3(ax,bx,cx,dx,bp, 0) RD3(bp,ax,bx,cx,dx, 1) RD3(dx,bp,ax,bx,cx, 2) RD3(cx,dx,bp,ax,bx, 3) RD3(bx,cx,dx,bp,ax, 4) + RD3(ax,bx,cx,dx,bp, 5) RD3(bp,ax,bx,cx,dx, 6) RD3(dx,bp,ax,bx,cx, 7) RD3(cx,dx,bp,ax,bx, 8) RD3(bx,cx,dx,bp,ax, 9) + 
RD3(ax,bx,cx,dx,bp,10) RD3(bp,ax,bx,cx,dx,11) RD3(dx,bp,ax,bx,cx,12) RD3(cx,dx,bp,ax,bx,13) RD3(bx,cx,dx,bp,ax,14) + RD3(ax,bx,cx,dx,bp,15) RD3(bp,ax,bx,cx,dx,16) RD3(dx,bp,ax,bx,cx,17) RD3(cx,dx,bp,ax,bx,18) RD3(bx,cx,dx,bp,ax,19) + +#define RD4As(a,b,c,d,e, n13,n8,n2,n, RCONST) \ +"\n\ + movl -64+4*"n13"(%rsp), %esi # W[(n+13) & 15] \n\ + xorl -64+4*"n8"(%rsp), %esi # ^W[(n+8) & 15] \n\ + xorl -64+4*"n2"(%rsp), %esi # ^W[(n+2) & 15] \n\ + xorl -64+4*"n"(%rsp), %esi # ^W[n & 15] \n\ + roll %esi # \n\ + movl %esi, -64+4*"n"(%rsp) # store to W[n & 15] \n\ + movl %e"c", %edi # c \n\ + xorl %e"d", %edi # ^d \n\ + xorl %e"b", %edi # ^b \n\ + leal "RCONST"(%r"e",%rsi), %e"e" # e += RCONST + mixed_W \n\ + addl %edi, %e"e" # e += (c ^ d ^ b) \n\ + movl %e"a", %esi # \n\ + roll $5, %esi # rotl32(a,5) \n\ + addl %esi, %e"e" # e += rotl32(a,5) \n\ + rorl $2, %e"b" # b = rotl32(b,30) \n\ +" +#define RD4Bs(a,b,c,d,e, n13,n8,n2,n, RCONST) \ +"\n\ + movl -64+4*"n13"(%rsp), %esi # W[(n+13) & 15] \n\ + xorl -64+4*"n8"(%rsp), %esi # ^W[(n+8) & 15] \n\ + xorl -64+4*"n2"(%rsp), %esi # ^W[(n+2) & 15] \n\ + xorl -64+4*"n"(%rsp), %esi # ^W[n & 15] \n\ + roll %esi # \n\ + ##movl %esi, -64+4*"n"(%rsp) # store to W[n & 15] elided \n\ + movl %e"c", %edi # c \n\ + xorl %e"d", %edi # ^d \n\ + xorl %e"b", %edi # ^b \n\ + leal "RCONST"(%r"e",%rsi), %e"e" # e += RCONST + mixed_W \n\ + addl %edi, %e"e" # e += (c ^ d ^ b) \n\ + movl %e"a", %esi # \n\ + roll $5, %esi # rotl32(a,5) \n\ + addl %esi, %e"e" # e += rotl32(a,5) \n\ + rorl $2, %e"b" # b = rotl32(b,30) \n\ +" +#define RD4A(a,b,c,d,e, n) RD4As(STR(a),STR(b),STR(c),STR(d),STR(e), STR(((60+n+13)&15)), STR(((60+n+8)&15)), STR(((60+n+2)&15)), STR(((60+n)&15)), STR(RCONST)) +#define RD4B(a,b,c,d,e, n) RD4Bs(STR(a),STR(b),STR(c),STR(d),STR(e), STR(((60+n+13)&15)), STR(((60+n+8)&15)), STR(((60+n+2)&15)), STR(((60+n)&15)), STR(RCONST)) +#undef RCONST +//#define RCONST 0xCA62C1D6 "out of range for signed 32bit displacement" +#define RCONST 
-0x359d3e2a + RD4A(ax,bx,cx,dx,bp, 0) RD4A(bp,ax,bx,cx,dx, 1) RD4A(dx,bp,ax,bx,cx, 2) RD4A(cx,dx,bp,ax,bx, 3) RD4A(bx,cx,dx,bp,ax, 4) + RD4A(ax,bx,cx,dx,bp, 5) RD4A(bp,ax,bx,cx,dx, 6) RD4A(dx,bp,ax,bx,cx, 7) RD4A(cx,dx,bp,ax,bx, 8) RD4A(bx,cx,dx,bp,ax, 9) + RD4A(ax,bx,cx,dx,bp,10) RD4A(bp,ax,bx,cx,dx,11) RD4A(dx,bp,ax,bx,cx,12) RD4A(cx,dx,bp,ax,bx,13) RD4A(bx,cx,dx,bp,ax,14) + RD4A(ax,bx,cx,dx,bp,15) RD4A(bp,ax,bx,cx,dx,16) RD4B(dx,bp,ax,bx,cx,17) RD4B(cx,dx,bp,ax,bx,18) RD4B(bx,cx,dx,bp,ax,19) + +"\n\ + movq %r10, %rdi # \n\ + addl %eax, 80(%rdi) # ctx->hash[0] += a \n\ + addl %ebx, 84(%rdi) # ctx->hash[1] += b \n\ + addl %ecx, 88(%rdi) # ctx->hash[2] += c \n\ + addl %edx, 92(%rdi) # ctx->hash[3] += d \n\ + addl %ebp, 96(%rdi) # ctx->hash[4] += e \n\ + movq %r9, %rbx # callee-saved \n\ + movq %r8, %rbp # callee-saved \n\ + ##popq %rbx # \n\ + ##popq %rbp # \n\ + ##popq %r12 # \n\ + ##popq %r13 # \n\ + ##popq %r14 # \n\ + ##popq %r15 # \n\ +" + ); /* asm */ +#undef RCONST +} +# else /* Fast, fully-unrolled SHA1. +3800 bytes of code on x86. * It seems further speedup can be achieved by handling more than * 64 bytes per one function call (coreutils does that). @@ -571,6 +987,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx) ctx->hash[3] += d; ctx->hash[4] += e; } +# endif #elif CONFIG_SHA1_SMALL == 1 /* Middle-sized version, +300 bytes of code on x86. 
*/ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx) -- cgit v1.2.3-55-g6feb From d643010feeef312c77d7f51c3dd476d4e605c982 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sat, 1 Jan 2022 15:01:53 +0100 Subject: libbb/sha1: shrink x86_64 version - use r8..15 for W[8..15] function old new delta sha1_process_block64 3683 3562 -121 Signed-off-by: Denys Vlasenko --- libbb/Config.src | 2 +- libbb/hash_md5_sha.c | 299 ++++++++++++++++++++++++++++++++++++++++----------- 2 files changed, 240 insertions(+), 61 deletions(-) diff --git a/libbb/Config.src b/libbb/Config.src index e027c14a8..f66f65f81 100644 --- a/libbb/Config.src +++ b/libbb/Config.src @@ -59,7 +59,7 @@ config SHA1_SMALL Trade binary size versus speed for the sha1 algorithm. throughput MB/s size of sha1_process_block64 value 486 x86-64 486 x86-64 - 0 367 367 3657 3683 + 0 367 367 3657 3562 1 224 229 654 732 2,3 200 195 358 380 diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c index 9de30dfe6..a4e36066a 100644 --- a/libbb/hash_md5_sha.c +++ b/libbb/hash_md5_sha.c @@ -700,22 +700,194 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM) { BUILD_BUG_ON(offsetof(sha1_ctx_t, hash) != 80); asm( -// TODO: store W[] in r8..r15? 
(r8..r11 are callee-clobbered, no need to save) "\n\ - ##pushq %r15 # \n\ - ##pushq %r14 # \n\ - ##pushq %r13 # \n\ - ##pushq %r12 # \n\ - ##pushq %rbp # \n\ - ##pushq %rbx # \n\ - movq %rbp, %r8 # callee-saved \n\ - movq %rbx, %r9 # callee-saved \n\ - movq %rdi, %r10 # we need ctx at the end \n\ - movl $15, %eax \n\ + pushq %r15 # \n\ + pushq %r14 # \n\ + pushq %r13 # \n\ + pushq %r12 # \n\ + pushq %rbp # \n\ + pushq %rbx # \n\ + pushq %rdi # we need ctx at the end \n\ + \n\ +#Register and stack use: \n\ +# eax..edx: a..d \n\ +# ebp: e \n\ +# esi,edi: temps \n\ +# -32+4*n(%rsp),r8...r15: W[0..7,8..15] \n\ + .macro loadW n,r \n\ + .if \\n == 0 \n\ + movl -32+4*0(%rsp),\\r \n\ + .endif \n\ + .if \\n == 1 \n\ + movl -32+4*1(%rsp),\\r \n\ + .endif \n\ + .if \\n == 2 \n\ + movl -32+4*2(%rsp),\\r \n\ + .endif \n\ + .if \\n == 3 \n\ + movl -32+4*3(%rsp),\\r \n\ + .endif \n\ + .if \\n == 4 \n\ + movl -32+4*4(%rsp),\\r \n\ + .endif \n\ + .if \\n == 5 \n\ + movl -32+4*5(%rsp),\\r \n\ + .endif \n\ + .if \\n == 6 \n\ + movl -32+4*6(%rsp),\\r \n\ + .endif \n\ + .if \\n == 7 \n\ + movl -32+4*7(%rsp),\\r \n\ + .endif \n\ + .if \\n == 8 \n\ + movl %r8d,\\r \n\ + .endif \n\ + .if \\n == 9 \n\ + movl %r9d,\\r \n\ + .endif \n\ + .if \\n == 10 \n\ + movl %r10d,\\r \n\ + .endif \n\ + .if \\n == 11 \n\ + movl %r11d,\\r \n\ + .endif \n\ + .if \\n == 12 \n\ + movl %r12d,\\r \n\ + .endif \n\ + .if \\n == 13 \n\ + movl %r13d,\\r \n\ + .endif \n\ + .if \\n == 14 \n\ + movl %r14d,\\r \n\ + .endif \n\ + .if \\n == 15 \n\ + movl %r15d,\\r \n\ + .endif \n\ + .endm \n\ + \n\ + .macro storeW r,n \n\ + .if \\n == 0 \n\ + movl \\r,-32+4*0(%rsp) \n\ + .endif \n\ + .if \\n == 1 \n\ + movl \\r,-32+4*1(%rsp) \n\ + .endif \n\ + .if \\n == 2 \n\ + movl \\r,-32+4*2(%rsp) \n\ + .endif \n\ + .if \\n == 3 \n\ + movl \\r,-32+4*3(%rsp) \n\ + .endif \n\ + .if \\n == 4 \n\ + movl \\r,-32+4*4(%rsp) \n\ + .endif \n\ + .if \\n == 5 \n\ + movl \\r,-32+4*5(%rsp) \n\ + .endif \n\ + .if \\n == 6 \n\ + movl 
\\r,-32+4*6(%rsp) \n\ + .endif \n\ + .if \\n == 7 \n\ + movl \\r,-32+4*7(%rsp) \n\ + .endif \n\ + .if \\n == 8 \n\ + movl \\r,%r8d \n\ + .endif \n\ + .if \\n == 9 \n\ + movl \\r,%r9d \n\ + .endif \n\ + .if \\n == 10 \n\ + movl \\r,%r10d \n\ + .endif \n\ + .if \\n == 11 \n\ + movl \\r,%r11d \n\ + .endif \n\ + .if \\n == 12 \n\ + movl \\r,%r12d \n\ + .endif \n\ + .if \\n == 13 \n\ + movl \\r,%r13d \n\ + .endif \n\ + .if \\n == 14 \n\ + movl \\r,%r14d \n\ + .endif \n\ + .if \\n == 15 \n\ + movl \\r,%r15d \n\ + .endif \n\ + .endm \n\ + \n\ + .macro xorW n,r \n\ + .if \\n == 0 \n\ + xorl -32+4*0(%rsp),\\r \n\ + .endif \n\ + .if \\n == 1 \n\ + xorl -32+4*1(%rsp),\\r \n\ + .endif \n\ + .if \\n == 2 \n\ + xorl -32+4*2(%rsp),\\r \n\ + .endif \n\ + .if \\n == 3 \n\ + xorl -32+4*3(%rsp),\\r \n\ + .endif \n\ + .if \\n == 4 \n\ + xorl -32+4*4(%rsp),\\r \n\ + .endif \n\ + .if \\n == 5 \n\ + xorl -32+4*5(%rsp),\\r \n\ + .endif \n\ + .if \\n == 6 \n\ + xorl -32+4*6(%rsp),\\r \n\ + .endif \n\ + .if \\n == 7 \n\ + xorl -32+4*7(%rsp),\\r \n\ + .endif \n\ + .if \\n == 8 \n\ + xorl %r8d,\\r \n\ + .endif \n\ + .if \\n == 9 \n\ + xorl %r9d,\\r \n\ + .endif \n\ + .if \\n == 10 \n\ + xorl %r10d,\\r \n\ + .endif \n\ + .if \\n == 11 \n\ + xorl %r11d,\\r \n\ + .endif \n\ + .if \\n == 12 \n\ + xorl %r12d,\\r \n\ + .endif \n\ + .if \\n == 13 \n\ + xorl %r13d,\\r \n\ + .endif \n\ + .if \\n == 14 \n\ + xorl %r14d,\\r \n\ + .endif \n\ + .if \\n == 15 \n\ + xorl %r15d,\\r \n\ + .endif \n\ + .endm \n\ + \n\ + movl 4*8(%rdi), %r8d \n\ + bswap %r8d \n\ + movl 4*9(%rdi), %r9d \n\ + bswap %r9d \n\ + movl 4*10(%rdi), %r10d \n\ + bswap %r10d \n\ + movl 4*11(%rdi), %r11d \n\ + bswap %r11d \n\ + movl 4*12(%rdi), %r12d \n\ + bswap %r12d \n\ + movl 4*13(%rdi), %r13d \n\ + bswap %r13d \n\ + movl 4*14(%rdi), %r14d \n\ + bswap %r14d \n\ + movl 4*15(%rdi), %r15d \n\ + bswap %r15d \n\ + movl $7, %eax \n\ 1: \n\ movl (%rdi,%rax,4), %esi \n\ bswap %esi \n\ - movl %esi, -64(%rsp,%rax,4) \n\ + movl %esi, 
-32(%rsp,%rax,4) \n\ decl %eax \n\ jns 1b \n\ movl 80(%rdi), %eax # a = ctx->hash[0] \n\ @@ -723,15 +895,10 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM) movl 88(%rdi), %ecx # c = ctx->hash[2] \n\ movl 92(%rdi), %edx # d = ctx->hash[3] \n\ movl 96(%rdi), %ebp # e = ctx->hash[4] \n\ -#Register and stack use: \n\ -# eax..edx: a..d \n\ -# ebp: e \n\ -# esi,edi: temps \n\ -# -64+4*n(%rsp): W[n] \n\ " #define RD1As(a,b,c,d,e, n, RCONST) \ "\n\ - ##movl -64+4*"n"(%rsp), %esi # n=0, W[0] already in %esi \n\ + ##loadW "n", %esi # n=0, W[0] already in %esi \n\ movl %e"c", %edi # c \n\ xorl %e"d", %edi # ^d \n\ andl %e"b", %edi # &b \n\ @@ -745,7 +912,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM) " #define RD1Bs(a,b,c,d,e, n, RCONST) \ "\n\ - movl -64+4*"n"(%rsp), %esi # W[n] \n\ + loadW "n", %esi # W[n] \n\ movl %e"c", %edi # c \n\ xorl %e"d", %edi # ^d \n\ andl %e"b", %edi # &b \n\ @@ -757,14 +924,27 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM) addl %esi, %e"e" # e += rotl32(a,5) \n\ rorl $2, %e"b" # b = rotl32(b,30) \n\ " -#define RD1Cs(a,b,c,d,e, n13,n8,n2,n, RCONST) \ +#define RD1Cs(a,b,c,d,e, n, RCONST) \ "\n\ - movl -64+4*"n13"(%rsp), %esi # W[(n+13) & 15] \n\ - xorl -64+4*"n8"(%rsp), %esi # ^W[(n+8) & 15] \n\ - xorl -64+4*"n2"(%rsp), %esi # ^W[(n+2) & 15] \n\ - xorl -64+4*"n"(%rsp), %esi # ^W[n & 15] \n\ + movl %e"c", %edi # c \n\ + xorl %e"d", %edi # ^d \n\ + andl %e"b", %edi # &b \n\ + xorl %e"d", %edi # (((c ^ d) & b) ^ d) \n\ + leal "RCONST"(%r"e",%r"n"), %e"e" # e += RCONST + W[n] \n\ + addl %edi, %e"e" # e += (((c ^ d) & b) ^ d) \n\ + movl %e"a", %esi # \n\ + roll $5, %esi # rotl32(a,5) \n\ + addl %esi, %e"e" # e += rotl32(a,5) \n\ + rorl $2, %e"b" # b = rotl32(b,30) \n\ +" +#define RD1Ds(a,b,c,d,e, n13,n8,n2,n, RCONST) \ +"\n\ + loadW "n13", %esi # W[(n+13) & 15] \n\ + xorW "n8", %esi # ^W[(n+8) & 15] \n\ + xorW "n2", %esi # ^W[(n+2) & 15] \n\ + xorW "n", %esi # ^W[n & 
15] \n\ roll %esi # \n\ - movl %esi, -64+4*"n"(%rsp) # store to W[n & 15] \n\ + storeW %esi, "n" # store to W[n & 15] \n\ movl %e"c", %edi # c \n\ xorl %e"d", %edi # ^d \n\ andl %e"b", %edi # &b \n\ @@ -776,23 +956,24 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM) addl %esi, %e"e" # e += rotl32(a,5) \n\ rorl $2, %e"b" # b = rotl32(b,30) \n\ " -#define RD1A(a,b,c,d,e, n) RD1As(STR(a),STR(b),STR(c),STR(d),STR(e), STR((n)), STR(RCONST)) -#define RD1B(a,b,c,d,e, n) RD1Bs(STR(a),STR(b),STR(c),STR(d),STR(e), STR((n)), STR(RCONST)) -#define RD1C(a,b,c,d,e, n) RD1Cs(STR(a),STR(b),STR(c),STR(d),STR(e), STR(((n+13)&15)), STR(((n+8)&15)), STR(((n+2)&15)), STR(((n)&15)), STR(RCONST)) +#define RD1A(a,b,c,d,e, n) RD1As(STR(a),STR(b),STR(c),STR(d),STR(e), STR(n), STR(RCONST)) +#define RD1B(a,b,c,d,e, n) RD1Bs(STR(a),STR(b),STR(c),STR(d),STR(e), STR(n), STR(RCONST)) +#define RD1C(a,b,c,d,e, n) RD1Cs(STR(a),STR(b),STR(c),STR(d),STR(e), STR(n), STR(RCONST)) +#define RD1D(a,b,c,d,e, n) RD1Ds(STR(a),STR(b),STR(c),STR(d),STR(e), STR(((n+13)&15)), STR(((n+8)&15)), STR(((n+2)&15)), STR(((n)&15)), STR(RCONST)) #undef RCONST #define RCONST 0x5A827999 RD1A(ax,bx,cx,dx,bp, 0) RD1B(bp,ax,bx,cx,dx, 1) RD1B(dx,bp,ax,bx,cx, 2) RD1B(cx,dx,bp,ax,bx, 3) RD1B(bx,cx,dx,bp,ax, 4) - RD1B(ax,bx,cx,dx,bp, 5) RD1B(bp,ax,bx,cx,dx, 6) RD1B(dx,bp,ax,bx,cx, 7) RD1B(cx,dx,bp,ax,bx, 8) RD1B(bx,cx,dx,bp,ax, 9) - RD1B(ax,bx,cx,dx,bp,10) RD1B(bp,ax,bx,cx,dx,11) RD1B(dx,bp,ax,bx,cx,12) RD1B(cx,dx,bp,ax,bx,13) RD1B(bx,cx,dx,bp,ax,14) - RD1B(ax,bx,cx,dx,bp,15) RD1C(bp,ax,bx,cx,dx,16) RD1C(dx,bp,ax,bx,cx,17) RD1C(cx,dx,bp,ax,bx,18) RD1C(bx,cx,dx,bp,ax,19) + RD1B(ax,bx,cx,dx,bp, 5) RD1B(bp,ax,bx,cx,dx, 6) RD1B(dx,bp,ax,bx,cx, 7) RD1C(cx,dx,bp,ax,bx, 8) RD1C(bx,cx,dx,bp,ax, 9) + RD1C(ax,bx,cx,dx,bp,10) RD1C(bp,ax,bx,cx,dx,11) RD1C(dx,bp,ax,bx,cx,12) RD1C(cx,dx,bp,ax,bx,13) RD1C(bx,cx,dx,bp,ax,14) + RD1C(ax,bx,cx,dx,bp,15) RD1D(bp,ax,bx,cx,dx,16) RD1D(dx,bp,ax,bx,cx,17) 
RD1D(cx,dx,bp,ax,bx,18) RD1D(bx,cx,dx,bp,ax,19) #define RD2s(a,b,c,d,e, n13,n8,n2,n, RCONST) \ "\n\ - movl -64+4*"n13"(%rsp), %esi # W[(n+13) & 15] \n\ - xorl -64+4*"n8"(%rsp), %esi # ^W[(n+8) & 15] \n\ - xorl -64+4*"n2"(%rsp), %esi # ^W[(n+2) & 15] \n\ - xorl -64+4*"n"(%rsp), %esi # ^W[n & 15] \n\ + loadW "n13", %esi # W[(n+13) & 15] \n\ + xorW "n8", %esi # ^W[(n+8) & 15] \n\ + xorW "n2", %esi # ^W[(n+2) & 15] \n\ + xorW "n", %esi # ^W[n & 15] \n\ roll %esi # \n\ - movl %esi, -64+4*"n"(%rsp) # store to W[n & 15] \n\ + storeW %esi, "n" # store to W[n & 15] \n\ movl %e"c", %edi # c \n\ xorl %e"d", %edi # ^d \n\ xorl %e"b", %edi # ^b \n\ @@ -819,12 +1000,12 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM) andl %e"c", %esi # si: b & c \n\ andl %e"d", %edi # di: (b | c) & d \n\ orl %esi, %edi # ((b | c) & d) | (b & c) \n\ - movl -64+4*"n13"(%rsp), %esi # W[(n+13) & 15] \n\ - xorl -64+4*"n8"(%rsp), %esi # ^W[(n+8) & 15] \n\ - xorl -64+4*"n2"(%rsp), %esi # ^W[(n+2) & 15] \n\ - xorl -64+4*"n"(%rsp), %esi # ^W[n & 15] \n\ + loadW "n13", %esi # W[(n+13) & 15] \n\ + xorW "n8", %esi # ^W[(n+8) & 15] \n\ + xorW "n2", %esi # ^W[(n+2) & 15] \n\ + xorW "n", %esi # ^W[n & 15] \n\ roll %esi # \n\ - movl %esi, -64+4*"n"(%rsp) # store to W[n & 15] \n\ + storeW %esi, "n" # store to W[n & 15] \n\ addl %edi, %e"e" # += ((b | c) & d) | (b & c)\n\ leal "RCONST"(%r"e",%rsi), %e"e" # e += RCONST + mixed_W \n\ movl %e"a", %esi # \n\ @@ -843,12 +1024,12 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM) #define RD4As(a,b,c,d,e, n13,n8,n2,n, RCONST) \ "\n\ - movl -64+4*"n13"(%rsp), %esi # W[(n+13) & 15] \n\ - xorl -64+4*"n8"(%rsp), %esi # ^W[(n+8) & 15] \n\ - xorl -64+4*"n2"(%rsp), %esi # ^W[(n+2) & 15] \n\ - xorl -64+4*"n"(%rsp), %esi # ^W[n & 15] \n\ + loadW "n13", %esi # W[(n+13) & 15] \n\ + xorW "n8", %esi # ^W[(n+8) & 15] \n\ + xorW "n2", %esi # ^W[(n+2) & 15] \n\ + xorW "n", %esi # ^W[n & 15] \n\ roll %esi # \n\ - movl %esi, 
-64+4*"n"(%rsp) # store to W[n & 15] \n\ + storeW %esi, "n" # store to W[n & 15] \n\ movl %e"c", %edi # c \n\ xorl %e"d", %edi # ^d \n\ xorl %e"b", %edi # ^b \n\ @@ -861,12 +1042,12 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM) " #define RD4Bs(a,b,c,d,e, n13,n8,n2,n, RCONST) \ "\n\ - movl -64+4*"n13"(%rsp), %esi # W[(n+13) & 15] \n\ - xorl -64+4*"n8"(%rsp), %esi # ^W[(n+8) & 15] \n\ - xorl -64+4*"n2"(%rsp), %esi # ^W[(n+2) & 15] \n\ - xorl -64+4*"n"(%rsp), %esi # ^W[n & 15] \n\ + loadW "n13", %esi # W[(n+13) & 15] \n\ + xorW "n8", %esi # ^W[(n+8) & 15] \n\ + xorW "n2", %esi # ^W[(n+2) & 15] \n\ + xorW "n", %esi # ^W[n & 15] \n\ roll %esi # \n\ - ##movl %esi, -64+4*"n"(%rsp) # store to W[n & 15] elided \n\ + #storeW %esi, "n" # store to W[n & 15] elided \n\ movl %e"c", %edi # c \n\ xorl %e"d", %edi # ^d \n\ xorl %e"b", %edi # ^b \n\ @@ -888,20 +1069,18 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM) RD4A(ax,bx,cx,dx,bp,15) RD4A(bp,ax,bx,cx,dx,16) RD4B(dx,bp,ax,bx,cx,17) RD4B(cx,dx,bp,ax,bx,18) RD4B(bx,cx,dx,bp,ax,19) "\n\ - movq %r10, %rdi # \n\ + popq %rdi # \n\ addl %eax, 80(%rdi) # ctx->hash[0] += a \n\ addl %ebx, 84(%rdi) # ctx->hash[1] += b \n\ addl %ecx, 88(%rdi) # ctx->hash[2] += c \n\ addl %edx, 92(%rdi) # ctx->hash[3] += d \n\ addl %ebp, 96(%rdi) # ctx->hash[4] += e \n\ - movq %r9, %rbx # callee-saved \n\ - movq %r8, %rbp # callee-saved \n\ - ##popq %rbx # \n\ - ##popq %rbp # \n\ - ##popq %r12 # \n\ - ##popq %r13 # \n\ - ##popq %r14 # \n\ - ##popq %r15 # \n\ + popq %rbx # \n\ + popq %rbp # \n\ + popq %r12 # \n\ + popq %r13 # \n\ + popq %r14 # \n\ + popq %r15 # \n\ " ); /* asm */ #undef RCONST -- cgit v1.2.3-55-g6feb From 4d4f1f2096f06d69a6f205f0d8e33d4398f25677 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sat, 1 Jan 2022 15:42:15 +0100 Subject: libbb/sha1: x86_64 version: bswap in 64-bit chunks function old new delta sha1_process_block64 3562 3570 +8 Signed-off-by: Denys Vlasenko --- 
libbb/Config.src | 2 +- libbb/hash_md5_sha.c | 42 ++++++++++++++++++++++-------------------- 2 files changed, 23 insertions(+), 21 deletions(-) diff --git a/libbb/Config.src b/libbb/Config.src index f66f65f81..42a2283aa 100644 --- a/libbb/Config.src +++ b/libbb/Config.src @@ -59,7 +59,7 @@ config SHA1_SMALL Trade binary size versus speed for the sha1 algorithm. throughput MB/s size of sha1_process_block64 value 486 x86-64 486 x86-64 - 0 367 367 3657 3562 + 0 367 367 3657 3570 1 224 229 654 732 2,3 200 195 358 380 diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c index a4e36066a..959bfc951 100644 --- a/libbb/hash_md5_sha.c +++ b/libbb/hash_md5_sha.c @@ -867,27 +867,29 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM) .endif \n\ .endm \n\ \n\ - movl 4*8(%rdi), %r8d \n\ - bswap %r8d \n\ - movl 4*9(%rdi), %r9d \n\ - bswap %r9d \n\ - movl 4*10(%rdi), %r10d \n\ - bswap %r10d \n\ - movl 4*11(%rdi), %r11d \n\ - bswap %r11d \n\ - movl 4*12(%rdi), %r12d \n\ - bswap %r12d \n\ - movl 4*13(%rdi), %r13d \n\ - bswap %r13d \n\ - movl 4*14(%rdi), %r14d \n\ - bswap %r14d \n\ - movl 4*15(%rdi), %r15d \n\ - bswap %r15d \n\ - movl $7, %eax \n\ + movq 4*8(%rdi), %r8 \n\ + bswap %r8 \n\ + movl %r8d, %r9d \n\ + shrq $32, %r8 \n\ + movq 4*10(%rdi), %r10 \n\ + bswap %r10 \n\ + movl %r10d, %r11d \n\ + shrq $32, %r10 \n\ + movq 4*12(%rdi), %r12 \n\ + bswap %r12 \n\ + movl %r12d, %r13d \n\ + shrq $32, %r12 \n\ + movq 4*14(%rdi), %r14 \n\ + bswap %r14 \n\ + movl %r14d, %r15d \n\ + shrq $32, %r14 \n\ + \n\ + movl $3, %eax \n\ 1: \n\ - movl (%rdi,%rax,4), %esi \n\ - bswap %esi \n\ - movl %esi, -32(%rsp,%rax,4) \n\ + movq (%rdi,%rax,8), %rsi \n\ + bswap %rsi \n\ + rolq $32, %rsi \n\ + movq %rsi, -32(%rsp,%rax,8) \n\ decl %eax \n\ jns 1b \n\ movl 80(%rdi), %eax # a = ctx->hash[0] \n\ -- cgit v1.2.3-55-g6feb From 5c0c5582319a5123635c9fd62f8e99ef01cceb3f Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sun, 2 Jan 2022 01:56:35 +0100 Subject: libbb/sha1: code shrink 
in medium-speed version function old new delta sha1_process_block64 654 641 -13 Signed-off-by: Denys Vlasenko --- libbb/hash_md5_sha.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c index 959bfc951..7eca3de4d 100644 --- a/libbb/hash_md5_sha.c +++ b/libbb/hash_md5_sha.c @@ -1121,7 +1121,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx) * see what the value will be). * """ */ -#if defined(__i386__) +#if defined(__GNUC__) && defined(__i386__) # define DO_NOT_TRY_PROPAGATING(m) asm("":"+m"(m)) #else # define DO_NOT_TRY_PROPAGATING(m) ((void)0) @@ -1212,7 +1212,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx) c = rotl32(b, 30); b = a; a = work; - n = (n + 1) & 15; + n = (n + 1) /* & 15*/; } while (n != 4); /* 2nd round of 20 operations */ j = 19; -- cgit v1.2.3-55-g6feb From 05fd13ebec869fc5e6f226481a2405a2685e8db1 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Mon, 3 Jan 2022 01:57:29 +0100 Subject: libbb/sha1: x86_64 version: move to a separate .S file, no code changes Signed-off-by: Denys Vlasenko --- libbb/Kbuild.src | 1 + libbb/hash_md5_sha.c | 392 +------------ libbb/hash_md5_sha_x86-64.S | 1349 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 1353 insertions(+), 389 deletions(-) create mode 100644 libbb/hash_md5_sha_x86-64.S diff --git a/libbb/Kbuild.src b/libbb/Kbuild.src index 2fa239857..19b8aad60 100644 --- a/libbb/Kbuild.src +++ b/libbb/Kbuild.src @@ -56,6 +56,7 @@ lib-y += login.o lib-y += make_directory.o lib-y += makedev.o lib-y += hash_md5_sha.o +lib-y += hash_md5_sha_x86-64.o # Alternative (disabled) MD5 implementation #lib-y += hash_md5prime.o lib-y += messages.o diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c index 7eca3de4d..ee19c1cb7 100644 --- a/libbb/hash_md5_sha.c +++ b/libbb/hash_md5_sha.c @@ -696,397 +696,11 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM) #undef RCONST } # elif defined(__GNUC__) 
&& defined(__x86_64__) -static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM) -{ - BUILD_BUG_ON(offsetof(sha1_ctx_t, hash) != 80); - asm( -"\n\ - pushq %r15 # \n\ - pushq %r14 # \n\ - pushq %r13 # \n\ - pushq %r12 # \n\ - pushq %rbp # \n\ - pushq %rbx # \n\ - pushq %rdi # we need ctx at the end \n\ - \n\ -#Register and stack use: \n\ -# eax..edx: a..d \n\ -# ebp: e \n\ -# esi,edi: temps \n\ -# -32+4*n(%rsp),r8...r15: W[0..7,8..15] \n\ - .macro loadW n,r \n\ - .if \\n == 0 \n\ - movl -32+4*0(%rsp),\\r \n\ - .endif \n\ - .if \\n == 1 \n\ - movl -32+4*1(%rsp),\\r \n\ - .endif \n\ - .if \\n == 2 \n\ - movl -32+4*2(%rsp),\\r \n\ - .endif \n\ - .if \\n == 3 \n\ - movl -32+4*3(%rsp),\\r \n\ - .endif \n\ - .if \\n == 4 \n\ - movl -32+4*4(%rsp),\\r \n\ - .endif \n\ - .if \\n == 5 \n\ - movl -32+4*5(%rsp),\\r \n\ - .endif \n\ - .if \\n == 6 \n\ - movl -32+4*6(%rsp),\\r \n\ - .endif \n\ - .if \\n == 7 \n\ - movl -32+4*7(%rsp),\\r \n\ - .endif \n\ - .if \\n == 8 \n\ - movl %r8d,\\r \n\ - .endif \n\ - .if \\n == 9 \n\ - movl %r9d,\\r \n\ - .endif \n\ - .if \\n == 10 \n\ - movl %r10d,\\r \n\ - .endif \n\ - .if \\n == 11 \n\ - movl %r11d,\\r \n\ - .endif \n\ - .if \\n == 12 \n\ - movl %r12d,\\r \n\ - .endif \n\ - .if \\n == 13 \n\ - movl %r13d,\\r \n\ - .endif \n\ - .if \\n == 14 \n\ - movl %r14d,\\r \n\ - .endif \n\ - .if \\n == 15 \n\ - movl %r15d,\\r \n\ - .endif \n\ - .endm \n\ - \n\ - .macro storeW r,n \n\ - .if \\n == 0 \n\ - movl \\r,-32+4*0(%rsp) \n\ - .endif \n\ - .if \\n == 1 \n\ - movl \\r,-32+4*1(%rsp) \n\ - .endif \n\ - .if \\n == 2 \n\ - movl \\r,-32+4*2(%rsp) \n\ - .endif \n\ - .if \\n == 3 \n\ - movl \\r,-32+4*3(%rsp) \n\ - .endif \n\ - .if \\n == 4 \n\ - movl \\r,-32+4*4(%rsp) \n\ - .endif \n\ - .if \\n == 5 \n\ - movl \\r,-32+4*5(%rsp) \n\ - .endif \n\ - .if \\n == 6 \n\ - movl \\r,-32+4*6(%rsp) \n\ - .endif \n\ - .if \\n == 7 \n\ - movl \\r,-32+4*7(%rsp) \n\ - .endif \n\ - .if \\n == 8 \n\ - movl \\r,%r8d \n\ - .endif \n\ - .if \\n == 9 \n\ - 
movl \\r,%r9d \n\ - .endif \n\ - .if \\n == 10 \n\ - movl \\r,%r10d \n\ - .endif \n\ - .if \\n == 11 \n\ - movl \\r,%r11d \n\ - .endif \n\ - .if \\n == 12 \n\ - movl \\r,%r12d \n\ - .endif \n\ - .if \\n == 13 \n\ - movl \\r,%r13d \n\ - .endif \n\ - .if \\n == 14 \n\ - movl \\r,%r14d \n\ - .endif \n\ - .if \\n == 15 \n\ - movl \\r,%r15d \n\ - .endif \n\ - .endm \n\ - \n\ - .macro xorW n,r \n\ - .if \\n == 0 \n\ - xorl -32+4*0(%rsp),\\r \n\ - .endif \n\ - .if \\n == 1 \n\ - xorl -32+4*1(%rsp),\\r \n\ - .endif \n\ - .if \\n == 2 \n\ - xorl -32+4*2(%rsp),\\r \n\ - .endif \n\ - .if \\n == 3 \n\ - xorl -32+4*3(%rsp),\\r \n\ - .endif \n\ - .if \\n == 4 \n\ - xorl -32+4*4(%rsp),\\r \n\ - .endif \n\ - .if \\n == 5 \n\ - xorl -32+4*5(%rsp),\\r \n\ - .endif \n\ - .if \\n == 6 \n\ - xorl -32+4*6(%rsp),\\r \n\ - .endif \n\ - .if \\n == 7 \n\ - xorl -32+4*7(%rsp),\\r \n\ - .endif \n\ - .if \\n == 8 \n\ - xorl %r8d,\\r \n\ - .endif \n\ - .if \\n == 9 \n\ - xorl %r9d,\\r \n\ - .endif \n\ - .if \\n == 10 \n\ - xorl %r10d,\\r \n\ - .endif \n\ - .if \\n == 11 \n\ - xorl %r11d,\\r \n\ - .endif \n\ - .if \\n == 12 \n\ - xorl %r12d,\\r \n\ - .endif \n\ - .if \\n == 13 \n\ - xorl %r13d,\\r \n\ - .endif \n\ - .if \\n == 14 \n\ - xorl %r14d,\\r \n\ - .endif \n\ - .if \\n == 15 \n\ - xorl %r15d,\\r \n\ - .endif \n\ - .endm \n\ - \n\ - movq 4*8(%rdi), %r8 \n\ - bswap %r8 \n\ - movl %r8d, %r9d \n\ - shrq $32, %r8 \n\ - movq 4*10(%rdi), %r10 \n\ - bswap %r10 \n\ - movl %r10d, %r11d \n\ - shrq $32, %r10 \n\ - movq 4*12(%rdi), %r12 \n\ - bswap %r12 \n\ - movl %r12d, %r13d \n\ - shrq $32, %r12 \n\ - movq 4*14(%rdi), %r14 \n\ - bswap %r14 \n\ - movl %r14d, %r15d \n\ - shrq $32, %r14 \n\ - \n\ - movl $3, %eax \n\ -1: \n\ - movq (%rdi,%rax,8), %rsi \n\ - bswap %rsi \n\ - rolq $32, %rsi \n\ - movq %rsi, -32(%rsp,%rax,8) \n\ - decl %eax \n\ - jns 1b \n\ - movl 80(%rdi), %eax # a = ctx->hash[0] \n\ - movl 84(%rdi), %ebx # b = ctx->hash[1] \n\ - movl 88(%rdi), %ecx # c = ctx->hash[2] \n\ - movl 
92(%rdi), %edx # d = ctx->hash[3] \n\ - movl 96(%rdi), %ebp # e = ctx->hash[4] \n\ -" -#define RD1As(a,b,c,d,e, n, RCONST) \ -"\n\ - ##loadW "n", %esi # n=0, W[0] already in %esi \n\ - movl %e"c", %edi # c \n\ - xorl %e"d", %edi # ^d \n\ - andl %e"b", %edi # &b \n\ - xorl %e"d", %edi # (((c ^ d) & b) ^ d) \n\ - leal "RCONST"(%r"e",%rsi), %e"e" # e += RCONST + W[n] \n\ - addl %edi, %e"e" # e += (((c ^ d) & b) ^ d) \n\ - movl %e"a", %esi # \n\ - roll $5, %esi # rotl32(a,5) \n\ - addl %esi, %e"e" # e += rotl32(a,5) \n\ - rorl $2, %e"b" # b = rotl32(b,30) \n\ -" -#define RD1Bs(a,b,c,d,e, n, RCONST) \ -"\n\ - loadW "n", %esi # W[n] \n\ - movl %e"c", %edi # c \n\ - xorl %e"d", %edi # ^d \n\ - andl %e"b", %edi # &b \n\ - xorl %e"d", %edi # (((c ^ d) & b) ^ d) \n\ - leal "RCONST"(%r"e",%rsi), %e"e" # e += RCONST + W[n] \n\ - addl %edi, %e"e" # e += (((c ^ d) & b) ^ d) \n\ - movl %e"a", %esi # \n\ - roll $5, %esi # rotl32(a,5) \n\ - addl %esi, %e"e" # e += rotl32(a,5) \n\ - rorl $2, %e"b" # b = rotl32(b,30) \n\ -" -#define RD1Cs(a,b,c,d,e, n, RCONST) \ -"\n\ - movl %e"c", %edi # c \n\ - xorl %e"d", %edi # ^d \n\ - andl %e"b", %edi # &b \n\ - xorl %e"d", %edi # (((c ^ d) & b) ^ d) \n\ - leal "RCONST"(%r"e",%r"n"), %e"e" # e += RCONST + W[n] \n\ - addl %edi, %e"e" # e += (((c ^ d) & b) ^ d) \n\ - movl %e"a", %esi # \n\ - roll $5, %esi # rotl32(a,5) \n\ - addl %esi, %e"e" # e += rotl32(a,5) \n\ - rorl $2, %e"b" # b = rotl32(b,30) \n\ -" -#define RD1Ds(a,b,c,d,e, n13,n8,n2,n, RCONST) \ -"\n\ - loadW "n13", %esi # W[(n+13) & 15] \n\ - xorW "n8", %esi # ^W[(n+8) & 15] \n\ - xorW "n2", %esi # ^W[(n+2) & 15] \n\ - xorW "n", %esi # ^W[n & 15] \n\ - roll %esi # \n\ - storeW %esi, "n" # store to W[n & 15] \n\ - movl %e"c", %edi # c \n\ - xorl %e"d", %edi # ^d \n\ - andl %e"b", %edi # &b \n\ - xorl %e"d", %edi # (((c ^ d) & b) ^ d) \n\ - leal "RCONST"(%r"e",%rsi), %e"e" # e += RCONST + mixed_W \n\ - addl %edi, %e"e" # e += (((c ^ d) & b) ^ d) \n\ - movl %e"a", %esi # \n\ - roll $5, 
%esi # rotl32(a,5) \n\ - addl %esi, %e"e" # e += rotl32(a,5) \n\ - rorl $2, %e"b" # b = rotl32(b,30) \n\ -" -#define RD1A(a,b,c,d,e, n) RD1As(STR(a),STR(b),STR(c),STR(d),STR(e), STR(n), STR(RCONST)) -#define RD1B(a,b,c,d,e, n) RD1Bs(STR(a),STR(b),STR(c),STR(d),STR(e), STR(n), STR(RCONST)) -#define RD1C(a,b,c,d,e, n) RD1Cs(STR(a),STR(b),STR(c),STR(d),STR(e), STR(n), STR(RCONST)) -#define RD1D(a,b,c,d,e, n) RD1Ds(STR(a),STR(b),STR(c),STR(d),STR(e), STR(((n+13)&15)), STR(((n+8)&15)), STR(((n+2)&15)), STR(((n)&15)), STR(RCONST)) -#undef RCONST -#define RCONST 0x5A827999 - RD1A(ax,bx,cx,dx,bp, 0) RD1B(bp,ax,bx,cx,dx, 1) RD1B(dx,bp,ax,bx,cx, 2) RD1B(cx,dx,bp,ax,bx, 3) RD1B(bx,cx,dx,bp,ax, 4) - RD1B(ax,bx,cx,dx,bp, 5) RD1B(bp,ax,bx,cx,dx, 6) RD1B(dx,bp,ax,bx,cx, 7) RD1C(cx,dx,bp,ax,bx, 8) RD1C(bx,cx,dx,bp,ax, 9) - RD1C(ax,bx,cx,dx,bp,10) RD1C(bp,ax,bx,cx,dx,11) RD1C(dx,bp,ax,bx,cx,12) RD1C(cx,dx,bp,ax,bx,13) RD1C(bx,cx,dx,bp,ax,14) - RD1C(ax,bx,cx,dx,bp,15) RD1D(bp,ax,bx,cx,dx,16) RD1D(dx,bp,ax,bx,cx,17) RD1D(cx,dx,bp,ax,bx,18) RD1D(bx,cx,dx,bp,ax,19) -#define RD2s(a,b,c,d,e, n13,n8,n2,n, RCONST) \ -"\n\ - loadW "n13", %esi # W[(n+13) & 15] \n\ - xorW "n8", %esi # ^W[(n+8) & 15] \n\ - xorW "n2", %esi # ^W[(n+2) & 15] \n\ - xorW "n", %esi # ^W[n & 15] \n\ - roll %esi # \n\ - storeW %esi, "n" # store to W[n & 15] \n\ - movl %e"c", %edi # c \n\ - xorl %e"d", %edi # ^d \n\ - xorl %e"b", %edi # ^b \n\ - leal "RCONST"(%r"e",%rsi), %e"e" # e += RCONST + mixed_W \n\ - addl %edi, %e"e" # e += (c ^ d ^ b) \n\ - movl %e"a", %esi # \n\ - roll $5, %esi # rotl32(a,5) \n\ - addl %esi, %e"e" # e += rotl32(a,5) \n\ - rorl $2, %e"b" # b = rotl32(b,30) \n\ -" -#define RD2(a,b,c,d,e, n) RD2s(STR(a),STR(b),STR(c),STR(d),STR(e), STR(((20+n+13)&15)), STR(((20+n+8)&15)), STR(((20+n+2)&15)), STR(((20+n)&15)), STR(RCONST)) -#undef RCONST -#define RCONST 0x6ED9EBA1 - RD2(ax,bx,cx,dx,bp, 0) RD2(bp,ax,bx,cx,dx, 1) RD2(dx,bp,ax,bx,cx, 2) RD2(cx,dx,bp,ax,bx, 3) RD2(bx,cx,dx,bp,ax, 4) - 
RD2(ax,bx,cx,dx,bp, 5) RD2(bp,ax,bx,cx,dx, 6) RD2(dx,bp,ax,bx,cx, 7) RD2(cx,dx,bp,ax,bx, 8) RD2(bx,cx,dx,bp,ax, 9) - RD2(ax,bx,cx,dx,bp,10) RD2(bp,ax,bx,cx,dx,11) RD2(dx,bp,ax,bx,cx,12) RD2(cx,dx,bp,ax,bx,13) RD2(bx,cx,dx,bp,ax,14) - RD2(ax,bx,cx,dx,bp,15) RD2(bp,ax,bx,cx,dx,16) RD2(dx,bp,ax,bx,cx,17) RD2(cx,dx,bp,ax,bx,18) RD2(bx,cx,dx,bp,ax,19) - -#define RD3s(a,b,c,d,e, n13,n8,n2,n, RCONST) \ -"\n\ - movl %e"b", %edi # di: b \n\ - movl %e"b", %esi # si: b \n\ - orl %e"c", %edi # di: b | c \n\ - andl %e"c", %esi # si: b & c \n\ - andl %e"d", %edi # di: (b | c) & d \n\ - orl %esi, %edi # ((b | c) & d) | (b & c) \n\ - loadW "n13", %esi # W[(n+13) & 15] \n\ - xorW "n8", %esi # ^W[(n+8) & 15] \n\ - xorW "n2", %esi # ^W[(n+2) & 15] \n\ - xorW "n", %esi # ^W[n & 15] \n\ - roll %esi # \n\ - storeW %esi, "n" # store to W[n & 15] \n\ - addl %edi, %e"e" # += ((b | c) & d) | (b & c)\n\ - leal "RCONST"(%r"e",%rsi), %e"e" # e += RCONST + mixed_W \n\ - movl %e"a", %esi # \n\ - roll $5, %esi # rotl32(a,5) \n\ - addl %esi, %e"e" # e += rotl32(a,5) \n\ - rorl $2, %e"b" # b = rotl32(b,30) \n\ -" -#define RD3(a,b,c,d,e, n) RD3s(STR(a),STR(b),STR(c),STR(d),STR(e), STR(((40+n+13)&15)), STR(((40+n+8)&15)), STR(((40+n+2)&15)), STR(((40+n)&15)), STR(RCONST)) -#undef RCONST -//#define RCONST 0x8F1BBCDC "out of range for signed 32bit displacement" -#define RCONST -0x70e44324 - RD3(ax,bx,cx,dx,bp, 0) RD3(bp,ax,bx,cx,dx, 1) RD3(dx,bp,ax,bx,cx, 2) RD3(cx,dx,bp,ax,bx, 3) RD3(bx,cx,dx,bp,ax, 4) - RD3(ax,bx,cx,dx,bp, 5) RD3(bp,ax,bx,cx,dx, 6) RD3(dx,bp,ax,bx,cx, 7) RD3(cx,dx,bp,ax,bx, 8) RD3(bx,cx,dx,bp,ax, 9) - RD3(ax,bx,cx,dx,bp,10) RD3(bp,ax,bx,cx,dx,11) RD3(dx,bp,ax,bx,cx,12) RD3(cx,dx,bp,ax,bx,13) RD3(bx,cx,dx,bp,ax,14) - RD3(ax,bx,cx,dx,bp,15) RD3(bp,ax,bx,cx,dx,16) RD3(dx,bp,ax,bx,cx,17) RD3(cx,dx,bp,ax,bx,18) RD3(bx,cx,dx,bp,ax,19) -#define RD4As(a,b,c,d,e, n13,n8,n2,n, RCONST) \ -"\n\ - loadW "n13", %esi # W[(n+13) & 15] \n\ - xorW "n8", %esi # ^W[(n+8) & 15] \n\ - xorW "n2", %esi # 
^W[(n+2) & 15] \n\ - xorW "n", %esi # ^W[n & 15] \n\ - roll %esi # \n\ - storeW %esi, "n" # store to W[n & 15] \n\ - movl %e"c", %edi # c \n\ - xorl %e"d", %edi # ^d \n\ - xorl %e"b", %edi # ^b \n\ - leal "RCONST"(%r"e",%rsi), %e"e" # e += RCONST + mixed_W \n\ - addl %edi, %e"e" # e += (c ^ d ^ b) \n\ - movl %e"a", %esi # \n\ - roll $5, %esi # rotl32(a,5) \n\ - addl %esi, %e"e" # e += rotl32(a,5) \n\ - rorl $2, %e"b" # b = rotl32(b,30) \n\ -" -#define RD4Bs(a,b,c,d,e, n13,n8,n2,n, RCONST) \ -"\n\ - loadW "n13", %esi # W[(n+13) & 15] \n\ - xorW "n8", %esi # ^W[(n+8) & 15] \n\ - xorW "n2", %esi # ^W[(n+2) & 15] \n\ - xorW "n", %esi # ^W[n & 15] \n\ - roll %esi # \n\ - #storeW %esi, "n" # store to W[n & 15] elided \n\ - movl %e"c", %edi # c \n\ - xorl %e"d", %edi # ^d \n\ - xorl %e"b", %edi # ^b \n\ - leal "RCONST"(%r"e",%rsi), %e"e" # e += RCONST + mixed_W \n\ - addl %edi, %e"e" # e += (c ^ d ^ b) \n\ - movl %e"a", %esi # \n\ - roll $5, %esi # rotl32(a,5) \n\ - addl %esi, %e"e" # e += rotl32(a,5) \n\ - rorl $2, %e"b" # b = rotl32(b,30) \n\ -" -#define RD4A(a,b,c,d,e, n) RD4As(STR(a),STR(b),STR(c),STR(d),STR(e), STR(((60+n+13)&15)), STR(((60+n+8)&15)), STR(((60+n+2)&15)), STR(((60+n)&15)), STR(RCONST)) -#define RD4B(a,b,c,d,e, n) RD4Bs(STR(a),STR(b),STR(c),STR(d),STR(e), STR(((60+n+13)&15)), STR(((60+n+8)&15)), STR(((60+n+2)&15)), STR(((60+n)&15)), STR(RCONST)) -#undef RCONST -//#define RCONST 0xCA62C1D6 "out of range for signed 32bit displacement" -#define RCONST -0x359d3e2a - RD4A(ax,bx,cx,dx,bp, 0) RD4A(bp,ax,bx,cx,dx, 1) RD4A(dx,bp,ax,bx,cx, 2) RD4A(cx,dx,bp,ax,bx, 3) RD4A(bx,cx,dx,bp,ax, 4) - RD4A(ax,bx,cx,dx,bp, 5) RD4A(bp,ax,bx,cx,dx, 6) RD4A(dx,bp,ax,bx,cx, 7) RD4A(cx,dx,bp,ax,bx, 8) RD4A(bx,cx,dx,bp,ax, 9) - RD4A(ax,bx,cx,dx,bp,10) RD4A(bp,ax,bx,cx,dx,11) RD4A(dx,bp,ax,bx,cx,12) RD4A(cx,dx,bp,ax,bx,13) RD4A(bx,cx,dx,bp,ax,14) - RD4A(ax,bx,cx,dx,bp,15) RD4A(bp,ax,bx,cx,dx,16) RD4B(dx,bp,ax,bx,cx,17) RD4B(cx,dx,bp,ax,bx,18) RD4B(bx,cx,dx,bp,ax,19) +/* in 
hash_md5_sha_x86-64.S */ +struct ASM_expects_80 { char t[1 - 2*(offsetof(sha1_ctx_t, hash) != 80)]; }; +void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM); -"\n\ - popq %rdi # \n\ - addl %eax, 80(%rdi) # ctx->hash[0] += a \n\ - addl %ebx, 84(%rdi) # ctx->hash[1] += b \n\ - addl %ecx, 88(%rdi) # ctx->hash[2] += c \n\ - addl %edx, 92(%rdi) # ctx->hash[3] += d \n\ - addl %ebp, 96(%rdi) # ctx->hash[4] += e \n\ - popq %rbx # \n\ - popq %rbp # \n\ - popq %r12 # \n\ - popq %r13 # \n\ - popq %r14 # \n\ - popq %r15 # \n\ -" - ); /* asm */ -#undef RCONST -} # else /* Fast, fully-unrolled SHA1. +3800 bytes of code on x86. * It seems further speedup can be achieved by handling more than diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S new file mode 100644 index 000000000..466cd9ae9 --- /dev/null +++ b/libbb/hash_md5_sha_x86-64.S @@ -0,0 +1,1349 @@ +### Generated by hash_md5_sha_x86-64.S.sh ### +#if defined(__GNUC__) && defined(__x86_64__) + .section .text.sha1_process_block64,"ax",@progbits + .globl sha1_process_block64 + .hidden sha1_process_block64 + .type sha1_process_block64, @function +sha1_process_block64: + pushq %r15 # + pushq %r14 # + pushq %r13 # + pushq %r12 # + pushq %rbp # + pushq %rbx # + pushq %rdi # we need ctx at the end + +#Register and stack use: +# eax..edx: a..d +# ebp: e +# esi,edi: temps +# -32+4*n(%rsp),r8...r15: W[0..7,8..15] + + movq 4*8(%rdi), %r8 + bswapq %r8 + movl %r8d, %r9d + shrq $32, %r8 + movq 4*10(%rdi), %r10 + bswapq %r10 + movl %r10d, %r11d + shrq $32, %r10 + movq 4*12(%rdi), %r12 + bswapq %r12 + movl %r12d, %r13d + shrq $32, %r12 + movq 4*14(%rdi), %r14 + bswapq %r14 + movl %r14d, %r15d + shrq $32, %r14 + + movl $3, %eax +1: + movq (%rdi,%rax,8), %rsi + bswapq %rsi + rolq $32, %rsi + movq %rsi, -32(%rsp,%rax,8) + decl %eax + jns 1b + movl 80(%rdi), %eax # a = ctx->hash[0] + movl 84(%rdi), %ebx # b = ctx->hash[1] + movl 88(%rdi), %ecx # c = ctx->hash[2] + movl 92(%rdi), %edx # d = ctx->hash[3] + movl 
96(%rdi), %ebp # e = ctx->hash[4] + +# 0 + # W[0], already in %esi + movl %ecx, %edi # c + xorl %edx, %edi # ^d + andl %ebx, %edi # &b + xorl %edx, %edi # (((c ^ d) & b) ^ d) + leal 0x5A827999(%rbp,%rsi),%ebp # e += RCONST + W[n] + addl %edi, %ebp # e += (((c ^ d) & b) ^ d) + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 1 + movl -32+4*1(%rsp), %esi # W[n] + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + andl %eax, %edi # &b + xorl %ecx, %edi # (((c ^ d) & b) ^ d) + leal 0x5A827999(%rdx,%rsi),%edx # e += RCONST + W[n] + addl %edi, %edx # e += (((c ^ d) & b) ^ d) + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 2 + movl -32+4*2(%rsp), %esi # W[n] + movl %eax, %edi # c + xorl %ebx, %edi # ^d + andl %ebp, %edi # &b + xorl %ebx, %edi # (((c ^ d) & b) ^ d) + leal 0x5A827999(%rcx,%rsi),%ecx # e += RCONST + W[n] + addl %edi, %ecx # e += (((c ^ d) & b) ^ d) + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 3 + movl -32+4*3(%rsp), %esi # W[n] + movl %ebp, %edi # c + xorl %eax, %edi # ^d + andl %edx, %edi # &b + xorl %eax, %edi # (((c ^ d) & b) ^ d) + leal 0x5A827999(%rbx,%rsi),%ebx # e += RCONST + W[n] + addl %edi, %ebx # e += (((c ^ d) & b) ^ d) + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 4 + movl -32+4*4(%rsp), %esi # W[n] + movl %edx, %edi # c + xorl %ebp, %edi # ^d + andl %ecx, %edi # &b + xorl %ebp, %edi # (((c ^ d) & b) ^ d) + leal 0x5A827999(%rax,%rsi),%eax # e += RCONST + W[n] + addl %edi, %eax # e += (((c ^ d) & b) ^ d) + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 5 + movl -32+4*5(%rsp), %esi # W[n] + movl %ecx, %edi # c + xorl %edx, %edi # ^d + andl %ebx, %edi # &b + xorl %edx, %edi # 
(((c ^ d) & b) ^ d) + leal 0x5A827999(%rbp,%rsi),%ebp # e += RCONST + W[n] + addl %edi, %ebp # e += (((c ^ d) & b) ^ d) + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 6 + movl -32+4*6(%rsp), %esi # W[n] + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + andl %eax, %edi # &b + xorl %ecx, %edi # (((c ^ d) & b) ^ d) + leal 0x5A827999(%rdx,%rsi),%edx # e += RCONST + W[n] + addl %edi, %edx # e += (((c ^ d) & b) ^ d) + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 7 + movl -32+4*7(%rsp), %esi # W[n] + movl %eax, %edi # c + xorl %ebx, %edi # ^d + andl %ebp, %edi # &b + xorl %ebx, %edi # (((c ^ d) & b) ^ d) + leal 0x5A827999(%rcx,%rsi),%ecx # e += RCONST + W[n] + addl %edi, %ecx # e += (((c ^ d) & b) ^ d) + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 8 + # W[n], in %r8 + movl %ebp, %edi # c + xorl %eax, %edi # ^d + andl %edx, %edi # &b + xorl %eax, %edi # (((c ^ d) & b) ^ d) + leal 0x5A827999(%rbx,%r8),%ebx # e += RCONST + W[n] + addl %edi, %ebx # e += (((c ^ d) & b) ^ d) + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 9 + # W[n], in %r9 + movl %edx, %edi # c + xorl %ebp, %edi # ^d + andl %ecx, %edi # &b + xorl %ebp, %edi # (((c ^ d) & b) ^ d) + leal 0x5A827999(%rax,%r9),%eax # e += RCONST + W[n] + addl %edi, %eax # e += (((c ^ d) & b) ^ d) + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 10 + # W[n], in %r10 + movl %ecx, %edi # c + xorl %edx, %edi # ^d + andl %ebx, %edi # &b + xorl %edx, %edi # (((c ^ d) & b) ^ d) + leal 0x5A827999(%rbp,%r10),%ebp # e += RCONST + W[n] + addl %edi, %ebp # e += (((c ^ d) & b) ^ d) + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += 
rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 11 + # W[n], in %r11 + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + andl %eax, %edi # &b + xorl %ecx, %edi # (((c ^ d) & b) ^ d) + leal 0x5A827999(%rdx,%r11),%edx # e += RCONST + W[n] + addl %edi, %edx # e += (((c ^ d) & b) ^ d) + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 12 + # W[n], in %r12 + movl %eax, %edi # c + xorl %ebx, %edi # ^d + andl %ebp, %edi # &b + xorl %ebx, %edi # (((c ^ d) & b) ^ d) + leal 0x5A827999(%rcx,%r12),%ecx # e += RCONST + W[n] + addl %edi, %ecx # e += (((c ^ d) & b) ^ d) + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 13 + # W[n], in %r13 + movl %ebp, %edi # c + xorl %eax, %edi # ^d + andl %edx, %edi # &b + xorl %eax, %edi # (((c ^ d) & b) ^ d) + leal 0x5A827999(%rbx,%r13),%ebx # e += RCONST + W[n] + addl %edi, %ebx # e += (((c ^ d) & b) ^ d) + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 14 + # W[n], in %r14 + movl %edx, %edi # c + xorl %ebp, %edi # ^d + andl %ecx, %edi # &b + xorl %ebp, %edi # (((c ^ d) & b) ^ d) + leal 0x5A827999(%rax,%r14),%eax # e += RCONST + W[n] + addl %edi, %eax # e += (((c ^ d) & b) ^ d) + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 15 + # W[n], in %r15 + movl %ecx, %edi # c + xorl %edx, %edi # ^d + andl %ebx, %edi # &b + xorl %edx, %edi # (((c ^ d) & b) ^ d) + leal 0x5A827999(%rbp,%r15),%ebp # e += RCONST + W[n] + addl %edi, %ebp # e += (((c ^ d) & b) ^ d) + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 16 + movl %r13d, %esi # W[(n+13) & 15] + xorl %r8d, %esi # ^W[(n+8) & 15] + xorl -32+4*2(%rsp), %esi # ^W[(n+2) & 15] + xorl -32+4*0(%rsp), %esi # ^W[n & 15] + roll %esi # + movl 
%esi, -32+4*0(%rsp) # store to W[n & 15] + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + andl %eax, %edi # &b + xorl %ecx, %edi # (((c ^ d) & b) ^ d) + leal 0x5A827999(%rdx,%rsi),%edx # e += RCONST + W[n] + addl %edi, %edx # e += (((c ^ d) & b) ^ d) + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 17 + movl %r14d, %esi # W[(n+13) & 15] + xorl %r9d, %esi # ^W[(n+8) & 15] + xorl -32+4*3(%rsp), %esi # ^W[(n+2) & 15] + xorl -32+4*1(%rsp), %esi # ^W[n & 15] + roll %esi # + movl %esi, -32+4*1(%rsp) # store to W[n & 15] + movl %eax, %edi # c + xorl %ebx, %edi # ^d + andl %ebp, %edi # &b + xorl %ebx, %edi # (((c ^ d) & b) ^ d) + leal 0x5A827999(%rcx,%rsi),%ecx # e += RCONST + W[n] + addl %edi, %ecx # e += (((c ^ d) & b) ^ d) + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 18 + movl %r15d, %esi # W[(n+13) & 15] + xorl %r10d, %esi # ^W[(n+8) & 15] + xorl -32+4*4(%rsp), %esi # ^W[(n+2) & 15] + xorl -32+4*2(%rsp), %esi # ^W[n & 15] + roll %esi # + movl %esi, -32+4*2(%rsp) # store to W[n & 15] + movl %ebp, %edi # c + xorl %eax, %edi # ^d + andl %edx, %edi # &b + xorl %eax, %edi # (((c ^ d) & b) ^ d) + leal 0x5A827999(%rbx,%rsi),%ebx # e += RCONST + W[n] + addl %edi, %ebx # e += (((c ^ d) & b) ^ d) + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 19 + movl -32+4*0(%rsp), %esi # W[(n+13) & 15] + xorl %r11d, %esi # ^W[(n+8) & 15] + xorl -32+4*5(%rsp), %esi # ^W[(n+2) & 15] + xorl -32+4*3(%rsp), %esi # ^W[n & 15] + roll %esi # + movl %esi, -32+4*3(%rsp) # store to W[n & 15] + movl %edx, %edi # c + xorl %ebp, %edi # ^d + andl %ecx, %edi # &b + xorl %ebp, %edi # (((c ^ d) & b) ^ d) + leal 0x5A827999(%rax,%rsi),%eax # e += RCONST + W[n] + addl %edi, %eax # e += (((c ^ d) & b) ^ d) + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += 
rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 20 + movl -32+4*1(%rsp), %esi # W[(n+13) & 15] + xorl %r12d, %esi # ^W[(n+8) & 15] + xorl -32+4*6(%rsp), %esi # ^W[(n+2) & 15] + xorl -32+4*4(%rsp), %esi # ^W[n & 15] + roll %esi # + movl %esi, -32+4*4(%rsp) # store to W[n & 15] + movl %ecx, %edi # c + xorl %edx, %edi # ^d + xorl %ebx, %edi # ^b + leal 0x6ED9EBA1(%rbp,%rsi), %ebp # e += RCONST + mixed_W + addl %edi, %ebp # e += (c ^ d ^ b) + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 21 + movl -32+4*2(%rsp), %esi # W[(n+13) & 15] + xorl %r13d, %esi # ^W[(n+8) & 15] + xorl -32+4*7(%rsp), %esi # ^W[(n+2) & 15] + xorl -32+4*5(%rsp), %esi # ^W[n & 15] + roll %esi # + movl %esi, -32+4*5(%rsp) # store to W[n & 15] + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + xorl %eax, %edi # ^b + leal 0x6ED9EBA1(%rdx,%rsi), %edx # e += RCONST + mixed_W + addl %edi, %edx # e += (c ^ d ^ b) + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 22 + movl -32+4*3(%rsp), %esi # W[(n+13) & 15] + xorl %r14d, %esi # ^W[(n+8) & 15] + xorl %r8d, %esi # ^W[(n+2) & 15] + xorl -32+4*6(%rsp), %esi # ^W[n & 15] + roll %esi # + movl %esi, -32+4*6(%rsp) # store to W[n & 15] + movl %eax, %edi # c + xorl %ebx, %edi # ^d + xorl %ebp, %edi # ^b + leal 0x6ED9EBA1(%rcx,%rsi), %ecx # e += RCONST + mixed_W + addl %edi, %ecx # e += (c ^ d ^ b) + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 23 + movl -32+4*4(%rsp), %esi # W[(n+13) & 15] + xorl %r15d, %esi # ^W[(n+8) & 15] + xorl %r9d, %esi # ^W[(n+2) & 15] + xorl -32+4*7(%rsp), %esi # ^W[n & 15] + roll %esi # + movl %esi, -32+4*7(%rsp) # store to W[n & 15] + movl %ebp, %edi # c + xorl %eax, %edi # ^d + xorl %edx, %edi # ^b + leal 0x6ED9EBA1(%rbx,%rsi), %ebx # e += RCONST + mixed_W + addl %edi, %ebx # e += (c ^ d ^ b) + movl %ecx, %esi # 
+ roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 24 + movl -32+4*5(%rsp), %esi # W[(n+13) & 15] + xorl -32+4*0(%rsp), %esi # ^W[(n+8) & 15] + xorl %r10d, %esi # ^W[(n+2) & 15] + xorl %r8d, %esi # ^W[n & 15] + roll %esi # + movl %esi, %r8d # store to W[n & 15] + movl %edx, %edi # c + xorl %ebp, %edi # ^d + xorl %ecx, %edi # ^b + leal 0x6ED9EBA1(%rax,%rsi), %eax # e += RCONST + mixed_W + addl %edi, %eax # e += (c ^ d ^ b) + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 25 + movl -32+4*6(%rsp), %esi # W[(n+13) & 15] + xorl -32+4*1(%rsp), %esi # ^W[(n+8) & 15] + xorl %r11d, %esi # ^W[(n+2) & 15] + xorl %r9d, %esi # ^W[n & 15] + roll %esi # + movl %esi, %r9d # store to W[n & 15] + movl %ecx, %edi # c + xorl %edx, %edi # ^d + xorl %ebx, %edi # ^b + leal 0x6ED9EBA1(%rbp,%rsi), %ebp # e += RCONST + mixed_W + addl %edi, %ebp # e += (c ^ d ^ b) + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 26 + movl -32+4*7(%rsp), %esi # W[(n+13) & 15] + xorl -32+4*2(%rsp), %esi # ^W[(n+8) & 15] + xorl %r12d, %esi # ^W[(n+2) & 15] + xorl %r10d, %esi # ^W[n & 15] + roll %esi # + movl %esi, %r10d # store to W[n & 15] + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + xorl %eax, %edi # ^b + leal 0x6ED9EBA1(%rdx,%rsi), %edx # e += RCONST + mixed_W + addl %edi, %edx # e += (c ^ d ^ b) + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 27 + movl %r8d, %esi # W[(n+13) & 15] + xorl -32+4*3(%rsp), %esi # ^W[(n+8) & 15] + xorl %r13d, %esi # ^W[(n+2) & 15] + xorl %r11d, %esi # ^W[n & 15] + roll %esi # + movl %esi, %r11d # store to W[n & 15] + movl %eax, %edi # c + xorl %ebx, %edi # ^d + xorl %ebp, %edi # ^b + leal 0x6ED9EBA1(%rcx,%rsi), %ecx # e += RCONST + mixed_W + addl %edi, %ecx # e += (c ^ d ^ b) + movl %edx, %esi # + 
roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 28 + movl %r9d, %esi # W[(n+13) & 15] + xorl -32+4*4(%rsp), %esi # ^W[(n+8) & 15] + xorl %r14d, %esi # ^W[(n+2) & 15] + xorl %r12d, %esi # ^W[n & 15] + roll %esi # + movl %esi, %r12d # store to W[n & 15] + movl %ebp, %edi # c + xorl %eax, %edi # ^d + xorl %edx, %edi # ^b + leal 0x6ED9EBA1(%rbx,%rsi), %ebx # e += RCONST + mixed_W + addl %edi, %ebx # e += (c ^ d ^ b) + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 29 + movl %r10d, %esi # W[(n+13) & 15] + xorl -32+4*5(%rsp), %esi # ^W[(n+8) & 15] + xorl %r15d, %esi # ^W[(n+2) & 15] + xorl %r13d, %esi # ^W[n & 15] + roll %esi # + movl %esi, %r13d # store to W[n & 15] + movl %edx, %edi # c + xorl %ebp, %edi # ^d + xorl %ecx, %edi # ^b + leal 0x6ED9EBA1(%rax,%rsi), %eax # e += RCONST + mixed_W + addl %edi, %eax # e += (c ^ d ^ b) + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 30 + movl %r11d, %esi # W[(n+13) & 15] + xorl -32+4*6(%rsp), %esi # ^W[(n+8) & 15] + xorl -32+4*0(%rsp), %esi # ^W[(n+2) & 15] + xorl %r14d, %esi # ^W[n & 15] + roll %esi # + movl %esi, %r14d # store to W[n & 15] + movl %ecx, %edi # c + xorl %edx, %edi # ^d + xorl %ebx, %edi # ^b + leal 0x6ED9EBA1(%rbp,%rsi), %ebp # e += RCONST + mixed_W + addl %edi, %ebp # e += (c ^ d ^ b) + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 31 + movl %r12d, %esi # W[(n+13) & 15] + xorl -32+4*7(%rsp), %esi # ^W[(n+8) & 15] + xorl -32+4*1(%rsp), %esi # ^W[(n+2) & 15] + xorl %r15d, %esi # ^W[n & 15] + roll %esi # + movl %esi, %r15d # store to W[n & 15] + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + xorl %eax, %edi # ^b + leal 0x6ED9EBA1(%rdx,%rsi), %edx # e += RCONST + mixed_W + addl %edi, %edx # e += (c ^ d ^ b) + movl %ebp, %esi # + roll $5, 
%esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 32 + movl %r13d, %esi # W[(n+13) & 15] + xorl %r8d, %esi # ^W[(n+8) & 15] + xorl -32+4*2(%rsp), %esi # ^W[(n+2) & 15] + xorl -32+4*0(%rsp), %esi # ^W[n & 15] + roll %esi # + movl %esi, -32+4*0(%rsp) # store to W[n & 15] + movl %eax, %edi # c + xorl %ebx, %edi # ^d + xorl %ebp, %edi # ^b + leal 0x6ED9EBA1(%rcx,%rsi), %ecx # e += RCONST + mixed_W + addl %edi, %ecx # e += (c ^ d ^ b) + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 33 + movl %r14d, %esi # W[(n+13) & 15] + xorl %r9d, %esi # ^W[(n+8) & 15] + xorl -32+4*3(%rsp), %esi # ^W[(n+2) & 15] + xorl -32+4*1(%rsp), %esi # ^W[n & 15] + roll %esi # + movl %esi, -32+4*1(%rsp) # store to W[n & 15] + movl %ebp, %edi # c + xorl %eax, %edi # ^d + xorl %edx, %edi # ^b + leal 0x6ED9EBA1(%rbx,%rsi), %ebx # e += RCONST + mixed_W + addl %edi, %ebx # e += (c ^ d ^ b) + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 34 + movl %r15d, %esi # W[(n+13) & 15] + xorl %r10d, %esi # ^W[(n+8) & 15] + xorl -32+4*4(%rsp), %esi # ^W[(n+2) & 15] + xorl -32+4*2(%rsp), %esi # ^W[n & 15] + roll %esi # + movl %esi, -32+4*2(%rsp) # store to W[n & 15] + movl %edx, %edi # c + xorl %ebp, %edi # ^d + xorl %ecx, %edi # ^b + leal 0x6ED9EBA1(%rax,%rsi), %eax # e += RCONST + mixed_W + addl %edi, %eax # e += (c ^ d ^ b) + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 35 + movl -32+4*0(%rsp), %esi # W[(n+13) & 15] + xorl %r11d, %esi # ^W[(n+8) & 15] + xorl -32+4*5(%rsp), %esi # ^W[(n+2) & 15] + xorl -32+4*3(%rsp), %esi # ^W[n & 15] + roll %esi # + movl %esi, -32+4*3(%rsp) # store to W[n & 15] + movl %ecx, %edi # c + xorl %edx, %edi # ^d + xorl %ebx, %edi # ^b + leal 0x6ED9EBA1(%rbp,%rsi), %ebp # e += RCONST + mixed_W + addl %edi, %ebp # e 
+= (c ^ d ^ b) + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 36 + movl -32+4*1(%rsp), %esi # W[(n+13) & 15] + xorl %r12d, %esi # ^W[(n+8) & 15] + xorl -32+4*6(%rsp), %esi # ^W[(n+2) & 15] + xorl -32+4*4(%rsp), %esi # ^W[n & 15] + roll %esi # + movl %esi, -32+4*4(%rsp) # store to W[n & 15] + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + xorl %eax, %edi # ^b + leal 0x6ED9EBA1(%rdx,%rsi), %edx # e += RCONST + mixed_W + addl %edi, %edx # e += (c ^ d ^ b) + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 37 + movl -32+4*2(%rsp), %esi # W[(n+13) & 15] + xorl %r13d, %esi # ^W[(n+8) & 15] + xorl -32+4*7(%rsp), %esi # ^W[(n+2) & 15] + xorl -32+4*5(%rsp), %esi # ^W[n & 15] + roll %esi # + movl %esi, -32+4*5(%rsp) # store to W[n & 15] + movl %eax, %edi # c + xorl %ebx, %edi # ^d + xorl %ebp, %edi # ^b + leal 0x6ED9EBA1(%rcx,%rsi), %ecx # e += RCONST + mixed_W + addl %edi, %ecx # e += (c ^ d ^ b) + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 38 + movl -32+4*3(%rsp), %esi # W[(n+13) & 15] + xorl %r14d, %esi # ^W[(n+8) & 15] + xorl %r8d, %esi # ^W[(n+2) & 15] + xorl -32+4*6(%rsp), %esi # ^W[n & 15] + roll %esi # + movl %esi, -32+4*6(%rsp) # store to W[n & 15] + movl %ebp, %edi # c + xorl %eax, %edi # ^d + xorl %edx, %edi # ^b + leal 0x6ED9EBA1(%rbx,%rsi), %ebx # e += RCONST + mixed_W + addl %edi, %ebx # e += (c ^ d ^ b) + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 39 + movl -32+4*4(%rsp), %esi # W[(n+13) & 15] + xorl %r15d, %esi # ^W[(n+8) & 15] + xorl %r9d, %esi # ^W[(n+2) & 15] + xorl -32+4*7(%rsp), %esi # ^W[n & 15] + roll %esi # + movl %esi, -32+4*7(%rsp) # store to W[n & 15] + movl %edx, %edi # c + xorl %ebp, %edi # ^d + xorl %ecx, %edi # ^b + leal 
0x6ED9EBA1(%rax,%rsi), %eax # e += RCONST + mixed_W + addl %edi, %eax # e += (c ^ d ^ b) + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 40 + movl %ebx, %edi # di: b + movl %ebx, %esi # si: b + orl %ecx, %edi # di: b | c + andl %ecx, %esi # si: b & c + andl %edx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + movl -32+4*5(%rsp), %esi # W[(n+13) & 15] + xorl -32+4*0(%rsp), %esi # ^W[(n+8) & 15] + xorl %r10d, %esi # ^W[(n+2) & 15] + xorl %r8d, %esi # ^W[n & 15] + roll %esi # + movl %esi, %r8d # store to W[n & 15] + addl %edi, %ebp # += ((b | c) & d) | (b & c) + leal -0x70e44324(%rbp,%rsi), %ebp # e += RCONST + mixed_W + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 41 + movl %eax, %edi # di: b + movl %eax, %esi # si: b + orl %ebx, %edi # di: b | c + andl %ebx, %esi # si: b & c + andl %ecx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + movl -32+4*6(%rsp), %esi # W[(n+13) & 15] + xorl -32+4*1(%rsp), %esi # ^W[(n+8) & 15] + xorl %r11d, %esi # ^W[(n+2) & 15] + xorl %r9d, %esi # ^W[n & 15] + roll %esi # + movl %esi, %r9d # store to W[n & 15] + addl %edi, %edx # += ((b | c) & d) | (b & c) + leal -0x70e44324(%rdx,%rsi), %edx # e += RCONST + mixed_W + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 42 + movl %ebp, %edi # di: b + movl %ebp, %esi # si: b + orl %eax, %edi # di: b | c + andl %eax, %esi # si: b & c + andl %ebx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + movl -32+4*7(%rsp), %esi # W[(n+13) & 15] + xorl -32+4*2(%rsp), %esi # ^W[(n+8) & 15] + xorl %r12d, %esi # ^W[(n+2) & 15] + xorl %r10d, %esi # ^W[n & 15] + roll %esi # + movl %esi, %r10d # store to W[n & 15] + addl %edi, %ecx # += ((b | c) & d) | (b & c) + leal -0x70e44324(%rcx,%rsi), %ecx # e += RCONST + mixed_W + movl 
%edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 43 + movl %edx, %edi # di: b + movl %edx, %esi # si: b + orl %ebp, %edi # di: b | c + andl %ebp, %esi # si: b & c + andl %eax, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + movl %r8d, %esi # W[(n+13) & 15] + xorl -32+4*3(%rsp), %esi # ^W[(n+8) & 15] + xorl %r13d, %esi # ^W[(n+2) & 15] + xorl %r11d, %esi # ^W[n & 15] + roll %esi # + movl %esi, %r11d # store to W[n & 15] + addl %edi, %ebx # += ((b | c) & d) | (b & c) + leal -0x70e44324(%rbx,%rsi), %ebx # e += RCONST + mixed_W + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 44 + movl %ecx, %edi # di: b + movl %ecx, %esi # si: b + orl %edx, %edi # di: b | c + andl %edx, %esi # si: b & c + andl %ebp, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + movl %r9d, %esi # W[(n+13) & 15] + xorl -32+4*4(%rsp), %esi # ^W[(n+8) & 15] + xorl %r14d, %esi # ^W[(n+2) & 15] + xorl %r12d, %esi # ^W[n & 15] + roll %esi # + movl %esi, %r12d # store to W[n & 15] + addl %edi, %eax # += ((b | c) & d) | (b & c) + leal -0x70e44324(%rax,%rsi), %eax # e += RCONST + mixed_W + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 45 + movl %ebx, %edi # di: b + movl %ebx, %esi # si: b + orl %ecx, %edi # di: b | c + andl %ecx, %esi # si: b & c + andl %edx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + movl %r10d, %esi # W[(n+13) & 15] + xorl -32+4*5(%rsp), %esi # ^W[(n+8) & 15] + xorl %r15d, %esi # ^W[(n+2) & 15] + xorl %r13d, %esi # ^W[n & 15] + roll %esi # + movl %esi, %r13d # store to W[n & 15] + addl %edi, %ebp # += ((b | c) & d) | (b & c) + leal -0x70e44324(%rbp,%rsi), %ebp # e += RCONST + mixed_W + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 46 
+ movl %eax, %edi # di: b + movl %eax, %esi # si: b + orl %ebx, %edi # di: b | c + andl %ebx, %esi # si: b & c + andl %ecx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + movl %r11d, %esi # W[(n+13) & 15] + xorl -32+4*6(%rsp), %esi # ^W[(n+8) & 15] + xorl -32+4*0(%rsp), %esi # ^W[(n+2) & 15] + xorl %r14d, %esi # ^W[n & 15] + roll %esi # + movl %esi, %r14d # store to W[n & 15] + addl %edi, %edx # += ((b | c) & d) | (b & c) + leal -0x70e44324(%rdx,%rsi), %edx # e += RCONST + mixed_W + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 47 + movl %ebp, %edi # di: b + movl %ebp, %esi # si: b + orl %eax, %edi # di: b | c + andl %eax, %esi # si: b & c + andl %ebx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + movl %r12d, %esi # W[(n+13) & 15] + xorl -32+4*7(%rsp), %esi # ^W[(n+8) & 15] + xorl -32+4*1(%rsp), %esi # ^W[(n+2) & 15] + xorl %r15d, %esi # ^W[n & 15] + roll %esi # + movl %esi, %r15d # store to W[n & 15] + addl %edi, %ecx # += ((b | c) & d) | (b & c) + leal -0x70e44324(%rcx,%rsi), %ecx # e += RCONST + mixed_W + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 48 + movl %edx, %edi # di: b + movl %edx, %esi # si: b + orl %ebp, %edi # di: b | c + andl %ebp, %esi # si: b & c + andl %eax, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + movl %r13d, %esi # W[(n+13) & 15] + xorl %r8d, %esi # ^W[(n+8) & 15] + xorl -32+4*2(%rsp), %esi # ^W[(n+2) & 15] + xorl -32+4*0(%rsp), %esi # ^W[n & 15] + roll %esi # + movl %esi, -32+4*0(%rsp) # store to W[n & 15] + addl %edi, %ebx # += ((b | c) & d) | (b & c) + leal -0x70e44324(%rbx,%rsi), %ebx # e += RCONST + mixed_W + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 49 + movl %ecx, %edi # di: b + movl %ecx, %esi # si: b + orl %edx, %edi # di: b | c + andl 
%edx, %esi # si: b & c + andl %ebp, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + movl %r14d, %esi # W[(n+13) & 15] + xorl %r9d, %esi # ^W[(n+8) & 15] + xorl -32+4*3(%rsp), %esi # ^W[(n+2) & 15] + xorl -32+4*1(%rsp), %esi # ^W[n & 15] + roll %esi # + movl %esi, -32+4*1(%rsp) # store to W[n & 15] + addl %edi, %eax # += ((b | c) & d) | (b & c) + leal -0x70e44324(%rax,%rsi), %eax # e += RCONST + mixed_W + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 50 + movl %ebx, %edi # di: b + movl %ebx, %esi # si: b + orl %ecx, %edi # di: b | c + andl %ecx, %esi # si: b & c + andl %edx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + movl %r15d, %esi # W[(n+13) & 15] + xorl %r10d, %esi # ^W[(n+8) & 15] + xorl -32+4*4(%rsp), %esi # ^W[(n+2) & 15] + xorl -32+4*2(%rsp), %esi # ^W[n & 15] + roll %esi # + movl %esi, -32+4*2(%rsp) # store to W[n & 15] + addl %edi, %ebp # += ((b | c) & d) | (b & c) + leal -0x70e44324(%rbp,%rsi), %ebp # e += RCONST + mixed_W + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 51 + movl %eax, %edi # di: b + movl %eax, %esi # si: b + orl %ebx, %edi # di: b | c + andl %ebx, %esi # si: b & c + andl %ecx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + movl -32+4*0(%rsp), %esi # W[(n+13) & 15] + xorl %r11d, %esi # ^W[(n+8) & 15] + xorl -32+4*5(%rsp), %esi # ^W[(n+2) & 15] + xorl -32+4*3(%rsp), %esi # ^W[n & 15] + roll %esi # + movl %esi, -32+4*3(%rsp) # store to W[n & 15] + addl %edi, %edx # += ((b | c) & d) | (b & c) + leal -0x70e44324(%rdx,%rsi), %edx # e += RCONST + mixed_W + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 52 + movl %ebp, %edi # di: b + movl %ebp, %esi # si: b + orl %eax, %edi # di: b | c + andl %eax, %esi # si: b & c + andl %ebx, %edi # di: (b | c) & d + 
orl %esi, %edi # ((b | c) & d) | (b & c) + movl -32+4*1(%rsp), %esi # W[(n+13) & 15] + xorl %r12d, %esi # ^W[(n+8) & 15] + xorl -32+4*6(%rsp), %esi # ^W[(n+2) & 15] + xorl -32+4*4(%rsp), %esi # ^W[n & 15] + roll %esi # + movl %esi, -32+4*4(%rsp) # store to W[n & 15] + addl %edi, %ecx # += ((b | c) & d) | (b & c) + leal -0x70e44324(%rcx,%rsi), %ecx # e += RCONST + mixed_W + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 53 + movl %edx, %edi # di: b + movl %edx, %esi # si: b + orl %ebp, %edi # di: b | c + andl %ebp, %esi # si: b & c + andl %eax, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + movl -32+4*2(%rsp), %esi # W[(n+13) & 15] + xorl %r13d, %esi # ^W[(n+8) & 15] + xorl -32+4*7(%rsp), %esi # ^W[(n+2) & 15] + xorl -32+4*5(%rsp), %esi # ^W[n & 15] + roll %esi # + movl %esi, -32+4*5(%rsp) # store to W[n & 15] + addl %edi, %ebx # += ((b | c) & d) | (b & c) + leal -0x70e44324(%rbx,%rsi), %ebx # e += RCONST + mixed_W + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 54 + movl %ecx, %edi # di: b + movl %ecx, %esi # si: b + orl %edx, %edi # di: b | c + andl %edx, %esi # si: b & c + andl %ebp, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + movl -32+4*3(%rsp), %esi # W[(n+13) & 15] + xorl %r14d, %esi # ^W[(n+8) & 15] + xorl %r8d, %esi # ^W[(n+2) & 15] + xorl -32+4*6(%rsp), %esi # ^W[n & 15] + roll %esi # + movl %esi, -32+4*6(%rsp) # store to W[n & 15] + addl %edi, %eax # += ((b | c) & d) | (b & c) + leal -0x70e44324(%rax,%rsi), %eax # e += RCONST + mixed_W + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 55 + movl %ebx, %edi # di: b + movl %ebx, %esi # si: b + orl %ecx, %edi # di: b | c + andl %ecx, %esi # si: b & c + andl %edx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + movl 
-32+4*4(%rsp), %esi # W[(n+13) & 15] + xorl %r15d, %esi # ^W[(n+8) & 15] + xorl %r9d, %esi # ^W[(n+2) & 15] + xorl -32+4*7(%rsp), %esi # ^W[n & 15] + roll %esi # + movl %esi, -32+4*7(%rsp) # store to W[n & 15] + addl %edi, %ebp # += ((b | c) & d) | (b & c) + leal -0x70e44324(%rbp,%rsi), %ebp # e += RCONST + mixed_W + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 56 + movl %eax, %edi # di: b + movl %eax, %esi # si: b + orl %ebx, %edi # di: b | c + andl %ebx, %esi # si: b & c + andl %ecx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + movl -32+4*5(%rsp), %esi # W[(n+13) & 15] + xorl -32+4*0(%rsp), %esi # ^W[(n+8) & 15] + xorl %r10d, %esi # ^W[(n+2) & 15] + xorl %r8d, %esi # ^W[n & 15] + roll %esi # + movl %esi, %r8d # store to W[n & 15] + addl %edi, %edx # += ((b | c) & d) | (b & c) + leal -0x70e44324(%rdx,%rsi), %edx # e += RCONST + mixed_W + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 57 + movl %ebp, %edi # di: b + movl %ebp, %esi # si: b + orl %eax, %edi # di: b | c + andl %eax, %esi # si: b & c + andl %ebx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + movl -32+4*6(%rsp), %esi # W[(n+13) & 15] + xorl -32+4*1(%rsp), %esi # ^W[(n+8) & 15] + xorl %r11d, %esi # ^W[(n+2) & 15] + xorl %r9d, %esi # ^W[n & 15] + roll %esi # + movl %esi, %r9d # store to W[n & 15] + addl %edi, %ecx # += ((b | c) & d) | (b & c) + leal -0x70e44324(%rcx,%rsi), %ecx # e += RCONST + mixed_W + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 58 + movl %edx, %edi # di: b + movl %edx, %esi # si: b + orl %ebp, %edi # di: b | c + andl %ebp, %esi # si: b & c + andl %eax, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + movl -32+4*7(%rsp), %esi # W[(n+13) & 15] + xorl -32+4*2(%rsp), %esi # ^W[(n+8) & 15] + xorl 
%r12d, %esi # ^W[(n+2) & 15] + xorl %r10d, %esi # ^W[n & 15] + roll %esi # + movl %esi, %r10d # store to W[n & 15] + addl %edi, %ebx # += ((b | c) & d) | (b & c) + leal -0x70e44324(%rbx,%rsi), %ebx # e += RCONST + mixed_W + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 59 + movl %ecx, %edi # di: b + movl %ecx, %esi # si: b + orl %edx, %edi # di: b | c + andl %edx, %esi # si: b & c + andl %ebp, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + movl %r8d, %esi # W[(n+13) & 15] + xorl -32+4*3(%rsp), %esi # ^W[(n+8) & 15] + xorl %r13d, %esi # ^W[(n+2) & 15] + xorl %r11d, %esi # ^W[n & 15] + roll %esi # + movl %esi, %r11d # store to W[n & 15] + addl %edi, %eax # += ((b | c) & d) | (b & c) + leal -0x70e44324(%rax,%rsi), %eax # e += RCONST + mixed_W + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 60 + movl %r9d, %esi # W[(n+13) & 15] + xorl -32+4*4(%rsp), %esi # ^W[(n+8) & 15] + xorl %r14d, %esi # ^W[(n+2) & 15] + xorl %r12d, %esi # ^W[n & 15] + roll %esi # + movl %esi, %r12d # store to W[n & 15] + movl %ecx, %edi # c + xorl %edx, %edi # ^d + xorl %ebx, %edi # ^b + leal -0x359d3e2a(%rbp,%rsi), %ebp # e += RCONST + mixed_W + addl %edi, %ebp # e += (c ^ d ^ b) + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 61 + movl %r10d, %esi # W[(n+13) & 15] + xorl -32+4*5(%rsp), %esi # ^W[(n+8) & 15] + xorl %r15d, %esi # ^W[(n+2) & 15] + xorl %r13d, %esi # ^W[n & 15] + roll %esi # + movl %esi, %r13d # store to W[n & 15] + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + xorl %eax, %edi # ^b + leal -0x359d3e2a(%rdx,%rsi), %edx # e += RCONST + mixed_W + addl %edi, %edx # e += (c ^ d ^ b) + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 62 + movl %r11d, %esi # W[(n+13) & 
15] + xorl -32+4*6(%rsp), %esi # ^W[(n+8) & 15] + xorl -32+4*0(%rsp), %esi # ^W[(n+2) & 15] + xorl %r14d, %esi # ^W[n & 15] + roll %esi # + movl %esi, %r14d # store to W[n & 15] + movl %eax, %edi # c + xorl %ebx, %edi # ^d + xorl %ebp, %edi # ^b + leal -0x359d3e2a(%rcx,%rsi), %ecx # e += RCONST + mixed_W + addl %edi, %ecx # e += (c ^ d ^ b) + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 63 + movl %r12d, %esi # W[(n+13) & 15] + xorl -32+4*7(%rsp), %esi # ^W[(n+8) & 15] + xorl -32+4*1(%rsp), %esi # ^W[(n+2) & 15] + xorl %r15d, %esi # ^W[n & 15] + roll %esi # + movl %esi, %r15d # store to W[n & 15] + movl %ebp, %edi # c + xorl %eax, %edi # ^d + xorl %edx, %edi # ^b + leal -0x359d3e2a(%rbx,%rsi), %ebx # e += RCONST + mixed_W + addl %edi, %ebx # e += (c ^ d ^ b) + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 64 + movl %r13d, %esi # W[(n+13) & 15] + xorl %r8d, %esi # ^W[(n+8) & 15] + xorl -32+4*2(%rsp), %esi # ^W[(n+2) & 15] + xorl -32+4*0(%rsp), %esi # ^W[n & 15] + roll %esi # + movl %esi, -32+4*0(%rsp) # store to W[n & 15] + movl %edx, %edi # c + xorl %ebp, %edi # ^d + xorl %ecx, %edi # ^b + leal -0x359d3e2a(%rax,%rsi), %eax # e += RCONST + mixed_W + addl %edi, %eax # e += (c ^ d ^ b) + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 65 + movl %r14d, %esi # W[(n+13) & 15] + xorl %r9d, %esi # ^W[(n+8) & 15] + xorl -32+4*3(%rsp), %esi # ^W[(n+2) & 15] + xorl -32+4*1(%rsp), %esi # ^W[n & 15] + roll %esi # + movl %esi, -32+4*1(%rsp) # store to W[n & 15] + movl %ecx, %edi # c + xorl %edx, %edi # ^d + xorl %ebx, %edi # ^b + leal -0x359d3e2a(%rbp,%rsi), %ebp # e += RCONST + mixed_W + addl %edi, %ebp # e += (c ^ d ^ b) + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 66 + 
movl %r15d, %esi # W[(n+13) & 15] + xorl %r10d, %esi # ^W[(n+8) & 15] + xorl -32+4*4(%rsp), %esi # ^W[(n+2) & 15] + xorl -32+4*2(%rsp), %esi # ^W[n & 15] + roll %esi # + movl %esi, -32+4*2(%rsp) # store to W[n & 15] + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + xorl %eax, %edi # ^b + leal -0x359d3e2a(%rdx,%rsi), %edx # e += RCONST + mixed_W + addl %edi, %edx # e += (c ^ d ^ b) + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 67 + movl -32+4*0(%rsp), %esi # W[(n+13) & 15] + xorl %r11d, %esi # ^W[(n+8) & 15] + xorl -32+4*5(%rsp), %esi # ^W[(n+2) & 15] + xorl -32+4*3(%rsp), %esi # ^W[n & 15] + roll %esi # + movl %esi, -32+4*3(%rsp) # store to W[n & 15] + movl %eax, %edi # c + xorl %ebx, %edi # ^d + xorl %ebp, %edi # ^b + leal -0x359d3e2a(%rcx,%rsi), %ecx # e += RCONST + mixed_W + addl %edi, %ecx # e += (c ^ d ^ b) + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 68 + movl -32+4*1(%rsp), %esi # W[(n+13) & 15] + xorl %r12d, %esi # ^W[(n+8) & 15] + xorl -32+4*6(%rsp), %esi # ^W[(n+2) & 15] + xorl -32+4*4(%rsp), %esi # ^W[n & 15] + roll %esi # + movl %esi, -32+4*4(%rsp) # store to W[n & 15] + movl %ebp, %edi # c + xorl %eax, %edi # ^d + xorl %edx, %edi # ^b + leal -0x359d3e2a(%rbx,%rsi), %ebx # e += RCONST + mixed_W + addl %edi, %ebx # e += (c ^ d ^ b) + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 69 + movl -32+4*2(%rsp), %esi # W[(n+13) & 15] + xorl %r13d, %esi # ^W[(n+8) & 15] + xorl -32+4*7(%rsp), %esi # ^W[(n+2) & 15] + xorl -32+4*5(%rsp), %esi # ^W[n & 15] + roll %esi # + movl %esi, -32+4*5(%rsp) # store to W[n & 15] + movl %edx, %edi # c + xorl %ebp, %edi # ^d + xorl %ecx, %edi # ^b + leal -0x359d3e2a(%rax,%rsi), %eax # e += RCONST + mixed_W + addl %edi, %eax # e += (c ^ d ^ b) + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl 
%esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 70 + movl -32+4*3(%rsp), %esi # W[(n+13) & 15] + xorl %r14d, %esi # ^W[(n+8) & 15] + xorl %r8d, %esi # ^W[(n+2) & 15] + xorl -32+4*6(%rsp), %esi # ^W[n & 15] + roll %esi # + movl %esi, -32+4*6(%rsp) # store to W[n & 15] + movl %ecx, %edi # c + xorl %edx, %edi # ^d + xorl %ebx, %edi # ^b + leal -0x359d3e2a(%rbp,%rsi), %ebp # e += RCONST + mixed_W + addl %edi, %ebp # e += (c ^ d ^ b) + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 71 + movl -32+4*4(%rsp), %esi # W[(n+13) & 15] + xorl %r15d, %esi # ^W[(n+8) & 15] + xorl %r9d, %esi # ^W[(n+2) & 15] + xorl -32+4*7(%rsp), %esi # ^W[n & 15] + roll %esi # + movl %esi, -32+4*7(%rsp) # store to W[n & 15] + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + xorl %eax, %edi # ^b + leal -0x359d3e2a(%rdx,%rsi), %edx # e += RCONST + mixed_W + addl %edi, %edx # e += (c ^ d ^ b) + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 72 + movl -32+4*5(%rsp), %esi # W[(n+13) & 15] + xorl -32+4*0(%rsp), %esi # ^W[(n+8) & 15] + xorl %r10d, %esi # ^W[(n+2) & 15] + xorl %r8d, %esi # ^W[n & 15] + roll %esi # + movl %esi, %r8d # store to W[n & 15] + movl %eax, %edi # c + xorl %ebx, %edi # ^d + xorl %ebp, %edi # ^b + leal -0x359d3e2a(%rcx,%rsi), %ecx # e += RCONST + mixed_W + addl %edi, %ecx # e += (c ^ d ^ b) + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 73 + movl -32+4*6(%rsp), %esi # W[(n+13) & 15] + xorl -32+4*1(%rsp), %esi # ^W[(n+8) & 15] + xorl %r11d, %esi # ^W[(n+2) & 15] + xorl %r9d, %esi # ^W[n & 15] + roll %esi # + movl %esi, %r9d # store to W[n & 15] + movl %ebp, %edi # c + xorl %eax, %edi # ^d + xorl %edx, %edi # ^b + leal -0x359d3e2a(%rbx,%rsi), %ebx # e += RCONST + mixed_W + addl %edi, %ebx # e += (c ^ d ^ b) + movl %ecx, %esi # + roll $5, 
%esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 74 + movl -32+4*7(%rsp), %esi # W[(n+13) & 15] + xorl -32+4*2(%rsp), %esi # ^W[(n+8) & 15] + xorl %r12d, %esi # ^W[(n+2) & 15] + xorl %r10d, %esi # ^W[n & 15] + roll %esi # + movl %esi, %r10d # store to W[n & 15] + movl %edx, %edi # c + xorl %ebp, %edi # ^d + xorl %ecx, %edi # ^b + leal -0x359d3e2a(%rax,%rsi), %eax # e += RCONST + mixed_W + addl %edi, %eax # e += (c ^ d ^ b) + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 75 + movl %r8d, %esi # W[(n+13) & 15] + xorl -32+4*3(%rsp), %esi # ^W[(n+8) & 15] + xorl %r13d, %esi # ^W[(n+2) & 15] + xorl %r11d, %esi # ^W[n & 15] + roll %esi # + movl %esi, %r11d # store to W[n & 15] + movl %ecx, %edi # c + xorl %edx, %edi # ^d + xorl %ebx, %edi # ^b + leal -0x359d3e2a(%rbp,%rsi), %ebp # e += RCONST + mixed_W + addl %edi, %ebp # e += (c ^ d ^ b) + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 76 + movl %r9d, %esi # W[(n+13) & 15] + xorl -32+4*4(%rsp), %esi # ^W[(n+8) & 15] + xorl %r14d, %esi # ^W[(n+2) & 15] + xorl %r12d, %esi # ^W[n & 15] + roll %esi # + movl %esi, %r12d # store to W[n & 15] + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + xorl %eax, %edi # ^b + leal -0x359d3e2a(%rdx,%rsi), %edx # e += RCONST + mixed_W + addl %edi, %edx # e += (c ^ d ^ b) + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 77 + movl %r10d, %esi # W[(n+13) & 15] + xorl -32+4*5(%rsp), %esi # ^W[(n+8) & 15] + xorl %r15d, %esi # ^W[(n+2) & 15] + xorl %r13d, %esi # ^W[n & 15] + roll %esi # + # store to W[n & 15] - unused, not done + movl %eax, %edi # c + xorl %ebx, %edi # ^d + xorl %ebp, %edi # ^b + leal -0x359d3e2a(%rcx,%rsi), %ecx # e += RCONST + mixed_W + addl %edi, %ecx # e += (c ^ d ^ b) + movl %edx, %esi # + roll $5, %esi # 
rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 78 + movl %r11d, %esi # W[(n+13) & 15] + xorl -32+4*6(%rsp), %esi # ^W[(n+8) & 15] + xorl -32+4*0(%rsp), %esi # ^W[(n+2) & 15] + xorl %r14d, %esi # ^W[n & 15] + roll %esi # + # store to W[n & 15] - unused, not done + movl %ebp, %edi # c + xorl %eax, %edi # ^d + xorl %edx, %edi # ^b + leal -0x359d3e2a(%rbx,%rsi), %ebx # e += RCONST + mixed_W + addl %edi, %ebx # e += (c ^ d ^ b) + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 79 + movl %r12d, %esi # W[(n+13) & 15] + xorl -32+4*7(%rsp), %esi # ^W[(n+8) & 15] + xorl -32+4*1(%rsp), %esi # ^W[(n+2) & 15] + xorl %r15d, %esi # ^W[n & 15] + roll %esi # + # store to W[n & 15] - unused, not done + movl %edx, %edi # c + xorl %ebp, %edi # ^d + xorl %ecx, %edi # ^b + leal -0x359d3e2a(%rax,%rsi), %eax # e += RCONST + mixed_W + addl %edi, %eax # e += (c ^ d ^ b) + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) + + popq %rdi # + addl %eax, 80(%rdi) # ctx->hash[0] += a + addl %ebx, 84(%rdi) # ctx->hash[1] += b + addl %ecx, 88(%rdi) # ctx->hash[2] += c + addl %edx, 92(%rdi) # ctx->hash[3] += d + addl %ebp, 96(%rdi) # ctx->hash[4] += e + popq %rbx # + popq %rbp # + popq %r12 # + popq %r13 # + popq %r14 # + popq %r15 # + + ret + .size sha1_process_block64, .-sha1_process_block64 +#endif -- cgit v1.2.3-55-g6feb From 947bef0deaba7b2ce432d515379091dcd4cf747f Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Mon, 3 Jan 2022 13:00:07 +0100 Subject: libbb/sha1: x86_64 version: generate from a script, optimize a bit function old new delta sha1_process_block64 3569 3502 -67 Signed-off-by: Denys Vlasenko --- libbb/Config.src | 2 +- libbb/hash_md5_sha_x86-64.S | 472 ++++++++++++++++++----------------------- libbb/hash_md5_sha_x86-64.S.sh | 267 +++++++++++++++++++++++ 3 files changed, 474 insertions(+), 267 
deletions(-) create mode 100755 libbb/hash_md5_sha_x86-64.S.sh diff --git a/libbb/Config.src b/libbb/Config.src index 42a2283aa..c80bee286 100644 --- a/libbb/Config.src +++ b/libbb/Config.src @@ -59,7 +59,7 @@ config SHA1_SMALL Trade binary size versus speed for the sha1 algorithm. throughput MB/s size of sha1_process_block64 value 486 x86-64 486 x86-64 - 0 367 367 3657 3570 + 0 367 375 3657 3502 1 224 229 654 732 2,3 200 195 358 380 diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S index 466cd9ae9..3e1c4b455 100644 --- a/libbb/hash_md5_sha_x86-64.S +++ b/libbb/hash_md5_sha_x86-64.S @@ -1,23 +1,27 @@ ### Generated by hash_md5_sha_x86-64.S.sh ### -#if defined(__GNUC__) && defined(__x86_64__) + +#if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) .section .text.sha1_process_block64,"ax",@progbits - .globl sha1_process_block64 - .hidden sha1_process_block64 + .globl sha1_process_block64 + .hidden sha1_process_block64 .type sha1_process_block64, @function + + .balign 8 # allow decoders to fetch at least 4 first insns sha1_process_block64: - pushq %r15 # - pushq %r14 # - pushq %r13 # - pushq %r12 # - pushq %rbp # - pushq %rbx # - pushq %rdi # we need ctx at the end + pushq %r15 # + pushq %r14 # + pushq %r13 # + pushq %r12 # + pushq %rbp # + pushq %rbx # + pushq %rdi # we need ctx at the end #Register and stack use: # eax..edx: a..d # ebp: e # esi,edi: temps # -32+4*n(%rsp),r8...r15: W[0..7,8..15] +# (TODO: actually W[0..7] are used a bit more often, put _them_ into r8..r15?)
movq 4*8(%rdi), %r8 bswapq %r8 @@ -253,7 +257,7 @@ sha1_process_block64: xorl %ecx, %edi # ^d andl %eax, %edi # &b xorl %ecx, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rdx,%rsi),%edx # e += RCONST + W[n] + leal 0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n & 15] addl %edi, %edx # e += (((c ^ d) & b) ^ d) movl %ebp, %esi # roll $5, %esi # rotl32(a,5) @@ -270,7 +274,7 @@ sha1_process_block64: xorl %ebx, %edi # ^d andl %ebp, %edi # &b xorl %ebx, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rcx,%rsi),%ecx # e += RCONST + W[n] + leal 0x5A827999(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] addl %edi, %ecx # e += (((c ^ d) & b) ^ d) movl %edx, %esi # roll $5, %esi # rotl32(a,5) @@ -287,7 +291,7 @@ sha1_process_block64: xorl %eax, %edi # ^d andl %edx, %edi # &b xorl %eax, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rbx,%rsi),%ebx # e += RCONST + W[n] + leal 0x5A827999(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] addl %edi, %ebx # e += (((c ^ d) & b) ^ d) movl %ecx, %esi # roll $5, %esi # rotl32(a,5) @@ -304,7 +308,7 @@ sha1_process_block64: xorl %ebp, %edi # ^d andl %ecx, %edi # &b xorl %ebp, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rax,%rsi),%eax # e += RCONST + W[n] + leal 0x5A827999(%rax,%rsi), %eax # e += RCONST + W[n & 15] addl %edi, %eax # e += (((c ^ d) & b) ^ d) movl %ebx, %esi # roll $5, %esi # rotl32(a,5) @@ -320,7 +324,7 @@ sha1_process_block64: movl %ecx, %edi # c xorl %edx, %edi # ^d xorl %ebx, %edi # ^b - leal 0x6ED9EBA1(%rbp,%rsi), %ebp # e += RCONST + mixed_W + leal 0x6ED9EBA1(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] addl %edi, %ebp # e += (c ^ d ^ b) movl %eax, %esi # roll $5, %esi # rotl32(a,5) @@ -336,7 +340,7 @@ sha1_process_block64: movl %ebx, %edi # c xorl %ecx, %edi # ^d xorl %eax, %edi # ^b - leal 0x6ED9EBA1(%rdx,%rsi), %edx # e += RCONST + mixed_W + leal 0x6ED9EBA1(%rdx,%rsi), %edx # e += RCONST + W[n & 15] addl %edi, %edx # e += (c ^ d ^ b) movl %ebp, %esi # roll $5, %esi # rotl32(a,5) @@ -352,7 +356,7 @@ sha1_process_block64: movl 
%eax, %edi # c xorl %ebx, %edi # ^d xorl %ebp, %edi # ^b - leal 0x6ED9EBA1(%rcx,%rsi), %ecx # e += RCONST + mixed_W + leal 0x6ED9EBA1(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] addl %edi, %ecx # e += (c ^ d ^ b) movl %edx, %esi # roll $5, %esi # rotl32(a,5) @@ -368,135 +372,119 @@ sha1_process_block64: movl %ebp, %edi # c xorl %eax, %edi # ^d xorl %edx, %edi # ^b - leal 0x6ED9EBA1(%rbx,%rsi), %ebx # e += RCONST + mixed_W + leal 0x6ED9EBA1(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] addl %edi, %ebx # e += (c ^ d ^ b) movl %ecx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebx # e += rotl32(a,5) rorl $2, %edx # b = rotl32(b,30) # 24 - movl -32+4*5(%rsp), %esi # W[(n+13) & 15] - xorl -32+4*0(%rsp), %esi # ^W[(n+8) & 15] - xorl %r10d, %esi # ^W[(n+2) & 15] - xorl %r8d, %esi # ^W[n & 15] - roll %esi # - movl %esi, %r8d # store to W[n & 15] + xorl -32+4*5(%rsp), %r8d # W[n & 15] ^= W[(n+13) & 15] + xorl -32+4*0(%rsp), %r8d # ^W[(n+8) & 15] + xorl %r10d, %r8d # ^W[(n+2) & 15] + roll %r8d # movl %edx, %edi # c xorl %ebp, %edi # ^d xorl %ecx, %edi # ^b - leal 0x6ED9EBA1(%rax,%rsi), %eax # e += RCONST + mixed_W + leal 0x6ED9EBA1(%rax,%r8), %eax # e += RCONST + W[n & 15] addl %edi, %eax # e += (c ^ d ^ b) movl %ebx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %eax # e += rotl32(a,5) rorl $2, %ecx # b = rotl32(b,30) # 25 - movl -32+4*6(%rsp), %esi # W[(n+13) & 15] - xorl -32+4*1(%rsp), %esi # ^W[(n+8) & 15] - xorl %r11d, %esi # ^W[(n+2) & 15] - xorl %r9d, %esi # ^W[n & 15] - roll %esi # - movl %esi, %r9d # store to W[n & 15] + xorl -32+4*6(%rsp), %r9d # W[n & 15] ^= W[(n+13) & 15] + xorl -32+4*1(%rsp), %r9d # ^W[(n+8) & 15] + xorl %r11d, %r9d # ^W[(n+2) & 15] + roll %r9d # movl %ecx, %edi # c xorl %edx, %edi # ^d xorl %ebx, %edi # ^b - leal 0x6ED9EBA1(%rbp,%rsi), %ebp # e += RCONST + mixed_W + leal 0x6ED9EBA1(%rbp,%r9), %ebp # e += RCONST + W[n & 15] addl %edi, %ebp # e += (c ^ d ^ b) movl %eax, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebp # e += rotl32(a,5) rorl $2, 
%ebx # b = rotl32(b,30) # 26 - movl -32+4*7(%rsp), %esi # W[(n+13) & 15] - xorl -32+4*2(%rsp), %esi # ^W[(n+8) & 15] - xorl %r12d, %esi # ^W[(n+2) & 15] - xorl %r10d, %esi # ^W[n & 15] - roll %esi # - movl %esi, %r10d # store to W[n & 15] + xorl -32+4*7(%rsp), %r10d # W[n & 15] ^= W[(n+13) & 15] + xorl -32+4*2(%rsp), %r10d # ^W[(n+8) & 15] + xorl %r12d, %r10d # ^W[(n+2) & 15] + roll %r10d # movl %ebx, %edi # c xorl %ecx, %edi # ^d xorl %eax, %edi # ^b - leal 0x6ED9EBA1(%rdx,%rsi), %edx # e += RCONST + mixed_W + leal 0x6ED9EBA1(%rdx,%r10), %edx # e += RCONST + W[n & 15] addl %edi, %edx # e += (c ^ d ^ b) movl %ebp, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %edx # e += rotl32(a,5) rorl $2, %eax # b = rotl32(b,30) # 27 - movl %r8d, %esi # W[(n+13) & 15] - xorl -32+4*3(%rsp), %esi # ^W[(n+8) & 15] - xorl %r13d, %esi # ^W[(n+2) & 15] - xorl %r11d, %esi # ^W[n & 15] - roll %esi # - movl %esi, %r11d # store to W[n & 15] + xorl %r8d, %r11d # W[n & 15] ^= W[(n+13) & 15] + xorl -32+4*3(%rsp), %r11d # ^W[(n+8) & 15] + xorl %r13d, %r11d # ^W[(n+2) & 15] + roll %r11d # movl %eax, %edi # c xorl %ebx, %edi # ^d xorl %ebp, %edi # ^b - leal 0x6ED9EBA1(%rcx,%rsi), %ecx # e += RCONST + mixed_W + leal 0x6ED9EBA1(%rcx,%r11), %ecx # e += RCONST + W[n & 15] addl %edi, %ecx # e += (c ^ d ^ b) movl %edx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ecx # e += rotl32(a,5) rorl $2, %ebp # b = rotl32(b,30) # 28 - movl %r9d, %esi # W[(n+13) & 15] - xorl -32+4*4(%rsp), %esi # ^W[(n+8) & 15] - xorl %r14d, %esi # ^W[(n+2) & 15] - xorl %r12d, %esi # ^W[n & 15] - roll %esi # - movl %esi, %r12d # store to W[n & 15] + xorl %r9d, %r12d # W[n & 15] ^= W[(n+13) & 15] + xorl -32+4*4(%rsp), %r12d # ^W[(n+8) & 15] + xorl %r14d, %r12d # ^W[(n+2) & 15] + roll %r12d # movl %ebp, %edi # c xorl %eax, %edi # ^d xorl %edx, %edi # ^b - leal 0x6ED9EBA1(%rbx,%rsi), %ebx # e += RCONST + mixed_W + leal 0x6ED9EBA1(%rbx,%r12), %ebx # e += RCONST + W[n & 15] addl %edi, %ebx # e += (c ^ d ^ b) movl %ecx, %esi # roll 
$5, %esi # rotl32(a,5) addl %esi, %ebx # e += rotl32(a,5) rorl $2, %edx # b = rotl32(b,30) # 29 - movl %r10d, %esi # W[(n+13) & 15] - xorl -32+4*5(%rsp), %esi # ^W[(n+8) & 15] - xorl %r15d, %esi # ^W[(n+2) & 15] - xorl %r13d, %esi # ^W[n & 15] - roll %esi # - movl %esi, %r13d # store to W[n & 15] + xorl %r10d, %r13d # W[n & 15] ^= W[(n+13) & 15] + xorl -32+4*5(%rsp), %r13d # ^W[(n+8) & 15] + xorl %r15d, %r13d # ^W[(n+2) & 15] + roll %r13d # movl %edx, %edi # c xorl %ebp, %edi # ^d xorl %ecx, %edi # ^b - leal 0x6ED9EBA1(%rax,%rsi), %eax # e += RCONST + mixed_W + leal 0x6ED9EBA1(%rax,%r13), %eax # e += RCONST + W[n & 15] addl %edi, %eax # e += (c ^ d ^ b) movl %ebx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %eax # e += rotl32(a,5) rorl $2, %ecx # b = rotl32(b,30) # 30 - movl %r11d, %esi # W[(n+13) & 15] - xorl -32+4*6(%rsp), %esi # ^W[(n+8) & 15] - xorl -32+4*0(%rsp), %esi # ^W[(n+2) & 15] - xorl %r14d, %esi # ^W[n & 15] - roll %esi # - movl %esi, %r14d # store to W[n & 15] + xorl %r11d, %r14d # W[n & 15] ^= W[(n+13) & 15] + xorl -32+4*6(%rsp), %r14d # ^W[(n+8) & 15] + xorl -32+4*0(%rsp), %r14d # ^W[(n+2) & 15] + roll %r14d # movl %ecx, %edi # c xorl %edx, %edi # ^d xorl %ebx, %edi # ^b - leal 0x6ED9EBA1(%rbp,%rsi), %ebp # e += RCONST + mixed_W + leal 0x6ED9EBA1(%rbp,%r14), %ebp # e += RCONST + W[n & 15] addl %edi, %ebp # e += (c ^ d ^ b) movl %eax, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebp # e += rotl32(a,5) rorl $2, %ebx # b = rotl32(b,30) # 31 - movl %r12d, %esi # W[(n+13) & 15] - xorl -32+4*7(%rsp), %esi # ^W[(n+8) & 15] - xorl -32+4*1(%rsp), %esi # ^W[(n+2) & 15] - xorl %r15d, %esi # ^W[n & 15] - roll %esi # - movl %esi, %r15d # store to W[n & 15] + xorl %r12d, %r15d # W[n & 15] ^= W[(n+13) & 15] + xorl -32+4*7(%rsp), %r15d # ^W[(n+8) & 15] + xorl -32+4*1(%rsp), %r15d # ^W[(n+2) & 15] + roll %r15d # movl %ebx, %edi # c xorl %ecx, %edi # ^d xorl %eax, %edi # ^b - leal 0x6ED9EBA1(%rdx,%rsi), %edx # e += RCONST + mixed_W + leal 0x6ED9EBA1(%rdx,%r15), 
%edx # e += RCONST + W[n & 15] addl %edi, %edx # e += (c ^ d ^ b) movl %ebp, %esi # roll $5, %esi # rotl32(a,5) @@ -512,7 +500,7 @@ sha1_process_block64: movl %eax, %edi # c xorl %ebx, %edi # ^d xorl %ebp, %edi # ^b - leal 0x6ED9EBA1(%rcx,%rsi), %ecx # e += RCONST + mixed_W + leal 0x6ED9EBA1(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] addl %edi, %ecx # e += (c ^ d ^ b) movl %edx, %esi # roll $5, %esi # rotl32(a,5) @@ -528,7 +516,7 @@ sha1_process_block64: movl %ebp, %edi # c xorl %eax, %edi # ^d xorl %edx, %edi # ^b - leal 0x6ED9EBA1(%rbx,%rsi), %ebx # e += RCONST + mixed_W + leal 0x6ED9EBA1(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] addl %edi, %ebx # e += (c ^ d ^ b) movl %ecx, %esi # roll $5, %esi # rotl32(a,5) @@ -544,7 +532,7 @@ sha1_process_block64: movl %edx, %edi # c xorl %ebp, %edi # ^d xorl %ecx, %edi # ^b - leal 0x6ED9EBA1(%rax,%rsi), %eax # e += RCONST + mixed_W + leal 0x6ED9EBA1(%rax,%rsi), %eax # e += RCONST + W[n & 15] addl %edi, %eax # e += (c ^ d ^ b) movl %ebx, %esi # roll $5, %esi # rotl32(a,5) @@ -560,7 +548,7 @@ sha1_process_block64: movl %ecx, %edi # c xorl %edx, %edi # ^d xorl %ebx, %edi # ^b - leal 0x6ED9EBA1(%rbp,%rsi), %ebp # e += RCONST + mixed_W + leal 0x6ED9EBA1(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] addl %edi, %ebp # e += (c ^ d ^ b) movl %eax, %esi # roll $5, %esi # rotl32(a,5) @@ -576,7 +564,7 @@ sha1_process_block64: movl %ebx, %edi # c xorl %ecx, %edi # ^d xorl %eax, %edi # ^b - leal 0x6ED9EBA1(%rdx,%rsi), %edx # e += RCONST + mixed_W + leal 0x6ED9EBA1(%rdx,%rsi), %edx # e += RCONST + W[n & 15] addl %edi, %edx # e += (c ^ d ^ b) movl %ebp, %esi # roll $5, %esi # rotl32(a,5) @@ -592,7 +580,7 @@ sha1_process_block64: movl %eax, %edi # c xorl %ebx, %edi # ^d xorl %ebp, %edi # ^b - leal 0x6ED9EBA1(%rcx,%rsi), %ecx # e += RCONST + mixed_W + leal 0x6ED9EBA1(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] addl %edi, %ecx # e += (c ^ d ^ b) movl %edx, %esi # roll $5, %esi # rotl32(a,5) @@ -608,7 +596,7 @@ sha1_process_block64: movl %ebp, %edi 
# c xorl %eax, %edi # ^d xorl %edx, %edi # ^b - leal 0x6ED9EBA1(%rbx,%rsi), %ebx # e += RCONST + mixed_W + leal 0x6ED9EBA1(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] addl %edi, %ebx # e += (c ^ d ^ b) movl %ecx, %esi # roll $5, %esi # rotl32(a,5) @@ -624,7 +612,7 @@ sha1_process_block64: movl %edx, %edi # c xorl %ebp, %edi # ^d xorl %ecx, %edi # ^b - leal 0x6ED9EBA1(%rax,%rsi), %eax # e += RCONST + mixed_W + leal 0x6ED9EBA1(%rax,%rsi), %eax # e += RCONST + W[n & 15] addl %edi, %eax # e += (c ^ d ^ b) movl %ebx, %esi # roll $5, %esi # rotl32(a,5) @@ -637,14 +625,12 @@ sha1_process_block64: andl %ecx, %esi # si: b & c andl %edx, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) - movl -32+4*5(%rsp), %esi # W[(n+13) & 15] - xorl -32+4*0(%rsp), %esi # ^W[(n+8) & 15] - xorl %r10d, %esi # ^W[(n+2) & 15] - xorl %r8d, %esi # ^W[n & 15] - roll %esi # - movl %esi, %r8d # store to W[n & 15] + xorl -32+4*5(%rsp), %r8d # W[n & 15] ^= W[(n+13) & 15] + xorl -32+4*0(%rsp), %r8d # ^W[(n+8) & 15] + xorl %r10d, %r8d # ^W[(n+2) & 15] + roll %r8d # addl %edi, %ebp # += ((b | c) & d) | (b & c) - leal -0x70e44324(%rbp,%rsi), %ebp # e += RCONST + mixed_W + leal -0x70E44324(%rbp,%r8), %ebp # e += RCONST + W[n & 15] movl %eax, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebp # e += rotl32(a,5) @@ -656,14 +642,12 @@ sha1_process_block64: andl %ebx, %esi # si: b & c andl %ecx, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) - movl -32+4*6(%rsp), %esi # W[(n+13) & 15] - xorl -32+4*1(%rsp), %esi # ^W[(n+8) & 15] - xorl %r11d, %esi # ^W[(n+2) & 15] - xorl %r9d, %esi # ^W[n & 15] - roll %esi # - movl %esi, %r9d # store to W[n & 15] + xorl -32+4*6(%rsp), %r9d # W[n & 15] ^= W[(n+13) & 15] + xorl -32+4*1(%rsp), %r9d # ^W[(n+8) & 15] + xorl %r11d, %r9d # ^W[(n+2) & 15] + roll %r9d # addl %edi, %edx # += ((b | c) & d) | (b & c) - leal -0x70e44324(%rdx,%rsi), %edx # e += RCONST + mixed_W + leal -0x70E44324(%rdx,%r9), %edx # e += RCONST + W[n & 15] movl %ebp, %esi # roll 
$5, %esi # rotl32(a,5) addl %esi, %edx # e += rotl32(a,5) @@ -675,14 +659,12 @@ sha1_process_block64: andl %eax, %esi # si: b & c andl %ebx, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) - movl -32+4*7(%rsp), %esi # W[(n+13) & 15] - xorl -32+4*2(%rsp), %esi # ^W[(n+8) & 15] - xorl %r12d, %esi # ^W[(n+2) & 15] - xorl %r10d, %esi # ^W[n & 15] - roll %esi # - movl %esi, %r10d # store to W[n & 15] + xorl -32+4*7(%rsp), %r10d # W[n & 15] ^= W[(n+13) & 15] + xorl -32+4*2(%rsp), %r10d # ^W[(n+8) & 15] + xorl %r12d, %r10d # ^W[(n+2) & 15] + roll %r10d # addl %edi, %ecx # += ((b | c) & d) | (b & c) - leal -0x70e44324(%rcx,%rsi), %ecx # e += RCONST + mixed_W + leal -0x70E44324(%rcx,%r10), %ecx # e += RCONST + W[n & 15] movl %edx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ecx # e += rotl32(a,5) @@ -694,14 +676,12 @@ sha1_process_block64: andl %ebp, %esi # si: b & c andl %eax, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) - movl %r8d, %esi # W[(n+13) & 15] - xorl -32+4*3(%rsp), %esi # ^W[(n+8) & 15] - xorl %r13d, %esi # ^W[(n+2) & 15] - xorl %r11d, %esi # ^W[n & 15] - roll %esi # - movl %esi, %r11d # store to W[n & 15] + xorl %r8d, %r11d # W[n & 15] ^= W[(n+13) & 15] + xorl -32+4*3(%rsp), %r11d # ^W[(n+8) & 15] + xorl %r13d, %r11d # ^W[(n+2) & 15] + roll %r11d # addl %edi, %ebx # += ((b | c) & d) | (b & c) - leal -0x70e44324(%rbx,%rsi), %ebx # e += RCONST + mixed_W + leal -0x70E44324(%rbx,%r11), %ebx # e += RCONST + W[n & 15] movl %ecx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebx # e += rotl32(a,5) @@ -713,14 +693,12 @@ sha1_process_block64: andl %edx, %esi # si: b & c andl %ebp, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) - movl %r9d, %esi # W[(n+13) & 15] - xorl -32+4*4(%rsp), %esi # ^W[(n+8) & 15] - xorl %r14d, %esi # ^W[(n+2) & 15] - xorl %r12d, %esi # ^W[n & 15] - roll %esi # - movl %esi, %r12d # store to W[n & 15] + xorl %r9d, %r12d # W[n & 15] ^= W[(n+13) & 15] + xorl -32+4*4(%rsp), %r12d # ^W[(n+8) & 
15] + xorl %r14d, %r12d # ^W[(n+2) & 15] + roll %r12d # addl %edi, %eax # += ((b | c) & d) | (b & c) - leal -0x70e44324(%rax,%rsi), %eax # e += RCONST + mixed_W + leal -0x70E44324(%rax,%r12), %eax # e += RCONST + W[n & 15] movl %ebx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %eax # e += rotl32(a,5) @@ -732,14 +710,12 @@ sha1_process_block64: andl %ecx, %esi # si: b & c andl %edx, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) - movl %r10d, %esi # W[(n+13) & 15] - xorl -32+4*5(%rsp), %esi # ^W[(n+8) & 15] - xorl %r15d, %esi # ^W[(n+2) & 15] - xorl %r13d, %esi # ^W[n & 15] - roll %esi # - movl %esi, %r13d # store to W[n & 15] + xorl %r10d, %r13d # W[n & 15] ^= W[(n+13) & 15] + xorl -32+4*5(%rsp), %r13d # ^W[(n+8) & 15] + xorl %r15d, %r13d # ^W[(n+2) & 15] + roll %r13d # addl %edi, %ebp # += ((b | c) & d) | (b & c) - leal -0x70e44324(%rbp,%rsi), %ebp # e += RCONST + mixed_W + leal -0x70E44324(%rbp,%r13), %ebp # e += RCONST + W[n & 15] movl %eax, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebp # e += rotl32(a,5) @@ -751,14 +727,12 @@ sha1_process_block64: andl %ebx, %esi # si: b & c andl %ecx, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) - movl %r11d, %esi # W[(n+13) & 15] - xorl -32+4*6(%rsp), %esi # ^W[(n+8) & 15] - xorl -32+4*0(%rsp), %esi # ^W[(n+2) & 15] - xorl %r14d, %esi # ^W[n & 15] - roll %esi # - movl %esi, %r14d # store to W[n & 15] + xorl %r11d, %r14d # W[n & 15] ^= W[(n+13) & 15] + xorl -32+4*6(%rsp), %r14d # ^W[(n+8) & 15] + xorl -32+4*0(%rsp), %r14d # ^W[(n+2) & 15] + roll %r14d # addl %edi, %edx # += ((b | c) & d) | (b & c) - leal -0x70e44324(%rdx,%rsi), %edx # e += RCONST + mixed_W + leal -0x70E44324(%rdx,%r14), %edx # e += RCONST + W[n & 15] movl %ebp, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %edx # e += rotl32(a,5) @@ -770,14 +744,12 @@ sha1_process_block64: andl %eax, %esi # si: b & c andl %ebx, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) - movl %r12d, %esi # W[(n+13) & 15] - xorl 
-32+4*7(%rsp), %esi # ^W[(n+8) & 15] - xorl -32+4*1(%rsp), %esi # ^W[(n+2) & 15] - xorl %r15d, %esi # ^W[n & 15] - roll %esi # - movl %esi, %r15d # store to W[n & 15] + xorl %r12d, %r15d # W[n & 15] ^= W[(n+13) & 15] + xorl -32+4*7(%rsp), %r15d # ^W[(n+8) & 15] + xorl -32+4*1(%rsp), %r15d # ^W[(n+2) & 15] + roll %r15d # addl %edi, %ecx # += ((b | c) & d) | (b & c) - leal -0x70e44324(%rcx,%rsi), %ecx # e += RCONST + mixed_W + leal -0x70E44324(%rcx,%r15), %ecx # e += RCONST + W[n & 15] movl %edx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ecx # e += rotl32(a,5) @@ -796,7 +768,7 @@ sha1_process_block64: roll %esi # movl %esi, -32+4*0(%rsp) # store to W[n & 15] addl %edi, %ebx # += ((b | c) & d) | (b & c) - leal -0x70e44324(%rbx,%rsi), %ebx # e += RCONST + mixed_W + leal -0x70E44324(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] movl %ecx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebx # e += rotl32(a,5) @@ -815,7 +787,7 @@ sha1_process_block64: roll %esi # movl %esi, -32+4*1(%rsp) # store to W[n & 15] addl %edi, %eax # += ((b | c) & d) | (b & c) - leal -0x70e44324(%rax,%rsi), %eax # e += RCONST + mixed_W + leal -0x70E44324(%rax,%rsi), %eax # e += RCONST + W[n & 15] movl %ebx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %eax # e += rotl32(a,5) @@ -834,7 +806,7 @@ sha1_process_block64: roll %esi # movl %esi, -32+4*2(%rsp) # store to W[n & 15] addl %edi, %ebp # += ((b | c) & d) | (b & c) - leal -0x70e44324(%rbp,%rsi), %ebp # e += RCONST + mixed_W + leal -0x70E44324(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] movl %eax, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebp # e += rotl32(a,5) @@ -853,7 +825,7 @@ sha1_process_block64: roll %esi # movl %esi, -32+4*3(%rsp) # store to W[n & 15] addl %edi, %edx # += ((b | c) & d) | (b & c) - leal -0x70e44324(%rdx,%rsi), %edx # e += RCONST + mixed_W + leal -0x70E44324(%rdx,%rsi), %edx # e += RCONST + W[n & 15] movl %ebp, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %edx # e += rotl32(a,5) @@ -872,7 +844,7 @@ 
sha1_process_block64: roll %esi # movl %esi, -32+4*4(%rsp) # store to W[n & 15] addl %edi, %ecx # += ((b | c) & d) | (b & c) - leal -0x70e44324(%rcx,%rsi), %ecx # e += RCONST + mixed_W + leal -0x70E44324(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] movl %edx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ecx # e += rotl32(a,5) @@ -891,7 +863,7 @@ sha1_process_block64: roll %esi # movl %esi, -32+4*5(%rsp) # store to W[n & 15] addl %edi, %ebx # += ((b | c) & d) | (b & c) - leal -0x70e44324(%rbx,%rsi), %ebx # e += RCONST + mixed_W + leal -0x70E44324(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] movl %ecx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebx # e += rotl32(a,5) @@ -910,7 +882,7 @@ sha1_process_block64: roll %esi # movl %esi, -32+4*6(%rsp) # store to W[n & 15] addl %edi, %eax # += ((b | c) & d) | (b & c) - leal -0x70e44324(%rax,%rsi), %eax # e += RCONST + mixed_W + leal -0x70E44324(%rax,%rsi), %eax # e += RCONST + W[n & 15] movl %ebx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %eax # e += rotl32(a,5) @@ -929,7 +901,7 @@ sha1_process_block64: roll %esi # movl %esi, -32+4*7(%rsp) # store to W[n & 15] addl %edi, %ebp # += ((b | c) & d) | (b & c) - leal -0x70e44324(%rbp,%rsi), %ebp # e += RCONST + mixed_W + leal -0x70E44324(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] movl %eax, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebp # e += rotl32(a,5) @@ -941,14 +913,12 @@ sha1_process_block64: andl %ebx, %esi # si: b & c andl %ecx, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) - movl -32+4*5(%rsp), %esi # W[(n+13) & 15] - xorl -32+4*0(%rsp), %esi # ^W[(n+8) & 15] - xorl %r10d, %esi # ^W[(n+2) & 15] - xorl %r8d, %esi # ^W[n & 15] - roll %esi # - movl %esi, %r8d # store to W[n & 15] + xorl -32+4*5(%rsp), %r8d # W[n & 15] ^= W[(n+13) & 15] + xorl -32+4*0(%rsp), %r8d # ^W[(n+8) & 15] + xorl %r10d, %r8d # ^W[(n+2) & 15] + roll %r8d # addl %edi, %edx # += ((b | c) & d) | (b & c) - leal -0x70e44324(%rdx,%rsi), %edx # e += RCONST + mixed_W + leal 
-0x70E44324(%rdx,%r8), %edx # e += RCONST + W[n & 15] movl %ebp, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %edx # e += rotl32(a,5) @@ -960,14 +930,12 @@ sha1_process_block64: andl %eax, %esi # si: b & c andl %ebx, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) - movl -32+4*6(%rsp), %esi # W[(n+13) & 15] - xorl -32+4*1(%rsp), %esi # ^W[(n+8) & 15] - xorl %r11d, %esi # ^W[(n+2) & 15] - xorl %r9d, %esi # ^W[n & 15] - roll %esi # - movl %esi, %r9d # store to W[n & 15] + xorl -32+4*6(%rsp), %r9d # W[n & 15] ^= W[(n+13) & 15] + xorl -32+4*1(%rsp), %r9d # ^W[(n+8) & 15] + xorl %r11d, %r9d # ^W[(n+2) & 15] + roll %r9d # addl %edi, %ecx # += ((b | c) & d) | (b & c) - leal -0x70e44324(%rcx,%rsi), %ecx # e += RCONST + mixed_W + leal -0x70E44324(%rcx,%r9), %ecx # e += RCONST + W[n & 15] movl %edx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ecx # e += rotl32(a,5) @@ -979,14 +947,12 @@ sha1_process_block64: andl %ebp, %esi # si: b & c andl %eax, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) - movl -32+4*7(%rsp), %esi # W[(n+13) & 15] - xorl -32+4*2(%rsp), %esi # ^W[(n+8) & 15] - xorl %r12d, %esi # ^W[(n+2) & 15] - xorl %r10d, %esi # ^W[n & 15] - roll %esi # - movl %esi, %r10d # store to W[n & 15] + xorl -32+4*7(%rsp), %r10d # W[n & 15] ^= W[(n+13) & 15] + xorl -32+4*2(%rsp), %r10d # ^W[(n+8) & 15] + xorl %r12d, %r10d # ^W[(n+2) & 15] + roll %r10d # addl %edi, %ebx # += ((b | c) & d) | (b & c) - leal -0x70e44324(%rbx,%rsi), %ebx # e += RCONST + mixed_W + leal -0x70E44324(%rbx,%r10), %ebx # e += RCONST + W[n & 15] movl %ecx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebx # e += rotl32(a,5) @@ -998,77 +964,67 @@ sha1_process_block64: andl %edx, %esi # si: b & c andl %ebp, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) - movl %r8d, %esi # W[(n+13) & 15] - xorl -32+4*3(%rsp), %esi # ^W[(n+8) & 15] - xorl %r13d, %esi # ^W[(n+2) & 15] - xorl %r11d, %esi # ^W[n & 15] - roll %esi # - movl %esi, %r11d # store to W[n & 15] + 
xorl %r8d, %r11d # W[n & 15] ^= W[(n+13) & 15] + xorl -32+4*3(%rsp), %r11d # ^W[(n+8) & 15] + xorl %r13d, %r11d # ^W[(n+2) & 15] + roll %r11d # addl %edi, %eax # += ((b | c) & d) | (b & c) - leal -0x70e44324(%rax,%rsi), %eax # e += RCONST + mixed_W + leal -0x70E44324(%rax,%r11), %eax # e += RCONST + W[n & 15] movl %ebx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %eax # e += rotl32(a,5) rorl $2, %ecx # b = rotl32(b,30) # 60 - movl %r9d, %esi # W[(n+13) & 15] - xorl -32+4*4(%rsp), %esi # ^W[(n+8) & 15] - xorl %r14d, %esi # ^W[(n+2) & 15] - xorl %r12d, %esi # ^W[n & 15] - roll %esi # - movl %esi, %r12d # store to W[n & 15] + xorl %r9d, %r12d # W[n & 15] ^= W[(n+13) & 15] + xorl -32+4*4(%rsp), %r12d # ^W[(n+8) & 15] + xorl %r14d, %r12d # ^W[(n+2) & 15] + roll %r12d # movl %ecx, %edi # c xorl %edx, %edi # ^d xorl %ebx, %edi # ^b - leal -0x359d3e2a(%rbp,%rsi), %ebp # e += RCONST + mixed_W + leal -0x359D3E2A(%rbp,%r12), %ebp # e += RCONST + W[n & 15] addl %edi, %ebp # e += (c ^ d ^ b) movl %eax, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebp # e += rotl32(a,5) rorl $2, %ebx # b = rotl32(b,30) # 61 - movl %r10d, %esi # W[(n+13) & 15] - xorl -32+4*5(%rsp), %esi # ^W[(n+8) & 15] - xorl %r15d, %esi # ^W[(n+2) & 15] - xorl %r13d, %esi # ^W[n & 15] - roll %esi # - movl %esi, %r13d # store to W[n & 15] + xorl %r10d, %r13d # W[n & 15] ^= W[(n+13) & 15] + xorl -32+4*5(%rsp), %r13d # ^W[(n+8) & 15] + xorl %r15d, %r13d # ^W[(n+2) & 15] + roll %r13d # movl %ebx, %edi # c xorl %ecx, %edi # ^d xorl %eax, %edi # ^b - leal -0x359d3e2a(%rdx,%rsi), %edx # e += RCONST + mixed_W + leal -0x359D3E2A(%rdx,%r13), %edx # e += RCONST + W[n & 15] addl %edi, %edx # e += (c ^ d ^ b) movl %ebp, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %edx # e += rotl32(a,5) rorl $2, %eax # b = rotl32(b,30) # 62 - movl %r11d, %esi # W[(n+13) & 15] - xorl -32+4*6(%rsp), %esi # ^W[(n+8) & 15] - xorl -32+4*0(%rsp), %esi # ^W[(n+2) & 15] - xorl %r14d, %esi # ^W[n & 15] - roll %esi # - movl %esi, %r14d # store 
to W[n & 15] + xorl %r11d, %r14d # W[n & 15] ^= W[(n+13) & 15] + xorl -32+4*6(%rsp), %r14d # ^W[(n+8) & 15] + xorl -32+4*0(%rsp), %r14d # ^W[(n+2) & 15] + roll %r14d # movl %eax, %edi # c xorl %ebx, %edi # ^d xorl %ebp, %edi # ^b - leal -0x359d3e2a(%rcx,%rsi), %ecx # e += RCONST + mixed_W + leal -0x359D3E2A(%rcx,%r14), %ecx # e += RCONST + W[n & 15] addl %edi, %ecx # e += (c ^ d ^ b) movl %edx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ecx # e += rotl32(a,5) rorl $2, %ebp # b = rotl32(b,30) # 63 - movl %r12d, %esi # W[(n+13) & 15] - xorl -32+4*7(%rsp), %esi # ^W[(n+8) & 15] - xorl -32+4*1(%rsp), %esi # ^W[(n+2) & 15] - xorl %r15d, %esi # ^W[n & 15] - roll %esi # - movl %esi, %r15d # store to W[n & 15] + xorl %r12d, %r15d # W[n & 15] ^= W[(n+13) & 15] + xorl -32+4*7(%rsp), %r15d # ^W[(n+8) & 15] + xorl -32+4*1(%rsp), %r15d # ^W[(n+2) & 15] + roll %r15d # movl %ebp, %edi # c xorl %eax, %edi # ^d xorl %edx, %edi # ^b - leal -0x359d3e2a(%rbx,%rsi), %ebx # e += RCONST + mixed_W + leal -0x359D3E2A(%rbx,%r15), %ebx # e += RCONST + W[n & 15] addl %edi, %ebx # e += (c ^ d ^ b) movl %ecx, %esi # roll $5, %esi # rotl32(a,5) @@ -1084,7 +1040,7 @@ sha1_process_block64: movl %edx, %edi # c xorl %ebp, %edi # ^d xorl %ecx, %edi # ^b - leal -0x359d3e2a(%rax,%rsi), %eax # e += RCONST + mixed_W + leal -0x359D3E2A(%rax,%rsi), %eax # e += RCONST + W[n & 15] addl %edi, %eax # e += (c ^ d ^ b) movl %ebx, %esi # roll $5, %esi # rotl32(a,5) @@ -1100,7 +1056,7 @@ sha1_process_block64: movl %ecx, %edi # c xorl %edx, %edi # ^d xorl %ebx, %edi # ^b - leal -0x359d3e2a(%rbp,%rsi), %ebp # e += RCONST + mixed_W + leal -0x359D3E2A(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] addl %edi, %ebp # e += (c ^ d ^ b) movl %eax, %esi # roll $5, %esi # rotl32(a,5) @@ -1116,7 +1072,7 @@ sha1_process_block64: movl %ebx, %edi # c xorl %ecx, %edi # ^d xorl %eax, %edi # ^b - leal -0x359d3e2a(%rdx,%rsi), %edx # e += RCONST + mixed_W + leal -0x359D3E2A(%rdx,%rsi), %edx # e += RCONST + W[n & 15] addl %edi, %edx # 
e += (c ^ d ^ b) movl %ebp, %esi # roll $5, %esi # rotl32(a,5) @@ -1132,7 +1088,7 @@ sha1_process_block64: movl %eax, %edi # c xorl %ebx, %edi # ^d xorl %ebp, %edi # ^b - leal -0x359d3e2a(%rcx,%rsi), %ecx # e += RCONST + mixed_W + leal -0x359D3E2A(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] addl %edi, %ecx # e += (c ^ d ^ b) movl %edx, %esi # roll $5, %esi # rotl32(a,5) @@ -1148,7 +1104,7 @@ sha1_process_block64: movl %ebp, %edi # c xorl %eax, %edi # ^d xorl %edx, %edi # ^b - leal -0x359d3e2a(%rbx,%rsi), %ebx # e += RCONST + mixed_W + leal -0x359D3E2A(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] addl %edi, %ebx # e += (c ^ d ^ b) movl %ecx, %esi # roll $5, %esi # rotl32(a,5) @@ -1164,7 +1120,7 @@ sha1_process_block64: movl %edx, %edi # c xorl %ebp, %edi # ^d xorl %ecx, %edi # ^b - leal -0x359d3e2a(%rax,%rsi), %eax # e += RCONST + mixed_W + leal -0x359D3E2A(%rax,%rsi), %eax # e += RCONST + W[n & 15] addl %edi, %eax # e += (c ^ d ^ b) movl %ebx, %esi # roll $5, %esi # rotl32(a,5) @@ -1180,7 +1136,7 @@ sha1_process_block64: movl %ecx, %edi # c xorl %edx, %edi # ^d xorl %ebx, %edi # ^b - leal -0x359d3e2a(%rbp,%rsi), %ebp # e += RCONST + mixed_W + leal -0x359D3E2A(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] addl %edi, %ebp # e += (c ^ d ^ b) movl %eax, %esi # roll $5, %esi # rotl32(a,5) @@ -1196,135 +1152,119 @@ sha1_process_block64: movl %ebx, %edi # c xorl %ecx, %edi # ^d xorl %eax, %edi # ^b - leal -0x359d3e2a(%rdx,%rsi), %edx # e += RCONST + mixed_W + leal -0x359D3E2A(%rdx,%rsi), %edx # e += RCONST + W[n & 15] addl %edi, %edx # e += (c ^ d ^ b) movl %ebp, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %edx # e += rotl32(a,5) rorl $2, %eax # b = rotl32(b,30) # 72 - movl -32+4*5(%rsp), %esi # W[(n+13) & 15] - xorl -32+4*0(%rsp), %esi # ^W[(n+8) & 15] - xorl %r10d, %esi # ^W[(n+2) & 15] - xorl %r8d, %esi # ^W[n & 15] - roll %esi # - movl %esi, %r8d # store to W[n & 15] + xorl -32+4*5(%rsp), %r8d # W[n & 15] ^= W[(n+13) & 15] + xorl -32+4*0(%rsp), %r8d # ^W[(n+8) & 15] + 
xorl %r10d, %r8d # ^W[(n+2) & 15] + roll %r8d # movl %eax, %edi # c xorl %ebx, %edi # ^d xorl %ebp, %edi # ^b - leal -0x359d3e2a(%rcx,%rsi), %ecx # e += RCONST + mixed_W + leal -0x359D3E2A(%rcx,%r8), %ecx # e += RCONST + W[n & 15] addl %edi, %ecx # e += (c ^ d ^ b) movl %edx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ecx # e += rotl32(a,5) rorl $2, %ebp # b = rotl32(b,30) # 73 - movl -32+4*6(%rsp), %esi # W[(n+13) & 15] - xorl -32+4*1(%rsp), %esi # ^W[(n+8) & 15] - xorl %r11d, %esi # ^W[(n+2) & 15] - xorl %r9d, %esi # ^W[n & 15] - roll %esi # - movl %esi, %r9d # store to W[n & 15] + xorl -32+4*6(%rsp), %r9d # W[n & 15] ^= W[(n+13) & 15] + xorl -32+4*1(%rsp), %r9d # ^W[(n+8) & 15] + xorl %r11d, %r9d # ^W[(n+2) & 15] + roll %r9d # movl %ebp, %edi # c xorl %eax, %edi # ^d xorl %edx, %edi # ^b - leal -0x359d3e2a(%rbx,%rsi), %ebx # e += RCONST + mixed_W + leal -0x359D3E2A(%rbx,%r9), %ebx # e += RCONST + W[n & 15] addl %edi, %ebx # e += (c ^ d ^ b) movl %ecx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebx # e += rotl32(a,5) rorl $2, %edx # b = rotl32(b,30) # 74 - movl -32+4*7(%rsp), %esi # W[(n+13) & 15] - xorl -32+4*2(%rsp), %esi # ^W[(n+8) & 15] - xorl %r12d, %esi # ^W[(n+2) & 15] - xorl %r10d, %esi # ^W[n & 15] - roll %esi # - movl %esi, %r10d # store to W[n & 15] + xorl -32+4*7(%rsp), %r10d # W[n & 15] ^= W[(n+13) & 15] + xorl -32+4*2(%rsp), %r10d # ^W[(n+8) & 15] + xorl %r12d, %r10d # ^W[(n+2) & 15] + roll %r10d # movl %edx, %edi # c xorl %ebp, %edi # ^d xorl %ecx, %edi # ^b - leal -0x359d3e2a(%rax,%rsi), %eax # e += RCONST + mixed_W + leal -0x359D3E2A(%rax,%r10), %eax # e += RCONST + W[n & 15] addl %edi, %eax # e += (c ^ d ^ b) movl %ebx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %eax # e += rotl32(a,5) rorl $2, %ecx # b = rotl32(b,30) # 75 - movl %r8d, %esi # W[(n+13) & 15] - xorl -32+4*3(%rsp), %esi # ^W[(n+8) & 15] - xorl %r13d, %esi # ^W[(n+2) & 15] - xorl %r11d, %esi # ^W[n & 15] - roll %esi # - movl %esi, %r11d # store to W[n & 15] + xorl %r8d, 
%r11d # W[n & 15] ^= W[(n+13) & 15] + xorl -32+4*3(%rsp), %r11d # ^W[(n+8) & 15] + xorl %r13d, %r11d # ^W[(n+2) & 15] + roll %r11d # movl %ecx, %edi # c xorl %edx, %edi # ^d xorl %ebx, %edi # ^b - leal -0x359d3e2a(%rbp,%rsi), %ebp # e += RCONST + mixed_W + leal -0x359D3E2A(%rbp,%r11), %ebp # e += RCONST + W[n & 15] addl %edi, %ebp # e += (c ^ d ^ b) movl %eax, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebp # e += rotl32(a,5) rorl $2, %ebx # b = rotl32(b,30) # 76 - movl %r9d, %esi # W[(n+13) & 15] - xorl -32+4*4(%rsp), %esi # ^W[(n+8) & 15] - xorl %r14d, %esi # ^W[(n+2) & 15] - xorl %r12d, %esi # ^W[n & 15] - roll %esi # - movl %esi, %r12d # store to W[n & 15] + xorl %r9d, %r12d # W[n & 15] ^= W[(n+13) & 15] + xorl -32+4*4(%rsp), %r12d # ^W[(n+8) & 15] + xorl %r14d, %r12d # ^W[(n+2) & 15] + roll %r12d # movl %ebx, %edi # c xorl %ecx, %edi # ^d xorl %eax, %edi # ^b - leal -0x359d3e2a(%rdx,%rsi), %edx # e += RCONST + mixed_W + leal -0x359D3E2A(%rdx,%r12), %edx # e += RCONST + W[n & 15] addl %edi, %edx # e += (c ^ d ^ b) movl %ebp, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %edx # e += rotl32(a,5) rorl $2, %eax # b = rotl32(b,30) # 77 - movl %r10d, %esi # W[(n+13) & 15] - xorl -32+4*5(%rsp), %esi # ^W[(n+8) & 15] - xorl %r15d, %esi # ^W[(n+2) & 15] - xorl %r13d, %esi # ^W[n & 15] - roll %esi # - # store to W[n & 15] - unused, not done + xorl %r10d, %r13d # W[n & 15] ^= W[(n+13) & 15] + xorl -32+4*5(%rsp), %r13d # ^W[(n+8) & 15] + xorl %r15d, %r13d # ^W[(n+2) & 15] + roll %r13d # movl %eax, %edi # c xorl %ebx, %edi # ^d xorl %ebp, %edi # ^b - leal -0x359d3e2a(%rcx,%rsi), %ecx # e += RCONST + mixed_W + leal -0x359D3E2A(%rcx,%r13), %ecx # e += RCONST + W[n & 15] addl %edi, %ecx # e += (c ^ d ^ b) movl %edx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ecx # e += rotl32(a,5) rorl $2, %ebp # b = rotl32(b,30) # 78 - movl %r11d, %esi # W[(n+13) & 15] - xorl -32+4*6(%rsp), %esi # ^W[(n+8) & 15] - xorl -32+4*0(%rsp), %esi # ^W[(n+2) & 15] - xorl %r14d, %esi # ^W[n & 
15] - roll %esi # - # store to W[n & 15] - unused, not done + xorl %r11d, %r14d # W[n & 15] ^= W[(n+13) & 15] + xorl -32+4*6(%rsp), %r14d # ^W[(n+8) & 15] + xorl -32+4*0(%rsp), %r14d # ^W[(n+2) & 15] + roll %r14d # movl %ebp, %edi # c xorl %eax, %edi # ^d xorl %edx, %edi # ^b - leal -0x359d3e2a(%rbx,%rsi), %ebx # e += RCONST + mixed_W + leal -0x359D3E2A(%rbx,%r14), %ebx # e += RCONST + W[n & 15] addl %edi, %ebx # e += (c ^ d ^ b) movl %ecx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebx # e += rotl32(a,5) rorl $2, %edx # b = rotl32(b,30) # 79 - movl %r12d, %esi # W[(n+13) & 15] - xorl -32+4*7(%rsp), %esi # ^W[(n+8) & 15] - xorl -32+4*1(%rsp), %esi # ^W[(n+2) & 15] - xorl %r15d, %esi # ^W[n & 15] - roll %esi # - # store to W[n & 15] - unused, not done + xorl %r12d, %r15d # W[n & 15] ^= W[(n+13) & 15] + xorl -32+4*7(%rsp), %r15d # ^W[(n+8) & 15] + xorl -32+4*1(%rsp), %r15d # ^W[(n+2) & 15] + roll %r15d # movl %edx, %edi # c xorl %ebp, %edi # ^d xorl %ecx, %edi # ^b - leal -0x359d3e2a(%rax,%rsi), %eax # e += RCONST + mixed_W + leal -0x359D3E2A(%rax,%r15), %eax # e += RCONST + W[n & 15] addl %edi, %eax # e += (c ^ d ^ b) movl %ebx, %esi # roll $5, %esi # rotl32(a,5) diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh new file mode 100755 index 000000000..931c0f0fd --- /dev/null +++ b/libbb/hash_md5_sha_x86-64.S.sh @@ -0,0 +1,267 @@ +#!/bin/sh + +# We don't regenerate it on every "make" invocation - only by hand. +# The reason is that the changes to generated code are difficult +# to visualize by looking only at this script, it helps when the commit +# also contains the diff of the generated file. 
+exec >hash_md5_sha_x86-64.S + +echo \ +'### Generated by hash_md5_sha_x86-64.S.sh ### + +#if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) + .section .text.sha1_process_block64,"ax",@progbits + .globl sha1_process_block64 + .hidden sha1_process_block64 + .type sha1_process_block64, @function + + .balign 8 # allow decoders to fetch at least 4 first insns +sha1_process_block64: + pushq %r15 # + pushq %r14 # + pushq %r13 # + pushq %r12 # + pushq %rbp # + pushq %rbx # + pushq %rdi # we need ctx at the end + +#Register and stack use: +# eax..edx: a..d +# ebp: e +# esi,edi: temps +# -32+4*n(%rsp),r8...r15: W[0..7,8..15] +# (TODO: actually W[0..7] are used a bit more often, put _thme_ into r8..r15?) + + movq 4*8(%rdi), %r8 + bswapq %r8 + movl %r8d, %r9d + shrq $32, %r8 + movq 4*10(%rdi), %r10 + bswapq %r10 + movl %r10d, %r11d + shrq $32, %r10 + movq 4*12(%rdi), %r12 + bswapq %r12 + movl %r12d, %r13d + shrq $32, %r12 + movq 4*14(%rdi), %r14 + bswapq %r14 + movl %r14d, %r15d + shrq $32, %r14 + + movl $3, %eax +1: + movq (%rdi,%rax,8), %rsi + bswapq %rsi + rolq $32, %rsi + movq %rsi, -32(%rsp,%rax,8) + decl %eax + jns 1b + movl 80(%rdi), %eax # a = ctx->hash[0] + movl 84(%rdi), %ebx # b = ctx->hash[1] + movl 88(%rdi), %ecx # c = ctx->hash[2] + movl 92(%rdi), %edx # d = ctx->hash[3] + movl 96(%rdi), %ebp # e = ctx->hash[4] +' +W32() { +test "$1" || exit 1 +test "$1" -lt 0 && exit 1 +test "$1" -gt 15 && exit 1 +test "$1" -lt 8 && echo "-32+4*$1(%rsp)" +test "$1" -ge 8 && echo "%r${1}d" +} + +RD1A() { +local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 +local n=$(($6)) +echo "# $n" +test $n = 0 && echo " + # W[0], already in %esi +";test $n != 0 && test $n -lt 8 && echo " + movl `W32 $n`, %esi # W[n] +";test $n -ge 8 && echo " + # W[n], in %r$n +";echo " + movl %e$c, %edi # c + xorl %e$d, %edi # ^d + andl %e$b, %edi # &b + xorl %e$d, %edi # (((c ^ d) & b) ^ d) +";test $n -lt 8 && echo " + leal $RCONST(%r$e,%rsi),%e$e # e += RCONST + W[n] +";test $n -ge 
8 && echo " + leal $RCONST(%r$e,%r$n),%e$e # e += RCONST + W[n] +";echo " + addl %edi, %e$e # e += (((c ^ d) & b) ^ d) + movl %e$a, %esi # + roll \$5, %esi # rotl32(a,5) + addl %esi, %e$e # e += rotl32(a,5) + rorl \$2, %e$b # b = rotl32(b,30) +" +} +RD1B() { +local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 +local n=$(($6)) +local n13=$(((n+13) & 15)) +local n8=$(((n+8) & 15)) +local n2=$(((n+2) & 15)) +local n0=$(((n+0) & 15)) +echo " +# $n +";test $n0 -lt 8 && echo " + movl `W32 $n13`, %esi # W[(n+13) & 15] + xorl `W32 $n8`, %esi # ^W[(n+8) & 15] + xorl `W32 $n2`, %esi # ^W[(n+2) & 15] + xorl `W32 $n0`, %esi # ^W[n & 15] + roll %esi # + movl %esi, `W32 $n0` # store to W[n & 15] +";test $n0 -ge 8 && echo " + xorl `W32 $n13`, `W32 $n0` # W[n & 15] ^= W[(n+13) & 15] + xorl `W32 $n8`, `W32 $n0` # ^W[(n+8) & 15] + xorl `W32 $n2`, `W32 $n0` # ^W[(n+2) & 15] + roll `W32 $n0` # +"; echo " + movl %e$c, %edi # c + xorl %e$d, %edi # ^d + andl %e$b, %edi # &b + xorl %e$d, %edi # (((c ^ d) & b) ^ d) +";test $n0 -lt 8 && echo " + leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n & 15] +";test $n0 -ge 8 && echo " + leal $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n & 15] +";echo " + addl %edi, %e$e # e += (((c ^ d) & b) ^ d) + movl %e$a, %esi # + roll \$5, %esi # rotl32(a,5) + addl %esi, %e$e # e += rotl32(a,5) + rorl \$2, %e$b # b = rotl32(b,30) +" +} +{ +RCONST=0x5A827999 +RD1A ax bx cx dx bp 0; RD1A bp ax bx cx dx 1; RD1A dx bp ax bx cx 2; RD1A cx dx bp ax bx 3; RD1A bx cx dx bp ax 4 +RD1A ax bx cx dx bp 5; RD1A bp ax bx cx dx 6; RD1A dx bp ax bx cx 7; RD1A cx dx bp ax bx 8; RD1A bx cx dx bp ax 9 +RD1A ax bx cx dx bp 10; RD1A bp ax bx cx dx 11; RD1A dx bp ax bx cx 12; RD1A cx dx bp ax bx 13; RD1A bx cx dx bp ax 14 +RD1A ax bx cx dx bp 15; RD1B bp ax bx cx dx 16; RD1B dx bp ax bx cx 17; RD1B cx dx bp ax bx 18; RD1B bx cx dx bp ax 19 +} | grep -v '^$' + +RD2() { +local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 +local n=$(($6)) +local n13=$(((n+13) & 15)) +local 
n8=$(((n+8) & 15)) +local n2=$(((n+2) & 15)) +local n0=$(((n+0) & 15)) +echo " +# $n +";test $n0 -lt 8 && echo " + movl `W32 $n13`, %esi # W[(n+13) & 15] + xorl `W32 $n8`, %esi # ^W[(n+8) & 15] + xorl `W32 $n2`, %esi # ^W[(n+2) & 15] + xorl `W32 $n0`, %esi # ^W[n & 15] + roll %esi # + movl %esi, `W32 $n0` # store to W[n & 15] +";test $n0 -ge 8 && echo " + xorl `W32 $n13`, `W32 $n0` # W[n & 15] ^= W[(n+13) & 15] + xorl `W32 $n8`, `W32 $n0` # ^W[(n+8) & 15] + xorl `W32 $n2`, `W32 $n0` # ^W[(n+2) & 15] + roll `W32 $n0` # +"; echo " + movl %e$c, %edi # c + xorl %e$d, %edi # ^d + xorl %e$b, %edi # ^b +";test $n0 -lt 8 && echo " + leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n & 15] +";test $n0 -ge 8 && echo " + leal $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n & 15] +";echo " + addl %edi, %e$e # e += (c ^ d ^ b) + movl %e$a, %esi # + roll \$5, %esi # rotl32(a,5) + addl %esi, %e$e # e += rotl32(a,5) + rorl \$2, %e$b # b = rotl32(b,30) +" +} +{ +RCONST=0x6ED9EBA1 +RD2 ax bx cx dx bp 20; RD2 bp ax bx cx dx 21; RD2 dx bp ax bx cx 22; RD2 cx dx bp ax bx 23; RD2 bx cx dx bp ax 24 +RD2 ax bx cx dx bp 25; RD2 bp ax bx cx dx 26; RD2 dx bp ax bx cx 27; RD2 cx dx bp ax bx 28; RD2 bx cx dx bp ax 29 +RD2 ax bx cx dx bp 30; RD2 bp ax bx cx dx 31; RD2 dx bp ax bx cx 32; RD2 cx dx bp ax bx 33; RD2 bx cx dx bp ax 34 +RD2 ax bx cx dx bp 35; RD2 bp ax bx cx dx 36; RD2 dx bp ax bx cx 37; RD2 cx dx bp ax bx 38; RD2 bx cx dx bp ax 39 +} | grep -v '^$' + +RD3() { +local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 +local n=$(($6)) +local n13=$(((n+13) & 15)) +local n8=$(((n+8) & 15)) +local n2=$(((n+2) & 15)) +local n0=$(((n+0) & 15)) +echo " +# $n + movl %e$b, %edi # di: b + movl %e$b, %esi # si: b + orl %e$c, %edi # di: b | c + andl %e$c, %esi # si: b & c + andl %e$d, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) +";test $n0 -lt 8 && echo " + movl `W32 $n13`, %esi # W[(n+13) & 15] + xorl `W32 $n8`, %esi # ^W[(n+8) & 15] + xorl `W32 $n2`, %esi # ^W[(n+2) & 15] + 
xorl `W32 $n0`, %esi # ^W[n & 15] + roll %esi # + movl %esi, `W32 $n0` # store to W[n & 15] +";test $n0 -ge 8 && echo " + xorl `W32 $n13`, `W32 $n0` # W[n & 15] ^= W[(n+13) & 15] + xorl `W32 $n8`, `W32 $n0` # ^W[(n+8) & 15] + xorl `W32 $n2`, `W32 $n0` # ^W[(n+2) & 15] + roll `W32 $n0` # +"; echo " + addl %edi, %e$e # += ((b | c) & d) | (b & c) +";test $n0 -lt 8 && echo " + leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n & 15] +";test $n0 -ge 8 && echo " + leal $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n & 15] +";echo " + movl %e$a, %esi # + roll \$5, %esi # rotl32(a,5) + addl %esi, %e$e # e += rotl32(a,5) + rorl \$2, %e$b # b = rotl32(b,30) +" +} +{ +#RCONST=0x8F1BBCDC "out of range for signed 32bit displacement" +RCONST=-0x70E44324 +RD3 ax bx cx dx bp 40; RD3 bp ax bx cx dx 41; RD3 dx bp ax bx cx 42; RD3 cx dx bp ax bx 43; RD3 bx cx dx bp ax 44 +RD3 ax bx cx dx bp 45; RD3 bp ax bx cx dx 46; RD3 dx bp ax bx cx 47; RD3 cx dx bp ax bx 48; RD3 bx cx dx bp ax 49 +RD3 ax bx cx dx bp 50; RD3 bp ax bx cx dx 51; RD3 dx bp ax bx cx 52; RD3 cx dx bp ax bx 53; RD3 bx cx dx bp ax 54 +RD3 ax bx cx dx bp 55; RD3 bp ax bx cx dx 56; RD3 dx bp ax bx cx 57; RD3 cx dx bp ax bx 58; RD3 bx cx dx bp ax 59 +} | grep -v '^$' + +# Round 4 has the same logic as round 2, only n and RCONST are different +{ +#RCONST=0xCA62C1D6 "out of range for signed 32bit displacement" +RCONST=-0x359D3E2A +RD2 ax bx cx dx bp 60; RD2 bp ax bx cx dx 61; RD2 dx bp ax bx cx 62; RD2 cx dx bp ax bx 63; RD2 bx cx dx bp ax 64 +RD2 ax bx cx dx bp 65; RD2 bp ax bx cx dx 66; RD2 dx bp ax bx cx 67; RD2 cx dx bp ax bx 68; RD2 bx cx dx bp ax 69 +RD2 ax bx cx dx bp 70; RD2 bp ax bx cx dx 71; RD2 dx bp ax bx cx 72; RD2 cx dx bp ax bx 73; RD2 bx cx dx bp ax 74 +RD2 ax bx cx dx bp 75; RD2 bp ax bx cx dx 76; RD2 dx bp ax bx cx 77; RD2 cx dx bp ax bx 78; RD2 bx cx dx bp ax 79 +} | grep -v '^$' + +echo " + popq %rdi # + addl %eax, 80(%rdi) # ctx->hash[0] += a + addl %ebx, 84(%rdi) # ctx->hash[1] += b + addl %ecx, 88(%rdi) # 
ctx->hash[2] += c + addl %edx, 92(%rdi) # ctx->hash[3] += d + addl %ebp, 96(%rdi) # ctx->hash[4] += e + popq %rbx # + popq %rbp # + popq %r12 # + popq %r13 # + popq %r14 # + popq %r15 # + + ret + .size sha1_process_block64, .-sha1_process_block64 +#endif" -- cgit v1.2.3-55-g6feb From 4387077f8e69c26ce5ce4a8119c225cc1c461f88 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Mon, 3 Jan 2022 13:14:09 +0100 Subject: typo fix Signed-off-by: Denys Vlasenko --- libbb/hash_md5_sha_x86-64.S | 2 +- libbb/hash_md5_sha_x86-64.S.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S index 3e1c4b455..ec4e63765 100644 --- a/libbb/hash_md5_sha_x86-64.S +++ b/libbb/hash_md5_sha_x86-64.S @@ -21,7 +21,7 @@ sha1_process_block64: # ebp: e # esi,edi: temps # -32+4*n(%rsp),r8...r15: W[0..7,8..15] -# (TODO: actually W[0..7] are used a bit more often, put _thme_ into r8..r15?) +# (TODO: actually W[0..7] are used a bit more often, put _them_ into r8..r15?) movq 4*8(%rdi), %r8 bswapq %r8 diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh index 931c0f0fd..5f09546b2 100755 --- a/libbb/hash_md5_sha_x86-64.S.sh +++ b/libbb/hash_md5_sha_x86-64.S.sh @@ -30,7 +30,7 @@ sha1_process_block64: # ebp: e # esi,edi: temps # -32+4*n(%rsp),r8...r15: W[0..7,8..15] -# (TODO: actually W[0..7] are used a bit more often, put _thme_ into r8..r15?) +# (TODO: actually W[0..7] are used a bit more often, put _them_ into r8..r15?) 
movq 4*8(%rdi), %r8 bswapq %r8 -- cgit v1.2.3-55-g6feb From 7abb2bb96e0cd584f44dd8b219ad16d0232a6485 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Mon, 3 Jan 2022 17:02:48 +0100 Subject: libbb/sha1: x86_64 version: tidying up, no code changes Signed-off-by: Denys Vlasenko --- libbb/hash_md5_sha_x86-64.S | 32 ++++++++++++++++---------------- libbb/hash_md5_sha_x86-64.S.sh | 33 ++++++++++++++++++++------------- 2 files changed, 36 insertions(+), 29 deletions(-) diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S index ec4e63765..95b85d80a 100644 --- a/libbb/hash_md5_sha_x86-64.S +++ b/libbb/hash_md5_sha_x86-64.S @@ -60,7 +60,7 @@ sha1_process_block64: xorl %edx, %edi # ^d andl %ebx, %edi # &b xorl %edx, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rbp,%rsi),%ebp # e += RCONST + W[n] + leal 0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n] addl %edi, %ebp # e += (((c ^ d) & b) ^ d) movl %eax, %esi # roll $5, %esi # rotl32(a,5) @@ -72,7 +72,7 @@ sha1_process_block64: xorl %ecx, %edi # ^d andl %eax, %edi # &b xorl %ecx, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rdx,%rsi),%edx # e += RCONST + W[n] + leal 0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n] addl %edi, %edx # e += (((c ^ d) & b) ^ d) movl %ebp, %esi # roll $5, %esi # rotl32(a,5) @@ -84,7 +84,7 @@ sha1_process_block64: xorl %ebx, %edi # ^d andl %ebp, %edi # &b xorl %ebx, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rcx,%rsi),%ecx # e += RCONST + W[n] + leal 0x5A827999(%rcx,%rsi), %ecx # e += RCONST + W[n] addl %edi, %ecx # e += (((c ^ d) & b) ^ d) movl %edx, %esi # roll $5, %esi # rotl32(a,5) @@ -96,7 +96,7 @@ sha1_process_block64: xorl %eax, %edi # ^d andl %edx, %edi # &b xorl %eax, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rbx,%rsi),%ebx # e += RCONST + W[n] + leal 0x5A827999(%rbx,%rsi), %ebx # e += RCONST + W[n] addl %edi, %ebx # e += (((c ^ d) & b) ^ d) movl %ecx, %esi # roll $5, %esi # rotl32(a,5) @@ -108,7 +108,7 @@ sha1_process_block64: xorl %ebp, %edi # ^d andl %ecx, 
%edi # &b xorl %ebp, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rax,%rsi),%eax # e += RCONST + W[n] + leal 0x5A827999(%rax,%rsi), %eax # e += RCONST + W[n] addl %edi, %eax # e += (((c ^ d) & b) ^ d) movl %ebx, %esi # roll $5, %esi # rotl32(a,5) @@ -120,7 +120,7 @@ sha1_process_block64: xorl %edx, %edi # ^d andl %ebx, %edi # &b xorl %edx, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rbp,%rsi),%ebp # e += RCONST + W[n] + leal 0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n] addl %edi, %ebp # e += (((c ^ d) & b) ^ d) movl %eax, %esi # roll $5, %esi # rotl32(a,5) @@ -132,7 +132,7 @@ sha1_process_block64: xorl %ecx, %edi # ^d andl %eax, %edi # &b xorl %ecx, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rdx,%rsi),%edx # e += RCONST + W[n] + leal 0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n] addl %edi, %edx # e += (((c ^ d) & b) ^ d) movl %ebp, %esi # roll $5, %esi # rotl32(a,5) @@ -144,7 +144,7 @@ sha1_process_block64: xorl %ebx, %edi # ^d andl %ebp, %edi # &b xorl %ebx, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rcx,%rsi),%ecx # e += RCONST + W[n] + leal 0x5A827999(%rcx,%rsi), %ecx # e += RCONST + W[n] addl %edi, %ecx # e += (((c ^ d) & b) ^ d) movl %edx, %esi # roll $5, %esi # rotl32(a,5) @@ -156,7 +156,7 @@ sha1_process_block64: xorl %eax, %edi # ^d andl %edx, %edi # &b xorl %eax, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rbx,%r8),%ebx # e += RCONST + W[n] + leal 0x5A827999(%rbx,%r8), %ebx # e += RCONST + W[n] addl %edi, %ebx # e += (((c ^ d) & b) ^ d) movl %ecx, %esi # roll $5, %esi # rotl32(a,5) @@ -168,7 +168,7 @@ sha1_process_block64: xorl %ebp, %edi # ^d andl %ecx, %edi # &b xorl %ebp, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rax,%r9),%eax # e += RCONST + W[n] + leal 0x5A827999(%rax,%r9), %eax # e += RCONST + W[n] addl %edi, %eax # e += (((c ^ d) & b) ^ d) movl %ebx, %esi # roll $5, %esi # rotl32(a,5) @@ -180,7 +180,7 @@ sha1_process_block64: xorl %edx, %edi # ^d andl %ebx, %edi # &b xorl %edx, %edi # (((c ^ d) & b) ^ d) - leal 
0x5A827999(%rbp,%r10),%ebp # e += RCONST + W[n] + leal 0x5A827999(%rbp,%r10), %ebp # e += RCONST + W[n] addl %edi, %ebp # e += (((c ^ d) & b) ^ d) movl %eax, %esi # roll $5, %esi # rotl32(a,5) @@ -192,7 +192,7 @@ sha1_process_block64: xorl %ecx, %edi # ^d andl %eax, %edi # &b xorl %ecx, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rdx,%r11),%edx # e += RCONST + W[n] + leal 0x5A827999(%rdx,%r11), %edx # e += RCONST + W[n] addl %edi, %edx # e += (((c ^ d) & b) ^ d) movl %ebp, %esi # roll $5, %esi # rotl32(a,5) @@ -204,7 +204,7 @@ sha1_process_block64: xorl %ebx, %edi # ^d andl %ebp, %edi # &b xorl %ebx, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rcx,%r12),%ecx # e += RCONST + W[n] + leal 0x5A827999(%rcx,%r12), %ecx # e += RCONST + W[n] addl %edi, %ecx # e += (((c ^ d) & b) ^ d) movl %edx, %esi # roll $5, %esi # rotl32(a,5) @@ -216,7 +216,7 @@ sha1_process_block64: xorl %eax, %edi # ^d andl %edx, %edi # &b xorl %eax, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rbx,%r13),%ebx # e += RCONST + W[n] + leal 0x5A827999(%rbx,%r13), %ebx # e += RCONST + W[n] addl %edi, %ebx # e += (((c ^ d) & b) ^ d) movl %ecx, %esi # roll $5, %esi # rotl32(a,5) @@ -228,7 +228,7 @@ sha1_process_block64: xorl %ebp, %edi # ^d andl %ecx, %edi # &b xorl %ebp, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rax,%r14),%eax # e += RCONST + W[n] + leal 0x5A827999(%rax,%r14), %eax # e += RCONST + W[n] addl %edi, %eax # e += (((c ^ d) & b) ^ d) movl %ebx, %esi # roll $5, %esi # rotl32(a,5) @@ -240,7 +240,7 @@ sha1_process_block64: xorl %edx, %edi # ^d andl %ebx, %edi # &b xorl %edx, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rbp,%r15),%ebp # e += RCONST + W[n] + leal 0x5A827999(%rbp,%r15), %ebp # e += RCONST + W[n] addl %edi, %ebp # e += (((c ^ d) & b) ^ d) movl %eax, %esi # roll $5, %esi # rotl32(a,5) diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh index 5f09546b2..c5f0ef504 100755 --- a/libbb/hash_md5_sha_x86-64.S.sh +++ b/libbb/hash_md5_sha_x86-64.S.sh @@ -74,22 
+74,24 @@ test "$1" -ge 8 && echo "%r${1}d" RD1A() { local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 local n=$(($6)) -echo "# $n" -test $n = 0 && echo " +local n0=$(((n+0) & 15)) +echo " +# $n +";test $n0 = 0 && echo " # W[0], already in %esi -";test $n != 0 && test $n -lt 8 && echo " - movl `W32 $n`, %esi # W[n] -";test $n -ge 8 && echo " - # W[n], in %r$n +";test $n0 != 0 && test $n0 -lt 8 && echo " + movl `W32 $n0`, %esi # W[n] +";test $n0 -ge 8 && echo " + # W[n], in %r$n0 ";echo " movl %e$c, %edi # c xorl %e$d, %edi # ^d andl %e$b, %edi # &b xorl %e$d, %edi # (((c ^ d) & b) ^ d) -";test $n -lt 8 && echo " - leal $RCONST(%r$e,%rsi),%e$e # e += RCONST + W[n] -";test $n -ge 8 && echo " - leal $RCONST(%r$e,%r$n),%e$e # e += RCONST + W[n] +";test $n0 -lt 8 && echo " + leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] +";test $n0 -ge 8 && echo " + leal $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n] ";echo " addl %edi, %e$e # e += (((c ^ d) & b) ^ d) movl %e$a, %esi # @@ -119,7 +121,7 @@ echo " xorl `W32 $n8`, `W32 $n0` # ^W[(n+8) & 15] xorl `W32 $n2`, `W32 $n0` # ^W[(n+2) & 15] roll `W32 $n0` # -"; echo " +";echo " movl %e$c, %edi # c xorl %e$d, %edi # ^d andl %e$b, %edi # &b @@ -165,7 +167,7 @@ echo " xorl `W32 $n8`, `W32 $n0` # ^W[(n+8) & 15] xorl `W32 $n2`, `W32 $n0` # ^W[(n+2) & 15] roll `W32 $n0` # -"; echo " +";echo " movl %e$c, %edi # c xorl %e$d, %edi # ^d xorl %e$b, %edi # ^b @@ -216,7 +218,7 @@ echo " xorl `W32 $n8`, `W32 $n0` # ^W[(n+8) & 15] xorl `W32 $n2`, `W32 $n0` # ^W[(n+2) & 15] roll `W32 $n0` # -"; echo " +";echo " addl %edi, %e$e # += ((b | c) & d) | (b & c) ";test $n0 -lt 8 && echo " leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n & 15] @@ -246,6 +248,11 @@ RD2 ax bx cx dx bp 60; RD2 bp ax bx cx dx 61; RD2 dx bp ax bx cx 62; RD2 cx dx b RD2 ax bx cx dx bp 65; RD2 bp ax bx cx dx 66; RD2 dx bp ax bx cx 67; RD2 cx dx bp ax bx 68; RD2 bx cx dx bp ax 69 RD2 ax bx cx dx bp 70; RD2 bp ax bx cx dx 71; RD2 dx bp ax bx cx 72; RD2 cx dx bp ax bx 
73; RD2 bx cx dx bp ax 74 RD2 ax bx cx dx bp 75; RD2 bp ax bx cx dx 76; RD2 dx bp ax bx cx 77; RD2 cx dx bp ax bx 78; RD2 bx cx dx bp ax 79 +# Note: new W[n&15] values generated in last 3 iterations +# (W[13,14,15]) are unused after each of these iterations. +# Since we use r8..r15 for W[8..15], this does not matter. +# If we switch to e.g. using r8..r15 for W[0..7], then saving of W[13,14,15] +# (the "movl %esi, `W32 $n0`" insn) is a dead store and can be removed. } | grep -v '^$' echo " -- cgit v1.2.3-55-g6feb From 1fc520ed286f815cae1da1e9f8014cb18a256744 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 4 Jan 2022 00:50:32 +0100 Subject: md5/shaXsum: use FEATURE_COPYBUF_KB to size the buffer instead of fixed 4k function old new delta md5_sha1_sum_main 536 565 +29 hash_file 419 401 -18 ------------------------------------------------------------------------------ (add/remove: 0/0 grow/shrink: 1/1 up/down: 29/-18) Total: 11 bytes In my test, for unrolled sha1, COPYBUF_KB=64 increases throughput from 367 MB/s to 457 MB/s. Signed-off-by: Denys Vlasenko --- coreutils/md5_sha1_sum.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/coreutils/md5_sha1_sum.c b/coreutils/md5_sha1_sum.c index 4efa23061..3b389cb6b 100644 --- a/coreutils/md5_sha1_sum.c +++ b/coreutils/md5_sha1_sum.c @@ -151,10 +151,12 @@ static unsigned char *hash_bin_to_hex(unsigned char *hash_value, return (unsigned char *)hex_value; } +#define BUFSZ (CONFIG_FEATURE_COPYBUF_KB < 4 ? 
4096 : CONFIG_FEATURE_COPYBUF_KB * 1024) + #if !ENABLE_SHA3SUM -# define hash_file(f,w) hash_file(f) +# define hash_file(b,f,w) hash_file(b,f) #endif -static uint8_t *hash_file(const char *filename, unsigned sha3_width) +static uint8_t *hash_file(unsigned char *in_buf, const char *filename, unsigned sha3_width) { int src_fd, hash_len, count; union _ctx_ { @@ -227,8 +229,7 @@ static uint8_t *hash_file(const char *filename, unsigned sha3_width) } { - RESERVE_CONFIG_UBUFFER(in_buf, 4096); - while ((count = safe_read(src_fd, in_buf, 4096)) > 0) { + while ((count = safe_read(src_fd, in_buf, BUFSZ)) > 0) { update(&context, in_buf, count); } hash_value = NULL; @@ -238,7 +239,6 @@ static uint8_t *hash_file(const char *filename, unsigned sha3_width) final(&context, in_buf); hash_value = hash_bin_to_hex(in_buf, hash_len); } - RELEASE_CONFIG_BUFFER(in_buf); } if (src_fd != STDIN_FILENO) { @@ -251,6 +251,7 @@ static uint8_t *hash_file(const char *filename, unsigned sha3_width) int md5_sha1_sum_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE; int md5_sha1_sum_main(int argc UNUSED_PARAM, char **argv) { + unsigned char *in_buf; int return_value = EXIT_SUCCESS; unsigned flags; #if ENABLE_SHA3SUM @@ -279,6 +280,12 @@ int md5_sha1_sum_main(int argc UNUSED_PARAM, char **argv) if (!*argv) *--argv = (char*)"-"; + /* The buffer is not alloc/freed for each input file: + * for big values of COPYBUF_KB, this helps to keep its pages + * pre-faulted and possibly even fully cached on local CPU. 
+ */ + in_buf = xmalloc(BUFSZ); + do { if (ENABLE_FEATURE_MD5_SHA1_SUM_CHECK && (flags & FLAG_CHECK)) { FILE *pre_computed_stream; @@ -310,7 +317,7 @@ int md5_sha1_sum_main(int argc UNUSED_PARAM, char **argv) *filename_ptr = '\0'; filename_ptr += 2; - hash_value = hash_file(filename_ptr, sha3_width); + hash_value = hash_file(in_buf, filename_ptr, sha3_width); if (hash_value && (strcmp((char*)hash_value, line) == 0)) { if (!(flags & FLAG_SILENT)) @@ -339,7 +346,7 @@ int md5_sha1_sum_main(int argc UNUSED_PARAM, char **argv) } fclose_if_not_stdin(pre_computed_stream); } else { - uint8_t *hash_value = hash_file(*argv, sha3_width); + uint8_t *hash_value = hash_file(in_buf, *argv, sha3_width); if (hash_value == NULL) { return_value = EXIT_FAILURE; } else { -- cgit v1.2.3-55-g6feb From c3cfcc92422f6e525073226cdbfdcb00ab1e7dc7 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 4 Jan 2022 01:45:13 +0100 Subject: libbb/sha1: x86_64 version: reorder prologue/epilogue insns Not clear exactly why, but this increases hashing speed on Skylake from 454 MB/s to 464 MB/s. 
Signed-off-by: Denys Vlasenko --- libbb/hash_md5_sha_x86-64.S | 60 ++++++++++++++++++------------------- libbb/hash_md5_sha_x86-64.S.sh | 67 +++++++++++++++++++++++------------------- 2 files changed, 67 insertions(+), 60 deletions(-) diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S index 95b85d80a..ff78fc049 100644 --- a/libbb/hash_md5_sha_x86-64.S +++ b/libbb/hash_md5_sha_x86-64.S @@ -6,14 +6,14 @@ .hidden sha1_process_block64 .type sha1_process_block64, @function - .balign 8 # allow decoders to fetch at least 4 first insns + .balign 8 # allow decoders to fetch at least 5 first insns sha1_process_block64: - pushq %r15 # - pushq %r14 # - pushq %r13 # - pushq %r12 # - pushq %rbp # - pushq %rbx # + pushq %rbp # 1 byte insn + pushq %rbx # 1 byte insn + pushq %r15 # 2 byte insn + pushq %r14 # 2 byte insn + pushq %r13 # 2 byte insn + pushq %r12 # 2 byte insn pushq %rdi # we need ctx at the end #Register and stack use: @@ -22,24 +22,6 @@ sha1_process_block64: # esi,edi: temps # -32+4*n(%rsp),r8...r15: W[0..7,8..15] # (TODO: actually W[0..7] are used a bit more often, put _them_ into r8..r15?) 
- - movq 4*8(%rdi), %r8 - bswapq %r8 - movl %r8d, %r9d - shrq $32, %r8 - movq 4*10(%rdi), %r10 - bswapq %r10 - movl %r10d, %r11d - shrq $32, %r10 - movq 4*12(%rdi), %r12 - bswapq %r12 - movl %r12d, %r13d - shrq $32, %r12 - movq 4*14(%rdi), %r14 - bswapq %r14 - movl %r14d, %r15d - shrq $32, %r14 - movl $3, %eax 1: movq (%rdi,%rax,8), %rsi @@ -48,12 +30,30 @@ sha1_process_block64: movq %rsi, -32(%rsp,%rax,8) decl %eax jns 1b + movl 80(%rdi), %eax # a = ctx->hash[0] movl 84(%rdi), %ebx # b = ctx->hash[1] movl 88(%rdi), %ecx # c = ctx->hash[2] movl 92(%rdi), %edx # d = ctx->hash[3] movl 96(%rdi), %ebp # e = ctx->hash[4] + movq 4*8(%rdi), %r8 + movq 4*10(%rdi), %r10 + bswapq %r8 + bswapq %r10 + movq 4*12(%rdi), %r12 + movq 4*14(%rdi), %r14 + bswapq %r12 + bswapq %r14 + movl %r8d, %r9d + shrq $32, %r8 + movl %r10d, %r11d + shrq $32, %r10 + movl %r12d, %r13d + shrq $32, %r12 + movl %r14d, %r15d + shrq $32, %r14 + # 0 # W[0], already in %esi movl %ecx, %edi # c @@ -1272,17 +1272,17 @@ sha1_process_block64: rorl $2, %ecx # b = rotl32(b,30) popq %rdi # + popq %r12 # addl %eax, 80(%rdi) # ctx->hash[0] += a + popq %r13 # addl %ebx, 84(%rdi) # ctx->hash[1] += b + popq %r14 # addl %ecx, 88(%rdi) # ctx->hash[2] += c + popq %r15 # addl %edx, 92(%rdi) # ctx->hash[3] += d - addl %ebp, 96(%rdi) # ctx->hash[4] += e popq %rbx # + addl %ebp, 96(%rdi) # ctx->hash[4] += e popq %rbp # - popq %r12 # - popq %r13 # - popq %r14 # - popq %r15 # ret .size sha1_process_block64, .-sha1_process_block64 diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh index c5f0ef504..7e50b64fb 100755 --- a/libbb/hash_md5_sha_x86-64.S.sh +++ b/libbb/hash_md5_sha_x86-64.S.sh @@ -15,14 +15,14 @@ echo \ .hidden sha1_process_block64 .type sha1_process_block64, @function - .balign 8 # allow decoders to fetch at least 4 first insns + .balign 8 # allow decoders to fetch at least 5 first insns sha1_process_block64: - pushq %r15 # - pushq %r14 # - pushq %r13 # - pushq %r12 # - pushq %rbp # - pushq 
%rbx # + pushq %rbp # 1 byte insn + pushq %rbx # 1 byte insn + pushq %r15 # 2 byte insn + pushq %r14 # 2 byte insn + pushq %r13 # 2 byte insn + pushq %r12 # 2 byte insn pushq %rdi # we need ctx at the end #Register and stack use: @@ -31,24 +31,6 @@ sha1_process_block64: # esi,edi: temps # -32+4*n(%rsp),r8...r15: W[0..7,8..15] # (TODO: actually W[0..7] are used a bit more often, put _them_ into r8..r15?) - - movq 4*8(%rdi), %r8 - bswapq %r8 - movl %r8d, %r9d - shrq $32, %r8 - movq 4*10(%rdi), %r10 - bswapq %r10 - movl %r10d, %r11d - shrq $32, %r10 - movq 4*12(%rdi), %r12 - bswapq %r12 - movl %r12d, %r13d - shrq $32, %r12 - movq 4*14(%rdi), %r14 - bswapq %r14 - movl %r14d, %r15d - shrq $32, %r14 - movl $3, %eax 1: movq (%rdi,%rax,8), %rsi @@ -57,11 +39,29 @@ sha1_process_block64: movq %rsi, -32(%rsp,%rax,8) decl %eax jns 1b + movl 80(%rdi), %eax # a = ctx->hash[0] movl 84(%rdi), %ebx # b = ctx->hash[1] movl 88(%rdi), %ecx # c = ctx->hash[2] movl 92(%rdi), %edx # d = ctx->hash[3] movl 96(%rdi), %ebp # e = ctx->hash[4] + + movq 4*8(%rdi), %r8 + movq 4*10(%rdi), %r10 + bswapq %r8 + bswapq %r10 + movq 4*12(%rdi), %r12 + movq 4*14(%rdi), %r14 + bswapq %r12 + bswapq %r14 + movl %r8d, %r9d + shrq $32, %r8 + movl %r10d, %r11d + shrq $32, %r10 + movl %r12d, %r13d + shrq $32, %r12 + movl %r14d, %r15d + shrq $32, %r14 ' W32() { test "$1" || exit 1 @@ -71,6 +71,13 @@ test "$1" -lt 8 && echo "-32+4*$1(%rsp)" test "$1" -ge 8 && echo "%r${1}d" } +# It's possible to interleave insns in rounds to mostly eliminate +# dependency chains, but this likely to only help old Pentium-based +# CPUs (ones without OOO, which can only simultaneously execute a pair +# of _adjacent_ insns). +# Testing on old-ish Silvermont CPU (which has OOO window of only +# about ~8 insns) shows very small (~1%) speedup. 
+ RD1A() { local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 local n=$(($6)) @@ -257,17 +264,17 @@ RD2 ax bx cx dx bp 75; RD2 bp ax bx cx dx 76; RD2 dx bp ax bx cx 77; RD2 cx dx b echo " popq %rdi # + popq %r12 # addl %eax, 80(%rdi) # ctx->hash[0] += a + popq %r13 # addl %ebx, 84(%rdi) # ctx->hash[1] += b + popq %r14 # addl %ecx, 88(%rdi) # ctx->hash[2] += c + popq %r15 # addl %edx, 92(%rdi) # ctx->hash[3] += d - addl %ebp, 96(%rdi) # ctx->hash[4] += e popq %rbx # + addl %ebp, 96(%rdi) # ctx->hash[4] += e popq %rbp # - popq %r12 # - popq %r13 # - popq %r14 # - popq %r15 # ret .size sha1_process_block64, .-sha1_process_block64 -- cgit v1.2.3-55-g6feb From ed2af2e82dbcfccb7392e9fbc3f837de1594c103 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 4 Jan 2022 14:32:41 +0100 Subject: build system: detect if build host has no bzip2 Signed-off-by: Denys Vlasenko --- scripts/bb_release | 6 +++--- scripts/embedded_scripts | 6 ++++++ scripts/mkconfigs | 11 +++++++++++ 3 files changed, 20 insertions(+), 3 deletions(-) diff --git a/scripts/bb_release b/scripts/bb_release index 545440d3a..180ad8f2e 100755 --- a/scripts/bb_release +++ b/scripts/bb_release @@ -17,7 +17,7 @@ VERSION=`ls busybox-*.tar.gz | sed 's/busybox-\(.*\)\.tar\.gz/\1/'` zcat busybox-$VERSION.tar.gz | bzip2 > busybox-$VERSION.tar.bz2 for releasefile in busybox-$VERSION.tar.gz busybox-$VERSION.tar.bz2; do - test -f $releasefile || { echo "no $releasefile"; exit 1; } - gpg --detach-sign $releasefile - sha256sum $releasefile > $releasefile.sha256 + test -f $releasefile || { echo "no $releasefile"; exit 1; } + gpg --detach-sign $releasefile + sha256sum $releasefile > $releasefile.sha256 done diff --git a/scripts/embedded_scripts b/scripts/embedded_scripts index aa7bf3e8a..205ac591a 100755 --- a/scripts/embedded_scripts +++ b/scripts/embedded_scripts @@ -23,6 +23,12 @@ if test $? != 0; then exit 1 fi +bzip2 </dev/null >/dev/null +if test $?
!= 0; then + echo 'bzip2 is not installed' + exit 1 +fi + custom_scripts="" if [ -d "$custom_loc" ] then diff --git a/scripts/mkconfigs b/scripts/mkconfigs index 6a26fe1dd..1bbf10c3a 100755 --- a/scripts/mkconfigs +++ b/scripts/mkconfigs @@ -28,6 +28,17 @@ config=.config +od -v -b </dev/null >/dev/null +if test $? != 0; then + echo 'od tool is not installed or cannot accept "-v -b" options' + exit 1 +fi +bzip2 </dev/null >/dev/null +if test $? != 0; then + echo 'bzip2 is not installed' + exit 1 +fi + { echo "\ #ifndef _BBCONFIGOPTS_H -- cgit v1.2.3-55-g6feb From 286b33721d5f6afd615f752ea83bbd72658c6bb9 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 4 Jan 2022 19:42:36 +0100 Subject: sed: correctly handle 'w FILE' commands writing to the same file function old new delta sed_xfopen_w - 84 +84 Signed-off-by: Denys Vlasenko --- editors/sed.c | 31 +++++++++++++++++++++++++++++-- testsuite/sed.tests | 9 +++++++++ 2 files changed, 38 insertions(+), 2 deletions(-) diff --git a/editors/sed.c b/editors/sed.c index e8c82ac63..48b0dbf67 100644 --- a/editors/sed.c +++ b/editors/sed.c @@ -97,6 +97,12 @@ enum { OPT_in_place = 1 << 0, }; +struct sed_FILE { + struct sed_FILE *next; /* Next (linked list, NULL terminated) */ + const char *fname; + FILE *fp; +}; + /* Each sed command turns into one of these structures. */ typedef struct sed_cmd_s { /* Ordered by alignment requirements: currently 36 bytes on x86 */ @@ -151,6 +157,11 @@ struct globals { /* linked list of append lines */ llist_t *append_head; + /* linked list of FILEs opened for 'w' and s///w'.
+ * Needed to handle duplicate fnames: sed '/a/w F;/b/w F' + */ + struct sed_FILE *FILE_head; + char *add_cmd_line; struct pipeline { @@ -211,6 +222,22 @@ static void sed_free_and_close_stuff(void) void sed_free_and_close_stuff(void); #endif +static FILE *sed_xfopen_w(const char *fname) +{ + struct sed_FILE **pp = &G.FILE_head; + struct sed_FILE *cur; + while ((cur = *pp) != NULL) { + if (strcmp(cur->fname, fname) == 0) + return cur->fp; + pp = &cur->next; + } + *pp = cur = xzalloc(sizeof(*cur)); + /*cur->next = NULL; - already is */ + cur->fname = xstrdup(fname); + cur->fp = xfopen_for_write(fname); + return cur->fp; +} + /* If something bad happens during -i operation, delete temp file */ static void cleanup_outname(void) @@ -446,7 +473,7 @@ static int parse_subst_cmd(sed_cmd_t *sed_cmd, const char *substr) { char *fname; idx += parse_file_cmd(/*sed_cmd,*/ substr+idx+1, &fname); - sed_cmd->sw_file = xfopen_for_write(fname); + sed_cmd->sw_file = sed_xfopen_w(fname); sed_cmd->sw_last_char = '\n'; free(fname); break; @@ -561,7 +588,7 @@ static const char *parse_cmd_args(sed_cmd_t *sed_cmd, const char *cmdstr) } cmdstr += parse_file_cmd(/*sed_cmd,*/ cmdstr, &sed_cmd->string); if (sed_cmd->cmd == 'w') { - sed_cmd->sw_file = xfopen_for_write(sed_cmd->string); + sed_cmd->sw_file = sed_xfopen_w(sed_cmd->string); sed_cmd->sw_last_char = '\n'; } } diff --git a/testsuite/sed.tests b/testsuite/sed.tests index 2b78c9b12..e62b839f7 100755 --- a/testsuite/sed.tests +++ b/testsuite/sed.tests @@ -405,6 +405,15 @@ testing "sed ^ OR not^" \ "" \ "abca\n" +# This only works if file name is exactly the same. +# For example, w FILE; w ./FILE won't work. 
+testing "sed understands duplicate file name" \ + "sed -n -e '/a/w sed.output' -e '/c/w sed.output' 2>&1 && cat sed.output && rm sed.output" \ + "a\nc\n" \ + "" \ + "a\nb\nc\n" + + # testing "description" "commands" "result" "infile" "stdin" exit $FAILCOUNT -- cgit v1.2.3-55-g6feb From 31f45c1b369bee73843f7d791313423997618448 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 4 Jan 2022 23:31:58 +0100 Subject: libbb: factor out fflush_stdout_and_exit(EXIT_SUCCESS) function old new delta fflush_stdout_and_exit_SUCCESS - 7 +7 xxd_main 890 888 -2 vlock_main 353 351 -2 uuencode_main 318 316 -2 uniq_main 427 425 -2 uname_main 250 248 -2 sort_main 853 851 -2 shuf_main 500 498 -2 route_main 238 236 -2 readlink_main 113 111 -2 nice_main 156 154 -2 last_main 957 955 -2 ipcs_main 960 958 -2 env_main 209 207 -2 chrt_main 464 462 -2 cal_main 921 919 -2 baseNUM_main 650 648 -2 ------------------------------------------------------------------------------ (add/remove: 1/0 grow/shrink: 0/16 up/down: 7/-32) Total: -25 bytes Signed-off-by: Denys Vlasenko --- coreutils/env.c | 2 +- coreutils/nice.c | 2 +- coreutils/readlink.c | 2 +- coreutils/shuf.c | 2 +- coreutils/sort.c | 2 +- coreutils/uname.c | 2 +- coreutils/uniq.c | 2 +- coreutils/uudecode.c | 2 +- coreutils/uuencode.c | 2 +- include/libbb.h | 1 + libbb/fflush_stdout_and_exit.c | 5 +++++ loginutils/vlock.c | 2 +- networking/route.c | 2 +- sysklogd/logread.c | 2 +- util-linux/cal.c | 2 +- util-linux/chrt.c | 2 +- util-linux/hexdump_xxd.c | 2 +- util-linux/ipcs.c | 8 ++++---- util-linux/last.c | 2 +- util-linux/last_fancy.c | 2 +- 20 files changed, 27 insertions(+), 21 deletions(-) diff --git a/coreutils/env.c b/coreutils/env.c index a0ea4dd27..6eafd06ef 100644 --- a/coreutils/env.c +++ b/coreutils/env.c @@ -100,7 +100,7 @@ int env_main(int argc UNUSED_PARAM, char **argv) } } - fflush_stdout_and_exit(EXIT_SUCCESS); + fflush_stdout_and_exit_SUCCESS(); } /* diff --git a/coreutils/nice.c b/coreutils/nice.c index 
28591ac61..e70da5d2b 100644 --- a/coreutils/nice.c +++ b/coreutils/nice.c @@ -33,7 +33,7 @@ int nice_main(int argc UNUSED_PARAM, char **argv) if (!*++argv) { /* No args, so (GNU) output current nice value. */ printf("%d\n", old_priority); - fflush_stdout_and_exit(EXIT_SUCCESS); + fflush_stdout_and_exit_SUCCESS(); } adjustment = 10; /* Set default adjustment. */ diff --git a/coreutils/readlink.c b/coreutils/readlink.c index 09d69df2b..b2e867883 100644 --- a/coreutils/readlink.c +++ b/coreutils/readlink.c @@ -96,5 +96,5 @@ int readlink_main(int argc UNUSED_PARAM, char **argv) printf((opt & 2) ? "%s" : "%s\n", buf); free(buf); - fflush_stdout_and_exit(EXIT_SUCCESS); + fflush_stdout_and_exit_SUCCESS(); } diff --git a/coreutils/shuf.c b/coreutils/shuf.c index 3def3d80f..337366b45 100644 --- a/coreutils/shuf.c +++ b/coreutils/shuf.c @@ -171,5 +171,5 @@ int shuf_main(int argc, char **argv) printf("%s%c", lines[i], eol); } - fflush_stdout_and_exit(EXIT_SUCCESS); + fflush_stdout_and_exit_SUCCESS(); } diff --git a/coreutils/sort.c b/coreutils/sort.c index 32a06e40a..0cbb6f597 100644 --- a/coreutils/sort.c +++ b/coreutils/sort.c @@ -644,5 +644,5 @@ int sort_main(int argc UNUSED_PARAM, char **argv) printf("%s%c", lines[i], ch); } - fflush_stdout_and_exit(EXIT_SUCCESS); + fflush_stdout_and_exit_SUCCESS(); } diff --git a/coreutils/uname.c b/coreutils/uname.c index da785ab4c..6c0bdf096 100644 --- a/coreutils/uname.c +++ b/coreutils/uname.c @@ -209,5 +209,5 @@ int uname_main(int argc UNUSED_PARAM, char **argv UNUSED_PARAM) #endif } - fflush_stdout_and_exit(EXIT_SUCCESS); /* coreutils-6.9 compat */ + fflush_stdout_and_exit_SUCCESS(); /* coreutils-6.9 compat */ } diff --git a/coreutils/uniq.c b/coreutils/uniq.c index a3058ac07..06c57f750 100644 --- a/coreutils/uniq.c +++ b/coreutils/uniq.c @@ -139,5 +139,5 @@ int uniq_main(int argc UNUSED_PARAM, char **argv) die_if_ferror(stdin, input_filename); - fflush_stdout_and_exit(EXIT_SUCCESS); + fflush_stdout_and_exit_SUCCESS(); } diff --git 
a/coreutils/uudecode.c b/coreutils/uudecode.c index e90902f52..63a8d4d48 100644 --- a/coreutils/uudecode.c +++ b/coreutils/uudecode.c @@ -352,7 +352,7 @@ int baseNUM_main(int argc UNUSED_PARAM, char **argv) #undef src_buf } - fflush_stdout_and_exit(EXIT_SUCCESS); + fflush_stdout_and_exit_SUCCESS(); } #endif diff --git a/coreutils/uuencode.c b/coreutils/uuencode.c index db49ec80a..f096e3122 100644 --- a/coreutils/uuencode.c +++ b/coreutils/uuencode.c @@ -78,5 +78,5 @@ int uuencode_main(int argc UNUSED_PARAM, char **argv) } printf(tbl == bb_uuenc_tbl_std ? "\n`\nend\n" : "\n====\n"); - fflush_stdout_and_exit(EXIT_SUCCESS); + fflush_stdout_and_exit_SUCCESS(); } diff --git a/include/libbb.h b/include/libbb.h index a48782832..8308d6259 100644 --- a/include/libbb.h +++ b/include/libbb.h @@ -1054,6 +1054,7 @@ void die_if_ferror(FILE *file, const char *msg) FAST_FUNC; void die_if_ferror_stdout(void) FAST_FUNC; int fflush_all(void) FAST_FUNC; void fflush_stdout_and_exit(int retval) NORETURN FAST_FUNC; +void fflush_stdout_and_exit_SUCCESS(void) NORETURN FAST_FUNC; int fclose_if_not_stdin(FILE *file) FAST_FUNC; FILE* xfopen(const char *filename, const char *mode) FAST_FUNC; /* Prints warning to stderr and returns NULL on failure: */ diff --git a/libbb/fflush_stdout_and_exit.c b/libbb/fflush_stdout_and_exit.c index 5df74170e..5a13ebcf8 100644 --- a/libbb/fflush_stdout_and_exit.c +++ b/libbb/fflush_stdout_and_exit.c @@ -20,3 +20,8 @@ void FAST_FUNC fflush_stdout_and_exit(int retval) * but use xfunc_die() */ xfunc_die(); } + +void FAST_FUNC fflush_stdout_and_exit_SUCCESS(void) +{ + fflush_stdout_and_exit(EXIT_SUCCESS); +} diff --git a/loginutils/vlock.c b/loginutils/vlock.c index 334b7d2ad..720835c4b 100644 --- a/loginutils/vlock.c +++ b/loginutils/vlock.c @@ -128,5 +128,5 @@ int vlock_main(int argc UNUSED_PARAM, char **argv) ioctl(STDIN_FILENO, VT_SETMODE, &ovtm); #endif tcsetattr_stdin_TCSANOW(&oterm); - fflush_stdout_and_exit(EXIT_SUCCESS); + fflush_stdout_and_exit_SUCCESS(); 
} diff --git a/networking/route.c b/networking/route.c index ff5daa8a7..26146f8e9 100644 --- a/networking/route.c +++ b/networking/route.c @@ -702,7 +702,7 @@ int route_main(int argc UNUSED_PARAM, char **argv) #endif bb_displayroutes(noresolve, opt & ROUTE_OPT_e); - fflush_stdout_and_exit(EXIT_SUCCESS); + fflush_stdout_and_exit_SUCCESS(); } /* Check verb. At the moment, must be add, del, or delete. */ diff --git a/sysklogd/logread.c b/sysklogd/logread.c index d5f8ca0a2..e6cfcf4a7 100644 --- a/sysklogd/logread.c +++ b/sysklogd/logread.c @@ -226,5 +226,5 @@ int logread_main(int argc UNUSED_PARAM, char **argv) /* shmdt(shbuf); - on Linux, shmdt is not mandatory on exit */ - fflush_stdout_and_exit(EXIT_SUCCESS); + fflush_stdout_and_exit_SUCCESS(); } diff --git a/util-linux/cal.c b/util-linux/cal.c index 6ba6ebf98..522ab3476 100644 --- a/util-linux/cal.c +++ b/util-linux/cal.c @@ -233,7 +233,7 @@ int cal_main(int argc UNUSED_PARAM, char **argv) } } - fflush_stdout_and_exit(EXIT_SUCCESS); + fflush_stdout_and_exit_SUCCESS(); } /* diff --git a/util-linux/chrt.c b/util-linux/chrt.c index 6799abb2d..be96fa426 100644 --- a/util-linux/chrt.c +++ b/util-linux/chrt.c @@ -110,7 +110,7 @@ int chrt_main(int argc UNUSED_PARAM, char **argv) show_min_max(SCHED_RR); show_min_max(SCHED_BATCH); show_min_max(SCHED_IDLE); - fflush_stdout_and_exit(EXIT_SUCCESS); + fflush_stdout_and_exit_SUCCESS(); } //if (opt & OPT_r) // policy = SCHED_RR; - default, already set diff --git a/util-linux/hexdump_xxd.c b/util-linux/hexdump_xxd.c index 76dada983..4372ac770 100644 --- a/util-linux/hexdump_xxd.c +++ b/util-linux/hexdump_xxd.c @@ -150,7 +150,7 @@ static void reverse(unsigned opt, const char *filename) free(buf); } //fclose(fp); - fflush_stdout_and_exit(EXIT_SUCCESS); + fflush_stdout_and_exit_SUCCESS(); } static void print_C_style(const char *p, const char *hdr) diff --git a/util-linux/ipcs.c b/util-linux/ipcs.c index ef2529c05..5973cbf57 100644 --- a/util-linux/ipcs.c +++ b/util-linux/ipcs.c @@ 
-600,15 +600,15 @@ int ipcs_main(int argc UNUSED_PARAM, char **argv) id = xatoi(opt_i); if (opt & flag_shm) { print_shm(id); - fflush_stdout_and_exit(EXIT_SUCCESS); + fflush_stdout_and_exit_SUCCESS(); } if (opt & flag_sem) { print_sem(id); - fflush_stdout_and_exit(EXIT_SUCCESS); + fflush_stdout_and_exit_SUCCESS(); } if (opt & flag_msg) { print_msg(id); - fflush_stdout_and_exit(EXIT_SUCCESS); + fflush_stdout_and_exit_SUCCESS(); } bb_show_usage(); } @@ -633,5 +633,5 @@ int ipcs_main(int argc UNUSED_PARAM, char **argv) do_sem(format); bb_putchar('\n'); } - fflush_stdout_and_exit(EXIT_SUCCESS); + fflush_stdout_and_exit_SUCCESS(); } diff --git a/util-linux/last.c b/util-linux/last.c index 24ce7a8d8..63751ca45 100644 --- a/util-linux/last.c +++ b/util-linux/last.c @@ -162,5 +162,5 @@ int last_main(int argc UNUSED_PARAM, char **argv UNUSED_PARAM) xlseek(file, pos, SEEK_SET); } - fflush_stdout_and_exit(EXIT_SUCCESS); + fflush_stdout_and_exit_SUCCESS(); } diff --git a/util-linux/last_fancy.c b/util-linux/last_fancy.c index e56e0ba85..648236229 100644 --- a/util-linux/last_fancy.c +++ b/util-linux/last_fancy.c @@ -296,5 +296,5 @@ int last_main(int argc UNUSED_PARAM, char **argv) if (ENABLE_FEATURE_CLEAN_UP) close(file); - fflush_stdout_and_exit(EXIT_SUCCESS); + fflush_stdout_and_exit_SUCCESS(); } -- cgit v1.2.3-55-g6feb From dfd8aafcf59c88662516a534a4334b3f08f58c88 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 4 Jan 2022 23:36:16 +0100 Subject: libbb: fflush_stdout_and_exit(0) still exits with _error_ (not 0!) 
if fflush fails function old new delta fflush_stdout_and_exit 36 40 +4 Signed-off-by: Denys Vlasenko --- libbb/fflush_stdout_and_exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libbb/fflush_stdout_and_exit.c b/libbb/fflush_stdout_and_exit.c index 5a13ebcf8..33e28ae34 100644 --- a/libbb/fflush_stdout_and_exit.c +++ b/libbb/fflush_stdout_and_exit.c @@ -13,9 +13,9 @@ */ void FAST_FUNC fflush_stdout_and_exit(int retval) { - xfunc_error_retval = retval; if (fflush(stdout)) bb_simple_perror_msg_and_die(bb_msg_standard_output); + xfunc_error_retval = retval; /* In case we are in NOFORK applet. Do not exit() directly, * but use xfunc_die() */ xfunc_die(); -- cgit v1.2.3-55-g6feb From cc7d2e21780c28608b00a4faf0fed297527bcbf4 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 4 Jan 2022 23:53:21 +0100 Subject: sort: fix -s -r interaction: 'stable' order is not affected by -r function old new delta compare_keys 818 820 +2 Signed-off-by: Denys Vlasenko --- coreutils/sort.c | 4 +++- testsuite/sort.tests | 13 +++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/coreutils/sort.c b/coreutils/sort.c index 0cbb6f597..9ff777851 100644 --- a/coreutils/sort.c +++ b/coreutils/sort.c @@ -380,7 +380,9 @@ static int compare_keys(const void *xarg, const void *yarg) /* If x > y, 1, else -1 */ retval = (x32 > y32) * 2 - 1; - } else + /* Here, -r has no effect! 
*/ + return retval; + } if (!(option_mask32 & FLAG_no_tie_break)) { /* fallback sort */ flags = option_mask32; diff --git a/testsuite/sort.tests b/testsuite/sort.tests index c51a8e475..5375f93de 100755 --- a/testsuite/sort.tests +++ b/testsuite/sort.tests @@ -175,6 +175,19 @@ testing "sort file in place" \ 111 " "" +testing "sort -sr (stable and reverse) does NOT reverse 'stable' ordering" \ +"sort -k2 -r -s input" "\ +b 2 +d 2 +a 1 +c 1 +" "\ +a 1 +b 2 +c 1 +d 2 +" "" + # testing "description" "command(s)" "result" "infile" "stdin" exit $FAILCOUNT -- cgit v1.2.3-55-g6feb From 34e0bb3931b595e7a48061255692ec4ff29499c5 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Wed, 5 Jan 2022 12:05:55 +0100 Subject: sort: support -h function old new delta static.scale_suffix - 62 +62 .rodata 104304 104336 +32 compare_keys 820 848 +28 packed_usage 34159 34184 +25 static.suffix - 9 +9 sort_opt_str 37 38 +1 ------------------------------------------------------------------------------ (add/remove: 2/0 grow/shrink: 4/0 up/down: 157/0) Total: 157 bytes Signed-off-by: Denys Vlasenko --- coreutils/sort.c | 95 +++++++++++++++++++++++++++++++++++----------------- testsuite/sort.tests | 29 ++++++++++++++++ 2 files changed, 93 insertions(+), 31 deletions(-) diff --git a/coreutils/sort.c b/coreutils/sort.c index 9ff777851..9aac656fe 100644 --- a/coreutils/sort.c +++ b/coreutils/sort.c @@ -18,7 +18,7 @@ //config: sort is used to sort lines of text in specified files. 
//config: //config:config FEATURE_SORT_BIG -//config: bool "Full SuSv3 compliant sort (support -ktcbdfiogM)" +//config: bool "Full SuSv3 compliant sort (support -ktcbdfioghM)" //config: default y //config: depends on SORT //config: help @@ -43,7 +43,7 @@ //usage:#define sort_trivial_usage //usage: "[-nru" -//usage: IF_FEATURE_SORT_BIG("gMcszbdfiokt] [-o FILE] [-k START[.OFS][OPTS][,END[.OFS][OPTS]] [-t CHAR") +//usage: IF_FEATURE_SORT_BIG("ghMcszbdfiokt] [-o FILE] [-k START[.OFS][OPTS][,END[.OFS][OPTS]] [-t CHAR") //usage: "] [FILE]..." //usage:#define sort_full_usage "\n\n" //usage: "Sort lines of text\n" @@ -59,6 +59,7 @@ //usage: "\n -n Sort numbers" //usage: IF_FEATURE_SORT_BIG( //usage: "\n -g General numerical sort" +//usage: "\n -h Sort human readable numbers (2K 1G)" //usage: "\n -M Sort month" //usage: "\n -V Sort version" //usage: "\n -t CHAR Field separator" @@ -94,31 +95,32 @@ enum { FLAG_n = 1 << 0, /* Numeric sort */ FLAG_g = 1 << 1, /* Sort using strtod() */ - FLAG_M = 1 << 2, /* Sort date */ - FLAG_V = 1 << 3, /* Sort version */ + FLAG_h = 1 << 2, /* Sort using strtod(), plus KMGT suffixes */ + FLAG_M = 1 << 3, /* Sort date */ + FLAG_V = 1 << 4, /* Sort version */ /* ucsz apply to root level only, not keys. 
b at root level implies bb */ - FLAG_u = 1 << 4, /* Unique */ - FLAG_c = 1 << 5, /* Check: no output, exit(!ordered) */ - FLAG_s = 1 << 6, /* Stable sort, no ascii fallback at end */ - FLAG_z = 1 << 7, /* Input and output is NUL terminated, not \n */ + FLAG_u = 1 << 5, /* Unique */ + FLAG_c = 1 << 6, /* Check: no output, exit(!ordered) */ + FLAG_s = 1 << 7, /* Stable sort, no ascii fallback at end */ + FLAG_z = 1 << 8, /* Input and output is NUL terminated, not \n */ /* These can be applied to search keys, the previous four can't */ - FLAG_b = 1 << 8, /* Ignore leading blanks */ - FLAG_r = 1 << 9, /* Reverse */ - FLAG_d = 1 << 10, /* Ignore !(isalnum()|isspace()) */ - FLAG_f = 1 << 11, /* Force uppercase */ - FLAG_i = 1 << 12, /* Ignore !isprint() */ - FLAG_m = 1 << 13, /* ignored: merge already sorted files; do not sort */ - FLAG_S = 1 << 14, /* ignored: -S, --buffer-size=SIZE */ - FLAG_T = 1 << 15, /* ignored: -T, --temporary-directory=DIR */ - FLAG_o = 1 << 16, - FLAG_k = 1 << 17, - FLAG_t = 1 << 18, + FLAG_b = 1 << 9, /* Ignore leading blanks */ + FLAG_r = 1 << 10, /* Reverse */ + FLAG_d = 1 << 11, /* Ignore !(isalnum()|isspace()) */ + FLAG_f = 1 << 12, /* Force uppercase */ + FLAG_i = 1 << 13, /* Ignore !isprint() */ + FLAG_m = 1 << 14, /* ignored: merge already sorted files; do not sort */ + FLAG_S = 1 << 15, /* ignored: -S, --buffer-size=SIZE */ + FLAG_T = 1 << 16, /* ignored: -T, --temporary-directory=DIR */ + FLAG_o = 1 << 17, + FLAG_k = 1 << 18, + FLAG_t = 1 << 19, FLAG_bb = 0x80000000, /* Ignore trailing blanks */ FLAG_no_tie_break = 0x40000000, }; static const char sort_opt_str[] ALIGN1 = "^" - "ngMVucszbrdfimS:T:o:k:*t:" + "nghMVucszbrdfimS:T:o:k:*t:" "\0" "o--o:t--t"/*-t, -o: at most one of each*/; /* * OPT_STR must not be string literal, needs to have stable address: @@ -253,6 +255,25 @@ static struct sort_key *add_key(void) #define GET_LINE(fp) xmalloc_fgetline(fp) #endif +#if ENABLE_FEATURE_SORT_BIG +static int scale_suffix(const char *tail) +{ + 
static const char suffix[] ALIGN1 = "kmgtpezy"; + const char *s; + int n; + + if (!tail[0]) + return -1; + s = strchr(suffix, tail[0] | 0x20); + if (!s) + return -1; + n = s - suffix; + if (n != 0 && tail[0] >= 'a') + return -1; /* mg... not accepted, only MG... */ + return n; +} +#endif + /* Iterate through keys list and perform comparisons */ static int compare_keys(const void *xarg, const void *yarg) { @@ -275,7 +296,7 @@ static int compare_keys(const void *xarg, const void *yarg) y = *(char **)yarg; #endif /* Perform actual comparison */ - switch (flags & (FLAG_n | FLAG_g | FLAG_M | FLAG_V)) { + switch (flags & (FLAG_n | FLAG_g | FLAG_h | FLAG_M | FLAG_V)) { default: bb_simple_error_msg_and_die("unknown sort type"); break; @@ -293,7 +314,8 @@ static int compare_keys(const void *xarg, const void *yarg) #endif break; #if ENABLE_FEATURE_SORT_BIG - case FLAG_g: { + case FLAG_g: + case FLAG_h: { char *xx, *yy; //TODO: needs setlocale(LC_NUMERIC, "C")? double dx = strtod(x, &xx); @@ -308,16 +330,26 @@ static int compare_keys(const void *xarg, const void *yarg) retval = (dy != dy) ? 0 : -1; else if (dy != dy) retval = 1; - /* Check for infinity. Could underflow, but it avoids libm. */ - else if (1.0 / dx == 0.0) { - if (dx < 0) - retval = (1.0 / dy == 0.0 && dy < 0) ? 0 : -1; + else { + if (flags & FLAG_h) { + int xs = scale_suffix(xx); + int ys = scale_suffix(yy); + if (xs != ys) { + retval = xs - ys; + break; + } + } + /* Check for infinity. Could underflow, but it avoids libm. */ + if (1.0 / dx == 0.0) { + if (dx < 0) + retval = (1.0 / dy == 0.0 && dy < 0) ? 0 : -1; + else + retval = (1.0 / dy == 0.0 && dy > 0) ? 0 : 1; + } else if (1.0 / dy == 0.0) + retval = (dy < 0) ? 1 : -1; else - retval = (1.0 / dy == 0.0 && dy > 0) ? 0 : 1; - } else if (1.0 / dy == 0.0) - retval = (dy < 0) ? 1 : -1; - else - retval = (dx > dy) ? 1 : ((dx < dy) ? -1 : 0); + retval = (dx > dy) ? 1 : ((dx < dy) ? 
-1 : 0); + } break; } case FLAG_M: { @@ -476,6 +508,7 @@ int sort_main(int argc UNUSED_PARAM, char **argv) FLAG_allowed_for_k = FLAG_n | /* Numeric sort */ FLAG_g | /* Sort using strtod() */ + FLAG_h | /* Sort using strtod(), plus KMGT suffixes */ FLAG_M | /* Sort date */ FLAG_b | /* Ignore leading blanks */ FLAG_r | /* Reverse */ diff --git a/testsuite/sort.tests b/testsuite/sort.tests index 5375f93de..ff33e21b4 100755 --- a/testsuite/sort.tests +++ b/testsuite/sort.tests @@ -188,6 +188,35 @@ c 1 d 2 " "" +testing "sort -h" \ +"sort -h input" "\ +3e +4m +5y +1023 +1024 +1025 +3000 +2K +3k +1M +2E +1Y +" "\ +1Y +5y +1M +2E +3k +3e +2K +4m +1023 +1025 +3000 +1024 +" "" + # testing "description" "command(s)" "result" "infile" "stdin" exit $FAILCOUNT -- cgit v1.2.3-55-g6feb From 076f5e064fa7b6cc2c03b030abcf2cbd60514180 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Wed, 5 Jan 2022 22:04:21 +0100 Subject: less: code shrink function old new delta restore_tty - 29 +29 less_main 2107 2105 -2 getch_nowait 253 251 -2 buffer_print 614 612 -2 less_exit 51 12 -39 ------------------------------------------------------------------------------ (add/remove: 1/0 grow/shrink: 0/4 up/down: 29/-45) Total: -16 bytes Signed-off-by: Denys Vlasenko --- miscutils/less.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/miscutils/less.c b/miscutils/less.c index 26983f40d..6825e5577 100644 --- a/miscutils/less.c +++ b/miscutils/less.c @@ -325,15 +325,18 @@ static void print_statusline(const char *str) } /* Exit the program gracefully */ -static void less_exit(int code) +static void restore_tty(void) { set_tty_cooked(); if (!(G.kbd_fd_orig_flags & O_NONBLOCK)) ndelay_off(kbd_fd); clear_line(); - if (code < 0) - kill_myself_with_sig(- code); /* does not return */ - exit(code); +} + +static void less_exit(void) +{ + restore_tty(); + exit(EXIT_SUCCESS); } #if (ENABLE_FEATURE_LESS_DASHCMD && ENABLE_FEATURE_LESS_LINENUMS) \ @@ -913,7 +916,7 @@ static 
void buffer_print(void) ) { i = option_mask32 & FLAG_F ? 0 : cur_fline; if (max_fline - i <= max_displayed_line) - less_exit(EXIT_SUCCESS); + less_exit(); } status_print(); } @@ -1146,7 +1149,7 @@ static int64_t getch_nowait(void) goto again; } /* EOF/error (ssh session got killed etc) */ - less_exit(EXIT_SUCCESS); + less_exit(); } set_tty_cooked(); return key64; @@ -1297,7 +1300,7 @@ static void colon_process(void) change_file(-1); break; case 'q': - less_exit(EXIT_SUCCESS); + less_exit(); break; case 'x': change_file(0); @@ -1715,7 +1718,7 @@ static void keypress_process(int keypress) buffer_line(cur_fline); break; case 'q': case 'Q': - less_exit(EXIT_SUCCESS); + less_exit(); break; #if ENABLE_FEATURE_LESS_MARKS case 'm': @@ -1793,7 +1796,8 @@ static void keypress_process(int keypress) static void sig_catcher(int sig) { - less_exit(- sig); + restore_tty(); + kill_myself_with_sig(sig); /* does not return */ } #if ENABLE_FEATURE_LESS_WINCH -- cgit v1.2.3-55-g6feb From db5546ca101846f18294a43b39883bc4ff53613a Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Wed, 5 Jan 2022 22:16:06 +0100 Subject: libbb: code shrink: introduce and use [_]exit_SUCCESS() function old new delta exit_SUCCESS - 7 +7 _exit_SUCCESS - 7 +7 run_pipe 1562 1567 +5 pseudo_exec_argv 399 400 +1 finish 86 87 +1 start_stop_daemon_main 1109 1107 -2 shutdown_on_signal 38 36 -2 runsv_main 1662 1660 -2 redirect 1070 1068 -2 read_line 79 77 -2 pause_and_low_level_reboot 54 52 -2 list_i2c_busses_and_exit 483 481 -2 less_exit 12 10 -2 identify 4123 4121 -2 grep_file 1161 1159 -2 getty_main 1519 1517 -2 fsck_minix_main 2681 2679 -2 free_session 132 130 -2 fdisk_main 4739 4737 -2 clean_up_and_exit 53 51 -2 bsd_select 1566 1564 -2 bb_daemonize_or_rexec 198 196 -2 ------------------------------------------------------------------------------ (add/remove: 2/0 grow/shrink: 3/17 up/down: 21/-34) Total: -13 bytes Signed-off-by: Denys Vlasenko --- debianutils/start_stop_daemon.c | 4 ++-- findutils/grep.c | 2 +- 
include/libbb.h | 2 ++ init/init.c | 6 +++--- libbb/vfork_daemon_rexec.c | 4 ++-- libbb/xfuncs.c | 10 ++++++++++ loginutils/getty.c | 6 +++--- loginutils/login.c | 2 +- miscutils/devfsd.c | 4 ++-- miscutils/hdparm.c | 2 +- miscutils/i2c_tools.c | 2 +- miscutils/less.c | 4 ++-- miscutils/watchdog.c | 2 +- modutils/modprobe-small.c | 2 +- networking/arping.c | 2 +- networking/inetd.c | 2 +- networking/nc.c | 2 +- networking/telnetd.c | 2 +- runit/runsv.c | 2 +- shell/ash.c | 2 +- shell/hush.c | 4 ++-- util-linux/fdisk.c | 4 ++-- util-linux/fdisk_osf.c | 4 ++-- util-linux/fsck_minix.c | 2 +- 24 files changed, 45 insertions(+), 33 deletions(-) diff --git a/debianutils/start_stop_daemon.c b/debianutils/start_stop_daemon.c index 68df44ae9..3e5dd9faa 100644 --- a/debianutils/start_stop_daemon.c +++ b/debianutils/start_stop_daemon.c @@ -519,7 +519,7 @@ int start_stop_daemon_main(int argc UNUSED_PARAM, char **argv) /* why _exit? the child may have changed the stack, * so "return 0" may do bad things */ - _exit(EXIT_SUCCESS); + _exit_SUCCESS(); } /* Child */ setsid(); /* detach from controlling tty */ @@ -531,7 +531,7 @@ int start_stop_daemon_main(int argc UNUSED_PARAM, char **argv) */ pid = xvfork(); if (pid != 0) - _exit(EXIT_SUCCESS); /* Parent */ + _exit_SUCCESS(); /* Parent */ } if (opt & OPT_MAKEPID) { /* User wants _us_ to make the pidfile */ diff --git a/findutils/grep.c b/findutils/grep.c index 8600d72fa..0b72812f1 100644 --- a/findutils/grep.c +++ b/findutils/grep.c @@ -470,7 +470,7 @@ static int grep_file(FILE *file) * "exit immediately with zero status * if any match is found, * even if errors were detected" */ - exit(EXIT_SUCCESS); + exit_SUCCESS(); } /* -l "print filenames with matches": stop after the first match */ if (option_mask32 & OPT_l) { diff --git a/include/libbb.h b/include/libbb.h index 8308d6259..c93058f6d 100644 --- a/include/libbb.h +++ b/include/libbb.h @@ -1278,6 +1278,8 @@ void set_task_comm(const char *comm) FAST_FUNC; # define 
re_execed_comm() 0 # define set_task_comm(name) ((void)0) #endif +void exit_SUCCESS(void) NORETURN FAST_FUNC; +void _exit_SUCCESS(void) NORETURN FAST_FUNC; /* Helpers for daemonization. * diff --git a/init/init.c b/init/init.c index efab5dcb4..785a3b460 100644 --- a/init/init.c +++ b/init/init.c @@ -744,7 +744,7 @@ static void pause_and_low_level_reboot(unsigned magic) pid = vfork(); if (pid == 0) { /* child */ reboot(magic); - _exit(EXIT_SUCCESS); + _exit_SUCCESS(); } /* Used to have "while (1) sleep(1)" here. * However, in containers reboot() call is ignored, and with that loop @@ -752,7 +752,7 @@ static void pause_and_low_level_reboot(unsigned magic) */ waitpid(pid, NULL, 0); sleep1(); /* paranoia */ - _exit(EXIT_SUCCESS); + _exit_SUCCESS(); } static void run_shutdown_and_kill_processes(void) @@ -942,7 +942,7 @@ static void reload_inittab(void) for (a = G.init_action_list; a; a = a->next) if (a->action_type == 0 && a->pid != 0) kill(a->pid, SIGKILL); - _exit(EXIT_SUCCESS); + _exit_SUCCESS(); } } #endif diff --git a/libbb/vfork_daemon_rexec.c b/libbb/vfork_daemon_rexec.c index 31e97051f..79141936a 100644 --- a/libbb/vfork_daemon_rexec.c +++ b/libbb/vfork_daemon_rexec.c @@ -308,7 +308,7 @@ void FAST_FUNC bb_daemonize_or_rexec(int flags, char **argv) /* fflush_all(); - add it in fork_or_rexec() if necessary */ if (fork_or_rexec(argv)) - _exit(EXIT_SUCCESS); /* parent */ + _exit_SUCCESS(); /* parent */ /* if daemonizing, detach from stdio & ctty */ setsid(); dup2(fd, 0); @@ -320,7 +320,7 @@ void FAST_FUNC bb_daemonize_or_rexec(int flags, char **argv) // * Prevent this: stop being a session leader. 
// */ // if (fork_or_rexec(argv)) -// _exit(EXIT_SUCCESS); /* parent */ +// _exit_SUCCESS(); /* parent */ // } } while (fd > 2) { diff --git a/libbb/xfuncs.c b/libbb/xfuncs.c index c40dcb706..465e5366c 100644 --- a/libbb/xfuncs.c +++ b/libbb/xfuncs.c @@ -423,3 +423,13 @@ int FAST_FUNC wait4pid(pid_t pid) return WTERMSIG(status) + 0x180; return 0; } + +void FAST_FUNC exit_SUCCESS(void) +{ + exit(EXIT_SUCCESS); +} + +void FAST_FUNC _exit_SUCCESS(void) +{ + _exit(EXIT_SUCCESS); +} diff --git a/loginutils/getty.c b/loginutils/getty.c index 6c6d409f4..cd6378d80 100644 --- a/loginutils/getty.c +++ b/loginutils/getty.c @@ -484,7 +484,7 @@ static char *get_logname(void) if (read(STDIN_FILENO, &c, 1) < 1) { finalize_tty_attrs(); if (errno == EINTR || errno == EIO) - exit(EXIT_SUCCESS); + exit_SUCCESS(); bb_simple_perror_msg_and_die(bb_msg_read_error); } @@ -511,7 +511,7 @@ static char *get_logname(void) case CTL('C'): case CTL('D'): finalize_tty_attrs(); - exit(EXIT_SUCCESS); + exit_SUCCESS(); case '\0': /* BREAK. 
If we have speeds to try, * return NULL (will switch speeds and return here) */ @@ -538,7 +538,7 @@ static char *get_logname(void) static void alarm_handler(int sig UNUSED_PARAM) { finalize_tty_attrs(); - _exit(EXIT_SUCCESS); + _exit_SUCCESS(); } static void sleep10(void) diff --git a/loginutils/login.c b/loginutils/login.c index ce87e318a..569053c12 100644 --- a/loginutils/login.c +++ b/loginutils/login.c @@ -312,7 +312,7 @@ static void alarm_handler(int sig UNUSED_PARAM) /* unix API is brain damaged regarding O_NONBLOCK, * we should undo it, or else we can affect other processes */ ndelay_off(STDOUT_FILENO); - _exit(EXIT_SUCCESS); + _exit_SUCCESS(); } int login_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE; diff --git a/miscutils/devfsd.c b/miscutils/devfsd.c index e5bb8a2d8..839d00fd0 100644 --- a/miscutils/devfsd.c +++ b/miscutils/devfsd.c @@ -453,7 +453,7 @@ int devfsd_main(int argc, char **argv) DEVFSD_PROTOCOL_REVISION_DAEMON, bb_msg_proto_rev, proto_rev); if (DEVFSD_PROTOCOL_REVISION_DAEMON != proto_rev) bb_error_msg_and_die("%s mismatch!", bb_msg_proto_rev); - exit(EXIT_SUCCESS); /* -v */ + exit_SUCCESS(); /* -v */ } /* Tell kernel we are special(i.e. 
we get to see hidden entries) */ xioctl(fd, DEVFSDIOC_SET_EVENT_MASK, 0); @@ -474,7 +474,7 @@ int devfsd_main(int argc, char **argv) dir_operation(SERVICE, mount_point, 0, NULL); if (ENABLE_DEVFSD_FG_NP && no_polling) - exit(EXIT_SUCCESS); + exit_SUCCESS(); if (ENABLE_DEVFSD_VERBOSE || ENABLE_DEBUG) logmode = LOGMODE_BOTH; diff --git a/miscutils/hdparm.c b/miscutils/hdparm.c index 01b4e8e2e..d8d8f6166 100644 --- a/miscutils/hdparm.c +++ b/miscutils/hdparm.c @@ -1271,7 +1271,7 @@ static void identify(uint16_t *val) } } - exit(EXIT_SUCCESS); + exit_SUCCESS(); } #endif diff --git a/miscutils/i2c_tools.c b/miscutils/i2c_tools.c index b25d49792..e3741eeba 100644 --- a/miscutils/i2c_tools.c +++ b/miscutils/i2c_tools.c @@ -1212,7 +1212,7 @@ static void NORETURN list_i2c_busses_and_exit(void) } } - exit(EXIT_SUCCESS); + exit_SUCCESS(); } static void NORETURN no_support(const char *cmd) diff --git a/miscutils/less.c b/miscutils/less.c index 6825e5577..82c4b21f0 100644 --- a/miscutils/less.c +++ b/miscutils/less.c @@ -333,10 +333,10 @@ static void restore_tty(void) clear_line(); } -static void less_exit(void) +static NOINLINE void less_exit(void) { restore_tty(); - exit(EXIT_SUCCESS); + exit_SUCCESS(); } #if (ENABLE_FEATURE_LESS_DASHCMD && ENABLE_FEATURE_LESS_LINENUMS) \ diff --git a/miscutils/watchdog.c b/miscutils/watchdog.c index d8e9c78f5..9f5a4b849 100644 --- a/miscutils/watchdog.c +++ b/miscutils/watchdog.c @@ -76,7 +76,7 @@ static void shutdown_on_signal(int sig UNUSED_PARAM) { remove_pidfile_std_path_and_ext("watchdog"); shutdown_watchdog(); - _exit(EXIT_SUCCESS); + _exit_SUCCESS(); } static void watchdog_open(const char* device) diff --git a/modutils/modprobe-small.c b/modutils/modprobe-small.c index db44a2ed0..b61651621 100644 --- a/modutils/modprobe-small.c +++ b/modutils/modprobe-small.c @@ -415,7 +415,7 @@ static FAST_FUNC int fileAction(struct recursive_state *state, /* Load was successful, there is nothing else to do. 
* This can happen ONLY for "top-level" module load, * not a dep, because deps don't do dirscan. */ - exit(EXIT_SUCCESS); + exit_SUCCESS(); } } diff --git a/networking/arping.c b/networking/arping.c index d44d7d697..86f0221ed 100644 --- a/networking/arping.c +++ b/networking/arping.c @@ -159,7 +159,7 @@ static void finish(void) if (option_mask32 & DAD) exit(!!received); if (option_mask32 & UNSOLICITED) - exit(EXIT_SUCCESS); + exit_SUCCESS(); exit(!received); } diff --git a/networking/inetd.c b/networking/inetd.c index e5352a555..e71be51c3 100644 --- a/networking/inetd.c +++ b/networking/inetd.c @@ -1208,7 +1208,7 @@ static void clean_up_and_exit(int sig UNUSED_PARAM) close(sep->se_fd); } remove_pidfile_std_path_and_ext("inetd"); - exit(EXIT_SUCCESS); + exit_SUCCESS(); } int inetd_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE; diff --git a/networking/nc.c b/networking/nc.c index d351bf72a..ab1316339 100644 --- a/networking/nc.c +++ b/networking/nc.c @@ -268,7 +268,7 @@ int nc_main(int argc, char **argv) nread = safe_read(pfds[fdidx].fd, iobuf, COMMON_BUFSIZE); if (fdidx != 0) { if (nread < 1) - exit(EXIT_SUCCESS); + exit_SUCCESS(); ofd = STDOUT_FILENO; } else { if (nread < 1) { diff --git a/networking/telnetd.c b/networking/telnetd.c index 581da1924..0805e464f 100644 --- a/networking/telnetd.c +++ b/networking/telnetd.c @@ -582,7 +582,7 @@ free_session(struct tsession *ts) struct tsession *t; if (option_mask32 & OPT_INETD) - exit(EXIT_SUCCESS); + exit_SUCCESS(); /* Unlink this telnet session from the session list */ t = G.sessions; diff --git a/runit/runsv.c b/runit/runsv.c index a4b8af494..6ad6bf46e 100644 --- a/runit/runsv.c +++ b/runit/runsv.c @@ -700,7 +700,7 @@ int runsv_main(int argc UNUSED_PARAM, char **argv) if (svd[0].sd_want == W_EXIT && svd[0].state == S_DOWN) { if (svd[1].pid == 0) - _exit(EXIT_SUCCESS); + _exit_SUCCESS(); if (svd[1].sd_want != W_EXIT) { svd[1].sd_want = W_EXIT; /* stopservice(&svd[1]); */ diff --git a/shell/ash.c b/shell/ash.c 
index 827643808..4a8ec0c03 100644 --- a/shell/ash.c +++ b/shell/ash.c @@ -5505,7 +5505,7 @@ openhere(union node *redir) ignoresig(SIGTSTP); //signal(SIGTSTP, SIG_IGN); signal(SIGPIPE, SIG_DFL); xwrite(pip[1], p, len); - _exit(EXIT_SUCCESS); + _exit_SUCCESS(); } out: close(pip[1]); diff --git a/shell/hush.c b/shell/hush.c index 6a27b1634..982fc356a 100644 --- a/shell/hush.c +++ b/shell/hush.c @@ -8587,7 +8587,7 @@ static NOINLINE void pseudo_exec_argv(nommu_save_t *nommu_save, * expand_assignments(): think about ... | var=`sleep 1` | ... */ free_strings(new_env); - _exit(EXIT_SUCCESS); + _exit_SUCCESS(); } sv_shadowed = G.shadowed_vars_pp; @@ -8768,7 +8768,7 @@ static void pseudo_exec(nommu_save_t *nommu_save, /* Case when we are here: ... | >file */ debug_printf_exec("pseudo_exec'ed null command\n"); - _exit(EXIT_SUCCESS); + _exit_SUCCESS(); } #if ENABLE_HUSH_JOB diff --git a/util-linux/fdisk.c b/util-linux/fdisk.c index 1c2a7d683..9c393b8fc 100644 --- a/util-linux/fdisk.c +++ b/util-linux/fdisk.c @@ -665,7 +665,7 @@ read_line(const char *prompt) sz = read_line_input(NULL, prompt, line_buffer, sizeof(line_buffer)); if (sz <= 0) - exit(EXIT_SUCCESS); /* Ctrl-D or Ctrl-C */ + exit_SUCCESS(); /* Ctrl-D or Ctrl-C */ if (line_buffer[sz-1] == '\n') line_buffer[--sz] = '\0'; @@ -2855,7 +2855,7 @@ xselect(void) if (ENABLE_FEATURE_CLEAN_UP) close_dev_fd(); bb_putchar('\n'); - exit(EXIT_SUCCESS); + exit_SUCCESS(); case 'r': return; case 's': diff --git a/util-linux/fdisk_osf.c b/util-linux/fdisk_osf.c index 765740ff1..6c66c130d 100644 --- a/util-linux/fdisk_osf.c +++ b/util-linux/fdisk_osf.c @@ -383,7 +383,7 @@ bsd_select(void) if (xbsd_readlabel(NULL) == 0) if (xbsd_create_disklabel() == 0) - exit(EXIT_SUCCESS); + exit_SUCCESS(); #endif @@ -411,7 +411,7 @@ bsd_select(void) case 'q': if (ENABLE_FEATURE_CLEAN_UP) close_dev_fd(); - exit(EXIT_SUCCESS); + exit_SUCCESS(); case 'r': return; case 's': diff --git a/util-linux/fsck_minix.c b/util-linux/fsck_minix.c index 
40b86d01b..dd2265c32 100644 --- a/util-linux/fsck_minix.c +++ b/util-linux/fsck_minix.c @@ -423,7 +423,7 @@ static void check_mount(void) cont = ask("Do you really want to continue", 0); if (!cont) { puts("Check aborted"); - exit(EXIT_SUCCESS); + exit_SUCCESS(); } } } -- cgit v1.2.3-55-g6feb From 6062c0d19bc201cbeb61b8875598cdd7a14a5ae0 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Wed, 5 Jan 2022 23:02:13 +0100 Subject: libbb: change xstrndup, xmemdup to take size_t as size parameter Also, remove entirely usually-disabled paranoia check (was also using wrong config option to enable itself). Signed-off-by: Denys Vlasenko --- include/libbb.h | 4 ++-- libbb/xfuncs_printf.c | 7 ++----- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/include/libbb.h b/include/libbb.h index c93058f6d..daa310776 100644 --- a/include/libbb.h +++ b/include/libbb.h @@ -429,8 +429,8 @@ void *xrealloc(void *old, size_t size) FAST_FUNC; xrealloc_vector_helper((vector), (sizeof((vector)[0]) << 8) + (shift), (idx)) void* xrealloc_vector_helper(void *vector, unsigned sizeof_and_shift, int idx) FAST_FUNC; char *xstrdup(const char *s) FAST_FUNC RETURNS_MALLOC; -char *xstrndup(const char *s, int n) FAST_FUNC RETURNS_MALLOC; -void *xmemdup(const void *s, int n) FAST_FUNC RETURNS_MALLOC; +char *xstrndup(const char *s, size_t n) FAST_FUNC RETURNS_MALLOC; +void *xmemdup(const void *s, size_t n) FAST_FUNC RETURNS_MALLOC; void *mmap_read(int fd, size_t size) FAST_FUNC; void *mmap_anon(size_t size) FAST_FUNC; void *xmmap_anon(size_t size) FAST_FUNC; diff --git a/libbb/xfuncs_printf.c b/libbb/xfuncs_printf.c index d29acebcd..fc630d176 100644 --- a/libbb/xfuncs_printf.c +++ b/libbb/xfuncs_printf.c @@ -91,13 +91,10 @@ char* FAST_FUNC xstrdup(const char *s) // Die if we can't allocate n+1 bytes (space for the null terminator) and copy // the (possibly truncated to length n) string into it. 
-char* FAST_FUNC xstrndup(const char *s, int n) +char* FAST_FUNC xstrndup(const char *s, size_t n) { char *t; - if (ENABLE_DEBUG && s == NULL) - bb_simple_error_msg_and_die("xstrndup bug"); - t = strndup(s, n); if (t == NULL) @@ -106,7 +103,7 @@ char* FAST_FUNC xstrndup(const char *s, int n) return t; } -void* FAST_FUNC xmemdup(const void *s, int n) +void* FAST_FUNC xmemdup(const void *s, size_t n) { return memcpy(xmalloc(n), s, n); } -- cgit v1.2.3-55-g6feb