From 931c55f9e2b41473132683488820c6fb7c47506b Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 13 Jan 2022 12:50:48 +0100 Subject: libbb: invert the meaning of SETUP_ENV_NO_CHDIR -> SETUP_ENV_CHDIR Double negatives are hard to grok. function old new delta login_main 986 988 +2 su_main 474 470 -4 ------------------------------------------------------------------------------ (add/remove: 0/0 grow/shrink: 1/1 up/down: 2/-4) Total: -2 bytes Signed-off-by: Denys Vlasenko --- include/libbb.h | 6 +++--- libbb/setup_environment.c | 5 +++-- loginutils/login.c | 4 +++- loginutils/su.c | 7 +++---- loginutils/sulogin.c | 9 ++++++--- miscutils/crontab.c | 4 ++-- shell/ash.c | 2 +- shell/hush.c | 2 +- 8 files changed, 22 insertions(+), 17 deletions(-) diff --git a/include/libbb.h b/include/libbb.h index a0ffbef62..780e9ae7d 100644 --- a/include/libbb.h +++ b/include/libbb.h @@ -1726,7 +1726,7 @@ extern void selinux_or_die(void) FAST_FUNC; /* setup_environment: - * if !SETUP_ENV_NO_CHDIR: + * if SETUP_ENV_CHDIR: * if cd(pw->pw_dir): ok: else if SETUP_ENV_TO_TMP: cd(/tmp) else: cd(/) or die * if SETUP_ENV_CLEARENV: cd(pw->pw_dir), clear environment, then set * TERM=(old value) @@ -1734,7 +1734,7 @@ extern void selinux_or_die(void) FAST_FUNC; * PATH=bb_default_[root_]path * HOME=pw->pw_dir * SHELL=shell - * else if SETUP_ENV_CHANGEENV: + * else if SETUP_ENV_CHANGEENV | SETUP_ENV_CHANGEENV_LOGNAME: * if not root (if pw->pw_uid != 0) or if SETUP_ENV_CHANGEENV_LOGNAME: * USER=pw->pw_name, LOGNAME=pw->pw_name * HOME=pw->pw_dir @@ -1748,7 +1748,7 @@ extern void selinux_or_die(void) FAST_FUNC; #define SETUP_ENV_CHANGEENV_LOGNAME (1 << 1) #define SETUP_ENV_CLEARENV (1 << 2) #define SETUP_ENV_TO_TMP (1 << 3) -#define SETUP_ENV_NO_CHDIR (1 << 4) +#define SETUP_ENV_CHDIR (1 << 4) void setup_environment(const char *shell, int flags, const struct passwd *pw) FAST_FUNC; void nuke_str(char *str) FAST_FUNC; #if ENABLE_FEATURE_SECURETTY && !ENABLE_PAM diff --git a/libbb/setup_environment.c b/libbb/setup_environment.c index df2983958..37777204e 100644 --- a/libbb/setup_environment.c +++ b/libbb/setup_environment.c @@ -36,7 +36,7 @@ void FAST_FUNC setup_environment(const char *shell, int flags, const struct pass /* Change the current working directory to be the home directory * of the user */ - if (!(flags & SETUP_ENV_NO_CHDIR)) { + if (flags & SETUP_ENV_CHDIR) { if (chdir(pw->pw_dir) != 0) { bb_error_msg("can't change directory to '%s'", pw->pw_dir); xchdir((flags & SETUP_ENV_TO_TMP) ? "/tmp" : "/"); @@ -59,7 +59,8 @@ void FAST_FUNC setup_environment(const char *shell, int flags, const struct pass //xsetenv("LOGNAME", pw->pw_name); //xsetenv("HOME", pw->pw_dir); //xsetenv("SHELL", shell); - } else if (flags & SETUP_ENV_CHANGEENV) { + } else + if (flags & (SETUP_ENV_CHANGEENV|SETUP_ENV_CHANGEENV_LOGNAME)) { /* Set HOME, SHELL, and if not becoming a super-user * or if SETUP_ENV_CHANGEENV_LOGNAME, USER and LOGNAME. 
*/ if ((flags & SETUP_ENV_CHANGEENV_LOGNAME) || pw->pw_uid != 0) { diff --git a/loginutils/login.c b/loginutils/login.c index cac4349b2..332238181 100644 --- a/loginutils/login.c +++ b/loginutils/login.c @@ -564,7 +564,9 @@ int login_main(int argc UNUSED_PARAM, char **argv) change_identity(pw); setup_environment(pw->pw_shell, - (!(opt & LOGIN_OPT_p) * SETUP_ENV_CLEARENV) + SETUP_ENV_CHANGEENV, + (!(opt & LOGIN_OPT_p) * SETUP_ENV_CLEARENV) + + SETUP_ENV_CHANGEENV + + SETUP_ENV_CHDIR, pw); #if ENABLE_PAM diff --git a/loginutils/su.c b/loginutils/su.c index e1db7590f..6efe1981a 100644 --- a/loginutils/su.c +++ b/loginutils/su.c @@ -176,10 +176,9 @@ int su_main(int argc UNUSED_PARAM, char **argv) change_identity(pw); setup_environment(opt_shell, - ((flags & SU_OPT_l) / SU_OPT_l * SETUP_ENV_CLEARENV) - + (!(flags & SU_OPT_mp) * SETUP_ENV_CHANGEENV) - + (!(flags & SU_OPT_l) * SETUP_ENV_NO_CHDIR), - pw); + ((flags & SU_OPT_l) ? (SETUP_ENV_CLEARENV + SETUP_ENV_CHDIR) : 0) + + (!(flags & SU_OPT_mp) * SETUP_ENV_CHANGEENV), + pw); IF_SELINUX(set_current_security_context(NULL);) if (opt_command) { diff --git a/loginutils/sulogin.c b/loginutils/sulogin.c index c9817960c..681022acb 100644 --- a/loginutils/sulogin.c +++ b/loginutils/sulogin.c @@ -94,10 +94,13 @@ int sulogin_main(int argc UNUSED_PARAM, char **argv) shell = pwd->pw_shell; /* util-linux 2.36.1 compat: cd to root's HOME, set a few envvars */ - setup_environment(shell, SETUP_ENV_CHANGEENV | SETUP_ENV_CHANGEENV_LOGNAME, pwd); + setup_environment(shell, 0 + + SETUP_ENV_CHANGEENV_LOGNAME + + SETUP_ENV_CHDIR + , pwd); // no SETUP_ENV_CLEARENV - // SETUP_ENV_CHANGEENV[+LOGNAME] - set HOME, SHELL, USER,and LOGNAME - // no SETUP_ENV_NO_CHDIR - IOW: cd to $HOME + // SETUP_ENV_CHANGEENV_LOGNAME - set HOME, SHELL, USER,and LOGNAME + // SETUP_ENV_CHDIR - cd to $HOME /* util-linux 2.36.1 compat: steal ctty if we don't have it yet * (yes, util-linux uses force=1) */ diff --git a/miscutils/crontab.c b/miscutils/crontab.c index 411a18a50..1111f4d54 100644 --- a/miscutils/crontab.c +++ b/miscutils/crontab.c @@ -55,8 +55,8 @@ static void edit_file(const struct passwd *pas, const char *file) /* initgroups, setgid, setuid */ change_identity(pas); setup_environment(pas->pw_shell, - SETUP_ENV_CHANGEENV | SETUP_ENV_TO_TMP, - pas); + SETUP_ENV_CHANGEENV | SETUP_ENV_TO_TMP | SETUP_ENV_CHDIR, + pas); ptr = getenv("VISUAL"); if (!ptr) { ptr = getenv("EDITOR"); diff --git a/shell/ash.c b/shell/ash.c index 12b2db3a9..ca5c755b6 100644 --- a/shell/ash.c +++ b/shell/ash.c @@ -10791,7 +10791,7 @@ preadfd(void) write(STDOUT_FILENO, "^C", 2); raise(SIGINT); /* raise(SIGINT) did not work! (e.g. 
if SIGINT - * is SIG_INGed on startup, it stays SIG_IGNed) + * is SIG_IGNed on startup, it stays SIG_IGNed) */ if (trap[SIGINT]) { buf[0] = '\n'; diff --git a/shell/hush.c b/shell/hush.c index 982fc356a..7d0dc67e4 100644 --- a/shell/hush.c +++ b/shell/hush.c @@ -10361,7 +10361,7 @@ int hush_main(int argc, char **argv) //it ignores TERM: // bash -i -c 'kill $$; echo ALIVE' // ALIVE -//it resets SIG_INGed HUP to SIG_DFL: +//it resets SIG_IGNed HUP to SIG_DFL: // trap '' hup; bash -i -c 'kill -hup $$; echo ALIVE' // Hangup [the message is not printed by bash, it's the shell which started it] //is talkative about jobs and exiting: -- cgit v1.2.3-55-g6feb From c2788f88f430da8ae5fb5f293b13fc2b167ea2fe Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 13 Jan 2022 12:56:10 +0100 Subject: libbb: introduce and use chdir_or_warn() function old new delta chdir_or_warn - 37 +37 send_cgi_and_exit 720 711 -9 xchdir 27 15 -12 setup_environment 233 217 -16 fork_job 449 433 -16 ------------------------------------------------------------------------------ (add/remove: 1/0 grow/shrink: 0/4 up/down: 37/-53) Total: -16 bytes Signed-off-by: Denys Vlasenko --- include/libbb.h | 1 + libbb/setup_environment.c | 3 +-- libbb/xfuncs_printf.c | 11 +++++++++-- miscutils/crond.c | 3 +-- networking/httpd.c | 3 +-- 5 files changed, 13 insertions(+), 8 deletions(-) diff --git a/include/libbb.h b/include/libbb.h index 780e9ae7d..91b456915 100644 --- a/include/libbb.h +++ b/include/libbb.h @@ -645,6 +645,7 @@ void xsetgid(gid_t gid) FAST_FUNC; void xsetuid(uid_t uid) FAST_FUNC; void xsetegid(gid_t egid) FAST_FUNC; void xseteuid(uid_t euid) FAST_FUNC; +int chdir_or_warn(const char *path) FAST_FUNC; void xchdir(const char *path) FAST_FUNC; void xfchdir(int fd) FAST_FUNC; void xchroot(const char *path) FAST_FUNC; diff --git a/libbb/setup_environment.c b/libbb/setup_environment.c index 37777204e..3549e2099 100644 --- a/libbb/setup_environment.c +++ b/libbb/setup_environment.c @@ -37,8 +37,7 @@ void FAST_FUNC setup_environment(const char *shell, int flags, const struct pass /* Change the current working directory to be the home directory * of the user */ if (flags & SETUP_ENV_CHDIR) { - if (chdir(pw->pw_dir) != 0) { - bb_error_msg("can't change directory to '%s'", pw->pw_dir); + if (chdir_or_warn(pw->pw_dir) != 0) { xchdir((flags & SETUP_ENV_TO_TMP) ? "/tmp" : "/"); } } diff --git a/libbb/xfuncs_printf.c b/libbb/xfuncs_printf.c index fc630d176..842d10cd2 100644 --- a/libbb/xfuncs_printf.c +++ b/libbb/xfuncs_printf.c @@ -415,11 +415,18 @@ void FAST_FUNC xseteuid(uid_t euid) if (seteuid(euid)) bb_simple_perror_msg_and_die("seteuid"); } +int FAST_FUNC chdir_or_warn(const char *path) +{ + int r = chdir(path); + if (r != 0) + bb_perror_msg("can't change directory to '%s'", path); + return r; +} // Die if we can't chdir to a new path. void FAST_FUNC xchdir(const char *path) { - if (chdir(path)) - bb_perror_msg_and_die("can't change directory to '%s'", path); + if (chdir_or_warn(path) != 0) + xfunc_die(); } void FAST_FUNC xfchdir(int fd) diff --git a/miscutils/crond.c b/miscutils/crond.c index b74427351..1965af656 100644 --- a/miscutils/crond.c +++ b/miscutils/crond.c @@ -675,8 +675,7 @@ static void change_user(struct passwd *pas) { /* careful: we're after vfork! 
*/ change_identity(pas); /* - initgroups, setgid, setuid */ - if (chdir(pas->pw_dir) < 0) { - bb_error_msg("can't change directory to '%s'", pas->pw_dir); + if (chdir_or_warn(pas->pw_dir) != 0) { xchdir(CRON_DIR); } } diff --git a/networking/httpd.c b/networking/httpd.c index 33045163f..ffc58e10b 100644 --- a/networking/httpd.c +++ b/networking/httpd.c @@ -1667,8 +1667,7 @@ static void send_cgi_and_exit( script = last_slash; if (script != url) { /* paranoia */ *script = '\0'; - if (chdir(url + 1) != 0) { - bb_perror_msg("can't change directory to '%s'", url + 1); + if (chdir_or_warn(url + 1) != 0) { goto error_execing_cgi; } // not needed: *script = '/'; -- cgit v1.2.3-55-g6feb From a277506a64404e6c4472ff89c944c4f353db1c33 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sun, 16 Jan 2022 23:54:46 +0100 Subject: shell: add comments about SIGINT-related problems Signed-off-by: Denys Vlasenko --- shell/ash.c | 13 ++++++++----- shell/shell_common.c | 1 + 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/shell/ash.c b/shell/ash.c index ca5c755b6..086773dd7 100644 --- a/shell/ash.c +++ b/shell/ash.c @@ -664,7 +664,7 @@ raise_exception(int e) /* * Called when a SIGINT is received. (If the user specifies * that SIGINT is to be trapped or ignored using the trap builtin, then - * this routine is not called.) Suppressint is nonzero when interrupts + * this routine is not called.) suppress_int is nonzero when interrupts * are held using the INT_OFF macro. (The test for iflag is just * defensive programming.) */ @@ -695,13 +695,12 @@ raise_interrupt(void) } while (0) #endif -static IF_ASH_OPTIMIZE_FOR_SIZE(inline) void +static IF_NOT_ASH_OPTIMIZE_FOR_SIZE(inline) void int_on(void) { barrier(); - if (--suppress_int == 0 && pending_int) { + if (--suppress_int == 0 && pending_int) raise_interrupt(); - } } #if DEBUG_INTONOFF # define INT_ON do { \ @@ -711,7 +710,7 @@ int_on(void) #else # define INT_ON int_on() #endif -static IF_ASH_OPTIMIZE_FOR_SIZE(inline) void +static IF_NOT_ASH_OPTIMIZE_FOR_SIZE(inline) void force_int_on(void) { barrier(); @@ -10785,6 +10784,10 @@ preadfd(void) # endif reinit_unicode_for_ash(); again: +//BUG: not in INT_OFF/INT_ON section - SIGINT et al would longjmp out of read_line_input()! +//This would cause a memory leak in interactive shell +//(repeated internal allocations in read_line_input): +// (while kill -INT $$; do :; done) & nr = read_line_input(line_input_state, cmdedit_prompt, buf, IBUFSIZ); if (nr == 0) { /* ^C pressed, "convert" to SIGINT */ diff --git a/shell/shell_common.c b/shell/shell_common.c index 2e36d9208..13163acdf 100644 --- a/shell/shell_common.c +++ b/shell/shell_common.c @@ -196,6 +196,7 @@ shell_builtin_read(struct builtin_read_params *params) */ errno = 0; pfd[0].events = POLLIN; +//TODO race with a signal arriving just before the poll! 
if (poll(pfd, 1, timeout) <= 0) { /* timed out, or EINTR */ err = errno; -- cgit v1.2.3-55-g6feb From 12566e7f9b5e5c5d445bc4d36991d134b431dc6c Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Mon, 17 Jan 2022 03:02:40 +0100 Subject: ash,hush: fix handling of SIGINT while waiting for interactive input function old new delta lineedit_read_key 160 237 +77 __pgetc 522 589 +67 fgetc_interactive 244 309 +65 safe_read_key - 39 +39 read_key 588 607 +19 record_pending_signo 23 32 +9 signal_handler 75 81 +6 .rodata 104312 104309 -3 ------------------------------------------------------------------------------ (add/remove: 1/0 grow/shrink: 6/1 up/down: 282/-3) Total: 279 bytes Signed-off-by: Denys Vlasenko --- editors/vi.c | 4 ++-- include/libbb.h | 5 +++- libbb/lineedit.c | 24 ++++++++++++++++--- libbb/read_key.c | 16 +++++++++++-- miscutils/hexedit.c | 2 +- miscutils/less.c | 4 ++-- procps/top.c | 2 +- shell/ash.c | 39 ++++++++++++++++++++++++------- shell/hush.c | 67 +++++++++++++++++++++++++++++++++++++---------------- 9 files changed, 122 insertions(+), 41 deletions(-) diff --git a/editors/vi.c b/editors/vi.c index 3dbe5b471..d37cd48a3 100644 --- a/editors/vi.c +++ b/editors/vi.c @@ -1122,7 +1122,7 @@ static int readit(void) // read (maybe cursor) key from stdin // on nonblocking stdin. // Note: read_key sets errno to 0 on success. again: - c = read_key(STDIN_FILENO, readbuffer, /*timeout:*/ -1); + c = safe_read_key(STDIN_FILENO, readbuffer, /*timeout:*/ -1); if (c == -1) { // EOF/error if (errno == EAGAIN) // paranoia goto again; @@ -4770,7 +4770,7 @@ static void edit_file(char *fn) uint64_t k; write1(ESC"[999;999H" ESC"[6n"); fflush_all(); - k = read_key(STDIN_FILENO, readbuffer, /*timeout_ms:*/ 100); + k = safe_read_key(STDIN_FILENO, readbuffer, /*timeout_ms:*/ 100); if ((int32_t)k == KEYCODE_CURSOR_POS) { uint32_t rc = (k >> 32); columns = (rc & 0x7fff); diff --git a/include/libbb.h b/include/libbb.h index 91b456915..b45ce91c5 100644 --- a/include/libbb.h +++ b/include/libbb.h @@ -1908,6 +1908,8 @@ enum { * >=0: poll() for TIMEOUT milliseconds, return -1/EAGAIN on timeout */ int64_t read_key(int fd, char *buffer, int timeout) FAST_FUNC; +/* This version loops on EINTR: */ +int64_t safe_read_key(int fd, char *buffer, int timeout) FAST_FUNC; void read_key_ungets(char *buffer, const char *str, unsigned len) FAST_FUNC; @@ -1961,7 +1963,8 @@ enum { USERNAME_COMPLETION = 4 * ENABLE_FEATURE_USERNAME_COMPLETION, VI_MODE = 8 * ENABLE_FEATURE_EDITING_VI, WITH_PATH_LOOKUP = 0x10, - FOR_SHELL = DO_HISTORY | TAB_COMPLETION | USERNAME_COMPLETION, + LI_INTERRUPTIBLE = 0x20, + FOR_SHELL = DO_HISTORY | TAB_COMPLETION | USERNAME_COMPLETION | LI_INTERRUPTIBLE, }; line_input_t *new_line_input_t(int flags) FAST_FUNC; #if ENABLE_FEATURE_EDITING_SAVEHISTORY diff --git a/libbb/lineedit.c b/libbb/lineedit.c index e14c78707..f76afd37d 100644 --- a/libbb/lineedit.c +++ b/libbb/lineedit.c @@ -2161,12 +2161,30 @@ static int lineedit_read_key(char *read_key_buffer, int timeout) * insist on full MB_CUR_MAX buffer to declare input like * "\xff\n",pause,"ls\n" invalid and thus won't lose "ls". * + * If LI_INTERRUPTIBLE, return -1 if got EINTR in poll() + * inside read_key, or if bb_got_signal != 0 (IOW: if signal + * arrived before poll() is reached). + * * Note: read_key sets errno to 0 on success. 
*/ - IF_FEATURE_EDITING_WINCH(S.ok_to_redraw = 1;) - ic = read_key(STDIN_FILENO, read_key_buffer, timeout); - IF_FEATURE_EDITING_WINCH(S.ok_to_redraw = 0;) + do { + if ((state->flags & LI_INTERRUPTIBLE) && bb_got_signal) { + errno = EINTR; + return -1; + } +//FIXME: still races here with signals, but small window to poll() inside read_key + IF_FEATURE_EDITING_WINCH(S.ok_to_redraw = 1;) + ic = read_key(STDIN_FILENO, read_key_buffer, timeout); + IF_FEATURE_EDITING_WINCH(S.ok_to_redraw = 0;) + } while (!(state->flags & LI_INTERRUPTIBLE) && errno == EINTR); + if (errno) { + /* LI_INTERRUPTIBLE can bail out with EINTR here, + * but nothing really guarantees that bb_got_signal + * is nonzero. Follow the least surprise principle: + */ + if (errno == EINTR && bb_got_signal == 0) + bb_got_signal = 255; /* something nonzero */ #if ENABLE_UNICODE_SUPPORT if (errno == EAGAIN && unicode_idx != 0) goto pushback; diff --git a/libbb/read_key.c b/libbb/read_key.c index 03b7da656..829ae215c 100644 --- a/libbb/read_key.c +++ b/libbb/read_key.c @@ -126,7 +126,10 @@ int64_t FAST_FUNC read_key(int fd, char *buffer, int timeout) * if fd can be in non-blocking mode. */ if (timeout >= -1) { - if (safe_poll(&pfd, 1, timeout) == 0) { + n = poll(&pfd, 1, timeout); + if (n < 0 && errno == EINTR) + return n; + if (n == 0) { /* Timed out */ errno = EAGAIN; return -1; @@ -138,7 +141,7 @@ int64_t FAST_FUNC read_key(int fd, char *buffer, int timeout) * When we were reading 3 bytes here, we were eating * "li" too, and cat was getting wrong input. */ - n = safe_read(fd, buffer, 1); + n = read(fd, buffer, 1); if (n <= 0) return -1; } @@ -284,6 +287,15 @@ int64_t FAST_FUNC read_key(int fd, char *buffer, int timeout) goto start_over; } +int64_t FAST_FUNC safe_read_key(int fd, char *buffer, int timeout) +{ + int64_t r; + do { + r = read_key(fd, buffer, timeout); + } while (errno == EINTR); + return r; +} + void FAST_FUNC read_key_ungets(char *buffer, const char *str, unsigned len) { unsigned cur_len = (unsigned char)buffer[0]; diff --git a/miscutils/hexedit.c b/miscutils/hexedit.c index f8ff9b62b..15ad78377 100644 --- a/miscutils/hexedit.c +++ b/miscutils/hexedit.c @@ -292,7 +292,7 @@ int hexedit_main(int argc UNUSED_PARAM, char **argv) fflush_all(); G.in_read_key = 1; if (!bb_got_signal) - key = read_key(STDIN_FILENO, G.read_key_buffer, -1); + key = safe_read_key(STDIN_FILENO, G.read_key_buffer, -1); G.in_read_key = 0; if (bb_got_signal) key = CTRL('X'); diff --git a/miscutils/less.c b/miscutils/less.c index 82c4b21f0..8a0525cb7 100644 --- a/miscutils/less.c +++ b/miscutils/less.c @@ -1137,9 +1137,9 @@ static int64_t getch_nowait(void) #endif } - /* We have kbd_fd in O_NONBLOCK mode, read inside read_key() + /* We have kbd_fd in O_NONBLOCK mode, read inside safe_read_key() * would not block even if there is no input available */ - key64 = read_key(kbd_fd, kbd_input, /*timeout off:*/ -2); + key64 = safe_read_key(kbd_fd, kbd_input, /*timeout off:*/ -2); if ((int)key64 == -1) { if (errno == EAGAIN) { /* No keyboard input available. 
Since poll() did return, diff --git a/procps/top.c b/procps/top.c index 4cd545c69..804d6f258 100644 --- a/procps/top.c +++ b/procps/top.c @@ -913,7 +913,7 @@ static unsigned handle_input(unsigned scan_mask, duration_t interval) while (1) { int32_t c; - c = read_key(STDIN_FILENO, G.kbd_input, interval * 1000); + c = safe_read_key(STDIN_FILENO, G.kbd_input, interval * 1000); if (c == -1 && errno != EAGAIN) { /* error/EOF */ option_mask32 |= OPT_EOF; diff --git a/shell/ash.c b/shell/ash.c index 086773dd7..55df54bd0 100644 --- a/shell/ash.c +++ b/shell/ash.c @@ -3679,7 +3679,9 @@ signal_handler(int signo) if (!trap[SIGCHLD]) return; } - +#if ENABLE_FEATURE_EDITING + bb_got_signal = signo; /* for read_line_input: "we got a signal" */ +#endif gotsig[signo - 1] = 1; pending_sig = signo; @@ -10784,33 +10786,52 @@ preadfd(void) # endif reinit_unicode_for_ash(); again: -//BUG: not in INT_OFF/INT_ON section - SIGINT et al would longjmp out of read_line_input()! -//This would cause a memory leak in interactive shell -//(repeated internal allocations in read_line_input): -// (while kill -INT $$; do :; done) & + /* For shell, LI_INTERRUPTIBLE is set: + * read_line_input will abort on either + * getting EINTR in poll(), or if it sees bb_got_signal != 0 + * (IOW: if signal arrives before poll() is reached). + * Interactive testcases: + * (while kill -INT $$; do sleep 1; done) & + * #^^^ prints ^C, prints prompt, repeats + * trap 'echo I' int; (while kill -INT $$; do sleep 1; done) & + * #^^^ prints ^C, prints "I", prints prompt, repeats + * trap 'echo T' term; (while kill $$; do sleep 1; done) & + * #^^^ prints "T", prints prompt, repeats + * #(bash 5.0.17 exits after first "T", looks like a bug) + */ + bb_got_signal = 0; + INT_OFF; /* no longjmp'ing out of read_line_input please */ nr = read_line_input(line_input_state, cmdedit_prompt, buf, IBUFSIZ); + if (bb_got_signal == SIGINT) + write(STDOUT_FILENO, "^C\n", 3); + INT_ON; /* here non-blocked SIGINT will longjmp */ if (nr == 0) { /* ^C pressed, "convert" to SIGINT */ - write(STDOUT_FILENO, "^C", 2); - raise(SIGINT); + write(STDOUT_FILENO, "^C\n", 3); + raise(SIGINT); /* here non-blocked SIGINT will longjmp */ /* raise(SIGINT) did not work! (e.g. if SIGINT * is SIG_IGNed on startup, it stays SIG_IGNed) */ if (trap[SIGINT]) { + empty_line_input: buf[0] = '\n'; buf[1] = '\0'; return 1; } exitstatus = 128 + SIGINT; /* bash behavior on ^C + ignored SIGINT: */ - write(STDOUT_FILENO, "\n", 1); goto again; } if (nr < 0) { if (errno == 0) { - /* Ctrl+D pressed */ + /* ^D pressed */ nr = 0; } + else if (errno == EINTR) { /* got signal? 
*/ + if (bb_got_signal != SIGINT) + write(STDOUT_FILENO, "\n", 1); + goto empty_line_input; + } # if ENABLE_ASH_IDLE_TIMEOUT else if (errno == EAGAIN && timeout > 0) { puts("\007timed out waiting for input: auto-logout"); diff --git a/shell/hush.c b/shell/hush.c index 7d0dc67e4..6dc2ecaac 100644 --- a/shell/hush.c +++ b/shell/hush.c @@ -918,6 +918,7 @@ struct globals { #if ENABLE_HUSH_INTERACTIVE smallint promptmode; /* 0: PS1, 1: PS2 */ #endif + /* set by signal handler if SIGINT is received _and_ its trap is not set */ smallint flag_SIGINT; #if ENABLE_HUSH_LOOPS smallint flag_break_continue; @@ -1944,6 +1945,9 @@ enum { static void record_pending_signo(int sig) { sigaddset(&G.pending_set, sig); +#if ENABLE_FEATURE_EDITING + bb_got_signal = sig; /* for read_line_input: "we got a signal" */ +#endif #if ENABLE_HUSH_FAST if (sig == SIGCHLD) { G.count_SIGCHLD++; @@ -2652,30 +2656,53 @@ static int get_user_input(struct in_str *i) for (;;) { reinit_unicode_for_hush(); G.flag_SIGINT = 0; - /* buglet: SIGINT will not make new prompt to appear _at once_, - * only after . (^C works immediately) */ - r = read_line_input(G.line_input_state, prompt_str, + + bb_got_signal = 0; + if (!sigisemptyset(&G.pending_set)) { + /* Whoops, already got a signal, do not call read_line_input */ + bb_got_signal = r = -1; + } else { + /* For shell, LI_INTERRUPTIBLE is set: + * read_line_input will abort on either + * getting EINTR in poll(), or if it sees bb_got_signal != 0 + * (IOW: if signal arrives before poll() is reached). + * Interactive testcases: + * (while kill -INT $$; do sleep 1; done) & + * #^^^ prints ^C, prints prompt, repeats + * trap 'echo I' int; (while kill -INT $$; do sleep 1; done) & + * #^^^ prints ^C, prints "I", prints prompt, repeats + * trap 'echo T' term; (while kill $$; do sleep 1; done) & + * #^^^ prints "T", prints prompt, repeats + * #(bash 5.0.17 exits after first "T", looks like a bug) + */ + r = read_line_input(G.line_input_state, prompt_str, G.user_input_buf, CONFIG_FEATURE_EDITING_MAX_LEN-1 - ); - /* read_line_input intercepts ^C, "convert" it to SIGINT */ - if (r == 0) { - raise(SIGINT); + ); + /* read_line_input intercepts ^C, "convert" it to SIGINT */ + if (r == 0) + raise(SIGINT); + } + /* bash prints ^C (before running a trap, if any) + * both on keyboard ^C and on real SIGINT (non-kbd generated). + */ + if (sigismember(&G.pending_set, SIGINT)) { + write(STDOUT_FILENO, "^C\n", 3); + G.last_exitcode = 128 | SIGINT; } check_and_run_traps(); - if (r != 0 && !G.flag_SIGINT) + if (r == 0) /* keyboard ^C? */ + continue; /* go back, read another input line */ + if (r > 0) /* normal input? 
(no ^C, no ^D, no signals) */ break; - /* ^C or SIGINT: repeat */ - /* bash prints ^C even on real SIGINT (non-kbd generated) */ - write(STDOUT_FILENO, "^C\n", 3); - G.last_exitcode = 128 | SIGINT; - } - if (r < 0) { - /* EOF/error detected */ - /* ^D on interactive input goes to next line before exiting: */ - write(STDOUT_FILENO, "\n", 1); - i->p = NULL; - i->peek_buf[0] = r = EOF; - return r; + if (!bb_got_signal) { + /* r < 0: ^D/EOF/error detected (but not signal) */ + /* ^D on interactive input goes to next line before exiting: */ + write(STDOUT_FILENO, "\n", 1); + i->p = NULL; + i->peek_buf[0] = r = EOF; + return r; + } + /* it was a signal: go back, read another input line */ } i->p = G.user_input_buf; return (unsigned char)*i->p++; -- cgit v1.2.3-55-g6feb From 8ad2acf352d790d0bdd792b8e126d58a088451f3 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Mon, 17 Jan 2022 23:59:46 +0100 Subject: fix "defined but not used" warnings Signed-off-by: Denys Vlasenko --- archival/libarchive/get_header_tar.c | 2 ++ miscutils/i2c_tools.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/archival/libarchive/get_header_tar.c b/archival/libarchive/get_header_tar.c index d26868bf8..cc6f3f0ad 100644 --- a/archival/libarchive/get_header_tar.c +++ b/archival/libarchive/get_header_tar.c @@ -147,11 +147,13 @@ static void process_pax_hdr(archive_handle_t *archive_handle, unsigned sz, int g #endif } +#if ENABLE_FEATURE_TAR_GNU_EXTENSIONS static void die_if_bad_fnamesize(off_t sz) { if ((uoff_t)sz > 0xfff) /* more than 4k?! no funny business please */ bb_simple_error_msg_and_die("bad archive"); } +#endif char FAST_FUNC get_header_tar(archive_handle_t *archive_handle) { diff --git a/miscutils/i2c_tools.c b/miscutils/i2c_tools.c index e3741eeba..da26f5e19 100644 --- a/miscutils/i2c_tools.c +++ b/miscutils/i2c_tools.c @@ -120,6 +120,7 @@ static int32_t i2c_smbus_access(int fd, char read_write, uint8_t cmd, return ioctl(fd, I2C_SMBUS, &args); } +#if ENABLE_I2CGET || ENABLE_I2CSET || ENABLE_I2CDUMP || ENABLE_I2CDETECT static int32_t i2c_smbus_read_byte(int fd) { union i2c_smbus_data data; @@ -131,6 +132,7 @@ static int32_t i2c_smbus_read_byte(int fd) return data.byte; } +#endif #if ENABLE_I2CGET || ENABLE_I2CSET || ENABLE_I2CDUMP static int32_t i2c_smbus_write_byte(int fd, uint8_t val) -- cgit v1.2.3-55-g6feb From 1e825acf8d715fe49af040cb02f9e96c26955832 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 18 Jan 2022 00:31:27 +0100 Subject: libbb: shrink lineedit_read_key() function old new delta lineedit_read_key 237 231 -6 Signed-off-by: Denys Vlasenko --- archival/libarchive/decompress_bunzip2.c | 2 +- coreutils/head.c | 6 +++--- editors/patch.c | 2 +- editors/patch_toybox.c | 2 +- include/libbb.h | 2 ++ libbb/lineedit.c | 26 ++++++++++++++++---------- libbb/read_key.c | 1 + 7 files changed, 25 insertions(+), 16 deletions(-) diff --git a/archival/libarchive/decompress_bunzip2.c b/archival/libarchive/decompress_bunzip2.c index 42e2b4f88..4a2b668aa 100644 --- a/archival/libarchive/decompress_bunzip2.c +++ b/archival/libarchive/decompress_bunzip2.c @@ -654,7 +654,7 @@ static int read_bunzip(bunzip_data *bd, char *outbuf, int len) /* Subtract the 1 copy we'd output anyway to get extras */ --bd->writeCopies; } - } /* for(;;) */ + } /* for (;;) */ /* Decompression of this input block completed successfully */ bd->writeCRC = CRC = ~CRC; diff --git a/coreutils/head.c b/coreutils/head.c index 9586f869f..c7537a20e 100644 --- a/coreutils/head.c +++ b/coreutils/head.c @@ -76,7 +76,7 @@ 
print_except_N_last_bytes(FILE *fp, unsigned count) { unsigned char *circle = xmalloc(++count); unsigned head = 0; - for(;;) { + for (;;) { int c; c = getc(fp); if (c == EOF) @@ -105,7 +105,7 @@ print_except_N_last_lines(FILE *fp, unsigned count) { char **circle = xzalloc((++count) * sizeof(circle[0])); unsigned head = 0; - for(;;) { + for (;;) { char *c; c = xmalloc_fgets(fp); if (!c) @@ -127,7 +127,7 @@ print_except_N_last_lines(FILE *fp, unsigned count) } ret: head = 0; - for(;;) { + for (;;) { free(circle[head++]); if (head == count) break; diff --git a/editors/patch.c b/editors/patch.c index 110176630..aebb5073e 100644 --- a/editors/patch.c +++ b/editors/patch.c @@ -418,7 +418,7 @@ int patch_main(int argc UNUSED_PARAM, char **argv) } // Loop through the lines in the patch - for(;;) { + for (;;) { char *patchline; patchline = xmalloc_fgetline(stdin); diff --git a/editors/patch_toybox.c b/editors/patch_toybox.c index aebab8132..69a508b2e 100644 --- a/editors/patch_toybox.c +++ b/editors/patch_toybox.c @@ -441,7 +441,7 @@ int patch_main(int argc UNUSED_PARAM, char **argv) TT.filein = TT.fileout = -1; // Loop through the lines in the patch - for(;;) { + for (;;) { char *patchline; patchline = get_line(TT.filepatch); diff --git a/include/libbb.h b/include/libbb.h index b45ce91c5..8e3b7ae8e 100644 --- a/include/libbb.h +++ b/include/libbb.h @@ -1900,6 +1900,8 @@ enum { * (unless fd is in non-blocking mode), * subsequent reads will time out after a few milliseconds. * Return of -1 means EOF or error (errno == 0 on EOF). + * Nonzero errno is not preserved across the call: + * if there was no error, errno will be cleared to 0. * buffer[0] is used as a counter of buffered chars and must be 0 * on first call. * timeout: diff --git a/libbb/lineedit.c b/libbb/lineedit.c index f76afd37d..82624757e 100644 --- a/libbb/lineedit.c +++ b/libbb/lineedit.c @@ -2155,7 +2155,7 @@ static int lineedit_read_key(char *read_key_buffer, int timeout) #endif fflush_all(); - while (1) { + for (;;) { /* Wait for input. TIMEOUT = -1 makes read_key wait even * on nonblocking stdin, TIMEOUT = 50 makes sure we won't * insist on full MB_CUR_MAX buffer to declare input like @@ -2167,24 +2167,30 @@ static int lineedit_read_key(char *read_key_buffer, int timeout) * * Note: read_key sets errno to 0 on success. */ - do { + for (;;) { if ((state->flags & LI_INTERRUPTIBLE) && bb_got_signal) { errno = EINTR; return -1; } //FIXME: still races here with signals, but small window to poll() inside read_key IF_FEATURE_EDITING_WINCH(S.ok_to_redraw = 1;) + /* errno = 0; - read_key does this itself */ ic = read_key(STDIN_FILENO, read_key_buffer, timeout); IF_FEATURE_EDITING_WINCH(S.ok_to_redraw = 0;) - } while (!(state->flags & LI_INTERRUPTIBLE) && errno == EINTR); + if (errno != EINTR) + break; + if (state->flags & LI_INTERRUPTIBLE) { + /* LI_INTERRUPTIBLE bails out on EINTR, + * but nothing really guarantees that bb_got_signal + * is nonzero. Follow the least surprise principle: + */ + if (bb_got_signal == 0) + bb_got_signal = 255; + goto ret; + } + } if (errno) { - /* LI_INTERRUPTIBLE can bail out with EINTR here, - * but nothing really guarantees that bb_got_signal - * is nonzero. 
Follow the least surprise principle: - */ - if (errno == EINTR && bb_got_signal == 0) - bb_got_signal = 255; /* something nonzero */ #if ENABLE_UNICODE_SUPPORT if (errno == EAGAIN && unicode_idx != 0) goto pushback; @@ -2251,7 +2257,7 @@ static int lineedit_read_key(char *read_key_buffer, int timeout) #endif break; } - + ret: return ic; } diff --git a/libbb/read_key.c b/libbb/read_key.c index 829ae215c..cf8ed411e 100644 --- a/libbb/read_key.c +++ b/libbb/read_key.c @@ -291,6 +291,7 @@ int64_t FAST_FUNC safe_read_key(int fd, char *buffer, int timeout) { int64_t r; do { + /* errno = 0; - read_key does this itself */ r = read_key(fd, buffer, timeout); } while (errno == EINTR); return r; -- cgit v1.2.3-55-g6feb From 39369ff460f3e2dbfec7f6be181b2fb98f3c1867 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sun, 23 Jan 2022 09:27:30 +0100 Subject: libbb/sha1: use SSE2 in unrolled x86-64 code. ~10% faster function old new delta .rodata 108241 108305 +64 sha1_process_block64 3502 3495 -7 ------------------------------------------------------------------------------ (add/remove: 5/0 grow/shrink: 1/1 up/down: 64/-7) Total: 57 bytes Signed-off-by: Denys Vlasenko --- libbb/hash_md5_sha_x86-64.S | 992 +++++++++++++++++++++++------------------ libbb/hash_md5_sha_x86-64.S.sh | 440 ++++++++++++------ 2 files changed, 854 insertions(+), 578 deletions(-) diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S index 87fb616a1..069a18719 100644 --- a/libbb/hash_md5_sha_x86-64.S +++ b/libbb/hash_md5_sha_x86-64.S @@ -20,16 +20,10 @@ sha1_process_block64: # eax..edx: a..d # ebp: e # esi,edi: temps -# -32+4*n(%rsp),r8...r15: W[0..7,8..15] -# (TODO: actually W[0..7] are used a bit more often, put _them_ into r8..r15?) - movl $3, %eax -1: - movq (%rdi,%rax,8), %rsi - bswapq %rsi - rolq $32, %rsi - movq %rsi, -32(%rsp,%rax,8) - decl %eax - jns 1b +# xmm0..xmm3: W[] +# xmm4,xmm5: temps +# xmm6: current round constant +# -64(%rsp): area for passing RCONST + W[] from vector to integer units movl 80(%rdi), %eax # a = ctx->hash[0] movl 84(%rdi), %ebx # b = ctx->hash[1] @@ -37,587 +31,709 @@ sha1_process_block64: movl 92(%rdi), %edx # d = ctx->hash[3] movl 96(%rdi), %ebp # e = ctx->hash[4] + movaps rconst0x5A827999(%rip), %xmm6 + + # For round 1, steps 0 and 8..15, we pass W[0,8..15] in esi,r8..r15 + # instead of spilling them to stack. + # (We lose parallelized addition of RCONST, but LEA + # can do two additions at once, so...) 
+ movq 4*0(%rdi), %rsi + movq 4*2(%rdi), %r10 + bswapq %rsi + bswapq %r10 + rolq $32, %rsi # rsi = W[1]:W[0] + rolq $32, %r10 + movq %rsi, %xmm0 + movq %r10, %xmm4 + punpcklqdq %xmm4, %xmm0 # xmm0 = r10:rsi = (W[0],W[1],W[2],W[3]) + movaps %xmm0, %xmm4 + paddd %xmm6, %xmm4 + movups %xmm4, -64+4*0(%rsp) + + movq 4*4(%rdi), %r8 + movq 4*6(%rdi), %r10 + bswapq %r8 + bswapq %r10 + rolq $32, %r8 + rolq $32, %r10 + movq %r8, %xmm1 + movq %r10, %xmm4 + punpcklqdq %xmm4, %xmm1 # xmm1 = r10:r8 = (W[4],W[5],W[6],W[7]) + movaps %xmm1, %xmm4 + paddd %xmm6, %xmm4 + movups %xmm4, -64+4*4(%rsp) + movq 4*8(%rdi), %r8 movq 4*10(%rdi), %r10 bswapq %r8 bswapq %r10 + movl %r8d, %r9d # r9d = W[9] + rolq $32, %r8 # r8 = W[9]:W[8] + movl %r10d, %r11d # r11d = W[11] + rolq $32, %r10 # r10 = W[11]:W[10] + movq %r8, %xmm2 + movq %r10, %xmm4 + punpcklqdq %xmm4, %xmm2 # xmm2 = r10:r8 = (W[8],W[9],W[10],W[11]) + movq 4*12(%rdi), %r12 movq 4*14(%rdi), %r14 bswapq %r12 bswapq %r14 - movl %r8d, %r9d - shrq $32, %r8 - movl %r10d, %r11d - shrq $32, %r10 - movl %r12d, %r13d - shrq $32, %r12 - movl %r14d, %r15d - shrq $32, %r14 + movl %r12d, %r13d # r13d = W[13] + rolq $32, %r12 # r12 = W[13]:W[12] + movl %r14d, %r15d # r15d = W[15] + rolq $32, %r14 # r14 = W[15]:W[14] + movq %r12, %xmm3 + movq %r14, %xmm4 + punpcklqdq %xmm4, %xmm3 # xmm3 = r14:r12 = (W[12],W[13],W[14],W[15]) # 0 - # W[0], already in %esi + leal 0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n] movl %ecx, %edi # c xorl %edx, %edi # ^d andl %ebx, %edi # &b xorl %edx, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n] addl %edi, %ebp # e += (((c ^ d) & b) ^ d) movl %eax, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebp # e += rotl32(a,5) rorl $2, %ebx # b = rotl32(b,30) # 1 - movl -32+4*1(%rsp), %esi # W[n] + addl -64+4*1(%rsp), %edx # e += RCONST + W[n] movl %ebx, %edi # c xorl %ecx, %edi # ^d andl %eax, %edi # &b xorl %ecx, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n] addl %edi, %edx # e += (((c ^ d) & b) ^ d) movl %ebp, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %edx # e += rotl32(a,5) rorl $2, %eax # b = rotl32(b,30) # 2 - movl -32+4*2(%rsp), %esi # W[n] + addl -64+4*2(%rsp), %ecx # e += RCONST + W[n] movl %eax, %edi # c xorl %ebx, %edi # ^d andl %ebp, %edi # &b xorl %ebx, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rcx,%rsi), %ecx # e += RCONST + W[n] addl %edi, %ecx # e += (((c ^ d) & b) ^ d) movl %edx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ecx # e += rotl32(a,5) rorl $2, %ebp # b = rotl32(b,30) # 3 - movl -32+4*3(%rsp), %esi # W[n] + addl -64+4*3(%rsp), %ebx # e += RCONST + W[n] movl %ebp, %edi # c xorl %eax, %edi # ^d andl %edx, %edi # &b xorl %eax, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rbx,%rsi), %ebx # e += RCONST + W[n] addl %edi, %ebx # e += (((c ^ d) & b) ^ d) movl %ecx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebx # e += rotl32(a,5) rorl $2, %edx # b = rotl32(b,30) # 4 - movl -32+4*4(%rsp), %esi # W[n] + addl -64+4*4(%rsp), %eax # e += RCONST + W[n] movl %edx, %edi # c xorl %ebp, %edi # ^d andl %ecx, %edi # &b xorl %ebp, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rax,%rsi), %eax # e += RCONST + W[n] addl %edi, %eax # e += (((c ^ d) & b) ^ d) movl %ebx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %eax # e += rotl32(a,5) rorl $2, %ecx # b = rotl32(b,30) # 5 - movl -32+4*5(%rsp), %esi # W[n] + addl -64+4*5(%rsp), %ebp # e += RCONST + W[n] movl %ecx, %edi # c xorl %edx, %edi # ^d andl %ebx, %edi # &b xorl %edx, %edi # (((c ^ d) & b) ^ d) - leal 
0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n] addl %edi, %ebp # e += (((c ^ d) & b) ^ d) movl %eax, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebp # e += rotl32(a,5) rorl $2, %ebx # b = rotl32(b,30) # 6 - movl -32+4*6(%rsp), %esi # W[n] + addl -64+4*6(%rsp), %edx # e += RCONST + W[n] movl %ebx, %edi # c xorl %ecx, %edi # ^d andl %eax, %edi # &b xorl %ecx, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n] addl %edi, %edx # e += (((c ^ d) & b) ^ d) movl %ebp, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %edx # e += rotl32(a,5) rorl $2, %eax # b = rotl32(b,30) # 7 - movl -32+4*7(%rsp), %esi # W[n] + addl -64+4*7(%rsp), %ecx # e += RCONST + W[n] movl %eax, %edi # c xorl %ebx, %edi # ^d andl %ebp, %edi # &b xorl %ebx, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rcx,%rsi), %ecx # e += RCONST + W[n] addl %edi, %ecx # e += (((c ^ d) & b) ^ d) movl %edx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ecx # e += rotl32(a,5) rorl $2, %ebp # b = rotl32(b,30) +# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) + movaps %xmm3, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) + pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) + punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) + xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm0 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm0, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm0, %xmm0 # shift left by 1 + psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm0, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*0(%rsp) # 8 - # W[n], in %r8 + leal 0x5A827999(%rbx,%r8), %ebx # e += RCONST + W[n] movl %ebp, %edi # c xorl %eax, %edi # ^d andl %edx, %edi # &b xorl %eax, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rbx,%r8), %ebx # e += RCONST + W[n] addl %edi, %ebx # e += (((c ^ d) & b) ^ d) movl %ecx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebx # e += rotl32(a,5) rorl $2, %edx # b = rotl32(b,30) # 9 - # W[n], in %r9 + leal 0x5A827999(%rax,%r9), %eax # e += RCONST + W[n] movl %edx, %edi # c xorl %ebp, %edi # ^d andl %ecx, %edi # &b xorl %ebp, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rax,%r9), %eax # e += RCONST + W[n] addl %edi, %eax # e += (((c ^ d) & b) ^ d) movl %ebx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %eax # e += rotl32(a,5) rorl $2, %ecx # b = rotl32(b,30) # 10 - # W[n], in %r10 + leal 0x5A827999(%rbp,%r10), %ebp # e += RCONST + W[n] movl %ecx, %edi # c xorl %edx, %edi # ^d andl %ebx, %edi # &b xorl %edx, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rbp,%r10), %ebp # e += RCONST + W[n] addl %edi, %ebp # e += (((c ^ d) & b) ^ d) movl %eax, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebp # e += rotl32(a,5) rorl $2, %ebx # b = rotl32(b,30) # 11 - # W[n], in %r11 + leal 0x5A827999(%rdx,%r11), %edx # e += RCONST + W[n] movl %ebx, %edi # c xorl %ecx, %edi # ^d andl %eax, %edi # &b xorl %ecx, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rdx,%r11), %edx # e += 
RCONST + W[n] addl %edi, %edx # e += (((c ^ d) & b) ^ d) movl %ebp, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %edx # e += rotl32(a,5) rorl $2, %eax # b = rotl32(b,30) + movaps rconst0x6ED9EBA1(%rip), %xmm6 +# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) + movaps %xmm0, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) + pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) + punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) + xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm1 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm1, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm1, %xmm1 # shift left by 1 + psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm1, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*1(%rsp) # 12 - # W[n], in %r12 + leal 0x5A827999(%rcx,%r12), %ecx # e += RCONST + W[n] movl %eax, %edi # c xorl %ebx, %edi # ^d andl %ebp, %edi # &b xorl %ebx, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rcx,%r12), %ecx # e += RCONST + W[n] addl %edi, %ecx # e += (((c ^ d) & b) ^ d) movl %edx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ecx # e += rotl32(a,5) rorl $2, %ebp # b = rotl32(b,30) # 13 - # W[n], in %r13 + leal 0x5A827999(%rbx,%r13), %ebx # e += RCONST + W[n] movl %ebp, %edi # c xorl %eax, %edi # ^d andl %edx, %edi # &b xorl %eax, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rbx,%r13), %ebx # e += RCONST + W[n] addl %edi, %ebx # e += (((c ^ d) & b) ^ d) movl %ecx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebx # e += rotl32(a,5) rorl $2, %edx # b = rotl32(b,30) # 14 - # W[n], in %r14 + leal 0x5A827999(%rax,%r14), %eax # e += RCONST + W[n] movl %edx, %edi # c xorl %ebp, %edi # ^d andl %ecx, %edi # &b xorl %ebp, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rax,%r14), %eax # e += RCONST + W[n] addl %edi, %eax # e += (((c ^ d) & b) ^ d) movl %ebx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %eax # e += rotl32(a,5) rorl $2, %ecx # b = rotl32(b,30) # 15 - # W[n], in %r15 + leal 0x5A827999(%rbp,%r15), %ebp # e += RCONST + W[n] movl %ecx, %edi # c xorl %edx, %edi # ^d andl %ebx, %edi # &b xorl %edx, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rbp,%r15), %ebp # e += RCONST + W[n] addl %edi, %ebp # e += (((c ^ d) & b) ^ d) movl %eax, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebp # e += rotl32(a,5) rorl $2, %ebx # b = rotl32(b,30) +# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) + movaps %xmm1, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) + pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) + punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) + xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm2 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm2, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd 
%xmm2, %xmm2 # shift left by 1 + psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm2, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*2(%rsp) # 16 - movl %r13d, %esi # W[(n+13) & 15] - xorl %r8d, %esi # ^W[(n+8) & 15] - xorl -32+4*2(%rsp), %esi # ^W[(n+2) & 15] - xorl -32+4*0(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*0(%rsp) # store to W[n & 15] movl %ebx, %edi # c xorl %ecx, %edi # ^d andl %eax, %edi # &b xorl %ecx, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n & 15] + addl -64+4*0(%rsp), %edx # e += RCONST + W[n & 15] addl %edi, %edx # e += (((c ^ d) & b) ^ d) movl %ebp, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %edx # e += rotl32(a,5) rorl $2, %eax # b = rotl32(b,30) # 17 - movl %r14d, %esi # W[(n+13) & 15] - xorl %r9d, %esi # ^W[(n+8) & 15] - xorl -32+4*3(%rsp), %esi # ^W[(n+2) & 15] - xorl -32+4*1(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*1(%rsp) # store to W[n & 15] movl %eax, %edi # c xorl %ebx, %edi # ^d andl %ebp, %edi # &b xorl %ebx, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] + addl -64+4*1(%rsp), %ecx # e += RCONST + W[n & 15] addl %edi, %ecx # e += (((c ^ d) & b) ^ d) movl %edx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ecx # e += rotl32(a,5) rorl $2, %ebp # b = rotl32(b,30) # 18 - movl %r15d, %esi # W[(n+13) & 15] - xorl %r10d, %esi # ^W[(n+8) & 15] - xorl -32+4*4(%rsp), %esi # ^W[(n+2) & 15] - xorl -32+4*2(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*2(%rsp) # store to W[n & 15] movl %ebp, %edi # c xorl %eax, %edi # ^d andl %edx, %edi # &b xorl %eax, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] + addl -64+4*2(%rsp), %ebx # e += RCONST + W[n & 15] addl %edi, %ebx # e += (((c ^ d) & b) ^ d) movl %ecx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebx # e += rotl32(a,5) rorl $2, %edx # b = rotl32(b,30) # 19 - movl -32+4*0(%rsp), %esi # W[(n+13) & 15] - xorl %r11d, %esi # ^W[(n+8) & 15] - xorl -32+4*5(%rsp), %esi # ^W[(n+2) & 15] - xorl -32+4*3(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*3(%rsp) # store to W[n & 15] movl %edx, %edi # c xorl %ebp, %edi # ^d andl %ecx, %edi # &b xorl %ebp, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rax,%rsi), %eax # e += RCONST + W[n & 15] + addl -64+4*3(%rsp), %eax # e += RCONST + W[n & 15] addl %edi, %eax # e += (((c ^ d) & b) ^ d) movl %ebx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %eax # e += rotl32(a,5) rorl $2, %ecx # b = rotl32(b,30) +# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) + movaps %xmm2, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) + pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) + punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) + xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm3 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm3, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm3, %xmm3 # 
shift left by 1 + psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm3, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*3(%rsp) # 20 - movl -32+4*1(%rsp), %esi # W[(n+13) & 15] - xorl %r12d, %esi # ^W[(n+8) & 15] - xorl -32+4*6(%rsp), %esi # ^W[(n+2) & 15] - xorl -32+4*4(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*4(%rsp) # store to W[n & 15] movl %ecx, %edi # c xorl %edx, %edi # ^d xorl %ebx, %edi # ^b - leal 0x6ED9EBA1(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] + addl -64+4*4(%rsp), %ebp # e += RCONST + W[n & 15] addl %edi, %ebp # e += (c ^ d ^ b) movl %eax, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebp # e += rotl32(a,5) rorl $2, %ebx # b = rotl32(b,30) # 21 - movl -32+4*2(%rsp), %esi # W[(n+13) & 15] - xorl %r13d, %esi # ^W[(n+8) & 15] - xorl -32+4*7(%rsp), %esi # ^W[(n+2) & 15] - xorl -32+4*5(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*5(%rsp) # store to W[n & 15] movl %ebx, %edi # c xorl %ecx, %edi # ^d xorl %eax, %edi # ^b - leal 0x6ED9EBA1(%rdx,%rsi), %edx # e += RCONST + W[n & 15] + addl -64+4*5(%rsp), %edx # e += RCONST + W[n & 15] addl %edi, %edx # e += (c ^ d ^ b) movl %ebp, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %edx # e += rotl32(a,5) rorl $2, %eax # b = rotl32(b,30) # 22 - movl -32+4*3(%rsp), %esi # W[(n+13) & 15] - xorl %r14d, %esi # ^W[(n+8) & 15] - xorl %r8d, %esi # ^W[(n+2) & 15] - xorl -32+4*6(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*6(%rsp) # store to W[n & 15] movl %eax, %edi # c xorl %ebx, %edi # ^d xorl %ebp, %edi # ^b - leal 0x6ED9EBA1(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] + addl -64+4*6(%rsp), %ecx # e += RCONST + W[n & 15] addl %edi, %ecx # e += (c ^ d ^ b) movl %edx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ecx # e += rotl32(a,5) rorl $2, %ebp # b = rotl32(b,30) # 23 - movl -32+4*4(%rsp), %esi # W[(n+13) & 15] - xorl %r15d, %esi # ^W[(n+8) & 15] - xorl %r9d, %esi # ^W[(n+2) & 15] - xorl -32+4*7(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*7(%rsp) # store to W[n & 15] movl %ebp, %edi # c xorl %eax, %edi # ^d xorl %edx, %edi # ^b - leal 0x6ED9EBA1(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] + addl -64+4*7(%rsp), %ebx # e += RCONST + W[n & 15] addl %edi, %ebx # e += (c ^ d ^ b) movl %ecx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebx # e += rotl32(a,5) rorl $2, %edx # b = rotl32(b,30) +# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) + movaps %xmm3, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) + pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) + punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) + xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm0 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm0, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm0, %xmm0 # shift left by 1 + psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) 
+ movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm0, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*0(%rsp) # 24 - xorl -32+4*5(%rsp), %r8d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*0(%rsp), %r8d # ^W[(n+8) & 15] - xorl %r10d, %r8d # ^W[(n+2) & 15] - roll %r8d # movl %edx, %edi # c xorl %ebp, %edi # ^d xorl %ecx, %edi # ^b - leal 0x6ED9EBA1(%rax,%r8), %eax # e += RCONST + W[n & 15] + addl -64+4*8(%rsp), %eax # e += RCONST + W[n & 15] addl %edi, %eax # e += (c ^ d ^ b) movl %ebx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %eax # e += rotl32(a,5) rorl $2, %ecx # b = rotl32(b,30) # 25 - xorl -32+4*6(%rsp), %r9d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*1(%rsp), %r9d # ^W[(n+8) & 15] - xorl %r11d, %r9d # ^W[(n+2) & 15] - roll %r9d # movl %ecx, %edi # c xorl %edx, %edi # ^d xorl %ebx, %edi # ^b - leal 0x6ED9EBA1(%rbp,%r9), %ebp # e += RCONST + W[n & 15] + addl -64+4*9(%rsp), %ebp # e += RCONST + W[n & 15] addl %edi, %ebp # e += (c ^ d ^ b) movl %eax, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebp # e += rotl32(a,5) rorl $2, %ebx # b = rotl32(b,30) # 26 - xorl -32+4*7(%rsp), %r10d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*2(%rsp), %r10d # ^W[(n+8) & 15] - xorl %r12d, %r10d # ^W[(n+2) & 15] - roll %r10d # movl %ebx, %edi # c xorl %ecx, %edi # ^d xorl %eax, %edi # ^b - leal 0x6ED9EBA1(%rdx,%r10), %edx # e += RCONST + W[n & 15] + addl -64+4*10(%rsp), %edx # e += RCONST + W[n & 15] addl %edi, %edx # e += (c ^ d ^ b) movl %ebp, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %edx # e += rotl32(a,5) rorl $2, %eax # b = rotl32(b,30) # 27 - xorl %r8d, %r11d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*3(%rsp), %r11d # ^W[(n+8) & 15] - xorl %r13d, %r11d # ^W[(n+2) & 15] - roll %r11d # movl %eax, %edi # c xorl %ebx, %edi # ^d xorl %ebp, %edi # ^b - leal 0x6ED9EBA1(%rcx,%r11), %ecx # e += RCONST + W[n & 15] + addl -64+4*11(%rsp), %ecx # e += RCONST + W[n & 15] addl %edi, %ecx # e += (c ^ d ^ b) movl %edx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ecx # e += rotl32(a,5) rorl $2, %ebp # b = rotl32(b,30) +# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) + movaps %xmm0, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) + pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) + punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) + xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm1 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm1, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm1, %xmm1 # shift left by 1 + psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm1, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*1(%rsp) # 28 - xorl %r9d, %r12d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*4(%rsp), %r12d # ^W[(n+8) & 15] - xorl %r14d, %r12d # ^W[(n+2) & 15] - 
roll %r12d # movl %ebp, %edi # c xorl %eax, %edi # ^d xorl %edx, %edi # ^b - leal 0x6ED9EBA1(%rbx,%r12), %ebx # e += RCONST + W[n & 15] + addl -64+4*12(%rsp), %ebx # e += RCONST + W[n & 15] addl %edi, %ebx # e += (c ^ d ^ b) movl %ecx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebx # e += rotl32(a,5) rorl $2, %edx # b = rotl32(b,30) # 29 - xorl %r10d, %r13d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*5(%rsp), %r13d # ^W[(n+8) & 15] - xorl %r15d, %r13d # ^W[(n+2) & 15] - roll %r13d # movl %edx, %edi # c xorl %ebp, %edi # ^d xorl %ecx, %edi # ^b - leal 0x6ED9EBA1(%rax,%r13), %eax # e += RCONST + W[n & 15] + addl -64+4*13(%rsp), %eax # e += RCONST + W[n & 15] addl %edi, %eax # e += (c ^ d ^ b) movl %ebx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %eax # e += rotl32(a,5) rorl $2, %ecx # b = rotl32(b,30) # 30 - xorl %r11d, %r14d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*6(%rsp), %r14d # ^W[(n+8) & 15] - xorl -32+4*0(%rsp), %r14d # ^W[(n+2) & 15] - roll %r14d # movl %ecx, %edi # c xorl %edx, %edi # ^d xorl %ebx, %edi # ^b - leal 0x6ED9EBA1(%rbp,%r14), %ebp # e += RCONST + W[n & 15] + addl -64+4*14(%rsp), %ebp # e += RCONST + W[n & 15] addl %edi, %ebp # e += (c ^ d ^ b) movl %eax, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebp # e += rotl32(a,5) rorl $2, %ebx # b = rotl32(b,30) # 31 - xorl %r12d, %r15d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*7(%rsp), %r15d # ^W[(n+8) & 15] - xorl -32+4*1(%rsp), %r15d # ^W[(n+2) & 15] - roll %r15d # movl %ebx, %edi # c xorl %ecx, %edi # ^d xorl %eax, %edi # ^b - leal 0x6ED9EBA1(%rdx,%r15), %edx # e += RCONST + W[n & 15] + addl -64+4*15(%rsp), %edx # e += RCONST + W[n & 15] addl %edi, %edx # e += (c ^ d ^ b) movl %ebp, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %edx # e += rotl32(a,5) rorl $2, %eax # b = rotl32(b,30) + movaps rconst0x8F1BBCDC(%rip), %xmm6 +# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) + movaps %xmm1, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) + pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) + punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) + xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm2 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm2, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm2, %xmm2 # shift left by 1 + psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm2, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*2(%rsp) # 32 - movl %r13d, %esi # W[(n+13) & 15] - xorl %r8d, %esi # ^W[(n+8) & 15] - xorl -32+4*2(%rsp), %esi # ^W[(n+2) & 15] - xorl -32+4*0(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*0(%rsp) # store to W[n & 15] movl %eax, %edi # c xorl %ebx, %edi # ^d xorl %ebp, %edi # ^b - leal 0x6ED9EBA1(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] + addl -64+4*0(%rsp), %ecx # e += RCONST + W[n & 15] addl %edi, %ecx # e += (c ^ d ^ b) movl %edx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ecx # e += rotl32(a,5) rorl $2, %ebp # b = rotl32(b,30) # 33 - movl %r14d, 
%esi # W[(n+13) & 15] - xorl %r9d, %esi # ^W[(n+8) & 15] - xorl -32+4*3(%rsp), %esi # ^W[(n+2) & 15] - xorl -32+4*1(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*1(%rsp) # store to W[n & 15] movl %ebp, %edi # c xorl %eax, %edi # ^d xorl %edx, %edi # ^b - leal 0x6ED9EBA1(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] + addl -64+4*1(%rsp), %ebx # e += RCONST + W[n & 15] addl %edi, %ebx # e += (c ^ d ^ b) movl %ecx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebx # e += rotl32(a,5) rorl $2, %edx # b = rotl32(b,30) # 34 - movl %r15d, %esi # W[(n+13) & 15] - xorl %r10d, %esi # ^W[(n+8) & 15] - xorl -32+4*4(%rsp), %esi # ^W[(n+2) & 15] - xorl -32+4*2(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*2(%rsp) # store to W[n & 15] movl %edx, %edi # c xorl %ebp, %edi # ^d xorl %ecx, %edi # ^b - leal 0x6ED9EBA1(%rax,%rsi), %eax # e += RCONST + W[n & 15] + addl -64+4*2(%rsp), %eax # e += RCONST + W[n & 15] addl %edi, %eax # e += (c ^ d ^ b) movl %ebx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %eax # e += rotl32(a,5) rorl $2, %ecx # b = rotl32(b,30) # 35 - movl -32+4*0(%rsp), %esi # W[(n+13) & 15] - xorl %r11d, %esi # ^W[(n+8) & 15] - xorl -32+4*5(%rsp), %esi # ^W[(n+2) & 15] - xorl -32+4*3(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*3(%rsp) # store to W[n & 15] movl %ecx, %edi # c xorl %edx, %edi # ^d xorl %ebx, %edi # ^b - leal 0x6ED9EBA1(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] + addl -64+4*3(%rsp), %ebp # e += RCONST + W[n & 15] addl %edi, %ebp # e += (c ^ d ^ b) movl %eax, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebp # e += rotl32(a,5) rorl $2, %ebx # b = rotl32(b,30) +# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) + movaps %xmm2, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) + pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) + punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) + xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm3 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm3, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm3, %xmm3 # shift left by 1 + psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm3, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*3(%rsp) # 36 - movl -32+4*1(%rsp), %esi # W[(n+13) & 15] - xorl %r12d, %esi # ^W[(n+8) & 15] - xorl -32+4*6(%rsp), %esi # ^W[(n+2) & 15] - xorl -32+4*4(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*4(%rsp) # store to W[n & 15] movl %ebx, %edi # c xorl %ecx, %edi # ^d xorl %eax, %edi # ^b - leal 0x6ED9EBA1(%rdx,%rsi), %edx # e += RCONST + W[n & 15] + addl -64+4*4(%rsp), %edx # e += RCONST + W[n & 15] addl %edi, %edx # e += (c ^ d ^ b) movl %ebp, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %edx # e += rotl32(a,5) rorl $2, %eax # b = rotl32(b,30) # 37 - movl -32+4*2(%rsp), %esi # W[(n+13) & 15] - xorl %r13d, %esi # ^W[(n+8) & 15] - xorl -32+4*7(%rsp), %esi # ^W[(n+2) & 15] - xorl -32+4*5(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, 
-32+4*5(%rsp) # store to W[n & 15] movl %eax, %edi # c xorl %ebx, %edi # ^d xorl %ebp, %edi # ^b - leal 0x6ED9EBA1(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] + addl -64+4*5(%rsp), %ecx # e += RCONST + W[n & 15] addl %edi, %ecx # e += (c ^ d ^ b) movl %edx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ecx # e += rotl32(a,5) rorl $2, %ebp # b = rotl32(b,30) # 38 - movl -32+4*3(%rsp), %esi # W[(n+13) & 15] - xorl %r14d, %esi # ^W[(n+8) & 15] - xorl %r8d, %esi # ^W[(n+2) & 15] - xorl -32+4*6(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*6(%rsp) # store to W[n & 15] movl %ebp, %edi # c xorl %eax, %edi # ^d xorl %edx, %edi # ^b - leal 0x6ED9EBA1(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] + addl -64+4*6(%rsp), %ebx # e += RCONST + W[n & 15] addl %edi, %ebx # e += (c ^ d ^ b) movl %ecx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebx # e += rotl32(a,5) rorl $2, %edx # b = rotl32(b,30) # 39 - movl -32+4*4(%rsp), %esi # W[(n+13) & 15] - xorl %r15d, %esi # ^W[(n+8) & 15] - xorl %r9d, %esi # ^W[(n+2) & 15] - xorl -32+4*7(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*7(%rsp) # store to W[n & 15] movl %edx, %edi # c xorl %ebp, %edi # ^d xorl %ecx, %edi # ^b - leal 0x6ED9EBA1(%rax,%rsi), %eax # e += RCONST + W[n & 15] + addl -64+4*7(%rsp), %eax # e += RCONST + W[n & 15] addl %edi, %eax # e += (c ^ d ^ b) movl %ebx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %eax # e += rotl32(a,5) rorl $2, %ecx # b = rotl32(b,30) +# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) + movaps %xmm3, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) + pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) + punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) + xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm0 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm0, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm0, %xmm0 # shift left by 1 + psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm0, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*0(%rsp) # 40 movl %ebx, %edi # di: b movl %ebx, %esi # si: b @@ -625,12 +741,8 @@ sha1_process_block64: andl %ecx, %esi # si: b & c andl %edx, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) - xorl -32+4*5(%rsp), %r8d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*0(%rsp), %r8d # ^W[(n+8) & 15] - xorl %r10d, %r8d # ^W[(n+2) & 15] - roll %r8d # addl %edi, %ebp # += ((b | c) & d) | (b & c) - leal -0x70E44324(%rbp,%r8), %ebp # e += RCONST + W[n & 15] + addl -64+4*8(%rsp), %ebp # e += RCONST + W[n & 15] movl %eax, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebp # e += rotl32(a,5) @@ -642,12 +754,8 @@ sha1_process_block64: andl %ebx, %esi # si: b & c andl %ecx, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) - xorl -32+4*6(%rsp), %r9d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*1(%rsp), %r9d # ^W[(n+8) & 15] - xorl %r11d, %r9d # ^W[(n+2) & 15] - roll %r9d # addl %edi, %edx # += ((b | c) & d) | (b & c) - leal 
-0x70E44324(%rdx,%r9), %edx # e += RCONST + W[n & 15] + addl -64+4*9(%rsp), %edx # e += RCONST + W[n & 15] movl %ebp, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %edx # e += rotl32(a,5) @@ -659,12 +767,8 @@ sha1_process_block64: andl %eax, %esi # si: b & c andl %ebx, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) - xorl -32+4*7(%rsp), %r10d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*2(%rsp), %r10d # ^W[(n+8) & 15] - xorl %r12d, %r10d # ^W[(n+2) & 15] - roll %r10d # addl %edi, %ecx # += ((b | c) & d) | (b & c) - leal -0x70E44324(%rcx,%r10), %ecx # e += RCONST + W[n & 15] + addl -64+4*10(%rsp), %ecx # e += RCONST + W[n & 15] movl %edx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ecx # e += rotl32(a,5) @@ -676,16 +780,37 @@ sha1_process_block64: andl %ebp, %esi # si: b & c andl %eax, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) - xorl %r8d, %r11d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*3(%rsp), %r11d # ^W[(n+8) & 15] - xorl %r13d, %r11d # ^W[(n+2) & 15] - roll %r11d # addl %edi, %ebx # += ((b | c) & d) | (b & c) - leal -0x70E44324(%rbx,%r11), %ebx # e += RCONST + W[n & 15] + addl -64+4*11(%rsp), %ebx # e += RCONST + W[n & 15] movl %ecx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebx # e += rotl32(a,5) rorl $2, %edx # b = rotl32(b,30) +# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) + movaps %xmm0, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) + pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) + punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) + xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm1 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm1, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm1, %xmm1 # shift left by 1 + psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm1, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*1(%rsp) # 44 movl %ecx, %edi # di: b movl %ecx, %esi # si: b @@ -693,12 +818,8 @@ sha1_process_block64: andl %edx, %esi # si: b & c andl %ebp, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) - xorl %r9d, %r12d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*4(%rsp), %r12d # ^W[(n+8) & 15] - xorl %r14d, %r12d # ^W[(n+2) & 15] - roll %r12d # addl %edi, %eax # += ((b | c) & d) | (b & c) - leal -0x70E44324(%rax,%r12), %eax # e += RCONST + W[n & 15] + addl -64+4*12(%rsp), %eax # e += RCONST + W[n & 15] movl %ebx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %eax # e += rotl32(a,5) @@ -710,12 +831,8 @@ sha1_process_block64: andl %ecx, %esi # si: b & c andl %edx, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) - xorl %r10d, %r13d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*5(%rsp), %r13d # ^W[(n+8) & 15] - xorl %r15d, %r13d # ^W[(n+2) & 15] - roll %r13d # addl %edi, %ebp # += ((b | c) & d) | (b & c) - leal -0x70E44324(%rbp,%r13), %ebp # e += RCONST + W[n & 15] + addl -64+4*13(%rsp), %ebp # e += RCONST + W[n & 15] movl %eax, %esi # roll $5, %esi # rotl32(a,5) addl %esi, 
%ebp # e += rotl32(a,5) @@ -727,12 +844,8 @@ sha1_process_block64: andl %ebx, %esi # si: b & c andl %ecx, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) - xorl %r11d, %r14d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*6(%rsp), %r14d # ^W[(n+8) & 15] - xorl -32+4*0(%rsp), %r14d # ^W[(n+2) & 15] - roll %r14d # addl %edi, %edx # += ((b | c) & d) | (b & c) - leal -0x70E44324(%rdx,%r14), %edx # e += RCONST + W[n & 15] + addl -64+4*14(%rsp), %edx # e += RCONST + W[n & 15] movl %ebp, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %edx # e += rotl32(a,5) @@ -744,16 +857,37 @@ sha1_process_block64: andl %eax, %esi # si: b & c andl %ebx, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) - xorl %r12d, %r15d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*7(%rsp), %r15d # ^W[(n+8) & 15] - xorl -32+4*1(%rsp), %r15d # ^W[(n+2) & 15] - roll %r15d # addl %edi, %ecx # += ((b | c) & d) | (b & c) - leal -0x70E44324(%rcx,%r15), %ecx # e += RCONST + W[n & 15] + addl -64+4*15(%rsp), %ecx # e += RCONST + W[n & 15] movl %edx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ecx # e += rotl32(a,5) rorl $2, %ebp # b = rotl32(b,30) +# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) + movaps %xmm1, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) + pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) + punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) + xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm2 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm2, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm2, %xmm2 # shift left by 1 + psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm2, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*2(%rsp) # 48 movl %edx, %edi # di: b movl %edx, %esi # si: b @@ -761,14 +895,8 @@ sha1_process_block64: andl %ebp, %esi # si: b & c andl %eax, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) - movl %r13d, %esi # W[(n+13) & 15] - xorl %r8d, %esi # ^W[(n+8) & 15] - xorl -32+4*2(%rsp), %esi # ^W[(n+2) & 15] - xorl -32+4*0(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*0(%rsp) # store to W[n & 15] addl %edi, %ebx # += ((b | c) & d) | (b & c) - leal -0x70E44324(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] + addl -64+4*0(%rsp), %ebx # e += RCONST + W[n & 15] movl %ecx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebx # e += rotl32(a,5) @@ -780,14 +908,8 @@ sha1_process_block64: andl %edx, %esi # si: b & c andl %ebp, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) - movl %r14d, %esi # W[(n+13) & 15] - xorl %r9d, %esi # ^W[(n+8) & 15] - xorl -32+4*3(%rsp), %esi # ^W[(n+2) & 15] - xorl -32+4*1(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*1(%rsp) # store to W[n & 15] addl %edi, %eax # += ((b | c) & d) | (b & c) - leal -0x70E44324(%rax,%rsi), %eax # e += RCONST + W[n & 15] + addl -64+4*1(%rsp), %eax # e += RCONST + W[n & 15] movl %ebx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %eax # e += 
rotl32(a,5) @@ -799,14 +921,8 @@ sha1_process_block64: andl %ecx, %esi # si: b & c andl %edx, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) - movl %r15d, %esi # W[(n+13) & 15] - xorl %r10d, %esi # ^W[(n+8) & 15] - xorl -32+4*4(%rsp), %esi # ^W[(n+2) & 15] - xorl -32+4*2(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*2(%rsp) # store to W[n & 15] addl %edi, %ebp # += ((b | c) & d) | (b & c) - leal -0x70E44324(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] + addl -64+4*2(%rsp), %ebp # e += RCONST + W[n & 15] movl %eax, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebp # e += rotl32(a,5) @@ -818,18 +934,38 @@ sha1_process_block64: andl %ebx, %esi # si: b & c andl %ecx, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) - movl -32+4*0(%rsp), %esi # W[(n+13) & 15] - xorl %r11d, %esi # ^W[(n+8) & 15] - xorl -32+4*5(%rsp), %esi # ^W[(n+2) & 15] - xorl -32+4*3(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*3(%rsp) # store to W[n & 15] addl %edi, %edx # += ((b | c) & d) | (b & c) - leal -0x70E44324(%rdx,%rsi), %edx # e += RCONST + W[n & 15] + addl -64+4*3(%rsp), %edx # e += RCONST + W[n & 15] movl %ebp, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %edx # e += rotl32(a,5) rorl $2, %eax # b = rotl32(b,30) + movaps rconst0xCA62C1D6(%rip), %xmm6 +# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) + movaps %xmm2, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) + pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) + punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) + xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm3 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm3, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm3, %xmm3 # shift left by 1 + psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm3, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*3(%rsp) # 52 movl %ebp, %edi # di: b movl %ebp, %esi # si: b @@ -837,14 +973,8 @@ sha1_process_block64: andl %eax, %esi # si: b & c andl %ebx, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) - movl -32+4*1(%rsp), %esi # W[(n+13) & 15] - xorl %r12d, %esi # ^W[(n+8) & 15] - xorl -32+4*6(%rsp), %esi # ^W[(n+2) & 15] - xorl -32+4*4(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*4(%rsp) # store to W[n & 15] addl %edi, %ecx # += ((b | c) & d) | (b & c) - leal -0x70E44324(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] + addl -64+4*4(%rsp), %ecx # e += RCONST + W[n & 15] movl %edx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ecx # e += rotl32(a,5) @@ -856,14 +986,8 @@ sha1_process_block64: andl %ebp, %esi # si: b & c andl %eax, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) - movl -32+4*2(%rsp), %esi # W[(n+13) & 15] - xorl %r13d, %esi # ^W[(n+8) & 15] - xorl -32+4*7(%rsp), %esi # ^W[(n+2) & 15] - xorl -32+4*5(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*5(%rsp) # store to W[n & 15] addl %edi, %ebx # += ((b | c) & d) | (b & c) - leal 
-0x70E44324(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] + addl -64+4*5(%rsp), %ebx # e += RCONST + W[n & 15] movl %ecx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebx # e += rotl32(a,5) @@ -875,14 +999,8 @@ sha1_process_block64: andl %edx, %esi # si: b & c andl %ebp, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) - movl -32+4*3(%rsp), %esi # W[(n+13) & 15] - xorl %r14d, %esi # ^W[(n+8) & 15] - xorl %r8d, %esi # ^W[(n+2) & 15] - xorl -32+4*6(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*6(%rsp) # store to W[n & 15] addl %edi, %eax # += ((b | c) & d) | (b & c) - leal -0x70E44324(%rax,%rsi), %eax # e += RCONST + W[n & 15] + addl -64+4*6(%rsp), %eax # e += RCONST + W[n & 15] movl %ebx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %eax # e += rotl32(a,5) @@ -894,18 +1012,37 @@ sha1_process_block64: andl %ecx, %esi # si: b & c andl %edx, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) - movl -32+4*4(%rsp), %esi # W[(n+13) & 15] - xorl %r15d, %esi # ^W[(n+8) & 15] - xorl %r9d, %esi # ^W[(n+2) & 15] - xorl -32+4*7(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*7(%rsp) # store to W[n & 15] addl %edi, %ebp # += ((b | c) & d) | (b & c) - leal -0x70E44324(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] + addl -64+4*7(%rsp), %ebp # e += RCONST + W[n & 15] movl %eax, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebp # e += rotl32(a,5) rorl $2, %ebx # b = rotl32(b,30) +# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) + movaps %xmm3, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) + pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) + punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) + xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm0 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm0, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm0, %xmm0 # shift left by 1 + psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm0, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*0(%rsp) # 56 movl %eax, %edi # di: b movl %eax, %esi # si: b @@ -913,12 +1050,8 @@ sha1_process_block64: andl %ebx, %esi # si: b & c andl %ecx, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) - xorl -32+4*5(%rsp), %r8d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*0(%rsp), %r8d # ^W[(n+8) & 15] - xorl %r10d, %r8d # ^W[(n+2) & 15] - roll %r8d # addl %edi, %edx # += ((b | c) & d) | (b & c) - leal -0x70E44324(%rdx,%r8), %edx # e += RCONST + W[n & 15] + addl -64+4*8(%rsp), %edx # e += RCONST + W[n & 15] movl %ebp, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %edx # e += rotl32(a,5) @@ -930,12 +1063,8 @@ sha1_process_block64: andl %eax, %esi # si: b & c andl %ebx, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) - xorl -32+4*6(%rsp), %r9d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*1(%rsp), %r9d # ^W[(n+8) & 15] - xorl %r11d, %r9d # ^W[(n+2) & 15] - roll %r9d # addl %edi, %ecx # += ((b | c) & d) | (b & c) - leal -0x70E44324(%rcx,%r9), 
%ecx # e += RCONST + W[n & 15] + addl -64+4*9(%rsp), %ecx # e += RCONST + W[n & 15] movl %edx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ecx # e += rotl32(a,5) @@ -947,12 +1076,8 @@ sha1_process_block64: andl %ebp, %esi # si: b & c andl %eax, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) - xorl -32+4*7(%rsp), %r10d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*2(%rsp), %r10d # ^W[(n+8) & 15] - xorl %r12d, %r10d # ^W[(n+2) & 15] - roll %r10d # addl %edi, %ebx # += ((b | c) & d) | (b & c) - leal -0x70E44324(%rbx,%r10), %ebx # e += RCONST + W[n & 15] + addl -64+4*10(%rsp), %ebx # e += RCONST + W[n & 15] movl %ecx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebx # e += rotl32(a,5) @@ -964,307 +1089,282 @@ sha1_process_block64: andl %edx, %esi # si: b & c andl %ebp, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) - xorl %r8d, %r11d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*3(%rsp), %r11d # ^W[(n+8) & 15] - xorl %r13d, %r11d # ^W[(n+2) & 15] - roll %r11d # addl %edi, %eax # += ((b | c) & d) | (b & c) - leal -0x70E44324(%rax,%r11), %eax # e += RCONST + W[n & 15] + addl -64+4*11(%rsp), %eax # e += RCONST + W[n & 15] movl %ebx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %eax # e += rotl32(a,5) rorl $2, %ecx # b = rotl32(b,30) +# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) + movaps %xmm0, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) + pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) + punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) + xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm1 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm1, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm1, %xmm1 # shift left by 1 + psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm1, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*1(%rsp) # 60 - xorl %r9d, %r12d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*4(%rsp), %r12d # ^W[(n+8) & 15] - xorl %r14d, %r12d # ^W[(n+2) & 15] - roll %r12d # movl %ecx, %edi # c xorl %edx, %edi # ^d xorl %ebx, %edi # ^b - leal -0x359D3E2A(%rbp,%r12), %ebp # e += RCONST + W[n & 15] + addl -64+4*12(%rsp), %ebp # e += RCONST + W[n & 15] addl %edi, %ebp # e += (c ^ d ^ b) movl %eax, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebp # e += rotl32(a,5) rorl $2, %ebx # b = rotl32(b,30) # 61 - xorl %r10d, %r13d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*5(%rsp), %r13d # ^W[(n+8) & 15] - xorl %r15d, %r13d # ^W[(n+2) & 15] - roll %r13d # movl %ebx, %edi # c xorl %ecx, %edi # ^d xorl %eax, %edi # ^b - leal -0x359D3E2A(%rdx,%r13), %edx # e += RCONST + W[n & 15] + addl -64+4*13(%rsp), %edx # e += RCONST + W[n & 15] addl %edi, %edx # e += (c ^ d ^ b) movl %ebp, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %edx # e += rotl32(a,5) rorl $2, %eax # b = rotl32(b,30) # 62 - xorl %r11d, %r14d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*6(%rsp), %r14d # ^W[(n+8) & 15] - xorl -32+4*0(%rsp), %r14d # ^W[(n+2) & 15] - roll %r14d # 
movl %eax, %edi # c xorl %ebx, %edi # ^d xorl %ebp, %edi # ^b - leal -0x359D3E2A(%rcx,%r14), %ecx # e += RCONST + W[n & 15] + addl -64+4*14(%rsp), %ecx # e += RCONST + W[n & 15] addl %edi, %ecx # e += (c ^ d ^ b) movl %edx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ecx # e += rotl32(a,5) rorl $2, %ebp # b = rotl32(b,30) # 63 - xorl %r12d, %r15d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*7(%rsp), %r15d # ^W[(n+8) & 15] - xorl -32+4*1(%rsp), %r15d # ^W[(n+2) & 15] - roll %r15d # movl %ebp, %edi # c xorl %eax, %edi # ^d xorl %edx, %edi # ^b - leal -0x359D3E2A(%rbx,%r15), %ebx # e += RCONST + W[n & 15] + addl -64+4*15(%rsp), %ebx # e += RCONST + W[n & 15] addl %edi, %ebx # e += (c ^ d ^ b) movl %ecx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebx # e += rotl32(a,5) rorl $2, %edx # b = rotl32(b,30) +# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) + movaps %xmm1, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) + pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) + punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) + xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm2 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm2, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm2, %xmm2 # shift left by 1 + psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm2, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*2(%rsp) # 64 - movl %r13d, %esi # W[(n+13) & 15] - xorl %r8d, %esi # ^W[(n+8) & 15] - xorl -32+4*2(%rsp), %esi # ^W[(n+2) & 15] - xorl -32+4*0(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*0(%rsp) # store to W[n & 15] movl %edx, %edi # c xorl %ebp, %edi # ^d xorl %ecx, %edi # ^b - leal -0x359D3E2A(%rax,%rsi), %eax # e += RCONST + W[n & 15] + addl -64+4*0(%rsp), %eax # e += RCONST + W[n & 15] addl %edi, %eax # e += (c ^ d ^ b) movl %ebx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %eax # e += rotl32(a,5) rorl $2, %ecx # b = rotl32(b,30) # 65 - movl %r14d, %esi # W[(n+13) & 15] - xorl %r9d, %esi # ^W[(n+8) & 15] - xorl -32+4*3(%rsp), %esi # ^W[(n+2) & 15] - xorl -32+4*1(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*1(%rsp) # store to W[n & 15] movl %ecx, %edi # c xorl %edx, %edi # ^d xorl %ebx, %edi # ^b - leal -0x359D3E2A(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] + addl -64+4*1(%rsp), %ebp # e += RCONST + W[n & 15] addl %edi, %ebp # e += (c ^ d ^ b) movl %eax, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebp # e += rotl32(a,5) rorl $2, %ebx # b = rotl32(b,30) # 66 - movl %r15d, %esi # W[(n+13) & 15] - xorl %r10d, %esi # ^W[(n+8) & 15] - xorl -32+4*4(%rsp), %esi # ^W[(n+2) & 15] - xorl -32+4*2(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*2(%rsp) # store to W[n & 15] movl %ebx, %edi # c xorl %ecx, %edi # ^d xorl %eax, %edi # ^b - leal -0x359D3E2A(%rdx,%rsi), %edx # e += RCONST + W[n & 15] + addl -64+4*2(%rsp), %edx # e += RCONST + W[n & 15] addl %edi, %edx # e += (c ^ d ^ b) movl %ebp, %esi # roll $5, %esi # rotl32(a,5) addl 
%esi, %edx # e += rotl32(a,5) rorl $2, %eax # b = rotl32(b,30) # 67 - movl -32+4*0(%rsp), %esi # W[(n+13) & 15] - xorl %r11d, %esi # ^W[(n+8) & 15] - xorl -32+4*5(%rsp), %esi # ^W[(n+2) & 15] - xorl -32+4*3(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*3(%rsp) # store to W[n & 15] movl %eax, %edi # c xorl %ebx, %edi # ^d xorl %ebp, %edi # ^b - leal -0x359D3E2A(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] + addl -64+4*3(%rsp), %ecx # e += RCONST + W[n & 15] addl %edi, %ecx # e += (c ^ d ^ b) movl %edx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ecx # e += rotl32(a,5) rorl $2, %ebp # b = rotl32(b,30) +# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) + movaps %xmm2, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) + pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) + punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) + xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm3 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm3, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm3, %xmm3 # shift left by 1 + psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm3, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*3(%rsp) # 68 - movl -32+4*1(%rsp), %esi # W[(n+13) & 15] - xorl %r12d, %esi # ^W[(n+8) & 15] - xorl -32+4*6(%rsp), %esi # ^W[(n+2) & 15] - xorl -32+4*4(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*4(%rsp) # store to W[n & 15] movl %ebp, %edi # c xorl %eax, %edi # ^d xorl %edx, %edi # ^b - leal -0x359D3E2A(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] + addl -64+4*4(%rsp), %ebx # e += RCONST + W[n & 15] addl %edi, %ebx # e += (c ^ d ^ b) movl %ecx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebx # e += rotl32(a,5) rorl $2, %edx # b = rotl32(b,30) # 69 - movl -32+4*2(%rsp), %esi # W[(n+13) & 15] - xorl %r13d, %esi # ^W[(n+8) & 15] - xorl -32+4*7(%rsp), %esi # ^W[(n+2) & 15] - xorl -32+4*5(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*5(%rsp) # store to W[n & 15] movl %edx, %edi # c xorl %ebp, %edi # ^d xorl %ecx, %edi # ^b - leal -0x359D3E2A(%rax,%rsi), %eax # e += RCONST + W[n & 15] + addl -64+4*5(%rsp), %eax # e += RCONST + W[n & 15] addl %edi, %eax # e += (c ^ d ^ b) movl %ebx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %eax # e += rotl32(a,5) rorl $2, %ecx # b = rotl32(b,30) # 70 - movl -32+4*3(%rsp), %esi # W[(n+13) & 15] - xorl %r14d, %esi # ^W[(n+8) & 15] - xorl %r8d, %esi # ^W[(n+2) & 15] - xorl -32+4*6(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*6(%rsp) # store to W[n & 15] movl %ecx, %edi # c xorl %edx, %edi # ^d xorl %ebx, %edi # ^b - leal -0x359D3E2A(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] + addl -64+4*6(%rsp), %ebp # e += RCONST + W[n & 15] addl %edi, %ebp # e += (c ^ d ^ b) movl %eax, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebp # e += rotl32(a,5) rorl $2, %ebx # b = rotl32(b,30) # 71 - movl -32+4*4(%rsp), %esi # W[(n+13) & 15] - xorl %r15d, %esi # ^W[(n+8) & 15] - xorl %r9d, %esi # ^W[(n+2) 
& 15] - xorl -32+4*7(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*7(%rsp) # store to W[n & 15] movl %ebx, %edi # c xorl %ecx, %edi # ^d xorl %eax, %edi # ^b - leal -0x359D3E2A(%rdx,%rsi), %edx # e += RCONST + W[n & 15] + addl -64+4*7(%rsp), %edx # e += RCONST + W[n & 15] addl %edi, %edx # e += (c ^ d ^ b) movl %ebp, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %edx # e += rotl32(a,5) rorl $2, %eax # b = rotl32(b,30) # 72 - xorl -32+4*5(%rsp), %r8d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*0(%rsp), %r8d # ^W[(n+8) & 15] - xorl %r10d, %r8d # ^W[(n+2) & 15] - roll %r8d # movl %eax, %edi # c xorl %ebx, %edi # ^d xorl %ebp, %edi # ^b - leal -0x359D3E2A(%rcx,%r8), %ecx # e += RCONST + W[n & 15] + addl -64+4*8(%rsp), %ecx # e += RCONST + W[n & 15] addl %edi, %ecx # e += (c ^ d ^ b) movl %edx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ecx # e += rotl32(a,5) rorl $2, %ebp # b = rotl32(b,30) # 73 - xorl -32+4*6(%rsp), %r9d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*1(%rsp), %r9d # ^W[(n+8) & 15] - xorl %r11d, %r9d # ^W[(n+2) & 15] - roll %r9d # movl %ebp, %edi # c xorl %eax, %edi # ^d xorl %edx, %edi # ^b - leal -0x359D3E2A(%rbx,%r9), %ebx # e += RCONST + W[n & 15] + addl -64+4*9(%rsp), %ebx # e += RCONST + W[n & 15] addl %edi, %ebx # e += (c ^ d ^ b) movl %ecx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebx # e += rotl32(a,5) rorl $2, %edx # b = rotl32(b,30) # 74 - xorl -32+4*7(%rsp), %r10d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*2(%rsp), %r10d # ^W[(n+8) & 15] - xorl %r12d, %r10d # ^W[(n+2) & 15] - roll %r10d # movl %edx, %edi # c xorl %ebp, %edi # ^d xorl %ecx, %edi # ^b - leal -0x359D3E2A(%rax,%r10), %eax # e += RCONST + W[n & 15] + addl -64+4*10(%rsp), %eax # e += RCONST + W[n & 15] addl %edi, %eax # e += (c ^ d ^ b) movl %ebx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %eax # e += rotl32(a,5) rorl $2, %ecx # b = rotl32(b,30) # 75 - xorl %r8d, %r11d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*3(%rsp), %r11d # ^W[(n+8) & 15] - xorl %r13d, %r11d # ^W[(n+2) & 15] - roll %r11d # movl %ecx, %edi # c xorl %edx, %edi # ^d xorl %ebx, %edi # ^b - leal -0x359D3E2A(%rbp,%r11), %ebp # e += RCONST + W[n & 15] + addl -64+4*11(%rsp), %ebp # e += RCONST + W[n & 15] addl %edi, %ebp # e += (c ^ d ^ b) movl %eax, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebp # e += rotl32(a,5) rorl $2, %ebx # b = rotl32(b,30) # 76 - xorl %r9d, %r12d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*4(%rsp), %r12d # ^W[(n+8) & 15] - xorl %r14d, %r12d # ^W[(n+2) & 15] - roll %r12d # movl %ebx, %edi # c xorl %ecx, %edi # ^d xorl %eax, %edi # ^b - leal -0x359D3E2A(%rdx,%r12), %edx # e += RCONST + W[n & 15] + addl -64+4*12(%rsp), %edx # e += RCONST + W[n & 15] addl %edi, %edx # e += (c ^ d ^ b) movl %ebp, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %edx # e += rotl32(a,5) rorl $2, %eax # b = rotl32(b,30) # 77 - xorl %r10d, %r13d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*5(%rsp), %r13d # ^W[(n+8) & 15] - xorl %r15d, %r13d # ^W[(n+2) & 15] - roll %r13d # movl %eax, %edi # c xorl %ebx, %edi # ^d xorl %ebp, %edi # ^b - leal -0x359D3E2A(%rcx,%r13), %ecx # e += RCONST + W[n & 15] + addl -64+4*13(%rsp), %ecx # e += RCONST + W[n & 15] addl %edi, %ecx # e += (c ^ d ^ b) movl %edx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ecx # e += rotl32(a,5) rorl $2, %ebp # b = rotl32(b,30) # 78 - xorl %r11d, %r14d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*6(%rsp), %r14d # ^W[(n+8) & 15] - xorl -32+4*0(%rsp), %r14d # ^W[(n+2) & 15] - roll %r14d # movl %ebp, %edi # c xorl %eax, %edi # ^d xorl %edx, %edi # ^b - leal 
-0x359D3E2A(%rbx,%r14), %ebx # e += RCONST + W[n & 15] + addl -64+4*14(%rsp), %ebx # e += RCONST + W[n & 15] addl %edi, %ebx # e += (c ^ d ^ b) movl %ecx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebx # e += rotl32(a,5) rorl $2, %edx # b = rotl32(b,30) # 79 - xorl %r12d, %r15d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*7(%rsp), %r15d # ^W[(n+8) & 15] - xorl -32+4*1(%rsp), %r15d # ^W[(n+2) & 15] - roll %r15d # movl %edx, %edi # c xorl %ebp, %edi # ^d xorl %ecx, %edi # ^b - leal -0x359D3E2A(%rax,%r15), %eax # e += RCONST + W[n & 15] + addl -64+4*15(%rsp), %eax # e += RCONST + W[n & 15] addl %edi, %eax # e += (c ^ d ^ b) movl %ebx, %esi # roll $5, %esi # rotl32(a,5) @@ -1286,4 +1386,28 @@ sha1_process_block64: ret .size sha1_process_block64, .-sha1_process_block64 + + .section .rodata.cst16.sha1const, "aM", @progbits, 16 + .align 16 +rconst0x5A827999: + .long 0x5A827999 + .long 0x5A827999 + .long 0x5A827999 + .long 0x5A827999 +rconst0x6ED9EBA1: + .long 0x6ED9EBA1 + .long 0x6ED9EBA1 + .long 0x6ED9EBA1 + .long 0x6ED9EBA1 +rconst0x8F1BBCDC: + .long 0x8F1BBCDC + .long 0x8F1BBCDC + .long 0x8F1BBCDC + .long 0x8F1BBCDC +rconst0xCA62C1D6: + .long 0xCA62C1D6 + .long 0xCA62C1D6 + .long 0xCA62C1D6 + .long 0xCA62C1D6 + #endif diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh index 901896e6e..87c2d0800 100755 --- a/libbb/hash_md5_sha_x86-64.S.sh +++ b/libbb/hash_md5_sha_x86-64.S.sh @@ -6,33 +6,103 @@ # also contains the diff of the generated file. exec >hash_md5_sha_x86-64.S -# There is a way to use XMM registers (which always exist for x86-64!) for W[] -# For example, if we load W as follows: -# %xmm0: w[0x0] w[0x1] w[0x2] w[0x3] -# %xmm4: w[0x4] w[0x5] w[0x6] w[0x7] -# %xmm8: w[0x8] w[0x9] w[0xa] w[0xb] -# %xmm12: w[0xc] w[0xd] w[0xe] w[0xf] -# then the xor'ing operation to generate next W[0..3] is: -# movaps %xmm0, %xmmT2 -# palignr $0x8, %xmm4, %xmmT2 # form (w[0x2],w[0x3],w[0x4],w[0x5]) -# # Right-shifts xmm4:xmmT2 by 8 bytes. Writes shifted result to xmmT2. SSSE3 insn. -# movaps %xmm0, %xmmT13 -# palignr $0x4,%xmm0,%xmmT13 # form (w[0xd],w[0xe],w[0xf],w[0x0]) -# xmm0 = xmm0 ^ t2 ^ xmm8 ^ t13 -# xmm0 = rol32(xmm0,1) # no such insn, have to use pslld+psrld+or -# and then results can be extracted for use: -# movd %xmm0, %esi # new W[0] -# pextrd $1, %xmm0, %esi # new W[1] -# # SSE4.1 insn. Can use EXTRACTPS (also SSE4.1) -# pextrd $2, %xmm0, %esi # new W[2] -# pextrd $3, %xmm0, %esi # new W[3] -# ... but this requires SSE4.1 and SSSE3, which are not universally available on x86-64. +# Based on http://arctic.org/~dean/crypto/sha1.html. +# ("This SHA1 implementation is public domain.") +# +# x86-64 has at least SSE2 vector insns always available. +# We can use them without any CPUID checks (and without a need +# for a fallback code if needed insns are not available). +# This code uses them to calculate W[] ahead of time. +# +# Unfortunately, results are passed from vector unit to +# integer ALUs on the stack. MOVD/Q insns to move them directly +# from vector to integer registers are slower than store-to-load +# forwarding in LSU (on Skylake at least). +# +# The win against a purely integer code is small on Skylake, +# only about 7-8%. We offload about 1/3 of our operations to the vector unit. 
+# It can do 4 ops at once in one 128-bit register, +# but we have to use x2 of them because of W[0] complication, +# SSE2 has no "rotate each word by N bits" insns, +# moving data to/from vector unit is clunky, and Skylake +# has four integer ALUs unified with three vector ALUs, +# which makes pure integer code rather fast, and makes +# vector ops compete with integer ones. +# +# Zen3, with its separate vector ALUs, wins more, about 12%. + +xmmT1="%xmm4" +xmmT2="%xmm5" +xmmRCONST="%xmm6" +T=`printf '\t'` + +# SSE instructions are longer than 4 bytes on average. +# Intel CPUs (up to Tiger Lake at least) can't decode +# more than 16 bytes of code in one cycle. +# By interleaving SSE code and integer code +# we mostly achieve a situation where 16-byte decode fetch window +# contains 4 (or more) insns. +# +# However. On Skylake, there was no observed difference, +# but on Zen3, non-interleaved code is ~3% faster +# (822 Mb/s versus 795 Mb/s hashing speed). +# Off for now: +interleave=false + +INTERLEAVE() { + $interleave || \ + { + # Generate non-interleaved code + # (it should work correctly too) + echo "$1" + echo "$2" + return + } + ( + echo "$1" | grep -v '^$' >"$0.temp1" + echo "$2" | grep -v '^$' >"$0.temp2" + exec 3<"$0.temp1" + exec 4<"$0.temp2" + IFS='' + while :; do + line1='' + line2='' + while :; do + read -r line1 <&3 + if test "${line1:0:1}" != "#" && test "${line1:0:2}" != "$T#"; then + break + fi + echo "$line1" + done + while :; do + read -r line2 <&4 + if test "${line2:0:4}" = "${T}lea"; then + # We use 7-8 byte long forms of LEA. + # Do not interleave them with SSE insns + # which are also long. + echo "$line2" + read -r line2 <&4 + echo "$line2" + continue + fi + if test "${line2:0:1}" != "#" && test "${line2:0:2}" != "$T#"; then + break + fi + echo "$line2" + done + test "$line1$line2" || break + echo "$line1" + echo "$line2" + done + rm "$0.temp1" "$0.temp2" + ) +} echo \ -'### Generated by hash_md5_sha_x86-64.S.sh ### +"### Generated by hash_md5_sha_x86-64.S.sh ### #if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) - .section .text.sha1_process_block64,"ax",@progbits + .section .text.sha1_process_block64,\"ax\",@progbits .globl sha1_process_block64 .hidden sha1_process_block64 .type sha1_process_block64, @function @@ -51,16 +121,10 @@ sha1_process_block64: # eax..edx: a..d # ebp: e # esi,edi: temps -# -32+4*n(%rsp),r8...r15: W[0..7,8..15] -# (TODO: actually W[0..7] are used a bit more often, put _them_ into r8..r15?) - movl $3, %eax -1: - movq (%rdi,%rax,8), %rsi - bswapq %rsi - rolq $32, %rsi - movq %rsi, -32(%rsp,%rax,8) - decl %eax - jns 1b +# xmm0..xmm3: W[] +# xmm4,xmm5: temps +# xmm6: current round constant +# -64(%rsp): area for passing RCONST + W[] from vector to integer units movl 80(%rdi), %eax # a = ctx->hash[0] movl 84(%rdi), %ebx # b = ctx->hash[1] @@ -68,32 +132,120 @@ sha1_process_block64: movl 92(%rdi), %edx # d = ctx->hash[3] movl 96(%rdi), %ebp # e = ctx->hash[4] + movaps rconst0x5A827999(%rip), $xmmRCONST + + # For round 1, steps 0 and 8..15, we pass W[0,8..15] in esi,r8..r15 + # instead of spilling them to stack. + # (We lose parallelized addition of RCONST, but LEA + # can do two additions at once, so...) 
+ movq 4*0(%rdi), %rsi + movq 4*2(%rdi), %r10 + bswapq %rsi + bswapq %r10 + rolq \$32, %rsi # rsi = W[1]:W[0] + rolq \$32, %r10 + movq %rsi, %xmm0 + movq %r10, $xmmT1 + punpcklqdq $xmmT1, %xmm0 # xmm0 = r10:rsi = (W[0],W[1],W[2],W[3]) + movaps %xmm0, $xmmT1 + paddd $xmmRCONST, $xmmT1 + movups $xmmT1, -64+4*0(%rsp) + + movq 4*4(%rdi), %r8 + movq 4*6(%rdi), %r10 + bswapq %r8 + bswapq %r10 + rolq \$32, %r8 + rolq \$32, %r10 + movq %r8, %xmm1 + movq %r10, $xmmT1 + punpcklqdq $xmmT1, %xmm1 # xmm1 = r10:r8 = (W[4],W[5],W[6],W[7]) + movaps %xmm1, $xmmT1 + paddd $xmmRCONST, $xmmT1 + movups $xmmT1, -64+4*4(%rsp) + movq 4*8(%rdi), %r8 movq 4*10(%rdi), %r10 bswapq %r8 bswapq %r10 + movl %r8d, %r9d # r9d = W[9] + rolq \$32, %r8 # r8 = W[9]:W[8] + movl %r10d, %r11d # r11d = W[11] + rolq \$32, %r10 # r10 = W[11]:W[10] + movq %r8, %xmm2 + movq %r10, $xmmT1 + punpcklqdq $xmmT1, %xmm2 # xmm2 = r10:r8 = (W[8],W[9],W[10],W[11]) + movq 4*12(%rdi), %r12 movq 4*14(%rdi), %r14 bswapq %r12 bswapq %r14 - movl %r8d, %r9d - shrq $32, %r8 - movl %r10d, %r11d - shrq $32, %r10 - movl %r12d, %r13d - shrq $32, %r12 - movl %r14d, %r15d - shrq $32, %r14 -' -W32() { -test "$1" || exit 1 -test "$1" -lt 0 && exit 1 -test "$1" -gt 15 && exit 1 -test "$1" -lt 8 && echo "-32+4*$1(%rsp)" -test "$1" -ge 8 && echo "%r${1}d" + movl %r12d, %r13d # r13d = W[13] + rolq \$32, %r12 # r12 = W[13]:W[12] + movl %r14d, %r15d # r15d = W[15] + rolq \$32, %r14 # r14 = W[15]:W[14] + movq %r12, %xmm3 + movq %r14, $xmmT1 + punpcklqdq $xmmT1, %xmm3 # xmm3 = r14:r12 = (W[12],W[13],W[14],W[15]) +" + +PREP() { +local xmmW0=$1 +local xmmW4=$2 +local xmmW8=$3 +local xmmW12=$4 +# the above must be %xmm0..3 in some permutation +local dstmem=$5 +#W[0] = rol(W[13] ^ W[8] ^ W[2] ^ W[0], 1); +#W[1] = rol(W[14] ^ W[9] ^ W[3] ^ W[1], 1); +#W[2] = rol(W[15] ^ W[10] ^ W[4] ^ W[2], 1); +#W[3] = rol( 0 ^ W[11] ^ W[5] ^ W[3], 1); +#W[3] ^= rol(W[0], 1); +echo "# PREP $@ + movaps $xmmW12, $xmmT1 + psrldq \$4, $xmmT1 # rshift by 4 bytes: T1 = ([13],[14],[15],0) + + pshufd \$0x4e, $xmmW0, $xmmT2 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) + punpcklqdq $xmmW4, $xmmT2 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) + + xorps $xmmW8, $xmmW0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps $xmmT1, $xmmT2 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps $xmmT2, $xmmW0 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps $xmmW0, $xmmT2 + + xorps $xmmT1, $xmmT1 # rol(W0,1): + pcmpgtd $xmmW0, $xmmT1 # ffffffff for elements <0 (ones with msb bit 1) + paddd $xmmW0, $xmmW0 # shift left by 1 + psubd $xmmT1, $xmmW0 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + + pslldq \$12, $xmmT2 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps $xmmT2, $xmmT1 + pslld \$2, $xmmT2 + psrld \$30, $xmmT1 +# xorps $xmmT1, $xmmT2 # rol((0,0,0,unrotW[0]),2) + xorps $xmmT1, $xmmW0 # same result, but does not depend on/does not modify T2 + + xorps $xmmT2, $xmmW0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) +" +# movq $xmmW0, %r8 # high latency (~6 cycles) +# movaps $xmmW0, $xmmT1 +# psrldq \$8, $xmmT1 # rshift by 8 bytes: move upper 64 bits to lower +# movq $xmmT1, %r10 # high latency +# movq %r8, %r9 +# movq %r10, %r11 +# shrq \$32, %r9 +# shrq \$32, %r11 +# ^^^ slower than passing the results on stack (!!!) 
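+#	(Illustrative scalar equivalent of the store emitted below, with made-up
+#	 names - not part of the generated code:
+#	     for (i = 0; i < 4; i++)
+#	         stack_slot[i] = W0[i] + RCONST;
+#	 i.e. the round constant is pre-added here, in the vector unit, so each
+#	 integer round later needs only a single "addl -64+4*n(%rsp), %e<reg>".)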
+echo " + movaps $xmmW0, $xmmT2 + paddd $xmmRCONST, $xmmT2 + movups $xmmT2, $dstmem +" } -# It's possible to interleave insns in rounds to mostly eliminate +# It's possible to interleave integer insns in rounds to mostly eliminate # dependency chains, but this likely to only help old Pentium-based # CPUs (ones without OOO, which can only simultaneously execute a pair # of _adjacent_ insns). @@ -107,21 +259,16 @@ local n0=$(((n+0) & 15)) echo " # $n ";test $n0 = 0 && echo " - # W[0], already in %esi + leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] ";test $n0 != 0 && test $n0 -lt 8 && echo " - movl `W32 $n0`, %esi # W[n] + addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n] ";test $n0 -ge 8 && echo " - # W[n], in %r$n0 + leal $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n] ";echo " movl %e$c, %edi # c xorl %e$d, %edi # ^d andl %e$b, %edi # &b xorl %e$d, %edi # (((c ^ d) & b) ^ d) -";test $n0 -lt 8 && echo " - leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] -";test $n0 -ge 8 && echo " - leal $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n] -";echo " addl %edi, %e$e # e += (((c ^ d) & b) ^ d) movl %e$a, %esi # roll \$5, %esi # rotl32(a,5) @@ -138,28 +285,11 @@ local n2=$(((n+2) & 15)) local n0=$(((n+0) & 15)) echo " # $n -";test $n0 -lt 8 && echo " - movl `W32 $n13`, %esi # W[(n+13) & 15] - xorl `W32 $n8`, %esi # ^W[(n+8) & 15] - xorl `W32 $n2`, %esi # ^W[(n+2) & 15] - xorl `W32 $n0`, %esi # ^W[n & 15] - roll %esi # - movl %esi, `W32 $n0` # store to W[n & 15] -";test $n0 -ge 8 && echo " - xorl `W32 $n13`, `W32 $n0` # W[n & 15] ^= W[(n+13) & 15] - xorl `W32 $n8`, `W32 $n0` # ^W[(n+8) & 15] - xorl `W32 $n2`, `W32 $n0` # ^W[(n+2) & 15] - roll `W32 $n0` # -";echo " movl %e$c, %edi # c xorl %e$d, %edi # ^d andl %e$b, %edi # &b xorl %e$d, %edi # (((c ^ d) & b) ^ d) -";test $n0 -lt 8 && echo " - leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n & 15] -";test $n0 -ge 8 && echo " - leal $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n & 15] -";echo " + addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n & 15] addl %edi, %e$e # e += (((c ^ d) & b) ^ d) movl %e$a, %esi # roll \$5, %esi # rotl32(a,5) @@ -167,13 +297,6 @@ echo " rorl \$2, %e$b # b = rotl32(b,30) " } -{ -RCONST=0x5A827999 -RD1A ax bx cx dx bp 0; RD1A bp ax bx cx dx 1; RD1A dx bp ax bx cx 2; RD1A cx dx bp ax bx 3; RD1A bx cx dx bp ax 4 -RD1A ax bx cx dx bp 5; RD1A bp ax bx cx dx 6; RD1A dx bp ax bx cx 7; RD1A cx dx bp ax bx 8; RD1A bx cx dx bp ax 9 -RD1A ax bx cx dx bp 10; RD1A bp ax bx cx dx 11; RD1A dx bp ax bx cx 12; RD1A cx dx bp ax bx 13; RD1A bx cx dx bp ax 14 -RD1A ax bx cx dx bp 15; RD1B bp ax bx cx dx 16; RD1B dx bp ax bx cx 17; RD1B cx dx bp ax bx 18; RD1B bx cx dx bp ax 19 -} | grep -v '^$' RD2() { local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 @@ -184,27 +307,10 @@ local n2=$(((n+2) & 15)) local n0=$(((n+0) & 15)) echo " # $n -";test $n0 -lt 8 && echo " - movl `W32 $n13`, %esi # W[(n+13) & 15] - xorl `W32 $n8`, %esi # ^W[(n+8) & 15] - xorl `W32 $n2`, %esi # ^W[(n+2) & 15] - xorl `W32 $n0`, %esi # ^W[n & 15] - roll %esi # - movl %esi, `W32 $n0` # store to W[n & 15] -";test $n0 -ge 8 && echo " - xorl `W32 $n13`, `W32 $n0` # W[n & 15] ^= W[(n+13) & 15] - xorl `W32 $n8`, `W32 $n0` # ^W[(n+8) & 15] - xorl `W32 $n2`, `W32 $n0` # ^W[(n+2) & 15] - roll `W32 $n0` # -";echo " movl %e$c, %edi # c xorl %e$d, %edi # ^d xorl %e$b, %edi # ^b -";test $n0 -lt 8 && echo " - leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n & 15] -";test $n0 -ge 8 && echo " - leal $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n & 15] -";echo " + addl -64+4*$n0(%rsp), 
%e$e # e += RCONST + W[n & 15] addl %edi, %e$e # e += (c ^ d ^ b) movl %e$a, %esi # roll \$5, %esi # rotl32(a,5) @@ -212,13 +318,6 @@ echo " rorl \$2, %e$b # b = rotl32(b,30) " } -{ -RCONST=0x6ED9EBA1 -RD2 ax bx cx dx bp 20; RD2 bp ax bx cx dx 21; RD2 dx bp ax bx cx 22; RD2 cx dx bp ax bx 23; RD2 bx cx dx bp ax 24 -RD2 ax bx cx dx bp 25; RD2 bp ax bx cx dx 26; RD2 dx bp ax bx cx 27; RD2 cx dx bp ax bx 28; RD2 bx cx dx bp ax 29 -RD2 ax bx cx dx bp 30; RD2 bp ax bx cx dx 31; RD2 dx bp ax bx cx 32; RD2 cx dx bp ax bx 33; RD2 bx cx dx bp ax 34 -RD2 ax bx cx dx bp 35; RD2 bp ax bx cx dx 36; RD2 dx bp ax bx cx 37; RD2 cx dx bp ax bx 38; RD2 bx cx dx bp ax 39 -} | grep -v '^$' RD3() { local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 @@ -235,53 +334,82 @@ echo " andl %e$c, %esi # si: b & c andl %e$d, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) -";test $n0 -lt 8 && echo " - movl `W32 $n13`, %esi # W[(n+13) & 15] - xorl `W32 $n8`, %esi # ^W[(n+8) & 15] - xorl `W32 $n2`, %esi # ^W[(n+2) & 15] - xorl `W32 $n0`, %esi # ^W[n & 15] - roll %esi # - movl %esi, `W32 $n0` # store to W[n & 15] -";test $n0 -ge 8 && echo " - xorl `W32 $n13`, `W32 $n0` # W[n & 15] ^= W[(n+13) & 15] - xorl `W32 $n8`, `W32 $n0` # ^W[(n+8) & 15] - xorl `W32 $n2`, `W32 $n0` # ^W[(n+2) & 15] - roll `W32 $n0` # -";echo " addl %edi, %e$e # += ((b | c) & d) | (b & c) -";test $n0 -lt 8 && echo " - leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n & 15] -";test $n0 -ge 8 && echo " - leal $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n & 15] -";echo " + addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n & 15] movl %e$a, %esi # roll \$5, %esi # rotl32(a,5) addl %esi, %e$e # e += rotl32(a,5) rorl \$2, %e$b # b = rotl32(b,30) " } + { -#RCONST=0x8F1BBCDC "out of range for signed 32bit displacement" -RCONST=-0x70E44324 -RD3 ax bx cx dx bp 40; RD3 bp ax bx cx dx 41; RD3 dx bp ax bx cx 42; RD3 cx dx bp ax bx 43; RD3 bx cx dx bp ax 44 -RD3 ax bx cx dx bp 45; RD3 bp ax bx cx dx 46; RD3 dx bp ax bx cx 47; RD3 cx dx bp ax bx 48; RD3 bx cx dx bp ax 49 -RD3 ax bx cx dx bp 50; RD3 bp ax bx cx dx 51; RD3 dx bp ax bx cx 52; RD3 cx dx bp ax bx 53; RD3 bx cx dx bp ax 54 -RD3 ax bx cx dx bp 55; RD3 bp ax bx cx dx 56; RD3 dx bp ax bx cx 57; RD3 cx dx bp ax bx 58; RD3 bx cx dx bp ax 59 -} | grep -v '^$' +# Round 1 +RCONST=0x5A827999 +RD1A ax bx cx dx bp 0; RD1A bp ax bx cx dx 1; RD1A dx bp ax bx cx 2; RD1A cx dx bp ax bx 3; +RD1A bx cx dx bp ax 4; RD1A ax bx cx dx bp 5; RD1A bp ax bx cx dx 6; RD1A dx bp ax bx cx 7; +a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` +b=`RD1A cx dx bp ax bx 8; RD1A bx cx dx bp ax 9; RD1A ax bx cx dx bp 10; RD1A bp ax bx cx dx 11;` +INTERLEAVE "$a" "$b" +a=`echo " movaps rconst0x6ED9EBA1(%rip), $xmmRCONST" + PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` +b=`RD1A dx bp ax bx cx 12; RD1A cx dx bp ax bx 13; RD1A bx cx dx bp ax 14; RD1A ax bx cx dx bp 15;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` +b=`RD1B bp ax bx cx dx 16; RD1B dx bp ax bx cx 17; RD1B cx dx bp ax bx 18; RD1B bx cx dx bp ax 19;` +INTERLEAVE "$a" "$b" + +# Round 2 +RCONST=0x6ED9EBA1 +a=`PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` +b=`RD2 ax bx cx dx bp 20; RD2 bp ax bx cx dx 21; RD2 dx bp ax bx cx 22; RD2 cx dx bp ax bx 23;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` +b=`RD2 bx cx dx bp ax 24; RD2 ax bx cx dx bp 25; RD2 bp ax bx cx dx 26; RD2 dx bp ax bx cx 27;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` +b=`RD2 cx dx bp ax bx 28; RD2 bx cx dx bp ax 29; 
RD2 ax bx cx dx bp 30; RD2 bp ax bx cx dx 31;` +INTERLEAVE "$a" "$b" +a=`echo " movaps rconst0x8F1BBCDC(%rip), $xmmRCONST" + PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` +b=`RD2 dx bp ax bx cx 32; RD2 cx dx bp ax bx 33; RD2 bx cx dx bp ax 34; RD2 ax bx cx dx bp 35;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` +b=`RD2 bp ax bx cx dx 36; RD2 dx bp ax bx cx 37; RD2 cx dx bp ax bx 38; RD2 bx cx dx bp ax 39;` +INTERLEAVE "$a" "$b" + +# Round 3 +RCONST=0x8F1BBCDC +a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` +b=`RD3 ax bx cx dx bp 40; RD3 bp ax bx cx dx 41; RD3 dx bp ax bx cx 42; RD3 cx dx bp ax bx 43;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` +b=`RD3 bx cx dx bp ax 44; RD3 ax bx cx dx bp 45; RD3 bp ax bx cx dx 46; RD3 dx bp ax bx cx 47;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` +b=`RD3 cx dx bp ax bx 48; RD3 bx cx dx bp ax 49; RD3 ax bx cx dx bp 50; RD3 bp ax bx cx dx 51;` +INTERLEAVE "$a" "$b" +a=`echo " movaps rconst0xCA62C1D6(%rip), $xmmRCONST" + PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` +b=`RD3 dx bp ax bx cx 52; RD3 cx dx bp ax bx 53; RD3 bx cx dx bp ax 54; RD3 ax bx cx dx bp 55;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` +b=`RD3 bp ax bx cx dx 56; RD3 dx bp ax bx cx 57; RD3 cx dx bp ax bx 58; RD3 bx cx dx bp ax 59;` +INTERLEAVE "$a" "$b" # Round 4 has the same logic as round 2, only n and RCONST are different -{ -#RCONST=0xCA62C1D6 "out of range for signed 32bit displacement" -RCONST=-0x359D3E2A -RD2 ax bx cx dx bp 60; RD2 bp ax bx cx dx 61; RD2 dx bp ax bx cx 62; RD2 cx dx bp ax bx 63; RD2 bx cx dx bp ax 64 -RD2 ax bx cx dx bp 65; RD2 bp ax bx cx dx 66; RD2 dx bp ax bx cx 67; RD2 cx dx bp ax bx 68; RD2 bx cx dx bp ax 69 -RD2 ax bx cx dx bp 70; RD2 bp ax bx cx dx 71; RD2 dx bp ax bx cx 72; RD2 cx dx bp ax bx 73; RD2 bx cx dx bp ax 74 -RD2 ax bx cx dx bp 75; RD2 bp ax bx cx dx 76; RD2 dx bp ax bx cx 77; RD2 cx dx bp ax bx 78; RD2 bx cx dx bp ax 79 -# Note: new W[n&15] values generated in last 3 iterations -# (W[13,14,15]) are unused after each of these iterations. -# Since we use r8..r15 for W[8..15], this does not matter. -# If we switch to e.g. using r8..r15 for W[0..7], then saving of W[13,14,15] -# (the "movl %esi, `W32 $n0`" insn) is a dead store and can be removed. 
+RCONST=0xCA62C1D6 +a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` +b=`RD2 ax bx cx dx bp 60; RD2 bp ax bx cx dx 61; RD2 dx bp ax bx cx 62; RD2 cx dx bp ax bx 63;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` +b=`RD2 bx cx dx bp ax 64; RD2 ax bx cx dx bp 65; RD2 bp ax bx cx dx 66; RD2 dx bp ax bx cx 67;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` +b=`RD2 cx dx bp ax bx 68; RD2 bx cx dx bp ax 69; RD2 ax bx cx dx bp 70; RD2 bp ax bx cx dx 71;` +INTERLEAVE "$a" "$b" +RD2 dx bp ax bx cx 72; RD2 cx dx bp ax bx 73; RD2 bx cx dx bp ax 74; RD2 ax bx cx dx bp 75; +RD2 bp ax bx cx dx 76; RD2 dx bp ax bx cx 77; RD2 cx dx bp ax bx 78; RD2 bx cx dx bp ax 79; } | grep -v '^$' echo " @@ -300,4 +428,28 @@ echo " ret .size sha1_process_block64, .-sha1_process_block64 + + .section .rodata.cst16.sha1const, \"aM\", @progbits, 16 + .align 16 +rconst0x5A827999: + .long 0x5A827999 + .long 0x5A827999 + .long 0x5A827999 + .long 0x5A827999 +rconst0x6ED9EBA1: + .long 0x6ED9EBA1 + .long 0x6ED9EBA1 + .long 0x6ED9EBA1 + .long 0x6ED9EBA1 +rconst0x8F1BBCDC: + .long 0x8F1BBCDC + .long 0x8F1BBCDC + .long 0x8F1BBCDC + .long 0x8F1BBCDC +rconst0xCA62C1D6: + .long 0xCA62C1D6 + .long 0xCA62C1D6 + .long 0xCA62C1D6 + .long 0xCA62C1D6 + #endif" -- cgit v1.2.3-55-g6feb From 33a9f34df5c53d3dd074a2168ff40d612a36667a Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sun, 23 Jan 2022 15:46:05 +0100 Subject: add busybox_ldscript.README.txt Signed-off-by: Denys Vlasenko --- busybox_ldscript.README.txt | 47 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 busybox_ldscript.README.txt diff --git a/busybox_ldscript.README.txt b/busybox_ldscript.README.txt new file mode 100644 index 000000000..1625a970a --- /dev/null +++ b/busybox_ldscript.README.txt @@ -0,0 +1,47 @@ +/* Add SORT_BY_ALIGNMENT to linker script (found in busybox_unstripped.out): +## .rodata : { *(.rodata SORT_BY_ALIGNMENT(.rodata.*) .gnu.linkonce.r.*) } +## .data : { *(.data SORT_BY_ALIGNMENT(.data.*) .gnu.linkonce.d.*) } +## .bss : { *(.bss SORT_BY_ALIGNMENT(.bss.*) .gnu.linkonce.b.*) } +## This will eliminate most of the padding (~3kb). +## Hmm, "ld --sort-section alignment" should do it too. +## +## There is a ld hack which is meant to decrease disk usage +## at the cost of more RAM usage (??!!) in standard ld script: +## . = ALIGN (0x1000) - ((0x1000 - .) & (0x1000 - 1)); . = DATA_SEGMENT_ALIGN (0x1000, 0x1000); +## Replace it with: +## . = ALIGN (0x1000); . = DATA_SEGMENT_ALIGN (0x1000, 0x1000); +## to unconditionally align .data to the next page boundary, +## instead of "next page, plus current offset in this page" +*/ + +/* To reduce the number of VMAs each bbox process has, +## move *(.bss SORT_BY_ALIGNMENT(.bss.*) ...) +## part from .bss : {...} block to .data : { ... } block. +## (This usually increases .data section by only one page). 
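+## For example (a sketch only - copy the exact input-section lists from the
+## linker script your toolchain actually generates, not from here):
+##   .data : {
+##     *(.data SORT_BY_ALIGNMENT(.data.*) .gnu.linkonce.d.*)
+##     *(.bss SORT_BY_ALIGNMENT(.bss.*) .gnu.linkonce.b.*)
+##   }
+##   .bss : { }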
+## Result: +## +## text data bss dec hex filename +## 1050792 560 7580 1058932 102874 busybox.bss +## 1050792 8149 0 1058941 10287d busybox.nobss +## +## $ exec busybox.bss pmap $$ +## 0000000008048000 1028K r-xp /path/to/busybox.bss +## 0000000008149000 8K rw-p /path/to/busybox.bss +## 000000000814b000 4K rw-p [ anon ] <---- this VMA is eliminated +## 00000000085f5000 4K ---p [heap] +## 00000000085f6000 4K rw-p [heap] +## 00000000f7778000 8K rw-p [ anon ] +## 00000000f777a000 12K r--p [vvar] +## 00000000f777d000 8K r-xp [vdso] +## 00000000ff7e9000 132K rw-p [stack] +## +## $ exec busybox.nobss pmap $$ +## 0000000008048000 1028K r-xp /path/to/busybox.nobss +## 0000000008149000 12K rw-p /path/to/busybox.nobss +## 00000000086f0000 4K ---p [heap] +## 00000000086f1000 4K rw-p [heap] +## 00000000f7783000 8K rw-p [ anon ] +## 00000000f7785000 12K r--p [vvar] +## 00000000f7788000 8K r-xp [vdso] +## 00000000ffac0000 132K rw-p [stack] +*/ -- cgit v1.2.3-55-g6feb From e998c7c032458a05a7afcc13ce0dc980b99ecc6c Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sun, 23 Jan 2022 18:48:49 +0100 Subject: sed: fix handling of escaped delimiters in s/// search pattern, closes 14541 function old new delta copy_parsing_escapes 67 96 +29 parse_regex_delim 109 111 +2 get_address 213 215 +2 add_cmd 1176 1178 +2 ------------------------------------------------------------------------------ (add/remove: 0/0 grow/shrink: 4/0 up/down: 35/0) Total: 35 bytes Signed-off-by: Denys Vlasenko --- editors/sed.c | 19 +++++++++++-------- testsuite/sed.tests | 10 ++++++++++ 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/editors/sed.c b/editors/sed.c index 48b0dbf67..02a527b4a 100644 --- a/editors/sed.c +++ b/editors/sed.c @@ -246,7 +246,6 @@ static void cleanup_outname(void) } /* strcpy, replacing "\from" with 'to'. If to is NUL, replacing "\any" with 'any' */ - static unsigned parse_escapes(char *dest, const char *string, int len, char from, char to) { char *d = dest; @@ -276,7 +275,7 @@ static unsigned parse_escapes(char *dest, const char *string, int len, char from return d - dest; } -static char *copy_parsing_escapes(const char *string, int len) +static char *copy_parsing_escapes(const char *string, int len, char delim) { const char *s; char *dest = xmalloc(len + 1); @@ -287,10 +286,15 @@ static char *copy_parsing_escapes(const char *string, int len) len = parse_escapes(dest, string, len, s[1], s[0]); string = dest; } + if (delim) { + /* we additionally unescape any instances of escaped delimiter. + * For example, in 's+9\++X+' the pattern is "9+", not "9\+". 
+ */ + len = parse_escapes(dest, string, len, delim, delim); + } return dest; } - /* * index_of_next_unescaped_regexp_delim - walks left to right through a string * beginning at a specified index and returns the index of the next regular @@ -347,12 +351,11 @@ static int parse_regex_delim(const char *cmdstr, char **match, char **replace) /* save the match string */ idx = index_of_next_unescaped_regexp_delim(delimiter, cmdstr_ptr); - *match = copy_parsing_escapes(cmdstr_ptr, idx); - + *match = copy_parsing_escapes(cmdstr_ptr, idx, delimiter); /* save the replacement string */ cmdstr_ptr += idx + 1; idx = index_of_next_unescaped_regexp_delim(- (int)delimiter, cmdstr_ptr); - *replace = copy_parsing_escapes(cmdstr_ptr, idx); + *replace = copy_parsing_escapes(cmdstr_ptr, idx, 0); return ((cmdstr_ptr - cmdstr) + idx); } @@ -380,7 +383,7 @@ static int get_address(const char *my_str, int *linenum, regex_t ** regex) delimiter = *++pos; next = index_of_next_unescaped_regexp_delim(delimiter, ++pos); if (next != 0) { - temp = copy_parsing_escapes(pos, next); + temp = copy_parsing_escapes(pos, next, 0); G.previous_regex_ptr = *regex = xzalloc(sizeof(regex_t)); xregcomp(*regex, temp, G.regex_type); free(temp); @@ -575,7 +578,7 @@ static const char *parse_cmd_args(sed_cmd_t *sed_cmd, const char *cmdstr) cmdstr++; } len = strlen(cmdstr); - sed_cmd->string = copy_parsing_escapes(cmdstr, len); + sed_cmd->string = copy_parsing_escapes(cmdstr, len, 0); cmdstr += len; /* "\anychar" -> "anychar" */ parse_escapes(sed_cmd->string, sed_cmd->string, -1, '\0', '\0'); diff --git a/testsuite/sed.tests b/testsuite/sed.tests index e62b839f7..440996a21 100755 --- a/testsuite/sed.tests +++ b/testsuite/sed.tests @@ -324,6 +324,16 @@ testing "sed zero chars match/replace logic must not falsely trigger here 2" \ "sed 's/ *$/_/g'" \ "qwerty_\n" "" "qwerty\n" +# the pattern here is interpreted as "9+", not as "9\+" +testing "sed special char as s/// delimiter, in pattern" \ + "sed 's+9\++X+'" \ + "X8=17\n" "" "9+8=17\n" + +# but in replacement string, "\&" remains "\&", not interpreted as "&" +testing "sed special char as s/// delimiter, in replacement" \ + "sed 's&9&X\&&'" \ + "X&+8=17\n" "" "9+8=17\n" + testing "sed /\$_in_regex/ should not match newlines, only end-of-line" \ "sed ': testcont; /\\\\$/{ =; N; b testcont }'" \ "\ -- cgit v1.2.3-55-g6feb From f12fb1e4092900f26f7f8c71cde44b1cd7d26439 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sun, 23 Jan 2022 19:04:27 +0100 Subject: sed: fix handling of escaped delimiters in s/// replacement function old new delta parse_regex_delim 111 140 +29 Signed-off-by: Denys Vlasenko --- editors/sed.c | 5 ++++- testsuite/sed.tests | 9 +++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/editors/sed.c b/editors/sed.c index 02a527b4a..32a4b61f6 100644 --- a/editors/sed.c +++ b/editors/sed.c @@ -355,7 +355,10 @@ static int parse_regex_delim(const char *cmdstr, char **match, char **replace) /* save the replacement string */ cmdstr_ptr += idx + 1; idx = index_of_next_unescaped_regexp_delim(- (int)delimiter, cmdstr_ptr); - *replace = copy_parsing_escapes(cmdstr_ptr, idx, 0); +//GNU sed 4.8: +// echo 789 | sed 's&8&\&&' - 7&9 ("\&" remained "\&") +// echo 789 | sed 's1\(8\)1\1\11' - 7119 ("\1\1" become "11") + *replace = copy_parsing_escapes(cmdstr_ptr, idx, delimiter != '&' ? 
delimiter : 0); return ((cmdstr_ptr - cmdstr) + idx); } diff --git a/testsuite/sed.tests b/testsuite/sed.tests index 440996a21..626542e33 100755 --- a/testsuite/sed.tests +++ b/testsuite/sed.tests @@ -329,10 +329,15 @@ testing "sed special char as s/// delimiter, in pattern" \ "sed 's+9\++X+'" \ "X8=17\n" "" "9+8=17\n" -# but in replacement string, "\&" remains "\&", not interpreted as "&" -testing "sed special char as s/// delimiter, in replacement" \ +# Matching GNU sed 4.8: +# in replacement string, "\&" remains "\&", not interpreted as "&" +testing "sed special char as s/// delimiter, in replacement 1" \ "sed 's&9&X\&&'" \ "X&+8=17\n" "" "9+8=17\n" +# in replacement string, "\1" is interpreted as "1" +testing "sed special char as s/// delimiter, in replacement 2" \ + "sed 's1\(9\)1X\11'" \ + "X1+8=17\n" "" "9+8=17\n" testing "sed /\$_in_regex/ should not match newlines, only end-of-line" \ "sed ': testcont; /\\\\$/{ =; N; b testcont }'" \ -- cgit v1.2.3-55-g6feb From 6dd6a6c42d1465d8cca2539476f6bffd5e1353dd Mon Sep 17 00:00:00 2001 From: Walter Lozano Date: Fri, 21 Jan 2022 11:00:27 -0300 Subject: Add support for long options to cmp In order to improve compatibility with GNU cmp add support for long options to busybox cmp. function old new delta static.cmp_longopts - 36 +36 cmp_main 589 594 +5 ------------------------------------------------------------------------------ (add/remove: 1/0 grow/shrink: 1/0 up/down: 41/0) Total: 41 bytes Signed-off-by: Walter Lozano Signed-off-by: Denys Vlasenko --- editors/cmp.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/editors/cmp.c b/editors/cmp.c index 6d2b0c6c3..b89e519ad 100644 --- a/editors/cmp.c +++ b/editors/cmp.c @@ -54,6 +54,7 @@ int cmp_main(int argc UNUSED_PARAM, char **argv) int retval = 0; int max_count = -1; +#if !ENABLE_LONG_OPTS opt = getopt32(argv, "^" OPT_STR "\0" "-1" @@ -62,6 +63,23 @@ int cmp_main(int argc UNUSED_PARAM, char **argv) ":l--s:s--l", &max_count ); +#else + static const char cmp_longopts[] ALIGN1 = + "bytes\0" Required_argument "n" + "quiet\0" No_argument "s" + "silent\0" No_argument "s" + "verbose\0" No_argument "l" + ; + opt = getopt32long(argv, "^" + OPT_STR + "\0" "-1" + IF_DESKTOP(":?4") + IF_NOT_DESKTOP(":?2") + ":l--s:s--l", + cmp_longopts, + &max_count + ); +#endif argv += optind; filename1 = *argv; -- cgit v1.2.3-55-g6feb From 78fdf4d22d578d5d51cc08c768b35d050a92902a Mon Sep 17 00:00:00 2001 From: Timo Teräs Date: Fri, 21 Jan 2022 13:17:00 +0200 Subject: mkfs.vfat: fix volume label to be padded with space MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The specification requires volume label to be space padded. Latest fsck.vfat will remove the zero padded volume label as invalid. See also: https://github.com/dosfstools/dosfstools/issues/172 Make the default label also "NO NAME" which has the special meaning that label is not set. 
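
[Editorial note, not part of the patch: a minimal, self-contained C sketch of the space-padding behaviour described above. The names mirror those used in the diff below; the example itself is only an illustration, not the applet code.]

#include <stdio.h>
#include <string.h>

int main(void)
{
	/* 11 characters, space padded - the "label is not set" default */
	static const char NO_NAME_11[] = "NO NAME    ";
	char volume_label11[12];

	/* "%-11.11s": left-justify, pad with spaces, truncate to 11 chars */
	sprintf(volume_label11, "%-11.11s", "boot");
	printf("[%s]\n", volume_label11); /* "boot" padded to 11 chars with spaces */

	/* a root-directory label entry is only created for a real label */
	if (strcmp(volume_label11, NO_NAME_11) != 0)
		printf("would create a volume label directory entry\n");
	return 0;
}
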
function old new delta mkfs_vfat_main 1470 1502 +32 static.NO_NAME_11 - 12 +12 .rodata 104309 104318 +9 ------------------------------------------------------------------------------ (add/remove: 1/0 grow/shrink: 2/0 up/down: 53/0) Total: 53 bytes Signed-off-by: Timo Teräs Signed-off-by: Denys Vlasenko --- util-linux/mkfs_vfat.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/util-linux/mkfs_vfat.c b/util-linux/mkfs_vfat.c index 844d965f8..821371953 100644 --- a/util-linux/mkfs_vfat.c +++ b/util-linux/mkfs_vfat.c @@ -218,8 +218,11 @@ static const char boot_code[] ALIGN1 = int mkfs_vfat_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE; int mkfs_vfat_main(int argc UNUSED_PARAM, char **argv) { + static const char NO_NAME_11[] = "NO NAME "; + struct stat st; - const char *volume_label = ""; + const char *arg_volume_label = NO_NAME_11; //default + char volume_label11[12]; char *buf; char *device_name; uoff_t volume_size_bytes; @@ -257,14 +260,17 @@ int mkfs_vfat_main(int argc UNUSED_PARAM, char **argv) opts = getopt32(argv, "^" "Ab:cCf:F:h:Ii:l:m:n:r:R:s:S:v" "\0" "-1", //:b+:f+:F+:h+:r+:R+:s+:S+:vv:c--l:l--c - NULL, NULL, NULL, NULL, NULL, - NULL, NULL, &volume_label, NULL, NULL, NULL, NULL); + /*b*/NULL, /*f*/NULL, /*F*/NULL, /*h*/NULL, /*i*/NULL, + /*l*/NULL, /*m*/NULL, /*n*/&arg_volume_label, + /*r*/NULL, /*R*/NULL, /*s*/NULL, /*S*/NULL); argv += optind; // cache device name device_name = argv[0]; // default volume ID = creation time volume_id = time(NULL); + // truncate to exactly 11 chars, pad with spaces + sprintf(volume_label11, "%-11.11s", arg_volume_label); dev = xopen(device_name, O_RDWR); xfstat(dev, &st, device_name); @@ -459,7 +465,7 @@ int mkfs_vfat_main(int argc UNUSED_PARAM, char **argv) (int)media_byte, volume_size_sect, (int)total_clust, (int)sect_per_clust, sect_per_fat, - (int)volume_id, volume_label + (int)volume_id, volume_label11 ); } @@ -508,7 +514,7 @@ int mkfs_vfat_main(int argc UNUSED_PARAM, char **argv) STORE_LE(boot_blk->vi.ext_boot_sign, 0x29); STORE_LE(boot_blk->vi.volume_id32, volume_id); memcpy(boot_blk->vi.fs_type, "FAT32 ", sizeof(boot_blk->vi.fs_type)); - strncpy(boot_blk->vi.volume_label, volume_label, sizeof(boot_blk->vi.volume_label)); + memcpy(boot_blk->vi.volume_label, volume_label11, 11); memcpy(boot_blk->boot_code, boot_code, sizeof(boot_code)); STORE_LE(boot_blk->boot_sign, BOOT_SIGN); @@ -545,15 +551,18 @@ int mkfs_vfat_main(int argc UNUSED_PARAM, char **argv) // root directory // empty directory is just a set of zero bytes memset(buf, 0, sect_per_clust * bytes_per_sect); - if (volume_label[0]) { - // create dir entry for volume_label + // not "NO NAME", "NO NAME " etc? + // (mkfs.fat 4.1 won't create dir entry even with explicit -n 'NO NAME', + // but will create one with e.g. 
-n '', -n ' zZz') + if (strcmp(volume_label11, NO_NAME_11) != 0) { + // create dir entry for volume label struct msdos_dir_entry *de; #if 0 struct tm tm_time; uint16_t t, d; #endif de = (void*)buf; - strncpy(de->name, volume_label, sizeof(de->name)); + memcpy(de->name, volume_label11, 11); STORE_LE(de->attr, ATTR_VOLUME); #if 0 localtime_r(&create_time, &tm_time); -- cgit v1.2.3-55-g6feb From 117a8c9b7a50053964159c342af1f3810cbbd5b8 Mon Sep 17 00:00:00 2001 From: Khem Raj Date: Wed, 12 Jan 2022 10:54:54 -0800 Subject: apply const trick to ptr_to_globals This was missing in the previous attempt to fix it via [1] This helps fix segfaults when compiling with clang ( seen on riscv64 ) [ 452.428349] less[270]: unhandled signal 11 code 0x1 at 0x000000000000000c in busybox.nosuid[2ab7491000+ba000] [ 452.430246] CPU: 3 PID: 270 Comm: less Not tainted 5.15.13-yocto-standard #1 [ 452.431323] Hardware name: riscv-virtio,qemu (DT) [ 452.431925] epc : 0000002ab74a19ee ra : 0000002ab74a19dc sp : 0000003fec6ec980 [ 452.432725] gp : 0000002ab754dcb0 tp : 0000003f88783800 t0 : 0000003f8878d4a0 [ 452.433744] t1 : 0000002ab749b00c t2 : 0000000000000000 s0 : 0000003fec6ecc38 [ 452.434732] s1 : 000000000000004c a0 : 00000000ffffffff a1 : 0000002ab754dde0 [ 452.435861] a2 : 0000000000000000 a3 : 0000000000000100 a4 : 0000002ab754f3a0 [ 452.436787] a5 : 0000002ab754f3a0 a6 : 0000000000000000 a7 : 0000002ab754f2a0 [ 452.437974] s2 : 0000000000000002 s3 : 0000002ab754b6c8 s4 : 0000002ab749b60e [ 452.438781] s5 : 0000000000000000 s6 : 0000002ab754b6c8 s7 : 0000003f88943060 [ 452.439723] s8 : 0000003f88944050 s9 : 0000002ad8502e88 s10: 0000002ad8502de8 [ 452.440538] s11: 0000000000000014 t3 : 0000003f887fceb6 t4 : 0000003f8893af0c [ 452.441438] t5 : 0000000000000000 t6 : 0000003f88923000 [1] https://git.busybox.net/busybox/commit/?id=1f925038a Signed-off-by: Khem Raj Signed-off-by: Denys Vlasenko --- include/libbb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/libbb.h b/include/libbb.h index 8e3b7ae8e..6aeec249d 100644 --- a/include/libbb.h +++ b/include/libbb.h @@ -2292,7 +2292,7 @@ struct globals; /* '*const' ptr makes gcc optimize code much better. * Magic prevents ptr_to_globals from going into rodata. * If you want to assign a value, use SET_PTR_TO_GLOBALS(x) */ -extern struct globals *const ptr_to_globals; +extern struct globals *BB_GLOBAL_CONST ptr_to_globals; #define barrier() asm volatile ("":::"memory") -- cgit v1.2.3-55-g6feb From 99e22d230ded676ab53dfa8ab276c1301c2955a0 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Mon, 24 Jan 2022 07:07:17 +0100 Subject: cut: build fix for FEATURE_CUT_REGEX Signed-off-by: Denys Vlasenko --- libbb/Kbuild.src | 1 + 1 file changed, 1 insertion(+) diff --git a/libbb/Kbuild.src b/libbb/Kbuild.src index e8bb24f6d..b9d34de8e 100644 --- a/libbb/Kbuild.src +++ b/libbb/Kbuild.src @@ -200,6 +200,7 @@ lib-$(CONFIG_PGREP) += xregcomp.o lib-$(CONFIG_PKILL) += xregcomp.o lib-$(CONFIG_DEVFSD) += xregcomp.o lib-$(CONFIG_FEATURE_FIND_REGEX) += xregcomp.o +lib-$(CONFIG_FEATURE_CUT_REGEX) += xregcomp.o # Add the experimental logging functionality, only used by zcip lib-$(CONFIG_ZCIP) += logenv.o -- cgit v1.2.3-55-g6feb From 205042c07a3bf6c8e685c434713f2a9e46630cd0 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 25 Jan 2022 17:00:57 +0100 Subject: libbb/sha1: in unrolled x86-64 code, pass initial W[] in registers, not on stack This can be faster on some CPUs. 
On Skylake, evidently load latency from L1 (or store-to-load forwarding in LSU) is fast enough to completely hide memory reference latencies here. function old new delta sha1_process_block64 3495 3514 +19 Signed-off-by: Denys Vlasenko --- libbb/hash_md5_sha_x86-64.S | 310 +++++++++++++++++++++-------------------- libbb/hash_md5_sha_x86-64.S.sh | 109 ++++++++------- 2 files changed, 214 insertions(+), 205 deletions(-) diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S index 069a18719..743269d98 100644 --- a/libbb/hash_md5_sha_x86-64.S +++ b/libbb/hash_md5_sha_x86-64.S @@ -1,7 +1,7 @@ ### Generated by hash_md5_sha_x86-64.S.sh ### #if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) - .section .text.sha1_process_block64,"ax",@progbits + .section .text.sha1_process_block64, "ax", @progbits .globl sha1_process_block64 .hidden sha1_process_block64 .type sha1_process_block64, @function @@ -10,7 +10,7 @@ sha1_process_block64: pushq %rbp # 1 byte insn pushq %rbx # 1 byte insn - pushq %r15 # 2 byte insn +# pushq %r15 # 2 byte insn pushq %r14 # 2 byte insn pushq %r13 # 2 byte insn pushq %r12 # 2 byte insn @@ -19,7 +19,8 @@ sha1_process_block64: #Register and stack use: # eax..edx: a..d # ebp: e -# esi,edi: temps +# esi,edi,r8..r14: temps +# r15: unused # xmm0..xmm3: W[] # xmm4,xmm5: temps # xmm6: current round constant @@ -33,147 +34,148 @@ sha1_process_block64: movaps rconst0x5A827999(%rip), %xmm6 - # For round 1, steps 0 and 8..15, we pass W[0,8..15] in esi,r8..r15 - # instead of spilling them to stack. - # (We lose parallelized addition of RCONST, but LEA - # can do two additions at once, so...) + # Load W[] to xmm registers, byteswapping on the fly. + # + # For iterations 0..15, we pass W[] in rsi,r8..r14 + # for use in RD1A's instead of spilling them to stack. + # We lose parallelized addition of RCONST, but LEA + # can do two additions at once, so it's probably a wash. + # (We use rsi instead of rN because this makes two + # LEAs in two first RD1A's shorter by one byte). 
movq 4*0(%rdi), %rsi - movq 4*2(%rdi), %r10 + movq 4*2(%rdi), %r8 bswapq %rsi - bswapq %r10 + bswapq %r8 rolq $32, %rsi # rsi = W[1]:W[0] - rolq $32, %r10 + rolq $32, %r8 # r8 = W[3]:W[2] movq %rsi, %xmm0 - movq %r10, %xmm4 - punpcklqdq %xmm4, %xmm0 # xmm0 = r10:rsi = (W[0],W[1],W[2],W[3]) - movaps %xmm0, %xmm4 - paddd %xmm6, %xmm4 - movups %xmm4, -64+4*0(%rsp) + movq %r8, %xmm4 + punpcklqdq %xmm4, %xmm0 # xmm0 = r8:rsi = (W[0],W[1],W[2],W[3]) +# movaps %xmm0, %xmm4 # add RCONST, spill to stack +# paddd %xmm6, %xmm4 +# movups %xmm4, -64+16*0(%rsp) - movq 4*4(%rdi), %r8 + movq 4*4(%rdi), %r9 movq 4*6(%rdi), %r10 - bswapq %r8 + bswapq %r9 bswapq %r10 - rolq $32, %r8 - rolq $32, %r10 - movq %r8, %xmm1 + rolq $32, %r9 # r9 = W[5]:W[4] + rolq $32, %r10 # r10 = W[7]:W[6] + movq %r9, %xmm1 movq %r10, %xmm4 - punpcklqdq %xmm4, %xmm1 # xmm1 = r10:r8 = (W[4],W[5],W[6],W[7]) - movaps %xmm1, %xmm4 - paddd %xmm6, %xmm4 - movups %xmm4, -64+4*4(%rsp) + punpcklqdq %xmm4, %xmm1 # xmm1 = r10:r9 = (W[4],W[5],W[6],W[7]) - movq 4*8(%rdi), %r8 - movq 4*10(%rdi), %r10 - bswapq %r8 - bswapq %r10 - movl %r8d, %r9d # r9d = W[9] - rolq $32, %r8 # r8 = W[9]:W[8] - movl %r10d, %r11d # r11d = W[11] - rolq $32, %r10 # r10 = W[11]:W[10] - movq %r8, %xmm2 - movq %r10, %xmm4 - punpcklqdq %xmm4, %xmm2 # xmm2 = r10:r8 = (W[8],W[9],W[10],W[11]) + movq 4*8(%rdi), %r11 + movq 4*10(%rdi), %r12 + bswapq %r11 + bswapq %r12 + rolq $32, %r11 # r11 = W[9]:W[8] + rolq $32, %r12 # r12 = W[11]:W[10] + movq %r11, %xmm2 + movq %r12, %xmm4 + punpcklqdq %xmm4, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11]) - movq 4*12(%rdi), %r12 + movq 4*12(%rdi), %r13 movq 4*14(%rdi), %r14 - bswapq %r12 + bswapq %r13 bswapq %r14 - movl %r12d, %r13d # r13d = W[13] - rolq $32, %r12 # r12 = W[13]:W[12] - movl %r14d, %r15d # r15d = W[15] + rolq $32, %r13 # r13 = W[13]:W[12] rolq $32, %r14 # r14 = W[15]:W[14] - movq %r12, %xmm3 + movq %r13, %xmm3 movq %r14, %xmm4 - punpcklqdq %xmm4, %xmm3 # xmm3 = r14:r12 = (W[12],W[13],W[14],W[15]) + punpcklqdq %xmm4, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15]) # 0 leal 0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n] + shrq $32, %rsi movl %ecx, %edi # c xorl %edx, %edi # ^d andl %ebx, %edi # &b xorl %edx, %edi # (((c ^ d) & b) ^ d) addl %edi, %ebp # e += (((c ^ d) & b) ^ d) - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) + movl %eax, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ebp # e += rotl32(a,5) rorl $2, %ebx # b = rotl32(b,30) # 1 - addl -64+4*1(%rsp), %edx # e += RCONST + W[n] + leal 0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n] movl %ebx, %edi # c xorl %ecx, %edi # ^d andl %eax, %edi # &b xorl %ecx, %edi # (((c ^ d) & b) ^ d) addl %edi, %edx # e += (((c ^ d) & b) ^ d) - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) + movl %ebp, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %edx # e += rotl32(a,5) rorl $2, %eax # b = rotl32(b,30) # 2 - addl -64+4*2(%rsp), %ecx # e += RCONST + W[n] + leal 0x5A827999(%rcx,%r8), %ecx # e += RCONST + W[n] + shrq $32, %r8 movl %eax, %edi # c xorl %ebx, %edi # ^d andl %ebp, %edi # &b xorl %ebx, %edi # (((c ^ d) & b) ^ d) addl %edi, %ecx # e += (((c ^ d) & b) ^ d) - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) + movl %edx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ecx # e += rotl32(a,5) rorl $2, %ebp # b = rotl32(b,30) # 3 - addl -64+4*3(%rsp), %ebx # e += RCONST + W[n] + leal 0x5A827999(%rbx,%r8), %ebx # e += RCONST + W[n] movl %ebp, %edi # c 
xorl %eax, %edi # ^d andl %edx, %edi # &b xorl %eax, %edi # (((c ^ d) & b) ^ d) addl %edi, %ebx # e += (((c ^ d) & b) ^ d) - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) + movl %ecx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ebx # e += rotl32(a,5) rorl $2, %edx # b = rotl32(b,30) # 4 - addl -64+4*4(%rsp), %eax # e += RCONST + W[n] + leal 0x5A827999(%rax,%r9), %eax # e += RCONST + W[n] + shrq $32, %r9 movl %edx, %edi # c xorl %ebp, %edi # ^d andl %ecx, %edi # &b xorl %ebp, %edi # (((c ^ d) & b) ^ d) addl %edi, %eax # e += (((c ^ d) & b) ^ d) - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) + movl %ebx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %eax # e += rotl32(a,5) rorl $2, %ecx # b = rotl32(b,30) # 5 - addl -64+4*5(%rsp), %ebp # e += RCONST + W[n] + leal 0x5A827999(%rbp,%r9), %ebp # e += RCONST + W[n] movl %ecx, %edi # c xorl %edx, %edi # ^d andl %ebx, %edi # &b xorl %edx, %edi # (((c ^ d) & b) ^ d) addl %edi, %ebp # e += (((c ^ d) & b) ^ d) - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) + movl %eax, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ebp # e += rotl32(a,5) rorl $2, %ebx # b = rotl32(b,30) # 6 - addl -64+4*6(%rsp), %edx # e += RCONST + W[n] + leal 0x5A827999(%rdx,%r10), %edx # e += RCONST + W[n] + shrq $32, %r10 movl %ebx, %edi # c xorl %ecx, %edi # ^d andl %eax, %edi # &b xorl %ecx, %edi # (((c ^ d) & b) ^ d) addl %edi, %edx # e += (((c ^ d) & b) ^ d) - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) + movl %ebp, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %edx # e += rotl32(a,5) rorl $2, %eax # b = rotl32(b,30) # 7 - addl -64+4*7(%rsp), %ecx # e += RCONST + W[n] + leal 0x5A827999(%rcx,%r10), %ecx # e += RCONST + W[n] movl %eax, %edi # c xorl %ebx, %edi # ^d andl %ebp, %edi # &b xorl %ebx, %edi # (((c ^ d) & b) ^ d) addl %edi, %ecx # e += (((c ^ d) & b) ^ d) - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) + movl %edx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ecx # e += rotl32(a,5) rorl $2, %ebp # b = rotl32(b,30) # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) movaps %xmm3, %xmm4 @@ -186,9 +188,9 @@ sha1_process_block64: # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup movaps %xmm0, %xmm5 xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm0, %xmm0 # shift left by 1 - psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 + pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm0, %xmm0 # shift left by 1 + psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) movaps %xmm5, %xmm4 @@ -201,48 +203,50 @@ sha1_process_block64: paddd %xmm6, %xmm5 movups %xmm5, -64+16*0(%rsp) # 8 - leal 0x5A827999(%rbx,%r8), %ebx # e += RCONST + W[n] + leal 0x5A827999(%rbx,%r11), %ebx # e += RCONST + W[n] + shrq $32, %r11 movl %ebp, %edi # c xorl %eax, %edi # ^d andl %edx, %edi # &b xorl %eax, %edi # (((c ^ d) & b) ^ d) addl %edi, %ebx # e += (((c ^ d) & b) ^ d) - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) + movl %ecx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ebx # e += rotl32(a,5) rorl $2, %edx # b = rotl32(b,30) # 9 - leal 0x5A827999(%rax,%r9), %eax # e += RCONST + W[n] + leal 0x5A827999(%rax,%r11), %eax # e += 
RCONST + W[n] movl %edx, %edi # c xorl %ebp, %edi # ^d andl %ecx, %edi # &b xorl %ebp, %edi # (((c ^ d) & b) ^ d) addl %edi, %eax # e += (((c ^ d) & b) ^ d) - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) + movl %ebx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %eax # e += rotl32(a,5) rorl $2, %ecx # b = rotl32(b,30) # 10 - leal 0x5A827999(%rbp,%r10), %ebp # e += RCONST + W[n] + leal 0x5A827999(%rbp,%r12), %ebp # e += RCONST + W[n] + shrq $32, %r12 movl %ecx, %edi # c xorl %edx, %edi # ^d andl %ebx, %edi # &b xorl %edx, %edi # (((c ^ d) & b) ^ d) addl %edi, %ebp # e += (((c ^ d) & b) ^ d) - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) + movl %eax, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ebp # e += rotl32(a,5) rorl $2, %ebx # b = rotl32(b,30) # 11 - leal 0x5A827999(%rdx,%r11), %edx # e += RCONST + W[n] + leal 0x5A827999(%rdx,%r12), %edx # e += RCONST + W[n] movl %ebx, %edi # c xorl %ecx, %edi # ^d andl %eax, %edi # &b xorl %ecx, %edi # (((c ^ d) & b) ^ d) addl %edi, %edx # e += (((c ^ d) & b) ^ d) - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) + movl %ebp, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %edx # e += rotl32(a,5) rorl $2, %eax # b = rotl32(b,30) movaps rconst0x6ED9EBA1(%rip), %xmm6 # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) @@ -256,9 +260,9 @@ sha1_process_block64: # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup movaps %xmm1, %xmm5 xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm1, %xmm1 # shift left by 1 - psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 + pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm1, %xmm1 # shift left by 1 + psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) movaps %xmm5, %xmm4 @@ -271,15 +275,16 @@ sha1_process_block64: paddd %xmm6, %xmm5 movups %xmm5, -64+16*1(%rsp) # 12 - leal 0x5A827999(%rcx,%r12), %ecx # e += RCONST + W[n] + leal 0x5A827999(%rcx,%r13), %ecx # e += RCONST + W[n] + shrq $32, %r13 movl %eax, %edi # c xorl %ebx, %edi # ^d andl %ebp, %edi # &b xorl %ebx, %edi # (((c ^ d) & b) ^ d) addl %edi, %ecx # e += (((c ^ d) & b) ^ d) - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) + movl %edx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ecx # e += rotl32(a,5) rorl $2, %ebp # b = rotl32(b,30) # 13 leal 0x5A827999(%rbx,%r13), %ebx # e += RCONST + W[n] @@ -288,31 +293,32 @@ sha1_process_block64: andl %edx, %edi # &b xorl %eax, %edi # (((c ^ d) & b) ^ d) addl %edi, %ebx # e += (((c ^ d) & b) ^ d) - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) + movl %ecx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ebx # e += rotl32(a,5) rorl $2, %edx # b = rotl32(b,30) # 14 leal 0x5A827999(%rax,%r14), %eax # e += RCONST + W[n] + shrq $32, %r14 movl %edx, %edi # c xorl %ebp, %edi # ^d andl %ecx, %edi # &b xorl %ebp, %edi # (((c ^ d) & b) ^ d) addl %edi, %eax # e += (((c ^ d) & b) ^ d) - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) + movl %ebx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %eax # e += rotl32(a,5) rorl $2, %ecx # b = rotl32(b,30) # 15 - leal 0x5A827999(%rbp,%r15), %ebp # e += RCONST + W[n] + leal 0x5A827999(%rbp,%r14), %ebp # e += RCONST + W[n] movl 
%ecx, %edi # c xorl %edx, %edi # ^d andl %ebx, %edi # &b xorl %edx, %edi # (((c ^ d) & b) ^ d) addl %edi, %ebp # e += (((c ^ d) & b) ^ d) - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) + movl %eax, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ebp # e += rotl32(a,5) rorl $2, %ebx # b = rotl32(b,30) # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) movaps %xmm1, %xmm4 @@ -325,9 +331,9 @@ sha1_process_block64: # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup movaps %xmm2, %xmm5 xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm2, %xmm2 # shift left by 1 - psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 + pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm2, %xmm2 # shift left by 1 + psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) movaps %xmm5, %xmm4 @@ -394,9 +400,9 @@ sha1_process_block64: # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup movaps %xmm3, %xmm5 xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm3, %xmm3 # shift left by 1 - psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 + pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm3, %xmm3 # shift left by 1 + psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) movaps %xmm5, %xmm4 @@ -459,9 +465,9 @@ sha1_process_block64: # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup movaps %xmm0, %xmm5 xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm0, %xmm0 # shift left by 1 - psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 + pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm0, %xmm0 # shift left by 1 + psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) movaps %xmm5, %xmm4 @@ -524,9 +530,9 @@ sha1_process_block64: # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup movaps %xmm1, %xmm5 xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm1, %xmm1 # shift left by 1 - psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 + pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm1, %xmm1 # shift left by 1 + psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) movaps %xmm5, %xmm4 @@ -590,9 +596,9 @@ sha1_process_block64: # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup movaps %xmm2, %xmm5 xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm2, %xmm2 # shift left by 1 - psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 + pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm2, %xmm2 # shift left by 1 + psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) movaps %xmm5, %xmm4 @@ -655,9 +661,9 @@ sha1_process_block64: # W0 = unrotated 
(W[0]..W[3]), still needs W[3] fixup movaps %xmm3, %xmm5 xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm3, %xmm3 # shift left by 1 - psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 + pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm3, %xmm3 # shift left by 1 + psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) movaps %xmm5, %xmm4 @@ -720,9 +726,9 @@ sha1_process_block64: # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup movaps %xmm0, %xmm5 xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm0, %xmm0 # shift left by 1 - psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 + pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm0, %xmm0 # shift left by 1 + psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) movaps %xmm5, %xmm4 @@ -797,9 +803,9 @@ sha1_process_block64: # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup movaps %xmm1, %xmm5 xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm1, %xmm1 # shift left by 1 - psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 + pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm1, %xmm1 # shift left by 1 + psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) movaps %xmm5, %xmm4 @@ -874,9 +880,9 @@ sha1_process_block64: # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup movaps %xmm2, %xmm5 xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm2, %xmm2 # shift left by 1 - psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 + pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm2, %xmm2 # shift left by 1 + psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) movaps %xmm5, %xmm4 @@ -952,9 +958,9 @@ sha1_process_block64: # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup movaps %xmm3, %xmm5 xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm3, %xmm3 # shift left by 1 - psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 + pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm3, %xmm3 # shift left by 1 + psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) movaps %xmm5, %xmm4 @@ -1029,9 +1035,9 @@ sha1_process_block64: # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup movaps %xmm0, %xmm5 xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm0, %xmm0 # shift left by 1 - psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 + pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm0, %xmm0 # shift left by 1 + psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 # W0 = rotated (W[0]..W[3]), still needs 
W[3] fixup pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) movaps %xmm5, %xmm4 @@ -1106,9 +1112,9 @@ sha1_process_block64: # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup movaps %xmm1, %xmm5 xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm1, %xmm1 # shift left by 1 - psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 + pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm1, %xmm1 # shift left by 1 + psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) movaps %xmm5, %xmm4 @@ -1171,9 +1177,9 @@ sha1_process_block64: # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup movaps %xmm2, %xmm5 xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm2, %xmm2 # shift left by 1 - psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 + pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm2, %xmm2 # shift left by 1 + psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) movaps %xmm5, %xmm4 @@ -1236,9 +1242,9 @@ sha1_process_block64: # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup movaps %xmm3, %xmm5 xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm3, %xmm3 # shift left by 1 - psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 + pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm3, %xmm3 # shift left by 1 + psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) movaps %xmm5, %xmm4 @@ -1378,7 +1384,7 @@ sha1_process_block64: addl %ebx, 84(%rdi) # ctx->hash[1] += b popq %r14 # addl %ecx, 88(%rdi) # ctx->hash[2] += c - popq %r15 # +# popq %r15 # addl %edx, 92(%rdi) # ctx->hash[3] += d popq %rbx # addl %ebp, 96(%rdi) # ctx->hash[4] += e diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh index 87c2d0800..47c40af0d 100755 --- a/libbb/hash_md5_sha_x86-64.S.sh +++ b/libbb/hash_md5_sha_x86-64.S.sh @@ -102,7 +102,7 @@ echo \ "### Generated by hash_md5_sha_x86-64.S.sh ### #if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) - .section .text.sha1_process_block64,\"ax\",@progbits + .section .text.sha1_process_block64, \"ax\", @progbits .globl sha1_process_block64 .hidden sha1_process_block64 .type sha1_process_block64, @function @@ -111,7 +111,7 @@ echo \ sha1_process_block64: pushq %rbp # 1 byte insn pushq %rbx # 1 byte insn - pushq %r15 # 2 byte insn +# pushq %r15 # 2 byte insn pushq %r14 # 2 byte insn pushq %r13 # 2 byte insn pushq %r12 # 2 byte insn @@ -120,7 +120,8 @@ sha1_process_block64: #Register and stack use: # eax..edx: a..d # ebp: e -# esi,edi: temps +# esi,edi,r8..r14: temps +# r15: unused # xmm0..xmm3: W[] # xmm4,xmm5: temps # xmm6: current round constant @@ -134,59 +135,56 @@ sha1_process_block64: movaps rconst0x5A827999(%rip), $xmmRCONST - # For round 1, steps 0 and 8..15, we pass W[0,8..15] in esi,r8..r15 - # instead of spilling them to stack. - # (We lose parallelized addition of RCONST, but LEA - # can do two additions at once, so...) + # Load W[] to xmm registers, byteswapping on the fly. 
+ # + # For iterations 0..15, we pass W[] in rsi,r8..r14 + # for use in RD1A's instead of spilling them to stack. + # We lose parallelized addition of RCONST, but LEA + # can do two additions at once, so it's probably a wash. + # (We use rsi instead of rN because this makes two + # LEAs in two first RD1A's shorter by one byte). movq 4*0(%rdi), %rsi - movq 4*2(%rdi), %r10 + movq 4*2(%rdi), %r8 bswapq %rsi - bswapq %r10 + bswapq %r8 rolq \$32, %rsi # rsi = W[1]:W[0] - rolq \$32, %r10 + rolq \$32, %r8 # r8 = W[3]:W[2] movq %rsi, %xmm0 - movq %r10, $xmmT1 - punpcklqdq $xmmT1, %xmm0 # xmm0 = r10:rsi = (W[0],W[1],W[2],W[3]) - movaps %xmm0, $xmmT1 - paddd $xmmRCONST, $xmmT1 - movups $xmmT1, -64+4*0(%rsp) + movq %r8, $xmmT1 + punpcklqdq $xmmT1, %xmm0 # xmm0 = r8:rsi = (W[0],W[1],W[2],W[3]) +# movaps %xmm0, $xmmT1 # add RCONST, spill to stack +# paddd $xmmRCONST, $xmmT1 +# movups $xmmT1, -64+16*0(%rsp) - movq 4*4(%rdi), %r8 + movq 4*4(%rdi), %r9 movq 4*6(%rdi), %r10 - bswapq %r8 + bswapq %r9 bswapq %r10 - rolq \$32, %r8 - rolq \$32, %r10 - movq %r8, %xmm1 + rolq \$32, %r9 # r9 = W[5]:W[4] + rolq \$32, %r10 # r10 = W[7]:W[6] + movq %r9, %xmm1 movq %r10, $xmmT1 - punpcklqdq $xmmT1, %xmm1 # xmm1 = r10:r8 = (W[4],W[5],W[6],W[7]) - movaps %xmm1, $xmmT1 - paddd $xmmRCONST, $xmmT1 - movups $xmmT1, -64+4*4(%rsp) + punpcklqdq $xmmT1, %xmm1 # xmm1 = r10:r9 = (W[4],W[5],W[6],W[7]) - movq 4*8(%rdi), %r8 - movq 4*10(%rdi), %r10 - bswapq %r8 - bswapq %r10 - movl %r8d, %r9d # r9d = W[9] - rolq \$32, %r8 # r8 = W[9]:W[8] - movl %r10d, %r11d # r11d = W[11] - rolq \$32, %r10 # r10 = W[11]:W[10] - movq %r8, %xmm2 - movq %r10, $xmmT1 - punpcklqdq $xmmT1, %xmm2 # xmm2 = r10:r8 = (W[8],W[9],W[10],W[11]) + movq 4*8(%rdi), %r11 + movq 4*10(%rdi), %r12 + bswapq %r11 + bswapq %r12 + rolq \$32, %r11 # r11 = W[9]:W[8] + rolq \$32, %r12 # r12 = W[11]:W[10] + movq %r11, %xmm2 + movq %r12, $xmmT1 + punpcklqdq $xmmT1, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11]) - movq 4*12(%rdi), %r12 + movq 4*12(%rdi), %r13 movq 4*14(%rdi), %r14 - bswapq %r12 + bswapq %r13 bswapq %r14 - movl %r12d, %r13d # r13d = W[13] - rolq \$32, %r12 # r12 = W[13]:W[12] - movl %r14d, %r15d # r15d = W[15] + rolq \$32, %r13 # r13 = W[13]:W[12] rolq \$32, %r14 # r14 = W[15]:W[14] - movq %r12, %xmm3 + movq %r13, %xmm3 movq %r14, $xmmT1 - punpcklqdq $xmmT1, %xmm3 # xmm3 = r14:r12 = (W[12],W[13],W[14],W[15]) + punpcklqdq $xmmT1, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15]) " PREP() { @@ -215,9 +213,9 @@ echo "# PREP $@ movaps $xmmW0, $xmmT2 xorps $xmmT1, $xmmT1 # rol(W0,1): - pcmpgtd $xmmW0, $xmmT1 # ffffffff for elements <0 (ones with msb bit 1) - paddd $xmmW0, $xmmW0 # shift left by 1 - psubd $xmmT1, $xmmW0 # add 1 to those who had msb bit 1 + pcmpgtd $xmmW0, $xmmT1 # ffffffff for elements <0 (ones with msb bit 1) + paddd $xmmW0, $xmmW0 # shift left by 1 + psubd $xmmT1, $xmmW0 # add 1 to those who had msb bit 1 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup pslldq \$12, $xmmT2 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) @@ -256,23 +254,28 @@ RD1A() { local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 local n=$(($6)) local n0=$(((n+0) & 15)) +local rN=$((7+n0/2)) echo " # $n ";test $n0 = 0 && echo " leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] -";test $n0 != 0 && test $n0 -lt 8 && echo " - addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n] -";test $n0 -ge 8 && echo " - leal $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n] + shrq \$32, %rsi +";test $n0 = 1 && echo " + leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] +";test $n0 -ge 2 && test 
$((n0 & 1)) = 0 && echo " + leal $RCONST(%r$e,%r$rN), %e$e # e += RCONST + W[n] + shrq \$32, %r$rN +";test $n0 -ge 2 && test $((n0 & 1)) = 1 && echo " + leal $RCONST(%r$e,%r$rN), %e$e # e += RCONST + W[n] ";echo " movl %e$c, %edi # c xorl %e$d, %edi # ^d andl %e$b, %edi # &b xorl %e$d, %edi # (((c ^ d) & b) ^ d) addl %edi, %e$e # e += (((c ^ d) & b) ^ d) - movl %e$a, %esi # - roll \$5, %esi # rotl32(a,5) - addl %esi, %e$e # e += rotl32(a,5) + movl %e$a, %edi # + roll \$5, %edi # rotl32(a,5) + addl %edi, %e$e # e += rotl32(a,5) rorl \$2, %e$b # b = rotl32(b,30) " } @@ -420,7 +423,7 @@ echo " addl %ebx, 84(%rdi) # ctx->hash[1] += b popq %r14 # addl %ecx, 88(%rdi) # ctx->hash[2] += c - popq %r15 # +# popq %r15 # addl %edx, 92(%rdi) # ctx->hash[3] += d popq %rbx # addl %ebp, 96(%rdi) # ctx->hash[4] += e -- cgit v1.2.3-55-g6feb From 6472ac942898437e040171cec991de1c0b962f72 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 3 Feb 2022 14:15:20 +0100 Subject: libbb/sha256: optional x86 hardware accelerated hashing 64 bit: function old new delta sha256_process_block64_shaNI - 730 +730 .rodata 108314 108586 +272 sha256_begin 31 83 +52 ------------------------------------------------------------------------------ (add/remove: 5/1 grow/shrink: 2/0 up/down: 1055/-1) Total: 1054 bytes 32 bit: function old new delta sha256_process_block64_shaNI - 747 +747 .rodata 104318 104590 +272 sha256_begin 29 84 +55 ------------------------------------------------------------------------------ (add/remove: 5/1 grow/shrink: 2/0 up/down: 1075/-1) Total: 1074 bytes Signed-off-by: Denys Vlasenko --- libbb/Config.src | 6 + libbb/Kbuild.src | 2 + libbb/hash_md5_sha.c | 54 ++++--- libbb/hash_md5_sha256_x86-32_shaNI.S | 283 +++++++++++++++++++++++++++++++++++ libbb/hash_md5_sha256_x86-64_shaNI.S | 281 ++++++++++++++++++++++++++++++++++ libbb/hash_md5_sha_x86-32_shaNI.S | 4 +- libbb/hash_md5_sha_x86-64.S | 2 +- libbb/hash_md5_sha_x86-64.S.sh | 2 +- libbb/hash_md5_sha_x86-64_shaNI.S | 4 +- 9 files changed, 612 insertions(+), 26 deletions(-) create mode 100644 libbb/hash_md5_sha256_x86-32_shaNI.S create mode 100644 libbb/hash_md5_sha256_x86-64_shaNI.S diff --git a/libbb/Config.src b/libbb/Config.src index 708d3b0c8..0ecd5bd46 100644 --- a/libbb/Config.src +++ b/libbb/Config.src @@ -70,6 +70,12 @@ config SHA1_HWACCEL On x86, this adds ~590 bytes of code. Throughput is about twice as fast as fully-unrolled generic code. +config SHA256_HWACCEL + bool "SHA256: Use hardware accelerated instructions if possible" + default y + help + On x86, this adds ~1k bytes of code. 
+ config SHA3_SMALL int "SHA3: Trade bytes for speed (0:fast, 1:slow)" default 1 # all "fast or small" options default to small diff --git a/libbb/Kbuild.src b/libbb/Kbuild.src index b9d34de8e..653025e56 100644 --- a/libbb/Kbuild.src +++ b/libbb/Kbuild.src @@ -59,6 +59,8 @@ lib-y += hash_md5_sha.o lib-y += hash_md5_sha_x86-64.o lib-y += hash_md5_sha_x86-64_shaNI.o lib-y += hash_md5_sha_x86-32_shaNI.o +lib-y += hash_md5_sha256_x86-64_shaNI.o +lib-y += hash_md5_sha256_x86-32_shaNI.o # Alternative (disabled) MD5 implementation #lib-y += hash_md5prime.o lib-y += messages.o diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c index a23db5152..880ffab01 100644 --- a/libbb/hash_md5_sha.c +++ b/libbb/hash_md5_sha.c @@ -13,6 +13,27 @@ #define NEED_SHA512 (ENABLE_SHA512SUM || ENABLE_USE_BB_CRYPT_SHA) +#if ENABLE_SHA1_HWACCEL || ENABLE_SHA256_HWACCEL +# if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) +static void cpuid(unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) +{ + asm ("cpuid" + : "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx) + : "0"(*eax), "1"(*ebx), "2"(*ecx), "3"(*edx) + ); +} +static smallint shaNI; +void FAST_FUNC sha1_process_block64_shaNI(sha1_ctx_t *ctx); +void FAST_FUNC sha256_process_block64_shaNI(sha256_ctx_t *ctx); +# if defined(__i386__) +struct ASM_expects_76_shaNI { char t[1 - 2*(offsetof(sha256_ctx_t, hash) != 76)]; }; +# endif +# if defined(__x86_64__) +struct ASM_expects_80_shaNI { char t[1 - 2*(offsetof(sha256_ctx_t, hash) != 80)]; }; +# endif +# endif +#endif + /* gcc 4.2.1 optimizes rotr64 better with inline than with macro * (for rotX32, there is no difference). Why? My guess is that * macro requires clever common subexpression elimination heuristics @@ -1142,25 +1163,6 @@ static void FAST_FUNC sha512_process_block128(sha512_ctx_t *ctx) } #endif /* NEED_SHA512 */ -#if ENABLE_SHA1_HWACCEL -# if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) -static void cpuid(unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) -{ - asm ("cpuid" - : "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx) - : "0"(*eax), "1"(*ebx), "2"(*ecx), "3"(*edx) - ); -} -void FAST_FUNC sha1_process_block64_shaNI(sha1_ctx_t *ctx); -# if defined(__i386__) -struct ASM_expects_76_shaNI { char t[1 - 2*(offsetof(sha1_ctx_t, hash) != 76)]; }; -# endif -# if defined(__x86_64__) -struct ASM_expects_80_shaNI { char t[1 - 2*(offsetof(sha1_ctx_t, hash) != 80)]; }; -# endif -# endif -#endif - void FAST_FUNC sha1_begin(sha1_ctx_t *ctx) { ctx->hash[0] = 0x67452301; @@ -1173,7 +1175,6 @@ void FAST_FUNC sha1_begin(sha1_ctx_t *ctx) #if ENABLE_SHA1_HWACCEL # if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) { - static smallint shaNI; if (!shaNI) { unsigned eax = 7, ebx = ebx, ecx = 0, edx = edx; cpuid(&eax, &ebx, &ecx, &edx); @@ -1225,6 +1226,19 @@ void FAST_FUNC sha256_begin(sha256_ctx_t *ctx) memcpy(&ctx->total64, init256, sizeof(init256)); /*ctx->total64 = 0; - done by prepending two 32-bit zeros to init256 */ ctx->process_block = sha256_process_block64; +#if ENABLE_SHA256_HWACCEL +# if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) + { + if (!shaNI) { + unsigned eax = 7, ebx = ebx, ecx = 0, edx = edx; + cpuid(&eax, &ebx, &ecx, &edx); + shaNI = ((ebx >> 29) << 1) - 1; + } + if (shaNI > 0) + ctx->process_block = sha256_process_block64_shaNI; + } +# endif +#endif } #if NEED_SHA512 diff --git a/libbb/hash_md5_sha256_x86-32_shaNI.S b/libbb/hash_md5_sha256_x86-32_shaNI.S new file mode 100644 index 000000000..56e37fa38 --- /dev/null +++ 
b/libbb/hash_md5_sha256_x86-32_shaNI.S @@ -0,0 +1,283 @@ +#if ENABLE_SHA256_HWACCEL && defined(__GNUC__) && defined(__i386__) +/* The code is adapted from Linux kernel's source */ + +// We use shorter insns, even though they are for "wrong" +// data type (fp, not int). +// For Intel, there is no penalty for doing it at all +// (CPUs which do have such penalty do not support SHA1 insns). +// For AMD, the penalty is one extra cycle +// (allegedly: I failed to find measurable difference). + +//#define mova128 movdqa +#define mova128 movaps +//#define movu128 movdqu +#define movu128 movups +//#define shuf128_32 pshufd +#define shuf128_32 shufps + + .section .text.sha256_process_block64_shaNI, "ax", @progbits + .globl sha256_process_block64_shaNI + .hidden sha256_process_block64_shaNI + .type sha256_process_block64_shaNI, @function + +#define DATA_PTR %eax + +#define SHA256CONSTANTS %ecx + +#define MSG %xmm0 +#define STATE0 %xmm1 +#define STATE1 %xmm2 +#define MSGTMP0 %xmm3 +#define MSGTMP1 %xmm4 +#define MSGTMP2 %xmm5 +#define MSGTMP3 %xmm6 +#define MSGTMP4 %xmm7 + + .balign 8 # allow decoders to fetch at least 3 first insns +sha256_process_block64_shaNI: + pushl %ebp + movl %esp, %ebp + subl $32, %esp + andl $~0xF, %esp # paddd needs aligned memory operand + + movu128 76+0*16(%eax), STATE0 + movu128 76+1*16(%eax), STATE1 + + shuf128_32 $0xB1, STATE0, STATE0 /* CDAB */ + shuf128_32 $0x1B, STATE1, STATE1 /* EFGH */ + mova128 STATE0, MSGTMP4 + palignr $8, STATE1, STATE0 /* ABEF */ + pblendw $0xF0, MSGTMP4, STATE1 /* CDGH */ + +# mova128 PSHUFFLE_BSWAP32_FLIP_MASK, SHUF_MASK + lea K256, SHA256CONSTANTS + + /* Save hash values for addition after rounds */ + mova128 STATE0, 0*16(%esp) + mova128 STATE1, 1*16(%esp) + + /* Rounds 0-3 */ + movu128 0*16(DATA_PTR), MSG + pshufb PSHUFFLE_BSWAP32_FLIP_MASK, MSG + mova128 MSG, MSGTMP0 + paddd 0*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + + /* Rounds 4-7 */ + movu128 1*16(DATA_PTR), MSG + pshufb PSHUFFLE_BSWAP32_FLIP_MASK, MSG + mova128 MSG, MSGTMP1 + paddd 1*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 8-11 */ + movu128 2*16(DATA_PTR), MSG + pshufb PSHUFFLE_BSWAP32_FLIP_MASK, MSG + mova128 MSG, MSGTMP2 + paddd 2*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 12-15 */ + movu128 3*16(DATA_PTR), MSG + pshufb PSHUFFLE_BSWAP32_FLIP_MASK, MSG + mova128 MSG, MSGTMP3 + paddd 3*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + mova128 MSGTMP3, MSGTMP4 + palignr $4, MSGTMP2, MSGTMP4 + paddd MSGTMP4, MSGTMP0 + sha256msg2 MSGTMP3, MSGTMP0 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 16-19 */ + mova128 MSGTMP0, MSG + paddd 4*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + mova128 MSGTMP0, MSGTMP4 + palignr $4, MSGTMP3, MSGTMP4 + paddd MSGTMP4, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 20-23 */ + mova128 MSGTMP1, MSG + paddd 5*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + mova128 MSGTMP1, MSGTMP4 + palignr $4, MSGTMP0, MSGTMP4 + paddd MSGTMP4, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 24-27 */ + mova128 MSGTMP2, 
MSG + paddd 6*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + mova128 MSGTMP2, MSGTMP4 + palignr $4, MSGTMP1, MSGTMP4 + paddd MSGTMP4, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 28-31 */ + mova128 MSGTMP3, MSG + paddd 7*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + mova128 MSGTMP3, MSGTMP4 + palignr $4, MSGTMP2, MSGTMP4 + paddd MSGTMP4, MSGTMP0 + sha256msg2 MSGTMP3, MSGTMP0 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 32-35 */ + mova128 MSGTMP0, MSG + paddd 8*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + mova128 MSGTMP0, MSGTMP4 + palignr $4, MSGTMP3, MSGTMP4 + paddd MSGTMP4, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 36-39 */ + mova128 MSGTMP1, MSG + paddd 9*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + mova128 MSGTMP1, MSGTMP4 + palignr $4, MSGTMP0, MSGTMP4 + paddd MSGTMP4, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 40-43 */ + mova128 MSGTMP2, MSG + paddd 10*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + mova128 MSGTMP2, MSGTMP4 + palignr $4, MSGTMP1, MSGTMP4 + paddd MSGTMP4, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 44-47 */ + mova128 MSGTMP3, MSG + paddd 11*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + mova128 MSGTMP3, MSGTMP4 + palignr $4, MSGTMP2, MSGTMP4 + paddd MSGTMP4, MSGTMP0 + sha256msg2 MSGTMP3, MSGTMP0 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 48-51 */ + mova128 MSGTMP0, MSG + paddd 12*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + mova128 MSGTMP0, MSGTMP4 + palignr $4, MSGTMP3, MSGTMP4 + paddd MSGTMP4, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 52-55 */ + mova128 MSGTMP1, MSG + paddd 13*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + mova128 MSGTMP1, MSGTMP4 + palignr $4, MSGTMP0, MSGTMP4 + paddd MSGTMP4, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + + /* Rounds 56-59 */ + mova128 MSGTMP2, MSG + paddd 14*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + mova128 MSGTMP2, MSGTMP4 + palignr $4, MSGTMP1, MSGTMP4 + paddd MSGTMP4, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + + /* Rounds 60-63 */ + mova128 MSGTMP3, MSG + paddd 15*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + + /* Add current hash values with previously saved */ + paddd 0*16(%esp), STATE0 + paddd 1*16(%esp), STATE1 + + /* Write hash values back in the correct order */ + shuf128_32 $0x1B, STATE0, STATE0 /* FEBA */ + shuf128_32 $0xB1, STATE1, STATE1 /* DCHG */ + mova128 STATE0, MSGTMP4 + pblendw $0xF0, STATE1, STATE0 /* DCBA */ + palignr $8, MSGTMP4, STATE1 /* HGFE */ + + movu128 STATE0, 76+0*16(%eax) + movu128 STATE1, 76+1*16(%eax) + + movl %ebp, %esp + popl %ebp + ret + .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI + +.section .rodata.cst256.K256, "aM", @progbits, 256 +.balign 16 +K256: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 
0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16 +.balign 16 +PSHUFFLE_BSWAP32_FLIP_MASK: + .octa 0x0c0d0e0f08090a0b0405060700010203 + +#endif diff --git a/libbb/hash_md5_sha256_x86-64_shaNI.S b/libbb/hash_md5_sha256_x86-64_shaNI.S new file mode 100644 index 000000000..1c2b75af3 --- /dev/null +++ b/libbb/hash_md5_sha256_x86-64_shaNI.S @@ -0,0 +1,281 @@ +#if ENABLE_SHA256_HWACCEL && defined(__GNUC__) && defined(__x86_64__) +/* The code is adapted from Linux kernel's source */ + +// We use shorter insns, even though they are for "wrong" +// data type (fp, not int). +// For Intel, there is no penalty for doing it at all +// (CPUs which do have such penalty do not support SHA1 insns). +// For AMD, the penalty is one extra cycle +// (allegedly: I failed to find measurable difference). + +//#define mova128 movdqa +#define mova128 movaps +//#define movu128 movdqu +#define movu128 movups +//#define shuf128_32 pshufd +#define shuf128_32 shufps + + .section .text.sha256_process_block64_shaNI, "ax", @progbits + .globl sha256_process_block64_shaNI + .hidden sha256_process_block64_shaNI + .type sha256_process_block64_shaNI, @function + +#define DATA_PTR %rdi + +#define SHA256CONSTANTS %rax + +#define MSG %xmm0 +#define STATE0 %xmm1 +#define STATE1 %xmm2 +#define MSGTMP0 %xmm3 +#define MSGTMP1 %xmm4 +#define MSGTMP2 %xmm5 +#define MSGTMP3 %xmm6 +#define MSGTMP4 %xmm7 + +#define SHUF_MASK %xmm8 + +#define ABEF_SAVE %xmm9 +#define CDGH_SAVE %xmm10 + + .balign 8 # allow decoders to fetch at least 2 first insns +sha256_process_block64_shaNI: + movu128 80+0*16(%rdi), STATE0 + movu128 80+1*16(%rdi), STATE1 + + shuf128_32 $0xB1, STATE0, STATE0 /* CDAB */ + shuf128_32 $0x1B, STATE1, STATE1 /* EFGH */ + mova128 STATE0, MSGTMP4 + palignr $8, STATE1, STATE0 /* ABEF */ + pblendw $0xF0, MSGTMP4, STATE1 /* CDGH */ + + mova128 PSHUFFLE_BSWAP32_FLIP_MASK(%rip), SHUF_MASK + lea K256(%rip), SHA256CONSTANTS + + /* Save hash values for addition after rounds */ + mova128 STATE0, ABEF_SAVE + mova128 STATE1, CDGH_SAVE + + /* Rounds 0-3 */ + movu128 0*16(DATA_PTR), MSG + pshufb SHUF_MASK, MSG + mova128 MSG, MSGTMP0 + paddd 0*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + + /* Rounds 4-7 */ + movu128 1*16(DATA_PTR), MSG + pshufb SHUF_MASK, MSG + mova128 MSG, MSGTMP1 + paddd 1*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 8-11 */ + movu128 2*16(DATA_PTR), MSG + pshufb SHUF_MASK, MSG + mova128 MSG, MSGTMP2 + paddd 2*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 12-15 
*/ + movu128 3*16(DATA_PTR), MSG + pshufb SHUF_MASK, MSG + mova128 MSG, MSGTMP3 + paddd 3*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + mova128 MSGTMP3, MSGTMP4 + palignr $4, MSGTMP2, MSGTMP4 + paddd MSGTMP4, MSGTMP0 + sha256msg2 MSGTMP3, MSGTMP0 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 16-19 */ + mova128 MSGTMP0, MSG + paddd 4*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + mova128 MSGTMP0, MSGTMP4 + palignr $4, MSGTMP3, MSGTMP4 + paddd MSGTMP4, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 20-23 */ + mova128 MSGTMP1, MSG + paddd 5*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + mova128 MSGTMP1, MSGTMP4 + palignr $4, MSGTMP0, MSGTMP4 + paddd MSGTMP4, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 24-27 */ + mova128 MSGTMP2, MSG + paddd 6*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + mova128 MSGTMP2, MSGTMP4 + palignr $4, MSGTMP1, MSGTMP4 + paddd MSGTMP4, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 28-31 */ + mova128 MSGTMP3, MSG + paddd 7*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + mova128 MSGTMP3, MSGTMP4 + palignr $4, MSGTMP2, MSGTMP4 + paddd MSGTMP4, MSGTMP0 + sha256msg2 MSGTMP3, MSGTMP0 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 32-35 */ + mova128 MSGTMP0, MSG + paddd 8*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + mova128 MSGTMP0, MSGTMP4 + palignr $4, MSGTMP3, MSGTMP4 + paddd MSGTMP4, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 36-39 */ + mova128 MSGTMP1, MSG + paddd 9*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + mova128 MSGTMP1, MSGTMP4 + palignr $4, MSGTMP0, MSGTMP4 + paddd MSGTMP4, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 40-43 */ + mova128 MSGTMP2, MSG + paddd 10*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + mova128 MSGTMP2, MSGTMP4 + palignr $4, MSGTMP1, MSGTMP4 + paddd MSGTMP4, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 44-47 */ + mova128 MSGTMP3, MSG + paddd 11*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + mova128 MSGTMP3, MSGTMP4 + palignr $4, MSGTMP2, MSGTMP4 + paddd MSGTMP4, MSGTMP0 + sha256msg2 MSGTMP3, MSGTMP0 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 48-51 */ + mova128 MSGTMP0, MSG + paddd 12*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + mova128 MSGTMP0, MSGTMP4 + palignr $4, MSGTMP3, MSGTMP4 + paddd MSGTMP4, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 52-55 */ + mova128 MSGTMP1, MSG + paddd 13*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + mova128 MSGTMP1, MSGTMP4 + palignr $4, MSGTMP0, MSGTMP4 + paddd MSGTMP4, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + + /* Rounds 56-59 */ + mova128 MSGTMP2, MSG + paddd 14*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + mova128 
MSGTMP2, MSGTMP4 + palignr $4, MSGTMP1, MSGTMP4 + paddd MSGTMP4, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + + /* Rounds 60-63 */ + mova128 MSGTMP3, MSG + paddd 15*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + + /* Add current hash values with previously saved */ + paddd ABEF_SAVE, STATE0 + paddd CDGH_SAVE, STATE1 + + /* Write hash values back in the correct order */ + shuf128_32 $0x1B, STATE0, STATE0 /* FEBA */ + shuf128_32 $0xB1, STATE1, STATE1 /* DCHG */ + mova128 STATE0, MSGTMP4 + pblendw $0xF0, STATE1, STATE0 /* DCBA */ + palignr $8, MSGTMP4, STATE1 /* HGFE */ + + movu128 STATE0, 80+0*16(%rdi) + movu128 STATE1, 80+1*16(%rdi) + + ret + .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI + +.section .rodata.cst256.K256, "aM", @progbits, 256 +.balign 16 +K256: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16 +.balign 16 +PSHUFFLE_BSWAP32_FLIP_MASK: + .octa 0x0c0d0e0f08090a0b0405060700010203 + +#endif diff --git a/libbb/hash_md5_sha_x86-32_shaNI.S b/libbb/hash_md5_sha_x86-32_shaNI.S index 166cfd38a..11b855e26 100644 --- a/libbb/hash_md5_sha_x86-32_shaNI.S +++ b/libbb/hash_md5_sha_x86-32_shaNI.S @@ -20,7 +20,7 @@ #define extr128_32 pextrd //#define extr128_32 extractps # not shorter - .section .text.sha1_process_block64_shaNI,"ax",@progbits + .section .text.sha1_process_block64_shaNI, "ax", @progbits .globl sha1_process_block64_shaNI .hidden sha1_process_block64_shaNI .type sha1_process_block64_shaNI, @function @@ -224,7 +224,7 @@ sha1_process_block64_shaNI: .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 -.align 16 +.balign 16 PSHUFFLE_BYTE_FLIP_MASK: .octa 0x000102030405060708090a0b0c0d0e0f diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S index 743269d98..47ace60de 100644 --- a/libbb/hash_md5_sha_x86-64.S +++ b/libbb/hash_md5_sha_x86-64.S @@ -1394,7 +1394,7 @@ sha1_process_block64: .size sha1_process_block64, .-sha1_process_block64 .section .rodata.cst16.sha1const, "aM", @progbits, 16 - .align 16 + .balign 16 rconst0x5A827999: .long 0x5A827999 .long 0x5A827999 diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh index 47c40af0d..656fb5414 100755 --- a/libbb/hash_md5_sha_x86-64.S.sh +++ b/libbb/hash_md5_sha_x86-64.S.sh @@ -433,7 +433,7 @@ echo " .size sha1_process_block64, .-sha1_process_block64 .section .rodata.cst16.sha1const, \"aM\", @progbits, 16 - .align 16 + .balign 16 rconst0x5A827999: .long 0x5A827999 .long 0x5A827999 diff --git a/libbb/hash_md5_sha_x86-64_shaNI.S 
b/libbb/hash_md5_sha_x86-64_shaNI.S index 33cc3bf7f..ba92f09df 100644 --- a/libbb/hash_md5_sha_x86-64_shaNI.S +++ b/libbb/hash_md5_sha_x86-64_shaNI.S @@ -20,7 +20,7 @@ #define extr128_32 pextrd //#define extr128_32 extractps # not shorter - .section .text.sha1_process_block64_shaNI,"ax",@progbits + .section .text.sha1_process_block64_shaNI, "ax", @progbits .globl sha1_process_block64_shaNI .hidden sha1_process_block64_shaNI .type sha1_process_block64_shaNI, @function @@ -218,7 +218,7 @@ sha1_process_block64_shaNI: .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 -.align 16 +.balign 16 PSHUFFLE_BYTE_FLIP_MASK: .octa 0x000102030405060708090a0b0c0d0e0f -- cgit v1.2.3-55-g6feb From de6cb4bed82356db72af81890c7c26d7e85fb50d Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 3 Feb 2022 15:11:23 +0100 Subject: libbb/sha256: code shrink in 32-bit x86 function old new delta sha256_process_block64_shaNI 747 722 -25 Signed-off-by: Denys Vlasenko --- libbb/hash_md5_sha256_x86-32_shaNI.S | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/libbb/hash_md5_sha256_x86-32_shaNI.S b/libbb/hash_md5_sha256_x86-32_shaNI.S index 56e37fa38..632dab7e6 100644 --- a/libbb/hash_md5_sha256_x86-32_shaNI.S +++ b/libbb/hash_md5_sha256_x86-32_shaNI.S @@ -49,8 +49,7 @@ sha256_process_block64_shaNI: palignr $8, STATE1, STATE0 /* ABEF */ pblendw $0xF0, MSGTMP4, STATE1 /* CDGH */ -# mova128 PSHUFFLE_BSWAP32_FLIP_MASK, SHUF_MASK - lea K256, SHA256CONSTANTS + movl $K256+8*16, SHA256CONSTANTS /* Save hash values for addition after rounds */ mova128 STATE0, 0*16(%esp) @@ -60,7 +59,7 @@ sha256_process_block64_shaNI: movu128 0*16(DATA_PTR), MSG pshufb PSHUFFLE_BSWAP32_FLIP_MASK, MSG mova128 MSG, MSGTMP0 - paddd 0*16(SHA256CONSTANTS), MSG + paddd 0*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -69,7 +68,7 @@ sha256_process_block64_shaNI: movu128 1*16(DATA_PTR), MSG pshufb PSHUFFLE_BSWAP32_FLIP_MASK, MSG mova128 MSG, MSGTMP1 - paddd 1*16(SHA256CONSTANTS), MSG + paddd 1*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -79,7 +78,7 @@ sha256_process_block64_shaNI: movu128 2*16(DATA_PTR), MSG pshufb PSHUFFLE_BSWAP32_FLIP_MASK, MSG mova128 MSG, MSGTMP2 - paddd 2*16(SHA256CONSTANTS), MSG + paddd 2*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -89,7 +88,7 @@ sha256_process_block64_shaNI: movu128 3*16(DATA_PTR), MSG pshufb PSHUFFLE_BSWAP32_FLIP_MASK, MSG mova128 MSG, MSGTMP3 - paddd 3*16(SHA256CONSTANTS), MSG + paddd 3*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 mova128 MSGTMP3, MSGTMP4 palignr $4, MSGTMP2, MSGTMP4 @@ -101,7 +100,7 @@ sha256_process_block64_shaNI: /* Rounds 16-19 */ mova128 MSGTMP0, MSG - paddd 4*16(SHA256CONSTANTS), MSG + paddd 4*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 mova128 MSGTMP0, MSGTMP4 palignr $4, MSGTMP3, MSGTMP4 @@ -113,7 +112,7 @@ sha256_process_block64_shaNI: /* Rounds 20-23 */ mova128 MSGTMP1, MSG - paddd 5*16(SHA256CONSTANTS), MSG + paddd 5*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 mova128 MSGTMP1, MSGTMP4 palignr $4, MSGTMP0, MSGTMP4 @@ -125,7 +124,7 @@ sha256_process_block64_shaNI: /* Rounds 24-27 */ mova128 MSGTMP2, MSG - paddd 6*16(SHA256CONSTANTS), MSG + paddd 6*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 mova128 
MSGTMP2, MSGTMP4 palignr $4, MSGTMP1, MSGTMP4 @@ -137,7 +136,7 @@ sha256_process_block64_shaNI: /* Rounds 28-31 */ mova128 MSGTMP3, MSG - paddd 7*16(SHA256CONSTANTS), MSG + paddd 7*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 mova128 MSGTMP3, MSGTMP4 palignr $4, MSGTMP2, MSGTMP4 @@ -149,7 +148,7 @@ sha256_process_block64_shaNI: /* Rounds 32-35 */ mova128 MSGTMP0, MSG - paddd 8*16(SHA256CONSTANTS), MSG + paddd 8*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 mova128 MSGTMP0, MSGTMP4 palignr $4, MSGTMP3, MSGTMP4 @@ -161,7 +160,7 @@ sha256_process_block64_shaNI: /* Rounds 36-39 */ mova128 MSGTMP1, MSG - paddd 9*16(SHA256CONSTANTS), MSG + paddd 9*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 mova128 MSGTMP1, MSGTMP4 palignr $4, MSGTMP0, MSGTMP4 @@ -173,7 +172,7 @@ sha256_process_block64_shaNI: /* Rounds 40-43 */ mova128 MSGTMP2, MSG - paddd 10*16(SHA256CONSTANTS), MSG + paddd 10*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 mova128 MSGTMP2, MSGTMP4 palignr $4, MSGTMP1, MSGTMP4 @@ -185,7 +184,7 @@ sha256_process_block64_shaNI: /* Rounds 44-47 */ mova128 MSGTMP3, MSG - paddd 11*16(SHA256CONSTANTS), MSG + paddd 11*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 mova128 MSGTMP3, MSGTMP4 palignr $4, MSGTMP2, MSGTMP4 @@ -197,7 +196,7 @@ sha256_process_block64_shaNI: /* Rounds 48-51 */ mova128 MSGTMP0, MSG - paddd 12*16(SHA256CONSTANTS), MSG + paddd 12*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 mova128 MSGTMP0, MSGTMP4 palignr $4, MSGTMP3, MSGTMP4 @@ -209,7 +208,7 @@ sha256_process_block64_shaNI: /* Rounds 52-55 */ mova128 MSGTMP1, MSG - paddd 13*16(SHA256CONSTANTS), MSG + paddd 13*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 mova128 MSGTMP1, MSGTMP4 palignr $4, MSGTMP0, MSGTMP4 @@ -220,7 +219,7 @@ sha256_process_block64_shaNI: /* Rounds 56-59 */ mova128 MSGTMP2, MSG - paddd 14*16(SHA256CONSTANTS), MSG + paddd 14*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 mova128 MSGTMP2, MSGTMP4 palignr $4, MSGTMP1, MSGTMP4 @@ -231,7 +230,7 @@ sha256_process_block64_shaNI: /* Rounds 60-63 */ mova128 MSGTMP3, MSG - paddd 15*16(SHA256CONSTANTS), MSG + paddd 15*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 -- cgit v1.2.3-55-g6feb From a1429fbb8ca373efc01939d599f6f65969b1a366 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 3 Feb 2022 15:17:42 +0100 Subject: libbb/sha256: code shrink in 64-bit x86 function old new delta sha256_process_block64_shaNI 730 706 -24 Signed-off-by: Denys Vlasenko --- libbb/hash_md5_sha256_x86-64_shaNI.S | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/libbb/hash_md5_sha256_x86-64_shaNI.S b/libbb/hash_md5_sha256_x86-64_shaNI.S index 1c2b75af3..f3df541e4 100644 --- a/libbb/hash_md5_sha256_x86-64_shaNI.S +++ b/libbb/hash_md5_sha256_x86-64_shaNI.S @@ -50,7 +50,7 @@ sha256_process_block64_shaNI: pblendw $0xF0, MSGTMP4, STATE1 /* CDGH */ mova128 PSHUFFLE_BSWAP32_FLIP_MASK(%rip), SHUF_MASK - lea K256(%rip), SHA256CONSTANTS + leaq K256+8*16(%rip), SHA256CONSTANTS /* Save hash values for addition after rounds */ mova128 STATE0, ABEF_SAVE @@ -60,7 +60,7 @@ sha256_process_block64_shaNI: movu128 0*16(DATA_PTR), MSG pshufb SHUF_MASK, MSG mova128 MSG, MSGTMP0 - paddd 0*16(SHA256CONSTANTS), MSG + paddd 0*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -69,7 +69,7 @@ sha256_process_block64_shaNI: movu128 1*16(DATA_PTR), MSG 
pshufb SHUF_MASK, MSG mova128 MSG, MSGTMP1 - paddd 1*16(SHA256CONSTANTS), MSG + paddd 1*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -79,7 +79,7 @@ sha256_process_block64_shaNI: movu128 2*16(DATA_PTR), MSG pshufb SHUF_MASK, MSG mova128 MSG, MSGTMP2 - paddd 2*16(SHA256CONSTANTS), MSG + paddd 2*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -89,7 +89,7 @@ sha256_process_block64_shaNI: movu128 3*16(DATA_PTR), MSG pshufb SHUF_MASK, MSG mova128 MSG, MSGTMP3 - paddd 3*16(SHA256CONSTANTS), MSG + paddd 3*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 mova128 MSGTMP3, MSGTMP4 palignr $4, MSGTMP2, MSGTMP4 @@ -101,7 +101,7 @@ sha256_process_block64_shaNI: /* Rounds 16-19 */ mova128 MSGTMP0, MSG - paddd 4*16(SHA256CONSTANTS), MSG + paddd 4*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 mova128 MSGTMP0, MSGTMP4 palignr $4, MSGTMP3, MSGTMP4 @@ -113,7 +113,7 @@ sha256_process_block64_shaNI: /* Rounds 20-23 */ mova128 MSGTMP1, MSG - paddd 5*16(SHA256CONSTANTS), MSG + paddd 5*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 mova128 MSGTMP1, MSGTMP4 palignr $4, MSGTMP0, MSGTMP4 @@ -125,7 +125,7 @@ sha256_process_block64_shaNI: /* Rounds 24-27 */ mova128 MSGTMP2, MSG - paddd 6*16(SHA256CONSTANTS), MSG + paddd 6*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 mova128 MSGTMP2, MSGTMP4 palignr $4, MSGTMP1, MSGTMP4 @@ -137,7 +137,7 @@ sha256_process_block64_shaNI: /* Rounds 28-31 */ mova128 MSGTMP3, MSG - paddd 7*16(SHA256CONSTANTS), MSG + paddd 7*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 mova128 MSGTMP3, MSGTMP4 palignr $4, MSGTMP2, MSGTMP4 @@ -149,7 +149,7 @@ sha256_process_block64_shaNI: /* Rounds 32-35 */ mova128 MSGTMP0, MSG - paddd 8*16(SHA256CONSTANTS), MSG + paddd 8*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 mova128 MSGTMP0, MSGTMP4 palignr $4, MSGTMP3, MSGTMP4 @@ -161,7 +161,7 @@ sha256_process_block64_shaNI: /* Rounds 36-39 */ mova128 MSGTMP1, MSG - paddd 9*16(SHA256CONSTANTS), MSG + paddd 9*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 mova128 MSGTMP1, MSGTMP4 palignr $4, MSGTMP0, MSGTMP4 @@ -173,7 +173,7 @@ sha256_process_block64_shaNI: /* Rounds 40-43 */ mova128 MSGTMP2, MSG - paddd 10*16(SHA256CONSTANTS), MSG + paddd 10*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 mova128 MSGTMP2, MSGTMP4 palignr $4, MSGTMP1, MSGTMP4 @@ -185,7 +185,7 @@ sha256_process_block64_shaNI: /* Rounds 44-47 */ mova128 MSGTMP3, MSG - paddd 11*16(SHA256CONSTANTS), MSG + paddd 11*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 mova128 MSGTMP3, MSGTMP4 palignr $4, MSGTMP2, MSGTMP4 @@ -197,7 +197,7 @@ sha256_process_block64_shaNI: /* Rounds 48-51 */ mova128 MSGTMP0, MSG - paddd 12*16(SHA256CONSTANTS), MSG + paddd 12*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 mova128 MSGTMP0, MSGTMP4 palignr $4, MSGTMP3, MSGTMP4 @@ -209,7 +209,7 @@ sha256_process_block64_shaNI: /* Rounds 52-55 */ mova128 MSGTMP1, MSG - paddd 13*16(SHA256CONSTANTS), MSG + paddd 13*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 mova128 MSGTMP1, MSGTMP4 palignr $4, MSGTMP0, MSGTMP4 @@ -220,7 +220,7 @@ sha256_process_block64_shaNI: /* Rounds 56-59 */ mova128 MSGTMP2, MSG - paddd 14*16(SHA256CONSTANTS), MSG + paddd 14*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 mova128 MSGTMP2, MSGTMP4 palignr $4, MSGTMP1, MSGTMP4 @@ -231,7 +231,7 @@ sha256_process_block64_shaNI: /* Rounds 60-63 */ mova128 MSGTMP3, MSG 
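# Note on the N*16-8*16 addressing used throughout these two patches (my reading
# of the encoding rules; the commit messages only give the size deltas):
# SHA256CONSTANTS now points at K256+8*16, so the sixteen per-round constant
# loads use displacements 0*16-8*16 .. 15*16-8*16, i.e. -128..+112. All of these
# fit the one-byte disp8 form of the x86 ModRM encoding, whereas unbiased
# offsets 8*16..15*16 (128..240) would each need a four-byte disp32, which is
# where the reported -25/-24 byte shrinks appear to come from.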
- paddd 15*16(SHA256CONSTANTS), MSG + paddd 15*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 -- cgit v1.2.3-55-g6feb From 31c1c310772fa6c897ee1585ea15fc38f3ab3dff Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sun, 6 Feb 2022 00:30:03 +0100 Subject: libbb/sha256: code shrink in 64-bit x86 function old new delta sha256_process_block64_shaNI 706 701 -5 Signed-off-by: Denys Vlasenko --- libbb/hash_md5_sha256_x86-64_shaNI.S | 96 ++++++++++++++++++------------------ 1 file changed, 48 insertions(+), 48 deletions(-) diff --git a/libbb/hash_md5_sha256_x86-64_shaNI.S b/libbb/hash_md5_sha256_x86-64_shaNI.S index f3df541e4..dbf391135 100644 --- a/libbb/hash_md5_sha256_x86-64_shaNI.S +++ b/libbb/hash_md5_sha256_x86-64_shaNI.S @@ -31,9 +31,7 @@ #define MSGTMP1 %xmm4 #define MSGTMP2 %xmm5 #define MSGTMP3 %xmm6 -#define MSGTMP4 %xmm7 - -#define SHUF_MASK %xmm8 +#define XMMTMP4 %xmm7 #define ABEF_SAVE %xmm9 #define CDGH_SAVE %xmm10 @@ -45,11 +43,12 @@ sha256_process_block64_shaNI: shuf128_32 $0xB1, STATE0, STATE0 /* CDAB */ shuf128_32 $0x1B, STATE1, STATE1 /* EFGH */ - mova128 STATE0, MSGTMP4 + mova128 STATE0, XMMTMP4 palignr $8, STATE1, STATE0 /* ABEF */ - pblendw $0xF0, MSGTMP4, STATE1 /* CDGH */ + pblendw $0xF0, XMMTMP4, STATE1 /* CDGH */ - mova128 PSHUFFLE_BSWAP32_FLIP_MASK(%rip), SHUF_MASK +/* XMMTMP4 holds flip mask from here... */ + mova128 PSHUFFLE_BSWAP32_FLIP_MASK(%rip), XMMTMP4 leaq K256+8*16(%rip), SHA256CONSTANTS /* Save hash values for addition after rounds */ @@ -58,7 +57,7 @@ sha256_process_block64_shaNI: /* Rounds 0-3 */ movu128 0*16(DATA_PTR), MSG - pshufb SHUF_MASK, MSG + pshufb XMMTMP4, MSG mova128 MSG, MSGTMP0 paddd 0*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 @@ -67,7 +66,7 @@ sha256_process_block64_shaNI: /* Rounds 4-7 */ movu128 1*16(DATA_PTR), MSG - pshufb SHUF_MASK, MSG + pshufb XMMTMP4, MSG mova128 MSG, MSGTMP1 paddd 1*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 @@ -77,7 +76,7 @@ sha256_process_block64_shaNI: /* Rounds 8-11 */ movu128 2*16(DATA_PTR), MSG - pshufb SHUF_MASK, MSG + pshufb XMMTMP4, MSG mova128 MSG, MSGTMP2 paddd 2*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 @@ -87,13 +86,14 @@ sha256_process_block64_shaNI: /* Rounds 12-15 */ movu128 3*16(DATA_PTR), MSG - pshufb SHUF_MASK, MSG + pshufb XMMTMP4, MSG +/* ...to here */ mova128 MSG, MSGTMP3 paddd 3*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP3, MSGTMP4 - palignr $4, MSGTMP2, MSGTMP4 - paddd MSGTMP4, MSGTMP0 + mova128 MSGTMP3, XMMTMP4 + palignr $4, MSGTMP2, XMMTMP4 + paddd XMMTMP4, MSGTMP0 sha256msg2 MSGTMP3, MSGTMP0 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -103,9 +103,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP0, MSG paddd 4*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP0, MSGTMP4 - palignr $4, MSGTMP3, MSGTMP4 - paddd MSGTMP4, MSGTMP1 + mova128 MSGTMP0, XMMTMP4 + palignr $4, MSGTMP3, XMMTMP4 + paddd XMMTMP4, MSGTMP1 sha256msg2 MSGTMP0, MSGTMP1 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -115,9 +115,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP1, MSG paddd 5*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP1, MSGTMP4 - palignr $4, MSGTMP0, MSGTMP4 - paddd MSGTMP4, MSGTMP2 + mova128 MSGTMP1, XMMTMP4 + palignr $4, MSGTMP0, XMMTMP4 + paddd XMMTMP4, MSGTMP2 sha256msg2 MSGTMP1, MSGTMP2 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -127,9 +127,9 @@ sha256_process_block64_shaNI: mova128 
MSGTMP2, MSG paddd 6*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP2, MSGTMP4 - palignr $4, MSGTMP1, MSGTMP4 - paddd MSGTMP4, MSGTMP3 + mova128 MSGTMP2, XMMTMP4 + palignr $4, MSGTMP1, XMMTMP4 + paddd XMMTMP4, MSGTMP3 sha256msg2 MSGTMP2, MSGTMP3 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -139,9 +139,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP3, MSG paddd 7*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP3, MSGTMP4 - palignr $4, MSGTMP2, MSGTMP4 - paddd MSGTMP4, MSGTMP0 + mova128 MSGTMP3, XMMTMP4 + palignr $4, MSGTMP2, XMMTMP4 + paddd XMMTMP4, MSGTMP0 sha256msg2 MSGTMP3, MSGTMP0 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -151,9 +151,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP0, MSG paddd 8*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP0, MSGTMP4 - palignr $4, MSGTMP3, MSGTMP4 - paddd MSGTMP4, MSGTMP1 + mova128 MSGTMP0, XMMTMP4 + palignr $4, MSGTMP3, XMMTMP4 + paddd XMMTMP4, MSGTMP1 sha256msg2 MSGTMP0, MSGTMP1 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -163,9 +163,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP1, MSG paddd 9*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP1, MSGTMP4 - palignr $4, MSGTMP0, MSGTMP4 - paddd MSGTMP4, MSGTMP2 + mova128 MSGTMP1, XMMTMP4 + palignr $4, MSGTMP0, XMMTMP4 + paddd XMMTMP4, MSGTMP2 sha256msg2 MSGTMP1, MSGTMP2 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -175,9 +175,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP2, MSG paddd 10*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP2, MSGTMP4 - palignr $4, MSGTMP1, MSGTMP4 - paddd MSGTMP4, MSGTMP3 + mova128 MSGTMP2, XMMTMP4 + palignr $4, MSGTMP1, XMMTMP4 + paddd XMMTMP4, MSGTMP3 sha256msg2 MSGTMP2, MSGTMP3 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -187,9 +187,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP3, MSG paddd 11*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP3, MSGTMP4 - palignr $4, MSGTMP2, MSGTMP4 - paddd MSGTMP4, MSGTMP0 + mova128 MSGTMP3, XMMTMP4 + palignr $4, MSGTMP2, XMMTMP4 + paddd XMMTMP4, MSGTMP0 sha256msg2 MSGTMP3, MSGTMP0 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -199,9 +199,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP0, MSG paddd 12*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP0, MSGTMP4 - palignr $4, MSGTMP3, MSGTMP4 - paddd MSGTMP4, MSGTMP1 + mova128 MSGTMP0, XMMTMP4 + palignr $4, MSGTMP3, XMMTMP4 + paddd XMMTMP4, MSGTMP1 sha256msg2 MSGTMP0, MSGTMP1 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -211,9 +211,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP1, MSG paddd 13*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP1, MSGTMP4 - palignr $4, MSGTMP0, MSGTMP4 - paddd MSGTMP4, MSGTMP2 + mova128 MSGTMP1, XMMTMP4 + palignr $4, MSGTMP0, XMMTMP4 + paddd XMMTMP4, MSGTMP2 sha256msg2 MSGTMP1, MSGTMP2 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -222,9 +222,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP2, MSG paddd 14*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP2, MSGTMP4 - palignr $4, MSGTMP1, MSGTMP4 - paddd MSGTMP4, MSGTMP3 + mova128 MSGTMP2, XMMTMP4 + palignr $4, MSGTMP1, XMMTMP4 + paddd XMMTMP4, MSGTMP3 sha256msg2 MSGTMP2, MSGTMP3 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -243,9 +243,9 @@ sha256_process_block64_shaNI: /* Write hash values back in the correct order */ shuf128_32 $0x1B, STATE0, STATE0 /* FEBA */ 
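# Note on the SHUF_MASK -> XMMTMP4 change above (my reading; the commit message
# only reports the -5 byte delta): parking the byte-swap mask in %xmm7 while the
# four input blocks are loaded, then recycling %xmm7 as the schedule scratch
# register, means %xmm8 is no longer referenced at all. Instructions touching
# %xmm8..%xmm15 need an extra REX prefix byte, so dropping the one mova128 load
# and the four pshufb uses of %xmm8 plausibly accounts for the reported delta.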
shuf128_32 $0xB1, STATE1, STATE1 /* DCHG */ - mova128 STATE0, MSGTMP4 + mova128 STATE0, XMMTMP4 pblendw $0xF0, STATE1, STATE0 /* DCBA */ - palignr $8, MSGTMP4, STATE1 /* HGFE */ + palignr $8, XMMTMP4, STATE1 /* HGFE */ movu128 STATE0, 80+0*16(%rdi) movu128 STATE1, 80+1*16(%rdi) -- cgit v1.2.3-55-g6feb From 4f40735c87f8292a87c066b3b7099b0be007cf59 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sun, 6 Feb 2022 00:55:52 +0100 Subject: libbb/sha256: code shrink in 32-bit x86 function old new delta sha256_process_block64_shaNI 722 713 -9 Signed-off-by: Denys Vlasenko --- libbb/hash_md5_sha256_x86-32_shaNI.S | 93 +++++++++++++++++++----------------- 1 file changed, 48 insertions(+), 45 deletions(-) diff --git a/libbb/hash_md5_sha256_x86-32_shaNI.S b/libbb/hash_md5_sha256_x86-32_shaNI.S index 632dab7e6..417da37d8 100644 --- a/libbb/hash_md5_sha256_x86-32_shaNI.S +++ b/libbb/hash_md5_sha256_x86-32_shaNI.S @@ -31,7 +31,7 @@ #define MSGTMP1 %xmm4 #define MSGTMP2 %xmm5 #define MSGTMP3 %xmm6 -#define MSGTMP4 %xmm7 +#define XMMTMP4 %xmm7 .balign 8 # allow decoders to fetch at least 3 first insns sha256_process_block64_shaNI: @@ -45,10 +45,12 @@ sha256_process_block64_shaNI: shuf128_32 $0xB1, STATE0, STATE0 /* CDAB */ shuf128_32 $0x1B, STATE1, STATE1 /* EFGH */ - mova128 STATE0, MSGTMP4 + mova128 STATE0, XMMTMP4 palignr $8, STATE1, STATE0 /* ABEF */ - pblendw $0xF0, MSGTMP4, STATE1 /* CDGH */ + pblendw $0xF0, XMMTMP4, STATE1 /* CDGH */ +/* XMMTMP4 holds flip mask from here... */ + mova128 PSHUFFLE_BSWAP32_FLIP_MASK, XMMTMP4 movl $K256+8*16, SHA256CONSTANTS /* Save hash values for addition after rounds */ @@ -57,7 +59,7 @@ sha256_process_block64_shaNI: /* Rounds 0-3 */ movu128 0*16(DATA_PTR), MSG - pshufb PSHUFFLE_BSWAP32_FLIP_MASK, MSG + pshufb XMMTMP4, MSG mova128 MSG, MSGTMP0 paddd 0*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 @@ -66,7 +68,7 @@ sha256_process_block64_shaNI: /* Rounds 4-7 */ movu128 1*16(DATA_PTR), MSG - pshufb PSHUFFLE_BSWAP32_FLIP_MASK, MSG + pshufb XMMTMP4, MSG mova128 MSG, MSGTMP1 paddd 1*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 @@ -76,7 +78,7 @@ sha256_process_block64_shaNI: /* Rounds 8-11 */ movu128 2*16(DATA_PTR), MSG - pshufb PSHUFFLE_BSWAP32_FLIP_MASK, MSG + pshufb XMMTMP4, MSG mova128 MSG, MSGTMP2 paddd 2*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 @@ -86,13 +88,14 @@ sha256_process_block64_shaNI: /* Rounds 12-15 */ movu128 3*16(DATA_PTR), MSG - pshufb PSHUFFLE_BSWAP32_FLIP_MASK, MSG + pshufb XMMTMP4, MSG +/* ...to here */ mova128 MSG, MSGTMP3 paddd 3*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP3, MSGTMP4 - palignr $4, MSGTMP2, MSGTMP4 - paddd MSGTMP4, MSGTMP0 + mova128 MSGTMP3, XMMTMP4 + palignr $4, MSGTMP2, XMMTMP4 + paddd XMMTMP4, MSGTMP0 sha256msg2 MSGTMP3, MSGTMP0 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -102,9 +105,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP0, MSG paddd 4*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP0, MSGTMP4 - palignr $4, MSGTMP3, MSGTMP4 - paddd MSGTMP4, MSGTMP1 + mova128 MSGTMP0, XMMTMP4 + palignr $4, MSGTMP3, XMMTMP4 + paddd XMMTMP4, MSGTMP1 sha256msg2 MSGTMP0, MSGTMP1 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -114,9 +117,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP1, MSG paddd 5*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP1, MSGTMP4 - palignr $4, MSGTMP0, MSGTMP4 - paddd MSGTMP4, MSGTMP2 + mova128 MSGTMP1, XMMTMP4 + palignr $4, MSGTMP0, XMMTMP4 + paddd XMMTMP4, MSGTMP2 
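# Note on the 32-bit counterpart above (my own byte accounting, not stated in
# the commit message): caching PSHUFFLE_BSWAP32_FLIP_MASK in XMMTMP4 turns four
# pshufb-from-memory uses, each carrying a 4-byte absolute address, into plain
# register-to-register pshufb at the cost of one extra mova128 load of the mask,
# which is consistent with the reported -9 bytes.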
sha256msg2 MSGTMP1, MSGTMP2 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -126,9 +129,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP2, MSG paddd 6*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP2, MSGTMP4 - palignr $4, MSGTMP1, MSGTMP4 - paddd MSGTMP4, MSGTMP3 + mova128 MSGTMP2, XMMTMP4 + palignr $4, MSGTMP1, XMMTMP4 + paddd XMMTMP4, MSGTMP3 sha256msg2 MSGTMP2, MSGTMP3 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -138,9 +141,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP3, MSG paddd 7*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP3, MSGTMP4 - palignr $4, MSGTMP2, MSGTMP4 - paddd MSGTMP4, MSGTMP0 + mova128 MSGTMP3, XMMTMP4 + palignr $4, MSGTMP2, XMMTMP4 + paddd XMMTMP4, MSGTMP0 sha256msg2 MSGTMP3, MSGTMP0 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -150,9 +153,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP0, MSG paddd 8*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP0, MSGTMP4 - palignr $4, MSGTMP3, MSGTMP4 - paddd MSGTMP4, MSGTMP1 + mova128 MSGTMP0, XMMTMP4 + palignr $4, MSGTMP3, XMMTMP4 + paddd XMMTMP4, MSGTMP1 sha256msg2 MSGTMP0, MSGTMP1 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -162,9 +165,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP1, MSG paddd 9*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP1, MSGTMP4 - palignr $4, MSGTMP0, MSGTMP4 - paddd MSGTMP4, MSGTMP2 + mova128 MSGTMP1, XMMTMP4 + palignr $4, MSGTMP0, XMMTMP4 + paddd XMMTMP4, MSGTMP2 sha256msg2 MSGTMP1, MSGTMP2 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -174,9 +177,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP2, MSG paddd 10*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP2, MSGTMP4 - palignr $4, MSGTMP1, MSGTMP4 - paddd MSGTMP4, MSGTMP3 + mova128 MSGTMP2, XMMTMP4 + palignr $4, MSGTMP1, XMMTMP4 + paddd XMMTMP4, MSGTMP3 sha256msg2 MSGTMP2, MSGTMP3 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -186,9 +189,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP3, MSG paddd 11*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP3, MSGTMP4 - palignr $4, MSGTMP2, MSGTMP4 - paddd MSGTMP4, MSGTMP0 + mova128 MSGTMP3, XMMTMP4 + palignr $4, MSGTMP2, XMMTMP4 + paddd XMMTMP4, MSGTMP0 sha256msg2 MSGTMP3, MSGTMP0 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -198,9 +201,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP0, MSG paddd 12*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP0, MSGTMP4 - palignr $4, MSGTMP3, MSGTMP4 - paddd MSGTMP4, MSGTMP1 + mova128 MSGTMP0, XMMTMP4 + palignr $4, MSGTMP3, XMMTMP4 + paddd XMMTMP4, MSGTMP1 sha256msg2 MSGTMP0, MSGTMP1 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -210,9 +213,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP1, MSG paddd 13*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP1, MSGTMP4 - palignr $4, MSGTMP0, MSGTMP4 - paddd MSGTMP4, MSGTMP2 + mova128 MSGTMP1, XMMTMP4 + palignr $4, MSGTMP0, XMMTMP4 + paddd XMMTMP4, MSGTMP2 sha256msg2 MSGTMP1, MSGTMP2 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -221,9 +224,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP2, MSG paddd 14*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP2, MSGTMP4 - palignr $4, MSGTMP1, MSGTMP4 - paddd MSGTMP4, MSGTMP3 + mova128 MSGTMP2, XMMTMP4 + palignr $4, MSGTMP1, XMMTMP4 + paddd XMMTMP4, MSGTMP3 sha256msg2 MSGTMP2, MSGTMP3 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -242,9 
+245,9 @@ sha256_process_block64_shaNI: /* Write hash values back in the correct order */ shuf128_32 $0x1B, STATE0, STATE0 /* FEBA */ shuf128_32 $0xB1, STATE1, STATE1 /* DCHG */ - mova128 STATE0, MSGTMP4 + mova128 STATE0, XMMTMP4 pblendw $0xF0, STATE1, STATE0 /* DCBA */ - palignr $8, MSGTMP4, STATE1 /* HGFE */ + palignr $8, XMMTMP4, STATE1 /* HGFE */ movu128 STATE0, 76+0*16(%eax) movu128 STATE1, 76+1*16(%eax) -- cgit v1.2.3-55-g6feb From ca466f385ac985a8b3491daa9f326dc480cdee70 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sun, 6 Feb 2022 19:53:10 +0100 Subject: *: slap on a few ALIGN* where appropriate The result of looking at "grep -F -B2 '*fill*' busybox_unstripped.map" function old new delta .rodata 108586 108460 -126 ------------------------------------------------------------------------------ (add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-126) Total: -126 bytes text data bss dec hex filename 970412 4219 1848 976479 ee65f busybox_old 970286 4219 1848 976353 ee5e1 busybox_unstripped Signed-off-by: Denys Vlasenko --- console-tools/reset.c | 2 +- coreutils/od.c | 2 +- include/platform.h | 1 + libbb/appletlib.c | 2 +- libbb/get_console.c | 2 +- miscutils/bc.c | 2 +- miscutils/man.c | 2 +- networking/ifupdown.c | 8 ++++---- networking/interface.c | 6 +++--- networking/libiproute/ipaddress.c | 2 +- networking/udhcp/common.c | 2 +- networking/udhcp/d6_dhcpc.c | 2 +- shell/ash.c | 2 +- util-linux/hexdump.c | 2 +- util-linux/nsenter.c | 2 +- util-linux/unshare.c | 2 +- 16 files changed, 21 insertions(+), 20 deletions(-) diff --git a/console-tools/reset.c b/console-tools/reset.c index b3acf69f8..cc04e4fcc 100644 --- a/console-tools/reset.c +++ b/console-tools/reset.c @@ -36,7 +36,7 @@ int stty_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE; int reset_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE; int reset_main(int argc UNUSED_PARAM, char **argv UNUSED_PARAM) { - static const char *const args[] = { + static const char *const args[] ALIGN_PTR = { "stty", "sane", NULL }; diff --git a/coreutils/od.c b/coreutils/od.c index 9a888dd5f..6f22331e0 100644 --- a/coreutils/od.c +++ b/coreutils/od.c @@ -144,7 +144,7 @@ odoffset(dumper_t *dumper, int argc, char ***argvp) } } -static const char *const add_strings[] = { +static const char *const add_strings[] ALIGN_PTR = { "16/1 \"%3_u \" \"\\n\"", /* a */ "8/2 \" %06o \" \"\\n\"", /* B, o */ "16/1 \"%03o \" \"\\n\"", /* b */ diff --git a/include/platform.h b/include/platform.h index ad27bb31a..ea0512f36 100644 --- a/include/platform.h +++ b/include/platform.h @@ -346,6 +346,7 @@ typedef unsigned smalluint; # define ALIGN4 #endif #define ALIGN8 __attribute__((aligned(8))) +#define ALIGN_INT __attribute__((aligned(sizeof(int)))) #define ALIGN_PTR __attribute__((aligned(sizeof(void*)))) /* diff --git a/libbb/appletlib.c b/libbb/appletlib.c index 03389f541..841b3b873 100644 --- a/libbb/appletlib.c +++ b/libbb/appletlib.c @@ -651,7 +651,7 @@ static void check_suid(int applet_no) # if ENABLE_FEATURE_INSTALLER static const char usr_bin [] ALIGN1 = "/usr/bin/"; static const char usr_sbin[] ALIGN1 = "/usr/sbin/"; -static const char *const install_dir[] = { +static const char *const install_dir[] ALIGN_PTR = { &usr_bin [8], /* "/" */ &usr_bin [4], /* "/bin/" */ &usr_sbin[4] /* "/sbin/" */ diff --git a/libbb/get_console.c b/libbb/get_console.c index 7f2c75332..9044efea1 100644 --- a/libbb/get_console.c +++ b/libbb/get_console.c @@ -37,7 +37,7 @@ static int open_a_console(const char *fnam) */ int FAST_FUNC get_console_fd_or_die(void) { - static const 
char *const console_names[] = { + static const char *const console_names[] ALIGN_PTR = { DEV_CONSOLE, CURRENT_VC, CURRENT_TTY }; diff --git a/miscutils/bc.c b/miscutils/bc.c index ae370ff55..ab785bbc8 100644 --- a/miscutils/bc.c +++ b/miscutils/bc.c @@ -6011,7 +6011,7 @@ static BC_STATUS zxc_program_assign(char inst) #endif if (ib || sc || left->t == XC_RESULT_OBASE) { - static const char *const msg[] = { + static const char *const msg[] ALIGN_PTR = { "bad ibase; must be [2,16]", //XC_RESULT_IBASE "bad obase; must be [2,"BC_MAX_OBASE_STR"]", //XC_RESULT_OBASE "bad scale; must be [0,"BC_MAX_SCALE_STR"]", //XC_RESULT_SCALE diff --git a/miscutils/man.c b/miscutils/man.c index d319e8bba..deaf9e5ab 100644 --- a/miscutils/man.c +++ b/miscutils/man.c @@ -303,7 +303,7 @@ int man_main(int argc UNUSED_PARAM, char **argv) config_close(parser); if (!man_path_list) { - static const char *const mpl[] = { "/usr/man", "/usr/share/man", NULL }; + static const char *const mpl[] ALIGN_PTR = { "/usr/man", "/usr/share/man", NULL }; man_path_list = (char**)mpl; /*count_mp = 2; - not used below anyway */ } diff --git a/networking/ifupdown.c b/networking/ifupdown.c index 737113dd4..6c4ae27f2 100644 --- a/networking/ifupdown.c +++ b/networking/ifupdown.c @@ -532,7 +532,7 @@ static int FAST_FUNC v4tunnel_down(struct interface_defn_t * ifd, execfn * exec) } # endif -static const struct method_t methods6[] = { +static const struct method_t methods6[] ALIGN_PTR = { # if ENABLE_FEATURE_IFUPDOWN_IP { "v4tunnel" , v4tunnel_up , v4tunnel_down , }, # endif @@ -627,7 +627,7 @@ struct dhcp_client_t { const char *stopcmd; }; -static const struct dhcp_client_t ext_dhcp_clients[] = { +static const struct dhcp_client_t ext_dhcp_clients[] ALIGN_PTR = { { "dhcpcd", "dhcpcd[[ -h %hostname%]][[ -i %vendor%]][[ -I %client%]][[ -l %leasetime%]] %iface%", "dhcpcd -k %iface%", @@ -774,7 +774,7 @@ static int FAST_FUNC wvdial_down(struct interface_defn_t *ifd, execfn *exec) "-p /var/run/wvdial.%iface% -s 2", ifd, exec); } -static const struct method_t methods[] = { +static const struct method_t methods[] ALIGN_PTR = { { "manual" , manual_up_down, manual_up_down, }, { "wvdial" , wvdial_up , wvdial_down , }, { "ppp" , ppp_up , ppp_down , }, @@ -797,7 +797,7 @@ static int FAST_FUNC link_up_down(struct interface_defn_t *ifd UNUSED_PARAM, exe return 1; } -static const struct method_t link_methods[] = { +static const struct method_t link_methods[] ALIGN_PTR = { { "none", link_up_down, link_up_down } }; diff --git a/networking/interface.c b/networking/interface.c index ea6a2c8a8..6b6c0944a 100644 --- a/networking/interface.c +++ b/networking/interface.c @@ -446,13 +446,13 @@ static char *get_name(char name[IFNAMSIZ], char *p) * %n specifiers (even the size of integers may not match). 
*/ #if INT_MAX == LONG_MAX -static const char *const ss_fmt[] = { +static const char *const ss_fmt[] ALIGN_PTR = { "%n%llu%u%u%u%u%n%n%n%llu%u%u%u%u%u", "%llu%llu%u%u%u%u%n%n%llu%llu%u%u%u%u%u", "%llu%llu%u%u%u%u%u%u%llu%llu%u%u%u%u%u%u" }; #else -static const char *const ss_fmt[] = { +static const char *const ss_fmt[] ALIGN_PTR = { "%n%llu%lu%lu%lu%lu%n%n%n%llu%lu%lu%lu%lu%lu", "%llu%llu%lu%lu%lu%lu%n%n%llu%llu%lu%lu%lu%lu%lu", "%llu%llu%lu%lu%lu%lu%lu%lu%llu%llu%lu%lu%lu%lu%lu%lu" @@ -731,7 +731,7 @@ static const struct hwtype ib_hwtype = { #endif -static const struct hwtype *const hwtypes[] = { +static const struct hwtype *const hwtypes[] ALIGN_PTR = { &loop_hwtype, &ether_hwtype, &ppp_hwtype, diff --git a/networking/libiproute/ipaddress.c b/networking/libiproute/ipaddress.c index 17a838411..ecc3848ff 100644 --- a/networking/libiproute/ipaddress.c +++ b/networking/libiproute/ipaddress.c @@ -58,7 +58,7 @@ typedef struct filter_t filter_t; static void print_link_flags(unsigned flags, unsigned mdown) { - static const int flag_masks[] = { + static const int flag_masks[] ALIGN_INT = { IFF_LOOPBACK, IFF_BROADCAST, IFF_POINTOPOINT, IFF_MULTICAST, IFF_NOARP, IFF_UP, IFF_LOWER_UP }; static const char flag_labels[] ALIGN1 = diff --git a/networking/udhcp/common.c b/networking/udhcp/common.c index 8e9b93655..ae818db05 100644 --- a/networking/udhcp/common.c +++ b/networking/udhcp/common.c @@ -19,7 +19,7 @@ const uint8_t MAC_BCAST_ADDR[6] ALIGN2 = { * See RFC2132 for more options. * OPTION_REQ: these options are requested by udhcpc (unless -o). */ -const struct dhcp_optflag dhcp_optflags[] = { +const struct dhcp_optflag dhcp_optflags[] ALIGN2 = { /* flags code */ { OPTION_IP | OPTION_REQ, 0x01 }, /* DHCP_SUBNET */ { OPTION_S32 , 0x02 }, /* DHCP_TIME_OFFSET */ diff --git a/networking/udhcp/d6_dhcpc.c b/networking/udhcp/d6_dhcpc.c index 9d2a8f5d3..9fc690315 100644 --- a/networking/udhcp/d6_dhcpc.c +++ b/networking/udhcp/d6_dhcpc.c @@ -65,7 +65,7 @@ /* "struct client_data_t client_data" is in bb_common_bufsiz1 */ -static const struct dhcp_optflag d6_optflags[] = { +static const struct dhcp_optflag d6_optflags[] ALIGN2 = { #if ENABLE_FEATURE_UDHCPC6_RFC3646 { OPTION_6RD | OPTION_LIST | OPTION_REQ, D6_OPT_DNS_SERVERS }, { OPTION_DNS_STRING | OPTION_LIST | OPTION_REQ, D6_OPT_DOMAIN_LIST }, diff --git a/shell/ash.c b/shell/ash.c index 55df54bd0..adb0f223a 100644 --- a/shell/ash.c +++ b/shell/ash.c @@ -313,7 +313,7 @@ typedef long arith_t; /* ============ Shell options */ /* If you add/change options hare, update --help text too */ -static const char *const optletters_optnames[] = { +static const char *const optletters_optnames[] ALIGN_PTR = { "e" "errexit", "f" "noglob", /* bash has '-o ignoreeof', but no short synonym -I for it */ diff --git a/util-linux/hexdump.c b/util-linux/hexdump.c index 57e7e8db7..307a84803 100644 --- a/util-linux/hexdump.c +++ b/util-linux/hexdump.c @@ -71,7 +71,7 @@ static void bb_dump_addfile(dumper_t *dumper, char *name) fclose(fp); } -static const char *const add_strings[] = { +static const char *const add_strings[] ALIGN_PTR = { "\"%07.7_ax \"16/1 \"%03o \"\"\n\"", /* b */ "\"%07.7_ax \"16/1 \"%3_c \"\"\n\"", /* c */ "\"%07.7_ax \"8/2 \" %05u \"\"\n\"", /* d */ diff --git a/util-linux/nsenter.c b/util-linux/nsenter.c index e6339da2f..1aa045b35 100644 --- a/util-linux/nsenter.c +++ b/util-linux/nsenter.c @@ -93,7 +93,7 @@ enum { * The user namespace comes first, so that it is entered first. * This gives an unprivileged user the potential to enter other namespaces.
*/ -static const struct namespace_descr ns_list[] = { +static const struct namespace_descr ns_list[] ALIGN_INT = { { CLONE_NEWUSER, "ns/user", }, { CLONE_NEWIPC, "ns/ipc", }, { CLONE_NEWUTS, "ns/uts", }, diff --git a/util-linux/unshare.c b/util-linux/unshare.c index 68ccdd874..06b938074 100644 --- a/util-linux/unshare.c +++ b/util-linux/unshare.c @@ -120,7 +120,7 @@ enum { NS_USR_POS, /* OPT_user, NS_USR_POS, and ns_list[] index must match! */ NS_COUNT, }; -static const struct namespace_descr ns_list[] = { +static const struct namespace_descr ns_list[] ALIGN_INT = { { CLONE_NEWNS, "mnt" }, { CLONE_NEWUTS, "uts" }, { CLONE_NEWIPC, "ipc" }, -- cgit v1.2.3-55-g6feb From 987be932ed3cbea56b68bbe85649191c13b66015 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sun, 6 Feb 2022 20:07:12 +0100 Subject: *: slap on a few ALIGN_PTR where appropriate Signed-off-by: Denys Vlasenko --- coreutils/test.c | 2 +- e2fsprogs/fsck.c | 2 +- libbb/getopt32.c | 2 +- miscutils/devfsd.c | 4 ++-- modutils/modutils-24.c | 4 ++-- networking/inetd.c | 2 +- procps/nmeter.c | 2 +- selinux/setenforce.c | 2 +- shell/hush.c | 10 +++++----- 9 files changed, 15 insertions(+), 15 deletions(-) diff --git a/coreutils/test.c b/coreutils/test.c index a914c7490..840a0daaf 100644 --- a/coreutils/test.c +++ b/coreutils/test.c @@ -242,7 +242,7 @@ int depth; depth--; \ return __res; \ } while (0) -static const char *const TOKSTR[] = { +static const char *const TOKSTR[] ALIGN_PTR = { "EOI", "FILRD", "FILWR", diff --git a/e2fsprogs/fsck.c b/e2fsprogs/fsck.c index 96c1e51e0..028f8a803 100644 --- a/e2fsprogs/fsck.c +++ b/e2fsprogs/fsck.c @@ -190,7 +190,7 @@ struct globals { * Required for the uber-silly devfs /dev/ide/host1/bus2/target3/lun3 * pathames. */ -static const char *const devfs_hier[] = { +static const char *const devfs_hier[] ALIGN_PTR = { "host", "bus", "target", "lun", NULL }; #endif diff --git a/libbb/getopt32.c b/libbb/getopt32.c index 5ab4d66f1..e861d0567 100644 --- a/libbb/getopt32.c +++ b/libbb/getopt32.c @@ -296,7 +296,7 @@ Special characters: /* Code here assumes that 'unsigned' is at least 32 bits wide */ -const char *const bb_argv_dash[] = { "-", NULL }; +const char *const bb_argv_dash[] ALIGN_PTR = { "-", NULL }; enum { PARAM_STRING, diff --git a/miscutils/devfsd.c b/miscutils/devfsd.c index 839d00fd0..fb9ebcf60 100644 --- a/miscutils/devfsd.c +++ b/miscutils/devfsd.c @@ -928,7 +928,7 @@ static void action_compat(const struct devfsd_notify_struct *info, unsigned int unsigned int i; char rewind_; /* 1 to 5 "scsi/" , 6 to 9 "ide/host" */ - static const char *const fmt[] = { + static const char *const fmt[] ALIGN_PTR = { NULL , "sg/c%db%dt%du%d", /* scsi/generic */ "sd/c%db%dt%du%d", /* scsi/disc */ @@ -1468,7 +1468,7 @@ const char *get_old_name(const char *devname, unsigned int namelen, const char *pty1; const char *pty2; /* 1 to 5 "scsi/" , 6 to 9 "ide/host", 10 sbp/, 11 vcc/, 12 pty/ */ - static const char *const fmt[] = { + static const char *const fmt[] ALIGN_PTR = { NULL , "sg%u", /* scsi/generic */ NULL, /* scsi/disc */ diff --git a/modutils/modutils-24.c b/modutils/modutils-24.c index ac8632481..d0bc2a6ef 100644 --- a/modutils/modutils-24.c +++ b/modutils/modutils-24.c @@ -3458,7 +3458,7 @@ static int obj_load_progbits(char *image, size_t image_size, struct obj_file *f, static void hide_special_symbols(struct obj_file *f) { - static const char *const specials[] = { + static const char *const specials[] ALIGN_PTR = { SPFX "cleanup_module", SPFX "init_module", SPFX "kernel_version", @@ -3484,7 +3484,7 @@ static 
int obj_gpl_license(struct obj_file *f, const char **license) * linux/include/linux/module.h. Checking for leading "GPL" will not * work, somebody will use "GPL sucks, this is proprietary". */ - static const char *const gpl_licenses[] = { + static const char *const gpl_licenses[] ALIGN_PTR = { "GPL", "GPL v2", "GPL and additional rights", diff --git a/networking/inetd.c b/networking/inetd.c index e71be51c3..fb2fbe323 100644 --- a/networking/inetd.c +++ b/networking/inetd.c @@ -1538,7 +1538,7 @@ int inetd_main(int argc UNUSED_PARAM, char **argv) #if ENABLE_FEATURE_INETD_SUPPORT_BUILTIN_ECHO \ || ENABLE_FEATURE_INETD_SUPPORT_BUILTIN_DISCARD # if !BB_MMU -static const char *const cat_args[] = { "cat", NULL }; +static const char *const cat_args[] ALIGN_PTR = { "cat", NULL }; # endif #endif diff --git a/procps/nmeter.c b/procps/nmeter.c index 2310e9844..088d366bf 100644 --- a/procps/nmeter.c +++ b/procps/nmeter.c @@ -70,7 +70,7 @@ typedef struct proc_file { smallint last_gen; } proc_file; -static const char *const proc_name[] = { +static const char *const proc_name[] ALIGN_PTR = { "stat", // Must match the order of proc_file's! "loadavg", "net/dev", diff --git a/selinux/setenforce.c b/selinux/setenforce.c index 996034f8e..2267be451 100644 --- a/selinux/setenforce.c +++ b/selinux/setenforce.c @@ -26,7 +26,7 @@ /* These strings are arranged so that odd ones * result in security_setenforce(1) being done, * the rest will do security_setenforce(0) */ -static const char *const setenforce_cmd[] = { +static const char *const setenforce_cmd[] ALIGN_PTR = { "0", "1", "permissive", diff --git a/shell/hush.c b/shell/hush.c index 6dc2ecaac..ae81f0da5 100644 --- a/shell/hush.c +++ b/shell/hush.c @@ -564,7 +564,7 @@ enum { #define NULL_O_STRING { NULL } #ifndef debug_printf_parse -static const char *const assignment_flag[] = { +static const char *const assignment_flag[] ALIGN_PTR = { "MAYBE_ASSIGNMENT", "DEFINITELY_ASSIGNMENT", "NOT_ASSIGNMENT", @@ -3682,7 +3682,7 @@ static void free_pipe_list(struct pipe *pi) #ifndef debug_print_tree static void debug_print_tree(struct pipe *pi, int lvl) { - static const char *const PIPE[] = { + static const char *const PIPE[] ALIGN_PTR = { [PIPE_SEQ] = "SEQ", [PIPE_AND] = "AND", [PIPE_OR ] = "OR" , @@ -3717,7 +3717,7 @@ static void debug_print_tree(struct pipe *pi, int lvl) [RES_XXXX ] = "XXXX" , [RES_SNTX ] = "SNTX" , }; - static const char *const CMDTYPE[] = { + static const char *const CMDTYPE[] ALIGN_PTR = { "{}", "()", "[noglob]", @@ -7659,7 +7659,7 @@ static int generate_stream_from_string(const char *s, pid_t *pid_p) if (is_prefixed_with(s, "trap") && skip_whitespace(s + 4)[0] == '\0' ) { - static const char *const argv[] = { NULL, NULL }; + static const char *const argv[] ALIGN_PTR = { NULL, NULL }; builtin_trap((char**)argv); fflush_all(); /* important */ _exit(0); @@ -9826,7 +9826,7 @@ static int run_list(struct pipe *pi) static const char encoded_dollar_at[] ALIGN1 = { SPECIAL_VAR_SYMBOL, '@' | 0x80, SPECIAL_VAR_SYMBOL, '\0' }; /* encoded representation of "$@" */ - static const char *const encoded_dollar_at_argv[] = { + static const char *const encoded_dollar_at_argv[] ALIGN_PTR = { encoded_dollar_at, NULL }; /* argv list with one element: "$@" */ char **vals; -- cgit v1.2.3-55-g6feb From c193cbd6dfd095c6b8346bab1ea6ba7106b3e5bb Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Mon, 7 Feb 2022 02:06:18 +0100 Subject: libbb/sha1: shrink and speed up unrolled x86-64 code function old new delta sha1_process_block64 3514 3482 -32 Signed-off-by: Denys Vlasenko --- 
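The change itself, repeated in every PREP block of the unrolled SHA1 code below, replaces the pair "pshufd $0x4e + punpcklqdq" with "movaps + shufps $0x4e", which computes the same ([2],[3],[4],[5]) word group. The following standalone C sketch is my own illustration, not part of the patch: the variable names and the mapping onto SSE2 intrinsics are mine, and it only checks that the two sequences agree. By my count of the legacy-SSE encodings, the new pair is also two bytes shorter per block, which over the sixteen PREP blocks matches the reported -32.

/* Illustrative check: both PREP sequences yield W[2],W[3],W[4],W[5]. */
#include <stdio.h>
#include <string.h>
#include <emmintrin.h>	/* _mm_shuffle_epi32 (pshufd), _mm_unpacklo_epi64 (punpcklqdq) */
#include <xmmintrin.h>	/* _mm_shuffle_ps (shufps) */

int main(void)
{
	__m128i w0_3 = _mm_setr_epi32(0, 1, 2, 3);	/* W[0..3] */
	__m128i w4_7 = _mm_setr_epi32(4, 5, 6, 7);	/* W[4..7] */

	/* old:  pshufd $0x4e, %xmm0, %xmm5;  punpcklqdq %xmm1, %xmm5 */
	__m128i t_old = _mm_unpacklo_epi64(_mm_shuffle_epi32(w0_3, 0x4e), w4_7);

	/* new:  movaps %xmm0, %xmm5;  shufps $0x4e, %xmm1, %xmm5 */
	__m128 t_new = _mm_shuffle_ps(_mm_castsi128_ps(w0_3), _mm_castsi128_ps(w4_7), 0x4e);

	int a[4], b[4];
	memcpy(a, &t_old, sizeof(a));
	memcpy(b, &t_new, sizeof(b));
	printf("old: %d %d %d %d\n", a[0], a[1], a[2], a[3]);	/* 2 3 4 5 */
	printf("new: %d %d %d %d\n", b[0], b[1], b[2], b[3]);	/* 2 3 4 5 */
	return 0;
}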
libbb/hash_md5_sha256_x86-32_shaNI.S | 8 +- libbb/hash_md5_sha256_x86-64_shaNI.S | 8 +- libbb/hash_md5_sha_x86-32_shaNI.S | 4 +- libbb/hash_md5_sha_x86-64.S | 144 +++++++++++++++++++++++++++-------- libbb/hash_md5_sha_x86-64.S.sh | 9 ++- libbb/hash_md5_sha_x86-64_shaNI.S | 4 +- 6 files changed, 131 insertions(+), 46 deletions(-) diff --git a/libbb/hash_md5_sha256_x86-32_shaNI.S b/libbb/hash_md5_sha256_x86-32_shaNI.S index 417da37d8..39e2baf41 100644 --- a/libbb/hash_md5_sha256_x86-32_shaNI.S +++ b/libbb/hash_md5_sha256_x86-32_shaNI.S @@ -257,8 +257,8 @@ sha256_process_block64_shaNI: ret .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI -.section .rodata.cst256.K256, "aM", @progbits, 256 -.balign 16 + .section .rodata.cst256.K256, "aM", @progbits, 256 + .balign 16 K256: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 @@ -277,8 +277,8 @@ K256: .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 -.section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16 -.balign 16 + .section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16 + .balign 16 PSHUFFLE_BSWAP32_FLIP_MASK: .octa 0x0c0d0e0f08090a0b0405060700010203 diff --git a/libbb/hash_md5_sha256_x86-64_shaNI.S b/libbb/hash_md5_sha256_x86-64_shaNI.S index dbf391135..c6c931341 100644 --- a/libbb/hash_md5_sha256_x86-64_shaNI.S +++ b/libbb/hash_md5_sha256_x86-64_shaNI.S @@ -253,8 +253,8 @@ sha256_process_block64_shaNI: ret .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI -.section .rodata.cst256.K256, "aM", @progbits, 256 -.balign 16 + .section .rodata.cst256.K256, "aM", @progbits, 256 + .balign 16 K256: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 @@ -273,8 +273,8 @@ K256: .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 -.section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16 -.balign 16 + .section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16 + .balign 16 PSHUFFLE_BSWAP32_FLIP_MASK: .octa 0x0c0d0e0f08090a0b0405060700010203 diff --git a/libbb/hash_md5_sha_x86-32_shaNI.S b/libbb/hash_md5_sha_x86-32_shaNI.S index 11b855e26..5d082ebfb 100644 --- a/libbb/hash_md5_sha_x86-32_shaNI.S +++ b/libbb/hash_md5_sha_x86-32_shaNI.S @@ -223,8 +223,8 @@ sha1_process_block64_shaNI: ret .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI -.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 -.balign 16 + .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 + .balign 16 PSHUFFLE_BYTE_FLIP_MASK: .octa 0x000102030405060708090a0b0c0d0e0f diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S index 47ace60de..e26c46f25 100644 --- a/libbb/hash_md5_sha_x86-64.S +++ b/libbb/hash_md5_sha_x86-64.S @@ -180,8 +180,13 @@ sha1_process_block64: # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) movaps %xmm3, %xmm4 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) - pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) - punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and 
dwords 2,3 from 1st one! + movaps %xmm0, %xmm5 + shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) xorps %xmm5, %xmm0 # ^ @@ -252,8 +257,13 @@ sha1_process_block64: # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) movaps %xmm0, %xmm4 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) - pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) - punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! + movaps %xmm1, %xmm5 + shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) xorps %xmm5, %xmm1 # ^ @@ -323,8 +333,13 @@ sha1_process_block64: # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) movaps %xmm1, %xmm4 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) - pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) - punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! + movaps %xmm2, %xmm5 + shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) xorps %xmm5, %xmm2 # ^ @@ -392,8 +407,13 @@ sha1_process_block64: # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) movaps %xmm2, %xmm4 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) - pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) - punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm3, %xmm5 + shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) xorps %xmm5, %xmm3 # ^ @@ -457,8 +477,13 @@ sha1_process_block64: # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) movaps %xmm3, %xmm4 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) - pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) - punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! + movaps %xmm0, %xmm5 + shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) xorps %xmm5, %xmm0 # ^ @@ -522,8 +547,13 @@ sha1_process_block64: # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) movaps %xmm0, %xmm4 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) - pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) - punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! + movaps %xmm1, %xmm5 + shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) xorps %xmm5, %xmm1 # ^ @@ -588,8 +618,13 @@ sha1_process_block64: # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) movaps %xmm1, %xmm4 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) - pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) - punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm2, %xmm5 + shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) xorps %xmm5, %xmm2 # ^ @@ -653,8 +688,13 @@ sha1_process_block64: # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) movaps %xmm2, %xmm4 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) - pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) - punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! + movaps %xmm3, %xmm5 + shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) xorps %xmm5, %xmm3 # ^ @@ -718,8 +758,13 @@ sha1_process_block64: # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) movaps %xmm3, %xmm4 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) - pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) - punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! + movaps %xmm0, %xmm5 + shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) xorps %xmm5, %xmm0 # ^ @@ -795,8 +840,13 @@ sha1_process_block64: # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) movaps %xmm0, %xmm4 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) - pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) - punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm1, %xmm5 + shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) xorps %xmm5, %xmm1 # ^ @@ -872,8 +922,13 @@ sha1_process_block64: # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) movaps %xmm1, %xmm4 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) - pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) - punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! + movaps %xmm2, %xmm5 + shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) xorps %xmm5, %xmm2 # ^ @@ -950,8 +1005,13 @@ sha1_process_block64: # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) movaps %xmm2, %xmm4 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) - pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) - punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! + movaps %xmm3, %xmm5 + shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) xorps %xmm5, %xmm3 # ^ @@ -1027,8 +1087,13 @@ sha1_process_block64: # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) movaps %xmm3, %xmm4 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) - pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) - punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm0, %xmm5 + shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) xorps %xmm5, %xmm0 # ^ @@ -1104,8 +1169,13 @@ sha1_process_block64: # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) movaps %xmm0, %xmm4 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) - pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) - punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! + movaps %xmm1, %xmm5 + shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) xorps %xmm5, %xmm1 # ^ @@ -1169,8 +1239,13 @@ sha1_process_block64: # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) movaps %xmm1, %xmm4 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) - pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) - punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! + movaps %xmm2, %xmm5 + shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) xorps %xmm5, %xmm2 # ^ @@ -1234,8 +1309,13 @@ sha1_process_block64: # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) movaps %xmm2, %xmm4 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) - pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) - punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm3, %xmm5 + shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) xorps %xmm5, %xmm3 # ^ diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh index 656fb5414..fb1e4b57e 100755 --- a/libbb/hash_md5_sha_x86-64.S.sh +++ b/libbb/hash_md5_sha_x86-64.S.sh @@ -203,8 +203,13 @@ echo "# PREP $@ movaps $xmmW12, $xmmT1 psrldq \$4, $xmmT1 # rshift by 4 bytes: T1 = ([13],[14],[15],0) - pshufd \$0x4e, $xmmW0, $xmmT2 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) - punpcklqdq $xmmW4, $xmmT2 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# pshufd \$0x4e, $xmmW0, $xmmT2 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq $xmmW4, $xmmT2 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! + movaps $xmmW0, $xmmT2 + shufps \$0x4e, $xmmW4, $xmmT2 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) xorps $xmmW8, $xmmW0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) xorps $xmmT1, $xmmT2 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) diff --git a/libbb/hash_md5_sha_x86-64_shaNI.S b/libbb/hash_md5_sha_x86-64_shaNI.S index ba92f09df..8ddec87ce 100644 --- a/libbb/hash_md5_sha_x86-64_shaNI.S +++ b/libbb/hash_md5_sha_x86-64_shaNI.S @@ -217,8 +217,8 @@ sha1_process_block64_shaNI: ret .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI -.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 -.balign 16 + .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 + .balign 16 PSHUFFLE_BYTE_FLIP_MASK: .octa 0x000102030405060708090a0b0c0d0e0f -- cgit v1.2.3-55-g6feb From 4923f74e5873b25b8205a4059964cff75ee731a8 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 8 Feb 2022 03:29:16 +0100 Subject: libbb/sha1: shrink unrolled x86-64 code function old new delta sha1_process_block64 3482 3481 -1 .rodata 108460 108412 -48 ------------------------------------------------------------------------------ (add/remove: 1/4 grow/shrink: 0/2 up/down: 0/-49) Total: -49 bytes Signed-off-by: Denys Vlasenko --- libbb/hash_md5_sha_x86-64.S | 33 ++++++++++----------------------- libbb/hash_md5_sha_x86-64.S.sh | 34 +++++++++++----------------------- 2 files changed, 21 insertions(+), 46 deletions(-) diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S index e26c46f25..287cfe547 100644 --- a/libbb/hash_md5_sha_x86-64.S +++ b/libbb/hash_md5_sha_x86-64.S @@ -24,6 +24,7 @@ sha1_process_block64: # xmm0..xmm3: W[] # xmm4,xmm5: temps # xmm6: current round constant +# xmm7: all round constants # -64(%rsp): area for passing RCONST + W[] from vector to integer units movl 80(%rdi), %eax # a = ctx->hash[0] @@ -32,16 +33,17 @@ sha1_process_block64: movl 92(%rdi), %edx # d = ctx->hash[3] movl 96(%rdi), %ebp # e = ctx->hash[4] - movaps rconst0x5A827999(%rip), %xmm6 + movaps sha1const(%rip), %xmm7 + pshufd $0x00, %xmm7, %xmm6 # Load W[] to xmm registers, byteswapping on the fly. # # For iterations 0..15, we pass W[] in rsi,r8..r14 - # for use in RD1A's instead of spilling them to stack. + # for use in RD1As instead of spilling them to stack. # We lose parallelized addition of RCONST, but LEA - # can do two additions at once, so it's probably a wash. 
+ # can do two additions at once, so it is probably a wash. # (We use rsi instead of rN because this makes two - # LEAs in two first RD1A's shorter by one byte). + # LEAs in two first RD1As shorter by one byte). movq 4*0(%rdi), %rsi movq 4*2(%rdi), %r8 bswapq %rsi @@ -253,7 +255,7 @@ sha1_process_block64: roll $5, %edi # rotl32(a,5) addl %edi, %edx # e += rotl32(a,5) rorl $2, %eax # b = rotl32(b,30) - movaps rconst0x6ED9EBA1(%rip), %xmm6 + pshufd $0x55, %xmm7, %xmm6 # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) movaps %xmm0, %xmm4 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) @@ -614,7 +616,7 @@ sha1_process_block64: roll $5, %esi # rotl32(a,5) addl %esi, %edx # e += rotl32(a,5) rorl $2, %eax # b = rotl32(b,30) - movaps rconst0x8F1BBCDC(%rip), %xmm6 + pshufd $0xaa, %xmm7, %xmm6 # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) movaps %xmm1, %xmm4 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) @@ -1001,7 +1003,7 @@ sha1_process_block64: roll $5, %esi # rotl32(a,5) addl %esi, %edx # e += rotl32(a,5) rorl $2, %eax # b = rotl32(b,30) - movaps rconst0xCA62C1D6(%rip), %xmm6 + pshufd $0xff, %xmm7, %xmm6 # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) movaps %xmm2, %xmm4 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) @@ -1475,25 +1477,10 @@ sha1_process_block64: .section .rodata.cst16.sha1const, "aM", @progbits, 16 .balign 16 -rconst0x5A827999: +sha1const: .long 0x5A827999 - .long 0x5A827999 - .long 0x5A827999 - .long 0x5A827999 -rconst0x6ED9EBA1: - .long 0x6ED9EBA1 - .long 0x6ED9EBA1 - .long 0x6ED9EBA1 .long 0x6ED9EBA1 -rconst0x8F1BBCDC: .long 0x8F1BBCDC - .long 0x8F1BBCDC - .long 0x8F1BBCDC - .long 0x8F1BBCDC -rconst0xCA62C1D6: - .long 0xCA62C1D6 - .long 0xCA62C1D6 - .long 0xCA62C1D6 .long 0xCA62C1D6 #endif diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh index fb1e4b57e..a10ac411d 100755 --- a/libbb/hash_md5_sha_x86-64.S.sh +++ b/libbb/hash_md5_sha_x86-64.S.sh @@ -34,6 +34,7 @@ exec >hash_md5_sha_x86-64.S xmmT1="%xmm4" xmmT2="%xmm5" xmmRCONST="%xmm6" +xmmALLRCONST="%xmm7" T=`printf '\t'` # SSE instructions are longer than 4 bytes on average. @@ -125,6 +126,7 @@ sha1_process_block64: # xmm0..xmm3: W[] # xmm4,xmm5: temps # xmm6: current round constant +# xmm7: all round constants # -64(%rsp): area for passing RCONST + W[] from vector to integer units movl 80(%rdi), %eax # a = ctx->hash[0] @@ -133,16 +135,17 @@ sha1_process_block64: movl 92(%rdi), %edx # d = ctx->hash[3] movl 96(%rdi), %ebp # e = ctx->hash[4] - movaps rconst0x5A827999(%rip), $xmmRCONST + movaps sha1const(%rip), $xmmALLRCONST + pshufd \$0x00, $xmmALLRCONST, $xmmRCONST # Load W[] to xmm registers, byteswapping on the fly. # # For iterations 0..15, we pass W[] in rsi,r8..r14 - # for use in RD1A's instead of spilling them to stack. + # for use in RD1As instead of spilling them to stack. # We lose parallelized addition of RCONST, but LEA - # can do two additions at once, so it's probably a wash. + # can do two additions at once, so it is probably a wash. # (We use rsi instead of rN because this makes two - # LEAs in two first RD1A's shorter by one byte). + # LEAs in two first RD1As shorter by one byte). 
movq 4*0(%rdi), %rsi movq 4*2(%rdi), %r8 bswapq %rsi @@ -359,7 +362,7 @@ RD1A bx cx dx bp ax 4; RD1A ax bx cx dx bp 5; RD1A bp ax bx cx dx 6; RD1A dx a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` b=`RD1A cx dx bp ax bx 8; RD1A bx cx dx bp ax 9; RD1A ax bx cx dx bp 10; RD1A bp ax bx cx dx 11;` INTERLEAVE "$a" "$b" -a=`echo " movaps rconst0x6ED9EBA1(%rip), $xmmRCONST" +a=`echo " pshufd \\$0x55, $xmmALLRCONST, $xmmRCONST" PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` b=`RD1A dx bp ax bx cx 12; RD1A cx dx bp ax bx 13; RD1A bx cx dx bp ax 14; RD1A ax bx cx dx bp 15;` INTERLEAVE "$a" "$b" @@ -378,7 +381,7 @@ INTERLEAVE "$a" "$b" a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` b=`RD2 cx dx bp ax bx 28; RD2 bx cx dx bp ax 29; RD2 ax bx cx dx bp 30; RD2 bp ax bx cx dx 31;` INTERLEAVE "$a" "$b" -a=`echo " movaps rconst0x8F1BBCDC(%rip), $xmmRCONST" +a=`echo " pshufd \\$0xaa, $xmmALLRCONST, $xmmRCONST" PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` b=`RD2 dx bp ax bx cx 32; RD2 cx dx bp ax bx 33; RD2 bx cx dx bp ax 34; RD2 ax bx cx dx bp 35;` INTERLEAVE "$a" "$b" @@ -397,7 +400,7 @@ INTERLEAVE "$a" "$b" a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` b=`RD3 cx dx bp ax bx 48; RD3 bx cx dx bp ax 49; RD3 ax bx cx dx bp 50; RD3 bp ax bx cx dx 51;` INTERLEAVE "$a" "$b" -a=`echo " movaps rconst0xCA62C1D6(%rip), $xmmRCONST" +a=`echo " pshufd \\$0xff, $xmmALLRCONST, $xmmRCONST" PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` b=`RD3 dx bp ax bx cx 52; RD3 cx dx bp ax bx 53; RD3 bx cx dx bp ax 54; RD3 ax bx cx dx bp 55;` INTERLEAVE "$a" "$b" @@ -439,25 +442,10 @@ echo " .section .rodata.cst16.sha1const, \"aM\", @progbits, 16 .balign 16 -rconst0x5A827999: +sha1const: .long 0x5A827999 - .long 0x5A827999 - .long 0x5A827999 - .long 0x5A827999 -rconst0x6ED9EBA1: - .long 0x6ED9EBA1 - .long 0x6ED9EBA1 - .long 0x6ED9EBA1 .long 0x6ED9EBA1 -rconst0x8F1BBCDC: .long 0x8F1BBCDC - .long 0x8F1BBCDC - .long 0x8F1BBCDC - .long 0x8F1BBCDC -rconst0xCA62C1D6: - .long 0xCA62C1D6 - .long 0xCA62C1D6 - .long 0xCA62C1D6 .long 0xCA62C1D6 #endif" -- cgit v1.2.3-55-g6feb From 71a1cccaad679bd102f87283f78c581a8fb0e255 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 8 Feb 2022 08:20:27 +0100 Subject: libbb/sha1: shrink x86 hardware accelerated hashing function old new delta sha1_process_block64_shaNI 32-bit 524 517 -7 sha1_process_block64_shaNI 64-bit 510 508 -2 Signed-off-by: Denys Vlasenko --- libbb/hash_md5_sha_x86-32_shaNI.S | 37 +++++++++++++++++-------------------- libbb/hash_md5_sha_x86-64_shaNI.S | 24 ++++++++++++------------ 2 files changed, 29 insertions(+), 32 deletions(-) diff --git a/libbb/hash_md5_sha_x86-32_shaNI.S b/libbb/hash_md5_sha_x86-32_shaNI.S index 5d082ebfb..0f3fe57ca 100644 --- a/libbb/hash_md5_sha_x86-32_shaNI.S +++ b/libbb/hash_md5_sha_x86-32_shaNI.S @@ -32,14 +32,10 @@ #define MSG1 %xmm4 #define MSG2 %xmm5 #define MSG3 %xmm6 -#define SHUF_MASK %xmm7 - .balign 8 # allow decoders to fetch at least 3 first insns + .balign 8 # allow decoders to fetch at least 2 first insns sha1_process_block64_shaNI: - pushl %ebp - movl %esp, %ebp - subl $32, %esp - andl $~0xF, %esp # paddd needs aligned memory operand + subl $16, %esp /* load initial hash values */ xor128 E0, E0 @@ -47,30 +43,33 @@ sha1_process_block64_shaNI: pinsrd $3, 76+4*4(%eax), E0 # load to uppermost 32-bit word shuf128_32 $0x1B, ABCD, ABCD # DCBA -> ABCD - mova128 PSHUFFLE_BYTE_FLIP_MASK, SHUF_MASK + mova128 PSHUFFLE_BYTE_FLIP_MASK, %xmm7 + + movu128 0*16(%eax), MSG0 + pshufb %xmm7, MSG0 + movu128 1*16(%eax), MSG1 + pshufb %xmm7, MSG1 + 
movu128 2*16(%eax), MSG2 + pshufb %xmm7, MSG2 + movu128 3*16(%eax), MSG3 + pshufb %xmm7, MSG3 /* Save hash values for addition after rounds */ - movu128 E0, 16(%esp) + movu128 E0, %xmm7 movu128 ABCD, (%esp) /* Rounds 0-3 */ - movu128 0*16(%eax), MSG0 - pshufb SHUF_MASK, MSG0 paddd MSG0, E0 mova128 ABCD, E1 sha1rnds4 $0, E0, ABCD /* Rounds 4-7 */ - movu128 1*16(%eax), MSG1 - pshufb SHUF_MASK, MSG1 sha1nexte MSG1, E1 mova128 ABCD, E0 sha1rnds4 $0, E1, ABCD sha1msg1 MSG1, MSG0 /* Rounds 8-11 */ - movu128 2*16(%eax), MSG2 - pshufb SHUF_MASK, MSG2 sha1nexte MSG2, E0 mova128 ABCD, E1 sha1rnds4 $0, E0, ABCD @@ -78,8 +77,6 @@ sha1_process_block64_shaNI: xor128 MSG2, MSG0 /* Rounds 12-15 */ - movu128 3*16(%eax), MSG3 - pshufb SHUF_MASK, MSG3 sha1nexte MSG3, E1 mova128 ABCD, E0 sha1msg2 MSG3, MSG0 @@ -210,16 +207,16 @@ sha1_process_block64_shaNI: sha1rnds4 $3, E1, ABCD /* Add current hash values with previously saved */ - sha1nexte 16(%esp), E0 - paddd (%esp), ABCD + sha1nexte %xmm7, E0 + movu128 (%esp), %xmm7 + paddd %xmm7, ABCD /* Write hash values back in the correct order */ shuf128_32 $0x1B, ABCD, ABCD movu128 ABCD, 76(%eax) extr128_32 $3, E0, 76+4*4(%eax) - movl %ebp, %esp - popl %ebp + addl $16, %esp ret .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI diff --git a/libbb/hash_md5_sha_x86-64_shaNI.S b/libbb/hash_md5_sha_x86-64_shaNI.S index 8ddec87ce..fc2ca92e8 100644 --- a/libbb/hash_md5_sha_x86-64_shaNI.S +++ b/libbb/hash_md5_sha_x86-64_shaNI.S @@ -32,7 +32,6 @@ #define MSG1 %xmm4 #define MSG2 %xmm5 #define MSG3 %xmm6 -#define SHUF_MASK %xmm7 .balign 8 # allow decoders to fetch at least 2 first insns sha1_process_block64_shaNI: @@ -43,30 +42,33 @@ sha1_process_block64_shaNI: pinsrd $3, 80+4*4(%rdi), E0 # load to uppermost 32-bit word shuf128_32 $0x1B, ABCD, ABCD # DCBA -> ABCD - mova128 PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK + mova128 PSHUFFLE_BYTE_FLIP_MASK(%rip), %xmm7 + + movu128 0*16(%rdi), MSG0 + pshufb %xmm7, MSG0 + movu128 1*16(%rdi), MSG1 + pshufb %xmm7, MSG1 + movu128 2*16(%rdi), MSG2 + pshufb %xmm7, MSG2 + movu128 3*16(%rdi), MSG3 + pshufb %xmm7, MSG3 /* Save hash values for addition after rounds */ - mova128 E0, %xmm9 + mova128 E0, %xmm7 mova128 ABCD, %xmm8 /* Rounds 0-3 */ - movu128 0*16(%rdi), MSG0 - pshufb SHUF_MASK, MSG0 paddd MSG0, E0 mova128 ABCD, E1 sha1rnds4 $0, E0, ABCD /* Rounds 4-7 */ - movu128 1*16(%rdi), MSG1 - pshufb SHUF_MASK, MSG1 sha1nexte MSG1, E1 mova128 ABCD, E0 sha1rnds4 $0, E1, ABCD sha1msg1 MSG1, MSG0 /* Rounds 8-11 */ - movu128 2*16(%rdi), MSG2 - pshufb SHUF_MASK, MSG2 sha1nexte MSG2, E0 mova128 ABCD, E1 sha1rnds4 $0, E0, ABCD @@ -74,8 +76,6 @@ sha1_process_block64_shaNI: xor128 MSG2, MSG0 /* Rounds 12-15 */ - movu128 3*16(%rdi), MSG3 - pshufb SHUF_MASK, MSG3 sha1nexte MSG3, E1 mova128 ABCD, E0 sha1msg2 MSG3, MSG0 @@ -206,7 +206,7 @@ sha1_process_block64_shaNI: sha1rnds4 $3, E1, ABCD /* Add current hash values with previously saved */ - sha1nexte %xmm9, E0 + sha1nexte %xmm7, E0 paddd %xmm8, ABCD /* Write hash values back in the correct order */ -- cgit v1.2.3-55-g6feb From eb52e7fa522d829fb400461ca4c808ee5c1d6428 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 8 Feb 2022 15:23:26 +0100 Subject: libbb/sha1: shrink x86 hardware accelerated hashing (32-bit) function old new delta sha1_process_block64_shaNI 517 511 -6 Signed-off-by: Denys Vlasenko --- libbb/hash_md5_sha_x86-32_shaNI.S | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/libbb/hash_md5_sha_x86-32_shaNI.S b/libbb/hash_md5_sha_x86-32_shaNI.S index 
0f3fe57ca..ad814a21b 100644 --- a/libbb/hash_md5_sha_x86-32_shaNI.S +++ b/libbb/hash_md5_sha_x86-32_shaNI.S @@ -35,11 +35,9 @@ .balign 8 # allow decoders to fetch at least 2 first insns sha1_process_block64_shaNI: - subl $16, %esp - /* load initial hash values */ - xor128 E0, E0 movu128 76(%eax), ABCD + xor128 E0, E0 pinsrd $3, 76+4*4(%eax), E0 # load to uppermost 32-bit word shuf128_32 $0x1B, ABCD, ABCD # DCBA -> ABCD @@ -56,7 +54,7 @@ sha1_process_block64_shaNI: /* Save hash values for addition after rounds */ movu128 E0, %xmm7 - movu128 ABCD, (%esp) + /*movu128 ABCD, %xmm8 - NOPE, 32bit has no xmm8 */ /* Rounds 0-3 */ paddd MSG0, E0 @@ -208,7 +206,9 @@ sha1_process_block64_shaNI: /* Add current hash values with previously saved */ sha1nexte %xmm7, E0 - movu128 (%esp), %xmm7 + /*paddd %xmm8, ABCD - 32-bit mode has no xmm8 */ + movu128 76(%eax), %xmm7 # recreate original ABCD + shuf128_32 $0x1B, %xmm7, %xmm7 # DCBA -> ABCD paddd %xmm7, ABCD /* Write hash values back in the correct order */ @@ -216,7 +216,6 @@ sha1_process_block64_shaNI: movu128 ABCD, 76(%eax) extr128_32 $3, E0, 76+4*4(%eax) - addl $16, %esp ret .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI -- cgit v1.2.3-55-g6feb From eb8d5f3b8f3c91f3ed82a52b4ce52a154c146ede Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 8 Feb 2022 15:34:02 +0100 Subject: libbb/sha1: shrink x86 hardware accelerated hashing (32-bit) function old new delta sha1_process_block64_shaNI 511 507 -4 Signed-off-by: Denys Vlasenko --- libbb/hash_md5_sha_x86-32_shaNI.S | 9 ++++----- libbb/hash_md5_sha_x86-64_shaNI.S | 3 +-- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/libbb/hash_md5_sha_x86-32_shaNI.S b/libbb/hash_md5_sha_x86-32_shaNI.S index ad814a21b..a61b3cbed 100644 --- a/libbb/hash_md5_sha_x86-32_shaNI.S +++ b/libbb/hash_md5_sha_x86-32_shaNI.S @@ -53,8 +53,8 @@ sha1_process_block64_shaNI: pshufb %xmm7, MSG3 /* Save hash values for addition after rounds */ - movu128 E0, %xmm7 - /*movu128 ABCD, %xmm8 - NOPE, 32bit has no xmm8 */ + mova128 E0, %xmm7 + /*mova128 ABCD, %xmm8 - NOPE, 32bit has no xmm8 */ /* Rounds 0-3 */ paddd MSG0, E0 @@ -207,12 +207,11 @@ sha1_process_block64_shaNI: /* Add current hash values with previously saved */ sha1nexte %xmm7, E0 /*paddd %xmm8, ABCD - 32-bit mode has no xmm8 */ - movu128 76(%eax), %xmm7 # recreate original ABCD - shuf128_32 $0x1B, %xmm7, %xmm7 # DCBA -> ABCD - paddd %xmm7, ABCD + movu128 76(%eax), %xmm7 # get original ABCD (not shuffled)... 
/* Write hash values back in the correct order */ shuf128_32 $0x1B, ABCD, ABCD + paddd %xmm7, ABCD # ...add it to final ABCD movu128 ABCD, 76(%eax) extr128_32 $3, E0, 76+4*4(%eax) diff --git a/libbb/hash_md5_sha_x86-64_shaNI.S b/libbb/hash_md5_sha_x86-64_shaNI.S index fc2ca92e8..b32029360 100644 --- a/libbb/hash_md5_sha_x86-64_shaNI.S +++ b/libbb/hash_md5_sha_x86-64_shaNI.S @@ -36,9 +36,8 @@ .balign 8 # allow decoders to fetch at least 2 first insns sha1_process_block64_shaNI: /* load initial hash values */ - - xor128 E0, E0 movu128 80(%rdi), ABCD + xor128 E0, E0 pinsrd $3, 80+4*4(%rdi), E0 # load to uppermost 32-bit word shuf128_32 $0x1B, ABCD, ABCD # DCBA -> ABCD -- cgit v1.2.3-55-g6feb From c0ff0d4528d718c20b9ca2290bd10d59e9f794a3 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Wed, 9 Feb 2022 00:33:39 +0100 Subject: libbb/sha256: code shrink in 32-bit x86 function old new delta sha256_process_block64_shaNI 713 697 -16 Signed-off-by: Denys Vlasenko --- libbb/hash_md5_sha256_x86-32_shaNI.S | 130 ++++++++++++++++------------------- libbb/hash_md5_sha256_x86-64_shaNI.S | 107 ++++++++++++++-------------- 2 files changed, 114 insertions(+), 123 deletions(-) diff --git a/libbb/hash_md5_sha256_x86-32_shaNI.S b/libbb/hash_md5_sha256_x86-32_shaNI.S index 39e2baf41..a849dfcc2 100644 --- a/libbb/hash_md5_sha256_x86-32_shaNI.S +++ b/libbb/hash_md5_sha256_x86-32_shaNI.S @@ -31,35 +31,27 @@ #define MSGTMP1 %xmm4 #define MSGTMP2 %xmm5 #define MSGTMP3 %xmm6 -#define XMMTMP4 %xmm7 - .balign 8 # allow decoders to fetch at least 3 first insns -sha256_process_block64_shaNI: - pushl %ebp - movl %esp, %ebp - subl $32, %esp - andl $~0xF, %esp # paddd needs aligned memory operand +#define XMMTMP %xmm7 + .balign 8 # allow decoders to fetch at least 2 first insns +sha256_process_block64_shaNI: movu128 76+0*16(%eax), STATE0 movu128 76+1*16(%eax), STATE1 - shuf128_32 $0xB1, STATE0, STATE0 /* CDAB */ - shuf128_32 $0x1B, STATE1, STATE1 /* EFGH */ - mova128 STATE0, XMMTMP4 - palignr $8, STATE1, STATE0 /* ABEF */ - pblendw $0xF0, XMMTMP4, STATE1 /* CDGH */ + shuf128_32 $0xB1, STATE0, STATE0 /* CDAB */ + shuf128_32 $0x1B, STATE1, STATE1 /* EFGH */ + mova128 STATE0, XMMTMP + palignr $8, STATE1, STATE0 /* ABEF */ + pblendw $0xF0, XMMTMP, STATE1 /* CDGH */ -/* XMMTMP4 holds flip mask from here... */ - mova128 PSHUFFLE_BSWAP32_FLIP_MASK, XMMTMP4 +/* XMMTMP holds flip mask from here... 
*/ + mova128 PSHUFFLE_BSWAP32_FLIP_MASK, XMMTMP movl $K256+8*16, SHA256CONSTANTS - /* Save hash values for addition after rounds */ - mova128 STATE0, 0*16(%esp) - mova128 STATE1, 1*16(%esp) - /* Rounds 0-3 */ movu128 0*16(DATA_PTR), MSG - pshufb XMMTMP4, MSG + pshufb XMMTMP, MSG mova128 MSG, MSGTMP0 paddd 0*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 @@ -68,7 +60,7 @@ sha256_process_block64_shaNI: /* Rounds 4-7 */ movu128 1*16(DATA_PTR), MSG - pshufb XMMTMP4, MSG + pshufb XMMTMP, MSG mova128 MSG, MSGTMP1 paddd 1*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 @@ -78,7 +70,7 @@ sha256_process_block64_shaNI: /* Rounds 8-11 */ movu128 2*16(DATA_PTR), MSG - pshufb XMMTMP4, MSG + pshufb XMMTMP, MSG mova128 MSG, MSGTMP2 paddd 2*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 @@ -88,14 +80,14 @@ sha256_process_block64_shaNI: /* Rounds 12-15 */ movu128 3*16(DATA_PTR), MSG - pshufb XMMTMP4, MSG + pshufb XMMTMP, MSG /* ...to here */ mova128 MSG, MSGTMP3 paddd 3*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP3, XMMTMP4 - palignr $4, MSGTMP2, XMMTMP4 - paddd XMMTMP4, MSGTMP0 + mova128 MSGTMP3, XMMTMP + palignr $4, MSGTMP2, XMMTMP + paddd XMMTMP, MSGTMP0 sha256msg2 MSGTMP3, MSGTMP0 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -105,9 +97,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP0, MSG paddd 4*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP0, XMMTMP4 - palignr $4, MSGTMP3, XMMTMP4 - paddd XMMTMP4, MSGTMP1 + mova128 MSGTMP0, XMMTMP + palignr $4, MSGTMP3, XMMTMP + paddd XMMTMP, MSGTMP1 sha256msg2 MSGTMP0, MSGTMP1 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -117,9 +109,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP1, MSG paddd 5*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP1, XMMTMP4 - palignr $4, MSGTMP0, XMMTMP4 - paddd XMMTMP4, MSGTMP2 + mova128 MSGTMP1, XMMTMP + palignr $4, MSGTMP0, XMMTMP + paddd XMMTMP, MSGTMP2 sha256msg2 MSGTMP1, MSGTMP2 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -129,9 +121,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP2, MSG paddd 6*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP2, XMMTMP4 - palignr $4, MSGTMP1, XMMTMP4 - paddd XMMTMP4, MSGTMP3 + mova128 MSGTMP2, XMMTMP + palignr $4, MSGTMP1, XMMTMP + paddd XMMTMP, MSGTMP3 sha256msg2 MSGTMP2, MSGTMP3 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -141,9 +133,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP3, MSG paddd 7*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP3, XMMTMP4 - palignr $4, MSGTMP2, XMMTMP4 - paddd XMMTMP4, MSGTMP0 + mova128 MSGTMP3, XMMTMP + palignr $4, MSGTMP2, XMMTMP + paddd XMMTMP, MSGTMP0 sha256msg2 MSGTMP3, MSGTMP0 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -153,9 +145,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP0, MSG paddd 8*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP0, XMMTMP4 - palignr $4, MSGTMP3, XMMTMP4 - paddd XMMTMP4, MSGTMP1 + mova128 MSGTMP0, XMMTMP + palignr $4, MSGTMP3, XMMTMP + paddd XMMTMP, MSGTMP1 sha256msg2 MSGTMP0, MSGTMP1 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -165,9 +157,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP1, MSG paddd 9*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP1, XMMTMP4 - palignr $4, MSGTMP0, XMMTMP4 - paddd XMMTMP4, MSGTMP2 + mova128 MSGTMP1, XMMTMP + palignr $4, MSGTMP0, XMMTMP + paddd XMMTMP, MSGTMP2 sha256msg2 MSGTMP1, MSGTMP2 shuf128_32 $0x0E, MSG, 
MSG sha256rnds2 STATE1, STATE0 @@ -177,9 +169,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP2, MSG paddd 10*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP2, XMMTMP4 - palignr $4, MSGTMP1, XMMTMP4 - paddd XMMTMP4, MSGTMP3 + mova128 MSGTMP2, XMMTMP + palignr $4, MSGTMP1, XMMTMP + paddd XMMTMP, MSGTMP3 sha256msg2 MSGTMP2, MSGTMP3 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -189,9 +181,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP3, MSG paddd 11*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP3, XMMTMP4 - palignr $4, MSGTMP2, XMMTMP4 - paddd XMMTMP4, MSGTMP0 + mova128 MSGTMP3, XMMTMP + palignr $4, MSGTMP2, XMMTMP + paddd XMMTMP, MSGTMP0 sha256msg2 MSGTMP3, MSGTMP0 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -201,9 +193,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP0, MSG paddd 12*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP0, XMMTMP4 - palignr $4, MSGTMP3, XMMTMP4 - paddd XMMTMP4, MSGTMP1 + mova128 MSGTMP0, XMMTMP + palignr $4, MSGTMP3, XMMTMP + paddd XMMTMP, MSGTMP1 sha256msg2 MSGTMP0, MSGTMP1 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -213,9 +205,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP1, MSG paddd 13*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP1, XMMTMP4 - palignr $4, MSGTMP0, XMMTMP4 - paddd XMMTMP4, MSGTMP2 + mova128 MSGTMP1, XMMTMP + palignr $4, MSGTMP0, XMMTMP + paddd XMMTMP, MSGTMP2 sha256msg2 MSGTMP1, MSGTMP2 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -224,9 +216,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP2, MSG paddd 14*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP2, XMMTMP4 - palignr $4, MSGTMP1, XMMTMP4 - paddd XMMTMP4, MSGTMP3 + mova128 MSGTMP2, XMMTMP + palignr $4, MSGTMP1, XMMTMP + paddd XMMTMP, MSGTMP3 sha256msg2 MSGTMP2, MSGTMP3 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -238,22 +230,20 @@ sha256_process_block64_shaNI: shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 - /* Add current hash values with previously saved */ - paddd 0*16(%esp), STATE0 - paddd 1*16(%esp), STATE1 - /* Write hash values back in the correct order */ - shuf128_32 $0x1B, STATE0, STATE0 /* FEBA */ - shuf128_32 $0xB1, STATE1, STATE1 /* DCHG */ - mova128 STATE0, XMMTMP4 - pblendw $0xF0, STATE1, STATE0 /* DCBA */ - palignr $8, XMMTMP4, STATE1 /* HGFE */ - + shuf128_32 $0x1B, STATE0, STATE0 /* FEBA */ + shuf128_32 $0xB1, STATE1, STATE1 /* DCHG */ + mova128 STATE0, XMMTMP + pblendw $0xF0, STATE1, STATE0 /* DCBA */ + palignr $8, XMMTMP, STATE1 /* HGFE */ + /* add current hash values to previous ones */ + movu128 76+0*16(%eax), XMMTMP + paddd XMMTMP, STATE0 + movu128 76+1*16(%eax), XMMTMP movu128 STATE0, 76+0*16(%eax) + paddd XMMTMP, STATE1 movu128 STATE1, 76+1*16(%eax) - movl %ebp, %esp - popl %ebp ret .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI diff --git a/libbb/hash_md5_sha256_x86-64_shaNI.S b/libbb/hash_md5_sha256_x86-64_shaNI.S index c6c931341..b5c950a9a 100644 --- a/libbb/hash_md5_sha256_x86-64_shaNI.S +++ b/libbb/hash_md5_sha256_x86-64_shaNI.S @@ -31,7 +31,8 @@ #define MSGTMP1 %xmm4 #define MSGTMP2 %xmm5 #define MSGTMP3 %xmm6 -#define XMMTMP4 %xmm7 + +#define XMMTMP %xmm7 #define ABEF_SAVE %xmm9 #define CDGH_SAVE %xmm10 @@ -41,14 +42,14 @@ sha256_process_block64_shaNI: movu128 80+0*16(%rdi), STATE0 movu128 80+1*16(%rdi), STATE1 - shuf128_32 $0xB1, STATE0, STATE0 /* CDAB */ - shuf128_32 $0x1B, STATE1, STATE1 /* EFGH */ - mova128 STATE0, XMMTMP4 - 
palignr $8, STATE1, STATE0 /* ABEF */ - pblendw $0xF0, XMMTMP4, STATE1 /* CDGH */ + shuf128_32 $0xB1, STATE0, STATE0 /* CDAB */ + shuf128_32 $0x1B, STATE1, STATE1 /* EFGH */ + mova128 STATE0, XMMTMP + palignr $8, STATE1, STATE0 /* ABEF */ + pblendw $0xF0, XMMTMP, STATE1 /* CDGH */ -/* XMMTMP4 holds flip mask from here... */ - mova128 PSHUFFLE_BSWAP32_FLIP_MASK(%rip), XMMTMP4 +/* XMMTMP holds flip mask from here... */ + mova128 PSHUFFLE_BSWAP32_FLIP_MASK(%rip), XMMTMP leaq K256+8*16(%rip), SHA256CONSTANTS /* Save hash values for addition after rounds */ @@ -57,7 +58,7 @@ sha256_process_block64_shaNI: /* Rounds 0-3 */ movu128 0*16(DATA_PTR), MSG - pshufb XMMTMP4, MSG + pshufb XMMTMP, MSG mova128 MSG, MSGTMP0 paddd 0*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 @@ -66,7 +67,7 @@ sha256_process_block64_shaNI: /* Rounds 4-7 */ movu128 1*16(DATA_PTR), MSG - pshufb XMMTMP4, MSG + pshufb XMMTMP, MSG mova128 MSG, MSGTMP1 paddd 1*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 @@ -76,7 +77,7 @@ sha256_process_block64_shaNI: /* Rounds 8-11 */ movu128 2*16(DATA_PTR), MSG - pshufb XMMTMP4, MSG + pshufb XMMTMP, MSG mova128 MSG, MSGTMP2 paddd 2*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 @@ -86,14 +87,14 @@ sha256_process_block64_shaNI: /* Rounds 12-15 */ movu128 3*16(DATA_PTR), MSG - pshufb XMMTMP4, MSG + pshufb XMMTMP, MSG /* ...to here */ mova128 MSG, MSGTMP3 paddd 3*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP3, XMMTMP4 - palignr $4, MSGTMP2, XMMTMP4 - paddd XMMTMP4, MSGTMP0 + mova128 MSGTMP3, XMMTMP + palignr $4, MSGTMP2, XMMTMP + paddd XMMTMP, MSGTMP0 sha256msg2 MSGTMP3, MSGTMP0 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -103,9 +104,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP0, MSG paddd 4*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP0, XMMTMP4 - palignr $4, MSGTMP3, XMMTMP4 - paddd XMMTMP4, MSGTMP1 + mova128 MSGTMP0, XMMTMP + palignr $4, MSGTMP3, XMMTMP + paddd XMMTMP, MSGTMP1 sha256msg2 MSGTMP0, MSGTMP1 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -115,9 +116,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP1, MSG paddd 5*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP1, XMMTMP4 - palignr $4, MSGTMP0, XMMTMP4 - paddd XMMTMP4, MSGTMP2 + mova128 MSGTMP1, XMMTMP + palignr $4, MSGTMP0, XMMTMP + paddd XMMTMP, MSGTMP2 sha256msg2 MSGTMP1, MSGTMP2 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -127,9 +128,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP2, MSG paddd 6*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP2, XMMTMP4 - palignr $4, MSGTMP1, XMMTMP4 - paddd XMMTMP4, MSGTMP3 + mova128 MSGTMP2, XMMTMP + palignr $4, MSGTMP1, XMMTMP + paddd XMMTMP, MSGTMP3 sha256msg2 MSGTMP2, MSGTMP3 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -139,9 +140,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP3, MSG paddd 7*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP3, XMMTMP4 - palignr $4, MSGTMP2, XMMTMP4 - paddd XMMTMP4, MSGTMP0 + mova128 MSGTMP3, XMMTMP + palignr $4, MSGTMP2, XMMTMP + paddd XMMTMP, MSGTMP0 sha256msg2 MSGTMP3, MSGTMP0 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -151,9 +152,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP0, MSG paddd 8*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP0, XMMTMP4 - palignr $4, MSGTMP3, XMMTMP4 - paddd XMMTMP4, MSGTMP1 + mova128 MSGTMP0, XMMTMP + palignr $4, MSGTMP3, XMMTMP + paddd XMMTMP, MSGTMP1 sha256msg2 
MSGTMP0, MSGTMP1 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -163,9 +164,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP1, MSG paddd 9*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP1, XMMTMP4 - palignr $4, MSGTMP0, XMMTMP4 - paddd XMMTMP4, MSGTMP2 + mova128 MSGTMP1, XMMTMP + palignr $4, MSGTMP0, XMMTMP + paddd XMMTMP, MSGTMP2 sha256msg2 MSGTMP1, MSGTMP2 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -175,9 +176,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP2, MSG paddd 10*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP2, XMMTMP4 - palignr $4, MSGTMP1, XMMTMP4 - paddd XMMTMP4, MSGTMP3 + mova128 MSGTMP2, XMMTMP + palignr $4, MSGTMP1, XMMTMP + paddd XMMTMP, MSGTMP3 sha256msg2 MSGTMP2, MSGTMP3 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -187,9 +188,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP3, MSG paddd 11*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP3, XMMTMP4 - palignr $4, MSGTMP2, XMMTMP4 - paddd XMMTMP4, MSGTMP0 + mova128 MSGTMP3, XMMTMP + palignr $4, MSGTMP2, XMMTMP + paddd XMMTMP, MSGTMP0 sha256msg2 MSGTMP3, MSGTMP0 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -199,9 +200,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP0, MSG paddd 12*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP0, XMMTMP4 - palignr $4, MSGTMP3, XMMTMP4 - paddd XMMTMP4, MSGTMP1 + mova128 MSGTMP0, XMMTMP + palignr $4, MSGTMP3, XMMTMP + paddd XMMTMP, MSGTMP1 sha256msg2 MSGTMP0, MSGTMP1 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -211,9 +212,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP1, MSG paddd 13*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP1, XMMTMP4 - palignr $4, MSGTMP0, XMMTMP4 - paddd XMMTMP4, MSGTMP2 + mova128 MSGTMP1, XMMTMP + palignr $4, MSGTMP0, XMMTMP + paddd XMMTMP, MSGTMP2 sha256msg2 MSGTMP1, MSGTMP2 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -222,9 +223,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP2, MSG paddd 14*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP2, XMMTMP4 - palignr $4, MSGTMP1, XMMTMP4 - paddd XMMTMP4, MSGTMP3 + mova128 MSGTMP2, XMMTMP + palignr $4, MSGTMP1, XMMTMP + paddd XMMTMP, MSGTMP3 sha256msg2 MSGTMP2, MSGTMP3 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -241,11 +242,11 @@ sha256_process_block64_shaNI: paddd CDGH_SAVE, STATE1 /* Write hash values back in the correct order */ - shuf128_32 $0x1B, STATE0, STATE0 /* FEBA */ - shuf128_32 $0xB1, STATE1, STATE1 /* DCHG */ - mova128 STATE0, XMMTMP4 - pblendw $0xF0, STATE1, STATE0 /* DCBA */ - palignr $8, XMMTMP4, STATE1 /* HGFE */ + shuf128_32 $0x1B, STATE0, STATE0 /* FEBA */ + shuf128_32 $0xB1, STATE1, STATE1 /* DCHG */ + mova128 STATE0, XMMTMP + pblendw $0xF0, STATE1, STATE0 /* DCBA */ + palignr $8, XMMTMP, STATE1 /* HGFE */ movu128 STATE0, 80+0*16(%rdi) movu128 STATE1, 80+1*16(%rdi) -- cgit v1.2.3-55-g6feb From 461a994b09c5022b93bccccf903b39438d61bbf1 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Wed, 9 Feb 2022 01:30:23 +0100 Subject: libbb/sha256: code shrink in 32-bit x86 function old new delta sha256_process_block64_shaNI 697 676 -21 Signed-off-by: Denys Vlasenko --- libbb/hash_md5_sha256_x86-32_shaNI.S | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/libbb/hash_md5_sha256_x86-32_shaNI.S b/libbb/hash_md5_sha256_x86-32_shaNI.S index a849dfcc2..846230e3e 100644 --- a/libbb/hash_md5_sha256_x86-32_shaNI.S +++ 
b/libbb/hash_md5_sha256_x86-32_shaNI.S @@ -34,16 +34,18 @@ #define XMMTMP %xmm7 +#define SHUF(a,b,c,d) $(a+(b<<2)+(c<<4)+(d<<6)) + .balign 8 # allow decoders to fetch at least 2 first insns sha256_process_block64_shaNI: - movu128 76+0*16(%eax), STATE0 - movu128 76+1*16(%eax), STATE1 - shuf128_32 $0xB1, STATE0, STATE0 /* CDAB */ - shuf128_32 $0x1B, STATE1, STATE1 /* EFGH */ + movu128 76+0*16(%eax), STATE1 /* DCBA (msb-to-lsb: 3,2,1,0) */ + movu128 76+1*16(%eax), STATE0 /* HGFE */ +/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ mova128 STATE0, XMMTMP - palignr $8, STATE1, STATE0 /* ABEF */ - pblendw $0xF0, XMMTMP, STATE1 /* CDGH */ + shufps SHUF(1,0,1,0), STATE1, STATE0 /* ABEF */ + shufps SHUF(3,2,3,2), STATE1, XMMTMP /* CDGH */ + mova128 XMMTMP, STATE1 /* XMMTMP holds flip mask from here... */ mova128 PSHUFFLE_BSWAP32_FLIP_MASK, XMMTMP @@ -231,18 +233,19 @@ sha256_process_block64_shaNI: sha256rnds2 STATE1, STATE0 /* Write hash values back in the correct order */ - shuf128_32 $0x1B, STATE0, STATE0 /* FEBA */ - shuf128_32 $0xB1, STATE1, STATE1 /* DCHG */ + /* STATE0: ABEF (msb-to-lsb: 3,2,1,0) */ + /* STATE1: CDGH */ mova128 STATE0, XMMTMP - pblendw $0xF0, STATE1, STATE0 /* DCBA */ - palignr $8, XMMTMP, STATE1 /* HGFE */ +/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ + shufps SHUF(3,2,3,2), STATE1, STATE0 /* DCBA */ + shufps SHUF(1,0,1,0), STATE1, XMMTMP /* HGFE */ /* add current hash values to previous ones */ + movu128 76+1*16(%eax), STATE1 + paddd XMMTMP, STATE1 + movu128 STATE1, 76+1*16(%eax) movu128 76+0*16(%eax), XMMTMP paddd XMMTMP, STATE0 - movu128 76+1*16(%eax), XMMTMP movu128 STATE0, 76+0*16(%eax) - paddd XMMTMP, STATE1 - movu128 STATE1, 76+1*16(%eax) ret .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI -- cgit v1.2.3-55-g6feb From 11bcea7ac0ac4b2156c1b2d53f926d789b9792b4 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Wed, 9 Feb 2022 01:42:49 +0100 Subject: libbb/sha256: code shrink in 64-bit x86 function old new delta sha256_process_block64_shaNI 701 680 -21 Signed-off-by: Denys Vlasenko --- libbb/hash_md5_sha256_x86-64_shaNI.S | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/libbb/hash_md5_sha256_x86-64_shaNI.S b/libbb/hash_md5_sha256_x86-64_shaNI.S index b5c950a9a..bc063b9cc 100644 --- a/libbb/hash_md5_sha256_x86-64_shaNI.S +++ b/libbb/hash_md5_sha256_x86-64_shaNI.S @@ -37,16 +37,18 @@ #define ABEF_SAVE %xmm9 #define CDGH_SAVE %xmm10 +#define SHUF(a,b,c,d) $(a+(b<<2)+(c<<4)+(d<<6)) + .balign 8 # allow decoders to fetch at least 2 first insns sha256_process_block64_shaNI: - movu128 80+0*16(%rdi), STATE0 - movu128 80+1*16(%rdi), STATE1 - shuf128_32 $0xB1, STATE0, STATE0 /* CDAB */ - shuf128_32 $0x1B, STATE1, STATE1 /* EFGH */ + movu128 80+0*16(%rdi), STATE1 /* DCBA (msb-to-lsb: 3,2,1,0) */ + movu128 80+1*16(%rdi), STATE0 /* HGFE */ +/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ mova128 STATE0, XMMTMP - palignr $8, STATE1, STATE0 /* ABEF */ - pblendw $0xF0, XMMTMP, STATE1 /* CDGH */ + shufps SHUF(1,0,1,0), STATE1, STATE0 /* ABEF */ + shufps SHUF(3,2,3,2), STATE1, XMMTMP /* CDGH */ + mova128 XMMTMP, STATE1 /* XMMTMP holds flip mask from here... 
*/ mova128 PSHUFFLE_BSWAP32_FLIP_MASK(%rip), XMMTMP @@ -242,14 +244,15 @@ sha256_process_block64_shaNI: paddd CDGH_SAVE, STATE1 /* Write hash values back in the correct order */ - shuf128_32 $0x1B, STATE0, STATE0 /* FEBA */ - shuf128_32 $0xB1, STATE1, STATE1 /* DCHG */ + /* STATE0: ABEF (msb-to-lsb: 3,2,1,0) */ + /* STATE1: CDGH */ mova128 STATE0, XMMTMP - pblendw $0xF0, STATE1, STATE0 /* DCBA */ - palignr $8, XMMTMP, STATE1 /* HGFE */ +/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ + shufps SHUF(3,2,3,2), STATE1, STATE0 /* DCBA */ + shufps SHUF(1,0,1,0), STATE1, XMMTMP /* HGFE */ movu128 STATE0, 80+0*16(%rdi) - movu128 STATE1, 80+1*16(%rdi) + movu128 XMMTMP, 80+1*16(%rdi) ret .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI -- cgit v1.2.3-55-g6feb From caa9c4f707b661cf398f2c2d66f54f5b0d8adfe2 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Wed, 9 Feb 2022 01:50:22 +0100 Subject: libbb/sha256: code shrink in x86 assembly function old new delta sha256_process_block64_shaNI 32-bit 676 673 -3 sha256_process_block64_shaNI 64-bit 680 677 -3 Signed-off-by: Denys Vlasenko --- libbb/hash_md5_sha256_x86-32_shaNI.S | 11 +++++------ libbb/hash_md5_sha256_x86-64_shaNI.S | 11 +++++------ 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/libbb/hash_md5_sha256_x86-32_shaNI.S b/libbb/hash_md5_sha256_x86-32_shaNI.S index 846230e3e..aa68193bd 100644 --- a/libbb/hash_md5_sha256_x86-32_shaNI.S +++ b/libbb/hash_md5_sha256_x86-32_shaNI.S @@ -39,13 +39,12 @@ .balign 8 # allow decoders to fetch at least 2 first insns sha256_process_block64_shaNI: - movu128 76+0*16(%eax), STATE1 /* DCBA (msb-to-lsb: 3,2,1,0) */ - movu128 76+1*16(%eax), STATE0 /* HGFE */ + movu128 76+0*16(%eax), XMMTMP /* DCBA (msb-to-lsb: 3,2,1,0) */ + movu128 76+1*16(%eax), STATE1 /* HGFE */ /* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ - mova128 STATE0, XMMTMP - shufps SHUF(1,0,1,0), STATE1, STATE0 /* ABEF */ - shufps SHUF(3,2,3,2), STATE1, XMMTMP /* CDGH */ - mova128 XMMTMP, STATE1 + mova128 STATE1, STATE0 + shufps SHUF(1,0,1,0), XMMTMP, STATE0 /* ABEF */ + shufps SHUF(3,2,3,2), XMMTMP, STATE1 /* CDGH */ /* XMMTMP holds flip mask from here... */ mova128 PSHUFFLE_BSWAP32_FLIP_MASK, XMMTMP diff --git a/libbb/hash_md5_sha256_x86-64_shaNI.S b/libbb/hash_md5_sha256_x86-64_shaNI.S index bc063b9cc..4663f750a 100644 --- a/libbb/hash_md5_sha256_x86-64_shaNI.S +++ b/libbb/hash_md5_sha256_x86-64_shaNI.S @@ -42,13 +42,12 @@ .balign 8 # allow decoders to fetch at least 2 first insns sha256_process_block64_shaNI: - movu128 80+0*16(%rdi), STATE1 /* DCBA (msb-to-lsb: 3,2,1,0) */ - movu128 80+1*16(%rdi), STATE0 /* HGFE */ + movu128 80+0*16(%rdi), XMMTMP /* DCBA (msb-to-lsb: 3,2,1,0) */ + movu128 80+1*16(%rdi), STATE1 /* HGFE */ /* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ - mova128 STATE0, XMMTMP - shufps SHUF(1,0,1,0), STATE1, STATE0 /* ABEF */ - shufps SHUF(3,2,3,2), STATE1, XMMTMP /* CDGH */ - mova128 XMMTMP, STATE1 + mova128 STATE1, STATE0 + shufps SHUF(1,0,1,0), XMMTMP, STATE0 /* ABEF */ + shufps SHUF(3,2,3,2), XMMTMP, STATE1 /* CDGH */ /* XMMTMP holds flip mask from here... */ mova128 PSHUFFLE_BSWAP32_FLIP_MASK(%rip), XMMTMP -- cgit v1.2.3-55-g6feb
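The shufps trick that runs through the commits above is easiest to see with intrinsics. Below is a minimal standalone C sketch, not taken from busybox: the file name, variable names, and test values are invented for illustration, and GCC or Clang on x86-64 with SSE2 intrinsics is assumed. It reproduces the SHUF() encoding from the patches, contrasts pshufd with shufps, and performs the same DCBA/HGFE -> ABEF/CDGH repacking that the SHA-NI code feeds to sha256rnds2.

/*
 * shuf_demo.c - standalone illustration (not part of the patches above) of
 * the shufps lane selection used by the SHUF() macro and the ABEF/CDGH
 * repacking. Assumptions: GCC/Clang on x86-64; names and values are
 * invented for this sketch only.
 *
 *   $ gcc -O2 shuf_demo.c -o shuf_demo && ./shuf_demo
 */
#include <stdio.h>
#include <stdint.h>
#include <immintrin.h>

/* Same encoding as the SHUF(a,b,c,d) macro in the patches: result lanes 0,1
 * are picked from the "destination" operand (first intrinsic argument),
 * lanes 2,3 from the "source" operand (second intrinsic argument). */
#define SHUF(a,b,c,d)  ((a) + ((b) << 2) + ((c) << 4) + ((d) << 6))

static void dump(const char *name, __m128i v)
{
	uint32_t w[4];
	_mm_storeu_si128((__m128i *)w, v);
	/* printed msb-to-lsb, matching the DCBA/HGFE comments in the patches */
	printf("%-7s %08x %08x %08x %08x\n", name,
		(unsigned)w[3], (unsigned)w[2], (unsigned)w[1], (unsigned)w[0]);
}

int main(void)
{
	/* stand-in for ctx->hash[0..7] = A,B,C,D,E,F,G,H */
	uint32_t hash[8] = {
		0xaaaaaaaa, 0xbbbbbbbb, 0xcccccccc, 0xdddddddd, /* A B C D */
		0xeeeeeeee, 0xffffffff, 0x11111111, 0x22222222, /* E F G H */
	};
	__m128i dcba = _mm_loadu_si128((const __m128i *)&hash[0]); /* lanes 3..0 = D,C,B,A */
	__m128i hgfe = _mm_loadu_si128((const __m128i *)&hash[4]); /* lanes 3..0 = H,G,F,E */

	/* pshufd: every result lane comes from the single source operand */
	__m128i swapped = _mm_shuffle_epi32(dcba, SHUF(2, 3, 0, 1)); /* imm 0x4e -> B,A,D,C */

	/* shufps: low two lanes from the first argument (the "dst" register in
	 * AT&T syntax), high two lanes from the second one (the "src") */
	__m128 abef = _mm_shuffle_ps(_mm_castsi128_ps(hgfe),
	                             _mm_castsi128_ps(dcba), SHUF(1, 0, 1, 0));
	__m128 cdgh = _mm_shuffle_ps(_mm_castsi128_ps(hgfe),
	                             _mm_castsi128_ps(dcba), SHUF(3, 2, 3, 2));

	dump("DCBA", dcba);
	dump("HGFE", hgfe);
	dump("pshufd", swapped);
	dump("ABEF", _mm_castps_si128(abef)); /* layout expected in STATE0 */
	dump("CDGH", _mm_castps_si128(cdgh)); /* layout expected in STATE1 */
	return 0;
}

The SHUF(1,0,1,0) and SHUF(3,2,3,2) selections above are the same ones the 32-bit and 64-bit sha256 shaNI routines use to build ABEF/CDGH before the rounds and DCBA/HGFE after them; the pshufd line shows why a lone pshufd cannot combine two registers the way shufps does.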