From ab755f492599cf595d532f0f240a14c6e5caa435 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 14 Jul 2023 16:37:24 +0200 Subject: hwclock: force LONG_OPTS, stop accepting non-compatible -t function old new delta hwclock_main 576 579 +3 .rodata 105404 105349 -55 ------------------------------------------------------------------------------ (add/remove: 0/0 grow/shrink: 1/1 up/down: 3/-55) Total: -52 bytes Signed-off-by: Denys Vlasenko --- util-linux/hwclock.c | 83 ++++++++++++++++++++++++++-------------------------- 1 file changed, 42 insertions(+), 41 deletions(-) diff --git a/util-linux/hwclock.c b/util-linux/hwclock.c index d78bfe374..e6f0043d0 100644 --- a/util-linux/hwclock.c +++ b/util-linux/hwclock.c @@ -9,6 +9,7 @@ //config:config HWCLOCK //config: bool "hwclock (5.9 kb)" //config: default y +//config: select LONG_OPTS //config: help //config: The hwclock utility is used to read and set the hardware clock //config: on a system. This is primarily used to set the current time on @@ -409,89 +410,89 @@ static void set_rtc_param(const char **pp_rtcname, char *rtc_param) // -v, --verbose display more details //usage:#define hwclock_trivial_usage -//usage: IF_LONG_OPTS( -//usage: "[-swul] [--systz] [--param-get PARAM] [--param-set PARAM=VAL] [-f DEV]" -//usage: ) -//usage: IF_NOT_LONG_OPTS( -//usage: "[-swult] [-g PARAM] [-p PARAM=VAL] [-f DEV]" -//usage: ) +//usage: "[-ul] [-f DEV] [-s|-w|--systz|--param-get PARAM|--param-set PARAM=VAL]" //usage:#define hwclock_full_usage "\n\n" //usage: "Show or set hardware clock (RTC)\n" +//usage: "\n -f DEV Use this device (e.g. /dev/rtc2)" +//usage: "\n -u Assume RTC is kept in UTC" +//usage: "\n -l Assume RTC is kept in local time" +//usage: "\n (if neither is given, read from "ADJTIME_PATH")" ///////: "\n -r Show RTC time" ///////-r is default, don't bother showing it in help //usage: "\n -s Set system time from RTC" //usage: "\n -w Set RTC from system time" -//usage: IF_LONG_OPTS( //usage: "\n --systz Set in-kernel timezone, correct system time" //usage: "\n if RTC is kept in local time" //usage: "\n --param-get PARAM Get RTC parameter" //usage: "\n --param-set PARAM=VAL Set RTC parameter" -//usage: ) -//usage: "\n -f DEV Use specified device (e.g. /dev/rtc2)" -//usage: "\n -u Assume RTC is kept in UTC" -//usage: "\n -l Assume RTC is kept in local time" -//usage: "\n (if neither is given, read from "ADJTIME_PATH")" - -//TODO: get rid of incompatible -t alias to --systz? 
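A note on the exclusivity enforcement added below (illustration only, not part of the patch): the hunk replaces the long pairwise-exclusion string in the getopt32 spec ("r--wstgp:w--rstgp:...") with one arithmetic test. The shared -l/-u/-f bits are shifted away and the remaining mode bits are checked for "more than one bit set". A minimal standalone sketch of that bit trick, with a hypothetical helper name:

	/* Sketch only: x & (x - 1) clears the lowest set bit, so the
	 * result is zero exactly when x is 0 or a power of two, i.e.
	 * when at most one option bit is set. */
	static int more_than_one_bit_set(unsigned x)	/* hypothetical */
	{
		return (x & (x - 1)) != 0;
	}

hwclock applies this to opt >> 3, so -l, -u and -f (the three lowest bits) remain combinable with any single mode option.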
- -#define HWCLOCK_OPT_LOCALTIME 0x01 -#define HWCLOCK_OPT_UTC 0x02 -#define HWCLOCK_OPT_SHOW 0x04 -#define HWCLOCK_OPT_HCTOSYS 0x08 -#define HWCLOCK_OPT_SYSTOHC 0x10 -#define HWCLOCK_OPT_SYSTZ 0x20 -#define HWCLOCK_OPT_RTCFILE 0x40 -#define HWCLOCK_OPT_PARAM_GET 0x80 -#define HWCLOCK_OPT_PARAM_SET 0x100 int hwclock_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE; int hwclock_main(int argc UNUSED_PARAM, char **argv) { const char *rtcname = NULL; char *param; - unsigned opt; + unsigned opt, exclusive; int utc; -#if ENABLE_LONG_OPTS +#define OPT_LOCALTIME (1 << 0) +#define OPT_UTC (1 << 1) +#define OPT_RTCFILE (1 << 2) +#define OPT_SHOW (1 << 3) +#define OPT_HCTOSYS (1 << 4) +#define OPT_SYSTOHC (1 << 5) +#define OPT_PARAM_GET (1 << 6) +#define OPT_PARAM_SET (1 << 7) +//#define OPT_VERBOSE (1 << 8) UNUSED +#define OPT_SYSTZ (1 << 9) static const char hwclock_longopts[] ALIGN1 = "localtime\0" No_argument "l" "utc\0" No_argument "u" + "rtc\0" Required_argument "f" "show\0" No_argument "r" "hctosys\0" No_argument "s" "systohc\0" No_argument "w" - "systz\0" No_argument "t" /* short opt is non-standard */ - "rtc\0" Required_argument "f" - "param-get\0" Required_argument "g" /* short opt is non-standard */ - "param-set\0" Required_argument "p" /* short opt is non-standard */ + "param-get\0" Required_argument "\xfd" /* no short equivalent */ + "param-set\0" Required_argument "\xfe" /* no short equivalent */ + "systz\0" No_argument "\xff" /* no short equivalent */ ; -#endif opt = getopt32long(argv, - "^""lurswtf:g:p:v" /* -v is accepted and ignored */ + "^""luf:rsw\xfd:\xfe:v" /* -v is accepted and ignored */ "\0" - "r--wstgp:w--rstgp:s--wrtgp:t--rswgp:g--rswtp:p--rswtg:l--u:u--l", + "l--u:u--l", hwclock_longopts, &rtcname, ¶m, ¶m ); +#if 0 //DEBUG + bb_error_msg("opt:0x%x", opt); + if (opt & OPT_PARAM_GET) bb_error_msg("OPT_PARAM_GET %s", param); + if (opt & OPT_PARAM_SET) bb_error_msg("OPT_PARAM_SET %s", param); + if (opt & OPT_SYSTZ ) bb_error_msg("OPT_SYSTZ"); + return 0; +#endif + /* All options apart from -luf are exclusive, enforce */ + exclusive = opt >> 3; + if ((exclusive - 1) & exclusive) /* more than one bit set? 
*/ + bb_show_usage(); /* If -u or -l wasn't given, check if we are using utc */ - if (opt & (HWCLOCK_OPT_UTC | HWCLOCK_OPT_LOCALTIME)) - utc = (opt & HWCLOCK_OPT_UTC); + if (opt & (OPT_UTC | OPT_LOCALTIME)) + utc = (opt & OPT_UTC); else utc = rtc_adjtime_is_utc(); - if (opt & HWCLOCK_OPT_HCTOSYS) + if (opt & OPT_HCTOSYS) to_sys_clock(&rtcname, utc); - else if (opt & HWCLOCK_OPT_SYSTOHC) + else if (opt & OPT_SYSTOHC) from_sys_clock(&rtcname, utc); - else if (opt & HWCLOCK_OPT_SYSTZ) + else if (opt & OPT_SYSTZ) set_kernel_timezone_and_clock(utc, NULL); - else if (opt & HWCLOCK_OPT_PARAM_GET) + else if (opt & OPT_PARAM_GET) get_rtc_param(&rtcname, param); - else if (opt & HWCLOCK_OPT_PARAM_SET) + else if (opt & OPT_PARAM_SET) set_rtc_param(&rtcname, param); else - /* default HWCLOCK_OPT_SHOW */ + /* default OPT_SHOW */ show_clock(&rtcname, utc); return 0; -- cgit v1.2.3-55-g6feb From cf809e2f2dbf699035e4841e45070b947374a989 Mon Sep 17 00:00:00 2001 From: YU Jincheng Date: Fri, 7 Jul 2023 16:44:24 +0800 Subject: getfattr: new applet function old new delta getfattr_main - 309 +309 print_attr - 115 +115 packed_usage 34576 34631 +55 .rodata 105349 105395 +46 lgetxattr - 41 +41 getxattr - 41 +41 llistxattr - 35 +35 listxattr - 35 +35 applet_names 2806 2815 +9 applet_main 1620 1624 +4 ------------------------------------------------------------------------------ (add/remove: 7/0 grow/shrink: 4/0 up/down: 690/0) Total: 690 bytes Signed-off-by: YU Jincheng Signed-off-by: Denys Vlasenko --- miscutils/getfattr.c | 131 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 131 insertions(+) create mode 100644 miscutils/getfattr.c diff --git a/miscutils/getfattr.c b/miscutils/getfattr.c new file mode 100644 index 000000000..59b6f6bca --- /dev/null +++ b/miscutils/getfattr.c @@ -0,0 +1,131 @@ +/* + * getfattr - get extended attributes of filesystem objects. + * + * Copyright (C) 2023 by LoveSy + * + * Licensed under GPLv2, see file LICENSE in this source tree. + */ +//config:config GETFATTR +//config: bool "getfattr (12.3 kb)" +//config: default y +//config: help +//config: Get extended attributes on files + +//applet:IF_GETFATTR(APPLET_NOEXEC(getfattr, getfattr, BB_DIR_USR_BIN, BB_SUID_DROP, getfattr)) + +//kbuild:lib-$(CONFIG_GETFATTR) += getfattr.o + +#include +#include +#include "libbb.h" + +//usage:#define getfattr_trivial_usage +//usage: "[-h] {-d|-n ATTR} FILE...\n" +//usage:#define getfattr_full_usage "\n\n" +//usage: "Get extended attributes" +//usage: "\n" +//usage: "\n -h Do not follow symlinks" +//usage: "\n -d Dump all attributes" +//usage: "\n -n ATTR Get attribute ATTR" + +enum { + OPT_h = (1 << 0), + OPT_d = (1 << 1), +}; + +static int print_attr(const char *file, const char *name, char **buf, size_t *bufsize) +{ + ssize_t len; + + if (*bufsize == 0) + goto grow; + again: + len = ((option_mask32 & OPT_h) ? lgetxattr: getxattr)(file, name, *buf, *bufsize); + if (len < 0) { + if (errno != ERANGE) + return len; + grow: + *bufsize = (*bufsize * 2) + 1024; + *buf = xrealloc(*buf, *bufsize); + goto again; + } + printf("%s=\"%.*s\"\n", name, len, *buf); + return 0; +} + +static ssize_t list_attr(const char *file, char **list, size_t *listsize) +{ + ssize_t len; + + if (*listsize == 0) + goto grow; + again: + len = ((option_mask32 & OPT_h) ? 
llistxattr : listxattr)(file, *list, *listsize); + if (len < 0) { + if (errno != ERANGE) + return len; + grow: + *listsize = (*listsize * 2) + 1024; + *list = xrealloc(*list, *listsize); + goto again; + } + return len; +} + +int getfattr_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE; +int getfattr_main(int argc UNUSED_PARAM, char **argv) +{ + const char *name; + int status; + int opt; + char *buf = NULL; + size_t bufsize = 0; + char *list = NULL; + size_t listsize = 0; + + opt = getopt32(argv, "^" + "hdn:" + /* Min one arg; exactly one of -n or -d is required. */ + "\0" "-1:d:n:n--d:d--n" + , &name + ); + argv += optind; + status = EXIT_SUCCESS; + + do { + int r; + if (opt & OPT_d) { + ssize_t len = list_attr(*argv, &list, &listsize); + if (len > 0) { + char *key; + printf("# file: %s\n", *argv); + key = list; + while (len > 0) { + ssize_t keylen; + r = print_attr(*argv, key, &buf, &bufsize); + if (r) + goto err; + keylen = strlen(key) + 1; + key += keylen; + len -= keylen; + } + bb_putchar('\n'); + } + } else { + printf("# file: %s\n", *argv); + r = print_attr(*argv, name, &buf, &bufsize); + if (r) { + err: + bb_simple_perror_msg(*argv); + status = EXIT_FAILURE; + // continue; maybe? + } + bb_putchar('\n'); + } + } while (*++argv); + + if (ENABLE_FEATURE_CLEAN_UP) + free(buf); + + fflush_stdout_and_exit(status); +} -- cgit v1.2.3-55-g6feb From a6a102ec4c8d96fcfb968c88fbdae80f6142c7bf Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Mon, 17 Jul 2023 09:36:17 +0200 Subject: getfattr: fix "getfattr NOTEXIST" - now prints error msg function old new delta getfattr_main 309 307 -2 .rodata 105395 105391 -4 ------------------------------------------------------------------------------ (add/remove: 0/0 grow/shrink: 0/2 up/down: 0/-6) Total: -6 bytes Signed-off-by: Denys Vlasenko --- miscutils/getfattr.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/miscutils/getfattr.c b/miscutils/getfattr.c index 59b6f6bca..905aec65f 100644 --- a/miscutils/getfattr.c +++ b/miscutils/getfattr.c @@ -31,6 +31,7 @@ enum { OPT_h = (1 << 0), OPT_d = (1 << 1), + OPT_n = (1 << 2), }; static int print_attr(const char *file, const char *name, char **buf, size_t *bufsize) @@ -85,8 +86,9 @@ int getfattr_main(int argc UNUSED_PARAM, char **argv) opt = getopt32(argv, "^" "hdn:" - /* Min one arg; exactly one of -n or -d is required. */ - "\0" "-1:d:n:n--d:d--n" + /* Min one arg; -d and -n are exclusive */ + "\0" "-1:n--d:d--n" + //getfattr 2.5.1 does not enforce this: ":d:n" /* exactly one of -n or -d is required */ , &name ); argv += optind; @@ -94,8 +96,11 @@ int getfattr_main(int argc UNUSED_PARAM, char **argv) do { int r; - if (opt & OPT_d) { +//getfattr 2.5.1 with no -n/-d defaults to -d + if (!(opt & OPT_n)) { ssize_t len = list_attr(*argv, &list, &listsize); + if (len < 0) + goto err; if (len > 0) { char *key; printf("# file: %s\n", *argv); @@ -118,7 +123,7 @@ int getfattr_main(int argc UNUSED_PARAM, char **argv) err: bb_simple_perror_msg(*argv); status = EXIT_FAILURE; - // continue; maybe? 
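A note on the helpers above (illustration only, not part of the patch): print_attr() and list_attr() use the usual size-probe idiom for the xattr syscalls: call with the current buffer and, if the kernel reports ERANGE, grow the buffer and retry. A minimal standalone sketch of the same pattern against plain getxattr(2), with a hypothetical helper name:

	#include <sys/xattr.h>
	#include <errno.h>
	#include <stdlib.h>

	/* Sketch only: fetch one attribute value, growing the buffer
	 * until it fits. Returns a malloc'ed buffer (caller frees) and
	 * stores the value length, or returns NULL on error. */
	static char *fetch_xattr(const char *file, const char *name, ssize_t *lenp)
	{
		size_t size = 1024;
		char *buf = NULL;
		for (;;) {
			ssize_t len;
			char *nbuf = realloc(buf, size);
			if (!nbuf) {
				free(buf);
				return NULL;
			}
			buf = nbuf;
			len = getxattr(file, name, buf, size);
			if (len >= 0) {
				*lenp = len;
				return buf;
			}
			if (errno != ERANGE) {
				free(buf);
				return NULL;
			}
			size *= 2;	/* buffer too small: grow and retry */
		}
	}

An alternative is to ask for the required size first with getxattr(file, name, NULL, 0); the retry loop avoids a second race if the value grows between the two calls, which is why the applet keeps the buffer and enlarges it on ERANGE.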
+ continue; } bb_putchar('\n'); } -- cgit v1.2.3-55-g6feb From c484846c4459affa769b84cbd0b586f2bbaec828 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Mon, 17 Jul 2023 17:29:36 +0200 Subject: introduce and use exitcode_t function old new delta strings_main 422 420 -2 setfattr_main 175 173 -2 brctl_main 1548 1546 -2 makedevs_main 979 975 -4 rev_main 337 332 -5 getfattr_main 307 302 -5 cut_main 1201 1196 -5 cksum_main 398 393 -5 umount_main 573 565 -8 ln_main 516 508 -8 expand_main 660 652 -8 df_main 1068 1060 -8 renice_main 346 332 -14 ------------------------------------------------------------------------------ (add/remove: 0/0 grow/shrink: 0/13 up/down: 0/-76) Total: -76 bytes Signed-off-by: Denys Vlasenko --- coreutils/cksum.c | 2 +- coreutils/cut.c | 2 +- coreutils/dd.c | 2 +- coreutils/df.c | 2 +- coreutils/expand.c | 2 +- coreutils/fold.c | 2 +- coreutils/ln.c | 2 +- coreutils/touch.c | 2 +- include/libbb.h | 7 +++++++ miscutils/getfattr.c | 2 +- miscutils/makedevs.c | 2 +- miscutils/setfattr.c | 2 +- miscutils/strings.c | 3 ++- networking/brctl.c | 2 +- networking/tc.c | 5 ++--- util-linux/renice.c | 2 +- util-linux/rev.c | 2 +- util-linux/umount.c | 2 +- 18 files changed, 26 insertions(+), 19 deletions(-) diff --git a/coreutils/cksum.c b/coreutils/cksum.c index badc63a6a..1fb6ef2d0 100644 --- a/coreutils/cksum.c +++ b/coreutils/cksum.c @@ -39,7 +39,7 @@ int cksum_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE; int cksum_main(int argc UNUSED_PARAM, char **argv) { uint32_t *crc32_table = crc32_filltable(NULL, IS_CKSUM); - int exit_code = EXIT_SUCCESS; + exitcode_t exit_code = EXIT_SUCCESS; #if ENABLE_DESKTOP getopt32(argv, ""); /* cksum coreutils 6.9 compat */ diff --git a/coreutils/cut.c b/coreutils/cut.c index 25b16d1a8..d129f9b9d 100644 --- a/coreutils/cut.c +++ b/coreutils/cut.c @@ -311,7 +311,7 @@ int cut_main(int argc UNUSED_PARAM, char **argv) } { - int retval = EXIT_SUCCESS; + exitcode_t retval = EXIT_SUCCESS; if (!*argv) *--argv = (char *)"-"; diff --git a/coreutils/dd.c b/coreutils/dd.c index c032ebe1b..8bb782781 100644 --- a/coreutils/dd.c +++ b/coreutils/dd.c @@ -375,7 +375,7 @@ int dd_main(int argc UNUSED_PARAM, char **argv) OP_oflag_direct, #endif }; - smallint exitcode = EXIT_FAILURE; + exitcode_t exitcode = EXIT_FAILURE; int i; size_t ibs = 512; char *ibuf; diff --git a/coreutils/df.c b/coreutils/df.c index 76e9cefbf..03aa78148 100644 --- a/coreutils/df.c +++ b/coreutils/df.c @@ -113,7 +113,7 @@ int df_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE; int df_main(int argc UNUSED_PARAM, char **argv) { unsigned long df_disp_hr = 1024; - int status = EXIT_SUCCESS; + exitcode_t status = EXIT_SUCCESS; unsigned opt; FILE *mount_table; struct mntent *mount_entry; diff --git a/coreutils/expand.c b/coreutils/expand.c index 47693e144..c4db26055 100644 --- a/coreutils/expand.c +++ b/coreutils/expand.c @@ -192,7 +192,7 @@ int expand_main(int argc UNUSED_PARAM, char **argv) FILE *file; unsigned tab_size; unsigned opt; - int exit_status = EXIT_SUCCESS; + exitcode_t exit_status = EXIT_SUCCESS; init_unicode(); diff --git a/coreutils/fold.c b/coreutils/fold.c index 2839c8c68..8112fe911 100644 --- a/coreutils/fold.c +++ b/coreutils/fold.c @@ -77,7 +77,7 @@ int fold_main(int argc UNUSED_PARAM, char **argv) char *line_out = NULL; const char *w_opt = "80"; unsigned width; - smallint exitcode = EXIT_SUCCESS; + exitcode_t exitcode = EXIT_SUCCESS; init_unicode(); diff --git a/coreutils/ln.c b/coreutils/ln.c index 34eec398a..080ba142e 100644 --- a/coreutils/ln.c +++ 
b/coreutils/ln.c @@ -52,7 +52,7 @@ int ln_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE; int ln_main(int argc, char **argv) { - int status = EXIT_SUCCESS; + exitcode_t status = EXIT_SUCCESS; int opts; char *last; char *src_name; diff --git a/coreutils/touch.c b/coreutils/touch.c index 8fde70e12..ced596c89 100644 --- a/coreutils/touch.c +++ b/coreutils/touch.c @@ -77,7 +77,7 @@ int touch_main(int argc UNUSED_PARAM, char **argv) { int fd; int opts; - smalluint status = EXIT_SUCCESS; + exitcode_t status = EXIT_SUCCESS; #if ENABLE_FEATURE_TOUCH_SUSV3 char *reference_file; char *date_str; diff --git a/include/libbb.h b/include/libbb.h index 640fa3988..eb97a9880 100644 --- a/include/libbb.h +++ b/include/libbb.h @@ -1444,6 +1444,13 @@ void bb_verror_msg(const char *s, va_list p, const char *strerr) FAST_FUNC; void bb_die_memory_exhausted(void) NORETURN FAST_FUNC; void bb_logenv_override(void) FAST_FUNC; +/* x86 benefits from narrow exit code variables + * (because it has no widening MOV imm8,word32 insn, has to use MOV imm32,w + * for "exitcode = EXIT_FAILURE" and similar. The downside is that sometimes +* gcc widens the variable to int in various ugly suboptimal ways). + */ +typedef smalluint exitcode_t; + #if ENABLE_FEATURE_SYSLOG_INFO void bb_info_msg(const char *s, ...) __attribute__ ((format (printf, 1, 2))) FAST_FUNC; void bb_simple_info_msg(const char *s) FAST_FUNC; diff --git a/miscutils/getfattr.c b/miscutils/getfattr.c index 905aec65f..cb42fdac0 100644 --- a/miscutils/getfattr.c +++ b/miscutils/getfattr.c @@ -77,7 +77,7 @@ int getfattr_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE; int getfattr_main(int argc UNUSED_PARAM, char **argv) { const char *name; - int status; + exitcode_t status; int opt; char *buf = NULL; size_t bufsize = 0; diff --git a/miscutils/makedevs.c b/miscutils/makedevs.c index 48be91875..999a3b976 100644 --- a/miscutils/makedevs.c +++ b/miscutils/makedevs.c @@ -181,7 +181,7 @@ int makedevs_main(int argc UNUSED_PARAM, char **argv) { parser_t *parser; char *line = (char *)"-"; - int ret = EXIT_SUCCESS; + exitcode_t ret = EXIT_SUCCESS; getopt32(argv, "^" "d:" "\0" "=1", &line); argv += optind; diff --git a/miscutils/setfattr.c b/miscutils/setfattr.c index 10d1840c9..b68bc9452 100644 --- a/miscutils/setfattr.c +++ b/miscutils/setfattr.c @@ -32,7 +32,7 @@ int setfattr_main(int argc UNUSED_PARAM, char **argv) { const char *name; const char *value = ""; - int status; + exitcode_t status; int opt; enum { OPT_h = (1 << 0), diff --git a/miscutils/strings.c b/miscutils/strings.c index 036df5c5d..bd1850cbb 100644 --- a/miscutils/strings.c +++ b/miscutils/strings.c @@ -40,7 +40,8 @@ int strings_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE; int strings_main(int argc UNUSED_PARAM, char **argv) { - int n, c, status = EXIT_SUCCESS; + int n, c; + exitcode_t status = EXIT_SUCCESS; unsigned count; off_t offset; FILE *file; diff --git a/networking/brctl.c b/networking/brctl.c index 7b0270b51..0f8dc2f7a 100644 --- a/networking/brctl.c +++ b/networking/brctl.c @@ -538,7 +538,7 @@ int brctl_main(int argc UNUSED_PARAM, char **argv) DIR *net; struct dirent *ent; int need_hdr = 1; - int exitcode = EXIT_SUCCESS; + exitcode_t exitcode = EXIT_SUCCESS; if (*argv) { /* "show BR1 BR2 BR3" */ diff --git a/networking/tc.c b/networking/tc.c index 43187f7ee..3a79fd2d9 100644 --- a/networking/tc.c +++ b/networking/tc.c @@ -502,7 +502,7 @@ int tc_main(int argc UNUSED_PARAM, char **argv) }; struct rtnl_handle rth; struct tcmsg msg; - int ret, obj, cmd, arg; + int obj, cmd, arg; char 
*dev = NULL; INIT_G(); @@ -510,7 +510,6 @@ int tc_main(int argc UNUSED_PARAM, char **argv) if (!*++argv) bb_show_usage(); xrtnl_open(&rth); - ret = EXIT_SUCCESS; obj = index_in_substrings(objects, *argv++); if (obj < 0) @@ -625,5 +624,5 @@ int tc_main(int argc UNUSED_PARAM, char **argv) if (ENABLE_FEATURE_CLEAN_UP) { rtnl_close(&rth); } - return ret; + return EXIT_SUCCESS; } diff --git a/util-linux/renice.c b/util-linux/renice.c index 53f197cce..f2737f29b 100644 --- a/util-linux/renice.c +++ b/util-linux/renice.c @@ -45,7 +45,7 @@ int renice_main(int argc UNUSED_PARAM, char **argv) { static const char Xetpriority_msg[] ALIGN1 = "%cetpriority"; - int retval = EXIT_SUCCESS; + exitcode_t retval = EXIT_SUCCESS; int which = PRIO_PROCESS; /* Default 'which' value. */ int use_relative = 0; int adjustment, new_priority; diff --git a/util-linux/rev.c b/util-linux/rev.c index 12df2b9ff..aad53722d 100644 --- a/util-linux/rev.c +++ b/util-linux/rev.c @@ -51,7 +51,7 @@ static void strrev(CHAR_T *s, int len) int rev_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE; int rev_main(int argc UNUSED_PARAM, char **argv) { - int retval; + exitcode_t retval; size_t bufsize; char *buf; diff --git a/util-linux/umount.c b/util-linux/umount.c index 23da32868..f5c97a034 100644 --- a/util-linux/umount.c +++ b/util-linux/umount.c @@ -97,7 +97,7 @@ int umount_main(int argc UNUSED_PARAM, char **argv) struct mntent me; FILE *fp; char *fstype = NULL; - int status = EXIT_SUCCESS; + exitcode_t status = EXIT_SUCCESS; unsigned opt; struct mtab_list { char *dir; -- cgit v1.2.3-55-g6feb From 8f0845cad7bfc46939132b33f9cd0753b261b953 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 18 Jul 2023 16:41:12 +0200 Subject: libbb: rename source files, no code changes Signed-off-by: Denys Vlasenko --- libbb/Kbuild.src | 10 +- libbb/hash_md5_sha256_x86-32_shaNI.S | 284 ------- libbb/hash_md5_sha256_x86-64_shaNI.S | 290 ------- libbb/hash_md5_sha_x86-32_shaNI.S | 234 ------ libbb/hash_md5_sha_x86-64.S | 1489 ---------------------------------- libbb/hash_md5_sha_x86-64.S.sh | 478 ----------- libbb/hash_md5_sha_x86-64_shaNI.S | 232 ------ libbb/hash_sha1_hwaccel_x86-32.S | 234 ++++++ libbb/hash_sha1_hwaccel_x86-64.S | 232 ++++++ libbb/hash_sha1_x86-64.S | 1489 ++++++++++++++++++++++++++++++++++ libbb/hash_sha1_x86-64.S.sh | 478 +++++++++++ libbb/hash_sha256_hwaccel_x86-32.S | 284 +++++++ libbb/hash_sha256_hwaccel_x86-64.S | 290 +++++++ 13 files changed, 3012 insertions(+), 3012 deletions(-) delete mode 100644 libbb/hash_md5_sha256_x86-32_shaNI.S delete mode 100644 libbb/hash_md5_sha256_x86-64_shaNI.S delete mode 100644 libbb/hash_md5_sha_x86-32_shaNI.S delete mode 100644 libbb/hash_md5_sha_x86-64.S delete mode 100755 libbb/hash_md5_sha_x86-64.S.sh delete mode 100644 libbb/hash_md5_sha_x86-64_shaNI.S create mode 100644 libbb/hash_sha1_hwaccel_x86-32.S create mode 100644 libbb/hash_sha1_hwaccel_x86-64.S create mode 100644 libbb/hash_sha1_x86-64.S create mode 100755 libbb/hash_sha1_x86-64.S.sh create mode 100644 libbb/hash_sha256_hwaccel_x86-32.S create mode 100644 libbb/hash_sha256_hwaccel_x86-64.S diff --git a/libbb/Kbuild.src b/libbb/Kbuild.src index 653025e56..c3b30003f 100644 --- a/libbb/Kbuild.src +++ b/libbb/Kbuild.src @@ -56,11 +56,11 @@ lib-y += login.o lib-y += make_directory.o lib-y += makedev.o lib-y += hash_md5_sha.o -lib-y += hash_md5_sha_x86-64.o -lib-y += hash_md5_sha_x86-64_shaNI.o -lib-y += hash_md5_sha_x86-32_shaNI.o -lib-y += hash_md5_sha256_x86-64_shaNI.o -lib-y += hash_md5_sha256_x86-32_shaNI.o +lib-y += 
hash_sha1_x86-64.o +lib-y += hash_sha1_hwaccel_x86-64.o +lib-y += hash_sha1_hwaccel_x86-32.o +lib-y += hash_sha256_hwaccel_x86-64.o +lib-y += hash_sha256_hwaccel_x86-32.o # Alternative (disabled) MD5 implementation #lib-y += hash_md5prime.o lib-y += messages.o diff --git a/libbb/hash_md5_sha256_x86-32_shaNI.S b/libbb/hash_md5_sha256_x86-32_shaNI.S deleted file mode 100644 index a0e4a571a..000000000 --- a/libbb/hash_md5_sha256_x86-32_shaNI.S +++ /dev/null @@ -1,284 +0,0 @@ -#if ENABLE_SHA256_HWACCEL && defined(__GNUC__) && defined(__i386__) -/* The code is adapted from Linux kernel's source */ - -// We use shorter insns, even though they are for "wrong" -// data type (fp, not int). -// For Intel, there is no penalty for doing it at all -// (CPUs which do have such penalty do not support SHA insns). -// For AMD, the penalty is one extra cycle -// (allegedly: I failed to find measurable difference). - -//#define mova128 movdqa -#define mova128 movaps -//#define movu128 movdqu -#define movu128 movups -//#define shuf128_32 pshufd -#define shuf128_32 shufps - -// pshufb and palignr are SSSE3 insns. -// We do not check SSSE3 in cpuid, -// all SHA-capable CPUs support it as well. - -#ifdef __linux__ - .section .note.GNU-stack, "", @progbits -#endif - .section .text.sha256_process_block64_shaNI, "ax", @progbits - .globl sha256_process_block64_shaNI - .hidden sha256_process_block64_shaNI - .type sha256_process_block64_shaNI, @function - -#define DATA_PTR %eax - -#define SHA256CONSTANTS %ecx - -#define MSG %xmm0 -#define STATE0 %xmm1 -#define STATE1 %xmm2 -#define MSGTMP0 %xmm3 -#define MSGTMP1 %xmm4 -#define MSGTMP2 %xmm5 -#define MSGTMP3 %xmm6 - -#define XMMTMP %xmm7 - -#define SHUF(a,b,c,d) $(a+(b<<2)+(c<<4)+(d<<6)) - - .balign 8 # allow decoders to fetch at least 2 first insns -sha256_process_block64_shaNI: - - movu128 76+0*16(%eax), XMMTMP /* ABCD (little-endian dword order) */ - movu128 76+1*16(%eax), STATE1 /* EFGH */ -/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ - mova128 STATE1, STATE0 - /* --- -------------- ABCD -- EFGH */ - shufps SHUF(1,0,1,0), XMMTMP, STATE0 /* FEBA */ - shufps SHUF(3,2,3,2), XMMTMP, STATE1 /* HGDC */ - -/* XMMTMP holds flip mask from here... 
*/ - mova128 PSHUFFLE_BSWAP32_FLIP_MASK, XMMTMP - movl $K256+8*16, SHA256CONSTANTS - - /* Rounds 0-3 */ - movu128 0*16(DATA_PTR), MSG - pshufb XMMTMP, MSG - mova128 MSG, MSGTMP0 - paddd 0*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - - /* Rounds 4-7 */ - movu128 1*16(DATA_PTR), MSG - pshufb XMMTMP, MSG - mova128 MSG, MSGTMP1 - paddd 1*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP1, MSGTMP0 - - /* Rounds 8-11 */ - movu128 2*16(DATA_PTR), MSG - pshufb XMMTMP, MSG - mova128 MSG, MSGTMP2 - paddd 2*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP2, MSGTMP1 - - /* Rounds 12-15 */ - movu128 3*16(DATA_PTR), MSG - pshufb XMMTMP, MSG -/* ...to here */ - mova128 MSG, MSGTMP3 - paddd 3*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP3, XMMTMP - palignr $4, MSGTMP2, XMMTMP - paddd XMMTMP, MSGTMP0 - sha256msg2 MSGTMP3, MSGTMP0 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP3, MSGTMP2 - - /* Rounds 16-19 */ - mova128 MSGTMP0, MSG - paddd 4*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP0, XMMTMP - palignr $4, MSGTMP3, XMMTMP - paddd XMMTMP, MSGTMP1 - sha256msg2 MSGTMP0, MSGTMP1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP0, MSGTMP3 - - /* Rounds 20-23 */ - mova128 MSGTMP1, MSG - paddd 5*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP1, XMMTMP - palignr $4, MSGTMP0, XMMTMP - paddd XMMTMP, MSGTMP2 - sha256msg2 MSGTMP1, MSGTMP2 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP1, MSGTMP0 - - /* Rounds 24-27 */ - mova128 MSGTMP2, MSG - paddd 6*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP2, XMMTMP - palignr $4, MSGTMP1, XMMTMP - paddd XMMTMP, MSGTMP3 - sha256msg2 MSGTMP2, MSGTMP3 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP2, MSGTMP1 - - /* Rounds 28-31 */ - mova128 MSGTMP3, MSG - paddd 7*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP3, XMMTMP - palignr $4, MSGTMP2, XMMTMP - paddd XMMTMP, MSGTMP0 - sha256msg2 MSGTMP3, MSGTMP0 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP3, MSGTMP2 - - /* Rounds 32-35 */ - mova128 MSGTMP0, MSG - paddd 8*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP0, XMMTMP - palignr $4, MSGTMP3, XMMTMP - paddd XMMTMP, MSGTMP1 - sha256msg2 MSGTMP0, MSGTMP1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP0, MSGTMP3 - - /* Rounds 36-39 */ - mova128 MSGTMP1, MSG - paddd 9*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP1, XMMTMP - palignr $4, MSGTMP0, XMMTMP - paddd XMMTMP, MSGTMP2 - sha256msg2 MSGTMP1, MSGTMP2 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP1, MSGTMP0 - - /* Rounds 40-43 */ - mova128 MSGTMP2, MSG - paddd 10*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP2, XMMTMP - palignr $4, MSGTMP1, XMMTMP - paddd XMMTMP, MSGTMP3 - sha256msg2 MSGTMP2, MSGTMP3 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP2, MSGTMP1 - - /* Rounds 44-47 */ - mova128 MSGTMP3, MSG - paddd 
11*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP3, XMMTMP - palignr $4, MSGTMP2, XMMTMP - paddd XMMTMP, MSGTMP0 - sha256msg2 MSGTMP3, MSGTMP0 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP3, MSGTMP2 - - /* Rounds 48-51 */ - mova128 MSGTMP0, MSG - paddd 12*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP0, XMMTMP - palignr $4, MSGTMP3, XMMTMP - paddd XMMTMP, MSGTMP1 - sha256msg2 MSGTMP0, MSGTMP1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP0, MSGTMP3 - - /* Rounds 52-55 */ - mova128 MSGTMP1, MSG - paddd 13*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP1, XMMTMP - palignr $4, MSGTMP0, XMMTMP - paddd XMMTMP, MSGTMP2 - sha256msg2 MSGTMP1, MSGTMP2 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - - /* Rounds 56-59 */ - mova128 MSGTMP2, MSG - paddd 14*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP2, XMMTMP - palignr $4, MSGTMP1, XMMTMP - paddd XMMTMP, MSGTMP3 - sha256msg2 MSGTMP2, MSGTMP3 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - - /* Rounds 60-63 */ - mova128 MSGTMP3, MSG - paddd 15*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - - /* Write hash values back in the correct order */ - mova128 STATE0, XMMTMP -/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ - /* --- -------------- HGDC -- FEBA */ - shufps SHUF(3,2,3,2), STATE1, STATE0 /* ABCD */ - shufps SHUF(1,0,1,0), STATE1, XMMTMP /* EFGH */ - /* add current hash values to previous ones */ - movu128 76+1*16(%eax), STATE1 - paddd XMMTMP, STATE1 - movu128 STATE1, 76+1*16(%eax) - movu128 76+0*16(%eax), XMMTMP - paddd XMMTMP, STATE0 - movu128 STATE0, 76+0*16(%eax) - - ret - .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI - - .section .rodata.cst256.K256, "aM", @progbits, 256 - .balign 16 -K256: - .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - - .section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16 - .balign 16 -PSHUFFLE_BSWAP32_FLIP_MASK: - .octa 0x0c0d0e0f08090a0b0405060700010203 - -#endif diff --git a/libbb/hash_md5_sha256_x86-64_shaNI.S b/libbb/hash_md5_sha256_x86-64_shaNI.S deleted file mode 100644 index 172c2eae2..000000000 --- a/libbb/hash_md5_sha256_x86-64_shaNI.S +++ /dev/null @@ -1,290 +0,0 @@ -#if ENABLE_SHA256_HWACCEL && defined(__GNUC__) && defined(__x86_64__) -/* The code is adapted from Linux kernel's source */ - -// We use shorter insns, even though they are for "wrong" -// data type (fp, not int). 
-// For Intel, there is no penalty for doing it at all -// (CPUs which do have such penalty do not support SHA insns). -// For AMD, the penalty is one extra cycle -// (allegedly: I failed to find measurable difference). - -//#define mova128 movdqa -#define mova128 movaps -//#define movu128 movdqu -#define movu128 movups -//#define shuf128_32 pshufd -#define shuf128_32 shufps - -// pshufb and palignr are SSSE3 insns. -// We do not check SSSE3 in cpuid, -// all SHA-capable CPUs support it as well. - -#ifdef __linux__ - .section .note.GNU-stack, "", @progbits -#endif - .section .text.sha256_process_block64_shaNI, "ax", @progbits - .globl sha256_process_block64_shaNI - .hidden sha256_process_block64_shaNI - .type sha256_process_block64_shaNI, @function - -#define DATA_PTR %rdi - -#define SHA256CONSTANTS %rax - -#define MSG %xmm0 -#define STATE0 %xmm1 -#define STATE1 %xmm2 -#define MSGTMP0 %xmm3 -#define MSGTMP1 %xmm4 -#define MSGTMP2 %xmm5 -#define MSGTMP3 %xmm6 - -#define XMMTMP %xmm7 - -#define SAVE0 %xmm8 -#define SAVE1 %xmm9 - -#define SHUF(a,b,c,d) $(a+(b<<2)+(c<<4)+(d<<6)) - - .balign 8 # allow decoders to fetch at least 2 first insns -sha256_process_block64_shaNI: - - movu128 80+0*16(%rdi), XMMTMP /* ABCD (little-endian dword order) */ - movu128 80+1*16(%rdi), STATE1 /* EFGH */ -/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ - mova128 STATE1, STATE0 - /* --- -------------- ABCD -- EFGH */ - shufps SHUF(1,0,1,0), XMMTMP, STATE0 /* FEBA */ - shufps SHUF(3,2,3,2), XMMTMP, STATE1 /* HGDC */ - -/* XMMTMP holds flip mask from here... */ - mova128 PSHUFFLE_BSWAP32_FLIP_MASK(%rip), XMMTMP - leaq K256+8*16(%rip), SHA256CONSTANTS - - /* Save hash values for addition after rounds */ - mova128 STATE0, SAVE0 - mova128 STATE1, SAVE1 - - /* Rounds 0-3 */ - movu128 0*16(DATA_PTR), MSG - pshufb XMMTMP, MSG - mova128 MSG, MSGTMP0 - paddd 0*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - - /* Rounds 4-7 */ - movu128 1*16(DATA_PTR), MSG - pshufb XMMTMP, MSG - mova128 MSG, MSGTMP1 - paddd 1*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP1, MSGTMP0 - - /* Rounds 8-11 */ - movu128 2*16(DATA_PTR), MSG - pshufb XMMTMP, MSG - mova128 MSG, MSGTMP2 - paddd 2*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP2, MSGTMP1 - - /* Rounds 12-15 */ - movu128 3*16(DATA_PTR), MSG - pshufb XMMTMP, MSG -/* ...to here */ - mova128 MSG, MSGTMP3 - paddd 3*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP3, XMMTMP - palignr $4, MSGTMP2, XMMTMP - paddd XMMTMP, MSGTMP0 - sha256msg2 MSGTMP3, MSGTMP0 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP3, MSGTMP2 - - /* Rounds 16-19 */ - mova128 MSGTMP0, MSG - paddd 4*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP0, XMMTMP - palignr $4, MSGTMP3, XMMTMP - paddd XMMTMP, MSGTMP1 - sha256msg2 MSGTMP0, MSGTMP1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP0, MSGTMP3 - - /* Rounds 20-23 */ - mova128 MSGTMP1, MSG - paddd 5*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP1, XMMTMP - palignr $4, MSGTMP0, XMMTMP - paddd XMMTMP, MSGTMP2 - sha256msg2 MSGTMP1, MSGTMP2 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - 
sha256msg1 MSGTMP1, MSGTMP0 - - /* Rounds 24-27 */ - mova128 MSGTMP2, MSG - paddd 6*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP2, XMMTMP - palignr $4, MSGTMP1, XMMTMP - paddd XMMTMP, MSGTMP3 - sha256msg2 MSGTMP2, MSGTMP3 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP2, MSGTMP1 - - /* Rounds 28-31 */ - mova128 MSGTMP3, MSG - paddd 7*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP3, XMMTMP - palignr $4, MSGTMP2, XMMTMP - paddd XMMTMP, MSGTMP0 - sha256msg2 MSGTMP3, MSGTMP0 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP3, MSGTMP2 - - /* Rounds 32-35 */ - mova128 MSGTMP0, MSG - paddd 8*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP0, XMMTMP - palignr $4, MSGTMP3, XMMTMP - paddd XMMTMP, MSGTMP1 - sha256msg2 MSGTMP0, MSGTMP1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP0, MSGTMP3 - - /* Rounds 36-39 */ - mova128 MSGTMP1, MSG - paddd 9*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP1, XMMTMP - palignr $4, MSGTMP0, XMMTMP - paddd XMMTMP, MSGTMP2 - sha256msg2 MSGTMP1, MSGTMP2 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP1, MSGTMP0 - - /* Rounds 40-43 */ - mova128 MSGTMP2, MSG - paddd 10*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP2, XMMTMP - palignr $4, MSGTMP1, XMMTMP - paddd XMMTMP, MSGTMP3 - sha256msg2 MSGTMP2, MSGTMP3 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP2, MSGTMP1 - - /* Rounds 44-47 */ - mova128 MSGTMP3, MSG - paddd 11*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP3, XMMTMP - palignr $4, MSGTMP2, XMMTMP - paddd XMMTMP, MSGTMP0 - sha256msg2 MSGTMP3, MSGTMP0 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP3, MSGTMP2 - - /* Rounds 48-51 */ - mova128 MSGTMP0, MSG - paddd 12*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP0, XMMTMP - palignr $4, MSGTMP3, XMMTMP - paddd XMMTMP, MSGTMP1 - sha256msg2 MSGTMP0, MSGTMP1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP0, MSGTMP3 - - /* Rounds 52-55 */ - mova128 MSGTMP1, MSG - paddd 13*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP1, XMMTMP - palignr $4, MSGTMP0, XMMTMP - paddd XMMTMP, MSGTMP2 - sha256msg2 MSGTMP1, MSGTMP2 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - - /* Rounds 56-59 */ - mova128 MSGTMP2, MSG - paddd 14*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP2, XMMTMP - palignr $4, MSGTMP1, XMMTMP - paddd XMMTMP, MSGTMP3 - sha256msg2 MSGTMP2, MSGTMP3 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - - /* Rounds 60-63 */ - mova128 MSGTMP3, MSG - paddd 15*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - - /* Add current hash values with previously saved */ - paddd SAVE0, STATE0 - paddd SAVE1, STATE1 - - /* Write hash values back in the correct order */ - mova128 STATE0, XMMTMP -/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ - /* --- -------------- HGDC -- FEBA */ - shufps SHUF(3,2,3,2), STATE1, STATE0 /* ABCD */ - shufps SHUF(1,0,1,0), STATE1, XMMTMP /* EFGH */ - movu128 STATE0, 80+0*16(%rdi) - movu128 XMMTMP, 80+1*16(%rdi) - - ret - .size 
sha256_process_block64_shaNI, .-sha256_process_block64_shaNI - - .section .rodata.cst256.K256, "aM", @progbits, 256 - .balign 16 -K256: - .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - - .section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16 - .balign 16 -PSHUFFLE_BSWAP32_FLIP_MASK: - .octa 0x0c0d0e0f08090a0b0405060700010203 - -#endif diff --git a/libbb/hash_md5_sha_x86-32_shaNI.S b/libbb/hash_md5_sha_x86-32_shaNI.S deleted file mode 100644 index 7455a29f0..000000000 --- a/libbb/hash_md5_sha_x86-32_shaNI.S +++ /dev/null @@ -1,234 +0,0 @@ -#if ENABLE_SHA1_HWACCEL && defined(__GNUC__) && defined(__i386__) -/* The code is adapted from Linux kernel's source */ - -// We use shorter insns, even though they are for "wrong" -// data type (fp, not int). -// For Intel, there is no penalty for doing it at all -// (CPUs which do have such penalty do not support SHA insns). -// For AMD, the penalty is one extra cycle -// (allegedly: I failed to find measurable difference). - -//#define mova128 movdqa -#define mova128 movaps -//#define movu128 movdqu -#define movu128 movups -//#define xor128 pxor -#define xor128 xorps -//#define shuf128_32 pshufd -#define shuf128_32 shufps - -#define extr128_32 pextrd -//#define extr128_32 extractps # not shorter - -// pshufb is a SSSE3 insn. -// pinsrd, pextrd, extractps are SSE4.1 insns. -// We do not check SSSE3/SSE4.1 in cpuid, -// all SHA-capable CPUs support them as well. 
- -#ifdef __linux__ - .section .note.GNU-stack, "", @progbits -#endif - .section .text.sha1_process_block64_shaNI, "ax", @progbits - .globl sha1_process_block64_shaNI - .hidden sha1_process_block64_shaNI - .type sha1_process_block64_shaNI, @function - -#define ABCD %xmm0 -#define E0 %xmm1 /* Need two E's b/c they ping pong */ -#define E1 %xmm2 -#define MSG0 %xmm3 -#define MSG1 %xmm4 -#define MSG2 %xmm5 -#define MSG3 %xmm6 - - .balign 8 # allow decoders to fetch at least 2 first insns -sha1_process_block64_shaNI: - /* load initial hash values */ - movu128 76(%eax), ABCD - xor128 E0, E0 - pinsrd $3, 76+4*4(%eax), E0 # load to uppermost 32-bit word - shuf128_32 $0x1B, ABCD, ABCD # DCBA -> ABCD - - mova128 PSHUFFLE_BYTE_FLIP_MASK, %xmm7 - - movu128 0*16(%eax), MSG0 - pshufb %xmm7, MSG0 - movu128 1*16(%eax), MSG1 - pshufb %xmm7, MSG1 - movu128 2*16(%eax), MSG2 - pshufb %xmm7, MSG2 - movu128 3*16(%eax), MSG3 - pshufb %xmm7, MSG3 - - /* Save hash values for addition after rounds */ - mova128 E0, %xmm7 - /*mova128 ABCD, %xmm8 - NOPE, 32bit has no xmm8 */ - - /* Rounds 0-3 */ - paddd MSG0, E0 - mova128 ABCD, E1 - sha1rnds4 $0, E0, ABCD - - /* Rounds 4-7 */ - sha1nexte MSG1, E1 - mova128 ABCD, E0 - sha1rnds4 $0, E1, ABCD - sha1msg1 MSG1, MSG0 - - /* Rounds 8-11 */ - sha1nexte MSG2, E0 - mova128 ABCD, E1 - sha1rnds4 $0, E0, ABCD - sha1msg1 MSG2, MSG1 - xor128 MSG2, MSG0 - - /* Rounds 12-15 */ - sha1nexte MSG3, E1 - mova128 ABCD, E0 - sha1msg2 MSG3, MSG0 - sha1rnds4 $0, E1, ABCD - sha1msg1 MSG3, MSG2 - xor128 MSG3, MSG1 - - /* Rounds 16-19 */ - sha1nexte MSG0, E0 - mova128 ABCD, E1 - sha1msg2 MSG0, MSG1 - sha1rnds4 $0, E0, ABCD - sha1msg1 MSG0, MSG3 - xor128 MSG0, MSG2 - - /* Rounds 20-23 */ - sha1nexte MSG1, E1 - mova128 ABCD, E0 - sha1msg2 MSG1, MSG2 - sha1rnds4 $1, E1, ABCD - sha1msg1 MSG1, MSG0 - xor128 MSG1, MSG3 - - /* Rounds 24-27 */ - sha1nexte MSG2, E0 - mova128 ABCD, E1 - sha1msg2 MSG2, MSG3 - sha1rnds4 $1, E0, ABCD - sha1msg1 MSG2, MSG1 - xor128 MSG2, MSG0 - - /* Rounds 28-31 */ - sha1nexte MSG3, E1 - mova128 ABCD, E0 - sha1msg2 MSG3, MSG0 - sha1rnds4 $1, E1, ABCD - sha1msg1 MSG3, MSG2 - xor128 MSG3, MSG1 - - /* Rounds 32-35 */ - sha1nexte MSG0, E0 - mova128 ABCD, E1 - sha1msg2 MSG0, MSG1 - sha1rnds4 $1, E0, ABCD - sha1msg1 MSG0, MSG3 - xor128 MSG0, MSG2 - - /* Rounds 36-39 */ - sha1nexte MSG1, E1 - mova128 ABCD, E0 - sha1msg2 MSG1, MSG2 - sha1rnds4 $1, E1, ABCD - sha1msg1 MSG1, MSG0 - xor128 MSG1, MSG3 - - /* Rounds 40-43 */ - sha1nexte MSG2, E0 - mova128 ABCD, E1 - sha1msg2 MSG2, MSG3 - sha1rnds4 $2, E0, ABCD - sha1msg1 MSG2, MSG1 - xor128 MSG2, MSG0 - - /* Rounds 44-47 */ - sha1nexte MSG3, E1 - mova128 ABCD, E0 - sha1msg2 MSG3, MSG0 - sha1rnds4 $2, E1, ABCD - sha1msg1 MSG3, MSG2 - xor128 MSG3, MSG1 - - /* Rounds 48-51 */ - sha1nexte MSG0, E0 - mova128 ABCD, E1 - sha1msg2 MSG0, MSG1 - sha1rnds4 $2, E0, ABCD - sha1msg1 MSG0, MSG3 - xor128 MSG0, MSG2 - - /* Rounds 52-55 */ - sha1nexte MSG1, E1 - mova128 ABCD, E0 - sha1msg2 MSG1, MSG2 - sha1rnds4 $2, E1, ABCD - sha1msg1 MSG1, MSG0 - xor128 MSG1, MSG3 - - /* Rounds 56-59 */ - sha1nexte MSG2, E0 - mova128 ABCD, E1 - sha1msg2 MSG2, MSG3 - sha1rnds4 $2, E0, ABCD - sha1msg1 MSG2, MSG1 - xor128 MSG2, MSG0 - - /* Rounds 60-63 */ - sha1nexte MSG3, E1 - mova128 ABCD, E0 - sha1msg2 MSG3, MSG0 - sha1rnds4 $3, E1, ABCD - sha1msg1 MSG3, MSG2 - xor128 MSG3, MSG1 - - /* Rounds 64-67 */ - sha1nexte MSG0, E0 - mova128 ABCD, E1 - sha1msg2 MSG0, MSG1 - sha1rnds4 $3, E0, ABCD - sha1msg1 MSG0, MSG3 - xor128 MSG0, MSG2 - - /* Rounds 68-71 */ - sha1nexte MSG1, E1 
- mova128 ABCD, E0 - sha1msg2 MSG1, MSG2 - sha1rnds4 $3, E1, ABCD - xor128 MSG1, MSG3 - - /* Rounds 72-75 */ - sha1nexte MSG2, E0 - mova128 ABCD, E1 - sha1msg2 MSG2, MSG3 - sha1rnds4 $3, E0, ABCD - - /* Rounds 76-79 */ - sha1nexte MSG3, E1 - mova128 ABCD, E0 - sha1rnds4 $3, E1, ABCD - - /* Add current hash values with previously saved */ - sha1nexte %xmm7, E0 - /*paddd %xmm8, ABCD - 32-bit mode has no xmm8 */ - movu128 76(%eax), %xmm7 # get original ABCD (not shuffled)... - - /* Write hash values back in the correct order */ - shuf128_32 $0x1B, ABCD, ABCD - paddd %xmm7, ABCD # ...add it to final ABCD - movu128 ABCD, 76(%eax) - extr128_32 $3, E0, 76+4*4(%eax) - - ret - .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI - - .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 - .balign 16 -PSHUFFLE_BYTE_FLIP_MASK: - .octa 0x000102030405060708090a0b0c0d0e0f - -#endif diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S deleted file mode 100644 index 2cdd22015..000000000 --- a/libbb/hash_md5_sha_x86-64.S +++ /dev/null @@ -1,1489 +0,0 @@ -### Generated by hash_md5_sha_x86-64.S.sh ### - -#if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) -#ifdef __linux__ - .section .note.GNU-stack, "", @progbits -#endif - .section .text.sha1_process_block64, "ax", @progbits - .globl sha1_process_block64 - .hidden sha1_process_block64 - .type sha1_process_block64, @function - - .balign 8 # allow decoders to fetch at least 5 first insns -sha1_process_block64: - pushq %rbp # 1 byte insn - pushq %rbx # 1 byte insn -# pushq %r15 # 2 byte insn - pushq %r14 # 2 byte insn - pushq %r13 # 2 byte insn - pushq %r12 # 2 byte insn - pushq %rdi # we need ctx at the end - -#Register and stack use: -# eax..edx: a..d -# ebp: e -# esi,edi,r8..r14: temps -# r15: unused -# xmm0..xmm3: W[] -# xmm4,xmm5: temps -# xmm6: current round constant -# xmm7: all round constants -# -64(%rsp): area for passing RCONST + W[] from vector to integer units - - movl 80(%rdi), %eax # a = ctx->hash[0] - movl 84(%rdi), %ebx # b = ctx->hash[1] - movl 88(%rdi), %ecx # c = ctx->hash[2] - movl 92(%rdi), %edx # d = ctx->hash[3] - movl 96(%rdi), %ebp # e = ctx->hash[4] - - movaps sha1const(%rip), %xmm7 - pshufd $0x00, %xmm7, %xmm6 - - # Load W[] to xmm0..3, byteswapping on the fly. - # - # For iterations 0..15, we pass W[] in rsi,r8..r14 - # for use in RD1As instead of spilling them to stack. - # We lose parallelized addition of RCONST, but LEA - # can do two additions at once, so it is probably a wash. - # (We use rsi instead of rN because this makes two - # LEAs in two first RD1As shorter by one byte). 
- movq 4*0(%rdi), %rsi - movq 4*2(%rdi), %r8 - bswapq %rsi - bswapq %r8 - rolq $32, %rsi # rsi = W[1]:W[0] - rolq $32, %r8 # r8 = W[3]:W[2] - movq %rsi, %xmm0 - movq %r8, %xmm4 - punpcklqdq %xmm4, %xmm0 # xmm0 = r8:rsi = (W[0],W[1],W[2],W[3]) -# movaps %xmm0, %xmm4 # add RCONST, spill to stack -# paddd %xmm6, %xmm4 -# movups %xmm4, -64+16*0(%rsp) - - movq 4*4(%rdi), %r9 - movq 4*6(%rdi), %r10 - bswapq %r9 - bswapq %r10 - rolq $32, %r9 # r9 = W[5]:W[4] - rolq $32, %r10 # r10 = W[7]:W[6] - movq %r9, %xmm1 - movq %r10, %xmm4 - punpcklqdq %xmm4, %xmm1 # xmm1 = r10:r9 = (W[4],W[5],W[6],W[7]) - - movq 4*8(%rdi), %r11 - movq 4*10(%rdi), %r12 - bswapq %r11 - bswapq %r12 - rolq $32, %r11 # r11 = W[9]:W[8] - rolq $32, %r12 # r12 = W[11]:W[10] - movq %r11, %xmm2 - movq %r12, %xmm4 - punpcklqdq %xmm4, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11]) - - movq 4*12(%rdi), %r13 - movq 4*14(%rdi), %r14 - bswapq %r13 - bswapq %r14 - rolq $32, %r13 # r13 = W[13]:W[12] - rolq $32, %r14 # r14 = W[15]:W[14] - movq %r13, %xmm3 - movq %r14, %xmm4 - punpcklqdq %xmm4, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15]) - -# 0 - leal 0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n] - shrq $32, %rsi - movl %ecx, %edi # c - xorl %edx, %edi # ^d - andl %ebx, %edi # &b - xorl %edx, %edi # (((c ^ d) & b) ^ d) - addl %edi, %ebp # e += (((c ^ d) & b) ^ d) - movl %eax, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 1 - leal 0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n] - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - andl %eax, %edi # &b - xorl %ecx, %edi # (((c ^ d) & b) ^ d) - addl %edi, %edx # e += (((c ^ d) & b) ^ d) - movl %ebp, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 2 - leal 0x5A827999(%rcx,%r8), %ecx # e += RCONST + W[n] - shrq $32, %r8 - movl %eax, %edi # c - xorl %ebx, %edi # ^d - andl %ebp, %edi # &b - xorl %ebx, %edi # (((c ^ d) & b) ^ d) - addl %edi, %ecx # e += (((c ^ d) & b) ^ d) - movl %edx, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 3 - leal 0x5A827999(%rbx,%r8), %ebx # e += RCONST + W[n] - movl %ebp, %edi # c - xorl %eax, %edi # ^d - andl %edx, %edi # &b - xorl %eax, %edi # (((c ^ d) & b) ^ d) - addl %edi, %ebx # e += (((c ^ d) & b) ^ d) - movl %ecx, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 4 - leal 0x5A827999(%rax,%r9), %eax # e += RCONST + W[n] - shrq $32, %r9 - movl %edx, %edi # c - xorl %ebp, %edi # ^d - andl %ecx, %edi # &b - xorl %ebp, %edi # (((c ^ d) & b) ^ d) - addl %edi, %eax # e += (((c ^ d) & b) ^ d) - movl %ebx, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 5 - leal 0x5A827999(%rbp,%r9), %ebp # e += RCONST + W[n] - movl %ecx, %edi # c - xorl %edx, %edi # ^d - andl %ebx, %edi # &b - xorl %edx, %edi # (((c ^ d) & b) ^ d) - addl %edi, %ebp # e += (((c ^ d) & b) ^ d) - movl %eax, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 6 - leal 0x5A827999(%rdx,%r10), %edx # e += RCONST + W[n] - shrq $32, %r10 - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - andl %eax, %edi # &b - xorl %ecx, %edi # (((c ^ d) & b) ^ d) - addl %edi, %edx # e += (((c ^ d) & b) ^ d) - movl %ebp, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 7 - leal 
0x5A827999(%rcx,%r10), %ecx # e += RCONST + W[n] - movl %eax, %edi # c - xorl %ebx, %edi # ^d - andl %ebp, %edi # &b - xorl %ebx, %edi # (((c ^ d) & b) ^ d) - addl %edi, %ecx # e += (((c ^ d) & b) ^ d) - movl %edx, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) - movaps %xmm3, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! - movaps %xmm0, %xmm5 - shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm0 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm0, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm0, %xmm0 # shift left by 1 - psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm0, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*0(%rsp) -# 8 - leal 0x5A827999(%rbx,%r11), %ebx # e += RCONST + W[n] - shrq $32, %r11 - movl %ebp, %edi # c - xorl %eax, %edi # ^d - andl %edx, %edi # &b - xorl %eax, %edi # (((c ^ d) & b) ^ d) - addl %edi, %ebx # e += (((c ^ d) & b) ^ d) - movl %ecx, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 9 - leal 0x5A827999(%rax,%r11), %eax # e += RCONST + W[n] - movl %edx, %edi # c - xorl %ebp, %edi # ^d - andl %ecx, %edi # &b - xorl %ebp, %edi # (((c ^ d) & b) ^ d) - addl %edi, %eax # e += (((c ^ d) & b) ^ d) - movl %ebx, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 10 - leal 0x5A827999(%rbp,%r12), %ebp # e += RCONST + W[n] - shrq $32, %r12 - movl %ecx, %edi # c - xorl %edx, %edi # ^d - andl %ebx, %edi # &b - xorl %edx, %edi # (((c ^ d) & b) ^ d) - addl %edi, %ebp # e += (((c ^ d) & b) ^ d) - movl %eax, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 11 - leal 0x5A827999(%rdx,%r12), %edx # e += RCONST + W[n] - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - andl %eax, %edi # &b - xorl %ecx, %edi # (((c ^ d) & b) ^ d) - addl %edi, %edx # e += (((c ^ d) & b) ^ d) - movl %ebp, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) - pshufd $0x55, %xmm7, %xmm6 -# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) - movaps %xmm0, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: 
-# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! - movaps %xmm1, %xmm5 - shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm1 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm1, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm1, %xmm1 # shift left by 1 - psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm1, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*1(%rsp) -# 12 - leal 0x5A827999(%rcx,%r13), %ecx # e += RCONST + W[n] - shrq $32, %r13 - movl %eax, %edi # c - xorl %ebx, %edi # ^d - andl %ebp, %edi # &b - xorl %ebx, %edi # (((c ^ d) & b) ^ d) - addl %edi, %ecx # e += (((c ^ d) & b) ^ d) - movl %edx, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 13 - leal 0x5A827999(%rbx,%r13), %ebx # e += RCONST + W[n] - movl %ebp, %edi # c - xorl %eax, %edi # ^d - andl %edx, %edi # &b - xorl %eax, %edi # (((c ^ d) & b) ^ d) - addl %edi, %ebx # e += (((c ^ d) & b) ^ d) - movl %ecx, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 14 - leal 0x5A827999(%rax,%r14), %eax # e += RCONST + W[n] - shrq $32, %r14 - movl %edx, %edi # c - xorl %ebp, %edi # ^d - andl %ecx, %edi # &b - xorl %ebp, %edi # (((c ^ d) & b) ^ d) - addl %edi, %eax # e += (((c ^ d) & b) ^ d) - movl %ebx, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 15 - leal 0x5A827999(%rbp,%r14), %ebp # e += RCONST + W[n] - movl %ecx, %edi # c - xorl %edx, %edi # ^d - andl %ebx, %edi # &b - xorl %edx, %edi # (((c ^ d) & b) ^ d) - addl %edi, %ebp # e += (((c ^ d) & b) ^ d) - movl %eax, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) - movaps %xmm1, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm2, %xmm5 - shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm2 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm2, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm2, %xmm2 # shift left by 1 - psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm2, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*2(%rsp) -# 16 - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - andl %eax, %edi # &b - xorl %ecx, %edi # (((c ^ d) & b) ^ d) - addl -64+4*0(%rsp), %edx # e += RCONST + W[n & 15] - addl %edi, %edx # e += (((c ^ d) & b) ^ d) - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 17 - movl %eax, %edi # c - xorl %ebx, %edi # ^d - andl %ebp, %edi # &b - xorl %ebx, %edi # (((c ^ d) & b) ^ d) - addl -64+4*1(%rsp), %ecx # e += RCONST + W[n & 15] - addl %edi, %ecx # e += (((c ^ d) & b) ^ d) - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 18 - movl %ebp, %edi # c - xorl %eax, %edi # ^d - andl %edx, %edi # &b - xorl %eax, %edi # (((c ^ d) & b) ^ d) - addl -64+4*2(%rsp), %ebx # e += RCONST + W[n & 15] - addl %edi, %ebx # e += (((c ^ d) & b) ^ d) - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 19 - movl %edx, %edi # c - xorl %ebp, %edi # ^d - andl %ecx, %edi # &b - xorl %ebp, %edi # (((c ^ d) & b) ^ d) - addl -64+4*3(%rsp), %eax # e += RCONST + W[n & 15] - addl %edi, %eax # e += (((c ^ d) & b) ^ d) - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) - movaps %xmm2, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm3, %xmm5 - shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm3 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm3, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm3, %xmm3 # shift left by 1 - psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm3, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*3(%rsp) -# 20 - movl %ecx, %edi # c - xorl %edx, %edi # ^d - xorl %ebx, %edi # ^b - addl -64+4*4(%rsp), %ebp # e += RCONST + W[n & 15] - addl %edi, %ebp # e += (c ^ d ^ b) - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 21 - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - xorl %eax, %edi # ^b - addl -64+4*5(%rsp), %edx # e += RCONST + W[n & 15] - addl %edi, %edx # e += (c ^ d ^ b) - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 22 - movl %eax, %edi # c - xorl %ebx, %edi # ^d - xorl %ebp, %edi # ^b - addl -64+4*6(%rsp), %ecx # e += RCONST + W[n & 15] - addl %edi, %ecx # e += (c ^ d ^ b) - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 23 - movl %ebp, %edi # c - xorl %eax, %edi # ^d - xorl %edx, %edi # ^b - addl -64+4*7(%rsp), %ebx # e += RCONST + W[n & 15] - addl %edi, %ebx # e += (c ^ d ^ b) - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) - movaps %xmm3, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm0, %xmm5 - shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm0 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm0, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm0, %xmm0 # shift left by 1 - psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm0, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*0(%rsp) -# 24 - movl %edx, %edi # c - xorl %ebp, %edi # ^d - xorl %ecx, %edi # ^b - addl -64+4*8(%rsp), %eax # e += RCONST + W[n & 15] - addl %edi, %eax # e += (c ^ d ^ b) - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 25 - movl %ecx, %edi # c - xorl %edx, %edi # ^d - xorl %ebx, %edi # ^b - addl -64+4*9(%rsp), %ebp # e += RCONST + W[n & 15] - addl %edi, %ebp # e += (c ^ d ^ b) - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 26 - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - xorl %eax, %edi # ^b - addl -64+4*10(%rsp), %edx # e += RCONST + W[n & 15] - addl %edi, %edx # e += (c ^ d ^ b) - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 27 - movl %eax, %edi # c - xorl %ebx, %edi # ^d - xorl %ebp, %edi # ^b - addl -64+4*11(%rsp), %ecx # e += RCONST + W[n & 15] - addl %edi, %ecx # e += (c ^ d ^ b) - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) - movaps %xmm0, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm1, %xmm5 - shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm1 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm1, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm1, %xmm1 # shift left by 1 - psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm1, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*1(%rsp) -# 28 - movl %ebp, %edi # c - xorl %eax, %edi # ^d - xorl %edx, %edi # ^b - addl -64+4*12(%rsp), %ebx # e += RCONST + W[n & 15] - addl %edi, %ebx # e += (c ^ d ^ b) - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 29 - movl %edx, %edi # c - xorl %ebp, %edi # ^d - xorl %ecx, %edi # ^b - addl -64+4*13(%rsp), %eax # e += RCONST + W[n & 15] - addl %edi, %eax # e += (c ^ d ^ b) - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 30 - movl %ecx, %edi # c - xorl %edx, %edi # ^d - xorl %ebx, %edi # ^b - addl -64+4*14(%rsp), %ebp # e += RCONST + W[n & 15] - addl %edi, %ebp # e += (c ^ d ^ b) - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 31 - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - xorl %eax, %edi # ^b - addl -64+4*15(%rsp), %edx # e += RCONST + W[n & 15] - addl %edi, %edx # e += (c ^ d ^ b) - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) - pshufd $0xaa, %xmm7, %xmm6 -# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) - movaps %xmm1, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm2, %xmm5 - shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm2 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm2, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm2, %xmm2 # shift left by 1 - psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm2, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*2(%rsp) -# 32 - movl %eax, %edi # c - xorl %ebx, %edi # ^d - xorl %ebp, %edi # ^b - addl -64+4*0(%rsp), %ecx # e += RCONST + W[n & 15] - addl %edi, %ecx # e += (c ^ d ^ b) - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 33 - movl %ebp, %edi # c - xorl %eax, %edi # ^d - xorl %edx, %edi # ^b - addl -64+4*1(%rsp), %ebx # e += RCONST + W[n & 15] - addl %edi, %ebx # e += (c ^ d ^ b) - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 34 - movl %edx, %edi # c - xorl %ebp, %edi # ^d - xorl %ecx, %edi # ^b - addl -64+4*2(%rsp), %eax # e += RCONST + W[n & 15] - addl %edi, %eax # e += (c ^ d ^ b) - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 35 - movl %ecx, %edi # c - xorl %edx, %edi # ^d - xorl %ebx, %edi # ^b - addl -64+4*3(%rsp), %ebp # e += RCONST + W[n & 15] - addl %edi, %ebp # e += (c ^ d ^ b) - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) - movaps %xmm2, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm3, %xmm5 - shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm3 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm3, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm3, %xmm3 # shift left by 1 - psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm3, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*3(%rsp) -# 36 - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - xorl %eax, %edi # ^b - addl -64+4*4(%rsp), %edx # e += RCONST + W[n & 15] - addl %edi, %edx # e += (c ^ d ^ b) - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 37 - movl %eax, %edi # c - xorl %ebx, %edi # ^d - xorl %ebp, %edi # ^b - addl -64+4*5(%rsp), %ecx # e += RCONST + W[n & 15] - addl %edi, %ecx # e += (c ^ d ^ b) - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 38 - movl %ebp, %edi # c - xorl %eax, %edi # ^d - xorl %edx, %edi # ^b - addl -64+4*6(%rsp), %ebx # e += RCONST + W[n & 15] - addl %edi, %ebx # e += (c ^ d ^ b) - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 39 - movl %edx, %edi # c - xorl %ebp, %edi # ^d - xorl %ecx, %edi # ^b - addl -64+4*7(%rsp), %eax # e += RCONST + W[n & 15] - addl %edi, %eax # e += (c ^ d ^ b) - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) - movaps %xmm3, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm0, %xmm5 - shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm0 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm0, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm0, %xmm0 # shift left by 1 - psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm0, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*0(%rsp) -# 40 - movl %ebx, %edi # di: b - movl %ebx, %esi # si: b - orl %ecx, %edi # di: b | c - andl %ecx, %esi # si: b & c - andl %edx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ebp # += ((b | c) & d) | (b & c) - addl -64+4*8(%rsp), %ebp # e += RCONST + W[n & 15] - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 41 - movl %eax, %edi # di: b - movl %eax, %esi # si: b - orl %ebx, %edi # di: b | c - andl %ebx, %esi # si: b & c - andl %ecx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %edx # += ((b | c) & d) | (b & c) - addl -64+4*9(%rsp), %edx # e += RCONST + W[n & 15] - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 42 - movl %ebp, %edi # di: b - movl %ebp, %esi # si: b - orl %eax, %edi # di: b | c - andl %eax, %esi # si: b & c - andl %ebx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ecx # += ((b | c) & d) | (b & c) - addl -64+4*10(%rsp), %ecx # e += RCONST + W[n & 15] - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 43 - movl %edx, %edi # di: b - movl %edx, %esi # si: b - orl %ebp, %edi # di: b | c - andl %ebp, %esi # si: b & c - andl %eax, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ebx # += ((b | c) & d) | (b & c) - addl -64+4*11(%rsp), %ebx # e += RCONST + W[n & 15] - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) - movaps %xmm0, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
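The integer rounds interleaved between these PREP blocks all share one update, spelled out by their comments: e += RCONST + W[n] + F(b,c,d) + rotl32(a,5), then b = rotl32(b,30); only F changes across the four 20-round groups, and the generator permutes which register plays a..e instead of moving values around. A scalar C restatement of the three boolean functions seen in the comments (a reference sketch, not code from this patch):

    #include <stdint.h>

    static uint32_t rotl32(uint32_t x, unsigned n) { return (x << n) | (x >> (32 - n)); }

    /* rounds  0..19: "(((c ^ d) & b) ^ d)" -- b chooses between c and d */
    uint32_t f_ch(uint32_t b, uint32_t c, uint32_t d)     { return ((c ^ d) & b) ^ d; }
    /* rounds 20..39 and 60..79: "(c ^ d ^ b)" */
    uint32_t f_parity(uint32_t b, uint32_t c, uint32_t d) { return c ^ d ^ b; }
    /* rounds 40..59: "((b | c) & d) | (b & c)" -- majority of b, c, d */
    uint32_t f_maj(uint32_t b, uint32_t c, uint32_t d)    { return ((b | c) & d) | (b & c); }

    /* One round, as the per-round comments describe it */
    uint32_t sha1_round(uint32_t a, uint32_t *b, uint32_t c, uint32_t d,
                        uint32_t e, uint32_t f, uint32_t rconst, uint32_t w)
    {
        e += rconst + w + f + rotl32(a, 5);
        *b = rotl32(*b, 30);
        return e;
    }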
- movaps %xmm1, %xmm5 - shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm1 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm1, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm1, %xmm1 # shift left by 1 - psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm1, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*1(%rsp) -# 44 - movl %ecx, %edi # di: b - movl %ecx, %esi # si: b - orl %edx, %edi # di: b | c - andl %edx, %esi # si: b & c - andl %ebp, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %eax # += ((b | c) & d) | (b & c) - addl -64+4*12(%rsp), %eax # e += RCONST + W[n & 15] - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 45 - movl %ebx, %edi # di: b - movl %ebx, %esi # si: b - orl %ecx, %edi # di: b | c - andl %ecx, %esi # si: b & c - andl %edx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ebp # += ((b | c) & d) | (b & c) - addl -64+4*13(%rsp), %ebp # e += RCONST + W[n & 15] - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 46 - movl %eax, %edi # di: b - movl %eax, %esi # si: b - orl %ebx, %edi # di: b | c - andl %ebx, %esi # si: b & c - andl %ecx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %edx # += ((b | c) & d) | (b & c) - addl -64+4*14(%rsp), %edx # e += RCONST + W[n & 15] - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 47 - movl %ebp, %edi # di: b - movl %ebp, %esi # si: b - orl %eax, %edi # di: b | c - andl %eax, %esi # si: b & c - andl %ebx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ecx # += ((b | c) & d) | (b & c) - addl -64+4*15(%rsp), %ecx # e += RCONST + W[n & 15] - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) - movaps %xmm1, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm2, %xmm5 - shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm2 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm2, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm2, %xmm2 # shift left by 1 - psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm2, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*2(%rsp) -# 48 - movl %edx, %edi # di: b - movl %edx, %esi # si: b - orl %ebp, %edi # di: b | c - andl %ebp, %esi # si: b & c - andl %eax, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ebx # += ((b | c) & d) | (b & c) - addl -64+4*0(%rsp), %ebx # e += RCONST + W[n & 15] - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 49 - movl %ecx, %edi # di: b - movl %ecx, %esi # si: b - orl %edx, %edi # di: b | c - andl %edx, %esi # si: b & c - andl %ebp, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %eax # += ((b | c) & d) | (b & c) - addl -64+4*1(%rsp), %eax # e += RCONST + W[n & 15] - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 50 - movl %ebx, %edi # di: b - movl %ebx, %esi # si: b - orl %ecx, %edi # di: b | c - andl %ecx, %esi # si: b & c - andl %edx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ebp # += ((b | c) & d) | (b & c) - addl -64+4*2(%rsp), %ebp # e += RCONST + W[n & 15] - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 51 - movl %eax, %edi # di: b - movl %eax, %esi # si: b - orl %ebx, %edi # di: b | c - andl %ebx, %esi # si: b & c - andl %ecx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %edx # += ((b | c) & d) | (b & c) - addl -64+4*3(%rsp), %edx # e += RCONST + W[n & 15] - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) - pshufd $0xff, %xmm7, %xmm6 -# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) - movaps %xmm2, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm3, %xmm5 - shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm3 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm3, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm3, %xmm3 # shift left by 1 - psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm3, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*3(%rsp) -# 52 - movl %ebp, %edi # di: b - movl %ebp, %esi # si: b - orl %eax, %edi # di: b | c - andl %eax, %esi # si: b & c - andl %ebx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ecx # += ((b | c) & d) | (b & c) - addl -64+4*4(%rsp), %ecx # e += RCONST + W[n & 15] - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 53 - movl %edx, %edi # di: b - movl %edx, %esi # si: b - orl %ebp, %edi # di: b | c - andl %ebp, %esi # si: b & c - andl %eax, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ebx # += ((b | c) & d) | (b & c) - addl -64+4*5(%rsp), %ebx # e += RCONST + W[n & 15] - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 54 - movl %ecx, %edi # di: b - movl %ecx, %esi # si: b - orl %edx, %edi # di: b | c - andl %edx, %esi # si: b & c - andl %ebp, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %eax # += ((b | c) & d) | (b & c) - addl -64+4*6(%rsp), %eax # e += RCONST + W[n & 15] - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 55 - movl %ebx, %edi # di: b - movl %ebx, %esi # si: b - orl %ecx, %edi # di: b | c - andl %ecx, %esi # si: b & c - andl %edx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ebp # += ((b | c) & d) | (b & c) - addl -64+4*7(%rsp), %ebp # e += RCONST + W[n & 15] - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) - movaps %xmm3, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm0, %xmm5 - shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm0 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm0, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm0, %xmm0 # shift left by 1 - psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm0, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*0(%rsp) -# 56 - movl %eax, %edi # di: b - movl %eax, %esi # si: b - orl %ebx, %edi # di: b | c - andl %ebx, %esi # si: b & c - andl %ecx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %edx # += ((b | c) & d) | (b & c) - addl -64+4*8(%rsp), %edx # e += RCONST + W[n & 15] - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 57 - movl %ebp, %edi # di: b - movl %ebp, %esi # si: b - orl %eax, %edi # di: b | c - andl %eax, %esi # si: b & c - andl %ebx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ecx # += ((b | c) & d) | (b & c) - addl -64+4*9(%rsp), %ecx # e += RCONST + W[n & 15] - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 58 - movl %edx, %edi # di: b - movl %edx, %esi # si: b - orl %ebp, %edi # di: b | c - andl %ebp, %esi # si: b & c - andl %eax, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ebx # += ((b | c) & d) | (b & c) - addl -64+4*10(%rsp), %ebx # e += RCONST + W[n & 15] - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 59 - movl %ecx, %edi # di: b - movl %ecx, %esi # si: b - orl %edx, %edi # di: b | c - andl %edx, %esi # si: b & c - andl %ebp, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %eax # += ((b | c) & d) | (b & c) - addl -64+4*11(%rsp), %eax # e += RCONST + W[n & 15] - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) - movaps %xmm0, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm1, %xmm5 - shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm1 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm1, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm1, %xmm1 # shift left by 1 - psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm1, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*1(%rsp) -# 60 - movl %ecx, %edi # c - xorl %edx, %edi # ^d - xorl %ebx, %edi # ^b - addl -64+4*12(%rsp), %ebp # e += RCONST + W[n & 15] - addl %edi, %ebp # e += (c ^ d ^ b) - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 61 - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - xorl %eax, %edi # ^b - addl -64+4*13(%rsp), %edx # e += RCONST + W[n & 15] - addl %edi, %edx # e += (c ^ d ^ b) - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 62 - movl %eax, %edi # c - xorl %ebx, %edi # ^d - xorl %ebp, %edi # ^b - addl -64+4*14(%rsp), %ecx # e += RCONST + W[n & 15] - addl %edi, %ecx # e += (c ^ d ^ b) - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 63 - movl %ebp, %edi # c - xorl %eax, %edi # ^d - xorl %edx, %edi # ^b - addl -64+4*15(%rsp), %ebx # e += RCONST + W[n & 15] - addl %edi, %ebx # e += (c ^ d ^ b) - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) - movaps %xmm1, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm2, %xmm5 - shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm2 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm2, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm2, %xmm2 # shift left by 1 - psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm2, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*2(%rsp) -# 64 - movl %edx, %edi # c - xorl %ebp, %edi # ^d - xorl %ecx, %edi # ^b - addl -64+4*0(%rsp), %eax # e += RCONST + W[n & 15] - addl %edi, %eax # e += (c ^ d ^ b) - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 65 - movl %ecx, %edi # c - xorl %edx, %edi # ^d - xorl %ebx, %edi # ^b - addl -64+4*1(%rsp), %ebp # e += RCONST + W[n & 15] - addl %edi, %ebp # e += (c ^ d ^ b) - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 66 - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - xorl %eax, %edi # ^b - addl -64+4*2(%rsp), %edx # e += RCONST + W[n & 15] - addl %edi, %edx # e += (c ^ d ^ b) - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 67 - movl %eax, %edi # c - xorl %ebx, %edi # ^d - xorl %ebp, %edi # ^b - addl -64+4*3(%rsp), %ecx # e += RCONST + W[n & 15] - addl %edi, %ecx # e += (c ^ d ^ b) - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) - movaps %xmm2, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm3, %xmm5 - shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm3 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm3, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm3, %xmm3 # shift left by 1 - psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm3, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*3(%rsp) -# 68 - movl %ebp, %edi # c - xorl %eax, %edi # ^d - xorl %edx, %edi # ^b - addl -64+4*4(%rsp), %ebx # e += RCONST + W[n & 15] - addl %edi, %ebx # e += (c ^ d ^ b) - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 69 - movl %edx, %edi # c - xorl %ebp, %edi # ^d - xorl %ecx, %edi # ^b - addl -64+4*5(%rsp), %eax # e += RCONST + W[n & 15] - addl %edi, %eax # e += (c ^ d ^ b) - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 70 - movl %ecx, %edi # c - xorl %edx, %edi # ^d - xorl %ebx, %edi # ^b - addl -64+4*6(%rsp), %ebp # e += RCONST + W[n & 15] - addl %edi, %ebp # e += (c ^ d ^ b) - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 71 - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - xorl %eax, %edi # ^b - addl -64+4*7(%rsp), %edx # e += RCONST + W[n & 15] - addl %edi, %edx # e += (c ^ d ^ b) - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 72 - movl %eax, %edi # c - xorl %ebx, %edi # ^d - xorl %ebp, %edi # ^b - addl -64+4*8(%rsp), %ecx # e += RCONST + W[n & 15] - addl %edi, %ecx # e += (c ^ d ^ b) - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 73 - movl %ebp, %edi # c - xorl %eax, %edi # ^d - xorl %edx, %edi # ^b - addl -64+4*9(%rsp), %ebx # e += RCONST + W[n & 15] - addl %edi, %ebx # e += (c ^ d ^ b) - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 74 - movl %edx, %edi # c - xorl %ebp, %edi # ^d - xorl %ecx, %edi # ^b - addl -64+4*10(%rsp), %eax # e += RCONST + W[n & 15] - addl %edi, %eax # e += (c ^ d ^ b) - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 75 - movl %ecx, %edi # c - xorl %edx, %edi # ^d - xorl %ebx, %edi # ^b - addl -64+4*11(%rsp), %ebp # e += RCONST + W[n & 15] - addl %edi, %ebp # e += (c ^ d ^ b) - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 76 - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - xorl %eax, %edi # ^b - addl -64+4*12(%rsp), %edx # e += RCONST + W[n & 15] - addl %edi, %edx # e += (c ^ d ^ b) - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl 
$2, %eax # b = rotl32(b,30) -# 77 - movl %eax, %edi # c - xorl %ebx, %edi # ^d - xorl %ebp, %edi # ^b - addl -64+4*13(%rsp), %ecx # e += RCONST + W[n & 15] - addl %edi, %ecx # e += (c ^ d ^ b) - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 78 - movl %ebp, %edi # c - xorl %eax, %edi # ^d - xorl %edx, %edi # ^b - addl -64+4*14(%rsp), %ebx # e += RCONST + W[n & 15] - addl %edi, %ebx # e += (c ^ d ^ b) - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 79 - movl %edx, %edi # c - xorl %ebp, %edi # ^d - xorl %ecx, %edi # ^b - addl -64+4*15(%rsp), %eax # e += RCONST + W[n & 15] - addl %edi, %eax # e += (c ^ d ^ b) - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) - - popq %rdi # - popq %r12 # - addl %eax, 80(%rdi) # ctx->hash[0] += a - popq %r13 # - addl %ebx, 84(%rdi) # ctx->hash[1] += b - popq %r14 # - addl %ecx, 88(%rdi) # ctx->hash[2] += c -# popq %r15 # - addl %edx, 92(%rdi) # ctx->hash[3] += d - popq %rbx # - addl %ebp, 96(%rdi) # ctx->hash[4] += e - popq %rbp # - - ret - .size sha1_process_block64, .-sha1_process_block64 - - .section .rodata.cst16.sha1const, "aM", @progbits, 16 - .balign 16 -sha1const: - .long 0x5A827999 - .long 0x6ED9EBA1 - .long 0x8F1BBCDC - .long 0xCA62C1D6 - -#endif diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh deleted file mode 100755 index 653fe4989..000000000 --- a/libbb/hash_md5_sha_x86-64.S.sh +++ /dev/null @@ -1,478 +0,0 @@ -#!/bin/sh - -# We don't regenerate it on every "make" invocation - only by hand. -# The reason is that the changes to generated code are difficult -# to visualize by looking only at this script, it helps when the commit -# also contains the diff of the generated file. -exec >hash_md5_sha_x86-64.S - -# Based on http://arctic.org/~dean/crypto/sha1.html. -# ("This SHA1 implementation is public domain.") -# -# x86-64 has at least SSE2 vector insns always available. -# We can use them without any CPUID checks (and without a need -# for a fallback code if needed insns are not available). -# This code uses them to calculate W[] ahead of time. -# -# Unfortunately, results are passed from vector unit to -# integer ALUs on the stack. MOVD/Q insns to move them directly -# from vector to integer registers are slower than store-to-load -# forwarding in LSU (on Skylake at least). -# -# The win against a purely integer code is small on Skylake, -# only about 7-8%. We offload about 1/3 of our operations to the vector unit. -# It can do 4 ops at once in one 128-bit register, -# but we have to use x2 of them because of W[0] complication, -# SSE2 has no "rotate each word by N bits" insns, -# moving data to/from vector unit is clunky, and Skylake -# has four integer ALUs unified with three vector ALUs, -# which makes pure integer code rather fast, and makes -# vector ops compete with integer ones. -# -# Zen3, with its separate vector ALUs, wins more, about 12%. - -xmmT1="%xmm4" -xmmT2="%xmm5" -xmmRCONST="%xmm6" -xmmALLRCONST="%xmm7" -T=`printf '\t'` - -# SSE instructions are longer than 4 bytes on average. -# Intel CPUs (up to Tiger Lake at least) can't decode -# more than 16 bytes of code in one cycle. -# By interleaving SSE code and integer code -# we mostly achieve a situation where 16-byte decode fetch window -# contains 4 (or more) insns. -# -# However. 
On Skylake, there was no observed difference, -# but on Zen3, non-interleaved code is ~3% faster -# (822 Mb/s versus 795 Mb/s hashing speed). -# Off for now: -interleave=false - -INTERLEAVE() { - $interleave || \ - { - # Generate non-interleaved code - # (it should work correctly too) - echo "$1" - echo "$2" - return - } - ( - echo "$1" | grep -v '^$' >"$0.temp1" - echo "$2" | grep -v '^$' >"$0.temp2" - exec 3<"$0.temp1" - exec 4<"$0.temp2" - IFS='' - while :; do - line1='' - line2='' - while :; do - read -r line1 <&3 - if test "${line1:0:1}" != "#" && test "${line1:0:2}" != "$T#"; then - break - fi - echo "$line1" - done - while :; do - read -r line2 <&4 - if test "${line2:0:4}" = "${T}lea"; then - # We use 7-8 byte long forms of LEA. - # Do not interleave them with SSE insns - # which are also long. - echo "$line2" - read -r line2 <&4 - echo "$line2" - continue - fi - if test "${line2:0:1}" != "#" && test "${line2:0:2}" != "$T#"; then - break - fi - echo "$line2" - done - test "$line1$line2" || break - echo "$line1" - echo "$line2" - done - rm "$0.temp1" "$0.temp2" - ) -} - -# movaps bswap32_mask(%rip), $xmmT1 -# Load W[] to xmm0..3, byteswapping on the fly. -# For iterations 0..15, we pass RCONST+W[] in rsi,r8..r14 -# for use in RD1As instead of spilling them to stack. -# (We use rsi instead of rN because this makes two -# ADDs in two first RD1As shorter by one byte). -# movups 16*0(%rdi), %xmm0 -# pshufb $xmmT1, %xmm0 #SSSE3 insn -# movaps %xmm0, $xmmT2 -# paddd $xmmRCONST, $xmmT2 -# movq $xmmT2, %rsi -# #pextrq \$1, $xmmT2, %r8 #SSE4.1 insn -# #movhpd $xmmT2, %r8 #can only move to mem, not to reg -# shufps \$0x0e, $xmmT2, $xmmT2 # have to use two-insn sequence -# movq $xmmT2, %r8 # instead -# ... -# -# ... -#- leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] -#+ addl %esi, %e$e # e += RCONST + W[n] -# ^^^^^^^^^^^^^^^^^^^^^^^^ -# The above is -97 bytes of code... -# ...but pshufb is a SSSE3 insn. Can't use it. - -echo \ -"### Generated by hash_md5_sha_x86-64.S.sh ### - -#if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) -#ifdef __linux__ - .section .note.GNU-stack, \"\", @progbits -#endif - .section .text.sha1_process_block64, \"ax\", @progbits - .globl sha1_process_block64 - .hidden sha1_process_block64 - .type sha1_process_block64, @function - - .balign 8 # allow decoders to fetch at least 5 first insns -sha1_process_block64: - pushq %rbp # 1 byte insn - pushq %rbx # 1 byte insn -# pushq %r15 # 2 byte insn - pushq %r14 # 2 byte insn - pushq %r13 # 2 byte insn - pushq %r12 # 2 byte insn - pushq %rdi # we need ctx at the end - -#Register and stack use: -# eax..edx: a..d -# ebp: e -# esi,edi,r8..r14: temps -# r15: unused -# xmm0..xmm3: W[] -# xmm4,xmm5: temps -# xmm6: current round constant -# xmm7: all round constants -# -64(%rsp): area for passing RCONST + W[] from vector to integer units - - movl 80(%rdi), %eax # a = ctx->hash[0] - movl 84(%rdi), %ebx # b = ctx->hash[1] - movl 88(%rdi), %ecx # c = ctx->hash[2] - movl 92(%rdi), %edx # d = ctx->hash[3] - movl 96(%rdi), %ebp # e = ctx->hash[4] - - movaps sha1const(%rip), $xmmALLRCONST - pshufd \$0x00, $xmmALLRCONST, $xmmRCONST - - # Load W[] to xmm0..3, byteswapping on the fly. - # - # For iterations 0..15, we pass W[] in rsi,r8..r14 - # for use in RD1As instead of spilling them to stack. - # We lose parallelized addition of RCONST, but LEA - # can do two additions at once, so it is probably a wash. - # (We use rsi instead of rN because this makes two - # LEAs in two first RD1As shorter by one byte). 
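The load sequence that follows pulls the sixteen big-endian message words in 64-bit chunks, using movq / bswapq / rolq $32 before packing pairs into xmm registers with punpcklqdq. On a little-endian machine one such 64-bit step boils down to the scalar sketch below (illustrative only; block and the GCC/Clang builtin __builtin_bswap64 are stand-ins of mine, not names from this script):

    #include <stdint.h>
    #include <string.h>

    /* Returns W[i] in the low 32 bits and W[i+1] in the high 32 bits,
     * mirroring "movq; bswapq; rolq $32" in the generated assembly. */
    uint64_t load_w_pair(const unsigned char *block, unsigned i)
    {
        uint64_t v;
        memcpy(&v, block + 4 * i, 8);   /* movq  4*i(%rdi), %reg               */
        v = __builtin_bswap64(v);       /* bswapq: W[i] lands in the high half */
        v = (v << 32) | (v >> 32);      /* rolq $32: register = W[i+1]:W[i]    */
        return v;
    }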
- movq 4*0(%rdi), %rsi - movq 4*2(%rdi), %r8 - bswapq %rsi - bswapq %r8 - rolq \$32, %rsi # rsi = W[1]:W[0] - rolq \$32, %r8 # r8 = W[3]:W[2] - movq %rsi, %xmm0 - movq %r8, $xmmT1 - punpcklqdq $xmmT1, %xmm0 # xmm0 = r8:rsi = (W[0],W[1],W[2],W[3]) -# movaps %xmm0, $xmmT1 # add RCONST, spill to stack -# paddd $xmmRCONST, $xmmT1 -# movups $xmmT1, -64+16*0(%rsp) - - movq 4*4(%rdi), %r9 - movq 4*6(%rdi), %r10 - bswapq %r9 - bswapq %r10 - rolq \$32, %r9 # r9 = W[5]:W[4] - rolq \$32, %r10 # r10 = W[7]:W[6] - movq %r9, %xmm1 - movq %r10, $xmmT1 - punpcklqdq $xmmT1, %xmm1 # xmm1 = r10:r9 = (W[4],W[5],W[6],W[7]) - - movq 4*8(%rdi), %r11 - movq 4*10(%rdi), %r12 - bswapq %r11 - bswapq %r12 - rolq \$32, %r11 # r11 = W[9]:W[8] - rolq \$32, %r12 # r12 = W[11]:W[10] - movq %r11, %xmm2 - movq %r12, $xmmT1 - punpcklqdq $xmmT1, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11]) - - movq 4*12(%rdi), %r13 - movq 4*14(%rdi), %r14 - bswapq %r13 - bswapq %r14 - rolq \$32, %r13 # r13 = W[13]:W[12] - rolq \$32, %r14 # r14 = W[15]:W[14] - movq %r13, %xmm3 - movq %r14, $xmmT1 - punpcklqdq $xmmT1, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15]) -" - -PREP() { -local xmmW0=$1 -local xmmW4=$2 -local xmmW8=$3 -local xmmW12=$4 -# the above must be %xmm0..3 in some permutation -local dstmem=$5 -#W[0] = rol(W[13] ^ W[8] ^ W[2] ^ W[0], 1); -#W[1] = rol(W[14] ^ W[9] ^ W[3] ^ W[1], 1); -#W[2] = rol(W[15] ^ W[10] ^ W[4] ^ W[2], 1); -#W[3] = rol( 0 ^ W[11] ^ W[5] ^ W[3], 1); -#W[3] ^= rol(W[0], 1); -echo "# PREP $@ - movaps $xmmW12, $xmmT1 - psrldq \$4, $xmmT1 # rshift by 4 bytes: T1 = ([13],[14],[15],0) - -# pshufd \$0x4e, $xmmW0, $xmmT2 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq $xmmW4, $xmmT2 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! - movaps $xmmW0, $xmmT2 - shufps \$0x4e, $xmmW4, $xmmT2 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - - xorps $xmmW8, $xmmW0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps $xmmT1, $xmmT2 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps $xmmT2, $xmmW0 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps $xmmW0, $xmmT2 - - xorps $xmmT1, $xmmT1 # rol(W0,1): - pcmpgtd $xmmW0, $xmmT1 # ffffffff for elements <0 (ones with msb bit 1) - paddd $xmmW0, $xmmW0 # shift left by 1 - psubd $xmmT1, $xmmW0 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - - pslldq \$12, $xmmT2 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps $xmmT2, $xmmT1 - pslld \$2, $xmmT2 - psrld \$30, $xmmT1 -# xorps $xmmT1, $xmmT2 # rol((0,0,0,unrotW[0]),2) - xorps $xmmT1, $xmmW0 # same result, but does not depend on/does not modify T2 - - xorps $xmmT2, $xmmW0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) -" -# movq $xmmW0, %r8 # high latency (~6 cycles) -# movaps $xmmW0, $xmmT1 -# psrldq \$8, $xmmT1 # rshift by 8 bytes: move upper 64 bits to lower -# movq $xmmT1, %r10 # high latency -# movq %r8, %r9 -# movq %r10, %r11 -# shrq \$32, %r9 -# shrq \$32, %r11 -# ^^^ slower than passing the results on stack (!!!) 
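SSE2 has no per-lane rotate, so PREP builds rol(W,1) out of pcmpgtd / paddd / psubd: the compare against zero yields an all-ones mask exactly in the lanes whose top bit is about to be rotated out, the add doubles each lane (a left shift by one), and the subtract feeds that bit back in at the bottom. A hedged C intrinsics sketch of just this trick (function and variable names are mine, not from the script):

    #include <emmintrin.h>   /* SSE2 */
    #include <stdint.h>
    #include <stdio.h>

    static __m128i rol1_epi32(__m128i x)
    {
        __m128i msb = _mm_cmpgt_epi32(_mm_setzero_si128(), x); /* pcmpgtd: -1 where lane < 0  */
        x = _mm_add_epi32(x, x);                               /* paddd:   lane <<= 1         */
        return _mm_sub_epi32(x, msb);                          /* psubd:   +1 where msb was set */
    }

    int main(void)
    {
        uint32_t in[4] = { 0x80000000u, 1u, 0xdeadbeefu, 0x7fffffffu }, out[4];
        _mm_storeu_si128((__m128i *)out, rol1_epi32(_mm_loadu_si128((const __m128i *)in)));
        for (int i = 0; i < 4; i++)   /* check against the plain scalar rotate */
            printf("%08x %08x\n", out[i], (in[i] << 1) | (in[i] >> 31));
        return 0;
    }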
-echo " - movaps $xmmW0, $xmmT2 - paddd $xmmRCONST, $xmmT2 - movups $xmmT2, $dstmem -" -} - -# It's possible to interleave integer insns in rounds to mostly eliminate -# dependency chains, but this likely to only help old Pentium-based -# CPUs (ones without OOO, which can only simultaneously execute a pair -# of _adjacent_ insns). -# Testing on old-ish Silvermont CPU (which has OOO window of only -# about ~8 insns) shows very small (~1%) speedup. - -RD1A() { -local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 -local n=$(($6)) -local n0=$(((n+0) & 15)) -local rN=$((7+n0/2)) -echo " -# $n -";test $n0 = 0 && echo " - leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] - shrq \$32, %rsi -";test $n0 = 1 && echo " - leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] -";test $n0 -ge 2 && test $((n0 & 1)) = 0 && echo " - leal $RCONST(%r$e,%r$rN), %e$e # e += RCONST + W[n] - shrq \$32, %r$rN -";test $n0 -ge 2 && test $((n0 & 1)) = 1 && echo " - leal $RCONST(%r$e,%r$rN), %e$e # e += RCONST + W[n] -";echo " - movl %e$c, %edi # c - xorl %e$d, %edi # ^d - andl %e$b, %edi # &b - xorl %e$d, %edi # (((c ^ d) & b) ^ d) - addl %edi, %e$e # e += (((c ^ d) & b) ^ d) - movl %e$a, %edi # - roll \$5, %edi # rotl32(a,5) - addl %edi, %e$e # e += rotl32(a,5) - rorl \$2, %e$b # b = rotl32(b,30) -" -} -RD1B() { -local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 -local n=$(($6)) -local n13=$(((n+13) & 15)) -local n8=$(((n+8) & 15)) -local n2=$(((n+2) & 15)) -local n0=$(((n+0) & 15)) -echo " -# $n - movl %e$c, %edi # c - xorl %e$d, %edi # ^d - andl %e$b, %edi # &b - xorl %e$d, %edi # (((c ^ d) & b) ^ d) - addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n & 15] - addl %edi, %e$e # e += (((c ^ d) & b) ^ d) - movl %e$a, %esi # - roll \$5, %esi # rotl32(a,5) - addl %esi, %e$e # e += rotl32(a,5) - rorl \$2, %e$b # b = rotl32(b,30) -" -} - -RD2() { -local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 -local n=$(($6)) -local n13=$(((n+13) & 15)) -local n8=$(((n+8) & 15)) -local n2=$(((n+2) & 15)) -local n0=$(((n+0) & 15)) -echo " -# $n - movl %e$c, %edi # c - xorl %e$d, %edi # ^d - xorl %e$b, %edi # ^b - addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n & 15] - addl %edi, %e$e # e += (c ^ d ^ b) - movl %e$a, %esi # - roll \$5, %esi # rotl32(a,5) - addl %esi, %e$e # e += rotl32(a,5) - rorl \$2, %e$b # b = rotl32(b,30) -" -} - -RD3() { -local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 -local n=$(($6)) -local n13=$(((n+13) & 15)) -local n8=$(((n+8) & 15)) -local n2=$(((n+2) & 15)) -local n0=$(((n+0) & 15)) -echo " -# $n - movl %e$b, %edi # di: b - movl %e$b, %esi # si: b - orl %e$c, %edi # di: b | c - andl %e$c, %esi # si: b & c - andl %e$d, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %e$e # += ((b | c) & d) | (b & c) - addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n & 15] - movl %e$a, %esi # - roll \$5, %esi # rotl32(a,5) - addl %esi, %e$e # e += rotl32(a,5) - rorl \$2, %e$b # b = rotl32(b,30) -" -} - -{ -# Round 1 -RCONST=0x5A827999 -RD1A ax bx cx dx bp 0; RD1A bp ax bx cx dx 1; RD1A dx bp ax bx cx 2; RD1A cx dx bp ax bx 3; -RD1A bx cx dx bp ax 4; RD1A ax bx cx dx bp 5; RD1A bp ax bx cx dx 6; RD1A dx bp ax bx cx 7; -a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` -b=`RD1A cx dx bp ax bx 8; RD1A bx cx dx bp ax 9; RD1A ax bx cx dx bp 10; RD1A bp ax bx cx dx 11;` -INTERLEAVE "$a" "$b" -a=`echo " pshufd \\$0x55, $xmmALLRCONST, $xmmRCONST" - PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` -b=`RD1A dx bp ax bx cx 12; RD1A cx dx bp ax bx 13; RD1A bx cx dx bp ax 14; RD1A ax bx cx dx bp 
15;` -INTERLEAVE "$a" "$b" -a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` -b=`RD1B bp ax bx cx dx 16; RD1B dx bp ax bx cx 17; RD1B cx dx bp ax bx 18; RD1B bx cx dx bp ax 19;` -INTERLEAVE "$a" "$b" - -# Round 2 -RCONST=0x6ED9EBA1 -a=`PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` -b=`RD2 ax bx cx dx bp 20; RD2 bp ax bx cx dx 21; RD2 dx bp ax bx cx 22; RD2 cx dx bp ax bx 23;` -INTERLEAVE "$a" "$b" -a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` -b=`RD2 bx cx dx bp ax 24; RD2 ax bx cx dx bp 25; RD2 bp ax bx cx dx 26; RD2 dx bp ax bx cx 27;` -INTERLEAVE "$a" "$b" -a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` -b=`RD2 cx dx bp ax bx 28; RD2 bx cx dx bp ax 29; RD2 ax bx cx dx bp 30; RD2 bp ax bx cx dx 31;` -INTERLEAVE "$a" "$b" -a=`echo " pshufd \\$0xaa, $xmmALLRCONST, $xmmRCONST" - PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` -b=`RD2 dx bp ax bx cx 32; RD2 cx dx bp ax bx 33; RD2 bx cx dx bp ax 34; RD2 ax bx cx dx bp 35;` -INTERLEAVE "$a" "$b" -a=`PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` -b=`RD2 bp ax bx cx dx 36; RD2 dx bp ax bx cx 37; RD2 cx dx bp ax bx 38; RD2 bx cx dx bp ax 39;` -INTERLEAVE "$a" "$b" - -# Round 3 -RCONST=0x8F1BBCDC -a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` -b=`RD3 ax bx cx dx bp 40; RD3 bp ax bx cx dx 41; RD3 dx bp ax bx cx 42; RD3 cx dx bp ax bx 43;` -INTERLEAVE "$a" "$b" -a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` -b=`RD3 bx cx dx bp ax 44; RD3 ax bx cx dx bp 45; RD3 bp ax bx cx dx 46; RD3 dx bp ax bx cx 47;` -INTERLEAVE "$a" "$b" -a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` -b=`RD3 cx dx bp ax bx 48; RD3 bx cx dx bp ax 49; RD3 ax bx cx dx bp 50; RD3 bp ax bx cx dx 51;` -INTERLEAVE "$a" "$b" -a=`echo " pshufd \\$0xff, $xmmALLRCONST, $xmmRCONST" - PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` -b=`RD3 dx bp ax bx cx 52; RD3 cx dx bp ax bx 53; RD3 bx cx dx bp ax 54; RD3 ax bx cx dx bp 55;` -INTERLEAVE "$a" "$b" -a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` -b=`RD3 bp ax bx cx dx 56; RD3 dx bp ax bx cx 57; RD3 cx dx bp ax bx 58; RD3 bx cx dx bp ax 59;` -INTERLEAVE "$a" "$b" - -# Round 4 has the same logic as round 2, only n and RCONST are different -RCONST=0xCA62C1D6 -a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` -b=`RD2 ax bx cx dx bp 60; RD2 bp ax bx cx dx 61; RD2 dx bp ax bx cx 62; RD2 cx dx bp ax bx 63;` -INTERLEAVE "$a" "$b" -a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` -b=`RD2 bx cx dx bp ax 64; RD2 ax bx cx dx bp 65; RD2 bp ax bx cx dx 66; RD2 dx bp ax bx cx 67;` -INTERLEAVE "$a" "$b" -a=`PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` -b=`RD2 cx dx bp ax bx 68; RD2 bx cx dx bp ax 69; RD2 ax bx cx dx bp 70; RD2 bp ax bx cx dx 71;` -INTERLEAVE "$a" "$b" -RD2 dx bp ax bx cx 72; RD2 cx dx bp ax bx 73; RD2 bx cx dx bp ax 74; RD2 ax bx cx dx bp 75; -RD2 bp ax bx cx dx 76; RD2 dx bp ax bx cx 77; RD2 cx dx bp ax bx 78; RD2 bx cx dx bp ax 79; -} | grep -v '^$' - -echo " - popq %rdi # - popq %r12 # - addl %eax, 80(%rdi) # ctx->hash[0] += a - popq %r13 # - addl %ebx, 84(%rdi) # ctx->hash[1] += b - popq %r14 # - addl %ecx, 88(%rdi) # ctx->hash[2] += c -# popq %r15 # - addl %edx, 92(%rdi) # ctx->hash[3] += d - popq %rbx # - addl %ebp, 96(%rdi) # ctx->hash[4] += e - popq %rbp # - - ret - .size sha1_process_block64, .-sha1_process_block64 - - .section .rodata.cst16.sha1const, \"aM\", @progbits, 16 - .balign 16 -sha1const: - .long 0x5A827999 - .long 0x6ED9EBA1 - .long 0x8F1BBCDC - .long 0xCA62C1D6 - -#endif" diff --git a/libbb/hash_md5_sha_x86-64_shaNI.S b/libbb/hash_md5_sha_x86-64_shaNI.S deleted file mode 100644 index 
2f03e1ce4..000000000 --- a/libbb/hash_md5_sha_x86-64_shaNI.S +++ /dev/null @@ -1,232 +0,0 @@ -#if ENABLE_SHA1_HWACCEL && defined(__GNUC__) && defined(__x86_64__) -/* The code is adapted from Linux kernel's source */ - -// We use shorter insns, even though they are for "wrong" -// data type (fp, not int). -// For Intel, there is no penalty for doing it at all -// (CPUs which do have such penalty do not support SHA insns). -// For AMD, the penalty is one extra cycle -// (allegedly: I failed to find measurable difference). - -//#define mova128 movdqa -#define mova128 movaps -//#define movu128 movdqu -#define movu128 movups -//#define xor128 pxor -#define xor128 xorps -//#define shuf128_32 pshufd -#define shuf128_32 shufps - -#define extr128_32 pextrd -//#define extr128_32 extractps # not shorter - -// pshufb is a SSSE3 insn. -// pinsrd, pextrd, extractps are SSE4.1 insns. -// We do not check SSSE3/SSE4.1 in cpuid, -// all SHA-capable CPUs support them as well. - -#ifdef __linux__ - .section .note.GNU-stack, "", @progbits -#endif - .section .text.sha1_process_block64_shaNI, "ax", @progbits - .globl sha1_process_block64_shaNI - .hidden sha1_process_block64_shaNI - .type sha1_process_block64_shaNI, @function - -#define ABCD %xmm0 -#define E0 %xmm1 /* Need two E's b/c they ping pong */ -#define E1 %xmm2 -#define MSG0 %xmm3 -#define MSG1 %xmm4 -#define MSG2 %xmm5 -#define MSG3 %xmm6 - - .balign 8 # allow decoders to fetch at least 2 first insns -sha1_process_block64_shaNI: - /* load initial hash values */ - movu128 80(%rdi), ABCD - xor128 E0, E0 - pinsrd $3, 80+4*4(%rdi), E0 # load to uppermost 32-bit word - shuf128_32 $0x1B, ABCD, ABCD # DCBA -> ABCD - - mova128 PSHUFFLE_BYTE_FLIP_MASK(%rip), %xmm7 - - movu128 0*16(%rdi), MSG0 - pshufb %xmm7, MSG0 - movu128 1*16(%rdi), MSG1 - pshufb %xmm7, MSG1 - movu128 2*16(%rdi), MSG2 - pshufb %xmm7, MSG2 - movu128 3*16(%rdi), MSG3 - pshufb %xmm7, MSG3 - - /* Save hash values for addition after rounds */ - mova128 E0, %xmm7 - mova128 ABCD, %xmm8 - - /* Rounds 0-3 */ - paddd MSG0, E0 - mova128 ABCD, E1 - sha1rnds4 $0, E0, ABCD - - /* Rounds 4-7 */ - sha1nexte MSG1, E1 - mova128 ABCD, E0 - sha1rnds4 $0, E1, ABCD - sha1msg1 MSG1, MSG0 - - /* Rounds 8-11 */ - sha1nexte MSG2, E0 - mova128 ABCD, E1 - sha1rnds4 $0, E0, ABCD - sha1msg1 MSG2, MSG1 - xor128 MSG2, MSG0 - - /* Rounds 12-15 */ - sha1nexte MSG3, E1 - mova128 ABCD, E0 - sha1msg2 MSG3, MSG0 - sha1rnds4 $0, E1, ABCD - sha1msg1 MSG3, MSG2 - xor128 MSG3, MSG1 - - /* Rounds 16-19 */ - sha1nexte MSG0, E0 - mova128 ABCD, E1 - sha1msg2 MSG0, MSG1 - sha1rnds4 $0, E0, ABCD - sha1msg1 MSG0, MSG3 - xor128 MSG0, MSG2 - - /* Rounds 20-23 */ - sha1nexte MSG1, E1 - mova128 ABCD, E0 - sha1msg2 MSG1, MSG2 - sha1rnds4 $1, E1, ABCD - sha1msg1 MSG1, MSG0 - xor128 MSG1, MSG3 - - /* Rounds 24-27 */ - sha1nexte MSG2, E0 - mova128 ABCD, E1 - sha1msg2 MSG2, MSG3 - sha1rnds4 $1, E0, ABCD - sha1msg1 MSG2, MSG1 - xor128 MSG2, MSG0 - - /* Rounds 28-31 */ - sha1nexte MSG3, E1 - mova128 ABCD, E0 - sha1msg2 MSG3, MSG0 - sha1rnds4 $1, E1, ABCD - sha1msg1 MSG3, MSG2 - xor128 MSG3, MSG1 - - /* Rounds 32-35 */ - sha1nexte MSG0, E0 - mova128 ABCD, E1 - sha1msg2 MSG0, MSG1 - sha1rnds4 $1, E0, ABCD - sha1msg1 MSG0, MSG3 - xor128 MSG0, MSG2 - - /* Rounds 36-39 */ - sha1nexte MSG1, E1 - mova128 ABCD, E0 - sha1msg2 MSG1, MSG2 - sha1rnds4 $1, E1, ABCD - sha1msg1 MSG1, MSG0 - xor128 MSG1, MSG3 - - /* Rounds 40-43 */ - sha1nexte MSG2, E0 - mova128 ABCD, E1 - sha1msg2 MSG2, MSG3 - sha1rnds4 $2, E0, ABCD - sha1msg1 MSG2, MSG1 - xor128 MSG2, MSG0 - - /* 
Rounds 44-47 */ - sha1nexte MSG3, E1 - mova128 ABCD, E0 - sha1msg2 MSG3, MSG0 - sha1rnds4 $2, E1, ABCD - sha1msg1 MSG3, MSG2 - xor128 MSG3, MSG1 - - /* Rounds 48-51 */ - sha1nexte MSG0, E0 - mova128 ABCD, E1 - sha1msg2 MSG0, MSG1 - sha1rnds4 $2, E0, ABCD - sha1msg1 MSG0, MSG3 - xor128 MSG0, MSG2 - - /* Rounds 52-55 */ - sha1nexte MSG1, E1 - mova128 ABCD, E0 - sha1msg2 MSG1, MSG2 - sha1rnds4 $2, E1, ABCD - sha1msg1 MSG1, MSG0 - xor128 MSG1, MSG3 - - /* Rounds 56-59 */ - sha1nexte MSG2, E0 - mova128 ABCD, E1 - sha1msg2 MSG2, MSG3 - sha1rnds4 $2, E0, ABCD - sha1msg1 MSG2, MSG1 - xor128 MSG2, MSG0 - - /* Rounds 60-63 */ - sha1nexte MSG3, E1 - mova128 ABCD, E0 - sha1msg2 MSG3, MSG0 - sha1rnds4 $3, E1, ABCD - sha1msg1 MSG3, MSG2 - xor128 MSG3, MSG1 - - /* Rounds 64-67 */ - sha1nexte MSG0, E0 - mova128 ABCD, E1 - sha1msg2 MSG0, MSG1 - sha1rnds4 $3, E0, ABCD - sha1msg1 MSG0, MSG3 - xor128 MSG0, MSG2 - - /* Rounds 68-71 */ - sha1nexte MSG1, E1 - mova128 ABCD, E0 - sha1msg2 MSG1, MSG2 - sha1rnds4 $3, E1, ABCD - xor128 MSG1, MSG3 - - /* Rounds 72-75 */ - sha1nexte MSG2, E0 - mova128 ABCD, E1 - sha1msg2 MSG2, MSG3 - sha1rnds4 $3, E0, ABCD - - /* Rounds 76-79 */ - sha1nexte MSG3, E1 - mova128 ABCD, E0 - sha1rnds4 $3, E1, ABCD - - /* Add current hash values with previously saved */ - sha1nexte %xmm7, E0 - paddd %xmm8, ABCD - - /* Write hash values back in the correct order */ - shuf128_32 $0x1B, ABCD, ABCD - movu128 ABCD, 80(%rdi) - extr128_32 $3, E0, 80+4*4(%rdi) - - ret - .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI - - .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 - .balign 16 -PSHUFFLE_BYTE_FLIP_MASK: - .octa 0x000102030405060708090a0b0c0d0e0f - -#endif diff --git a/libbb/hash_sha1_hwaccel_x86-32.S b/libbb/hash_sha1_hwaccel_x86-32.S new file mode 100644 index 000000000..7455a29f0 --- /dev/null +++ b/libbb/hash_sha1_hwaccel_x86-32.S @@ -0,0 +1,234 @@ +#if ENABLE_SHA1_HWACCEL && defined(__GNUC__) && defined(__i386__) +/* The code is adapted from Linux kernel's source */ + +// We use shorter insns, even though they are for "wrong" +// data type (fp, not int). +// For Intel, there is no penalty for doing it at all +// (CPUs which do have such penalty do not support SHA insns). +// For AMD, the penalty is one extra cycle +// (allegedly: I failed to find measurable difference). + +//#define mova128 movdqa +#define mova128 movaps +//#define movu128 movdqu +#define movu128 movups +//#define xor128 pxor +#define xor128 xorps +//#define shuf128_32 pshufd +#define shuf128_32 shufps + +#define extr128_32 pextrd +//#define extr128_32 extractps # not shorter + +// pshufb is a SSSE3 insn. +// pinsrd, pextrd, extractps are SSE4.1 insns. +// We do not check SSSE3/SSE4.1 in cpuid, +// all SHA-capable CPUs support them as well. 
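For reference, the sha1msg1/sha1msg2 steps below, together with the explicit xor128, compute the standard SHA-1 message expansion four words at a time. A minimal scalar sketch of that recurrence, plain C for illustration only (the helper name sha1_expand is made up here, it is not part of this file):

	/* Illustrative scalar sketch, not part of this file: the SHA-1
	 * message expansion that the sha1msg1/sha1msg2/xor128 sequence
	 * below computes four words at a time. */
	#include <stdint.h>

	static uint32_t rol32(uint32_t x, unsigned n)
	{
		return (x << n) | (x >> (32 - n));
	}

	static void sha1_expand(uint32_t W[80]) /* W[0..15] already filled */
	{
		for (int t = 16; t < 80; t++)
			W[t] = rol32(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1);
	}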
+ +#ifdef __linux__ + .section .note.GNU-stack, "", @progbits +#endif + .section .text.sha1_process_block64_shaNI, "ax", @progbits + .globl sha1_process_block64_shaNI + .hidden sha1_process_block64_shaNI + .type sha1_process_block64_shaNI, @function + +#define ABCD %xmm0 +#define E0 %xmm1 /* Need two E's b/c they ping pong */ +#define E1 %xmm2 +#define MSG0 %xmm3 +#define MSG1 %xmm4 +#define MSG2 %xmm5 +#define MSG3 %xmm6 + + .balign 8 # allow decoders to fetch at least 2 first insns +sha1_process_block64_shaNI: + /* load initial hash values */ + movu128 76(%eax), ABCD + xor128 E0, E0 + pinsrd $3, 76+4*4(%eax), E0 # load to uppermost 32-bit word + shuf128_32 $0x1B, ABCD, ABCD # DCBA -> ABCD + + mova128 PSHUFFLE_BYTE_FLIP_MASK, %xmm7 + + movu128 0*16(%eax), MSG0 + pshufb %xmm7, MSG0 + movu128 1*16(%eax), MSG1 + pshufb %xmm7, MSG1 + movu128 2*16(%eax), MSG2 + pshufb %xmm7, MSG2 + movu128 3*16(%eax), MSG3 + pshufb %xmm7, MSG3 + + /* Save hash values for addition after rounds */ + mova128 E0, %xmm7 + /*mova128 ABCD, %xmm8 - NOPE, 32bit has no xmm8 */ + + /* Rounds 0-3 */ + paddd MSG0, E0 + mova128 ABCD, E1 + sha1rnds4 $0, E0, ABCD + + /* Rounds 4-7 */ + sha1nexte MSG1, E1 + mova128 ABCD, E0 + sha1rnds4 $0, E1, ABCD + sha1msg1 MSG1, MSG0 + + /* Rounds 8-11 */ + sha1nexte MSG2, E0 + mova128 ABCD, E1 + sha1rnds4 $0, E0, ABCD + sha1msg1 MSG2, MSG1 + xor128 MSG2, MSG0 + + /* Rounds 12-15 */ + sha1nexte MSG3, E1 + mova128 ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $0, E1, ABCD + sha1msg1 MSG3, MSG2 + xor128 MSG3, MSG1 + + /* Rounds 16-19 */ + sha1nexte MSG0, E0 + mova128 ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $0, E0, ABCD + sha1msg1 MSG0, MSG3 + xor128 MSG0, MSG2 + + /* Rounds 20-23 */ + sha1nexte MSG1, E1 + mova128 ABCD, E0 + sha1msg2 MSG1, MSG2 + sha1rnds4 $1, E1, ABCD + sha1msg1 MSG1, MSG0 + xor128 MSG1, MSG3 + + /* Rounds 24-27 */ + sha1nexte MSG2, E0 + mova128 ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $1, E0, ABCD + sha1msg1 MSG2, MSG1 + xor128 MSG2, MSG0 + + /* Rounds 28-31 */ + sha1nexte MSG3, E1 + mova128 ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $1, E1, ABCD + sha1msg1 MSG3, MSG2 + xor128 MSG3, MSG1 + + /* Rounds 32-35 */ + sha1nexte MSG0, E0 + mova128 ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $1, E0, ABCD + sha1msg1 MSG0, MSG3 + xor128 MSG0, MSG2 + + /* Rounds 36-39 */ + sha1nexte MSG1, E1 + mova128 ABCD, E0 + sha1msg2 MSG1, MSG2 + sha1rnds4 $1, E1, ABCD + sha1msg1 MSG1, MSG0 + xor128 MSG1, MSG3 + + /* Rounds 40-43 */ + sha1nexte MSG2, E0 + mova128 ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $2, E0, ABCD + sha1msg1 MSG2, MSG1 + xor128 MSG2, MSG0 + + /* Rounds 44-47 */ + sha1nexte MSG3, E1 + mova128 ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $2, E1, ABCD + sha1msg1 MSG3, MSG2 + xor128 MSG3, MSG1 + + /* Rounds 48-51 */ + sha1nexte MSG0, E0 + mova128 ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $2, E0, ABCD + sha1msg1 MSG0, MSG3 + xor128 MSG0, MSG2 + + /* Rounds 52-55 */ + sha1nexte MSG1, E1 + mova128 ABCD, E0 + sha1msg2 MSG1, MSG2 + sha1rnds4 $2, E1, ABCD + sha1msg1 MSG1, MSG0 + xor128 MSG1, MSG3 + + /* Rounds 56-59 */ + sha1nexte MSG2, E0 + mova128 ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $2, E0, ABCD + sha1msg1 MSG2, MSG1 + xor128 MSG2, MSG0 + + /* Rounds 60-63 */ + sha1nexte MSG3, E1 + mova128 ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $3, E1, ABCD + sha1msg1 MSG3, MSG2 + xor128 MSG3, MSG1 + + /* Rounds 64-67 */ + sha1nexte MSG0, E0 + mova128 ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $3, E0, ABCD + sha1msg1 MSG0, MSG3 + xor128 MSG0, MSG2 + + /* Rounds 68-71 */ + sha1nexte MSG1, E1 
+ mova128 ABCD, E0 + sha1msg2 MSG1, MSG2 + sha1rnds4 $3, E1, ABCD + xor128 MSG1, MSG3 + + /* Rounds 72-75 */ + sha1nexte MSG2, E0 + mova128 ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $3, E0, ABCD + + /* Rounds 76-79 */ + sha1nexte MSG3, E1 + mova128 ABCD, E0 + sha1rnds4 $3, E1, ABCD + + /* Add current hash values with previously saved */ + sha1nexte %xmm7, E0 + /*paddd %xmm8, ABCD - 32-bit mode has no xmm8 */ + movu128 76(%eax), %xmm7 # get original ABCD (not shuffled)... + + /* Write hash values back in the correct order */ + shuf128_32 $0x1B, ABCD, ABCD + paddd %xmm7, ABCD # ...add it to final ABCD + movu128 ABCD, 76(%eax) + extr128_32 $3, E0, 76+4*4(%eax) + + ret + .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI + + .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 + .balign 16 +PSHUFFLE_BYTE_FLIP_MASK: + .octa 0x000102030405060708090a0b0c0d0e0f + +#endif diff --git a/libbb/hash_sha1_hwaccel_x86-64.S b/libbb/hash_sha1_hwaccel_x86-64.S new file mode 100644 index 000000000..2f03e1ce4 --- /dev/null +++ b/libbb/hash_sha1_hwaccel_x86-64.S @@ -0,0 +1,232 @@ +#if ENABLE_SHA1_HWACCEL && defined(__GNUC__) && defined(__x86_64__) +/* The code is adapted from Linux kernel's source */ + +// We use shorter insns, even though they are for "wrong" +// data type (fp, not int). +// For Intel, there is no penalty for doing it at all +// (CPUs which do have such penalty do not support SHA insns). +// For AMD, the penalty is one extra cycle +// (allegedly: I failed to find measurable difference). + +//#define mova128 movdqa +#define mova128 movaps +//#define movu128 movdqu +#define movu128 movups +//#define xor128 pxor +#define xor128 xorps +//#define shuf128_32 pshufd +#define shuf128_32 shufps + +#define extr128_32 pextrd +//#define extr128_32 extractps # not shorter + +// pshufb is a SSSE3 insn. +// pinsrd, pextrd, extractps are SSE4.1 insns. +// We do not check SSSE3/SSE4.1 in cpuid, +// all SHA-capable CPUs support them as well. 
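The immediate on sha1rnds4 ($0..$3 below) selects which 20-round group is being processed, i.e. which selection function and round constant apply. A scalar sketch of that mapping, using the textbook SHA-1 definitions (illustrative only; sha1_f is a made-up helper, not something this file defines):

	/* Illustrative only: the function and constant selected by the
	 * sha1rnds4 immediate for each 20-round group. */
	#include <stdint.h>

	static uint32_t sha1_f(int group, uint32_t b, uint32_t c, uint32_t d)
	{
		switch (group) {
		case 0:  return (b & c) | (~b & d);          /* $0: rounds  0..19, K = 0x5A827999 */
		case 1:  return b ^ c ^ d;                   /* $1: rounds 20..39, K = 0x6ED9EBA1 */
		case 2:  return (b & c) | (b & d) | (c & d); /* $2: rounds 40..59, K = 0x8F1BBCDC */
		default: return b ^ c ^ d;                   /* $3: rounds 60..79, K = 0xCA62C1D6 */
		}
	}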
+ +#ifdef __linux__ + .section .note.GNU-stack, "", @progbits +#endif + .section .text.sha1_process_block64_shaNI, "ax", @progbits + .globl sha1_process_block64_shaNI + .hidden sha1_process_block64_shaNI + .type sha1_process_block64_shaNI, @function + +#define ABCD %xmm0 +#define E0 %xmm1 /* Need two E's b/c they ping pong */ +#define E1 %xmm2 +#define MSG0 %xmm3 +#define MSG1 %xmm4 +#define MSG2 %xmm5 +#define MSG3 %xmm6 + + .balign 8 # allow decoders to fetch at least 2 first insns +sha1_process_block64_shaNI: + /* load initial hash values */ + movu128 80(%rdi), ABCD + xor128 E0, E0 + pinsrd $3, 80+4*4(%rdi), E0 # load to uppermost 32-bit word + shuf128_32 $0x1B, ABCD, ABCD # DCBA -> ABCD + + mova128 PSHUFFLE_BYTE_FLIP_MASK(%rip), %xmm7 + + movu128 0*16(%rdi), MSG0 + pshufb %xmm7, MSG0 + movu128 1*16(%rdi), MSG1 + pshufb %xmm7, MSG1 + movu128 2*16(%rdi), MSG2 + pshufb %xmm7, MSG2 + movu128 3*16(%rdi), MSG3 + pshufb %xmm7, MSG3 + + /* Save hash values for addition after rounds */ + mova128 E0, %xmm7 + mova128 ABCD, %xmm8 + + /* Rounds 0-3 */ + paddd MSG0, E0 + mova128 ABCD, E1 + sha1rnds4 $0, E0, ABCD + + /* Rounds 4-7 */ + sha1nexte MSG1, E1 + mova128 ABCD, E0 + sha1rnds4 $0, E1, ABCD + sha1msg1 MSG1, MSG0 + + /* Rounds 8-11 */ + sha1nexte MSG2, E0 + mova128 ABCD, E1 + sha1rnds4 $0, E0, ABCD + sha1msg1 MSG2, MSG1 + xor128 MSG2, MSG0 + + /* Rounds 12-15 */ + sha1nexte MSG3, E1 + mova128 ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $0, E1, ABCD + sha1msg1 MSG3, MSG2 + xor128 MSG3, MSG1 + + /* Rounds 16-19 */ + sha1nexte MSG0, E0 + mova128 ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $0, E0, ABCD + sha1msg1 MSG0, MSG3 + xor128 MSG0, MSG2 + + /* Rounds 20-23 */ + sha1nexte MSG1, E1 + mova128 ABCD, E0 + sha1msg2 MSG1, MSG2 + sha1rnds4 $1, E1, ABCD + sha1msg1 MSG1, MSG0 + xor128 MSG1, MSG3 + + /* Rounds 24-27 */ + sha1nexte MSG2, E0 + mova128 ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $1, E0, ABCD + sha1msg1 MSG2, MSG1 + xor128 MSG2, MSG0 + + /* Rounds 28-31 */ + sha1nexte MSG3, E1 + mova128 ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $1, E1, ABCD + sha1msg1 MSG3, MSG2 + xor128 MSG3, MSG1 + + /* Rounds 32-35 */ + sha1nexte MSG0, E0 + mova128 ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $1, E0, ABCD + sha1msg1 MSG0, MSG3 + xor128 MSG0, MSG2 + + /* Rounds 36-39 */ + sha1nexte MSG1, E1 + mova128 ABCD, E0 + sha1msg2 MSG1, MSG2 + sha1rnds4 $1, E1, ABCD + sha1msg1 MSG1, MSG0 + xor128 MSG1, MSG3 + + /* Rounds 40-43 */ + sha1nexte MSG2, E0 + mova128 ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $2, E0, ABCD + sha1msg1 MSG2, MSG1 + xor128 MSG2, MSG0 + + /* Rounds 44-47 */ + sha1nexte MSG3, E1 + mova128 ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $2, E1, ABCD + sha1msg1 MSG3, MSG2 + xor128 MSG3, MSG1 + + /* Rounds 48-51 */ + sha1nexte MSG0, E0 + mova128 ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $2, E0, ABCD + sha1msg1 MSG0, MSG3 + xor128 MSG0, MSG2 + + /* Rounds 52-55 */ + sha1nexte MSG1, E1 + mova128 ABCD, E0 + sha1msg2 MSG1, MSG2 + sha1rnds4 $2, E1, ABCD + sha1msg1 MSG1, MSG0 + xor128 MSG1, MSG3 + + /* Rounds 56-59 */ + sha1nexte MSG2, E0 + mova128 ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $2, E0, ABCD + sha1msg1 MSG2, MSG1 + xor128 MSG2, MSG0 + + /* Rounds 60-63 */ + sha1nexte MSG3, E1 + mova128 ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $3, E1, ABCD + sha1msg1 MSG3, MSG2 + xor128 MSG3, MSG1 + + /* Rounds 64-67 */ + sha1nexte MSG0, E0 + mova128 ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $3, E0, ABCD + sha1msg1 MSG0, MSG3 + xor128 MSG0, MSG2 + + /* Rounds 68-71 */ + sha1nexte MSG1, E1 + mova128 ABCD, E0 + 
sha1msg2 MSG1, MSG2 + sha1rnds4 $3, E1, ABCD + xor128 MSG1, MSG3 + + /* Rounds 72-75 */ + sha1nexte MSG2, E0 + mova128 ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $3, E0, ABCD + + /* Rounds 76-79 */ + sha1nexte MSG3, E1 + mova128 ABCD, E0 + sha1rnds4 $3, E1, ABCD + + /* Add current hash values with previously saved */ + sha1nexte %xmm7, E0 + paddd %xmm8, ABCD + + /* Write hash values back in the correct order */ + shuf128_32 $0x1B, ABCD, ABCD + movu128 ABCD, 80(%rdi) + extr128_32 $3, E0, 80+4*4(%rdi) + + ret + .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI + + .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 + .balign 16 +PSHUFFLE_BYTE_FLIP_MASK: + .octa 0x000102030405060708090a0b0c0d0e0f + +#endif diff --git a/libbb/hash_sha1_x86-64.S b/libbb/hash_sha1_x86-64.S new file mode 100644 index 000000000..b1968fff6 --- /dev/null +++ b/libbb/hash_sha1_x86-64.S @@ -0,0 +1,1489 @@ +### Generated by hash_sha1_x86-64.S.sh ### + +#if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) +#ifdef __linux__ + .section .note.GNU-stack, "", @progbits +#endif + .section .text.sha1_process_block64, "ax", @progbits + .globl sha1_process_block64 + .hidden sha1_process_block64 + .type sha1_process_block64, @function + + .balign 8 # allow decoders to fetch at least 5 first insns +sha1_process_block64: + pushq %rbp # 1 byte insn + pushq %rbx # 1 byte insn +# pushq %r15 # 2 byte insn + pushq %r14 # 2 byte insn + pushq %r13 # 2 byte insn + pushq %r12 # 2 byte insn + pushq %rdi # we need ctx at the end + +#Register and stack use: +# eax..edx: a..d +# ebp: e +# esi,edi,r8..r14: temps +# r15: unused +# xmm0..xmm3: W[] +# xmm4,xmm5: temps +# xmm6: current round constant +# xmm7: all round constants +# -64(%rsp): area for passing RCONST + W[] from vector to integer units + + movl 80(%rdi), %eax # a = ctx->hash[0] + movl 84(%rdi), %ebx # b = ctx->hash[1] + movl 88(%rdi), %ecx # c = ctx->hash[2] + movl 92(%rdi), %edx # d = ctx->hash[3] + movl 96(%rdi), %ebp # e = ctx->hash[4] + + movaps sha1const(%rip), %xmm7 + pshufd $0x00, %xmm7, %xmm6 + + # Load W[] to xmm0..3, byteswapping on the fly. + # + # For iterations 0..15, we pass W[] in rsi,r8..r14 + # for use in RD1As instead of spilling them to stack. + # We lose parallelized addition of RCONST, but LEA + # can do two additions at once, so it is probably a wash. + # (We use rsi instead of rN because this makes two + # LEAs in two first RD1As shorter by one byte). 
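The loads below turn one 64-bit read into two big-endian message words: bswapq byte-reverses the whole quadword and rolq $32 then swaps the halves, so W[n] ends up in the low 32 bits and W[n+1] in the high 32 bits. A C sketch of the same trick (load_two_words is a hypothetical helper, used only for illustration):

	/* Hypothetical helper, for illustration only: mirrors the
	 * bswapq + rolq $32 trick used below. */
	#include <stdint.h>
	#include <string.h>

	static void load_two_words(const uint8_t *p, uint32_t *w0, uint32_t *w1)
	{
		uint64_t v;
		memcpy(&v, p, 8);          /* one 64-bit little-endian load */
		v = __builtin_bswap64(v);  /* bswapq: W[n] now in the high half */
		v = (v << 32) | (v >> 32); /* rolq $32: W[n+1]:W[n], W[n] in low half */
		*w0 = (uint32_t)v;         /* consumed first, in the leal */
		*w1 = (uint32_t)(v >> 32); /* consumed after shrq $32 */
	}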
+ movq 4*0(%rdi), %rsi + movq 4*2(%rdi), %r8 + bswapq %rsi + bswapq %r8 + rolq $32, %rsi # rsi = W[1]:W[0] + rolq $32, %r8 # r8 = W[3]:W[2] + movq %rsi, %xmm0 + movq %r8, %xmm4 + punpcklqdq %xmm4, %xmm0 # xmm0 = r8:rsi = (W[0],W[1],W[2],W[3]) +# movaps %xmm0, %xmm4 # add RCONST, spill to stack +# paddd %xmm6, %xmm4 +# movups %xmm4, -64+16*0(%rsp) + + movq 4*4(%rdi), %r9 + movq 4*6(%rdi), %r10 + bswapq %r9 + bswapq %r10 + rolq $32, %r9 # r9 = W[5]:W[4] + rolq $32, %r10 # r10 = W[7]:W[6] + movq %r9, %xmm1 + movq %r10, %xmm4 + punpcklqdq %xmm4, %xmm1 # xmm1 = r10:r9 = (W[4],W[5],W[6],W[7]) + + movq 4*8(%rdi), %r11 + movq 4*10(%rdi), %r12 + bswapq %r11 + bswapq %r12 + rolq $32, %r11 # r11 = W[9]:W[8] + rolq $32, %r12 # r12 = W[11]:W[10] + movq %r11, %xmm2 + movq %r12, %xmm4 + punpcklqdq %xmm4, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11]) + + movq 4*12(%rdi), %r13 + movq 4*14(%rdi), %r14 + bswapq %r13 + bswapq %r14 + rolq $32, %r13 # r13 = W[13]:W[12] + rolq $32, %r14 # r14 = W[15]:W[14] + movq %r13, %xmm3 + movq %r14, %xmm4 + punpcklqdq %xmm4, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15]) + +# 0 + leal 0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n] + shrq $32, %rsi + movl %ecx, %edi # c + xorl %edx, %edi # ^d + andl %ebx, %edi # &b + xorl %edx, %edi # (((c ^ d) & b) ^ d) + addl %edi, %ebp # e += (((c ^ d) & b) ^ d) + movl %eax, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 1 + leal 0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n] + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + andl %eax, %edi # &b + xorl %ecx, %edi # (((c ^ d) & b) ^ d) + addl %edi, %edx # e += (((c ^ d) & b) ^ d) + movl %ebp, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 2 + leal 0x5A827999(%rcx,%r8), %ecx # e += RCONST + W[n] + shrq $32, %r8 + movl %eax, %edi # c + xorl %ebx, %edi # ^d + andl %ebp, %edi # &b + xorl %ebx, %edi # (((c ^ d) & b) ^ d) + addl %edi, %ecx # e += (((c ^ d) & b) ^ d) + movl %edx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 3 + leal 0x5A827999(%rbx,%r8), %ebx # e += RCONST + W[n] + movl %ebp, %edi # c + xorl %eax, %edi # ^d + andl %edx, %edi # &b + xorl %eax, %edi # (((c ^ d) & b) ^ d) + addl %edi, %ebx # e += (((c ^ d) & b) ^ d) + movl %ecx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 4 + leal 0x5A827999(%rax,%r9), %eax # e += RCONST + W[n] + shrq $32, %r9 + movl %edx, %edi # c + xorl %ebp, %edi # ^d + andl %ecx, %edi # &b + xorl %ebp, %edi # (((c ^ d) & b) ^ d) + addl %edi, %eax # e += (((c ^ d) & b) ^ d) + movl %ebx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 5 + leal 0x5A827999(%rbp,%r9), %ebp # e += RCONST + W[n] + movl %ecx, %edi # c + xorl %edx, %edi # ^d + andl %ebx, %edi # &b + xorl %edx, %edi # (((c ^ d) & b) ^ d) + addl %edi, %ebp # e += (((c ^ d) & b) ^ d) + movl %eax, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 6 + leal 0x5A827999(%rdx,%r10), %edx # e += RCONST + W[n] + shrq $32, %r10 + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + andl %eax, %edi # &b + xorl %ecx, %edi # (((c ^ d) & b) ^ d) + addl %edi, %edx # e += (((c ^ d) & b) ^ d) + movl %ebp, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 7 + leal 
0x5A827999(%rcx,%r10), %ecx # e += RCONST + W[n] + movl %eax, %edi # c + xorl %ebx, %edi # ^d + andl %ebp, %edi # &b + xorl %ebx, %edi # (((c ^ d) & b) ^ d) + addl %edi, %ecx # e += (((c ^ d) & b) ^ d) + movl %edx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) + movaps %xmm3, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! + movaps %xmm0, %xmm5 + shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm0 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm0, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm0, %xmm0 # shift left by 1 + psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm0, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*0(%rsp) +# 8 + leal 0x5A827999(%rbx,%r11), %ebx # e += RCONST + W[n] + shrq $32, %r11 + movl %ebp, %edi # c + xorl %eax, %edi # ^d + andl %edx, %edi # &b + xorl %eax, %edi # (((c ^ d) & b) ^ d) + addl %edi, %ebx # e += (((c ^ d) & b) ^ d) + movl %ecx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 9 + leal 0x5A827999(%rax,%r11), %eax # e += RCONST + W[n] + movl %edx, %edi # c + xorl %ebp, %edi # ^d + andl %ecx, %edi # &b + xorl %ebp, %edi # (((c ^ d) & b) ^ d) + addl %edi, %eax # e += (((c ^ d) & b) ^ d) + movl %ebx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 10 + leal 0x5A827999(%rbp,%r12), %ebp # e += RCONST + W[n] + shrq $32, %r12 + movl %ecx, %edi # c + xorl %edx, %edi # ^d + andl %ebx, %edi # &b + xorl %edx, %edi # (((c ^ d) & b) ^ d) + addl %edi, %ebp # e += (((c ^ d) & b) ^ d) + movl %eax, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 11 + leal 0x5A827999(%rdx,%r12), %edx # e += RCONST + W[n] + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + andl %eax, %edi # &b + xorl %ecx, %edi # (((c ^ d) & b) ^ d) + addl %edi, %edx # e += (((c ^ d) & b) ^ d) + movl %ebp, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) + pshufd $0x55, %xmm7, %xmm6 +# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) + movaps %xmm0, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: 
+# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! + movaps %xmm1, %xmm5 + shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm1 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm1, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm1, %xmm1 # shift left by 1 + psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm1, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*1(%rsp) +# 12 + leal 0x5A827999(%rcx,%r13), %ecx # e += RCONST + W[n] + shrq $32, %r13 + movl %eax, %edi # c + xorl %ebx, %edi # ^d + andl %ebp, %edi # &b + xorl %ebx, %edi # (((c ^ d) & b) ^ d) + addl %edi, %ecx # e += (((c ^ d) & b) ^ d) + movl %edx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 13 + leal 0x5A827999(%rbx,%r13), %ebx # e += RCONST + W[n] + movl %ebp, %edi # c + xorl %eax, %edi # ^d + andl %edx, %edi # &b + xorl %eax, %edi # (((c ^ d) & b) ^ d) + addl %edi, %ebx # e += (((c ^ d) & b) ^ d) + movl %ecx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 14 + leal 0x5A827999(%rax,%r14), %eax # e += RCONST + W[n] + shrq $32, %r14 + movl %edx, %edi # c + xorl %ebp, %edi # ^d + andl %ecx, %edi # &b + xorl %ebp, %edi # (((c ^ d) & b) ^ d) + addl %edi, %eax # e += (((c ^ d) & b) ^ d) + movl %ebx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 15 + leal 0x5A827999(%rbp,%r14), %ebp # e += RCONST + W[n] + movl %ecx, %edi # c + xorl %edx, %edi # ^d + andl %ebx, %edi # &b + xorl %edx, %edi # (((c ^ d) & b) ^ d) + addl %edi, %ebp # e += (((c ^ d) & b) ^ d) + movl %eax, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) + movaps %xmm1, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm2, %xmm5 + shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm2 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm2, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm2, %xmm2 # shift left by 1 + psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm2, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*2(%rsp) +# 16 + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + andl %eax, %edi # &b + xorl %ecx, %edi # (((c ^ d) & b) ^ d) + addl -64+4*0(%rsp), %edx # e += RCONST + W[n & 15] + addl %edi, %edx # e += (((c ^ d) & b) ^ d) + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 17 + movl %eax, %edi # c + xorl %ebx, %edi # ^d + andl %ebp, %edi # &b + xorl %ebx, %edi # (((c ^ d) & b) ^ d) + addl -64+4*1(%rsp), %ecx # e += RCONST + W[n & 15] + addl %edi, %ecx # e += (((c ^ d) & b) ^ d) + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 18 + movl %ebp, %edi # c + xorl %eax, %edi # ^d + andl %edx, %edi # &b + xorl %eax, %edi # (((c ^ d) & b) ^ d) + addl -64+4*2(%rsp), %ebx # e += RCONST + W[n & 15] + addl %edi, %ebx # e += (((c ^ d) & b) ^ d) + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 19 + movl %edx, %edi # c + xorl %ebp, %edi # ^d + andl %ecx, %edi # &b + xorl %ebp, %edi # (((c ^ d) & b) ^ d) + addl -64+4*3(%rsp), %eax # e += RCONST + W[n & 15] + addl %edi, %eax # e += (((c ^ d) & b) ^ d) + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) + movaps %xmm2, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
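Each PREP block, like the one continuing below, advances the 16-word schedule by four words in one shot. The fourth new word depends on the first word of the same batch, which is why the vector code computes that lane with a zero placeholder (the psrldq result) and patches it afterwards by xoring in rol(unrotW[0],2). A scalar sketch of the same step over a 16-word circular buffer (prep4 is an illustrative helper, not the generator's code):

	/* Illustrative helper, not the generator's code: one PREP step in
	 * scalar form over the 16-word circular schedule W[]; t is the index
	 * of the oldest word (0, 4, 8 or 12).  The i == 3 iteration reads the
	 * word produced at i == 0, which is the dependency the vector code
	 * resolves with the rol(unrotW[0],2) fixup. */
	#include <stdint.h>

	static uint32_t rol32(uint32_t x, unsigned n)
	{
		return (x << n) | (x >> (32 - n));
	}

	static void prep4(uint32_t W[16], unsigned t)
	{
		for (unsigned i = 0; i < 4; i++) {
			uint32_t x = W[(t + i + 13) & 15] ^ W[(t + i + 8) & 15]
			           ^ W[(t + i + 2) & 15] ^ W[(t + i) & 15];
			W[(t + i) & 15] = rol32(x, 1);
		}
	}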
+ movaps %xmm3, %xmm5 + shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm3 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm3, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm3, %xmm3 # shift left by 1 + psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm3, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*3(%rsp) +# 20 + movl %ecx, %edi # c + xorl %edx, %edi # ^d + xorl %ebx, %edi # ^b + addl -64+4*4(%rsp), %ebp # e += RCONST + W[n & 15] + addl %edi, %ebp # e += (c ^ d ^ b) + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 21 + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + xorl %eax, %edi # ^b + addl -64+4*5(%rsp), %edx # e += RCONST + W[n & 15] + addl %edi, %edx # e += (c ^ d ^ b) + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 22 + movl %eax, %edi # c + xorl %ebx, %edi # ^d + xorl %ebp, %edi # ^b + addl -64+4*6(%rsp), %ecx # e += RCONST + W[n & 15] + addl %edi, %ecx # e += (c ^ d ^ b) + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 23 + movl %ebp, %edi # c + xorl %eax, %edi # ^d + xorl %edx, %edi # ^b + addl -64+4*7(%rsp), %ebx # e += RCONST + W[n & 15] + addl %edi, %ebx # e += (c ^ d ^ b) + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) + movaps %xmm3, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm0, %xmm5 + shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm0 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm0, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm0, %xmm0 # shift left by 1 + psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm0, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*0(%rsp) +# 24 + movl %edx, %edi # c + xorl %ebp, %edi # ^d + xorl %ecx, %edi # ^b + addl -64+4*8(%rsp), %eax # e += RCONST + W[n & 15] + addl %edi, %eax # e += (c ^ d ^ b) + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 25 + movl %ecx, %edi # c + xorl %edx, %edi # ^d + xorl %ebx, %edi # ^b + addl -64+4*9(%rsp), %ebp # e += RCONST + W[n & 15] + addl %edi, %ebp # e += (c ^ d ^ b) + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 26 + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + xorl %eax, %edi # ^b + addl -64+4*10(%rsp), %edx # e += RCONST + W[n & 15] + addl %edi, %edx # e += (c ^ d ^ b) + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 27 + movl %eax, %edi # c + xorl %ebx, %edi # ^d + xorl %ebp, %edi # ^b + addl -64+4*11(%rsp), %ecx # e += RCONST + W[n & 15] + addl %edi, %ecx # e += (c ^ d ^ b) + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) + movaps %xmm0, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm1, %xmm5 + shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm1 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm1, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm1, %xmm1 # shift left by 1 + psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm1, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*1(%rsp) +# 28 + movl %ebp, %edi # c + xorl %eax, %edi # ^d + xorl %edx, %edi # ^b + addl -64+4*12(%rsp), %ebx # e += RCONST + W[n & 15] + addl %edi, %ebx # e += (c ^ d ^ b) + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 29 + movl %edx, %edi # c + xorl %ebp, %edi # ^d + xorl %ecx, %edi # ^b + addl -64+4*13(%rsp), %eax # e += RCONST + W[n & 15] + addl %edi, %eax # e += (c ^ d ^ b) + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 30 + movl %ecx, %edi # c + xorl %edx, %edi # ^d + xorl %ebx, %edi # ^b + addl -64+4*14(%rsp), %ebp # e += RCONST + W[n & 15] + addl %edi, %ebp # e += (c ^ d ^ b) + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 31 + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + xorl %eax, %edi # ^b + addl -64+4*15(%rsp), %edx # e += RCONST + W[n & 15] + addl %edi, %edx # e += (c ^ d ^ b) + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) + pshufd $0xaa, %xmm7, %xmm6 +# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) + movaps %xmm1, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm2, %xmm5 + shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm2 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm2, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm2, %xmm2 # shift left by 1 + psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm2, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*2(%rsp) +# 32 + movl %eax, %edi # c + xorl %ebx, %edi # ^d + xorl %ebp, %edi # ^b + addl -64+4*0(%rsp), %ecx # e += RCONST + W[n & 15] + addl %edi, %ecx # e += (c ^ d ^ b) + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 33 + movl %ebp, %edi # c + xorl %eax, %edi # ^d + xorl %edx, %edi # ^b + addl -64+4*1(%rsp), %ebx # e += RCONST + W[n & 15] + addl %edi, %ebx # e += (c ^ d ^ b) + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 34 + movl %edx, %edi # c + xorl %ebp, %edi # ^d + xorl %ecx, %edi # ^b + addl -64+4*2(%rsp), %eax # e += RCONST + W[n & 15] + addl %edi, %eax # e += (c ^ d ^ b) + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 35 + movl %ecx, %edi # c + xorl %edx, %edi # ^d + xorl %ebx, %edi # ^b + addl -64+4*3(%rsp), %ebp # e += RCONST + W[n & 15] + addl %edi, %ebp # e += (c ^ d ^ b) + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) + movaps %xmm2, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm3, %xmm5 + shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm3 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm3, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm3, %xmm3 # shift left by 1 + psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm3, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*3(%rsp) +# 36 + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + xorl %eax, %edi # ^b + addl -64+4*4(%rsp), %edx # e += RCONST + W[n & 15] + addl %edi, %edx # e += (c ^ d ^ b) + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 37 + movl %eax, %edi # c + xorl %ebx, %edi # ^d + xorl %ebp, %edi # ^b + addl -64+4*5(%rsp), %ecx # e += RCONST + W[n & 15] + addl %edi, %ecx # e += (c ^ d ^ b) + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 38 + movl %ebp, %edi # c + xorl %eax, %edi # ^d + xorl %edx, %edi # ^b + addl -64+4*6(%rsp), %ebx # e += RCONST + W[n & 15] + addl %edi, %ebx # e += (c ^ d ^ b) + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 39 + movl %edx, %edi # c + xorl %ebp, %edi # ^d + xorl %ecx, %edi # ^b + addl -64+4*7(%rsp), %eax # e += RCONST + W[n & 15] + addl %edi, %eax # e += (c ^ d ^ b) + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) + movaps %xmm3, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm0, %xmm5 + shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm0 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm0, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm0, %xmm0 # shift left by 1 + psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm0, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*0(%rsp) +# 40 + movl %ebx, %edi # di: b + movl %ebx, %esi # si: b + orl %ecx, %edi # di: b | c + andl %ecx, %esi # si: b & c + andl %edx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %ebp # += ((b | c) & d) | (b & c) + addl -64+4*8(%rsp), %ebp # e += RCONST + W[n & 15] + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 41 + movl %eax, %edi # di: b + movl %eax, %esi # si: b + orl %ebx, %edi # di: b | c + andl %ebx, %esi # si: b & c + andl %ecx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %edx # += ((b | c) & d) | (b & c) + addl -64+4*9(%rsp), %edx # e += RCONST + W[n & 15] + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 42 + movl %ebp, %edi # di: b + movl %ebp, %esi # si: b + orl %eax, %edi # di: b | c + andl %eax, %esi # si: b & c + andl %ebx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %ecx # += ((b | c) & d) | (b & c) + addl -64+4*10(%rsp), %ecx # e += RCONST + W[n & 15] + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 43 + movl %edx, %edi # di: b + movl %edx, %esi # si: b + orl %ebp, %edi # di: b | c + andl %ebp, %esi # si: b & c + andl %eax, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %ebx # += ((b | c) & d) | (b & c) + addl -64+4*11(%rsp), %ebx # e += RCONST + W[n & 15] + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) + movaps %xmm0, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
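The round bodies use two bit tricks for the SHA-1 selection functions: rounds 0..19 compute Ch(b,c,d) as (((c ^ d) & b) ^ d) and rounds 40..59 compute Maj(b,c,d) as ((b | c) & d) | (b & c), both needing fewer operations than the textbook forms. A small C check of the identities (illustrative only; ch_trick, maj_trick and check are made-up names):

	/* Illustrative only: the identities behind the two round tricks. */
	#include <stdint.h>
	#include <assert.h>

	static uint32_t ch_trick(uint32_t b, uint32_t c, uint32_t d)
	{
		return ((c ^ d) & b) ^ d;        /* as in rounds 0..19 */
	}

	static uint32_t maj_trick(uint32_t b, uint32_t c, uint32_t d)
	{
		return ((b | c) & d) | (b & c);  /* as in rounds 40..59 */
	}

	static void check(uint32_t b, uint32_t c, uint32_t d)
	{
		assert(ch_trick(b, c, d)  == ((b & c) | (~b & d)));          /* Ch  */
		assert(maj_trick(b, c, d) == ((b & c) | (b & d) | (c & d))); /* Maj */
	}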
+ movaps %xmm1, %xmm5 + shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm1 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm1, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm1, %xmm1 # shift left by 1 + psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm1, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*1(%rsp) +# 44 + movl %ecx, %edi # di: b + movl %ecx, %esi # si: b + orl %edx, %edi # di: b | c + andl %edx, %esi # si: b & c + andl %ebp, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %eax # += ((b | c) & d) | (b & c) + addl -64+4*12(%rsp), %eax # e += RCONST + W[n & 15] + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 45 + movl %ebx, %edi # di: b + movl %ebx, %esi # si: b + orl %ecx, %edi # di: b | c + andl %ecx, %esi # si: b & c + andl %edx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %ebp # += ((b | c) & d) | (b & c) + addl -64+4*13(%rsp), %ebp # e += RCONST + W[n & 15] + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 46 + movl %eax, %edi # di: b + movl %eax, %esi # si: b + orl %ebx, %edi # di: b | c + andl %ebx, %esi # si: b & c + andl %ecx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %edx # += ((b | c) & d) | (b & c) + addl -64+4*14(%rsp), %edx # e += RCONST + W[n & 15] + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 47 + movl %ebp, %edi # di: b + movl %ebp, %esi # si: b + orl %eax, %edi # di: b | c + andl %eax, %esi # si: b & c + andl %ebx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %ecx # += ((b | c) & d) | (b & c) + addl -64+4*15(%rsp), %ecx # e += RCONST + W[n & 15] + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) + movaps %xmm1, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm2, %xmm5 + shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm2 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm2, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm2, %xmm2 # shift left by 1 + psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm2, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*2(%rsp) +# 48 + movl %edx, %edi # di: b + movl %edx, %esi # si: b + orl %ebp, %edi # di: b | c + andl %ebp, %esi # si: b & c + andl %eax, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %ebx # += ((b | c) & d) | (b & c) + addl -64+4*0(%rsp), %ebx # e += RCONST + W[n & 15] + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 49 + movl %ecx, %edi # di: b + movl %ecx, %esi # si: b + orl %edx, %edi # di: b | c + andl %edx, %esi # si: b & c + andl %ebp, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %eax # += ((b | c) & d) | (b & c) + addl -64+4*1(%rsp), %eax # e += RCONST + W[n & 15] + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 50 + movl %ebx, %edi # di: b + movl %ebx, %esi # si: b + orl %ecx, %edi # di: b | c + andl %ecx, %esi # si: b & c + andl %edx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %ebp # += ((b | c) & d) | (b & c) + addl -64+4*2(%rsp), %ebp # e += RCONST + W[n & 15] + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 51 + movl %eax, %edi # di: b + movl %eax, %esi # si: b + orl %ebx, %edi # di: b | c + andl %ebx, %esi # si: b & c + andl %ecx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %edx # += ((b | c) & d) | (b & c) + addl -64+4*3(%rsp), %edx # e += RCONST + W[n & 15] + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) + pshufd $0xff, %xmm7, %xmm6 +# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) + movaps %xmm2, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm3, %xmm5 + shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm3 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm3, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm3, %xmm3 # shift left by 1 + psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm3, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*3(%rsp) +# 52 + movl %ebp, %edi # di: b + movl %ebp, %esi # si: b + orl %eax, %edi # di: b | c + andl %eax, %esi # si: b & c + andl %ebx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %ecx # += ((b | c) & d) | (b & c) + addl -64+4*4(%rsp), %ecx # e += RCONST + W[n & 15] + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 53 + movl %edx, %edi # di: b + movl %edx, %esi # si: b + orl %ebp, %edi # di: b | c + andl %ebp, %esi # si: b & c + andl %eax, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %ebx # += ((b | c) & d) | (b & c) + addl -64+4*5(%rsp), %ebx # e += RCONST + W[n & 15] + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 54 + movl %ecx, %edi # di: b + movl %ecx, %esi # si: b + orl %edx, %edi # di: b | c + andl %edx, %esi # si: b & c + andl %ebp, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %eax # += ((b | c) & d) | (b & c) + addl -64+4*6(%rsp), %eax # e += RCONST + W[n & 15] + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 55 + movl %ebx, %edi # di: b + movl %ebx, %esi # si: b + orl %ecx, %edi # di: b | c + andl %ecx, %esi # si: b & c + andl %edx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %ebp # += ((b | c) & d) | (b & c) + addl -64+4*7(%rsp), %ebp # e += RCONST + W[n & 15] + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) + movaps %xmm3, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm0, %xmm5 + shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm0 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm0, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm0, %xmm0 # shift left by 1 + psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm0, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*0(%rsp) +# 56 + movl %eax, %edi # di: b + movl %eax, %esi # si: b + orl %ebx, %edi # di: b | c + andl %ebx, %esi # si: b & c + andl %ecx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %edx # += ((b | c) & d) | (b & c) + addl -64+4*8(%rsp), %edx # e += RCONST + W[n & 15] + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 57 + movl %ebp, %edi # di: b + movl %ebp, %esi # si: b + orl %eax, %edi # di: b | c + andl %eax, %esi # si: b & c + andl %ebx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %ecx # += ((b | c) & d) | (b & c) + addl -64+4*9(%rsp), %ecx # e += RCONST + W[n & 15] + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 58 + movl %edx, %edi # di: b + movl %edx, %esi # si: b + orl %ebp, %edi # di: b | c + andl %ebp, %esi # si: b & c + andl %eax, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %ebx # += ((b | c) & d) | (b & c) + addl -64+4*10(%rsp), %ebx # e += RCONST + W[n & 15] + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 59 + movl %ecx, %edi # di: b + movl %ecx, %esi # si: b + orl %edx, %edi # di: b | c + andl %edx, %esi # si: b & c + andl %ebp, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %eax # += ((b | c) & d) | (b & c) + addl -64+4*11(%rsp), %eax # e += RCONST + W[n & 15] + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) + movaps %xmm0, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm1, %xmm5 + shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm1 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm1, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm1, %xmm1 # shift left by 1 + psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm1, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*1(%rsp) +# 60 + movl %ecx, %edi # c + xorl %edx, %edi # ^d + xorl %ebx, %edi # ^b + addl -64+4*12(%rsp), %ebp # e += RCONST + W[n & 15] + addl %edi, %ebp # e += (c ^ d ^ b) + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 61 + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + xorl %eax, %edi # ^b + addl -64+4*13(%rsp), %edx # e += RCONST + W[n & 15] + addl %edi, %edx # e += (c ^ d ^ b) + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 62 + movl %eax, %edi # c + xorl %ebx, %edi # ^d + xorl %ebp, %edi # ^b + addl -64+4*14(%rsp), %ecx # e += RCONST + W[n & 15] + addl %edi, %ecx # e += (c ^ d ^ b) + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 63 + movl %ebp, %edi # c + xorl %eax, %edi # ^d + xorl %edx, %edi # ^b + addl -64+4*15(%rsp), %ebx # e += RCONST + W[n & 15] + addl %edi, %ebx # e += (c ^ d ^ b) + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) + movaps %xmm1, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm2, %xmm5 + shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm2 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm2, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm2, %xmm2 # shift left by 1 + psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm2, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*2(%rsp) +# 64 + movl %edx, %edi # c + xorl %ebp, %edi # ^d + xorl %ecx, %edi # ^b + addl -64+4*0(%rsp), %eax # e += RCONST + W[n & 15] + addl %edi, %eax # e += (c ^ d ^ b) + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 65 + movl %ecx, %edi # c + xorl %edx, %edi # ^d + xorl %ebx, %edi # ^b + addl -64+4*1(%rsp), %ebp # e += RCONST + W[n & 15] + addl %edi, %ebp # e += (c ^ d ^ b) + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 66 + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + xorl %eax, %edi # ^b + addl -64+4*2(%rsp), %edx # e += RCONST + W[n & 15] + addl %edi, %edx # e += (c ^ d ^ b) + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 67 + movl %eax, %edi # c + xorl %ebx, %edi # ^d + xorl %ebp, %edi # ^b + addl -64+4*3(%rsp), %ecx # e += RCONST + W[n & 15] + addl %edi, %ecx # e += (c ^ d ^ b) + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) + movaps %xmm2, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm3, %xmm5 + shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm3 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm3, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm3, %xmm3 # shift left by 1 + psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm3, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*3(%rsp) +# 68 + movl %ebp, %edi # c + xorl %eax, %edi # ^d + xorl %edx, %edi # ^b + addl -64+4*4(%rsp), %ebx # e += RCONST + W[n & 15] + addl %edi, %ebx # e += (c ^ d ^ b) + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 69 + movl %edx, %edi # c + xorl %ebp, %edi # ^d + xorl %ecx, %edi # ^b + addl -64+4*5(%rsp), %eax # e += RCONST + W[n & 15] + addl %edi, %eax # e += (c ^ d ^ b) + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 70 + movl %ecx, %edi # c + xorl %edx, %edi # ^d + xorl %ebx, %edi # ^b + addl -64+4*6(%rsp), %ebp # e += RCONST + W[n & 15] + addl %edi, %ebp # e += (c ^ d ^ b) + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 71 + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + xorl %eax, %edi # ^b + addl -64+4*7(%rsp), %edx # e += RCONST + W[n & 15] + addl %edi, %edx # e += (c ^ d ^ b) + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 72 + movl %eax, %edi # c + xorl %ebx, %edi # ^d + xorl %ebp, %edi # ^b + addl -64+4*8(%rsp), %ecx # e += RCONST + W[n & 15] + addl %edi, %ecx # e += (c ^ d ^ b) + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 73 + movl %ebp, %edi # c + xorl %eax, %edi # ^d + xorl %edx, %edi # ^b + addl -64+4*9(%rsp), %ebx # e += RCONST + W[n & 15] + addl %edi, %ebx # e += (c ^ d ^ b) + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 74 + movl %edx, %edi # c + xorl %ebp, %edi # ^d + xorl %ecx, %edi # ^b + addl -64+4*10(%rsp), %eax # e += RCONST + W[n & 15] + addl %edi, %eax # e += (c ^ d ^ b) + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 75 + movl %ecx, %edi # c + xorl %edx, %edi # ^d + xorl %ebx, %edi # ^b + addl -64+4*11(%rsp), %ebp # e += RCONST + W[n & 15] + addl %edi, %ebp # e += (c ^ d ^ b) + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 76 + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + xorl %eax, %edi # ^b + addl -64+4*12(%rsp), %edx # e += RCONST + W[n & 15] + addl %edi, %edx # e += (c ^ d ^ b) + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl 
$2, %eax # b = rotl32(b,30) +# 77 + movl %eax, %edi # c + xorl %ebx, %edi # ^d + xorl %ebp, %edi # ^b + addl -64+4*13(%rsp), %ecx # e += RCONST + W[n & 15] + addl %edi, %ecx # e += (c ^ d ^ b) + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 78 + movl %ebp, %edi # c + xorl %eax, %edi # ^d + xorl %edx, %edi # ^b + addl -64+4*14(%rsp), %ebx # e += RCONST + W[n & 15] + addl %edi, %ebx # e += (c ^ d ^ b) + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 79 + movl %edx, %edi # c + xorl %ebp, %edi # ^d + xorl %ecx, %edi # ^b + addl -64+4*15(%rsp), %eax # e += RCONST + W[n & 15] + addl %edi, %eax # e += (c ^ d ^ b) + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) + + popq %rdi # + popq %r12 # + addl %eax, 80(%rdi) # ctx->hash[0] += a + popq %r13 # + addl %ebx, 84(%rdi) # ctx->hash[1] += b + popq %r14 # + addl %ecx, 88(%rdi) # ctx->hash[2] += c +# popq %r15 # + addl %edx, 92(%rdi) # ctx->hash[3] += d + popq %rbx # + addl %ebp, 96(%rdi) # ctx->hash[4] += e + popq %rbp # + + ret + .size sha1_process_block64, .-sha1_process_block64 + + .section .rodata.cst16.sha1const, "aM", @progbits, 16 + .balign 16 +sha1const: + .long 0x5A827999 + .long 0x6ED9EBA1 + .long 0x8F1BBCDC + .long 0xCA62C1D6 + +#endif diff --git a/libbb/hash_sha1_x86-64.S.sh b/libbb/hash_sha1_x86-64.S.sh new file mode 100755 index 000000000..3fc125d51 --- /dev/null +++ b/libbb/hash_sha1_x86-64.S.sh @@ -0,0 +1,478 @@ +#!/bin/sh + +# We don't regenerate it on every "make" invocation - only by hand. +# The reason is that the changes to generated code are difficult +# to visualize by looking only at this script, it helps when the commit +# also contains the diff of the generated file. +exec >hash_sha1_x86-64.S + +# Based on http://arctic.org/~dean/crypto/sha1.html. +# ("This SHA1 implementation is public domain.") +# +# x86-64 has at least SSE2 vector insns always available. +# We can use them without any CPUID checks (and without a need +# for a fallback code if needed insns are not available). +# This code uses them to calculate W[] ahead of time. +# +# Unfortunately, results are passed from vector unit to +# integer ALUs on the stack. MOVD/Q insns to move them directly +# from vector to integer registers are slower than store-to-load +# forwarding in LSU (on Skylake at least). +# +# The win against a purely integer code is small on Skylake, +# only about 7-8%. We offload about 1/3 of our operations to the vector unit. +# It can do 4 ops at once in one 128-bit register, +# but we have to use x2 of them because of W[0] complication, +# SSE2 has no "rotate each word by N bits" insns, +# moving data to/from vector unit is clunky, and Skylake +# has four integer ALUs unified with three vector ALUs, +# which makes pure integer code rather fast, and makes +# vector ops compete with integer ones. +# +# Zen3, with its separate vector ALUs, wins more, about 12%. + +xmmT1="%xmm4" +xmmT2="%xmm5" +xmmRCONST="%xmm6" +xmmALLRCONST="%xmm7" +T=`printf '\t'` + +# SSE instructions are longer than 4 bytes on average. +# Intel CPUs (up to Tiger Lake at least) can't decode +# more than 16 bytes of code in one cycle. +# By interleaving SSE code and integer code +# we mostly achieve a situation where 16-byte decode fetch window +# contains 4 (or more) insns. +# +# However. 
On Skylake, there was no observed difference, +# but on Zen3, non-interleaved code is ~3% faster +# (822 Mb/s versus 795 Mb/s hashing speed). +# Off for now: +interleave=false + +INTERLEAVE() { + $interleave || \ + { + # Generate non-interleaved code + # (it should work correctly too) + echo "$1" + echo "$2" + return + } + ( + echo "$1" | grep -v '^$' >"$0.temp1" + echo "$2" | grep -v '^$' >"$0.temp2" + exec 3<"$0.temp1" + exec 4<"$0.temp2" + IFS='' + while :; do + line1='' + line2='' + while :; do + read -r line1 <&3 + if test "${line1:0:1}" != "#" && test "${line1:0:2}" != "$T#"; then + break + fi + echo "$line1" + done + while :; do + read -r line2 <&4 + if test "${line2:0:4}" = "${T}lea"; then + # We use 7-8 byte long forms of LEA. + # Do not interleave them with SSE insns + # which are also long. + echo "$line2" + read -r line2 <&4 + echo "$line2" + continue + fi + if test "${line2:0:1}" != "#" && test "${line2:0:2}" != "$T#"; then + break + fi + echo "$line2" + done + test "$line1$line2" || break + echo "$line1" + echo "$line2" + done + rm "$0.temp1" "$0.temp2" + ) +} + +# movaps bswap32_mask(%rip), $xmmT1 +# Load W[] to xmm0..3, byteswapping on the fly. +# For iterations 0..15, we pass RCONST+W[] in rsi,r8..r14 +# for use in RD1As instead of spilling them to stack. +# (We use rsi instead of rN because this makes two +# ADDs in two first RD1As shorter by one byte). +# movups 16*0(%rdi), %xmm0 +# pshufb $xmmT1, %xmm0 #SSSE3 insn +# movaps %xmm0, $xmmT2 +# paddd $xmmRCONST, $xmmT2 +# movq $xmmT2, %rsi +# #pextrq \$1, $xmmT2, %r8 #SSE4.1 insn +# #movhpd $xmmT2, %r8 #can only move to mem, not to reg +# shufps \$0x0e, $xmmT2, $xmmT2 # have to use two-insn sequence +# movq $xmmT2, %r8 # instead +# ... +# +# ... +#- leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] +#+ addl %esi, %e$e # e += RCONST + W[n] +# ^^^^^^^^^^^^^^^^^^^^^^^^ +# The above is -97 bytes of code... +# ...but pshufb is a SSSE3 insn. Can't use it. + +echo \ +"### Generated by hash_sha1_x86-64.S.sh ### + +#if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) +#ifdef __linux__ + .section .note.GNU-stack, \"\", @progbits +#endif + .section .text.sha1_process_block64, \"ax\", @progbits + .globl sha1_process_block64 + .hidden sha1_process_block64 + .type sha1_process_block64, @function + + .balign 8 # allow decoders to fetch at least 5 first insns +sha1_process_block64: + pushq %rbp # 1 byte insn + pushq %rbx # 1 byte insn +# pushq %r15 # 2 byte insn + pushq %r14 # 2 byte insn + pushq %r13 # 2 byte insn + pushq %r12 # 2 byte insn + pushq %rdi # we need ctx at the end + +#Register and stack use: +# eax..edx: a..d +# ebp: e +# esi,edi,r8..r14: temps +# r15: unused +# xmm0..xmm3: W[] +# xmm4,xmm5: temps +# xmm6: current round constant +# xmm7: all round constants +# -64(%rsp): area for passing RCONST + W[] from vector to integer units + + movl 80(%rdi), %eax # a = ctx->hash[0] + movl 84(%rdi), %ebx # b = ctx->hash[1] + movl 88(%rdi), %ecx # c = ctx->hash[2] + movl 92(%rdi), %edx # d = ctx->hash[3] + movl 96(%rdi), %ebp # e = ctx->hash[4] + + movaps sha1const(%rip), $xmmALLRCONST + pshufd \$0x00, $xmmALLRCONST, $xmmRCONST + + # Load W[] to xmm0..3, byteswapping on the fly. + # + # For iterations 0..15, we pass W[] in rsi,r8..r14 + # for use in RD1As instead of spilling them to stack. + # We lose parallelized addition of RCONST, but LEA + # can do two additions at once, so it is probably a wash. + # (We use rsi instead of rN because this makes two + # LEAs in two first RD1As shorter by one byte). 
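	# (for the first pair below: after bswapq, %rsi holds W[0] in its high
	# dword and W[1] in its low dword; rolq \$32 swaps the halves so the
	# following movq puts W[0] in xmm0 lane 0 and W[1] in lane 1)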
+ movq 4*0(%rdi), %rsi + movq 4*2(%rdi), %r8 + bswapq %rsi + bswapq %r8 + rolq \$32, %rsi # rsi = W[1]:W[0] + rolq \$32, %r8 # r8 = W[3]:W[2] + movq %rsi, %xmm0 + movq %r8, $xmmT1 + punpcklqdq $xmmT1, %xmm0 # xmm0 = r8:rsi = (W[0],W[1],W[2],W[3]) +# movaps %xmm0, $xmmT1 # add RCONST, spill to stack +# paddd $xmmRCONST, $xmmT1 +# movups $xmmT1, -64+16*0(%rsp) + + movq 4*4(%rdi), %r9 + movq 4*6(%rdi), %r10 + bswapq %r9 + bswapq %r10 + rolq \$32, %r9 # r9 = W[5]:W[4] + rolq \$32, %r10 # r10 = W[7]:W[6] + movq %r9, %xmm1 + movq %r10, $xmmT1 + punpcklqdq $xmmT1, %xmm1 # xmm1 = r10:r9 = (W[4],W[5],W[6],W[7]) + + movq 4*8(%rdi), %r11 + movq 4*10(%rdi), %r12 + bswapq %r11 + bswapq %r12 + rolq \$32, %r11 # r11 = W[9]:W[8] + rolq \$32, %r12 # r12 = W[11]:W[10] + movq %r11, %xmm2 + movq %r12, $xmmT1 + punpcklqdq $xmmT1, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11]) + + movq 4*12(%rdi), %r13 + movq 4*14(%rdi), %r14 + bswapq %r13 + bswapq %r14 + rolq \$32, %r13 # r13 = W[13]:W[12] + rolq \$32, %r14 # r14 = W[15]:W[14] + movq %r13, %xmm3 + movq %r14, $xmmT1 + punpcklqdq $xmmT1, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15]) +" + +PREP() { +local xmmW0=$1 +local xmmW4=$2 +local xmmW8=$3 +local xmmW12=$4 +# the above must be %xmm0..3 in some permutation +local dstmem=$5 +#W[0] = rol(W[13] ^ W[8] ^ W[2] ^ W[0], 1); +#W[1] = rol(W[14] ^ W[9] ^ W[3] ^ W[1], 1); +#W[2] = rol(W[15] ^ W[10] ^ W[4] ^ W[2], 1); +#W[3] = rol( 0 ^ W[11] ^ W[5] ^ W[3], 1); +#W[3] ^= rol(W[0], 1); +echo "# PREP $@ + movaps $xmmW12, $xmmT1 + psrldq \$4, $xmmT1 # rshift by 4 bytes: T1 = ([13],[14],[15],0) + +# pshufd \$0x4e, $xmmW0, $xmmT2 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq $xmmW4, $xmmT2 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! + movaps $xmmW0, $xmmT2 + shufps \$0x4e, $xmmW4, $xmmT2 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + + xorps $xmmW8, $xmmW0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps $xmmT1, $xmmT2 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps $xmmT2, $xmmW0 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps $xmmW0, $xmmT2 + + xorps $xmmT1, $xmmT1 # rol(W0,1): + pcmpgtd $xmmW0, $xmmT1 # ffffffff for elements <0 (ones with msb bit 1) + paddd $xmmW0, $xmmW0 # shift left by 1 + psubd $xmmT1, $xmmW0 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + + pslldq \$12, $xmmT2 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps $xmmT2, $xmmT1 + pslld \$2, $xmmT2 + psrld \$30, $xmmT1 +# xorps $xmmT1, $xmmT2 # rol((0,0,0,unrotW[0]),2) + xorps $xmmT1, $xmmW0 # same result, but does not depend on/does not modify T2 + + xorps $xmmT2, $xmmW0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) +" +# movq $xmmW0, %r8 # high latency (~6 cycles) +# movaps $xmmW0, $xmmT1 +# psrldq \$8, $xmmT1 # rshift by 8 bytes: move upper 64 bits to lower +# movq $xmmT1, %r10 # high latency +# movq %r8, %r9 +# movq %r10, %r11 +# shrq \$32, %r9 +# shrq \$32, %r11 +# ^^^ slower than passing the results on stack (!!!) 
+echo " + movaps $xmmW0, $xmmT2 + paddd $xmmRCONST, $xmmT2 + movups $xmmT2, $dstmem +" +} + +# It's possible to interleave integer insns in rounds to mostly eliminate +# dependency chains, but this likely to only help old Pentium-based +# CPUs (ones without OOO, which can only simultaneously execute a pair +# of _adjacent_ insns). +# Testing on old-ish Silvermont CPU (which has OOO window of only +# about ~8 insns) shows very small (~1%) speedup. + +RD1A() { +local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 +local n=$(($6)) +local n0=$(((n+0) & 15)) +local rN=$((7+n0/2)) +echo " +# $n +";test $n0 = 0 && echo " + leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] + shrq \$32, %rsi +";test $n0 = 1 && echo " + leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] +";test $n0 -ge 2 && test $((n0 & 1)) = 0 && echo " + leal $RCONST(%r$e,%r$rN), %e$e # e += RCONST + W[n] + shrq \$32, %r$rN +";test $n0 -ge 2 && test $((n0 & 1)) = 1 && echo " + leal $RCONST(%r$e,%r$rN), %e$e # e += RCONST + W[n] +";echo " + movl %e$c, %edi # c + xorl %e$d, %edi # ^d + andl %e$b, %edi # &b + xorl %e$d, %edi # (((c ^ d) & b) ^ d) + addl %edi, %e$e # e += (((c ^ d) & b) ^ d) + movl %e$a, %edi # + roll \$5, %edi # rotl32(a,5) + addl %edi, %e$e # e += rotl32(a,5) + rorl \$2, %e$b # b = rotl32(b,30) +" +} +RD1B() { +local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 +local n=$(($6)) +local n13=$(((n+13) & 15)) +local n8=$(((n+8) & 15)) +local n2=$(((n+2) & 15)) +local n0=$(((n+0) & 15)) +echo " +# $n + movl %e$c, %edi # c + xorl %e$d, %edi # ^d + andl %e$b, %edi # &b + xorl %e$d, %edi # (((c ^ d) & b) ^ d) + addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n & 15] + addl %edi, %e$e # e += (((c ^ d) & b) ^ d) + movl %e$a, %esi # + roll \$5, %esi # rotl32(a,5) + addl %esi, %e$e # e += rotl32(a,5) + rorl \$2, %e$b # b = rotl32(b,30) +" +} + +RD2() { +local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 +local n=$(($6)) +local n13=$(((n+13) & 15)) +local n8=$(((n+8) & 15)) +local n2=$(((n+2) & 15)) +local n0=$(((n+0) & 15)) +echo " +# $n + movl %e$c, %edi # c + xorl %e$d, %edi # ^d + xorl %e$b, %edi # ^b + addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n & 15] + addl %edi, %e$e # e += (c ^ d ^ b) + movl %e$a, %esi # + roll \$5, %esi # rotl32(a,5) + addl %esi, %e$e # e += rotl32(a,5) + rorl \$2, %e$b # b = rotl32(b,30) +" +} + +RD3() { +local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 +local n=$(($6)) +local n13=$(((n+13) & 15)) +local n8=$(((n+8) & 15)) +local n2=$(((n+2) & 15)) +local n0=$(((n+0) & 15)) +echo " +# $n + movl %e$b, %edi # di: b + movl %e$b, %esi # si: b + orl %e$c, %edi # di: b | c + andl %e$c, %esi # si: b & c + andl %e$d, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %e$e # += ((b | c) & d) | (b & c) + addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n & 15] + movl %e$a, %esi # + roll \$5, %esi # rotl32(a,5) + addl %esi, %e$e # e += rotl32(a,5) + rorl \$2, %e$b # b = rotl32(b,30) +" +} + +{ +# Round 1 +RCONST=0x5A827999 +RD1A ax bx cx dx bp 0; RD1A bp ax bx cx dx 1; RD1A dx bp ax bx cx 2; RD1A cx dx bp ax bx 3; +RD1A bx cx dx bp ax 4; RD1A ax bx cx dx bp 5; RD1A bp ax bx cx dx 6; RD1A dx bp ax bx cx 7; +a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` +b=`RD1A cx dx bp ax bx 8; RD1A bx cx dx bp ax 9; RD1A ax bx cx dx bp 10; RD1A bp ax bx cx dx 11;` +INTERLEAVE "$a" "$b" +a=`echo " pshufd \\$0x55, $xmmALLRCONST, $xmmRCONST" + PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` +b=`RD1A dx bp ax bx cx 12; RD1A cx dx bp ax bx 13; RD1A bx cx dx bp ax 14; RD1A ax bx cx dx bp 
15;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` +b=`RD1B bp ax bx cx dx 16; RD1B dx bp ax bx cx 17; RD1B cx dx bp ax bx 18; RD1B bx cx dx bp ax 19;` +INTERLEAVE "$a" "$b" + +# Round 2 +RCONST=0x6ED9EBA1 +a=`PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` +b=`RD2 ax bx cx dx bp 20; RD2 bp ax bx cx dx 21; RD2 dx bp ax bx cx 22; RD2 cx dx bp ax bx 23;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` +b=`RD2 bx cx dx bp ax 24; RD2 ax bx cx dx bp 25; RD2 bp ax bx cx dx 26; RD2 dx bp ax bx cx 27;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` +b=`RD2 cx dx bp ax bx 28; RD2 bx cx dx bp ax 29; RD2 ax bx cx dx bp 30; RD2 bp ax bx cx dx 31;` +INTERLEAVE "$a" "$b" +a=`echo " pshufd \\$0xaa, $xmmALLRCONST, $xmmRCONST" + PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` +b=`RD2 dx bp ax bx cx 32; RD2 cx dx bp ax bx 33; RD2 bx cx dx bp ax 34; RD2 ax bx cx dx bp 35;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` +b=`RD2 bp ax bx cx dx 36; RD2 dx bp ax bx cx 37; RD2 cx dx bp ax bx 38; RD2 bx cx dx bp ax 39;` +INTERLEAVE "$a" "$b" + +# Round 3 +RCONST=0x8F1BBCDC +a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` +b=`RD3 ax bx cx dx bp 40; RD3 bp ax bx cx dx 41; RD3 dx bp ax bx cx 42; RD3 cx dx bp ax bx 43;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` +b=`RD3 bx cx dx bp ax 44; RD3 ax bx cx dx bp 45; RD3 bp ax bx cx dx 46; RD3 dx bp ax bx cx 47;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` +b=`RD3 cx dx bp ax bx 48; RD3 bx cx dx bp ax 49; RD3 ax bx cx dx bp 50; RD3 bp ax bx cx dx 51;` +INTERLEAVE "$a" "$b" +a=`echo " pshufd \\$0xff, $xmmALLRCONST, $xmmRCONST" + PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` +b=`RD3 dx bp ax bx cx 52; RD3 cx dx bp ax bx 53; RD3 bx cx dx bp ax 54; RD3 ax bx cx dx bp 55;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` +b=`RD3 bp ax bx cx dx 56; RD3 dx bp ax bx cx 57; RD3 cx dx bp ax bx 58; RD3 bx cx dx bp ax 59;` +INTERLEAVE "$a" "$b" + +# Round 4 has the same logic as round 2, only n and RCONST are different +RCONST=0xCA62C1D6 +a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` +b=`RD2 ax bx cx dx bp 60; RD2 bp ax bx cx dx 61; RD2 dx bp ax bx cx 62; RD2 cx dx bp ax bx 63;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` +b=`RD2 bx cx dx bp ax 64; RD2 ax bx cx dx bp 65; RD2 bp ax bx cx dx 66; RD2 dx bp ax bx cx 67;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` +b=`RD2 cx dx bp ax bx 68; RD2 bx cx dx bp ax 69; RD2 ax bx cx dx bp 70; RD2 bp ax bx cx dx 71;` +INTERLEAVE "$a" "$b" +RD2 dx bp ax bx cx 72; RD2 cx dx bp ax bx 73; RD2 bx cx dx bp ax 74; RD2 ax bx cx dx bp 75; +RD2 bp ax bx cx dx 76; RD2 dx bp ax bx cx 77; RD2 cx dx bp ax bx 78; RD2 bx cx dx bp ax 79; +} | grep -v '^$' + +echo " + popq %rdi # + popq %r12 # + addl %eax, 80(%rdi) # ctx->hash[0] += a + popq %r13 # + addl %ebx, 84(%rdi) # ctx->hash[1] += b + popq %r14 # + addl %ecx, 88(%rdi) # ctx->hash[2] += c +# popq %r15 # + addl %edx, 92(%rdi) # ctx->hash[3] += d + popq %rbx # + addl %ebp, 96(%rdi) # ctx->hash[4] += e + popq %rbp # + + ret + .size sha1_process_block64, .-sha1_process_block64 + + .section .rodata.cst16.sha1const, \"aM\", @progbits, 16 + .balign 16 +sha1const: + .long 0x5A827999 + .long 0x6ED9EBA1 + .long 0x8F1BBCDC + .long 0xCA62C1D6 + +#endif" diff --git a/libbb/hash_sha256_hwaccel_x86-32.S b/libbb/hash_sha256_hwaccel_x86-32.S new file mode 100644 index 
000000000..a0e4a571a --- /dev/null +++ b/libbb/hash_sha256_hwaccel_x86-32.S @@ -0,0 +1,284 @@ +#if ENABLE_SHA256_HWACCEL && defined(__GNUC__) && defined(__i386__) +/* The code is adapted from Linux kernel's source */ + +// We use shorter insns, even though they are for "wrong" +// data type (fp, not int). +// For Intel, there is no penalty for doing it at all +// (CPUs which do have such penalty do not support SHA insns). +// For AMD, the penalty is one extra cycle +// (allegedly: I failed to find measurable difference). + +//#define mova128 movdqa +#define mova128 movaps +//#define movu128 movdqu +#define movu128 movups +//#define shuf128_32 pshufd +#define shuf128_32 shufps + +// pshufb and palignr are SSSE3 insns. +// We do not check SSSE3 in cpuid, +// all SHA-capable CPUs support it as well. + +#ifdef __linux__ + .section .note.GNU-stack, "", @progbits +#endif + .section .text.sha256_process_block64_shaNI, "ax", @progbits + .globl sha256_process_block64_shaNI + .hidden sha256_process_block64_shaNI + .type sha256_process_block64_shaNI, @function + +#define DATA_PTR %eax + +#define SHA256CONSTANTS %ecx + +#define MSG %xmm0 +#define STATE0 %xmm1 +#define STATE1 %xmm2 +#define MSGTMP0 %xmm3 +#define MSGTMP1 %xmm4 +#define MSGTMP2 %xmm5 +#define MSGTMP3 %xmm6 + +#define XMMTMP %xmm7 + +#define SHUF(a,b,c,d) $(a+(b<<2)+(c<<4)+(d<<6)) + + .balign 8 # allow decoders to fetch at least 2 first insns +sha256_process_block64_shaNI: + + movu128 76+0*16(%eax), XMMTMP /* ABCD (little-endian dword order) */ + movu128 76+1*16(%eax), STATE1 /* EFGH */ +/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ + mova128 STATE1, STATE0 + /* --- -------------- ABCD -- EFGH */ + shufps SHUF(1,0,1,0), XMMTMP, STATE0 /* FEBA */ + shufps SHUF(3,2,3,2), XMMTMP, STATE1 /* HGDC */ + +/* XMMTMP holds flip mask from here... 
*/ + mova128 PSHUFFLE_BSWAP32_FLIP_MASK, XMMTMP + movl $K256+8*16, SHA256CONSTANTS + + /* Rounds 0-3 */ + movu128 0*16(DATA_PTR), MSG + pshufb XMMTMP, MSG + mova128 MSG, MSGTMP0 + paddd 0*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + + /* Rounds 4-7 */ + movu128 1*16(DATA_PTR), MSG + pshufb XMMTMP, MSG + mova128 MSG, MSGTMP1 + paddd 1*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 8-11 */ + movu128 2*16(DATA_PTR), MSG + pshufb XMMTMP, MSG + mova128 MSG, MSGTMP2 + paddd 2*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 12-15 */ + movu128 3*16(DATA_PTR), MSG + pshufb XMMTMP, MSG +/* ...to here */ + mova128 MSG, MSGTMP3 + paddd 3*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP3, XMMTMP + palignr $4, MSGTMP2, XMMTMP + paddd XMMTMP, MSGTMP0 + sha256msg2 MSGTMP3, MSGTMP0 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 16-19 */ + mova128 MSGTMP0, MSG + paddd 4*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP0, XMMTMP + palignr $4, MSGTMP3, XMMTMP + paddd XMMTMP, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 20-23 */ + mova128 MSGTMP1, MSG + paddd 5*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP1, XMMTMP + palignr $4, MSGTMP0, XMMTMP + paddd XMMTMP, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 24-27 */ + mova128 MSGTMP2, MSG + paddd 6*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP2, XMMTMP + palignr $4, MSGTMP1, XMMTMP + paddd XMMTMP, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 28-31 */ + mova128 MSGTMP3, MSG + paddd 7*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP3, XMMTMP + palignr $4, MSGTMP2, XMMTMP + paddd XMMTMP, MSGTMP0 + sha256msg2 MSGTMP3, MSGTMP0 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 32-35 */ + mova128 MSGTMP0, MSG + paddd 8*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP0, XMMTMP + palignr $4, MSGTMP3, XMMTMP + paddd XMMTMP, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 36-39 */ + mova128 MSGTMP1, MSG + paddd 9*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP1, XMMTMP + palignr $4, MSGTMP0, XMMTMP + paddd XMMTMP, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 40-43 */ + mova128 MSGTMP2, MSG + paddd 10*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP2, XMMTMP + palignr $4, MSGTMP1, XMMTMP + paddd XMMTMP, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 44-47 */ + mova128 MSGTMP3, MSG + paddd 
11*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP3, XMMTMP + palignr $4, MSGTMP2, XMMTMP + paddd XMMTMP, MSGTMP0 + sha256msg2 MSGTMP3, MSGTMP0 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 48-51 */ + mova128 MSGTMP0, MSG + paddd 12*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP0, XMMTMP + palignr $4, MSGTMP3, XMMTMP + paddd XMMTMP, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 52-55 */ + mova128 MSGTMP1, MSG + paddd 13*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP1, XMMTMP + palignr $4, MSGTMP0, XMMTMP + paddd XMMTMP, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + + /* Rounds 56-59 */ + mova128 MSGTMP2, MSG + paddd 14*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP2, XMMTMP + palignr $4, MSGTMP1, XMMTMP + paddd XMMTMP, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + + /* Rounds 60-63 */ + mova128 MSGTMP3, MSG + paddd 15*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + + /* Write hash values back in the correct order */ + mova128 STATE0, XMMTMP +/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ + /* --- -------------- HGDC -- FEBA */ + shufps SHUF(3,2,3,2), STATE1, STATE0 /* ABCD */ + shufps SHUF(1,0,1,0), STATE1, XMMTMP /* EFGH */ + /* add current hash values to previous ones */ + movu128 76+1*16(%eax), STATE1 + paddd XMMTMP, STATE1 + movu128 STATE1, 76+1*16(%eax) + movu128 76+0*16(%eax), XMMTMP + paddd XMMTMP, STATE0 + movu128 STATE0, 76+0*16(%eax) + + ret + .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI + + .section .rodata.cst256.K256, "aM", @progbits, 256 + .balign 16 +K256: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + + .section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16 + .balign 16 +PSHUFFLE_BSWAP32_FLIP_MASK: + .octa 0x0c0d0e0f08090a0b0405060700010203 + +#endif diff --git a/libbb/hash_sha256_hwaccel_x86-64.S b/libbb/hash_sha256_hwaccel_x86-64.S new file mode 100644 index 000000000..172c2eae2 --- /dev/null +++ b/libbb/hash_sha256_hwaccel_x86-64.S @@ -0,0 +1,290 @@ +#if ENABLE_SHA256_HWACCEL && defined(__GNUC__) && defined(__x86_64__) +/* The code is adapted from Linux kernel's source */ + +// We use shorter insns, even though they are for "wrong" +// data type (fp, not int). 
+// For Intel, there is no penalty for doing it at all +// (CPUs which do have such penalty do not support SHA insns). +// For AMD, the penalty is one extra cycle +// (allegedly: I failed to find measurable difference). + +//#define mova128 movdqa +#define mova128 movaps +//#define movu128 movdqu +#define movu128 movups +//#define shuf128_32 pshufd +#define shuf128_32 shufps + +// pshufb and palignr are SSSE3 insns. +// We do not check SSSE3 in cpuid, +// all SHA-capable CPUs support it as well. + +#ifdef __linux__ + .section .note.GNU-stack, "", @progbits +#endif + .section .text.sha256_process_block64_shaNI, "ax", @progbits + .globl sha256_process_block64_shaNI + .hidden sha256_process_block64_shaNI + .type sha256_process_block64_shaNI, @function + +#define DATA_PTR %rdi + +#define SHA256CONSTANTS %rax + +#define MSG %xmm0 +#define STATE0 %xmm1 +#define STATE1 %xmm2 +#define MSGTMP0 %xmm3 +#define MSGTMP1 %xmm4 +#define MSGTMP2 %xmm5 +#define MSGTMP3 %xmm6 + +#define XMMTMP %xmm7 + +#define SAVE0 %xmm8 +#define SAVE1 %xmm9 + +#define SHUF(a,b,c,d) $(a+(b<<2)+(c<<4)+(d<<6)) + + .balign 8 # allow decoders to fetch at least 2 first insns +sha256_process_block64_shaNI: + + movu128 80+0*16(%rdi), XMMTMP /* ABCD (little-endian dword order) */ + movu128 80+1*16(%rdi), STATE1 /* EFGH */ +/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ + mova128 STATE1, STATE0 + /* --- -------------- ABCD -- EFGH */ + shufps SHUF(1,0,1,0), XMMTMP, STATE0 /* FEBA */ + shufps SHUF(3,2,3,2), XMMTMP, STATE1 /* HGDC */ + +/* XMMTMP holds flip mask from here... */ + mova128 PSHUFFLE_BSWAP32_FLIP_MASK(%rip), XMMTMP + leaq K256+8*16(%rip), SHA256CONSTANTS + + /* Save hash values for addition after rounds */ + mova128 STATE0, SAVE0 + mova128 STATE1, SAVE1 + + /* Rounds 0-3 */ + movu128 0*16(DATA_PTR), MSG + pshufb XMMTMP, MSG + mova128 MSG, MSGTMP0 + paddd 0*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + + /* Rounds 4-7 */ + movu128 1*16(DATA_PTR), MSG + pshufb XMMTMP, MSG + mova128 MSG, MSGTMP1 + paddd 1*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 8-11 */ + movu128 2*16(DATA_PTR), MSG + pshufb XMMTMP, MSG + mova128 MSG, MSGTMP2 + paddd 2*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 12-15 */ + movu128 3*16(DATA_PTR), MSG + pshufb XMMTMP, MSG +/* ...to here */ + mova128 MSG, MSGTMP3 + paddd 3*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP3, XMMTMP + palignr $4, MSGTMP2, XMMTMP + paddd XMMTMP, MSGTMP0 + sha256msg2 MSGTMP3, MSGTMP0 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 16-19 */ + mova128 MSGTMP0, MSG + paddd 4*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP0, XMMTMP + palignr $4, MSGTMP3, XMMTMP + paddd XMMTMP, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 20-23 */ + mova128 MSGTMP1, MSG + paddd 5*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP1, XMMTMP + palignr $4, MSGTMP0, XMMTMP + paddd XMMTMP, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + 
sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 24-27 */ + mova128 MSGTMP2, MSG + paddd 6*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP2, XMMTMP + palignr $4, MSGTMP1, XMMTMP + paddd XMMTMP, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 28-31 */ + mova128 MSGTMP3, MSG + paddd 7*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP3, XMMTMP + palignr $4, MSGTMP2, XMMTMP + paddd XMMTMP, MSGTMP0 + sha256msg2 MSGTMP3, MSGTMP0 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 32-35 */ + mova128 MSGTMP0, MSG + paddd 8*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP0, XMMTMP + palignr $4, MSGTMP3, XMMTMP + paddd XMMTMP, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 36-39 */ + mova128 MSGTMP1, MSG + paddd 9*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP1, XMMTMP + palignr $4, MSGTMP0, XMMTMP + paddd XMMTMP, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 40-43 */ + mova128 MSGTMP2, MSG + paddd 10*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP2, XMMTMP + palignr $4, MSGTMP1, XMMTMP + paddd XMMTMP, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 44-47 */ + mova128 MSGTMP3, MSG + paddd 11*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP3, XMMTMP + palignr $4, MSGTMP2, XMMTMP + paddd XMMTMP, MSGTMP0 + sha256msg2 MSGTMP3, MSGTMP0 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 48-51 */ + mova128 MSGTMP0, MSG + paddd 12*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP0, XMMTMP + palignr $4, MSGTMP3, XMMTMP + paddd XMMTMP, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 52-55 */ + mova128 MSGTMP1, MSG + paddd 13*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP1, XMMTMP + palignr $4, MSGTMP0, XMMTMP + paddd XMMTMP, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + + /* Rounds 56-59 */ + mova128 MSGTMP2, MSG + paddd 14*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP2, XMMTMP + palignr $4, MSGTMP1, XMMTMP + paddd XMMTMP, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + + /* Rounds 60-63 */ + mova128 MSGTMP3, MSG + paddd 15*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + + /* Add current hash values with previously saved */ + paddd SAVE0, STATE0 + paddd SAVE1, STATE1 + + /* Write hash values back in the correct order */ + mova128 STATE0, XMMTMP +/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ + /* --- -------------- HGDC -- FEBA */ + shufps SHUF(3,2,3,2), STATE1, STATE0 /* ABCD */ + shufps SHUF(1,0,1,0), STATE1, XMMTMP /* EFGH */ + movu128 STATE0, 80+0*16(%rdi) + movu128 XMMTMP, 80+1*16(%rdi) + + ret + .size 
sha256_process_block64_shaNI, .-sha256_process_block64_shaNI + + .section .rodata.cst256.K256, "aM", @progbits, 256 + .balign 16 +K256: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + + .section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16 + .balign 16 +PSHUFFLE_BSWAP32_FLIP_MASK: + .octa 0x0c0d0e0f08090a0b0405060700010203 + +#endif -- cgit v1.2.3-55-g6feb
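As a reading aid for the generated SHA-1 rounds above: the PREP blocks compute
the W[] recurrence four lanes at a time, RD1A/RD1B emit rounds 0..19, RD2 emits
rounds 20..39 and 60..79, and RD3 emits rounds 40..59. The minimal scalar C
sketch below shows the same computation in one place; it is illustration only
(function and variable names are not busybox's), and it assumes the 16 message
words were already byteswapped to host order, as the assembly does on load.

#include <stdint.h>

static inline uint32_t rotl32(uint32_t x, unsigned n)
{
	return (x << n) | (x >> (32 - n));
}

/* Illustrative scalar equivalent of sha1_process_block64:
 * 'hash' is the five-word state, 'W_in' the 16 message words
 * already converted to host byte order.
 */
static void sha1_block_sketch(uint32_t hash[5], const uint32_t W_in[16])
{
	static const uint32_t K[4] = {
		0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
	};
	uint32_t W[16];
	uint32_t a = hash[0], b = hash[1], c = hash[2], d = hash[3], e = hash[4];
	int n;

	for (n = 0; n < 16; n++)
		W[n] = W_in[n];

	for (n = 0; n < 80; n++) {
		uint32_t f, t;

		if (n >= 16) {
			/* the recurrence that PREP vectorizes, four W[] lanes at once */
			t = W[(n + 13) & 15] ^ W[(n + 8) & 15]
			  ^ W[(n + 2) & 15] ^ W[n & 15];
			W[n & 15] = rotl32(t, 1);
		}
		if (n < 20)
			f = ((c ^ d) & b) ^ d;          /* RD1A / RD1B */
		else if (n < 40 || n >= 60)
			f = c ^ d ^ b;                  /* RD2 (rounds 2 and 4) */
		else
			f = ((b | c) & d) | (b & c);    /* RD3 */
		t = rotl32(a, 5) + f + e + K[n / 20] + W[n & 15];
		e = d;
		d = c;
		c = rotl32(b, 30);  /* the "rorl $2" in the generated code */
		b = a;
		a = t;
	}
	hash[0] += a;
	hash[1] += b;
	hash[2] += c;
	hash[3] += d;
	hash[4] += e;
}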