diff options
| author | Denys Vlasenko <vda.linux@googlemail.com> | 2022-01-07 00:43:59 +0100 |
|---|---|---|
| committer | Denys Vlasenko <vda.linux@googlemail.com> | 2022-01-07 00:43:59 +0100 |
| commit | 711e20ecb85d13f98ba3e2bdcb344ee7534829c4 (patch) | |
| tree | 170fa55e39133e3ba7182fa56d1643e25b55010a /libbb | |
| parent | a93668cc4277b14eaff07fcfdef9693c990ec824 (diff) | |
| download | busybox-w32-711e20ecb85d13f98ba3e2bdcb344ee7534829c4.tar.gz busybox-w32-711e20ecb85d13f98ba3e2bdcb344ee7534829c4.tar.bz2 busybox-w32-711e20ecb85d13f98ba3e2bdcb344ee7534829c4.zip | |
libbb/sha1: optional x86-64 hardware-accelerated hashing
function old new delta
sha1_process_block64_shaNI - 510 +510
sha1_begin 52 107 +55
.rodata 108285 108301 +16
static.shaNI - 1 +1
------------------------------------------------------------------------------
(add/remove: 4/0 grow/shrink: 2/0 up/down: 582/0) Total: 582 bytes
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
Diffstat (limited to 'libbb')
| -rw-r--r-- | libbb/Config.src | 7 | ||||
| -rw-r--r-- | libbb/Kbuild.src | 1 | ||||
| -rw-r--r-- | libbb/hash_md5_sha.c | 38 | ||||
| -rw-r--r-- | libbb/hash_md5_sha_x86-64_shaNI.S | 225 |
4 files changed, 270 insertions, 1 deletion
diff --git a/libbb/Config.src b/libbb/Config.src index c80bee286..708d3b0c8 100644 --- a/libbb/Config.src +++ b/libbb/Config.src | |||
| @@ -63,6 +63,13 @@ config SHA1_SMALL | |||
| 63 | 1 224 229 654 732 | 63 | 1 224 229 654 732 |
| 64 | 2,3 200 195 358 380 | 64 | 2,3 200 195 358 380 |
| 65 | 65 | ||
| 66 | config SHA1_HWACCEL | ||
| 67 | bool "SHA1: Use hardware accelerated instructions if possible" | ||
| 68 | default y | ||
| 69 | help | ||
| 70 | On x86, this adds ~590 bytes of code. Throughput | ||
| 71 | is about twice as fast as fully-unrolled generic code. | ||
| 72 | |||
| 66 | config SHA3_SMALL | 73 | config SHA3_SMALL |
| 67 | int "SHA3: Trade bytes for speed (0:fast, 1:slow)" | 74 | int "SHA3: Trade bytes for speed (0:fast, 1:slow)" |
| 68 | default 1 # all "fast or small" options default to small | 75 | default 1 # all "fast or small" options default to small |
diff --git a/libbb/Kbuild.src b/libbb/Kbuild.src index 19b8aad60..a3db02b6f 100644 --- a/libbb/Kbuild.src +++ b/libbb/Kbuild.src | |||
| @@ -57,6 +57,7 @@ lib-y += make_directory.o | |||
| 57 | lib-y += makedev.o | 57 | lib-y += makedev.o |
| 58 | lib-y += hash_md5_sha.o | 58 | lib-y += hash_md5_sha.o |
| 59 | lib-y += hash_md5_sha_x86-64.o | 59 | lib-y += hash_md5_sha_x86-64.o |
| 60 | lib-y += hash_md5_sha_x86-64_shaNI.o | ||
| 60 | # Alternative (disabled) MD5 implementation | 61 | # Alternative (disabled) MD5 implementation |
| 61 | #lib-y += hash_md5prime.o | 62 | #lib-y += hash_md5prime.o |
| 62 | lib-y += messages.o | 63 | lib-y += messages.o |
diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c index ee19c1cb7..4c6904b48 100644 --- a/libbb/hash_md5_sha.c +++ b/libbb/hash_md5_sha.c | |||
| @@ -699,7 +699,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM) | |||
| 699 | 699 | ||
| 700 | /* in hash_md5_sha_x86-64.S */ | 700 | /* in hash_md5_sha_x86-64.S */ |
| 701 | struct ASM_expects_80 { char t[1 - 2*(offsetof(sha1_ctx_t, hash) != 80)]; }; | 701 | struct ASM_expects_80 { char t[1 - 2*(offsetof(sha1_ctx_t, hash) != 80)]; }; |
| 702 | void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM); | 702 | void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx); |
| 703 | 703 | ||
| 704 | # else | 704 | # else |
| 705 | /* Fast, fully-unrolled SHA1. +3800 bytes of code on x86. | 705 | /* Fast, fully-unrolled SHA1. +3800 bytes of code on x86. |
| @@ -1142,6 +1142,28 @@ static void FAST_FUNC sha512_process_block128(sha512_ctx_t *ctx) | |||
| 1142 | } | 1142 | } |
| 1143 | #endif /* NEED_SHA512 */ | 1143 | #endif /* NEED_SHA512 */ |
| 1144 | 1144 | ||
| 1145 | #if ENABLE_SHA1_HWACCEL | ||
| 1146 | # if defined(__GNUC__) && defined(__x86_64__) | ||
| 1147 | static void cpuid(unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) | ||
| 1148 | { | ||
| 1149 | asm ( | ||
| 1150 | "cpuid\n" | ||
| 1151 | : "=a"(*eax), /* Output */ | ||
| 1152 | "=b"(*ebx), | ||
| 1153 | "=c"(*ecx), | ||
| 1154 | "=d"(*edx) | ||
| 1155 | : "0"(*eax), /* Input */ | ||
| 1156 | "1"(*ebx), | ||
| 1157 | "2"(*ecx), | ||
| 1158 | "3"(*edx) | ||
| 1159 | /* No clobbered registers */ | ||
| 1160 | ); | ||
| 1161 | } | ||
| 1162 | struct ASM_expects_80_shaNI { char t[1 - 2*(offsetof(sha1_ctx_t, hash) != 80)]; }; | ||
| 1163 | void FAST_FUNC sha1_process_block64_shaNI(sha1_ctx_t *ctx); | ||
| 1164 | # endif | ||
| 1165 | #endif | ||
| 1166 | |||
| 1145 | void FAST_FUNC sha1_begin(sha1_ctx_t *ctx) | 1167 | void FAST_FUNC sha1_begin(sha1_ctx_t *ctx) |
| 1146 | { | 1168 | { |
| 1147 | ctx->hash[0] = 0x67452301; | 1169 | ctx->hash[0] = 0x67452301; |
| @@ -1151,6 +1173,20 @@ void FAST_FUNC sha1_begin(sha1_ctx_t *ctx) | |||
| 1151 | ctx->hash[4] = 0xc3d2e1f0; | 1173 | ctx->hash[4] = 0xc3d2e1f0; |
| 1152 | ctx->total64 = 0; | 1174 | ctx->total64 = 0; |
| 1153 | ctx->process_block = sha1_process_block64; | 1175 | ctx->process_block = sha1_process_block64; |
| 1176 | #if ENABLE_SHA1_HWACCEL | ||
| 1177 | # if defined(__GNUC__) && defined(__x86_64__) | ||
| 1178 | { | ||
| 1179 | static smallint shaNI; | ||
| 1180 | if (!shaNI) { | ||
| 1181 | unsigned eax = 7, ebx = ebx, ecx = 0, edx = edx; | ||
| 1182 | cpuid(&eax, &ebx, &ecx, &edx); | ||
| 1183 | shaNI = ((ebx >> 28) & 2) - 1; | ||
| 1184 | } | ||
| 1185 | if (shaNI > 0) | ||
| 1186 | ctx->process_block = sha1_process_block64_shaNI; | ||
| 1187 | } | ||
| 1188 | # endif | ||
| 1189 | #endif | ||
| 1154 | } | 1190 | } |
| 1155 | 1191 | ||
| 1156 | static const uint32_t init256[] ALIGN4 = { | 1192 | static const uint32_t init256[] ALIGN4 = { |
diff --git a/libbb/hash_md5_sha_x86-64_shaNI.S b/libbb/hash_md5_sha_x86-64_shaNI.S new file mode 100644 index 000000000..473b472f1 --- /dev/null +++ b/libbb/hash_md5_sha_x86-64_shaNI.S | |||
| @@ -0,0 +1,225 @@ | |||
| 1 | #if ENABLE_SHA1_HWACCEL && defined(__GNUC__) && defined(__x86_64__) | ||
| 2 | /* The code is adapted from Linux kernel's source */ | ||
| 3 | |||
| 4 | // We use shorter insns, even though they are for "wrong" | ||
| 5 | // data type (fp, not int). | ||
| 6 | // For Intel, there is no penalty for doing it at all | ||
| 7 | // (CPUs which do have such penalty do not support SHA1 insns). | ||
| 8 | // For AMD, the penalty is one extra cycle | ||
| 9 | // (allegedly: I failed to find measurable difference). | ||
| 10 | |||
| 11 | //#define mova128 movdqa | ||
| 12 | #define mova128 movaps | ||
| 13 | //#define movu128 movdqu | ||
| 14 | #define movu128 movups | ||
| 15 | //#define xor128 pxor | ||
| 16 | #define xor128 xorps | ||
| 17 | //#define shuf128_32 pshufd | ||
| 18 | #define shuf128_32 shufps | ||
| 19 | |||
| 20 | #define extr128_32 pextrd | ||
| 21 | //#define extr128_32 extractps # not shorter | ||
| 22 | |||
| 23 | .section .text.sha1_process_block64_shaNI,"ax",@progbits | ||
| 24 | .globl sha1_process_block64_shaNI | ||
| 25 | .hidden sha1_process_block64_shaNI | ||
| 26 | .type sha1_process_block64_shaNI, @function | ||
| 27 | |||
| 28 | #define ABCD %xmm0 | ||
| 29 | #define E0 %xmm1 /* Need two E's b/c they ping pong */ | ||
| 30 | #define E1 %xmm2 | ||
| 31 | #define MSG0 %xmm3 | ||
| 32 | #define MSG1 %xmm4 | ||
| 33 | #define MSG2 %xmm5 | ||
| 34 | #define MSG3 %xmm6 | ||
| 35 | #define SHUF_MASK %xmm7 | ||
| 36 | |||
| 37 | .balign 8 # allow decoders to fetch at least 2 first insns | ||
| 38 | sha1_process_block64_shaNI: | ||
| 39 | /* load initial hash values */ | ||
| 40 | |||
| 41 | xor128 E0, E0 | ||
| 42 | movu128 80(%rdi), ABCD | ||
| 43 | pinsrd $3, 80+4*4(%rdi), E0 # load to upper 32-bit word | ||
| 44 | shuf128_32 $0x1B, ABCD, ABCD # 00011011: bswap | ||
| 45 | |||
| 46 | mova128 PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK | ||
| 47 | |||
| 48 | /* Save hash values for addition after rounds */ | ||
| 49 | mova128 E0, %xmm9 | ||
| 50 | mova128 ABCD, %xmm8 | ||
| 51 | |||
| 52 | /* Rounds 0-3 */ | ||
| 53 | movu128 0*16(%rdi), MSG0 | ||
| 54 | pshufb SHUF_MASK, MSG0 | ||
| 55 | paddd MSG0, E0 | ||
| 56 | mova128 ABCD, E1 | ||
| 57 | sha1rnds4 $0, E0, ABCD | ||
| 58 | |||
| 59 | /* Rounds 4-7 */ | ||
| 60 | movu128 1*16(%rdi), MSG1 | ||
| 61 | pshufb SHUF_MASK, MSG1 | ||
| 62 | sha1nexte MSG1, E1 | ||
| 63 | mova128 ABCD, E0 | ||
| 64 | sha1rnds4 $0, E1, ABCD | ||
| 65 | sha1msg1 MSG1, MSG0 | ||
| 66 | |||
| 67 | /* Rounds 8-11 */ | ||
| 68 | movu128 2*16(%rdi), MSG2 | ||
| 69 | pshufb SHUF_MASK, MSG2 | ||
| 70 | sha1nexte MSG2, E0 | ||
| 71 | mova128 ABCD, E1 | ||
| 72 | sha1rnds4 $0, E0, ABCD | ||
| 73 | sha1msg1 MSG2, MSG1 | ||
| 74 | xor128 MSG2, MSG0 | ||
| 75 | |||
| 76 | /* Rounds 12-15 */ | ||
| 77 | movu128 3*16(%rdi), MSG3 | ||
| 78 | pshufb SHUF_MASK, MSG3 | ||
| 79 | sha1nexte MSG3, E1 | ||
| 80 | mova128 ABCD, E0 | ||
| 81 | sha1msg2 MSG3, MSG0 | ||
| 82 | sha1rnds4 $0, E1, ABCD | ||
| 83 | sha1msg1 MSG3, MSG2 | ||
| 84 | xor128 MSG3, MSG1 | ||
| 85 | |||
| 86 | /* Rounds 16-19 */ | ||
| 87 | sha1nexte MSG0, E0 | ||
| 88 | mova128 ABCD, E1 | ||
| 89 | sha1msg2 MSG0, MSG1 | ||
| 90 | sha1rnds4 $0, E0, ABCD | ||
| 91 | sha1msg1 MSG0, MSG3 | ||
| 92 | xor128 MSG0, MSG2 | ||
| 93 | |||
| 94 | /* Rounds 20-23 */ | ||
| 95 | sha1nexte MSG1, E1 | ||
| 96 | mova128 ABCD, E0 | ||
| 97 | sha1msg2 MSG1, MSG2 | ||
| 98 | sha1rnds4 $1, E1, ABCD | ||
| 99 | sha1msg1 MSG1, MSG0 | ||
| 100 | xor128 MSG1, MSG3 | ||
| 101 | |||
| 102 | /* Rounds 24-27 */ | ||
| 103 | sha1nexte MSG2, E0 | ||
| 104 | mova128 ABCD, E1 | ||
| 105 | sha1msg2 MSG2, MSG3 | ||
| 106 | sha1rnds4 $1, E0, ABCD | ||
| 107 | sha1msg1 MSG2, MSG1 | ||
| 108 | xor128 MSG2, MSG0 | ||
| 109 | |||
| 110 | /* Rounds 28-31 */ | ||
| 111 | sha1nexte MSG3, E1 | ||
| 112 | mova128 ABCD, E0 | ||
| 113 | sha1msg2 MSG3, MSG0 | ||
| 114 | sha1rnds4 $1, E1, ABCD | ||
| 115 | sha1msg1 MSG3, MSG2 | ||
| 116 | xor128 MSG3, MSG1 | ||
| 117 | |||
| 118 | /* Rounds 32-35 */ | ||
| 119 | sha1nexte MSG0, E0 | ||
| 120 | mova128 ABCD, E1 | ||
| 121 | sha1msg2 MSG0, MSG1 | ||
| 122 | sha1rnds4 $1, E0, ABCD | ||
| 123 | sha1msg1 MSG0, MSG3 | ||
| 124 | xor128 MSG0, MSG2 | ||
| 125 | |||
| 126 | /* Rounds 36-39 */ | ||
| 127 | sha1nexte MSG1, E1 | ||
| 128 | mova128 ABCD, E0 | ||
| 129 | sha1msg2 MSG1, MSG2 | ||
| 130 | sha1rnds4 $1, E1, ABCD | ||
| 131 | sha1msg1 MSG1, MSG0 | ||
| 132 | xor128 MSG1, MSG3 | ||
| 133 | |||
| 134 | /* Rounds 40-43 */ | ||
| 135 | sha1nexte MSG2, E0 | ||
| 136 | mova128 ABCD, E1 | ||
| 137 | sha1msg2 MSG2, MSG3 | ||
| 138 | sha1rnds4 $2, E0, ABCD | ||
| 139 | sha1msg1 MSG2, MSG1 | ||
| 140 | xor128 MSG2, MSG0 | ||
| 141 | |||
| 142 | /* Rounds 44-47 */ | ||
| 143 | sha1nexte MSG3, E1 | ||
| 144 | mova128 ABCD, E0 | ||
| 145 | sha1msg2 MSG3, MSG0 | ||
| 146 | sha1rnds4 $2, E1, ABCD | ||
| 147 | sha1msg1 MSG3, MSG2 | ||
| 148 | xor128 MSG3, MSG1 | ||
| 149 | |||
| 150 | /* Rounds 48-51 */ | ||
| 151 | sha1nexte MSG0, E0 | ||
| 152 | mova128 ABCD, E1 | ||
| 153 | sha1msg2 MSG0, MSG1 | ||
| 154 | sha1rnds4 $2, E0, ABCD | ||
| 155 | sha1msg1 MSG0, MSG3 | ||
| 156 | xor128 MSG0, MSG2 | ||
| 157 | |||
| 158 | /* Rounds 52-55 */ | ||
| 159 | sha1nexte MSG1, E1 | ||
| 160 | mova128 ABCD, E0 | ||
| 161 | sha1msg2 MSG1, MSG2 | ||
| 162 | sha1rnds4 $2, E1, ABCD | ||
| 163 | sha1msg1 MSG1, MSG0 | ||
| 164 | xor128 MSG1, MSG3 | ||
| 165 | |||
| 166 | /* Rounds 56-59 */ | ||
| 167 | sha1nexte MSG2, E0 | ||
| 168 | mova128 ABCD, E1 | ||
| 169 | sha1msg2 MSG2, MSG3 | ||
| 170 | sha1rnds4 $2, E0, ABCD | ||
| 171 | sha1msg1 MSG2, MSG1 | ||
| 172 | xor128 MSG2, MSG0 | ||
| 173 | |||
| 174 | /* Rounds 60-63 */ | ||
| 175 | sha1nexte MSG3, E1 | ||
| 176 | mova128 ABCD, E0 | ||
| 177 | sha1msg2 MSG3, MSG0 | ||
| 178 | sha1rnds4 $3, E1, ABCD | ||
| 179 | sha1msg1 MSG3, MSG2 | ||
| 180 | xor128 MSG3, MSG1 | ||
| 181 | |||
| 182 | /* Rounds 64-67 */ | ||
| 183 | sha1nexte MSG0, E0 | ||
| 184 | mova128 ABCD, E1 | ||
| 185 | sha1msg2 MSG0, MSG1 | ||
| 186 | sha1rnds4 $3, E0, ABCD | ||
| 187 | sha1msg1 MSG0, MSG3 | ||
| 188 | xor128 MSG0, MSG2 | ||
| 189 | |||
| 190 | /* Rounds 68-71 */ | ||
| 191 | sha1nexte MSG1, E1 | ||
| 192 | mova128 ABCD, E0 | ||
| 193 | sha1msg2 MSG1, MSG2 | ||
| 194 | sha1rnds4 $3, E1, ABCD | ||
| 195 | xor128 MSG1, MSG3 | ||
| 196 | |||
| 197 | /* Rounds 72-75 */ | ||
| 198 | sha1nexte MSG2, E0 | ||
| 199 | mova128 ABCD, E1 | ||
| 200 | sha1msg2 MSG2, MSG3 | ||
| 201 | sha1rnds4 $3, E0, ABCD | ||
| 202 | |||
| 203 | /* Rounds 76-79 */ | ||
| 204 | sha1nexte MSG3, E1 | ||
| 205 | mova128 ABCD, E0 | ||
| 206 | sha1rnds4 $3, E1, ABCD | ||
| 207 | |||
| 208 | /* Add current hash values with previously saved */ | ||
| 209 | sha1nexte %xmm9, E0 | ||
| 210 | paddd %xmm8, ABCD | ||
| 211 | |||
| 212 | /* Write hash values back in the correct order */ | ||
| 213 | shuf128_32 $0x1B, ABCD, ABCD | ||
| 214 | movu128 ABCD, 80(%rdi) | ||
| 215 | extr128_32 $3, E0, 80+4*4(%rdi) | ||
| 216 | |||
| 217 | ret | ||
| 218 | .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI | ||
| 219 | |||
| 220 | .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 | ||
| 221 | .align 16 | ||
| 222 | PSHUFFLE_BYTE_FLIP_MASK: | ||
| 223 | .octa 0x000102030405060708090a0b0c0d0e0f | ||
| 224 | |||
| 225 | #endif | ||
