diff options
| author | Denys Vlasenko <vda.linux@googlemail.com> | 2022-01-07 01:32:13 +0100 |
|---|---|---|
| committer | Denys Vlasenko <vda.linux@googlemail.com> | 2022-01-07 01:32:13 +0100 |
| commit | a96ccbefe417aaac6a2ce59c788e01fc0f83902f (patch) | |
| tree | 4151347572dcbce6777ba66c40cf0bc0135ee491 | |
| parent | 711e20ecb85d13f98ba3e2bdcb344ee7534829c4 (diff) | |
| download | busybox-w32-a96ccbefe417aaac6a2ce59c788e01fc0f83902f.tar.gz busybox-w32-a96ccbefe417aaac6a2ce59c788e01fc0f83902f.tar.bz2 busybox-w32-a96ccbefe417aaac6a2ce59c788e01fc0f83902f.zip | |
libbb/sha1: optional i686 hardware-accelerated hashing
function old new delta
sha1_process_block64_shaNI - 524 +524
sha1_begin 57 114 +57
.rodata 104353 104369 +16
static.shaNI - 1 +1
------------------------------------------------------------------------------
(add/remove: 4/0 grow/shrink: 2/0 up/down: 598/0) Total: 598 bytes
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
| -rw-r--r-- | libbb/Kbuild.src | 1 | ||||
| -rw-r--r-- | libbb/hash_md5_sha.c | 21 | ||||
| -rw-r--r-- | libbb/hash_md5_sha_x86-32_shaNI.S | 231 |
3 files changed, 252 insertions, 1 deletion
diff --git a/libbb/Kbuild.src b/libbb/Kbuild.src index a3db02b6f..e8bb24f6d 100644 --- a/libbb/Kbuild.src +++ b/libbb/Kbuild.src | |||
| @@ -58,6 +58,7 @@ lib-y += makedev.o | |||
| 58 | lib-y += hash_md5_sha.o | 58 | lib-y += hash_md5_sha.o |
| 59 | lib-y += hash_md5_sha_x86-64.o | 59 | lib-y += hash_md5_sha_x86-64.o |
| 60 | lib-y += hash_md5_sha_x86-64_shaNI.o | 60 | lib-y += hash_md5_sha_x86-64_shaNI.o |
| 61 | lib-y += hash_md5_sha_x86-32_shaNI.o | ||
| 61 | # Alternative (disabled) MD5 implementation | 62 | # Alternative (disabled) MD5 implementation |
| 62 | #lib-y += hash_md5prime.o | 63 | #lib-y += hash_md5prime.o |
| 63 | lib-y += messages.o | 64 | lib-y += messages.o |
diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c index 4c6904b48..0b3af723a 100644 --- a/libbb/hash_md5_sha.c +++ b/libbb/hash_md5_sha.c | |||
| @@ -1143,6 +1143,25 @@ static void FAST_FUNC sha512_process_block128(sha512_ctx_t *ctx) | |||
| 1143 | #endif /* NEED_SHA512 */ | 1143 | #endif /* NEED_SHA512 */ |
| 1144 | 1144 | ||
| 1145 | #if ENABLE_SHA1_HWACCEL | 1145 | #if ENABLE_SHA1_HWACCEL |
| 1146 | # if defined(__GNUC__) && defined(__i386__) | ||
| 1147 | static void cpuid(unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) | ||
| 1148 | { | ||
| 1149 | asm ( | ||
| 1150 | " cpuid\n" | ||
| 1151 | : "=a"(*eax), /* Output */ | ||
| 1152 | "=b"(*ebx), | ||
| 1153 | "=c"(*ecx), | ||
| 1154 | "=d"(*edx) | ||
| 1155 | : "0"(*eax), /* Input */ | ||
| 1156 | "1"(*ebx), | ||
| 1157 | "2"(*ecx), | ||
| 1158 | "3"(*edx) | ||
| 1159 | /* No clobbered registers */ | ||
| 1160 | ); | ||
| 1161 | } | ||
| 1162 | struct ASM_expects_76_shaNI { char t[1 - 2*(offsetof(sha1_ctx_t, hash) != 76)]; }; | ||
| 1163 | void FAST_FUNC sha1_process_block64_shaNI(sha1_ctx_t *ctx); | ||
| 1164 | # endif | ||
| 1146 | # if defined(__GNUC__) && defined(__x86_64__) | 1165 | # if defined(__GNUC__) && defined(__x86_64__) |
| 1147 | static void cpuid(unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) | 1166 | static void cpuid(unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) |
| 1148 | { | 1167 | { |
| @@ -1174,7 +1193,7 @@ void FAST_FUNC sha1_begin(sha1_ctx_t *ctx) | |||
| 1174 | ctx->total64 = 0; | 1193 | ctx->total64 = 0; |
| 1175 | ctx->process_block = sha1_process_block64; | 1194 | ctx->process_block = sha1_process_block64; |
| 1176 | #if ENABLE_SHA1_HWACCEL | 1195 | #if ENABLE_SHA1_HWACCEL |
| 1177 | # if defined(__GNUC__) && defined(__x86_64__) | 1196 | # if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) |
| 1178 | { | 1197 | { |
| 1179 | static smallint shaNI; | 1198 | static smallint shaNI; |
| 1180 | if (!shaNI) { | 1199 | if (!shaNI) { |
diff --git a/libbb/hash_md5_sha_x86-32_shaNI.S b/libbb/hash_md5_sha_x86-32_shaNI.S new file mode 100644 index 000000000..7202c7673 --- /dev/null +++ b/libbb/hash_md5_sha_x86-32_shaNI.S | |||
| @@ -0,0 +1,231 @@ | |||
| 1 | #if ENABLE_SHA1_HWACCEL && defined(__GNUC__) && defined(__i386__) | ||
| 2 | /* The code is adapted from Linux kernel's source */ | ||
| 3 | |||
| 4 | // We use shorter insns, even though they are for "wrong" | ||
| 5 | // data type (fp, not int). | ||
| 6 | // For Intel, there is no penalty for doing it at all | ||
| 7 | // (CPUs which do have such penalty do not support SHA1 insns). | ||
| 8 | // For AMD, the penalty is one extra cycle | ||
| 9 | // (allegedly: I failed to find measurable difference). | ||
| 10 | |||
| 11 | //#define mova128 movdqa | ||
| 12 | #define mova128 movaps | ||
| 13 | //#define movu128 movdqu | ||
| 14 | #define movu128 movups | ||
| 15 | //#define xor128 pxor | ||
| 16 | #define xor128 xorps | ||
| 17 | //#define shuf128_32 pshufd | ||
| 18 | #define shuf128_32 shufps | ||
| 19 | |||
| 20 | #define extr128_32 pextrd | ||
| 21 | //#define extr128_32 extractps # not shorter | ||
| 22 | |||
| 23 | .section .text.sha1_process_block64_shaNI,"ax",@progbits | ||
| 24 | .globl sha1_process_block64_shaNI | ||
| 25 | .hidden sha1_process_block64_shaNI | ||
| 26 | .type sha1_process_block64_shaNI, @function | ||
| 27 | |||
| 28 | #define ABCD %xmm0 | ||
| 29 | #define E0 %xmm1 /* Need two E's b/c they ping pong */ | ||
| 30 | #define E1 %xmm2 | ||
| 31 | #define MSG0 %xmm3 | ||
| 32 | #define MSG1 %xmm4 | ||
| 33 | #define MSG2 %xmm5 | ||
| 34 | #define MSG3 %xmm6 | ||
| 35 | #define SHUF_MASK %xmm7 | ||
| 36 | |||
| 37 | .balign 8 # allow decoders to fetch at least 2 first insns | ||
| 38 | sha1_process_block64_shaNI: | ||
| 39 | pushl %ebp | ||
| 40 | movl %esp, %ebp | ||
| 41 | subl $32, %esp | ||
| 42 | andl $~0xF, %esp # paddd needs aligned memory operand | ||
| 43 | |||
| 44 | /* load initial hash values */ | ||
| 45 | xor128 E0, E0 | ||
| 46 | movu128 76(%eax), ABCD | ||
| 47 | pinsrd $3, 76+4*4(%eax), E0 # load to upper 32-bit word | ||
| 48 | shuf128_32 $0x1B, ABCD, ABCD # 00011011: bswap | ||
| 49 | |||
| 50 | mova128 PSHUFFLE_BYTE_FLIP_MASK, SHUF_MASK | ||
| 51 | |||
| 52 | /* Save hash values for addition after rounds */ | ||
| 53 | movu128 E0, 16(%esp) | ||
| 54 | movu128 ABCD, (%esp) | ||
| 55 | |||
| 56 | /* Rounds 0-3 */ | ||
| 57 | movu128 0*16(%eax), MSG0 | ||
| 58 | pshufb SHUF_MASK, MSG0 | ||
| 59 | paddd MSG0, E0 | ||
| 60 | mova128 ABCD, E1 | ||
| 61 | sha1rnds4 $0, E0, ABCD | ||
| 62 | |||
| 63 | /* Rounds 4-7 */ | ||
| 64 | movu128 1*16(%eax), MSG1 | ||
| 65 | pshufb SHUF_MASK, MSG1 | ||
| 66 | sha1nexte MSG1, E1 | ||
| 67 | mova128 ABCD, E0 | ||
| 68 | sha1rnds4 $0, E1, ABCD | ||
| 69 | sha1msg1 MSG1, MSG0 | ||
| 70 | |||
| 71 | /* Rounds 8-11 */ | ||
| 72 | movu128 2*16(%eax), MSG2 | ||
| 73 | pshufb SHUF_MASK, MSG2 | ||
| 74 | sha1nexte MSG2, E0 | ||
| 75 | mova128 ABCD, E1 | ||
| 76 | sha1rnds4 $0, E0, ABCD | ||
| 77 | sha1msg1 MSG2, MSG1 | ||
| 78 | xor128 MSG2, MSG0 | ||
| 79 | |||
| 80 | /* Rounds 12-15 */ | ||
| 81 | movu128 3*16(%eax), MSG3 | ||
| 82 | pshufb SHUF_MASK, MSG3 | ||
| 83 | sha1nexte MSG3, E1 | ||
| 84 | mova128 ABCD, E0 | ||
| 85 | sha1msg2 MSG3, MSG0 | ||
| 86 | sha1rnds4 $0, E1, ABCD | ||
| 87 | sha1msg1 MSG3, MSG2 | ||
| 88 | xor128 MSG3, MSG1 | ||
| 89 | |||
| 90 | /* Rounds 16-19 */ | ||
| 91 | sha1nexte MSG0, E0 | ||
| 92 | mova128 ABCD, E1 | ||
| 93 | sha1msg2 MSG0, MSG1 | ||
| 94 | sha1rnds4 $0, E0, ABCD | ||
| 95 | sha1msg1 MSG0, MSG3 | ||
| 96 | xor128 MSG0, MSG2 | ||
| 97 | |||
| 98 | /* Rounds 20-23 */ | ||
| 99 | sha1nexte MSG1, E1 | ||
| 100 | mova128 ABCD, E0 | ||
| 101 | sha1msg2 MSG1, MSG2 | ||
| 102 | sha1rnds4 $1, E1, ABCD | ||
| 103 | sha1msg1 MSG1, MSG0 | ||
| 104 | xor128 MSG1, MSG3 | ||
| 105 | |||
| 106 | /* Rounds 24-27 */ | ||
| 107 | sha1nexte MSG2, E0 | ||
| 108 | mova128 ABCD, E1 | ||
| 109 | sha1msg2 MSG2, MSG3 | ||
| 110 | sha1rnds4 $1, E0, ABCD | ||
| 111 | sha1msg1 MSG2, MSG1 | ||
| 112 | xor128 MSG2, MSG0 | ||
| 113 | |||
| 114 | /* Rounds 28-31 */ | ||
| 115 | sha1nexte MSG3, E1 | ||
| 116 | mova128 ABCD, E0 | ||
| 117 | sha1msg2 MSG3, MSG0 | ||
| 118 | sha1rnds4 $1, E1, ABCD | ||
| 119 | sha1msg1 MSG3, MSG2 | ||
| 120 | xor128 MSG3, MSG1 | ||
| 121 | |||
| 122 | /* Rounds 32-35 */ | ||
| 123 | sha1nexte MSG0, E0 | ||
| 124 | mova128 ABCD, E1 | ||
| 125 | sha1msg2 MSG0, MSG1 | ||
| 126 | sha1rnds4 $1, E0, ABCD | ||
| 127 | sha1msg1 MSG0, MSG3 | ||
| 128 | xor128 MSG0, MSG2 | ||
| 129 | |||
| 130 | /* Rounds 36-39 */ | ||
| 131 | sha1nexte MSG1, E1 | ||
| 132 | mova128 ABCD, E0 | ||
| 133 | sha1msg2 MSG1, MSG2 | ||
| 134 | sha1rnds4 $1, E1, ABCD | ||
| 135 | sha1msg1 MSG1, MSG0 | ||
| 136 | xor128 MSG1, MSG3 | ||
| 137 | |||
| 138 | /* Rounds 40-43 */ | ||
| 139 | sha1nexte MSG2, E0 | ||
| 140 | mova128 ABCD, E1 | ||
| 141 | sha1msg2 MSG2, MSG3 | ||
| 142 | sha1rnds4 $2, E0, ABCD | ||
| 143 | sha1msg1 MSG2, MSG1 | ||
| 144 | xor128 MSG2, MSG0 | ||
| 145 | |||
| 146 | /* Rounds 44-47 */ | ||
| 147 | sha1nexte MSG3, E1 | ||
| 148 | mova128 ABCD, E0 | ||
| 149 | sha1msg2 MSG3, MSG0 | ||
| 150 | sha1rnds4 $2, E1, ABCD | ||
| 151 | sha1msg1 MSG3, MSG2 | ||
| 152 | xor128 MSG3, MSG1 | ||
| 153 | |||
| 154 | /* Rounds 48-51 */ | ||
| 155 | sha1nexte MSG0, E0 | ||
| 156 | mova128 ABCD, E1 | ||
| 157 | sha1msg2 MSG0, MSG1 | ||
| 158 | sha1rnds4 $2, E0, ABCD | ||
| 159 | sha1msg1 MSG0, MSG3 | ||
| 160 | xor128 MSG0, MSG2 | ||
| 161 | |||
| 162 | /* Rounds 52-55 */ | ||
| 163 | sha1nexte MSG1, E1 | ||
| 164 | mova128 ABCD, E0 | ||
| 165 | sha1msg2 MSG1, MSG2 | ||
| 166 | sha1rnds4 $2, E1, ABCD | ||
| 167 | sha1msg1 MSG1, MSG0 | ||
| 168 | xor128 MSG1, MSG3 | ||
| 169 | |||
| 170 | /* Rounds 56-59 */ | ||
| 171 | sha1nexte MSG2, E0 | ||
| 172 | mova128 ABCD, E1 | ||
| 173 | sha1msg2 MSG2, MSG3 | ||
| 174 | sha1rnds4 $2, E0, ABCD | ||
| 175 | sha1msg1 MSG2, MSG1 | ||
| 176 | xor128 MSG2, MSG0 | ||
| 177 | |||
| 178 | /* Rounds 60-63 */ | ||
| 179 | sha1nexte MSG3, E1 | ||
| 180 | mova128 ABCD, E0 | ||
| 181 | sha1msg2 MSG3, MSG0 | ||
| 182 | sha1rnds4 $3, E1, ABCD | ||
| 183 | sha1msg1 MSG3, MSG2 | ||
| 184 | xor128 MSG3, MSG1 | ||
| 185 | |||
| 186 | /* Rounds 64-67 */ | ||
| 187 | sha1nexte MSG0, E0 | ||
| 188 | mova128 ABCD, E1 | ||
| 189 | sha1msg2 MSG0, MSG1 | ||
| 190 | sha1rnds4 $3, E0, ABCD | ||
| 191 | sha1msg1 MSG0, MSG3 | ||
| 192 | xor128 MSG0, MSG2 | ||
| 193 | |||
| 194 | /* Rounds 68-71 */ | ||
| 195 | sha1nexte MSG1, E1 | ||
| 196 | mova128 ABCD, E0 | ||
| 197 | sha1msg2 MSG1, MSG2 | ||
| 198 | sha1rnds4 $3, E1, ABCD | ||
| 199 | xor128 MSG1, MSG3 | ||
| 200 | |||
| 201 | /* Rounds 72-75 */ | ||
| 202 | sha1nexte MSG2, E0 | ||
| 203 | mova128 ABCD, E1 | ||
| 204 | sha1msg2 MSG2, MSG3 | ||
| 205 | sha1rnds4 $3, E0, ABCD | ||
| 206 | |||
| 207 | /* Rounds 76-79 */ | ||
| 208 | sha1nexte MSG3, E1 | ||
| 209 | mova128 ABCD, E0 | ||
| 210 | sha1rnds4 $3, E1, ABCD | ||
| 211 | |||
| 212 | /* Add current hash values with previously saved */ | ||
| 213 | sha1nexte 16(%esp), E0 | ||
| 214 | paddd (%esp), ABCD | ||
| 215 | |||
| 216 | /* Write hash values back in the correct order */ | ||
| 217 | shuf128_32 $0x1B, ABCD, ABCD | ||
| 218 | movu128 ABCD, 76(%eax) | ||
| 219 | extr128_32 $3, E0, 76+4*4(%eax) | ||
| 220 | |||
| 221 | movl %ebp, %esp | ||
| 222 | popl %ebp | ||
| 223 | ret | ||
| 224 | .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI | ||
| 225 | |||
| 226 | .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 | ||
| 227 | .align 16 | ||
| 228 | PSHUFFLE_BYTE_FLIP_MASK: | ||
| 229 | .octa 0x000102030405060708090a0b0c0d0e0f | ||
| 230 | |||
| 231 | #endif | ||
