diff options
| author | Denys Vlasenko <vda.linux@googlemail.com> | 2022-02-08 08:20:27 +0100 |
|---|---|---|
| committer | Denys Vlasenko <vda.linux@googlemail.com> | 2022-02-08 08:22:17 +0100 |
| commit | 71a1cccaad679bd102f87283f78c581a8fb0e255 (patch) | |
| tree | bfd33cfa7b1f31aedebecfe10fb920515f6f5eae | |
| parent | 4923f74e5873b25b8205a4059964cff75ee731a8 (diff) | |
| download | busybox-w32-71a1cccaad679bd102f87283f78c581a8fb0e255.tar.gz busybox-w32-71a1cccaad679bd102f87283f78c581a8fb0e255.tar.bz2 busybox-w32-71a1cccaad679bd102f87283f78c581a8fb0e255.zip | |
libbb/sha1: shrink x86 hardware-accelerated hashing
```
function                                             old     new   delta
sha1_process_block64_shaNI 32-bit                    524     517      -7
sha1_process_block64_shaNI 64-bit                    510     508      -2
```
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
| -rw-r--r-- | libbb/hash_md5_sha_x86-32_shaNI.S | 37 | ||||
| -rw-r--r-- | libbb/hash_md5_sha_x86-64_shaNI.S | 24 |
2 files changed, 29 insertions, 32 deletions
diff --git a/libbb/hash_md5_sha_x86-32_shaNI.S b/libbb/hash_md5_sha_x86-32_shaNI.S index 5d082ebfb..0f3fe57ca 100644 --- a/libbb/hash_md5_sha_x86-32_shaNI.S +++ b/libbb/hash_md5_sha_x86-32_shaNI.S | |||
| @@ -32,14 +32,10 @@ | |||
| 32 | #define MSG1 %xmm4 | 32 | #define MSG1 %xmm4 |
| 33 | #define MSG2 %xmm5 | 33 | #define MSG2 %xmm5 |
| 34 | #define MSG3 %xmm6 | 34 | #define MSG3 %xmm6 |
| 35 | #define SHUF_MASK %xmm7 | ||
| 36 | 35 | ||
| 37 | .balign 8 # allow decoders to fetch at least 3 first insns | 36 | .balign 8 # allow decoders to fetch at least 2 first insns |
| 38 | sha1_process_block64_shaNI: | 37 | sha1_process_block64_shaNI: |
| 39 | pushl %ebp | 38 | subl $16, %esp |
| 40 | movl %esp, %ebp | ||
| 41 | subl $32, %esp | ||
| 42 | andl $~0xF, %esp # paddd needs aligned memory operand | ||
| 43 | 39 | ||
| 44 | /* load initial hash values */ | 40 | /* load initial hash values */ |
| 45 | xor128 E0, E0 | 41 | xor128 E0, E0 |
| @@ -47,30 +43,33 @@ sha1_process_block64_shaNI: | |||
| 47 | pinsrd $3, 76+4*4(%eax), E0 # load to uppermost 32-bit word | 43 | pinsrd $3, 76+4*4(%eax), E0 # load to uppermost 32-bit word |
| 48 | shuf128_32 $0x1B, ABCD, ABCD # DCBA -> ABCD | 44 | shuf128_32 $0x1B, ABCD, ABCD # DCBA -> ABCD |
| 49 | 45 | ||
| 50 | mova128 PSHUFFLE_BYTE_FLIP_MASK, SHUF_MASK | 46 | mova128 PSHUFFLE_BYTE_FLIP_MASK, %xmm7 |
| 47 | |||
| 48 | movu128 0*16(%eax), MSG0 | ||
| 49 | pshufb %xmm7, MSG0 | ||
| 50 | movu128 1*16(%eax), MSG1 | ||
| 51 | pshufb %xmm7, MSG1 | ||
| 52 | movu128 2*16(%eax), MSG2 | ||
| 53 | pshufb %xmm7, MSG2 | ||
| 54 | movu128 3*16(%eax), MSG3 | ||
| 55 | pshufb %xmm7, MSG3 | ||
| 51 | 56 | ||
| 52 | /* Save hash values for addition after rounds */ | 57 | /* Save hash values for addition after rounds */ |
| 53 | movu128 E0, 16(%esp) | 58 | movu128 E0, %xmm7 |
| 54 | movu128 ABCD, (%esp) | 59 | movu128 ABCD, (%esp) |
| 55 | 60 | ||
| 56 | /* Rounds 0-3 */ | 61 | /* Rounds 0-3 */ |
| 57 | movu128 0*16(%eax), MSG0 | ||
| 58 | pshufb SHUF_MASK, MSG0 | ||
| 59 | paddd MSG0, E0 | 62 | paddd MSG0, E0 |
| 60 | mova128 ABCD, E1 | 63 | mova128 ABCD, E1 |
| 61 | sha1rnds4 $0, E0, ABCD | 64 | sha1rnds4 $0, E0, ABCD |
| 62 | 65 | ||
| 63 | /* Rounds 4-7 */ | 66 | /* Rounds 4-7 */ |
| 64 | movu128 1*16(%eax), MSG1 | ||
| 65 | pshufb SHUF_MASK, MSG1 | ||
| 66 | sha1nexte MSG1, E1 | 67 | sha1nexte MSG1, E1 |
| 67 | mova128 ABCD, E0 | 68 | mova128 ABCD, E0 |
| 68 | sha1rnds4 $0, E1, ABCD | 69 | sha1rnds4 $0, E1, ABCD |
| 69 | sha1msg1 MSG1, MSG0 | 70 | sha1msg1 MSG1, MSG0 |
| 70 | 71 | ||
| 71 | /* Rounds 8-11 */ | 72 | /* Rounds 8-11 */ |
| 72 | movu128 2*16(%eax), MSG2 | ||
| 73 | pshufb SHUF_MASK, MSG2 | ||
| 74 | sha1nexte MSG2, E0 | 73 | sha1nexte MSG2, E0 |
| 75 | mova128 ABCD, E1 | 74 | mova128 ABCD, E1 |
| 76 | sha1rnds4 $0, E0, ABCD | 75 | sha1rnds4 $0, E0, ABCD |
| @@ -78,8 +77,6 @@ sha1_process_block64_shaNI: | |||
| 78 | xor128 MSG2, MSG0 | 77 | xor128 MSG2, MSG0 |
| 79 | 78 | ||
| 80 | /* Rounds 12-15 */ | 79 | /* Rounds 12-15 */ |
| 81 | movu128 3*16(%eax), MSG3 | ||
| 82 | pshufb SHUF_MASK, MSG3 | ||
| 83 | sha1nexte MSG3, E1 | 80 | sha1nexte MSG3, E1 |
| 84 | mova128 ABCD, E0 | 81 | mova128 ABCD, E0 |
| 85 | sha1msg2 MSG3, MSG0 | 82 | sha1msg2 MSG3, MSG0 |
| @@ -210,16 +207,16 @@ sha1_process_block64_shaNI: | |||
| 210 | sha1rnds4 $3, E1, ABCD | 207 | sha1rnds4 $3, E1, ABCD |
| 211 | 208 | ||
| 212 | /* Add current hash values with previously saved */ | 209 | /* Add current hash values with previously saved */ |
| 213 | sha1nexte 16(%esp), E0 | 210 | sha1nexte %xmm7, E0 |
| 214 | paddd (%esp), ABCD | 211 | movu128 (%esp), %xmm7 |
| 212 | paddd %xmm7, ABCD | ||
| 215 | 213 | ||
| 216 | /* Write hash values back in the correct order */ | 214 | /* Write hash values back in the correct order */ |
| 217 | shuf128_32 $0x1B, ABCD, ABCD | 215 | shuf128_32 $0x1B, ABCD, ABCD |
| 218 | movu128 ABCD, 76(%eax) | 216 | movu128 ABCD, 76(%eax) |
| 219 | extr128_32 $3, E0, 76+4*4(%eax) | 217 | extr128_32 $3, E0, 76+4*4(%eax) |
| 220 | 218 | ||
| 221 | movl %ebp, %esp | 219 | addl $16, %esp |
| 222 | popl %ebp | ||
| 223 | ret | 220 | ret |
| 224 | .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI | 221 | .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI |
| 225 | 222 | ||
diff --git a/libbb/hash_md5_sha_x86-64_shaNI.S b/libbb/hash_md5_sha_x86-64_shaNI.S index 8ddec87ce..fc2ca92e8 100644 --- a/libbb/hash_md5_sha_x86-64_shaNI.S +++ b/libbb/hash_md5_sha_x86-64_shaNI.S | |||
| @@ -32,7 +32,6 @@ | |||
| 32 | #define MSG1 %xmm4 | 32 | #define MSG1 %xmm4 |
| 33 | #define MSG2 %xmm5 | 33 | #define MSG2 %xmm5 |
| 34 | #define MSG3 %xmm6 | 34 | #define MSG3 %xmm6 |
| 35 | #define SHUF_MASK %xmm7 | ||
| 36 | 35 | ||
| 37 | .balign 8 # allow decoders to fetch at least 2 first insns | 36 | .balign 8 # allow decoders to fetch at least 2 first insns |
| 38 | sha1_process_block64_shaNI: | 37 | sha1_process_block64_shaNI: |
| @@ -43,30 +42,33 @@ sha1_process_block64_shaNI: | |||
| 43 | pinsrd $3, 80+4*4(%rdi), E0 # load to uppermost 32-bit word | 42 | pinsrd $3, 80+4*4(%rdi), E0 # load to uppermost 32-bit word |
| 44 | shuf128_32 $0x1B, ABCD, ABCD # DCBA -> ABCD | 43 | shuf128_32 $0x1B, ABCD, ABCD # DCBA -> ABCD |
| 45 | 44 | ||
| 46 | mova128 PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK | 45 | mova128 PSHUFFLE_BYTE_FLIP_MASK(%rip), %xmm7 |
| 46 | |||
| 47 | movu128 0*16(%rdi), MSG0 | ||
| 48 | pshufb %xmm7, MSG0 | ||
| 49 | movu128 1*16(%rdi), MSG1 | ||
| 50 | pshufb %xmm7, MSG1 | ||
| 51 | movu128 2*16(%rdi), MSG2 | ||
| 52 | pshufb %xmm7, MSG2 | ||
| 53 | movu128 3*16(%rdi), MSG3 | ||
| 54 | pshufb %xmm7, MSG3 | ||
| 47 | 55 | ||
| 48 | /* Save hash values for addition after rounds */ | 56 | /* Save hash values for addition after rounds */ |
| 49 | mova128 E0, %xmm9 | 57 | mova128 E0, %xmm7 |
| 50 | mova128 ABCD, %xmm8 | 58 | mova128 ABCD, %xmm8 |
| 51 | 59 | ||
| 52 | /* Rounds 0-3 */ | 60 | /* Rounds 0-3 */ |
| 53 | movu128 0*16(%rdi), MSG0 | ||
| 54 | pshufb SHUF_MASK, MSG0 | ||
| 55 | paddd MSG0, E0 | 61 | paddd MSG0, E0 |
| 56 | mova128 ABCD, E1 | 62 | mova128 ABCD, E1 |
| 57 | sha1rnds4 $0, E0, ABCD | 63 | sha1rnds4 $0, E0, ABCD |
| 58 | 64 | ||
| 59 | /* Rounds 4-7 */ | 65 | /* Rounds 4-7 */ |
| 60 | movu128 1*16(%rdi), MSG1 | ||
| 61 | pshufb SHUF_MASK, MSG1 | ||
| 62 | sha1nexte MSG1, E1 | 66 | sha1nexte MSG1, E1 |
| 63 | mova128 ABCD, E0 | 67 | mova128 ABCD, E0 |
| 64 | sha1rnds4 $0, E1, ABCD | 68 | sha1rnds4 $0, E1, ABCD |
| 65 | sha1msg1 MSG1, MSG0 | 69 | sha1msg1 MSG1, MSG0 |
| 66 | 70 | ||
| 67 | /* Rounds 8-11 */ | 71 | /* Rounds 8-11 */ |
| 68 | movu128 2*16(%rdi), MSG2 | ||
| 69 | pshufb SHUF_MASK, MSG2 | ||
| 70 | sha1nexte MSG2, E0 | 72 | sha1nexte MSG2, E0 |
| 71 | mova128 ABCD, E1 | 73 | mova128 ABCD, E1 |
| 72 | sha1rnds4 $0, E0, ABCD | 74 | sha1rnds4 $0, E0, ABCD |
| @@ -74,8 +76,6 @@ sha1_process_block64_shaNI: | |||
| 74 | xor128 MSG2, MSG0 | 76 | xor128 MSG2, MSG0 |
| 75 | 77 | ||
| 76 | /* Rounds 12-15 */ | 78 | /* Rounds 12-15 */ |
| 77 | movu128 3*16(%rdi), MSG3 | ||
| 78 | pshufb SHUF_MASK, MSG3 | ||
| 79 | sha1nexte MSG3, E1 | 79 | sha1nexte MSG3, E1 |
| 80 | mova128 ABCD, E0 | 80 | mova128 ABCD, E0 |
| 81 | sha1msg2 MSG3, MSG0 | 81 | sha1msg2 MSG3, MSG0 |
| @@ -206,7 +206,7 @@ sha1_process_block64_shaNI: | |||
| 206 | sha1rnds4 $3, E1, ABCD | 206 | sha1rnds4 $3, E1, ABCD |
| 207 | 207 | ||
| 208 | /* Add current hash values with previously saved */ | 208 | /* Add current hash values with previously saved */ |
| 209 | sha1nexte %xmm9, E0 | 209 | sha1nexte %xmm7, E0 |
| 210 | paddd %xmm8, ABCD | 210 | paddd %xmm8, ABCD |
| 211 | 211 | ||
| 212 | /* Write hash values back in the correct order */ | 212 | /* Write hash values back in the correct order */ |
