author     Denys Vlasenko <vda.linux@googlemail.com>   2022-02-08 03:29:16 +0100
committer  Denys Vlasenko <vda.linux@googlemail.com>   2022-02-08 03:29:16 +0100
commit     4923f74e5873b25b8205a4059964cff75ee731a8 (patch)
tree       303d731fc684080fb6438657a235cd7b002d6702
parent     c193cbd6dfd095c6b8346bab1ea6ba7106b3e5bb (diff)
download   busybox-w32-4923f74e5873b25b8205a4059964cff75ee731a8.tar.gz
           busybox-w32-4923f74e5873b25b8205a4059964cff75ee731a8.tar.bz2
           busybox-w32-4923f74e5873b25b8205a4059964cff75ee731a8.zip
libbb/sha1: shrink unrolled x86-64 code
function                                             old     new   delta
sha1_process_block64                                3482    3481      -1
.rodata                                           108460  108412     -48
------------------------------------------------------------------------------
(add/remove: 1/4 grow/shrink: 0/2 up/down: 0/-49)             Total: -49 bytes
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
-rw-r--r--   libbb/hash_md5_sha_x86-64.S     | 33
-rwxr-xr-x   libbb/hash_md5_sha_x86-64.S.sh  | 34
2 files changed, 21 insertions, 46 deletions
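The idea of the change: instead of keeping four pre-broadcast 16-byte constant tables (rconst0x5A827999 .. rconst0xCA62C1D6) in .rodata, keep a single 16-byte row holding K1..K4 and broadcast the needed dword into %xmm6 with PSHUFD; the immediates 0x00/0x55/0xaa/0xff replicate lane 0/1/2/3 into all four lanes. Below is a minimal standalone C sketch of the same trick, using SSE2 intrinsics and names of our own choosing (not code from the commit).

/* Illustrative sketch only: sha1consts[] and dump() are hypothetical names;
 * the lane immediates mirror the commit's pshufd usage. Requires SSE2. */
#include <emmintrin.h>   /* _mm_loadu_si128, _mm_shuffle_epi32 (PSHUFD), _mm_storeu_si128 */
#include <stdint.h>
#include <stdio.h>

/* One 16-byte table holding K1..K4, replacing four 16-byte broadcast tables. */
static const uint32_t sha1consts[4] = {
	0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
};

static void dump(const char *tag, __m128i v)
{
	uint32_t out[4];
	_mm_storeu_si128((__m128i *)out, v);
	printf("%s: %08x %08x %08x %08x\n", tag,
	       (unsigned)out[0], (unsigned)out[1], (unsigned)out[2], (unsigned)out[3]);
}

int main(void)
{
	/* the commit's table is 16-byte aligned and loaded with movaps;
	 * an unaligned load keeps this sketch compiler-agnostic */
	__m128i all = _mm_loadu_si128((const __m128i *)sha1consts); /* movaps sha1const(%rip), %xmm7 */

	/* pshufd $0x00 / $0x55 / $0xaa / $0xff, %xmm7, %xmm6 */
	dump("K1", _mm_shuffle_epi32(all, 0x00));
	dump("K2", _mm_shuffle_epi32(all, 0x55));
	dump("K3", _mm_shuffle_epi32(all, 0xaa));
	dump("K4", _mm_shuffle_epi32(all, 0xff));
	return 0;
}

Collapsing four 16-byte tables (64 bytes) into one 16-byte row is what accounts for the 48-byte .rodata shrink reported above.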
diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S
index e26c46f25..287cfe547 100644
--- a/libbb/hash_md5_sha_x86-64.S
+++ b/libbb/hash_md5_sha_x86-64.S
@@ -24,6 +24,7 @@ sha1_process_block64:
 # xmm0..xmm3: W[]
 # xmm4,xmm5: temps
 # xmm6: current round constant
+# xmm7: all round constants
 # -64(%rsp): area for passing RCONST + W[] from vector to integer units

 movl 80(%rdi), %eax # a = ctx->hash[0]
@@ -32,16 +33,17 @@ sha1_process_block64:
 movl 92(%rdi), %edx # d = ctx->hash[3]
 movl 96(%rdi), %ebp # e = ctx->hash[4]

-movaps rconst0x5A827999(%rip), %xmm6
+movaps sha1const(%rip), %xmm7
+pshufd $0x00, %xmm7, %xmm6

 # Load W[] to xmm registers, byteswapping on the fly.
 #
 # For iterations 0..15, we pass W[] in rsi,r8..r14
-# for use in RD1A's instead of spilling them to stack.
+# for use in RD1As instead of spilling them to stack.
 # We lose parallelized addition of RCONST, but LEA
-# can do two additions at once, so it's probably a wash.
+# can do two additions at once, so it is probably a wash.
 # (We use rsi instead of rN because this makes two
-# LEAs in two first RD1A's shorter by one byte).
+# LEAs in two first RD1As shorter by one byte).
 movq 4*0(%rdi), %rsi
 movq 4*2(%rdi), %r8
 bswapq %rsi
@@ -253,7 +255,7 @@ sha1_process_block64:
 roll $5, %edi # rotl32(a,5)
 addl %edi, %edx # e += rotl32(a,5)
 rorl $2, %eax # b = rotl32(b,30)
-movaps rconst0x6ED9EBA1(%rip), %xmm6
+pshufd $0x55, %xmm7, %xmm6
 # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
 movaps %xmm0, %xmm4
 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
@@ -614,7 +616,7 @@ sha1_process_block64:
 roll $5, %esi # rotl32(a,5)
 addl %esi, %edx # e += rotl32(a,5)
 rorl $2, %eax # b = rotl32(b,30)
-movaps rconst0x8F1BBCDC(%rip), %xmm6
+pshufd $0xaa, %xmm7, %xmm6
 # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
 movaps %xmm1, %xmm4
 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
@@ -1001,7 +1003,7 @@ sha1_process_block64:
 roll $5, %esi # rotl32(a,5)
 addl %esi, %edx # e += rotl32(a,5)
 rorl $2, %eax # b = rotl32(b,30)
-movaps rconst0xCA62C1D6(%rip), %xmm6
+pshufd $0xff, %xmm7, %xmm6
 # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
 movaps %xmm2, %xmm4
 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
@@ -1475,25 +1477,10 @@ sha1_process_block64:

 .section .rodata.cst16.sha1const, "aM", @progbits, 16
 .balign 16
-rconst0x5A827999:
+sha1const:
 .long 0x5A827999
-.long 0x5A827999
-.long 0x5A827999
-.long 0x5A827999
-rconst0x6ED9EBA1:
-.long 0x6ED9EBA1
-.long 0x6ED9EBA1
-.long 0x6ED9EBA1
 .long 0x6ED9EBA1
-rconst0x8F1BBCDC:
 .long 0x8F1BBCDC
-.long 0x8F1BBCDC
-.long 0x8F1BBCDC
-.long 0x8F1BBCDC
-rconst0xCA62C1D6:
-.long 0xCA62C1D6
-.long 0xCA62C1D6
-.long 0xCA62C1D6
 .long 0xCA62C1D6

 #endif
diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh
index fb1e4b57e..a10ac411d 100755
--- a/libbb/hash_md5_sha_x86-64.S.sh
+++ b/libbb/hash_md5_sha_x86-64.S.sh
@@ -34,6 +34,7 @@ exec >hash_md5_sha_x86-64.S
 xmmT1="%xmm4"
 xmmT2="%xmm5"
 xmmRCONST="%xmm6"
+xmmALLRCONST="%xmm7"
 T=`printf '\t'`

 # SSE instructions are longer than 4 bytes on average.
@@ -125,6 +126,7 @@ sha1_process_block64:
 # xmm0..xmm3: W[]
 # xmm4,xmm5: temps
 # xmm6: current round constant
+# xmm7: all round constants
 # -64(%rsp): area for passing RCONST + W[] from vector to integer units

 movl 80(%rdi), %eax # a = ctx->hash[0]
@@ -133,16 +135,17 @@ sha1_process_block64:
 movl 92(%rdi), %edx # d = ctx->hash[3]
 movl 96(%rdi), %ebp # e = ctx->hash[4]

-movaps rconst0x5A827999(%rip), $xmmRCONST
+movaps sha1const(%rip), $xmmALLRCONST
+pshufd \$0x00, $xmmALLRCONST, $xmmRCONST

 # Load W[] to xmm registers, byteswapping on the fly.
 #
 # For iterations 0..15, we pass W[] in rsi,r8..r14
-# for use in RD1A's instead of spilling them to stack.
+# for use in RD1As instead of spilling them to stack.
 # We lose parallelized addition of RCONST, but LEA
-# can do two additions at once, so it's probably a wash.
+# can do two additions at once, so it is probably a wash.
 # (We use rsi instead of rN because this makes two
-# LEAs in two first RD1A's shorter by one byte).
+# LEAs in two first RD1As shorter by one byte).
 movq 4*0(%rdi), %rsi
 movq 4*2(%rdi), %r8
 bswapq %rsi
@@ -359,7 +362,7 @@ RD1A bx cx dx bp ax 4; RD1A ax bx cx dx bp 5; RD1A bp ax bx cx dx 6; RD1A dx
 a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"`
 b=`RD1A cx dx bp ax bx 8; RD1A bx cx dx bp ax 9; RD1A ax bx cx dx bp 10; RD1A bp ax bx cx dx 11;`
 INTERLEAVE "$a" "$b"
-a=`echo " movaps rconst0x6ED9EBA1(%rip), $xmmRCONST"
+a=`echo " pshufd \\$0x55, $xmmALLRCONST, $xmmRCONST"
 PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"`
 b=`RD1A dx bp ax bx cx 12; RD1A cx dx bp ax bx 13; RD1A bx cx dx bp ax 14; RD1A ax bx cx dx bp 15;`
 INTERLEAVE "$a" "$b"
@@ -378,7 +381,7 @@ INTERLEAVE "$a" "$b"
 a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"`
 b=`RD2 cx dx bp ax bx 28; RD2 bx cx dx bp ax 29; RD2 ax bx cx dx bp 30; RD2 bp ax bx cx dx 31;`
 INTERLEAVE "$a" "$b"
-a=`echo " movaps rconst0x8F1BBCDC(%rip), $xmmRCONST"
+a=`echo " pshufd \\$0xaa, $xmmALLRCONST, $xmmRCONST"
 PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"`
 b=`RD2 dx bp ax bx cx 32; RD2 cx dx bp ax bx 33; RD2 bx cx dx bp ax 34; RD2 ax bx cx dx bp 35;`
 INTERLEAVE "$a" "$b"
@@ -397,7 +400,7 @@ INTERLEAVE "$a" "$b"
 a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"`
 b=`RD3 cx dx bp ax bx 48; RD3 bx cx dx bp ax 49; RD3 ax bx cx dx bp 50; RD3 bp ax bx cx dx 51;`
 INTERLEAVE "$a" "$b"
-a=`echo " movaps rconst0xCA62C1D6(%rip), $xmmRCONST"
+a=`echo " pshufd \\$0xff, $xmmALLRCONST, $xmmRCONST"
 PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"`
 b=`RD3 dx bp ax bx cx 52; RD3 cx dx bp ax bx 53; RD3 bx cx dx bp ax 54; RD3 ax bx cx dx bp 55;`
 INTERLEAVE "$a" "$b"
@@ -439,25 +442,10 @@ echo "

 .section .rodata.cst16.sha1const, \"aM\", @progbits, 16
 .balign 16
-rconst0x5A827999:
+sha1const:
 .long 0x5A827999
-.long 0x5A827999
-.long 0x5A827999
-.long 0x5A827999
-rconst0x6ED9EBA1:
-.long 0x6ED9EBA1
-.long 0x6ED9EBA1
-.long 0x6ED9EBA1
 .long 0x6ED9EBA1
-rconst0x8F1BBCDC:
 .long 0x8F1BBCDC
-.long 0x8F1BBCDC
-.long 0x8F1BBCDC
-.long 0x8F1BBCDC
-rconst0xCA62C1D6:
-.long 0xCA62C1D6
-.long 0xCA62C1D6
-.long 0xCA62C1D6
 .long 0xCA62C1D6

 #endif"
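A side note on the comment kept unchanged in both files ("LEA can do two additions at once"): an x86-64 LEA computes base + index + 32-bit displacement in a single instruction, so the scalar RD1A rounds can fold RCONST and W[i] into the update of e without a separate add. A rough C sketch of one such round-1 step follows, with hypothetical names not taken from the generator script.

/* Rough sketch of one first-round SHA-1 step in the style of the unrolled
 * listing above; variable and function names here are illustrative. */
#include <stdint.h>

static inline uint32_t rotl32(uint32_t x, unsigned n)
{
	return (x << n) | (x >> (32 - n));
}

/* K1 = 0x5A827999; F1(b,c,d) = (b & c) | (~b & d) = d ^ (b & (c ^ d)). */
static inline void rd1a_step(uint32_t a, uint32_t *b, uint32_t c, uint32_t d,
                             uint32_t *e, uint32_t w)
{
	*e += 0x5A827999 + w;         /* RCONST + W[i]: foldable into one LEA */
	*e += d ^ (*b & (c ^ d));     /* F1(b,c,d) */
	*e += rotl32(a, 5);           /* roll $5 / addl in the listing */
	*b = rotl32(*b, 30);          /* rorl $2 is the same rotation */
}

The first addition can be emitted as a single LEA with the constant as displacement and the two registers as base and index, which is the two-additions-at-once form the comment refers to.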