aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Denys Vlasenko <vda.linux@googlemail.com> 2022-02-08 03:29:16 +0100
committer Denys Vlasenko <vda.linux@googlemail.com> 2022-02-08 03:29:16 +0100
commit 4923f74e5873b25b8205a4059964cff75ee731a8 (patch)
tree 303d731fc684080fb6438657a235cd7b002d6702
parent c193cbd6dfd095c6b8346bab1ea6ba7106b3e5bb (diff)
download busybox-w32-4923f74e5873b25b8205a4059964cff75ee731a8.tar.gz
busybox-w32-4923f74e5873b25b8205a4059964cff75ee731a8.tar.bz2
busybox-w32-4923f74e5873b25b8205a4059964cff75ee731a8.zip
libbb/sha1: shrink unrolled x86-64 code
function                                             old     new   delta
sha1_process_block64                                3482    3481      -1
.rodata                                           108460  108412     -48
------------------------------------------------------------------------------
(add/remove: 1/4 grow/shrink: 0/2 up/down: 0/-49)             Total: -49 bytes

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
-rw-r--r-- libbb/hash_md5_sha_x86-64.S 33
-rwxr-xr-x libbb/hash_md5_sha_x86-64.S.sh 34
2 files changed, 21 insertions, 46 deletions
diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S
index e26c46f25..287cfe547 100644
--- a/libbb/hash_md5_sha_x86-64.S
+++ b/libbb/hash_md5_sha_x86-64.S
@@ -24,6 +24,7 @@ sha1_process_block64:
24# xmm0..xmm3: W[] 24# xmm0..xmm3: W[]
25# xmm4,xmm5: temps 25# xmm4,xmm5: temps
26# xmm6: current round constant 26# xmm6: current round constant
27# xmm7: all round constants
27# -64(%rsp): area for passing RCONST + W[] from vector to integer units 28# -64(%rsp): area for passing RCONST + W[] from vector to integer units
28 29
29 movl 80(%rdi), %eax # a = ctx->hash[0] 30 movl 80(%rdi), %eax # a = ctx->hash[0]
@@ -32,16 +33,17 @@ sha1_process_block64:
32 movl 92(%rdi), %edx # d = ctx->hash[3] 33 movl 92(%rdi), %edx # d = ctx->hash[3]
33 movl 96(%rdi), %ebp # e = ctx->hash[4] 34 movl 96(%rdi), %ebp # e = ctx->hash[4]
34 35
35 movaps rconst0x5A827999(%rip), %xmm6 36 movaps sha1const(%rip), %xmm7
37 pshufd $0x00, %xmm7, %xmm6
36 38
37 # Load W[] to xmm registers, byteswapping on the fly. 39 # Load W[] to xmm registers, byteswapping on the fly.
38 # 40 #
39 # For iterations 0..15, we pass W[] in rsi,r8..r14 41 # For iterations 0..15, we pass W[] in rsi,r8..r14
40 # for use in RD1A's instead of spilling them to stack. 42 # for use in RD1As instead of spilling them to stack.
41 # We lose parallelized addition of RCONST, but LEA 43 # We lose parallelized addition of RCONST, but LEA
42 # can do two additions at once, so it's probably a wash. 44 # can do two additions at once, so it is probably a wash.
43 # (We use rsi instead of rN because this makes two 45 # (We use rsi instead of rN because this makes two
44 # LEAs in two first RD1A's shorter by one byte). 46 # LEAs in two first RD1As shorter by one byte).
45 movq 4*0(%rdi), %rsi 47 movq 4*0(%rdi), %rsi
46 movq 4*2(%rdi), %r8 48 movq 4*2(%rdi), %r8
47 bswapq %rsi 49 bswapq %rsi
@@ -253,7 +255,7 @@ sha1_process_block64:
253 roll $5, %edi # rotl32(a,5) 255 roll $5, %edi # rotl32(a,5)
254 addl %edi, %edx # e += rotl32(a,5) 256 addl %edi, %edx # e += rotl32(a,5)
255 rorl $2, %eax # b = rotl32(b,30) 257 rorl $2, %eax # b = rotl32(b,30)
256 movaps rconst0x6ED9EBA1(%rip), %xmm6 258 pshufd $0x55, %xmm7, %xmm6
257# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) 259# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
258 movaps %xmm0, %xmm4 260 movaps %xmm0, %xmm4
259 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) 261 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
@@ -614,7 +616,7 @@ sha1_process_block64:
614 roll $5, %esi # rotl32(a,5) 616 roll $5, %esi # rotl32(a,5)
615 addl %esi, %edx # e += rotl32(a,5) 617 addl %esi, %edx # e += rotl32(a,5)
616 rorl $2, %eax # b = rotl32(b,30) 618 rorl $2, %eax # b = rotl32(b,30)
617 movaps rconst0x8F1BBCDC(%rip), %xmm6 619 pshufd $0xaa, %xmm7, %xmm6
618# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) 620# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
619 movaps %xmm1, %xmm4 621 movaps %xmm1, %xmm4
620 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) 622 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
@@ -1001,7 +1003,7 @@ sha1_process_block64:
1001 roll $5, %esi # rotl32(a,5) 1003 roll $5, %esi # rotl32(a,5)
1002 addl %esi, %edx # e += rotl32(a,5) 1004 addl %esi, %edx # e += rotl32(a,5)
1003 rorl $2, %eax # b = rotl32(b,30) 1005 rorl $2, %eax # b = rotl32(b,30)
1004 movaps rconst0xCA62C1D6(%rip), %xmm6 1006 pshufd $0xff, %xmm7, %xmm6
1005# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) 1007# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
1006 movaps %xmm2, %xmm4 1008 movaps %xmm2, %xmm4
1007 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) 1009 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
@@ -1475,25 +1477,10 @@ sha1_process_block64:
1475 1477
1476 .section .rodata.cst16.sha1const, "aM", @progbits, 16 1478 .section .rodata.cst16.sha1const, "aM", @progbits, 16
1477 .balign 16 1479 .balign 16
1478rconst0x5A827999: 1480sha1const:
1479 .long 0x5A827999 1481 .long 0x5A827999
1480 .long 0x5A827999
1481 .long 0x5A827999
1482 .long 0x5A827999
1483rconst0x6ED9EBA1:
1484 .long 0x6ED9EBA1
1485 .long 0x6ED9EBA1
1486 .long 0x6ED9EBA1
1487 .long 0x6ED9EBA1 1482 .long 0x6ED9EBA1
1488rconst0x8F1BBCDC:
1489 .long 0x8F1BBCDC 1483 .long 0x8F1BBCDC
1490 .long 0x8F1BBCDC
1491 .long 0x8F1BBCDC
1492 .long 0x8F1BBCDC
1493rconst0xCA62C1D6:
1494 .long 0xCA62C1D6
1495 .long 0xCA62C1D6
1496 .long 0xCA62C1D6
1497 .long 0xCA62C1D6 1484 .long 0xCA62C1D6
1498 1485
1499#endif 1486#endif
diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh
index fb1e4b57e..a10ac411d 100755
--- a/libbb/hash_md5_sha_x86-64.S.sh
+++ b/libbb/hash_md5_sha_x86-64.S.sh
@@ -34,6 +34,7 @@ exec >hash_md5_sha_x86-64.S
34xmmT1="%xmm4" 34xmmT1="%xmm4"
35xmmT2="%xmm5" 35xmmT2="%xmm5"
36xmmRCONST="%xmm6" 36xmmRCONST="%xmm6"
37xmmALLRCONST="%xmm7"
37T=`printf '\t'` 38T=`printf '\t'`
38 39
39# SSE instructions are longer than 4 bytes on average. 40# SSE instructions are longer than 4 bytes on average.
@@ -125,6 +126,7 @@ sha1_process_block64:
125# xmm0..xmm3: W[] 126# xmm0..xmm3: W[]
126# xmm4,xmm5: temps 127# xmm4,xmm5: temps
127# xmm6: current round constant 128# xmm6: current round constant
129# xmm7: all round constants
128# -64(%rsp): area for passing RCONST + W[] from vector to integer units 130# -64(%rsp): area for passing RCONST + W[] from vector to integer units
129 131
130 movl 80(%rdi), %eax # a = ctx->hash[0] 132 movl 80(%rdi), %eax # a = ctx->hash[0]
@@ -133,16 +135,17 @@ sha1_process_block64:
133 movl 92(%rdi), %edx # d = ctx->hash[3] 135 movl 92(%rdi), %edx # d = ctx->hash[3]
134 movl 96(%rdi), %ebp # e = ctx->hash[4] 136 movl 96(%rdi), %ebp # e = ctx->hash[4]
135 137
136 movaps rconst0x5A827999(%rip), $xmmRCONST 138 movaps sha1const(%rip), $xmmALLRCONST
139 pshufd \$0x00, $xmmALLRCONST, $xmmRCONST
137 140
138 # Load W[] to xmm registers, byteswapping on the fly. 141 # Load W[] to xmm registers, byteswapping on the fly.
139 # 142 #
140 # For iterations 0..15, we pass W[] in rsi,r8..r14 143 # For iterations 0..15, we pass W[] in rsi,r8..r14
141 # for use in RD1A's instead of spilling them to stack. 144 # for use in RD1As instead of spilling them to stack.
142 # We lose parallelized addition of RCONST, but LEA 145 # We lose parallelized addition of RCONST, but LEA
143 # can do two additions at once, so it's probably a wash. 146 # can do two additions at once, so it is probably a wash.
144 # (We use rsi instead of rN because this makes two 147 # (We use rsi instead of rN because this makes two
145 # LEAs in two first RD1A's shorter by one byte). 148 # LEAs in two first RD1As shorter by one byte).
146 movq 4*0(%rdi), %rsi 149 movq 4*0(%rdi), %rsi
147 movq 4*2(%rdi), %r8 150 movq 4*2(%rdi), %r8
148 bswapq %rsi 151 bswapq %rsi
@@ -359,7 +362,7 @@ RD1A bx cx dx bp ax 4; RD1A ax bx cx dx bp 5; RD1A bp ax bx cx dx 6; RD1A dx
359a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` 362a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"`
360b=`RD1A cx dx bp ax bx 8; RD1A bx cx dx bp ax 9; RD1A ax bx cx dx bp 10; RD1A bp ax bx cx dx 11;` 363b=`RD1A cx dx bp ax bx 8; RD1A bx cx dx bp ax 9; RD1A ax bx cx dx bp 10; RD1A bp ax bx cx dx 11;`
361INTERLEAVE "$a" "$b" 364INTERLEAVE "$a" "$b"
362a=`echo " movaps rconst0x6ED9EBA1(%rip), $xmmRCONST" 365a=`echo " pshufd \\$0x55, $xmmALLRCONST, $xmmRCONST"
363 PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` 366 PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"`
364b=`RD1A dx bp ax bx cx 12; RD1A cx dx bp ax bx 13; RD1A bx cx dx bp ax 14; RD1A ax bx cx dx bp 15;` 367b=`RD1A dx bp ax bx cx 12; RD1A cx dx bp ax bx 13; RD1A bx cx dx bp ax 14; RD1A ax bx cx dx bp 15;`
365INTERLEAVE "$a" "$b" 368INTERLEAVE "$a" "$b"
@@ -378,7 +381,7 @@ INTERLEAVE "$a" "$b"
378a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` 381a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"`
379b=`RD2 cx dx bp ax bx 28; RD2 bx cx dx bp ax 29; RD2 ax bx cx dx bp 30; RD2 bp ax bx cx dx 31;` 382b=`RD2 cx dx bp ax bx 28; RD2 bx cx dx bp ax 29; RD2 ax bx cx dx bp 30; RD2 bp ax bx cx dx 31;`
380INTERLEAVE "$a" "$b" 383INTERLEAVE "$a" "$b"
381a=`echo " movaps rconst0x8F1BBCDC(%rip), $xmmRCONST" 384a=`echo " pshufd \\$0xaa, $xmmALLRCONST, $xmmRCONST"
382 PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` 385 PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"`
383b=`RD2 dx bp ax bx cx 32; RD2 cx dx bp ax bx 33; RD2 bx cx dx bp ax 34; RD2 ax bx cx dx bp 35;` 386b=`RD2 dx bp ax bx cx 32; RD2 cx dx bp ax bx 33; RD2 bx cx dx bp ax 34; RD2 ax bx cx dx bp 35;`
384INTERLEAVE "$a" "$b" 387INTERLEAVE "$a" "$b"
@@ -397,7 +400,7 @@ INTERLEAVE "$a" "$b"
397a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` 400a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"`
398b=`RD3 cx dx bp ax bx 48; RD3 bx cx dx bp ax 49; RD3 ax bx cx dx bp 50; RD3 bp ax bx cx dx 51;` 401b=`RD3 cx dx bp ax bx 48; RD3 bx cx dx bp ax 49; RD3 ax bx cx dx bp 50; RD3 bp ax bx cx dx 51;`
399INTERLEAVE "$a" "$b" 402INTERLEAVE "$a" "$b"
400a=`echo " movaps rconst0xCA62C1D6(%rip), $xmmRCONST" 403a=`echo " pshufd \\$0xff, $xmmALLRCONST, $xmmRCONST"
401 PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` 404 PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"`
402b=`RD3 dx bp ax bx cx 52; RD3 cx dx bp ax bx 53; RD3 bx cx dx bp ax 54; RD3 ax bx cx dx bp 55;` 405b=`RD3 dx bp ax bx cx 52; RD3 cx dx bp ax bx 53; RD3 bx cx dx bp ax 54; RD3 ax bx cx dx bp 55;`
403INTERLEAVE "$a" "$b" 406INTERLEAVE "$a" "$b"
@@ -439,25 +442,10 @@ echo "
439 442
440 .section .rodata.cst16.sha1const, \"aM\", @progbits, 16 443 .section .rodata.cst16.sha1const, \"aM\", @progbits, 16
441 .balign 16 444 .balign 16
442rconst0x5A827999: 445sha1const:
443 .long 0x5A827999 446 .long 0x5A827999
444 .long 0x5A827999
445 .long 0x5A827999
446 .long 0x5A827999
447rconst0x6ED9EBA1:
448 .long 0x6ED9EBA1
449 .long 0x6ED9EBA1
450 .long 0x6ED9EBA1
451 .long 0x6ED9EBA1 447 .long 0x6ED9EBA1
452rconst0x8F1BBCDC:
453 .long 0x8F1BBCDC 448 .long 0x8F1BBCDC
454 .long 0x8F1BBCDC
455 .long 0x8F1BBCDC
456 .long 0x8F1BBCDC
457rconst0xCA62C1D6:
458 .long 0xCA62C1D6
459 .long 0xCA62C1D6
460 .long 0xCA62C1D6
461 .long 0xCA62C1D6 449 .long 0xCA62C1D6
462 450
463#endif" 451#endif"