aboutsummaryrefslogtreecommitdiff
path: root/libbb/hash_md5_sha_x86-64.S
diff options
context:
space:
mode:
Diffstat (limited to 'libbb/hash_md5_sha_x86-64.S')
-rw-r--r-- libbb/hash_md5_sha_x86-64.S 33
1 files changed, 10 insertions, 23 deletions
diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S
index e26c46f25..287cfe547 100644
--- a/libbb/hash_md5_sha_x86-64.S
+++ b/libbb/hash_md5_sha_x86-64.S
@@ -24,6 +24,7 @@ sha1_process_block64:
24# xmm0..xmm3: W[] 24# xmm0..xmm3: W[]
25# xmm4,xmm5: temps 25# xmm4,xmm5: temps
26# xmm6: current round constant 26# xmm6: current round constant
27# xmm7: all round constants
27# -64(%rsp): area for passing RCONST + W[] from vector to integer units 28# -64(%rsp): area for passing RCONST + W[] from vector to integer units
28 29
29 movl 80(%rdi), %eax # a = ctx->hash[0] 30 movl 80(%rdi), %eax # a = ctx->hash[0]
@@ -32,16 +33,17 @@ sha1_process_block64:
32 movl 92(%rdi), %edx # d = ctx->hash[3] 33 movl 92(%rdi), %edx # d = ctx->hash[3]
33 movl 96(%rdi), %ebp # e = ctx->hash[4] 34 movl 96(%rdi), %ebp # e = ctx->hash[4]
34 35
35 movaps rconst0x5A827999(%rip), %xmm6 36 movaps sha1const(%rip), %xmm7
37 pshufd $0x00, %xmm7, %xmm6
36 38
37 # Load W[] to xmm registers, byteswapping on the fly. 39 # Load W[] to xmm registers, byteswapping on the fly.
38 # 40 #
39 # For iterations 0..15, we pass W[] in rsi,r8..r14 41 # For iterations 0..15, we pass W[] in rsi,r8..r14
40 # for use in RD1A's instead of spilling them to stack. 42 # for use in RD1As instead of spilling them to stack.
41 # We lose parallelized addition of RCONST, but LEA 43 # We lose parallelized addition of RCONST, but LEA
42 # can do two additions at once, so it's probably a wash. 44 # can do two additions at once, so it is probably a wash.
43 # (We use rsi instead of rN because this makes two 45 # (We use rsi instead of rN because this makes two
44 # LEAs in two first RD1A's shorter by one byte). 46 # LEAs in two first RD1As shorter by one byte).
45 movq 4*0(%rdi), %rsi 47 movq 4*0(%rdi), %rsi
46 movq 4*2(%rdi), %r8 48 movq 4*2(%rdi), %r8
47 bswapq %rsi 49 bswapq %rsi
@@ -253,7 +255,7 @@ sha1_process_block64:
253 roll $5, %edi # rotl32(a,5) 255 roll $5, %edi # rotl32(a,5)
254 addl %edi, %edx # e += rotl32(a,5) 256 addl %edi, %edx # e += rotl32(a,5)
255 rorl $2, %eax # b = rotl32(b,30) 257 rorl $2, %eax # b = rotl32(b,30)
256 movaps rconst0x6ED9EBA1(%rip), %xmm6 258 pshufd $0x55, %xmm7, %xmm6
257# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) 259# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
258 movaps %xmm0, %xmm4 260 movaps %xmm0, %xmm4
259 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) 261 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
@@ -614,7 +616,7 @@ sha1_process_block64:
614 roll $5, %esi # rotl32(a,5) 616 roll $5, %esi # rotl32(a,5)
615 addl %esi, %edx # e += rotl32(a,5) 617 addl %esi, %edx # e += rotl32(a,5)
616 rorl $2, %eax # b = rotl32(b,30) 618 rorl $2, %eax # b = rotl32(b,30)
617 movaps rconst0x8F1BBCDC(%rip), %xmm6 619 pshufd $0xaa, %xmm7, %xmm6
618# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) 620# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
619 movaps %xmm1, %xmm4 621 movaps %xmm1, %xmm4
620 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) 622 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
@@ -1001,7 +1003,7 @@ sha1_process_block64:
1001 roll $5, %esi # rotl32(a,5) 1003 roll $5, %esi # rotl32(a,5)
1002 addl %esi, %edx # e += rotl32(a,5) 1004 addl %esi, %edx # e += rotl32(a,5)
1003 rorl $2, %eax # b = rotl32(b,30) 1005 rorl $2, %eax # b = rotl32(b,30)
1004 movaps rconst0xCA62C1D6(%rip), %xmm6 1006 pshufd $0xff, %xmm7, %xmm6
1005# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) 1007# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
1006 movaps %xmm2, %xmm4 1008 movaps %xmm2, %xmm4
1007 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) 1009 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
@@ -1475,25 +1477,10 @@ sha1_process_block64:
1475 1477
1476 .section .rodata.cst16.sha1const, "aM", @progbits, 16 1478 .section .rodata.cst16.sha1const, "aM", @progbits, 16
1477 .balign 16 1479 .balign 16
1478rconst0x5A827999: 1480sha1const:
1479 .long 0x5A827999 1481 .long 0x5A827999
1480 .long 0x5A827999
1481 .long 0x5A827999
1482 .long 0x5A827999
1483rconst0x6ED9EBA1:
1484 .long 0x6ED9EBA1
1485 .long 0x6ED9EBA1
1486 .long 0x6ED9EBA1
1487 .long 0x6ED9EBA1 1482 .long 0x6ED9EBA1
1488rconst0x8F1BBCDC:
1489 .long 0x8F1BBCDC 1483 .long 0x8F1BBCDC
1490 .long 0x8F1BBCDC
1491 .long 0x8F1BBCDC
1492 .long 0x8F1BBCDC
1493rconst0xCA62C1D6:
1494 .long 0xCA62C1D6
1495 .long 0xCA62C1D6
1496 .long 0xCA62C1D6
1497 .long 0xCA62C1D6 1484 .long 0xCA62C1D6
1498 1485
1499#endif 1486#endif