| author | Denys Vlasenko <vda.linux@googlemail.com> | 2022-01-23 09:27:30 +0100 |
|---|---|---|
| committer | Denys Vlasenko <vda.linux@googlemail.com> | 2022-01-23 12:57:27 +0100 |
| commit | 39369ff460f3e2dbfec7f6be181b2fb98f3c1867 (patch) | |
| tree | 1a67a6376490c729fb58944002cdcabb262b2f50 /libbb | |
| parent | 1e825acf8d715fe49af040cb02f9e96c26955832 (diff) | |
libbb/sha1: use SSE2 in unrolled x86-64 code. ~10% faster
| function | old | new | delta |
|---|---|---|---|
| .rodata | 108241 | 108305 | +64 |
| sha1_process_block64 | 3502 | 3495 | -7 |

(add/remove: 5/0 grow/shrink: 1/1 up/down: 64/-7) Total: 57 bytes
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
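
For context: in rounds 16..79, SHA-1 extends the message schedule as W[t] = rotl32(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1). The rewritten code computes four of these words per SSE2 "PREP" block and stores them with the round constant already added, so each scalar round needs a single addl instead of a separate load and leal. A minimal C sketch of the per-block work (illustrative only, not code from this commit; the helper name and the rolling 16-word W[] layout are assumptions):

```c
#include <stdint.h>

static inline uint32_t rotl32(uint32_t x, unsigned n)
{
    return (x << n) | (x >> (32 - n));
}

/* Hypothetical helper: produce the four schedule words for rounds
 * t..t+3 (t >= 16, a multiple of 4) in a rolling 16-word array and
 * store them with RCONST pre-added, mirroring what each PREP block
 * leaves in the -64(%rsp) scratch area for the integer rounds. */
static void sha1_schedule4(uint32_t W[16], unsigned t, uint32_t rconst,
                           uint32_t out[4])
{
    for (unsigned i = 0; i < 4; i++) {
        unsigned n = (t + i) & 15;
        uint32_t x = W[(n + 13) & 15]   /* W[t+i-3]  */
                   ^ W[(n + 8) & 15]    /* W[t+i-8]  */
                   ^ W[(n + 2) & 15]    /* W[t+i-14] */
                   ^ W[n];              /* W[t+i-16] */
        W[n] = rotl32(x, 1);            /* new W[t+i], kept for later rounds */
        out[i] = W[n] + rconst;         /* RCONST folded in up front */
    }
}
```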
Diffstat (limited to 'libbb')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | libbb/hash_md5_sha_x86-64.S | 992 |
| -rwxr-xr-x | libbb/hash_md5_sha_x86-64.S.sh | 440 |
2 files changed, 854 insertions, 578 deletions
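
Every PREP block in the diff below follows the same pattern: gather the older schedule words with psrldq/pshufd/punpcklqdq, xor them together, rotate all four lanes left by one with a pcmpgtd/paddd/psubd trick (SSE2 has no vector rotate), fix up the last lane whose W[t] input was not yet available, add the current round constant held in %xmm6, and spill the sums to the -64(%rsp) area. A hedged C/SSE2 intrinsics sketch of one such block (the function name and argument order are assumptions, not from the commit):

```c
#include <emmintrin.h>  /* SSE2 intrinsics */
#include <stdint.h>

/* w0..w3 hold the previous 16 schedule words, four per register:
 * w0 = (W[0],W[1],W[2],W[3]), ..., w3 = (W[12],W[13],W[14],W[15]).
 * Returns the next four words (they replace w0 in the register
 * rotation); the copy with the round constant added is spilled. */
static __m128i sha1_prep_sse2(__m128i w0, __m128i w1, __m128i w2, __m128i w3,
                              __m128i rconst, uint32_t spill[4])
{
    __m128i t1 = _mm_srli_si128(w3, 4);           /* (W[13],W[14],W[15],0)      */
    __m128i t2 = _mm_shuffle_epi32(w0, 0x4e);     /* qword swap of W[0]..W[3]   */
    t2 = _mm_unpacklo_epi64(t2, w1);              /* (W[2],W[3],W[4],W[5])      */

    __m128i x = _mm_xor_si128(w0, w2);            /* ^ (W[8],W[9],W[10],W[11])  */
    x = _mm_xor_si128(x, _mm_xor_si128(t1, t2));  /* unrotated W[16..19]; lane 3
                                                     still lacks the W[16] term */

    /* rol(x,1) per lane: double, then add 1 where the sign bit was set */
    __m128i sign = _mm_cmpgt_epi32(_mm_setzero_si128(), x);
    __m128i w = _mm_sub_epi32(_mm_add_epi32(x, x), sign);

    /* lane-3 fixup: xor in rol(unrotated W[16], 2) == rol(W[16], 1) */
    __m128i fix = _mm_slli_si128(x, 12);          /* (0,0,0,unrotated W[16])    */
    w = _mm_xor_si128(w, _mm_slli_epi32(fix, 2));
    w = _mm_xor_si128(w, _mm_srli_epi32(fix, 30));

    /* pre-add RCONST and spill; each integer round then needs one addl */
    _mm_storeu_si128((__m128i *)spill, _mm_add_epi32(w, rconst));
    return w;
}
```

Rotating left by one as x+x and then re-inserting the bit that fell off the top keeps the rotate to three cheap SSE2 integer operations, which is exactly what the pcmpgtd/paddd/psubd sequence in the generated code does.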
diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S
index 87fb616a1..069a18719 100644
--- a/libbb/hash_md5_sha_x86-64.S
+++ b/libbb/hash_md5_sha_x86-64.S
@@ -20,16 +20,10 @@ sha1_process_block64:
| 20 | # eax..edx: a..d | 20 | # eax..edx: a..d |
| 21 | # ebp: e | 21 | # ebp: e |
| 22 | # esi,edi: temps | 22 | # esi,edi: temps |
| 23 | # -32+4*n(%rsp),r8...r15: W[0..7,8..15] | 23 | # xmm0..xmm3: W[] |
| 24 | # (TODO: actually W[0..7] are used a bit more often, put _them_ into r8..r15?) | 24 | # xmm4,xmm5: temps |
| 25 | movl $3, %eax | 25 | # xmm6: current round constant |
| 26 | 1: | 26 | # -64(%rsp): area for passing RCONST + W[] from vector to integer units |
| 27 | movq (%rdi,%rax,8), %rsi | ||
| 28 | bswapq %rsi | ||
| 29 | rolq $32, %rsi | ||
| 30 | movq %rsi, -32(%rsp,%rax,8) | ||
| 31 | decl %eax | ||
| 32 | jns 1b | ||
| 33 | 27 | ||
| 34 | movl 80(%rdi), %eax # a = ctx->hash[0] | 28 | movl 80(%rdi), %eax # a = ctx->hash[0] |
| 35 | movl 84(%rdi), %ebx # b = ctx->hash[1] | 29 | movl 84(%rdi), %ebx # b = ctx->hash[1] |
@@ -37,587 +31,709 @@ sha1_process_block64:
| 37 | movl 92(%rdi), %edx # d = ctx->hash[3] | 31 | movl 92(%rdi), %edx # d = ctx->hash[3] |
| 38 | movl 96(%rdi), %ebp # e = ctx->hash[4] | 32 | movl 96(%rdi), %ebp # e = ctx->hash[4] |
| 39 | 33 | ||
| 34 | movaps rconst0x5A827999(%rip), %xmm6 | ||
| 35 | |||
| 36 | # For round 1, steps 0 and 8..15, we pass W[0,8..15] in esi,r8..r15 | ||
| 37 | # instead of spilling them to stack. | ||
| 38 | # (We lose parallelized addition of RCONST, but LEA | ||
| 39 | # can do two additions at once, so...) | ||
| 40 | movq 4*0(%rdi), %rsi | ||
| 41 | movq 4*2(%rdi), %r10 | ||
| 42 | bswapq %rsi | ||
| 43 | bswapq %r10 | ||
| 44 | rolq $32, %rsi # rsi = W[1]:W[0] | ||
| 45 | rolq $32, %r10 | ||
| 46 | movq %rsi, %xmm0 | ||
| 47 | movq %r10, %xmm4 | ||
| 48 | punpcklqdq %xmm4, %xmm0 # xmm0 = r10:rsi = (W[0],W[1],W[2],W[3]) | ||
| 49 | movaps %xmm0, %xmm4 | ||
| 50 | paddd %xmm6, %xmm4 | ||
| 51 | movups %xmm4, -64+4*0(%rsp) | ||
| 52 | |||
| 53 | movq 4*4(%rdi), %r8 | ||
| 54 | movq 4*6(%rdi), %r10 | ||
| 55 | bswapq %r8 | ||
| 56 | bswapq %r10 | ||
| 57 | rolq $32, %r8 | ||
| 58 | rolq $32, %r10 | ||
| 59 | movq %r8, %xmm1 | ||
| 60 | movq %r10, %xmm4 | ||
| 61 | punpcklqdq %xmm4, %xmm1 # xmm1 = r10:r8 = (W[4],W[5],W[6],W[7]) | ||
| 62 | movaps %xmm1, %xmm4 | ||
| 63 | paddd %xmm6, %xmm4 | ||
| 64 | movups %xmm4, -64+4*4(%rsp) | ||
| 65 | |||
| 40 | movq 4*8(%rdi), %r8 | 66 | movq 4*8(%rdi), %r8 |
| 41 | movq 4*10(%rdi), %r10 | 67 | movq 4*10(%rdi), %r10 |
| 42 | bswapq %r8 | 68 | bswapq %r8 |
| 43 | bswapq %r10 | 69 | bswapq %r10 |
| 70 | movl %r8d, %r9d # r9d = W[9] | ||
| 71 | rolq $32, %r8 # r8 = W[9]:W[8] | ||
| 72 | movl %r10d, %r11d # r11d = W[11] | ||
| 73 | rolq $32, %r10 # r10 = W[11]:W[10] | ||
| 74 | movq %r8, %xmm2 | ||
| 75 | movq %r10, %xmm4 | ||
| 76 | punpcklqdq %xmm4, %xmm2 # xmm2 = r10:r8 = (W[8],W[9],W[10],W[11]) | ||
| 77 | |||
| 44 | movq 4*12(%rdi), %r12 | 78 | movq 4*12(%rdi), %r12 |
| 45 | movq 4*14(%rdi), %r14 | 79 | movq 4*14(%rdi), %r14 |
| 46 | bswapq %r12 | 80 | bswapq %r12 |
| 47 | bswapq %r14 | 81 | bswapq %r14 |
| 48 | movl %r8d, %r9d | 82 | movl %r12d, %r13d # r13d = W[13] |
| 49 | shrq $32, %r8 | 83 | rolq $32, %r12 # r12 = W[13]:W[12] |
| 50 | movl %r10d, %r11d | 84 | movl %r14d, %r15d # r15d = W[15] |
| 51 | shrq $32, %r10 | 85 | rolq $32, %r14 # r14 = W[15]:W[14] |
| 52 | movl %r12d, %r13d | 86 | movq %r12, %xmm3 |
| 53 | shrq $32, %r12 | 87 | movq %r14, %xmm4 |
| 54 | movl %r14d, %r15d | 88 | punpcklqdq %xmm4, %xmm3 # xmm3 = r14:r12 = (W[12],W[13],W[14],W[15]) |
| 55 | shrq $32, %r14 | ||
| 56 | 89 | ||
| 57 | # 0 | 90 | # 0 |
| 58 | # W[0], already in %esi | 91 | leal 0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n] |
| 59 | movl %ecx, %edi # c | 92 | movl %ecx, %edi # c |
| 60 | xorl %edx, %edi # ^d | 93 | xorl %edx, %edi # ^d |
| 61 | andl %ebx, %edi # &b | 94 | andl %ebx, %edi # &b |
| 62 | xorl %edx, %edi # (((c ^ d) & b) ^ d) | 95 | xorl %edx, %edi # (((c ^ d) & b) ^ d) |
| 63 | leal 0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n] | ||
| 64 | addl %edi, %ebp # e += (((c ^ d) & b) ^ d) | 96 | addl %edi, %ebp # e += (((c ^ d) & b) ^ d) |
| 65 | movl %eax, %esi # | 97 | movl %eax, %esi # |
| 66 | roll $5, %esi # rotl32(a,5) | 98 | roll $5, %esi # rotl32(a,5) |
| 67 | addl %esi, %ebp # e += rotl32(a,5) | 99 | addl %esi, %ebp # e += rotl32(a,5) |
| 68 | rorl $2, %ebx # b = rotl32(b,30) | 100 | rorl $2, %ebx # b = rotl32(b,30) |
| 69 | # 1 | 101 | # 1 |
| 70 | movl -32+4*1(%rsp), %esi # W[n] | 102 | addl -64+4*1(%rsp), %edx # e += RCONST + W[n] |
| 71 | movl %ebx, %edi # c | 103 | movl %ebx, %edi # c |
| 72 | xorl %ecx, %edi # ^d | 104 | xorl %ecx, %edi # ^d |
| 73 | andl %eax, %edi # &b | 105 | andl %eax, %edi # &b |
| 74 | xorl %ecx, %edi # (((c ^ d) & b) ^ d) | 106 | xorl %ecx, %edi # (((c ^ d) & b) ^ d) |
| 75 | leal 0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n] | ||
| 76 | addl %edi, %edx # e += (((c ^ d) & b) ^ d) | 107 | addl %edi, %edx # e += (((c ^ d) & b) ^ d) |
| 77 | movl %ebp, %esi # | 108 | movl %ebp, %esi # |
| 78 | roll $5, %esi # rotl32(a,5) | 109 | roll $5, %esi # rotl32(a,5) |
| 79 | addl %esi, %edx # e += rotl32(a,5) | 110 | addl %esi, %edx # e += rotl32(a,5) |
| 80 | rorl $2, %eax # b = rotl32(b,30) | 111 | rorl $2, %eax # b = rotl32(b,30) |
| 81 | # 2 | 112 | # 2 |
| 82 | movl -32+4*2(%rsp), %esi # W[n] | 113 | addl -64+4*2(%rsp), %ecx # e += RCONST + W[n] |
| 83 | movl %eax, %edi # c | 114 | movl %eax, %edi # c |
| 84 | xorl %ebx, %edi # ^d | 115 | xorl %ebx, %edi # ^d |
| 85 | andl %ebp, %edi # &b | 116 | andl %ebp, %edi # &b |
| 86 | xorl %ebx, %edi # (((c ^ d) & b) ^ d) | 117 | xorl %ebx, %edi # (((c ^ d) & b) ^ d) |
| 87 | leal 0x5A827999(%rcx,%rsi), %ecx # e += RCONST + W[n] | ||
| 88 | addl %edi, %ecx # e += (((c ^ d) & b) ^ d) | 118 | addl %edi, %ecx # e += (((c ^ d) & b) ^ d) |
| 89 | movl %edx, %esi # | 119 | movl %edx, %esi # |
| 90 | roll $5, %esi # rotl32(a,5) | 120 | roll $5, %esi # rotl32(a,5) |
| 91 | addl %esi, %ecx # e += rotl32(a,5) | 121 | addl %esi, %ecx # e += rotl32(a,5) |
| 92 | rorl $2, %ebp # b = rotl32(b,30) | 122 | rorl $2, %ebp # b = rotl32(b,30) |
| 93 | # 3 | 123 | # 3 |
| 94 | movl -32+4*3(%rsp), %esi # W[n] | 124 | addl -64+4*3(%rsp), %ebx # e += RCONST + W[n] |
| 95 | movl %ebp, %edi # c | 125 | movl %ebp, %edi # c |
| 96 | xorl %eax, %edi # ^d | 126 | xorl %eax, %edi # ^d |
| 97 | andl %edx, %edi # &b | 127 | andl %edx, %edi # &b |
| 98 | xorl %eax, %edi # (((c ^ d) & b) ^ d) | 128 | xorl %eax, %edi # (((c ^ d) & b) ^ d) |
| 99 | leal 0x5A827999(%rbx,%rsi), %ebx # e += RCONST + W[n] | ||
| 100 | addl %edi, %ebx # e += (((c ^ d) & b) ^ d) | 129 | addl %edi, %ebx # e += (((c ^ d) & b) ^ d) |
| 101 | movl %ecx, %esi # | 130 | movl %ecx, %esi # |
| 102 | roll $5, %esi # rotl32(a,5) | 131 | roll $5, %esi # rotl32(a,5) |
| 103 | addl %esi, %ebx # e += rotl32(a,5) | 132 | addl %esi, %ebx # e += rotl32(a,5) |
| 104 | rorl $2, %edx # b = rotl32(b,30) | 133 | rorl $2, %edx # b = rotl32(b,30) |
| 105 | # 4 | 134 | # 4 |
| 106 | movl -32+4*4(%rsp), %esi # W[n] | 135 | addl -64+4*4(%rsp), %eax # e += RCONST + W[n] |
| 107 | movl %edx, %edi # c | 136 | movl %edx, %edi # c |
| 108 | xorl %ebp, %edi # ^d | 137 | xorl %ebp, %edi # ^d |
| 109 | andl %ecx, %edi # &b | 138 | andl %ecx, %edi # &b |
| 110 | xorl %ebp, %edi # (((c ^ d) & b) ^ d) | 139 | xorl %ebp, %edi # (((c ^ d) & b) ^ d) |
| 111 | leal 0x5A827999(%rax,%rsi), %eax # e += RCONST + W[n] | ||
| 112 | addl %edi, %eax # e += (((c ^ d) & b) ^ d) | 140 | addl %edi, %eax # e += (((c ^ d) & b) ^ d) |
| 113 | movl %ebx, %esi # | 141 | movl %ebx, %esi # |
| 114 | roll $5, %esi # rotl32(a,5) | 142 | roll $5, %esi # rotl32(a,5) |
| 115 | addl %esi, %eax # e += rotl32(a,5) | 143 | addl %esi, %eax # e += rotl32(a,5) |
| 116 | rorl $2, %ecx # b = rotl32(b,30) | 144 | rorl $2, %ecx # b = rotl32(b,30) |
| 117 | # 5 | 145 | # 5 |
| 118 | movl -32+4*5(%rsp), %esi # W[n] | 146 | addl -64+4*5(%rsp), %ebp # e += RCONST + W[n] |
| 119 | movl %ecx, %edi # c | 147 | movl %ecx, %edi # c |
| 120 | xorl %edx, %edi # ^d | 148 | xorl %edx, %edi # ^d |
| 121 | andl %ebx, %edi # &b | 149 | andl %ebx, %edi # &b |
| 122 | xorl %edx, %edi # (((c ^ d) & b) ^ d) | 150 | xorl %edx, %edi # (((c ^ d) & b) ^ d) |
| 123 | leal 0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n] | ||
| 124 | addl %edi, %ebp # e += (((c ^ d) & b) ^ d) | 151 | addl %edi, %ebp # e += (((c ^ d) & b) ^ d) |
| 125 | movl %eax, %esi # | 152 | movl %eax, %esi # |
| 126 | roll $5, %esi # rotl32(a,5) | 153 | roll $5, %esi # rotl32(a,5) |
| 127 | addl %esi, %ebp # e += rotl32(a,5) | 154 | addl %esi, %ebp # e += rotl32(a,5) |
| 128 | rorl $2, %ebx # b = rotl32(b,30) | 155 | rorl $2, %ebx # b = rotl32(b,30) |
| 129 | # 6 | 156 | # 6 |
| 130 | movl -32+4*6(%rsp), %esi # W[n] | 157 | addl -64+4*6(%rsp), %edx # e += RCONST + W[n] |
| 131 | movl %ebx, %edi # c | 158 | movl %ebx, %edi # c |
| 132 | xorl %ecx, %edi # ^d | 159 | xorl %ecx, %edi # ^d |
| 133 | andl %eax, %edi # &b | 160 | andl %eax, %edi # &b |
| 134 | xorl %ecx, %edi # (((c ^ d) & b) ^ d) | 161 | xorl %ecx, %edi # (((c ^ d) & b) ^ d) |
| 135 | leal 0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n] | ||
| 136 | addl %edi, %edx # e += (((c ^ d) & b) ^ d) | 162 | addl %edi, %edx # e += (((c ^ d) & b) ^ d) |
| 137 | movl %ebp, %esi # | 163 | movl %ebp, %esi # |
| 138 | roll $5, %esi # rotl32(a,5) | 164 | roll $5, %esi # rotl32(a,5) |
| 139 | addl %esi, %edx # e += rotl32(a,5) | 165 | addl %esi, %edx # e += rotl32(a,5) |
| 140 | rorl $2, %eax # b = rotl32(b,30) | 166 | rorl $2, %eax # b = rotl32(b,30) |
| 141 | # 7 | 167 | # 7 |
| 142 | movl -32+4*7(%rsp), %esi # W[n] | 168 | addl -64+4*7(%rsp), %ecx # e += RCONST + W[n] |
| 143 | movl %eax, %edi # c | 169 | movl %eax, %edi # c |
| 144 | xorl %ebx, %edi # ^d | 170 | xorl %ebx, %edi # ^d |
| 145 | andl %ebp, %edi # &b | 171 | andl %ebp, %edi # &b |
| 146 | xorl %ebx, %edi # (((c ^ d) & b) ^ d) | 172 | xorl %ebx, %edi # (((c ^ d) & b) ^ d) |
| 147 | leal 0x5A827999(%rcx,%rsi), %ecx # e += RCONST + W[n] | ||
| 148 | addl %edi, %ecx # e += (((c ^ d) & b) ^ d) | 173 | addl %edi, %ecx # e += (((c ^ d) & b) ^ d) |
| 149 | movl %edx, %esi # | 174 | movl %edx, %esi # |
| 150 | roll $5, %esi # rotl32(a,5) | 175 | roll $5, %esi # rotl32(a,5) |
| 151 | addl %esi, %ecx # e += rotl32(a,5) | 176 | addl %esi, %ecx # e += rotl32(a,5) |
| 152 | rorl $2, %ebp # b = rotl32(b,30) | 177 | rorl $2, %ebp # b = rotl32(b,30) |
| 178 | # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) | ||
| 179 | movaps %xmm3, %xmm4 | ||
| 180 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
| 181 | pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
| 182 | punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
| 183 | xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
| 184 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
| 185 | xorps %xmm5, %xmm0 # ^ | ||
| 186 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
| 187 | movaps %xmm0, %xmm5 | ||
| 188 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
| 189 | pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
| 190 | paddd %xmm0, %xmm0 # shift left by 1 | ||
| 191 | psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 | ||
| 192 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
| 193 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
| 194 | movaps %xmm5, %xmm4 | ||
| 195 | pslld $2, %xmm5 | ||
| 196 | psrld $30, %xmm4 | ||
| 197 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
| 198 | xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 | ||
| 199 | xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
| 200 | movaps %xmm0, %xmm5 | ||
| 201 | paddd %xmm6, %xmm5 | ||
| 202 | movups %xmm5, -64+16*0(%rsp) | ||
| 153 | # 8 | 203 | # 8 |
| 154 | # W[n], in %r8 | 204 | leal 0x5A827999(%rbx,%r8), %ebx # e += RCONST + W[n] |
| 155 | movl %ebp, %edi # c | 205 | movl %ebp, %edi # c |
| 156 | xorl %eax, %edi # ^d | 206 | xorl %eax, %edi # ^d |
| 157 | andl %edx, %edi # &b | 207 | andl %edx, %edi # &b |
| 158 | xorl %eax, %edi # (((c ^ d) & b) ^ d) | 208 | xorl %eax, %edi # (((c ^ d) & b) ^ d) |
| 159 | leal 0x5A827999(%rbx,%r8), %ebx # e += RCONST + W[n] | ||
| 160 | addl %edi, %ebx # e += (((c ^ d) & b) ^ d) | 209 | addl %edi, %ebx # e += (((c ^ d) & b) ^ d) |
| 161 | movl %ecx, %esi # | 210 | movl %ecx, %esi # |
| 162 | roll $5, %esi # rotl32(a,5) | 211 | roll $5, %esi # rotl32(a,5) |
| 163 | addl %esi, %ebx # e += rotl32(a,5) | 212 | addl %esi, %ebx # e += rotl32(a,5) |
| 164 | rorl $2, %edx # b = rotl32(b,30) | 213 | rorl $2, %edx # b = rotl32(b,30) |
| 165 | # 9 | 214 | # 9 |
| 166 | # W[n], in %r9 | 215 | leal 0x5A827999(%rax,%r9), %eax # e += RCONST + W[n] |
| 167 | movl %edx, %edi # c | 216 | movl %edx, %edi # c |
| 168 | xorl %ebp, %edi # ^d | 217 | xorl %ebp, %edi # ^d |
| 169 | andl %ecx, %edi # &b | 218 | andl %ecx, %edi # &b |
| 170 | xorl %ebp, %edi # (((c ^ d) & b) ^ d) | 219 | xorl %ebp, %edi # (((c ^ d) & b) ^ d) |
| 171 | leal 0x5A827999(%rax,%r9), %eax # e += RCONST + W[n] | ||
| 172 | addl %edi, %eax # e += (((c ^ d) & b) ^ d) | 220 | addl %edi, %eax # e += (((c ^ d) & b) ^ d) |
| 173 | movl %ebx, %esi # | 221 | movl %ebx, %esi # |
| 174 | roll $5, %esi # rotl32(a,5) | 222 | roll $5, %esi # rotl32(a,5) |
| 175 | addl %esi, %eax # e += rotl32(a,5) | 223 | addl %esi, %eax # e += rotl32(a,5) |
| 176 | rorl $2, %ecx # b = rotl32(b,30) | 224 | rorl $2, %ecx # b = rotl32(b,30) |
| 177 | # 10 | 225 | # 10 |
| 178 | # W[n], in %r10 | 226 | leal 0x5A827999(%rbp,%r10), %ebp # e += RCONST + W[n] |
| 179 | movl %ecx, %edi # c | 227 | movl %ecx, %edi # c |
| 180 | xorl %edx, %edi # ^d | 228 | xorl %edx, %edi # ^d |
| 181 | andl %ebx, %edi # &b | 229 | andl %ebx, %edi # &b |
| 182 | xorl %edx, %edi # (((c ^ d) & b) ^ d) | 230 | xorl %edx, %edi # (((c ^ d) & b) ^ d) |
| 183 | leal 0x5A827999(%rbp,%r10), %ebp # e += RCONST + W[n] | ||
| 184 | addl %edi, %ebp # e += (((c ^ d) & b) ^ d) | 231 | addl %edi, %ebp # e += (((c ^ d) & b) ^ d) |
| 185 | movl %eax, %esi # | 232 | movl %eax, %esi # |
| 186 | roll $5, %esi # rotl32(a,5) | 233 | roll $5, %esi # rotl32(a,5) |
| 187 | addl %esi, %ebp # e += rotl32(a,5) | 234 | addl %esi, %ebp # e += rotl32(a,5) |
| 188 | rorl $2, %ebx # b = rotl32(b,30) | 235 | rorl $2, %ebx # b = rotl32(b,30) |
| 189 | # 11 | 236 | # 11 |
| 190 | # W[n], in %r11 | 237 | leal 0x5A827999(%rdx,%r11), %edx # e += RCONST + W[n] |
| 191 | movl %ebx, %edi # c | 238 | movl %ebx, %edi # c |
| 192 | xorl %ecx, %edi # ^d | 239 | xorl %ecx, %edi # ^d |
| 193 | andl %eax, %edi # &b | 240 | andl %eax, %edi # &b |
| 194 | xorl %ecx, %edi # (((c ^ d) & b) ^ d) | 241 | xorl %ecx, %edi # (((c ^ d) & b) ^ d) |
| 195 | leal 0x5A827999(%rdx,%r11), %edx # e += RCONST + W[n] | ||
| 196 | addl %edi, %edx # e += (((c ^ d) & b) ^ d) | 242 | addl %edi, %edx # e += (((c ^ d) & b) ^ d) |
| 197 | movl %ebp, %esi # | 243 | movl %ebp, %esi # |
| 198 | roll $5, %esi # rotl32(a,5) | 244 | roll $5, %esi # rotl32(a,5) |
| 199 | addl %esi, %edx # e += rotl32(a,5) | 245 | addl %esi, %edx # e += rotl32(a,5) |
| 200 | rorl $2, %eax # b = rotl32(b,30) | 246 | rorl $2, %eax # b = rotl32(b,30) |
| 247 | movaps rconst0x6ED9EBA1(%rip), %xmm6 | ||
| 248 | # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) | ||
| 249 | movaps %xmm0, %xmm4 | ||
| 250 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
| 251 | pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
| 252 | punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
| 253 | xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
| 254 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
| 255 | xorps %xmm5, %xmm1 # ^ | ||
| 256 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
| 257 | movaps %xmm1, %xmm5 | ||
| 258 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
| 259 | pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
| 260 | paddd %xmm1, %xmm1 # shift left by 1 | ||
| 261 | psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 | ||
| 262 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
| 263 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
| 264 | movaps %xmm5, %xmm4 | ||
| 265 | pslld $2, %xmm5 | ||
| 266 | psrld $30, %xmm4 | ||
| 267 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
| 268 | xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 | ||
| 269 | xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
| 270 | movaps %xmm1, %xmm5 | ||
| 271 | paddd %xmm6, %xmm5 | ||
| 272 | movups %xmm5, -64+16*1(%rsp) | ||
| 201 | # 12 | 273 | # 12 |
| 202 | # W[n], in %r12 | 274 | leal 0x5A827999(%rcx,%r12), %ecx # e += RCONST + W[n] |
| 203 | movl %eax, %edi # c | 275 | movl %eax, %edi # c |
| 204 | xorl %ebx, %edi # ^d | 276 | xorl %ebx, %edi # ^d |
| 205 | andl %ebp, %edi # &b | 277 | andl %ebp, %edi # &b |
| 206 | xorl %ebx, %edi # (((c ^ d) & b) ^ d) | 278 | xorl %ebx, %edi # (((c ^ d) & b) ^ d) |
| 207 | leal 0x5A827999(%rcx,%r12), %ecx # e += RCONST + W[n] | ||
| 208 | addl %edi, %ecx # e += (((c ^ d) & b) ^ d) | 279 | addl %edi, %ecx # e += (((c ^ d) & b) ^ d) |
| 209 | movl %edx, %esi # | 280 | movl %edx, %esi # |
| 210 | roll $5, %esi # rotl32(a,5) | 281 | roll $5, %esi # rotl32(a,5) |
| 211 | addl %esi, %ecx # e += rotl32(a,5) | 282 | addl %esi, %ecx # e += rotl32(a,5) |
| 212 | rorl $2, %ebp # b = rotl32(b,30) | 283 | rorl $2, %ebp # b = rotl32(b,30) |
| 213 | # 13 | 284 | # 13 |
| 214 | # W[n], in %r13 | 285 | leal 0x5A827999(%rbx,%r13), %ebx # e += RCONST + W[n] |
| 215 | movl %ebp, %edi # c | 286 | movl %ebp, %edi # c |
| 216 | xorl %eax, %edi # ^d | 287 | xorl %eax, %edi # ^d |
| 217 | andl %edx, %edi # &b | 288 | andl %edx, %edi # &b |
| 218 | xorl %eax, %edi # (((c ^ d) & b) ^ d) | 289 | xorl %eax, %edi # (((c ^ d) & b) ^ d) |
| 219 | leal 0x5A827999(%rbx,%r13), %ebx # e += RCONST + W[n] | ||
| 220 | addl %edi, %ebx # e += (((c ^ d) & b) ^ d) | 290 | addl %edi, %ebx # e += (((c ^ d) & b) ^ d) |
| 221 | movl %ecx, %esi # | 291 | movl %ecx, %esi # |
| 222 | roll $5, %esi # rotl32(a,5) | 292 | roll $5, %esi # rotl32(a,5) |
| 223 | addl %esi, %ebx # e += rotl32(a,5) | 293 | addl %esi, %ebx # e += rotl32(a,5) |
| 224 | rorl $2, %edx # b = rotl32(b,30) | 294 | rorl $2, %edx # b = rotl32(b,30) |
| 225 | # 14 | 295 | # 14 |
| 226 | # W[n], in %r14 | 296 | leal 0x5A827999(%rax,%r14), %eax # e += RCONST + W[n] |
| 227 | movl %edx, %edi # c | 297 | movl %edx, %edi # c |
| 228 | xorl %ebp, %edi # ^d | 298 | xorl %ebp, %edi # ^d |
| 229 | andl %ecx, %edi # &b | 299 | andl %ecx, %edi # &b |
| 230 | xorl %ebp, %edi # (((c ^ d) & b) ^ d) | 300 | xorl %ebp, %edi # (((c ^ d) & b) ^ d) |
| 231 | leal 0x5A827999(%rax,%r14), %eax # e += RCONST + W[n] | ||
| 232 | addl %edi, %eax # e += (((c ^ d) & b) ^ d) | 301 | addl %edi, %eax # e += (((c ^ d) & b) ^ d) |
| 233 | movl %ebx, %esi # | 302 | movl %ebx, %esi # |
| 234 | roll $5, %esi # rotl32(a,5) | 303 | roll $5, %esi # rotl32(a,5) |
| 235 | addl %esi, %eax # e += rotl32(a,5) | 304 | addl %esi, %eax # e += rotl32(a,5) |
| 236 | rorl $2, %ecx # b = rotl32(b,30) | 305 | rorl $2, %ecx # b = rotl32(b,30) |
| 237 | # 15 | 306 | # 15 |
| 238 | # W[n], in %r15 | 307 | leal 0x5A827999(%rbp,%r15), %ebp # e += RCONST + W[n] |
| 239 | movl %ecx, %edi # c | 308 | movl %ecx, %edi # c |
| 240 | xorl %edx, %edi # ^d | 309 | xorl %edx, %edi # ^d |
| 241 | andl %ebx, %edi # &b | 310 | andl %ebx, %edi # &b |
| 242 | xorl %edx, %edi # (((c ^ d) & b) ^ d) | 311 | xorl %edx, %edi # (((c ^ d) & b) ^ d) |
| 243 | leal 0x5A827999(%rbp,%r15), %ebp # e += RCONST + W[n] | ||
| 244 | addl %edi, %ebp # e += (((c ^ d) & b) ^ d) | 312 | addl %edi, %ebp # e += (((c ^ d) & b) ^ d) |
| 245 | movl %eax, %esi # | 313 | movl %eax, %esi # |
| 246 | roll $5, %esi # rotl32(a,5) | 314 | roll $5, %esi # rotl32(a,5) |
| 247 | addl %esi, %ebp # e += rotl32(a,5) | 315 | addl %esi, %ebp # e += rotl32(a,5) |
| 248 | rorl $2, %ebx # b = rotl32(b,30) | 316 | rorl $2, %ebx # b = rotl32(b,30) |
| 317 | # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) | ||
| 318 | movaps %xmm1, %xmm4 | ||
| 319 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
| 320 | pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
| 321 | punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
| 322 | xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
| 323 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
| 324 | xorps %xmm5, %xmm2 # ^ | ||
| 325 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
| 326 | movaps %xmm2, %xmm5 | ||
| 327 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
| 328 | pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
| 329 | paddd %xmm2, %xmm2 # shift left by 1 | ||
| 330 | psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 | ||
| 331 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
| 332 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
| 333 | movaps %xmm5, %xmm4 | ||
| 334 | pslld $2, %xmm5 | ||
| 335 | psrld $30, %xmm4 | ||
| 336 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
| 337 | xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 | ||
| 338 | xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
| 339 | movaps %xmm2, %xmm5 | ||
| 340 | paddd %xmm6, %xmm5 | ||
| 341 | movups %xmm5, -64+16*2(%rsp) | ||
| 249 | # 16 | 342 | # 16 |
| 250 | movl %r13d, %esi # W[(n+13) & 15] | ||
| 251 | xorl %r8d, %esi # ^W[(n+8) & 15] | ||
| 252 | xorl -32+4*2(%rsp), %esi # ^W[(n+2) & 15] | ||
| 253 | xorl -32+4*0(%rsp), %esi # ^W[n & 15] | ||
| 254 | roll %esi # | ||
| 255 | movl %esi, -32+4*0(%rsp) # store to W[n & 15] | ||
| 256 | movl %ebx, %edi # c | 343 | movl %ebx, %edi # c |
| 257 | xorl %ecx, %edi # ^d | 344 | xorl %ecx, %edi # ^d |
| 258 | andl %eax, %edi # &b | 345 | andl %eax, %edi # &b |
| 259 | xorl %ecx, %edi # (((c ^ d) & b) ^ d) | 346 | xorl %ecx, %edi # (((c ^ d) & b) ^ d) |
| 260 | leal 0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n & 15] | 347 | addl -64+4*0(%rsp), %edx # e += RCONST + W[n & 15] |
| 261 | addl %edi, %edx # e += (((c ^ d) & b) ^ d) | 348 | addl %edi, %edx # e += (((c ^ d) & b) ^ d) |
| 262 | movl %ebp, %esi # | 349 | movl %ebp, %esi # |
| 263 | roll $5, %esi # rotl32(a,5) | 350 | roll $5, %esi # rotl32(a,5) |
| 264 | addl %esi, %edx # e += rotl32(a,5) | 351 | addl %esi, %edx # e += rotl32(a,5) |
| 265 | rorl $2, %eax # b = rotl32(b,30) | 352 | rorl $2, %eax # b = rotl32(b,30) |
| 266 | # 17 | 353 | # 17 |
| 267 | movl %r14d, %esi # W[(n+13) & 15] | ||
| 268 | xorl %r9d, %esi # ^W[(n+8) & 15] | ||
| 269 | xorl -32+4*3(%rsp), %esi # ^W[(n+2) & 15] | ||
| 270 | xorl -32+4*1(%rsp), %esi # ^W[n & 15] | ||
| 271 | roll %esi # | ||
| 272 | movl %esi, -32+4*1(%rsp) # store to W[n & 15] | ||
| 273 | movl %eax, %edi # c | 354 | movl %eax, %edi # c |
| 274 | xorl %ebx, %edi # ^d | 355 | xorl %ebx, %edi # ^d |
| 275 | andl %ebp, %edi # &b | 356 | andl %ebp, %edi # &b |
| 276 | xorl %ebx, %edi # (((c ^ d) & b) ^ d) | 357 | xorl %ebx, %edi # (((c ^ d) & b) ^ d) |
| 277 | leal 0x5A827999(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] | 358 | addl -64+4*1(%rsp), %ecx # e += RCONST + W[n & 15] |
| 278 | addl %edi, %ecx # e += (((c ^ d) & b) ^ d) | 359 | addl %edi, %ecx # e += (((c ^ d) & b) ^ d) |
| 279 | movl %edx, %esi # | 360 | movl %edx, %esi # |
| 280 | roll $5, %esi # rotl32(a,5) | 361 | roll $5, %esi # rotl32(a,5) |
| 281 | addl %esi, %ecx # e += rotl32(a,5) | 362 | addl %esi, %ecx # e += rotl32(a,5) |
| 282 | rorl $2, %ebp # b = rotl32(b,30) | 363 | rorl $2, %ebp # b = rotl32(b,30) |
| 283 | # 18 | 364 | # 18 |
| 284 | movl %r15d, %esi # W[(n+13) & 15] | ||
| 285 | xorl %r10d, %esi # ^W[(n+8) & 15] | ||
| 286 | xorl -32+4*4(%rsp), %esi # ^W[(n+2) & 15] | ||
| 287 | xorl -32+4*2(%rsp), %esi # ^W[n & 15] | ||
| 288 | roll %esi # | ||
| 289 | movl %esi, -32+4*2(%rsp) # store to W[n & 15] | ||
| 290 | movl %ebp, %edi # c | 365 | movl %ebp, %edi # c |
| 291 | xorl %eax, %edi # ^d | 366 | xorl %eax, %edi # ^d |
| 292 | andl %edx, %edi # &b | 367 | andl %edx, %edi # &b |
| 293 | xorl %eax, %edi # (((c ^ d) & b) ^ d) | 368 | xorl %eax, %edi # (((c ^ d) & b) ^ d) |
| 294 | leal 0x5A827999(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] | 369 | addl -64+4*2(%rsp), %ebx # e += RCONST + W[n & 15] |
| 295 | addl %edi, %ebx # e += (((c ^ d) & b) ^ d) | 370 | addl %edi, %ebx # e += (((c ^ d) & b) ^ d) |
| 296 | movl %ecx, %esi # | 371 | movl %ecx, %esi # |
| 297 | roll $5, %esi # rotl32(a,5) | 372 | roll $5, %esi # rotl32(a,5) |
| 298 | addl %esi, %ebx # e += rotl32(a,5) | 373 | addl %esi, %ebx # e += rotl32(a,5) |
| 299 | rorl $2, %edx # b = rotl32(b,30) | 374 | rorl $2, %edx # b = rotl32(b,30) |
| 300 | # 19 | 375 | # 19 |
| 301 | movl -32+4*0(%rsp), %esi # W[(n+13) & 15] | ||
| 302 | xorl %r11d, %esi # ^W[(n+8) & 15] | ||
| 303 | xorl -32+4*5(%rsp), %esi # ^W[(n+2) & 15] | ||
| 304 | xorl -32+4*3(%rsp), %esi # ^W[n & 15] | ||
| 305 | roll %esi # | ||
| 306 | movl %esi, -32+4*3(%rsp) # store to W[n & 15] | ||
| 307 | movl %edx, %edi # c | 376 | movl %edx, %edi # c |
| 308 | xorl %ebp, %edi # ^d | 377 | xorl %ebp, %edi # ^d |
| 309 | andl %ecx, %edi # &b | 378 | andl %ecx, %edi # &b |
| 310 | xorl %ebp, %edi # (((c ^ d) & b) ^ d) | 379 | xorl %ebp, %edi # (((c ^ d) & b) ^ d) |
| 311 | leal 0x5A827999(%rax,%rsi), %eax # e += RCONST + W[n & 15] | 380 | addl -64+4*3(%rsp), %eax # e += RCONST + W[n & 15] |
| 312 | addl %edi, %eax # e += (((c ^ d) & b) ^ d) | 381 | addl %edi, %eax # e += (((c ^ d) & b) ^ d) |
| 313 | movl %ebx, %esi # | 382 | movl %ebx, %esi # |
| 314 | roll $5, %esi # rotl32(a,5) | 383 | roll $5, %esi # rotl32(a,5) |
| 315 | addl %esi, %eax # e += rotl32(a,5) | 384 | addl %esi, %eax # e += rotl32(a,5) |
| 316 | rorl $2, %ecx # b = rotl32(b,30) | 385 | rorl $2, %ecx # b = rotl32(b,30) |
| 386 | # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) | ||
| 387 | movaps %xmm2, %xmm4 | ||
| 388 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
| 389 | pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
| 390 | punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
| 391 | xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
| 392 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
| 393 | xorps %xmm5, %xmm3 # ^ | ||
| 394 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
| 395 | movaps %xmm3, %xmm5 | ||
| 396 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
| 397 | pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
| 398 | paddd %xmm3, %xmm3 # shift left by 1 | ||
| 399 | psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 | ||
| 400 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
| 401 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
| 402 | movaps %xmm5, %xmm4 | ||
| 403 | pslld $2, %xmm5 | ||
| 404 | psrld $30, %xmm4 | ||
| 405 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
| 406 | xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 | ||
| 407 | xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
| 408 | movaps %xmm3, %xmm5 | ||
| 409 | paddd %xmm6, %xmm5 | ||
| 410 | movups %xmm5, -64+16*3(%rsp) | ||
| 317 | # 20 | 411 | # 20 |
| 318 | movl -32+4*1(%rsp), %esi # W[(n+13) & 15] | ||
| 319 | xorl %r12d, %esi # ^W[(n+8) & 15] | ||
| 320 | xorl -32+4*6(%rsp), %esi # ^W[(n+2) & 15] | ||
| 321 | xorl -32+4*4(%rsp), %esi # ^W[n & 15] | ||
| 322 | roll %esi # | ||
| 323 | movl %esi, -32+4*4(%rsp) # store to W[n & 15] | ||
| 324 | movl %ecx, %edi # c | 412 | movl %ecx, %edi # c |
| 325 | xorl %edx, %edi # ^d | 413 | xorl %edx, %edi # ^d |
| 326 | xorl %ebx, %edi # ^b | 414 | xorl %ebx, %edi # ^b |
| 327 | leal 0x6ED9EBA1(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] | 415 | addl -64+4*4(%rsp), %ebp # e += RCONST + W[n & 15] |
| 328 | addl %edi, %ebp # e += (c ^ d ^ b) | 416 | addl %edi, %ebp # e += (c ^ d ^ b) |
| 329 | movl %eax, %esi # | 417 | movl %eax, %esi # |
| 330 | roll $5, %esi # rotl32(a,5) | 418 | roll $5, %esi # rotl32(a,5) |
| 331 | addl %esi, %ebp # e += rotl32(a,5) | 419 | addl %esi, %ebp # e += rotl32(a,5) |
| 332 | rorl $2, %ebx # b = rotl32(b,30) | 420 | rorl $2, %ebx # b = rotl32(b,30) |
| 333 | # 21 | 421 | # 21 |
| 334 | movl -32+4*2(%rsp), %esi # W[(n+13) & 15] | ||
| 335 | xorl %r13d, %esi # ^W[(n+8) & 15] | ||
| 336 | xorl -32+4*7(%rsp), %esi # ^W[(n+2) & 15] | ||
| 337 | xorl -32+4*5(%rsp), %esi # ^W[n & 15] | ||
| 338 | roll %esi # | ||
| 339 | movl %esi, -32+4*5(%rsp) # store to W[n & 15] | ||
| 340 | movl %ebx, %edi # c | 422 | movl %ebx, %edi # c |
| 341 | xorl %ecx, %edi # ^d | 423 | xorl %ecx, %edi # ^d |
| 342 | xorl %eax, %edi # ^b | 424 | xorl %eax, %edi # ^b |
| 343 | leal 0x6ED9EBA1(%rdx,%rsi), %edx # e += RCONST + W[n & 15] | 425 | addl -64+4*5(%rsp), %edx # e += RCONST + W[n & 15] |
| 344 | addl %edi, %edx # e += (c ^ d ^ b) | 426 | addl %edi, %edx # e += (c ^ d ^ b) |
| 345 | movl %ebp, %esi # | 427 | movl %ebp, %esi # |
| 346 | roll $5, %esi # rotl32(a,5) | 428 | roll $5, %esi # rotl32(a,5) |
| 347 | addl %esi, %edx # e += rotl32(a,5) | 429 | addl %esi, %edx # e += rotl32(a,5) |
| 348 | rorl $2, %eax # b = rotl32(b,30) | 430 | rorl $2, %eax # b = rotl32(b,30) |
| 349 | # 22 | 431 | # 22 |
| 350 | movl -32+4*3(%rsp), %esi # W[(n+13) & 15] | ||
| 351 | xorl %r14d, %esi # ^W[(n+8) & 15] | ||
| 352 | xorl %r8d, %esi # ^W[(n+2) & 15] | ||
| 353 | xorl -32+4*6(%rsp), %esi # ^W[n & 15] | ||
| 354 | roll %esi # | ||
| 355 | movl %esi, -32+4*6(%rsp) # store to W[n & 15] | ||
| 356 | movl %eax, %edi # c | 432 | movl %eax, %edi # c |
| 357 | xorl %ebx, %edi # ^d | 433 | xorl %ebx, %edi # ^d |
| 358 | xorl %ebp, %edi # ^b | 434 | xorl %ebp, %edi # ^b |
| 359 | leal 0x6ED9EBA1(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] | 435 | addl -64+4*6(%rsp), %ecx # e += RCONST + W[n & 15] |
| 360 | addl %edi, %ecx # e += (c ^ d ^ b) | 436 | addl %edi, %ecx # e += (c ^ d ^ b) |
| 361 | movl %edx, %esi # | 437 | movl %edx, %esi # |
| 362 | roll $5, %esi # rotl32(a,5) | 438 | roll $5, %esi # rotl32(a,5) |
| 363 | addl %esi, %ecx # e += rotl32(a,5) | 439 | addl %esi, %ecx # e += rotl32(a,5) |
| 364 | rorl $2, %ebp # b = rotl32(b,30) | 440 | rorl $2, %ebp # b = rotl32(b,30) |
| 365 | # 23 | 441 | # 23 |
| 366 | movl -32+4*4(%rsp), %esi # W[(n+13) & 15] | ||
| 367 | xorl %r15d, %esi # ^W[(n+8) & 15] | ||
| 368 | xorl %r9d, %esi # ^W[(n+2) & 15] | ||
| 369 | xorl -32+4*7(%rsp), %esi # ^W[n & 15] | ||
| 370 | roll %esi # | ||
| 371 | movl %esi, -32+4*7(%rsp) # store to W[n & 15] | ||
| 372 | movl %ebp, %edi # c | 442 | movl %ebp, %edi # c |
| 373 | xorl %eax, %edi # ^d | 443 | xorl %eax, %edi # ^d |
| 374 | xorl %edx, %edi # ^b | 444 | xorl %edx, %edi # ^b |
| 375 | leal 0x6ED9EBA1(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] | 445 | addl -64+4*7(%rsp), %ebx # e += RCONST + W[n & 15] |
| 376 | addl %edi, %ebx # e += (c ^ d ^ b) | 446 | addl %edi, %ebx # e += (c ^ d ^ b) |
| 377 | movl %ecx, %esi # | 447 | movl %ecx, %esi # |
| 378 | roll $5, %esi # rotl32(a,5) | 448 | roll $5, %esi # rotl32(a,5) |
| 379 | addl %esi, %ebx # e += rotl32(a,5) | 449 | addl %esi, %ebx # e += rotl32(a,5) |
| 380 | rorl $2, %edx # b = rotl32(b,30) | 450 | rorl $2, %edx # b = rotl32(b,30) |
| 451 | # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) | ||
| 452 | movaps %xmm3, %xmm4 | ||
| 453 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
| 454 | pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
| 455 | punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
| 456 | xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
| 457 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
| 458 | xorps %xmm5, %xmm0 # ^ | ||
| 459 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
| 460 | movaps %xmm0, %xmm5 | ||
| 461 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
| 462 | pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
| 463 | paddd %xmm0, %xmm0 # shift left by 1 | ||
| 464 | psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 | ||
| 465 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
| 466 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
| 467 | movaps %xmm5, %xmm4 | ||
| 468 | pslld $2, %xmm5 | ||
| 469 | psrld $30, %xmm4 | ||
| 470 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
| 471 | xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 | ||
| 472 | xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
| 473 | movaps %xmm0, %xmm5 | ||
| 474 | paddd %xmm6, %xmm5 | ||
| 475 | movups %xmm5, -64+16*0(%rsp) | ||
| 381 | # 24 | 476 | # 24 |
| 382 | xorl -32+4*5(%rsp), %r8d # W[n & 15] ^= W[(n+13) & 15] | ||
| 383 | xorl -32+4*0(%rsp), %r8d # ^W[(n+8) & 15] | ||
| 384 | xorl %r10d, %r8d # ^W[(n+2) & 15] | ||
| 385 | roll %r8d # | ||
| 386 | movl %edx, %edi # c | 477 | movl %edx, %edi # c |
| 387 | xorl %ebp, %edi # ^d | 478 | xorl %ebp, %edi # ^d |
| 388 | xorl %ecx, %edi # ^b | 479 | xorl %ecx, %edi # ^b |
| 389 | leal 0x6ED9EBA1(%rax,%r8), %eax # e += RCONST + W[n & 15] | 480 | addl -64+4*8(%rsp), %eax # e += RCONST + W[n & 15] |
| 390 | addl %edi, %eax # e += (c ^ d ^ b) | 481 | addl %edi, %eax # e += (c ^ d ^ b) |
| 391 | movl %ebx, %esi # | 482 | movl %ebx, %esi # |
| 392 | roll $5, %esi # rotl32(a,5) | 483 | roll $5, %esi # rotl32(a,5) |
| 393 | addl %esi, %eax # e += rotl32(a,5) | 484 | addl %esi, %eax # e += rotl32(a,5) |
| 394 | rorl $2, %ecx # b = rotl32(b,30) | 485 | rorl $2, %ecx # b = rotl32(b,30) |
| 395 | # 25 | 486 | # 25 |
| 396 | xorl -32+4*6(%rsp), %r9d # W[n & 15] ^= W[(n+13) & 15] | ||
| 397 | xorl -32+4*1(%rsp), %r9d # ^W[(n+8) & 15] | ||
| 398 | xorl %r11d, %r9d # ^W[(n+2) & 15] | ||
| 399 | roll %r9d # | ||
| 400 | movl %ecx, %edi # c | 487 | movl %ecx, %edi # c |
| 401 | xorl %edx, %edi # ^d | 488 | xorl %edx, %edi # ^d |
| 402 | xorl %ebx, %edi # ^b | 489 | xorl %ebx, %edi # ^b |
| 403 | leal 0x6ED9EBA1(%rbp,%r9), %ebp # e += RCONST + W[n & 15] | 490 | addl -64+4*9(%rsp), %ebp # e += RCONST + W[n & 15] |
| 404 | addl %edi, %ebp # e += (c ^ d ^ b) | 491 | addl %edi, %ebp # e += (c ^ d ^ b) |
| 405 | movl %eax, %esi # | 492 | movl %eax, %esi # |
| 406 | roll $5, %esi # rotl32(a,5) | 493 | roll $5, %esi # rotl32(a,5) |
| 407 | addl %esi, %ebp # e += rotl32(a,5) | 494 | addl %esi, %ebp # e += rotl32(a,5) |
| 408 | rorl $2, %ebx # b = rotl32(b,30) | 495 | rorl $2, %ebx # b = rotl32(b,30) |
| 409 | # 26 | 496 | # 26 |
| 410 | xorl -32+4*7(%rsp), %r10d # W[n & 15] ^= W[(n+13) & 15] | ||
| 411 | xorl -32+4*2(%rsp), %r10d # ^W[(n+8) & 15] | ||
| 412 | xorl %r12d, %r10d # ^W[(n+2) & 15] | ||
| 413 | roll %r10d # | ||
| 414 | movl %ebx, %edi # c | 497 | movl %ebx, %edi # c |
| 415 | xorl %ecx, %edi # ^d | 498 | xorl %ecx, %edi # ^d |
| 416 | xorl %eax, %edi # ^b | 499 | xorl %eax, %edi # ^b |
| 417 | leal 0x6ED9EBA1(%rdx,%r10), %edx # e += RCONST + W[n & 15] | 500 | addl -64+4*10(%rsp), %edx # e += RCONST + W[n & 15] |
| 418 | addl %edi, %edx # e += (c ^ d ^ b) | 501 | addl %edi, %edx # e += (c ^ d ^ b) |
| 419 | movl %ebp, %esi # | 502 | movl %ebp, %esi # |
| 420 | roll $5, %esi # rotl32(a,5) | 503 | roll $5, %esi # rotl32(a,5) |
| 421 | addl %esi, %edx # e += rotl32(a,5) | 504 | addl %esi, %edx # e += rotl32(a,5) |
| 422 | rorl $2, %eax # b = rotl32(b,30) | 505 | rorl $2, %eax # b = rotl32(b,30) |
| 423 | # 27 | 506 | # 27 |
| 424 | xorl %r8d, %r11d # W[n & 15] ^= W[(n+13) & 15] | ||
| 425 | xorl -32+4*3(%rsp), %r11d # ^W[(n+8) & 15] | ||
| 426 | xorl %r13d, %r11d # ^W[(n+2) & 15] | ||
| 427 | roll %r11d # | ||
| 428 | movl %eax, %edi # c | 507 | movl %eax, %edi # c |
| 429 | xorl %ebx, %edi # ^d | 508 | xorl %ebx, %edi # ^d |
| 430 | xorl %ebp, %edi # ^b | 509 | xorl %ebp, %edi # ^b |
| 431 | leal 0x6ED9EBA1(%rcx,%r11), %ecx # e += RCONST + W[n & 15] | 510 | addl -64+4*11(%rsp), %ecx # e += RCONST + W[n & 15] |
| 432 | addl %edi, %ecx # e += (c ^ d ^ b) | 511 | addl %edi, %ecx # e += (c ^ d ^ b) |
| 433 | movl %edx, %esi # | 512 | movl %edx, %esi # |
| 434 | roll $5, %esi # rotl32(a,5) | 513 | roll $5, %esi # rotl32(a,5) |
| 435 | addl %esi, %ecx # e += rotl32(a,5) | 514 | addl %esi, %ecx # e += rotl32(a,5) |
| 436 | rorl $2, %ebp # b = rotl32(b,30) | 515 | rorl $2, %ebp # b = rotl32(b,30) |
| 516 | # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) | ||
| 517 | movaps %xmm0, %xmm4 | ||
| 518 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
| 519 | pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
| 520 | punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
| 521 | xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
| 522 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
| 523 | xorps %xmm5, %xmm1 # ^ | ||
| 524 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
| 525 | movaps %xmm1, %xmm5 | ||
| 526 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
| 527 | pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
| 528 | paddd %xmm1, %xmm1 # shift left by 1 | ||
| 529 | psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 | ||
| 530 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
| 531 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
| 532 | movaps %xmm5, %xmm4 | ||
| 533 | pslld $2, %xmm5 | ||
| 534 | psrld $30, %xmm4 | ||
| 535 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
| 536 | xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 | ||
| 537 | xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
| 538 | movaps %xmm1, %xmm5 | ||
| 539 | paddd %xmm6, %xmm5 | ||
| 540 | movups %xmm5, -64+16*1(%rsp) | ||
| 437 | # 28 | 541 | # 28 |
| 438 | xorl %r9d, %r12d # W[n & 15] ^= W[(n+13) & 15] | ||
| 439 | xorl -32+4*4(%rsp), %r12d # ^W[(n+8) & 15] | ||
| 440 | xorl %r14d, %r12d # ^W[(n+2) & 15] | ||
| 441 | roll %r12d # | ||
| 442 | movl %ebp, %edi # c | 542 | movl %ebp, %edi # c |
| 443 | xorl %eax, %edi # ^d | 543 | xorl %eax, %edi # ^d |
| 444 | xorl %edx, %edi # ^b | 544 | xorl %edx, %edi # ^b |
| 445 | leal 0x6ED9EBA1(%rbx,%r12), %ebx # e += RCONST + W[n & 15] | 545 | addl -64+4*12(%rsp), %ebx # e += RCONST + W[n & 15] |
| 446 | addl %edi, %ebx # e += (c ^ d ^ b) | 546 | addl %edi, %ebx # e += (c ^ d ^ b) |
| 447 | movl %ecx, %esi # | 547 | movl %ecx, %esi # |
| 448 | roll $5, %esi # rotl32(a,5) | 548 | roll $5, %esi # rotl32(a,5) |
| 449 | addl %esi, %ebx # e += rotl32(a,5) | 549 | addl %esi, %ebx # e += rotl32(a,5) |
| 450 | rorl $2, %edx # b = rotl32(b,30) | 550 | rorl $2, %edx # b = rotl32(b,30) |
| 451 | # 29 | 551 | # 29 |
| 452 | xorl %r10d, %r13d # W[n & 15] ^= W[(n+13) & 15] | ||
| 453 | xorl -32+4*5(%rsp), %r13d # ^W[(n+8) & 15] | ||
| 454 | xorl %r15d, %r13d # ^W[(n+2) & 15] | ||
| 455 | roll %r13d # | ||
| 456 | movl %edx, %edi # c | 552 | movl %edx, %edi # c |
| 457 | xorl %ebp, %edi # ^d | 553 | xorl %ebp, %edi # ^d |
| 458 | xorl %ecx, %edi # ^b | 554 | xorl %ecx, %edi # ^b |
| 459 | leal 0x6ED9EBA1(%rax,%r13), %eax # e += RCONST + W[n & 15] | 555 | addl -64+4*13(%rsp), %eax # e += RCONST + W[n & 15] |
| 460 | addl %edi, %eax # e += (c ^ d ^ b) | 556 | addl %edi, %eax # e += (c ^ d ^ b) |
| 461 | movl %ebx, %esi # | 557 | movl %ebx, %esi # |
| 462 | roll $5, %esi # rotl32(a,5) | 558 | roll $5, %esi # rotl32(a,5) |
| 463 | addl %esi, %eax # e += rotl32(a,5) | 559 | addl %esi, %eax # e += rotl32(a,5) |
| 464 | rorl $2, %ecx # b = rotl32(b,30) | 560 | rorl $2, %ecx # b = rotl32(b,30) |
| 465 | # 30 | 561 | # 30 |
| 466 | xorl %r11d, %r14d # W[n & 15] ^= W[(n+13) & 15] | ||
| 467 | xorl -32+4*6(%rsp), %r14d # ^W[(n+8) & 15] | ||
| 468 | xorl -32+4*0(%rsp), %r14d # ^W[(n+2) & 15] | ||
| 469 | roll %r14d # | ||
| 470 | movl %ecx, %edi # c | 562 | movl %ecx, %edi # c |
| 471 | xorl %edx, %edi # ^d | 563 | xorl %edx, %edi # ^d |
| 472 | xorl %ebx, %edi # ^b | 564 | xorl %ebx, %edi # ^b |
| 473 | leal 0x6ED9EBA1(%rbp,%r14), %ebp # e += RCONST + W[n & 15] | 565 | addl -64+4*14(%rsp), %ebp # e += RCONST + W[n & 15] |
| 474 | addl %edi, %ebp # e += (c ^ d ^ b) | 566 | addl %edi, %ebp # e += (c ^ d ^ b) |
| 475 | movl %eax, %esi # | 567 | movl %eax, %esi # |
| 476 | roll $5, %esi # rotl32(a,5) | 568 | roll $5, %esi # rotl32(a,5) |
| 477 | addl %esi, %ebp # e += rotl32(a,5) | 569 | addl %esi, %ebp # e += rotl32(a,5) |
| 478 | rorl $2, %ebx # b = rotl32(b,30) | 570 | rorl $2, %ebx # b = rotl32(b,30) |
| 479 | # 31 | 571 | # 31 |
| 480 | xorl %r12d, %r15d # W[n & 15] ^= W[(n+13) & 15] | ||
| 481 | xorl -32+4*7(%rsp), %r15d # ^W[(n+8) & 15] | ||
| 482 | xorl -32+4*1(%rsp), %r15d # ^W[(n+2) & 15] | ||
| 483 | roll %r15d # | ||
| 484 | movl %ebx, %edi # c | 572 | movl %ebx, %edi # c |
| 485 | xorl %ecx, %edi # ^d | 573 | xorl %ecx, %edi # ^d |
| 486 | xorl %eax, %edi # ^b | 574 | xorl %eax, %edi # ^b |
| 487 | leal 0x6ED9EBA1(%rdx,%r15), %edx # e += RCONST + W[n & 15] | 575 | addl -64+4*15(%rsp), %edx # e += RCONST + W[n & 15] |
| 488 | addl %edi, %edx # e += (c ^ d ^ b) | 576 | addl %edi, %edx # e += (c ^ d ^ b) |
| 489 | movl %ebp, %esi # | 577 | movl %ebp, %esi # |
| 490 | roll $5, %esi # rotl32(a,5) | 578 | roll $5, %esi # rotl32(a,5) |
| 491 | addl %esi, %edx # e += rotl32(a,5) | 579 | addl %esi, %edx # e += rotl32(a,5) |
| 492 | rorl $2, %eax # b = rotl32(b,30) | 580 | rorl $2, %eax # b = rotl32(b,30) |
| 581 | movaps rconst0x8F1BBCDC(%rip), %xmm6 | ||
| 582 | # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) | ||
| 583 | movaps %xmm1, %xmm4 | ||
| 584 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
| 585 | pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
| 586 | punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
| 587 | xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
| 588 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
| 589 | xorps %xmm5, %xmm2 # ^ | ||
| 590 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
| 591 | movaps %xmm2, %xmm5 | ||
| 592 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
| 593 | pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
| 594 | paddd %xmm2, %xmm2 # shift left by 1 | ||
| 595 | psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 | ||
| 596 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
| 597 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
| 598 | movaps %xmm5, %xmm4 | ||
| 599 | pslld $2, %xmm5 | ||
| 600 | psrld $30, %xmm4 | ||
| 601 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
| 602 | xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 | ||
| 603 | xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
| 604 | movaps %xmm2, %xmm5 | ||
| 605 | paddd %xmm6, %xmm5 | ||
| 606 | movups %xmm5, -64+16*2(%rsp) | ||
| 493 | # 32 | 607 | # 32 |
| 494 | movl %r13d, %esi # W[(n+13) & 15] | ||
| 495 | xorl %r8d, %esi # ^W[(n+8) & 15] | ||
| 496 | xorl -32+4*2(%rsp), %esi # ^W[(n+2) & 15] | ||
| 497 | xorl -32+4*0(%rsp), %esi # ^W[n & 15] | ||
| 498 | roll %esi # | ||
| 499 | movl %esi, -32+4*0(%rsp) # store to W[n & 15] | ||
| 500 | movl %eax, %edi # c | 608 | movl %eax, %edi # c |
| 501 | xorl %ebx, %edi # ^d | 609 | xorl %ebx, %edi # ^d |
| 502 | xorl %ebp, %edi # ^b | 610 | xorl %ebp, %edi # ^b |
| 503 | leal 0x6ED9EBA1(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] | 611 | addl -64+4*0(%rsp), %ecx # e += RCONST + W[n & 15] |
| 504 | addl %edi, %ecx # e += (c ^ d ^ b) | 612 | addl %edi, %ecx # e += (c ^ d ^ b) |
| 505 | movl %edx, %esi # | 613 | movl %edx, %esi # |
| 506 | roll $5, %esi # rotl32(a,5) | 614 | roll $5, %esi # rotl32(a,5) |
| 507 | addl %esi, %ecx # e += rotl32(a,5) | 615 | addl %esi, %ecx # e += rotl32(a,5) |
| 508 | rorl $2, %ebp # b = rotl32(b,30) | 616 | rorl $2, %ebp # b = rotl32(b,30) |
| 509 | # 33 | 617 | # 33 |
| 510 | movl %r14d, %esi # W[(n+13) & 15] | ||
| 511 | xorl %r9d, %esi # ^W[(n+8) & 15] | ||
| 512 | xorl -32+4*3(%rsp), %esi # ^W[(n+2) & 15] | ||
| 513 | xorl -32+4*1(%rsp), %esi # ^W[n & 15] | ||
| 514 | roll %esi # | ||
| 515 | movl %esi, -32+4*1(%rsp) # store to W[n & 15] | ||
| 516 | movl %ebp, %edi # c | 618 | movl %ebp, %edi # c |
| 517 | xorl %eax, %edi # ^d | 619 | xorl %eax, %edi # ^d |
| 518 | xorl %edx, %edi # ^b | 620 | xorl %edx, %edi # ^b |
| 519 | leal 0x6ED9EBA1(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] | 621 | addl -64+4*1(%rsp), %ebx # e += RCONST + W[n & 15] |
| 520 | addl %edi, %ebx # e += (c ^ d ^ b) | 622 | addl %edi, %ebx # e += (c ^ d ^ b) |
| 521 | movl %ecx, %esi # | 623 | movl %ecx, %esi # |
| 522 | roll $5, %esi # rotl32(a,5) | 624 | roll $5, %esi # rotl32(a,5) |
| 523 | addl %esi, %ebx # e += rotl32(a,5) | 625 | addl %esi, %ebx # e += rotl32(a,5) |
| 524 | rorl $2, %edx # b = rotl32(b,30) | 626 | rorl $2, %edx # b = rotl32(b,30) |
| 525 | # 34 | 627 | # 34 |
| 526 | movl %r15d, %esi # W[(n+13) & 15] | ||
| 527 | xorl %r10d, %esi # ^W[(n+8) & 15] | ||
| 528 | xorl -32+4*4(%rsp), %esi # ^W[(n+2) & 15] | ||
| 529 | xorl -32+4*2(%rsp), %esi # ^W[n & 15] | ||
| 530 | roll %esi # | ||
| 531 | movl %esi, -32+4*2(%rsp) # store to W[n & 15] | ||
| 532 | movl %edx, %edi # c | 628 | movl %edx, %edi # c |
| 533 | xorl %ebp, %edi # ^d | 629 | xorl %ebp, %edi # ^d |
| 534 | xorl %ecx, %edi # ^b | 630 | xorl %ecx, %edi # ^b |
| 535 | leal 0x6ED9EBA1(%rax,%rsi), %eax # e += RCONST + W[n & 15] | 631 | addl -64+4*2(%rsp), %eax # e += RCONST + W[n & 15] |
| 536 | addl %edi, %eax # e += (c ^ d ^ b) | 632 | addl %edi, %eax # e += (c ^ d ^ b) |
| 537 | movl %ebx, %esi # | 633 | movl %ebx, %esi # |
| 538 | roll $5, %esi # rotl32(a,5) | 634 | roll $5, %esi # rotl32(a,5) |
| 539 | addl %esi, %eax # e += rotl32(a,5) | 635 | addl %esi, %eax # e += rotl32(a,5) |
| 540 | rorl $2, %ecx # b = rotl32(b,30) | 636 | rorl $2, %ecx # b = rotl32(b,30) |
| 541 | # 35 | 637 | # 35 |
| 542 | movl -32+4*0(%rsp), %esi # W[(n+13) & 15] | ||
| 543 | xorl %r11d, %esi # ^W[(n+8) & 15] | ||
| 544 | xorl -32+4*5(%rsp), %esi # ^W[(n+2) & 15] | ||
| 545 | xorl -32+4*3(%rsp), %esi # ^W[n & 15] | ||
| 546 | roll %esi # | ||
| 547 | movl %esi, -32+4*3(%rsp) # store to W[n & 15] | ||
| 548 | movl %ecx, %edi # c | 638 | movl %ecx, %edi # c |
| 549 | xorl %edx, %edi # ^d | 639 | xorl %edx, %edi # ^d |
| 550 | xorl %ebx, %edi # ^b | 640 | xorl %ebx, %edi # ^b |
| 551 | leal 0x6ED9EBA1(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] | 641 | addl -64+4*3(%rsp), %ebp # e += RCONST + W[n & 15] |
| 552 | addl %edi, %ebp # e += (c ^ d ^ b) | 642 | addl %edi, %ebp # e += (c ^ d ^ b) |
| 553 | movl %eax, %esi # | 643 | movl %eax, %esi # |
| 554 | roll $5, %esi # rotl32(a,5) | 644 | roll $5, %esi # rotl32(a,5) |
| 555 | addl %esi, %ebp # e += rotl32(a,5) | 645 | addl %esi, %ebp # e += rotl32(a,5) |
| 556 | rorl $2, %ebx # b = rotl32(b,30) | 646 | rorl $2, %ebx # b = rotl32(b,30) |
| 647 | # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) | ||
| 648 | movaps %xmm2, %xmm4 | ||
| 649 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
| 650 | pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
| 651 | punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
| 652 | xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
| 653 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
| 654 | xorps %xmm5, %xmm3 # ^ | ||
| 655 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
| 656 | movaps %xmm3, %xmm5 | ||
| 657 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
| 658 | pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
| 659 | paddd %xmm3, %xmm3 # shift left by 1 | ||
| 660 | psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 | ||
| 661 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
| 662 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
| 663 | movaps %xmm5, %xmm4 | ||
| 664 | pslld $2, %xmm5 | ||
| 665 | psrld $30, %xmm4 | ||
| 666 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
| 667 | xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 | ||
| 668 | xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
| 669 | movaps %xmm3, %xmm5 | ||
| 670 | paddd %xmm6, %xmm5 | ||
| 671 | movups %xmm5, -64+16*3(%rsp) | ||
| 557 | # 36 | 672 | # 36 |
| 558 | movl -32+4*1(%rsp), %esi # W[(n+13) & 15] | ||
| 559 | xorl %r12d, %esi # ^W[(n+8) & 15] | ||
| 560 | xorl -32+4*6(%rsp), %esi # ^W[(n+2) & 15] | ||
| 561 | xorl -32+4*4(%rsp), %esi # ^W[n & 15] | ||
| 562 | roll %esi # | ||
| 563 | movl %esi, -32+4*4(%rsp) # store to W[n & 15] | ||
| 564 | movl %ebx, %edi # c | 673 | movl %ebx, %edi # c |
| 565 | xorl %ecx, %edi # ^d | 674 | xorl %ecx, %edi # ^d |
| 566 | xorl %eax, %edi # ^b | 675 | xorl %eax, %edi # ^b |
| 567 | leal 0x6ED9EBA1(%rdx,%rsi), %edx # e += RCONST + W[n & 15] | 676 | addl -64+4*4(%rsp), %edx # e += RCONST + W[n & 15] |
| 568 | addl %edi, %edx # e += (c ^ d ^ b) | 677 | addl %edi, %edx # e += (c ^ d ^ b) |
| 569 | movl %ebp, %esi # | 678 | movl %ebp, %esi # |
| 570 | roll $5, %esi # rotl32(a,5) | 679 | roll $5, %esi # rotl32(a,5) |
| 571 | addl %esi, %edx # e += rotl32(a,5) | 680 | addl %esi, %edx # e += rotl32(a,5) |
| 572 | rorl $2, %eax # b = rotl32(b,30) | 681 | rorl $2, %eax # b = rotl32(b,30) |
| 573 | # 37 | 682 | # 37 |
| 574 | movl -32+4*2(%rsp), %esi # W[(n+13) & 15] | ||
| 575 | xorl %r13d, %esi # ^W[(n+8) & 15] | ||
| 576 | xorl -32+4*7(%rsp), %esi # ^W[(n+2) & 15] | ||
| 577 | xorl -32+4*5(%rsp), %esi # ^W[n & 15] | ||
| 578 | roll %esi # | ||
| 579 | movl %esi, -32+4*5(%rsp) # store to W[n & 15] | ||
| 580 | movl %eax, %edi # c | 683 | movl %eax, %edi # c |
| 581 | xorl %ebx, %edi # ^d | 684 | xorl %ebx, %edi # ^d |
| 582 | xorl %ebp, %edi # ^b | 685 | xorl %ebp, %edi # ^b |
| 583 | leal 0x6ED9EBA1(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] | 686 | addl -64+4*5(%rsp), %ecx # e += RCONST + W[n & 15] |
| 584 | addl %edi, %ecx # e += (c ^ d ^ b) | 687 | addl %edi, %ecx # e += (c ^ d ^ b) |
| 585 | movl %edx, %esi # | 688 | movl %edx, %esi # |
| 586 | roll $5, %esi # rotl32(a,5) | 689 | roll $5, %esi # rotl32(a,5) |
| 587 | addl %esi, %ecx # e += rotl32(a,5) | 690 | addl %esi, %ecx # e += rotl32(a,5) |
| 588 | rorl $2, %ebp # b = rotl32(b,30) | 691 | rorl $2, %ebp # b = rotl32(b,30) |
| 589 | # 38 | 692 | # 38 |
| 590 | movl -32+4*3(%rsp), %esi # W[(n+13) & 15] | ||
| 591 | xorl %r14d, %esi # ^W[(n+8) & 15] | ||
| 592 | xorl %r8d, %esi # ^W[(n+2) & 15] | ||
| 593 | xorl -32+4*6(%rsp), %esi # ^W[n & 15] | ||
| 594 | roll %esi # | ||
| 595 | movl %esi, -32+4*6(%rsp) # store to W[n & 15] | ||
| 596 | movl %ebp, %edi # c | 693 | movl %ebp, %edi # c |
| 597 | xorl %eax, %edi # ^d | 694 | xorl %eax, %edi # ^d |
| 598 | xorl %edx, %edi # ^b | 695 | xorl %edx, %edi # ^b |
| 599 | leal 0x6ED9EBA1(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] | 696 | addl -64+4*6(%rsp), %ebx # e += RCONST + W[n & 15] |
| 600 | addl %edi, %ebx # e += (c ^ d ^ b) | 697 | addl %edi, %ebx # e += (c ^ d ^ b) |
| 601 | movl %ecx, %esi # | 698 | movl %ecx, %esi # |
| 602 | roll $5, %esi # rotl32(a,5) | 699 | roll $5, %esi # rotl32(a,5) |
| 603 | addl %esi, %ebx # e += rotl32(a,5) | 700 | addl %esi, %ebx # e += rotl32(a,5) |
| 604 | rorl $2, %edx # b = rotl32(b,30) | 701 | rorl $2, %edx # b = rotl32(b,30) |
| 605 | # 39 | 702 | # 39 |
| 606 | movl -32+4*4(%rsp), %esi # W[(n+13) & 15] | ||
| 607 | xorl %r15d, %esi # ^W[(n+8) & 15] | ||
| 608 | xorl %r9d, %esi # ^W[(n+2) & 15] | ||
| 609 | xorl -32+4*7(%rsp), %esi # ^W[n & 15] | ||
| 610 | roll %esi # | ||
| 611 | movl %esi, -32+4*7(%rsp) # store to W[n & 15] | ||
| 612 | movl %edx, %edi # c | 703 | movl %edx, %edi # c |
| 613 | xorl %ebp, %edi # ^d | 704 | xorl %ebp, %edi # ^d |
| 614 | xorl %ecx, %edi # ^b | 705 | xorl %ecx, %edi # ^b |
| 615 | leal 0x6ED9EBA1(%rax,%rsi), %eax # e += RCONST + W[n & 15] | 706 | addl -64+4*7(%rsp), %eax # e += RCONST + W[n & 15] |
| 616 | addl %edi, %eax # e += (c ^ d ^ b) | 707 | addl %edi, %eax # e += (c ^ d ^ b) |
| 617 | movl %ebx, %esi # | 708 | movl %ebx, %esi # |
| 618 | roll $5, %esi # rotl32(a,5) | 709 | roll $5, %esi # rotl32(a,5) |
| 619 | addl %esi, %eax # e += rotl32(a,5) | 710 | addl %esi, %eax # e += rotl32(a,5) |
| 620 | rorl $2, %ecx # b = rotl32(b,30) | 711 | rorl $2, %ecx # b = rotl32(b,30) |
| 712 | # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) | ||
| 713 | movaps %xmm3, %xmm4 | ||
| 714 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
| 715 | pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
| 716 | punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
| 717 | xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
| 718 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
| 719 | xorps %xmm5, %xmm0 # ^ | ||
| 720 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
| 721 | movaps %xmm0, %xmm5 | ||
| 722 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
| 723 | pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
| 724 | paddd %xmm0, %xmm0 # shift left by 1 | ||
| 725 | psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 | ||
| 726 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
| 727 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
| 728 | movaps %xmm5, %xmm4 | ||
| 729 | pslld $2, %xmm5 | ||
| 730 | psrld $30, %xmm4 | ||
| 731 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
| 732 | xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 | ||
| 733 | xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
| 734 | movaps %xmm0, %xmm5 | ||
| 735 | paddd %xmm6, %xmm5 | ||
| 736 | movups %xmm5, -64+16*0(%rsp) | ||
| 621 | # 40 | 737 | # 40 |
| 622 | movl %ebx, %edi # di: b | 738 | movl %ebx, %edi # di: b |
| 623 | movl %ebx, %esi # si: b | 739 | movl %ebx, %esi # si: b |
@@ -625,12 +741,8 @@ sha1_process_block64:
| 625 | andl %ecx, %esi # si: b & c | 741 | andl %ecx, %esi # si: b & c |
| 626 | andl %edx, %edi # di: (b | c) & d | 742 | andl %edx, %edi # di: (b | c) & d |
| 627 | orl %esi, %edi # ((b | c) & d) | (b & c) | 743 | orl %esi, %edi # ((b | c) & d) | (b & c) |
| 628 | xorl -32+4*5(%rsp), %r8d # W[n & 15] ^= W[(n+13) & 15] | ||
| 629 | xorl -32+4*0(%rsp), %r8d # ^W[(n+8) & 15] | ||
| 630 | xorl %r10d, %r8d # ^W[(n+2) & 15] | ||
| 631 | roll %r8d # | ||
| 632 | addl %edi, %ebp # += ((b | c) & d) | (b & c) | 744 | addl %edi, %ebp # += ((b | c) & d) | (b & c) |
| 633 | leal -0x70E44324(%rbp,%r8), %ebp # e += RCONST + W[n & 15] | 745 | addl -64+4*8(%rsp), %ebp # e += RCONST + W[n & 15] |
| 634 | movl %eax, %esi # | 746 | movl %eax, %esi # |
| 635 | roll $5, %esi # rotl32(a,5) | 747 | roll $5, %esi # rotl32(a,5) |
| 636 | addl %esi, %ebp # e += rotl32(a,5) | 748 | addl %esi, %ebp # e += rotl32(a,5) |
@@ -642,12 +754,8 @@ sha1_process_block64:
| 642 | andl %ebx, %esi # si: b & c | 754 | andl %ebx, %esi # si: b & c |
| 643 | andl %ecx, %edi # di: (b | c) & d | 755 | andl %ecx, %edi # di: (b | c) & d |
| 644 | orl %esi, %edi # ((b | c) & d) | (b & c) | 756 | orl %esi, %edi # ((b | c) & d) | (b & c) |
| 645 | xorl -32+4*6(%rsp), %r9d # W[n & 15] ^= W[(n+13) & 15] | ||
| 646 | xorl -32+4*1(%rsp), %r9d # ^W[(n+8) & 15] | ||
| 647 | xorl %r11d, %r9d # ^W[(n+2) & 15] | ||
| 648 | roll %r9d # | ||
| 649 | addl %edi, %edx # += ((b | c) & d) | (b & c) | 757 | addl %edi, %edx # += ((b | c) & d) | (b & c) |
| 650 | leal -0x70E44324(%rdx,%r9), %edx # e += RCONST + W[n & 15] | 758 | addl -64+4*9(%rsp), %edx # e += RCONST + W[n & 15] |
| 651 | movl %ebp, %esi # | 759 | movl %ebp, %esi # |
| 652 | roll $5, %esi # rotl32(a,5) | 760 | roll $5, %esi # rotl32(a,5) |
| 653 | addl %esi, %edx # e += rotl32(a,5) | 761 | addl %esi, %edx # e += rotl32(a,5) |
| @@ -659,12 +767,8 @@ sha1_process_block64: | |||
| 659 | andl %eax, %esi # si: b & c | 767 | andl %eax, %esi # si: b & c |
| 660 | andl %ebx, %edi # di: (b | c) & d | 768 | andl %ebx, %edi # di: (b | c) & d |
| 661 | orl %esi, %edi # ((b | c) & d) | (b & c) | 769 | orl %esi, %edi # ((b | c) & d) | (b & c) |
| 662 | xorl -32+4*7(%rsp), %r10d # W[n & 15] ^= W[(n+13) & 15] | ||
| 663 | xorl -32+4*2(%rsp), %r10d # ^W[(n+8) & 15] | ||
| 664 | xorl %r12d, %r10d # ^W[(n+2) & 15] | ||
| 665 | roll %r10d # | ||
| 666 | addl %edi, %ecx # += ((b | c) & d) | (b & c) | 770 | addl %edi, %ecx # += ((b | c) & d) | (b & c) |
| 667 | leal -0x70E44324(%rcx,%r10), %ecx # e += RCONST + W[n & 15] | 771 | addl -64+4*10(%rsp), %ecx # e += RCONST + W[n & 15] |
| 668 | movl %edx, %esi # | 772 | movl %edx, %esi # |
| 669 | roll $5, %esi # rotl32(a,5) | 773 | roll $5, %esi # rotl32(a,5) |
| 670 | addl %esi, %ecx # e += rotl32(a,5) | 774 | addl %esi, %ecx # e += rotl32(a,5) |
| @@ -676,16 +780,37 @@ sha1_process_block64: | |||
| 676 | andl %ebp, %esi # si: b & c | 780 | andl %ebp, %esi # si: b & c |
| 677 | andl %eax, %edi # di: (b | c) & d | 781 | andl %eax, %edi # di: (b | c) & d |
| 678 | orl %esi, %edi # ((b | c) & d) | (b & c) | 782 | orl %esi, %edi # ((b | c) & d) | (b & c) |
| 679 | xorl %r8d, %r11d # W[n & 15] ^= W[(n+13) & 15] | ||
| 680 | xorl -32+4*3(%rsp), %r11d # ^W[(n+8) & 15] | ||
| 681 | xorl %r13d, %r11d # ^W[(n+2) & 15] | ||
| 682 | roll %r11d # | ||
| 683 | addl %edi, %ebx # += ((b | c) & d) | (b & c) | 783 | addl %edi, %ebx # += ((b | c) & d) | (b & c) |
| 684 | leal -0x70E44324(%rbx,%r11), %ebx # e += RCONST + W[n & 15] | 784 | addl -64+4*11(%rsp), %ebx # e += RCONST + W[n & 15] |
| 685 | movl %ecx, %esi # | 785 | movl %ecx, %esi # |
| 686 | roll $5, %esi # rotl32(a,5) | 786 | roll $5, %esi # rotl32(a,5) |
| 687 | addl %esi, %ebx # e += rotl32(a,5) | 787 | addl %esi, %ebx # e += rotl32(a,5) |
| 688 | rorl $2, %edx # b = rotl32(b,30) | 788 | rorl $2, %edx # b = rotl32(b,30) |
| 789 | # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) | ||
| 790 | movaps %xmm0, %xmm4 | ||
| 791 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
| 792 | pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
| 793 | punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
| 794 | xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
| 795 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
| 796 | xorps %xmm5, %xmm1 # ^ | ||
| 797 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
| 798 | movaps %xmm1, %xmm5 | ||
| 799 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
| 800 | pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
| 801 | paddd %xmm1, %xmm1 # shift left by 1 | ||
| 802 | psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 | ||
| 803 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
| 804 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
| 805 | movaps %xmm5, %xmm4 | ||
| 806 | pslld $2, %xmm5 | ||
| 807 | psrld $30, %xmm4 | ||
| 808 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
| 809 | xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 | ||
| 810 | xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
| 811 | movaps %xmm1, %xmm5 | ||
| 812 | paddd %xmm6, %xmm5 | ||
| 813 | movups %xmm5, -64+16*1(%rsp) | ||
| 689 | # 44 | 814 | # 44 |
| 690 | movl %ecx, %edi # di: b | 815 | movl %ecx, %edi # di: b |
| 691 | movl %ecx, %esi # si: b | 816 | movl %ecx, %esi # si: b |
| @@ -693,12 +818,8 @@ sha1_process_block64: | |||
| 693 | andl %edx, %esi # si: b & c | 818 | andl %edx, %esi # si: b & c |
| 694 | andl %ebp, %edi # di: (b | c) & d | 819 | andl %ebp, %edi # di: (b | c) & d |
| 695 | orl %esi, %edi # ((b | c) & d) | (b & c) | 820 | orl %esi, %edi # ((b | c) & d) | (b & c) |
| 696 | xorl %r9d, %r12d # W[n & 15] ^= W[(n+13) & 15] | ||
| 697 | xorl -32+4*4(%rsp), %r12d # ^W[(n+8) & 15] | ||
| 698 | xorl %r14d, %r12d # ^W[(n+2) & 15] | ||
| 699 | roll %r12d # | ||
| 700 | addl %edi, %eax # += ((b | c) & d) | (b & c) | 821 | addl %edi, %eax # += ((b | c) & d) | (b & c) |
| 701 | leal -0x70E44324(%rax,%r12), %eax # e += RCONST + W[n & 15] | 822 | addl -64+4*12(%rsp), %eax # e += RCONST + W[n & 15] |
| 702 | movl %ebx, %esi # | 823 | movl %ebx, %esi # |
| 703 | roll $5, %esi # rotl32(a,5) | 824 | roll $5, %esi # rotl32(a,5) |
| 704 | addl %esi, %eax # e += rotl32(a,5) | 825 | addl %esi, %eax # e += rotl32(a,5) |
| @@ -710,12 +831,8 @@ sha1_process_block64: | |||
| 710 | andl %ecx, %esi # si: b & c | 831 | andl %ecx, %esi # si: b & c |
| 711 | andl %edx, %edi # di: (b | c) & d | 832 | andl %edx, %edi # di: (b | c) & d |
| 712 | orl %esi, %edi # ((b | c) & d) | (b & c) | 833 | orl %esi, %edi # ((b | c) & d) | (b & c) |
| 713 | xorl %r10d, %r13d # W[n & 15] ^= W[(n+13) & 15] | ||
| 714 | xorl -32+4*5(%rsp), %r13d # ^W[(n+8) & 15] | ||
| 715 | xorl %r15d, %r13d # ^W[(n+2) & 15] | ||
| 716 | roll %r13d # | ||
| 717 | addl %edi, %ebp # += ((b | c) & d) | (b & c) | 834 | addl %edi, %ebp # += ((b | c) & d) | (b & c) |
| 718 | leal -0x70E44324(%rbp,%r13), %ebp # e += RCONST + W[n & 15] | 835 | addl -64+4*13(%rsp), %ebp # e += RCONST + W[n & 15] |
| 719 | movl %eax, %esi # | 836 | movl %eax, %esi # |
| 720 | roll $5, %esi # rotl32(a,5) | 837 | roll $5, %esi # rotl32(a,5) |
| 721 | addl %esi, %ebp # e += rotl32(a,5) | 838 | addl %esi, %ebp # e += rotl32(a,5) |
| @@ -727,12 +844,8 @@ sha1_process_block64: | |||
| 727 | andl %ebx, %esi # si: b & c | 844 | andl %ebx, %esi # si: b & c |
| 728 | andl %ecx, %edi # di: (b | c) & d | 845 | andl %ecx, %edi # di: (b | c) & d |
| 729 | orl %esi, %edi # ((b | c) & d) | (b & c) | 846 | orl %esi, %edi # ((b | c) & d) | (b & c) |
| 730 | xorl %r11d, %r14d # W[n & 15] ^= W[(n+13) & 15] | ||
| 731 | xorl -32+4*6(%rsp), %r14d # ^W[(n+8) & 15] | ||
| 732 | xorl -32+4*0(%rsp), %r14d # ^W[(n+2) & 15] | ||
| 733 | roll %r14d # | ||
| 734 | addl %edi, %edx # += ((b | c) & d) | (b & c) | 847 | addl %edi, %edx # += ((b | c) & d) | (b & c) |
| 735 | leal -0x70E44324(%rdx,%r14), %edx # e += RCONST + W[n & 15] | 848 | addl -64+4*14(%rsp), %edx # e += RCONST + W[n & 15] |
| 736 | movl %ebp, %esi # | 849 | movl %ebp, %esi # |
| 737 | roll $5, %esi # rotl32(a,5) | 850 | roll $5, %esi # rotl32(a,5) |
| 738 | addl %esi, %edx # e += rotl32(a,5) | 851 | addl %esi, %edx # e += rotl32(a,5) |
| @@ -744,16 +857,37 @@ sha1_process_block64: | |||
| 744 | andl %eax, %esi # si: b & c | 857 | andl %eax, %esi # si: b & c |
| 745 | andl %ebx, %edi # di: (b | c) & d | 858 | andl %ebx, %edi # di: (b | c) & d |
| 746 | orl %esi, %edi # ((b | c) & d) | (b & c) | 859 | orl %esi, %edi # ((b | c) & d) | (b & c) |
| 747 | xorl %r12d, %r15d # W[n & 15] ^= W[(n+13) & 15] | ||
| 748 | xorl -32+4*7(%rsp), %r15d # ^W[(n+8) & 15] | ||
| 749 | xorl -32+4*1(%rsp), %r15d # ^W[(n+2) & 15] | ||
| 750 | roll %r15d # | ||
| 751 | addl %edi, %ecx # += ((b | c) & d) | (b & c) | 860 | addl %edi, %ecx # += ((b | c) & d) | (b & c) |
| 752 | leal -0x70E44324(%rcx,%r15), %ecx # e += RCONST + W[n & 15] | 861 | addl -64+4*15(%rsp), %ecx # e += RCONST + W[n & 15] |
| 753 | movl %edx, %esi # | 862 | movl %edx, %esi # |
| 754 | roll $5, %esi # rotl32(a,5) | 863 | roll $5, %esi # rotl32(a,5) |
| 755 | addl %esi, %ecx # e += rotl32(a,5) | 864 | addl %esi, %ecx # e += rotl32(a,5) |
| 756 | rorl $2, %ebp # b = rotl32(b,30) | 865 | rorl $2, %ebp # b = rotl32(b,30) |
| 866 | # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) | ||
| 867 | movaps %xmm1, %xmm4 | ||
| 868 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
| 869 | pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
| 870 | punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
| 871 | xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
| 872 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
| 873 | xorps %xmm5, %xmm2 # ^ | ||
| 874 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
| 875 | movaps %xmm2, %xmm5 | ||
| 876 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
| 877 | pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
| 878 | paddd %xmm2, %xmm2 # shift left by 1 | ||
| 879 | psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 | ||
| 880 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
| 881 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
| 882 | movaps %xmm5, %xmm4 | ||
| 883 | pslld $2, %xmm5 | ||
| 884 | psrld $30, %xmm4 | ||
| 885 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
| 886 | xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 | ||
| 887 | xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
| 888 | movaps %xmm2, %xmm5 | ||
| 889 | paddd %xmm6, %xmm5 | ||
| 890 | movups %xmm5, -64+16*2(%rsp) | ||
| 757 | # 48 | 891 | # 48 |
| 758 | movl %edx, %edi # di: b | 892 | movl %edx, %edi # di: b |
| 759 | movl %edx, %esi # si: b | 893 | movl %edx, %esi # si: b |
| @@ -761,14 +895,8 @@ sha1_process_block64: | |||
| 761 | andl %ebp, %esi # si: b & c | 895 | andl %ebp, %esi # si: b & c |
| 762 | andl %eax, %edi # di: (b | c) & d | 896 | andl %eax, %edi # di: (b | c) & d |
| 763 | orl %esi, %edi # ((b | c) & d) | (b & c) | 897 | orl %esi, %edi # ((b | c) & d) | (b & c) |
| 764 | movl %r13d, %esi # W[(n+13) & 15] | ||
| 765 | xorl %r8d, %esi # ^W[(n+8) & 15] | ||
| 766 | xorl -32+4*2(%rsp), %esi # ^W[(n+2) & 15] | ||
| 767 | xorl -32+4*0(%rsp), %esi # ^W[n & 15] | ||
| 768 | roll %esi # | ||
| 769 | movl %esi, -32+4*0(%rsp) # store to W[n & 15] | ||
| 770 | addl %edi, %ebx # += ((b | c) & d) | (b & c) | 898 | addl %edi, %ebx # += ((b | c) & d) | (b & c) |
| 771 | leal -0x70E44324(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] | 899 | addl -64+4*0(%rsp), %ebx # e += RCONST + W[n & 15] |
| 772 | movl %ecx, %esi # | 900 | movl %ecx, %esi # |
| 773 | roll $5, %esi # rotl32(a,5) | 901 | roll $5, %esi # rotl32(a,5) |
| 774 | addl %esi, %ebx # e += rotl32(a,5) | 902 | addl %esi, %ebx # e += rotl32(a,5) |
| @@ -780,14 +908,8 @@ sha1_process_block64: | |||
| 780 | andl %edx, %esi # si: b & c | 908 | andl %edx, %esi # si: b & c |
| 781 | andl %ebp, %edi # di: (b | c) & d | 909 | andl %ebp, %edi # di: (b | c) & d |
| 782 | orl %esi, %edi # ((b | c) & d) | (b & c) | 910 | orl %esi, %edi # ((b | c) & d) | (b & c) |
| 783 | movl %r14d, %esi # W[(n+13) & 15] | ||
| 784 | xorl %r9d, %esi # ^W[(n+8) & 15] | ||
| 785 | xorl -32+4*3(%rsp), %esi # ^W[(n+2) & 15] | ||
| 786 | xorl -32+4*1(%rsp), %esi # ^W[n & 15] | ||
| 787 | roll %esi # | ||
| 788 | movl %esi, -32+4*1(%rsp) # store to W[n & 15] | ||
| 789 | addl %edi, %eax # += ((b | c) & d) | (b & c) | 911 | addl %edi, %eax # += ((b | c) & d) | (b & c) |
| 790 | leal -0x70E44324(%rax,%rsi), %eax # e += RCONST + W[n & 15] | 912 | addl -64+4*1(%rsp), %eax # e += RCONST + W[n & 15] |
| 791 | movl %ebx, %esi # | 913 | movl %ebx, %esi # |
| 792 | roll $5, %esi # rotl32(a,5) | 914 | roll $5, %esi # rotl32(a,5) |
| 793 | addl %esi, %eax # e += rotl32(a,5) | 915 | addl %esi, %eax # e += rotl32(a,5) |
| @@ -799,14 +921,8 @@ sha1_process_block64: | |||
| 799 | andl %ecx, %esi # si: b & c | 921 | andl %ecx, %esi # si: b & c |
| 800 | andl %edx, %edi # di: (b | c) & d | 922 | andl %edx, %edi # di: (b | c) & d |
| 801 | orl %esi, %edi # ((b | c) & d) | (b & c) | 923 | orl %esi, %edi # ((b | c) & d) | (b & c) |
| 802 | movl %r15d, %esi # W[(n+13) & 15] | ||
| 803 | xorl %r10d, %esi # ^W[(n+8) & 15] | ||
| 804 | xorl -32+4*4(%rsp), %esi # ^W[(n+2) & 15] | ||
| 805 | xorl -32+4*2(%rsp), %esi # ^W[n & 15] | ||
| 806 | roll %esi # | ||
| 807 | movl %esi, -32+4*2(%rsp) # store to W[n & 15] | ||
| 808 | addl %edi, %ebp # += ((b | c) & d) | (b & c) | 924 | addl %edi, %ebp # += ((b | c) & d) | (b & c) |
| 809 | leal -0x70E44324(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] | 925 | addl -64+4*2(%rsp), %ebp # e += RCONST + W[n & 15] |
| 810 | movl %eax, %esi # | 926 | movl %eax, %esi # |
| 811 | roll $5, %esi # rotl32(a,5) | 927 | roll $5, %esi # rotl32(a,5) |
| 812 | addl %esi, %ebp # e += rotl32(a,5) | 928 | addl %esi, %ebp # e += rotl32(a,5) |
| @@ -818,18 +934,38 @@ sha1_process_block64: | |||
| 818 | andl %ebx, %esi # si: b & c | 934 | andl %ebx, %esi # si: b & c |
| 819 | andl %ecx, %edi # di: (b | c) & d | 935 | andl %ecx, %edi # di: (b | c) & d |
| 820 | orl %esi, %edi # ((b | c) & d) | (b & c) | 936 | orl %esi, %edi # ((b | c) & d) | (b & c) |
| 821 | movl -32+4*0(%rsp), %esi # W[(n+13) & 15] | ||
| 822 | xorl %r11d, %esi # ^W[(n+8) & 15] | ||
| 823 | xorl -32+4*5(%rsp), %esi # ^W[(n+2) & 15] | ||
| 824 | xorl -32+4*3(%rsp), %esi # ^W[n & 15] | ||
| 825 | roll %esi # | ||
| 826 | movl %esi, -32+4*3(%rsp) # store to W[n & 15] | ||
| 827 | addl %edi, %edx # += ((b | c) & d) | (b & c) | 937 | addl %edi, %edx # += ((b | c) & d) | (b & c) |
| 828 | leal -0x70E44324(%rdx,%rsi), %edx # e += RCONST + W[n & 15] | 938 | addl -64+4*3(%rsp), %edx # e += RCONST + W[n & 15] |
| 829 | movl %ebp, %esi # | 939 | movl %ebp, %esi # |
| 830 | roll $5, %esi # rotl32(a,5) | 940 | roll $5, %esi # rotl32(a,5) |
| 831 | addl %esi, %edx # e += rotl32(a,5) | 941 | addl %esi, %edx # e += rotl32(a,5) |
| 832 | rorl $2, %eax # b = rotl32(b,30) | 942 | rorl $2, %eax # b = rotl32(b,30) |
| 943 | movaps rconst0xCA62C1D6(%rip), %xmm6 | ||
| 944 | # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) | ||
| 945 | movaps %xmm2, %xmm4 | ||
| 946 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
| 947 | pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
| 948 | punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
| 949 | xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
| 950 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
| 951 | xorps %xmm5, %xmm3 # ^ | ||
| 952 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
| 953 | movaps %xmm3, %xmm5 | ||
| 954 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
| 955 | pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
| 956 | paddd %xmm3, %xmm3 # shift left by 1 | ||
| 957 | psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 | ||
| 958 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
| 959 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
| 960 | movaps %xmm5, %xmm4 | ||
| 961 | pslld $2, %xmm5 | ||
| 962 | psrld $30, %xmm4 | ||
| 963 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
| 964 | xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 | ||
| 965 | xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
| 966 | movaps %xmm3, %xmm5 | ||
| 967 | paddd %xmm6, %xmm5 | ||
| 968 | movups %xmm5, -64+16*3(%rsp) | ||
| 833 | # 52 | 969 | # 52 |
| 834 | movl %ebp, %edi # di: b | 970 | movl %ebp, %edi # di: b |
| 835 | movl %ebp, %esi # si: b | 971 | movl %ebp, %esi # si: b |
| @@ -837,14 +973,8 @@ sha1_process_block64: | |||
| 837 | andl %eax, %esi # si: b & c | 973 | andl %eax, %esi # si: b & c |
| 838 | andl %ebx, %edi # di: (b | c) & d | 974 | andl %ebx, %edi # di: (b | c) & d |
| 839 | orl %esi, %edi # ((b | c) & d) | (b & c) | 975 | orl %esi, %edi # ((b | c) & d) | (b & c) |
| 840 | movl -32+4*1(%rsp), %esi # W[(n+13) & 15] | ||
| 841 | xorl %r12d, %esi # ^W[(n+8) & 15] | ||
| 842 | xorl -32+4*6(%rsp), %esi # ^W[(n+2) & 15] | ||
| 843 | xorl -32+4*4(%rsp), %esi # ^W[n & 15] | ||
| 844 | roll %esi # | ||
| 845 | movl %esi, -32+4*4(%rsp) # store to W[n & 15] | ||
| 846 | addl %edi, %ecx # += ((b | c) & d) | (b & c) | 976 | addl %edi, %ecx # += ((b | c) & d) | (b & c) |
| 847 | leal -0x70E44324(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] | 977 | addl -64+4*4(%rsp), %ecx # e += RCONST + W[n & 15] |
| 848 | movl %edx, %esi # | 978 | movl %edx, %esi # |
| 849 | roll $5, %esi # rotl32(a,5) | 979 | roll $5, %esi # rotl32(a,5) |
| 850 | addl %esi, %ecx # e += rotl32(a,5) | 980 | addl %esi, %ecx # e += rotl32(a,5) |
| @@ -856,14 +986,8 @@ sha1_process_block64: | |||
| 856 | andl %ebp, %esi # si: b & c | 986 | andl %ebp, %esi # si: b & c |
| 857 | andl %eax, %edi # di: (b | c) & d | 987 | andl %eax, %edi # di: (b | c) & d |
| 858 | orl %esi, %edi # ((b | c) & d) | (b & c) | 988 | orl %esi, %edi # ((b | c) & d) | (b & c) |
| 859 | movl -32+4*2(%rsp), %esi # W[(n+13) & 15] | ||
| 860 | xorl %r13d, %esi # ^W[(n+8) & 15] | ||
| 861 | xorl -32+4*7(%rsp), %esi # ^W[(n+2) & 15] | ||
| 862 | xorl -32+4*5(%rsp), %esi # ^W[n & 15] | ||
| 863 | roll %esi # | ||
| 864 | movl %esi, -32+4*5(%rsp) # store to W[n & 15] | ||
| 865 | addl %edi, %ebx # += ((b | c) & d) | (b & c) | 989 | addl %edi, %ebx # += ((b | c) & d) | (b & c) |
| 866 | leal -0x70E44324(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] | 990 | addl -64+4*5(%rsp), %ebx # e += RCONST + W[n & 15] |
| 867 | movl %ecx, %esi # | 991 | movl %ecx, %esi # |
| 868 | roll $5, %esi # rotl32(a,5) | 992 | roll $5, %esi # rotl32(a,5) |
| 869 | addl %esi, %ebx # e += rotl32(a,5) | 993 | addl %esi, %ebx # e += rotl32(a,5) |
| @@ -875,14 +999,8 @@ sha1_process_block64: | |||
| 875 | andl %edx, %esi # si: b & c | 999 | andl %edx, %esi # si: b & c |
| 876 | andl %ebp, %edi # di: (b | c) & d | 1000 | andl %ebp, %edi # di: (b | c) & d |
| 877 | orl %esi, %edi # ((b | c) & d) | (b & c) | 1001 | orl %esi, %edi # ((b | c) & d) | (b & c) |
| 878 | movl -32+4*3(%rsp), %esi # W[(n+13) & 15] | ||
| 879 | xorl %r14d, %esi # ^W[(n+8) & 15] | ||
| 880 | xorl %r8d, %esi # ^W[(n+2) & 15] | ||
| 881 | xorl -32+4*6(%rsp), %esi # ^W[n & 15] | ||
| 882 | roll %esi # | ||
| 883 | movl %esi, -32+4*6(%rsp) # store to W[n & 15] | ||
| 884 | addl %edi, %eax # += ((b | c) & d) | (b & c) | 1002 | addl %edi, %eax # += ((b | c) & d) | (b & c) |
| 885 | leal -0x70E44324(%rax,%rsi), %eax # e += RCONST + W[n & 15] | 1003 | addl -64+4*6(%rsp), %eax # e += RCONST + W[n & 15] |
| 886 | movl %ebx, %esi # | 1004 | movl %ebx, %esi # |
| 887 | roll $5, %esi # rotl32(a,5) | 1005 | roll $5, %esi # rotl32(a,5) |
| 888 | addl %esi, %eax # e += rotl32(a,5) | 1006 | addl %esi, %eax # e += rotl32(a,5) |
| @@ -894,18 +1012,37 @@ sha1_process_block64: | |||
| 894 | andl %ecx, %esi # si: b & c | 1012 | andl %ecx, %esi # si: b & c |
| 895 | andl %edx, %edi # di: (b | c) & d | 1013 | andl %edx, %edi # di: (b | c) & d |
| 896 | orl %esi, %edi # ((b | c) & d) | (b & c) | 1014 | orl %esi, %edi # ((b | c) & d) | (b & c) |
| 897 | movl -32+4*4(%rsp), %esi # W[(n+13) & 15] | ||
| 898 | xorl %r15d, %esi # ^W[(n+8) & 15] | ||
| 899 | xorl %r9d, %esi # ^W[(n+2) & 15] | ||
| 900 | xorl -32+4*7(%rsp), %esi # ^W[n & 15] | ||
| 901 | roll %esi # | ||
| 902 | movl %esi, -32+4*7(%rsp) # store to W[n & 15] | ||
| 903 | addl %edi, %ebp # += ((b | c) & d) | (b & c) | 1015 | addl %edi, %ebp # += ((b | c) & d) | (b & c) |
| 904 | leal -0x70E44324(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] | 1016 | addl -64+4*7(%rsp), %ebp # e += RCONST + W[n & 15] |
| 905 | movl %eax, %esi # | 1017 | movl %eax, %esi # |
| 906 | roll $5, %esi # rotl32(a,5) | 1018 | roll $5, %esi # rotl32(a,5) |
| 907 | addl %esi, %ebp # e += rotl32(a,5) | 1019 | addl %esi, %ebp # e += rotl32(a,5) |
| 908 | rorl $2, %ebx # b = rotl32(b,30) | 1020 | rorl $2, %ebx # b = rotl32(b,30) |
| 1021 | # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) | ||
| 1022 | movaps %xmm3, %xmm4 | ||
| 1023 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
| 1024 | pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
| 1025 | punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
| 1026 | xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
| 1027 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
| 1028 | xorps %xmm5, %xmm0 # ^ | ||
| 1029 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
| 1030 | movaps %xmm0, %xmm5 | ||
| 1031 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
| 1032 | pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
| 1033 | paddd %xmm0, %xmm0 # shift left by 1 | ||
| 1034 | psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 | ||
| 1035 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
| 1036 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
| 1037 | movaps %xmm5, %xmm4 | ||
| 1038 | pslld $2, %xmm5 | ||
| 1039 | psrld $30, %xmm4 | ||
| 1040 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
| 1041 | xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 | ||
| 1042 | xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
| 1043 | movaps %xmm0, %xmm5 | ||
| 1044 | paddd %xmm6, %xmm5 | ||
| 1045 | movups %xmm5, -64+16*0(%rsp) | ||
| 909 | # 56 | 1046 | # 56 |
| 910 | movl %eax, %edi # di: b | 1047 | movl %eax, %edi # di: b |
| 911 | movl %eax, %esi # si: b | 1048 | movl %eax, %esi # si: b |
| @@ -913,12 +1050,8 @@ sha1_process_block64: | |||
| 913 | andl %ebx, %esi # si: b & c | 1050 | andl %ebx, %esi # si: b & c |
| 914 | andl %ecx, %edi # di: (b | c) & d | 1051 | andl %ecx, %edi # di: (b | c) & d |
| 915 | orl %esi, %edi # ((b | c) & d) | (b & c) | 1052 | orl %esi, %edi # ((b | c) & d) | (b & c) |
| 916 | xorl -32+4*5(%rsp), %r8d # W[n & 15] ^= W[(n+13) & 15] | ||
| 917 | xorl -32+4*0(%rsp), %r8d # ^W[(n+8) & 15] | ||
| 918 | xorl %r10d, %r8d # ^W[(n+2) & 15] | ||
| 919 | roll %r8d # | ||
| 920 | addl %edi, %edx # += ((b | c) & d) | (b & c) | 1053 | addl %edi, %edx # += ((b | c) & d) | (b & c) |
| 921 | leal -0x70E44324(%rdx,%r8), %edx # e += RCONST + W[n & 15] | 1054 | addl -64+4*8(%rsp), %edx # e += RCONST + W[n & 15] |
| 922 | movl %ebp, %esi # | 1055 | movl %ebp, %esi # |
| 923 | roll $5, %esi # rotl32(a,5) | 1056 | roll $5, %esi # rotl32(a,5) |
| 924 | addl %esi, %edx # e += rotl32(a,5) | 1057 | addl %esi, %edx # e += rotl32(a,5) |
| @@ -930,12 +1063,8 @@ sha1_process_block64: | |||
| 930 | andl %eax, %esi # si: b & c | 1063 | andl %eax, %esi # si: b & c |
| 931 | andl %ebx, %edi # di: (b | c) & d | 1064 | andl %ebx, %edi # di: (b | c) & d |
| 932 | orl %esi, %edi # ((b | c) & d) | (b & c) | 1065 | orl %esi, %edi # ((b | c) & d) | (b & c) |
| 933 | xorl -32+4*6(%rsp), %r9d # W[n & 15] ^= W[(n+13) & 15] | ||
| 934 | xorl -32+4*1(%rsp), %r9d # ^W[(n+8) & 15] | ||
| 935 | xorl %r11d, %r9d # ^W[(n+2) & 15] | ||
| 936 | roll %r9d # | ||
| 937 | addl %edi, %ecx # += ((b | c) & d) | (b & c) | 1066 | addl %edi, %ecx # += ((b | c) & d) | (b & c) |
| 938 | leal -0x70E44324(%rcx,%r9), %ecx # e += RCONST + W[n & 15] | 1067 | addl -64+4*9(%rsp), %ecx # e += RCONST + W[n & 15] |
| 939 | movl %edx, %esi # | 1068 | movl %edx, %esi # |
| 940 | roll $5, %esi # rotl32(a,5) | 1069 | roll $5, %esi # rotl32(a,5) |
| 941 | addl %esi, %ecx # e += rotl32(a,5) | 1070 | addl %esi, %ecx # e += rotl32(a,5) |
| @@ -947,12 +1076,8 @@ sha1_process_block64: | |||
| 947 | andl %ebp, %esi # si: b & c | 1076 | andl %ebp, %esi # si: b & c |
| 948 | andl %eax, %edi # di: (b | c) & d | 1077 | andl %eax, %edi # di: (b | c) & d |
| 949 | orl %esi, %edi # ((b | c) & d) | (b & c) | 1078 | orl %esi, %edi # ((b | c) & d) | (b & c) |
| 950 | xorl -32+4*7(%rsp), %r10d # W[n & 15] ^= W[(n+13) & 15] | ||
| 951 | xorl -32+4*2(%rsp), %r10d # ^W[(n+8) & 15] | ||
| 952 | xorl %r12d, %r10d # ^W[(n+2) & 15] | ||
| 953 | roll %r10d # | ||
| 954 | addl %edi, %ebx # += ((b | c) & d) | (b & c) | 1079 | addl %edi, %ebx # += ((b | c) & d) | (b & c) |
| 955 | leal -0x70E44324(%rbx,%r10), %ebx # e += RCONST + W[n & 15] | 1080 | addl -64+4*10(%rsp), %ebx # e += RCONST + W[n & 15] |
| 956 | movl %ecx, %esi # | 1081 | movl %ecx, %esi # |
| 957 | roll $5, %esi # rotl32(a,5) | 1082 | roll $5, %esi # rotl32(a,5) |
| 958 | addl %esi, %ebx # e += rotl32(a,5) | 1083 | addl %esi, %ebx # e += rotl32(a,5) |
| @@ -964,307 +1089,282 @@ sha1_process_block64: | |||
| 964 | andl %edx, %esi # si: b & c | 1089 | andl %edx, %esi # si: b & c |
| 965 | andl %ebp, %edi # di: (b | c) & d | 1090 | andl %ebp, %edi # di: (b | c) & d |
| 966 | orl %esi, %edi # ((b | c) & d) | (b & c) | 1091 | orl %esi, %edi # ((b | c) & d) | (b & c) |
| 967 | xorl %r8d, %r11d # W[n & 15] ^= W[(n+13) & 15] | ||
| 968 | xorl -32+4*3(%rsp), %r11d # ^W[(n+8) & 15] | ||
| 969 | xorl %r13d, %r11d # ^W[(n+2) & 15] | ||
| 970 | roll %r11d # | ||
| 971 | addl %edi, %eax # += ((b | c) & d) | (b & c) | 1092 | addl %edi, %eax # += ((b | c) & d) | (b & c) |
| 972 | leal -0x70E44324(%rax,%r11), %eax # e += RCONST + W[n & 15] | 1093 | addl -64+4*11(%rsp), %eax # e += RCONST + W[n & 15] |
| 973 | movl %ebx, %esi # | 1094 | movl %ebx, %esi # |
| 974 | roll $5, %esi # rotl32(a,5) | 1095 | roll $5, %esi # rotl32(a,5) |
| 975 | addl %esi, %eax # e += rotl32(a,5) | 1096 | addl %esi, %eax # e += rotl32(a,5) |
| 976 | rorl $2, %ecx # b = rotl32(b,30) | 1097 | rorl $2, %ecx # b = rotl32(b,30) |
| 1098 | # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) | ||
| 1099 | movaps %xmm0, %xmm4 | ||
| 1100 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
| 1101 | pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
| 1102 | punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
| 1103 | xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
| 1104 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
| 1105 | xorps %xmm5, %xmm1 # ^ | ||
| 1106 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
| 1107 | movaps %xmm1, %xmm5 | ||
| 1108 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
| 1109 | pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
| 1110 | paddd %xmm1, %xmm1 # shift left by 1 | ||
| 1111 | psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 | ||
| 1112 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
| 1113 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
| 1114 | movaps %xmm5, %xmm4 | ||
| 1115 | pslld $2, %xmm5 | ||
| 1116 | psrld $30, %xmm4 | ||
| 1117 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
| 1118 | xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 | ||
| 1119 | xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
| 1120 | movaps %xmm1, %xmm5 | ||
| 1121 | paddd %xmm6, %xmm5 | ||
| 1122 | movups %xmm5, -64+16*1(%rsp) | ||
| 977 | # 60 | 1123 | # 60 |
| 978 | xorl %r9d, %r12d # W[n & 15] ^= W[(n+13) & 15] | ||
| 979 | xorl -32+4*4(%rsp), %r12d # ^W[(n+8) & 15] | ||
| 980 | xorl %r14d, %r12d # ^W[(n+2) & 15] | ||
| 981 | roll %r12d # | ||
| 982 | movl %ecx, %edi # c | 1124 | movl %ecx, %edi # c |
| 983 | xorl %edx, %edi # ^d | 1125 | xorl %edx, %edi # ^d |
| 984 | xorl %ebx, %edi # ^b | 1126 | xorl %ebx, %edi # ^b |
| 985 | leal -0x359D3E2A(%rbp,%r12), %ebp # e += RCONST + W[n & 15] | 1127 | addl -64+4*12(%rsp), %ebp # e += RCONST + W[n & 15] |
| 986 | addl %edi, %ebp # e += (c ^ d ^ b) | 1128 | addl %edi, %ebp # e += (c ^ d ^ b) |
| 987 | movl %eax, %esi # | 1129 | movl %eax, %esi # |
| 988 | roll $5, %esi # rotl32(a,5) | 1130 | roll $5, %esi # rotl32(a,5) |
| 989 | addl %esi, %ebp # e += rotl32(a,5) | 1131 | addl %esi, %ebp # e += rotl32(a,5) |
| 990 | rorl $2, %ebx # b = rotl32(b,30) | 1132 | rorl $2, %ebx # b = rotl32(b,30) |
| 991 | # 61 | 1133 | # 61 |
| 992 | xorl %r10d, %r13d # W[n & 15] ^= W[(n+13) & 15] | ||
| 993 | xorl -32+4*5(%rsp), %r13d # ^W[(n+8) & 15] | ||
| 994 | xorl %r15d, %r13d # ^W[(n+2) & 15] | ||
| 995 | roll %r13d # | ||
| 996 | movl %ebx, %edi # c | 1134 | movl %ebx, %edi # c |
| 997 | xorl %ecx, %edi # ^d | 1135 | xorl %ecx, %edi # ^d |
| 998 | xorl %eax, %edi # ^b | 1136 | xorl %eax, %edi # ^b |
| 999 | leal -0x359D3E2A(%rdx,%r13), %edx # e += RCONST + W[n & 15] | 1137 | addl -64+4*13(%rsp), %edx # e += RCONST + W[n & 15] |
| 1000 | addl %edi, %edx # e += (c ^ d ^ b) | 1138 | addl %edi, %edx # e += (c ^ d ^ b) |
| 1001 | movl %ebp, %esi # | 1139 | movl %ebp, %esi # |
| 1002 | roll $5, %esi # rotl32(a,5) | 1140 | roll $5, %esi # rotl32(a,5) |
| 1003 | addl %esi, %edx # e += rotl32(a,5) | 1141 | addl %esi, %edx # e += rotl32(a,5) |
| 1004 | rorl $2, %eax # b = rotl32(b,30) | 1142 | rorl $2, %eax # b = rotl32(b,30) |
| 1005 | # 62 | 1143 | # 62 |
| 1006 | xorl %r11d, %r14d # W[n & 15] ^= W[(n+13) & 15] | ||
| 1007 | xorl -32+4*6(%rsp), %r14d # ^W[(n+8) & 15] | ||
| 1008 | xorl -32+4*0(%rsp), %r14d # ^W[(n+2) & 15] | ||
| 1009 | roll %r14d # | ||
| 1010 | movl %eax, %edi # c | 1144 | movl %eax, %edi # c |
| 1011 | xorl %ebx, %edi # ^d | 1145 | xorl %ebx, %edi # ^d |
| 1012 | xorl %ebp, %edi # ^b | 1146 | xorl %ebp, %edi # ^b |
| 1013 | leal -0x359D3E2A(%rcx,%r14), %ecx # e += RCONST + W[n & 15] | 1147 | addl -64+4*14(%rsp), %ecx # e += RCONST + W[n & 15] |
| 1014 | addl %edi, %ecx # e += (c ^ d ^ b) | 1148 | addl %edi, %ecx # e += (c ^ d ^ b) |
| 1015 | movl %edx, %esi # | 1149 | movl %edx, %esi # |
| 1016 | roll $5, %esi # rotl32(a,5) | 1150 | roll $5, %esi # rotl32(a,5) |
| 1017 | addl %esi, %ecx # e += rotl32(a,5) | 1151 | addl %esi, %ecx # e += rotl32(a,5) |
| 1018 | rorl $2, %ebp # b = rotl32(b,30) | 1152 | rorl $2, %ebp # b = rotl32(b,30) |
| 1019 | # 63 | 1153 | # 63 |
| 1020 | xorl %r12d, %r15d # W[n & 15] ^= W[(n+13) & 15] | ||
| 1021 | xorl -32+4*7(%rsp), %r15d # ^W[(n+8) & 15] | ||
| 1022 | xorl -32+4*1(%rsp), %r15d # ^W[(n+2) & 15] | ||
| 1023 | roll %r15d # | ||
| 1024 | movl %ebp, %edi # c | 1154 | movl %ebp, %edi # c |
| 1025 | xorl %eax, %edi # ^d | 1155 | xorl %eax, %edi # ^d |
| 1026 | xorl %edx, %edi # ^b | 1156 | xorl %edx, %edi # ^b |
| 1027 | leal -0x359D3E2A(%rbx,%r15), %ebx # e += RCONST + W[n & 15] | 1157 | addl -64+4*15(%rsp), %ebx # e += RCONST + W[n & 15] |
| 1028 | addl %edi, %ebx # e += (c ^ d ^ b) | 1158 | addl %edi, %ebx # e += (c ^ d ^ b) |
| 1029 | movl %ecx, %esi # | 1159 | movl %ecx, %esi # |
| 1030 | roll $5, %esi # rotl32(a,5) | 1160 | roll $5, %esi # rotl32(a,5) |
| 1031 | addl %esi, %ebx # e += rotl32(a,5) | 1161 | addl %esi, %ebx # e += rotl32(a,5) |
| 1032 | rorl $2, %edx # b = rotl32(b,30) | 1162 | rorl $2, %edx # b = rotl32(b,30) |
| 1163 | # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) | ||
| 1164 | movaps %xmm1, %xmm4 | ||
| 1165 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
| 1166 | pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
| 1167 | punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
| 1168 | xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
| 1169 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
| 1170 | xorps %xmm5, %xmm2 # ^ | ||
| 1171 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
| 1172 | movaps %xmm2, %xmm5 | ||
| 1173 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
| 1174 | pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
| 1175 | paddd %xmm2, %xmm2 # shift left by 1 | ||
| 1176 | psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 | ||
| 1177 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
| 1178 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
| 1179 | movaps %xmm5, %xmm4 | ||
| 1180 | pslld $2, %xmm5 | ||
| 1181 | psrld $30, %xmm4 | ||
| 1182 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
| 1183 | xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 | ||
| 1184 | xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
| 1185 | movaps %xmm2, %xmm5 | ||
| 1186 | paddd %xmm6, %xmm5 | ||
| 1187 | movups %xmm5, -64+16*2(%rsp) | ||
| 1033 | # 64 | 1188 | # 64 |
| 1034 | movl %r13d, %esi # W[(n+13) & 15] | ||
| 1035 | xorl %r8d, %esi # ^W[(n+8) & 15] | ||
| 1036 | xorl -32+4*2(%rsp), %esi # ^W[(n+2) & 15] | ||
| 1037 | xorl -32+4*0(%rsp), %esi # ^W[n & 15] | ||
| 1038 | roll %esi # | ||
| 1039 | movl %esi, -32+4*0(%rsp) # store to W[n & 15] | ||
| 1040 | movl %edx, %edi # c | 1189 | movl %edx, %edi # c |
| 1041 | xorl %ebp, %edi # ^d | 1190 | xorl %ebp, %edi # ^d |
| 1042 | xorl %ecx, %edi # ^b | 1191 | xorl %ecx, %edi # ^b |
| 1043 | leal -0x359D3E2A(%rax,%rsi), %eax # e += RCONST + W[n & 15] | 1192 | addl -64+4*0(%rsp), %eax # e += RCONST + W[n & 15] |
| 1044 | addl %edi, %eax # e += (c ^ d ^ b) | 1193 | addl %edi, %eax # e += (c ^ d ^ b) |
| 1045 | movl %ebx, %esi # | 1194 | movl %ebx, %esi # |
| 1046 | roll $5, %esi # rotl32(a,5) | 1195 | roll $5, %esi # rotl32(a,5) |
| 1047 | addl %esi, %eax # e += rotl32(a,5) | 1196 | addl %esi, %eax # e += rotl32(a,5) |
| 1048 | rorl $2, %ecx # b = rotl32(b,30) | 1197 | rorl $2, %ecx # b = rotl32(b,30) |
| 1049 | # 65 | 1198 | # 65 |
| 1050 | movl %r14d, %esi # W[(n+13) & 15] | ||
| 1051 | xorl %r9d, %esi # ^W[(n+8) & 15] | ||
| 1052 | xorl -32+4*3(%rsp), %esi # ^W[(n+2) & 15] | ||
| 1053 | xorl -32+4*1(%rsp), %esi # ^W[n & 15] | ||
| 1054 | roll %esi # | ||
| 1055 | movl %esi, -32+4*1(%rsp) # store to W[n & 15] | ||
| 1056 | movl %ecx, %edi # c | 1199 | movl %ecx, %edi # c |
| 1057 | xorl %edx, %edi # ^d | 1200 | xorl %edx, %edi # ^d |
| 1058 | xorl %ebx, %edi # ^b | 1201 | xorl %ebx, %edi # ^b |
| 1059 | leal -0x359D3E2A(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] | 1202 | addl -64+4*1(%rsp), %ebp # e += RCONST + W[n & 15] |
| 1060 | addl %edi, %ebp # e += (c ^ d ^ b) | 1203 | addl %edi, %ebp # e += (c ^ d ^ b) |
| 1061 | movl %eax, %esi # | 1204 | movl %eax, %esi # |
| 1062 | roll $5, %esi # rotl32(a,5) | 1205 | roll $5, %esi # rotl32(a,5) |
| 1063 | addl %esi, %ebp # e += rotl32(a,5) | 1206 | addl %esi, %ebp # e += rotl32(a,5) |
| 1064 | rorl $2, %ebx # b = rotl32(b,30) | 1207 | rorl $2, %ebx # b = rotl32(b,30) |
| 1065 | # 66 | 1208 | # 66 |
| 1066 | movl %r15d, %esi # W[(n+13) & 15] | ||
| 1067 | xorl %r10d, %esi # ^W[(n+8) & 15] | ||
| 1068 | xorl -32+4*4(%rsp), %esi # ^W[(n+2) & 15] | ||
| 1069 | xorl -32+4*2(%rsp), %esi # ^W[n & 15] | ||
| 1070 | roll %esi # | ||
| 1071 | movl %esi, -32+4*2(%rsp) # store to W[n & 15] | ||
| 1072 | movl %ebx, %edi # c | 1209 | movl %ebx, %edi # c |
| 1073 | xorl %ecx, %edi # ^d | 1210 | xorl %ecx, %edi # ^d |
| 1074 | xorl %eax, %edi # ^b | 1211 | xorl %eax, %edi # ^b |
| 1075 | leal -0x359D3E2A(%rdx,%rsi), %edx # e += RCONST + W[n & 15] | 1212 | addl -64+4*2(%rsp), %edx # e += RCONST + W[n & 15] |
| 1076 | addl %edi, %edx # e += (c ^ d ^ b) | 1213 | addl %edi, %edx # e += (c ^ d ^ b) |
| 1077 | movl %ebp, %esi # | 1214 | movl %ebp, %esi # |
| 1078 | roll $5, %esi # rotl32(a,5) | 1215 | roll $5, %esi # rotl32(a,5) |
| 1079 | addl %esi, %edx # e += rotl32(a,5) | 1216 | addl %esi, %edx # e += rotl32(a,5) |
| 1080 | rorl $2, %eax # b = rotl32(b,30) | 1217 | rorl $2, %eax # b = rotl32(b,30) |
| 1081 | # 67 | 1218 | # 67 |
| 1082 | movl -32+4*0(%rsp), %esi # W[(n+13) & 15] | ||
| 1083 | xorl %r11d, %esi # ^W[(n+8) & 15] | ||
| 1084 | xorl -32+4*5(%rsp), %esi # ^W[(n+2) & 15] | ||
| 1085 | xorl -32+4*3(%rsp), %esi # ^W[n & 15] | ||
| 1086 | roll %esi # | ||
| 1087 | movl %esi, -32+4*3(%rsp) # store to W[n & 15] | ||
| 1088 | movl %eax, %edi # c | 1219 | movl %eax, %edi # c |
| 1089 | xorl %ebx, %edi # ^d | 1220 | xorl %ebx, %edi # ^d |
| 1090 | xorl %ebp, %edi # ^b | 1221 | xorl %ebp, %edi # ^b |
| 1091 | leal -0x359D3E2A(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] | 1222 | addl -64+4*3(%rsp), %ecx # e += RCONST + W[n & 15] |
| 1092 | addl %edi, %ecx # e += (c ^ d ^ b) | 1223 | addl %edi, %ecx # e += (c ^ d ^ b) |
| 1093 | movl %edx, %esi # | 1224 | movl %edx, %esi # |
| 1094 | roll $5, %esi # rotl32(a,5) | 1225 | roll $5, %esi # rotl32(a,5) |
| 1095 | addl %esi, %ecx # e += rotl32(a,5) | 1226 | addl %esi, %ecx # e += rotl32(a,5) |
| 1096 | rorl $2, %ebp # b = rotl32(b,30) | 1227 | rorl $2, %ebp # b = rotl32(b,30) |
| 1228 | # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) | ||
| 1229 | movaps %xmm2, %xmm4 | ||
| 1230 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
| 1231 | pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
| 1232 | punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
| 1233 | xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
| 1234 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
| 1235 | xorps %xmm5, %xmm3 # ^ | ||
| 1236 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
| 1237 | movaps %xmm3, %xmm5 | ||
| 1238 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
| 1239 | pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
| 1240 | paddd %xmm3, %xmm3 # shift left by 1 | ||
| 1241 | psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 | ||
| 1242 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
| 1243 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
| 1244 | movaps %xmm5, %xmm4 | ||
| 1245 | pslld $2, %xmm5 | ||
| 1246 | psrld $30, %xmm4 | ||
| 1247 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
| 1248 | xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 | ||
| 1249 | xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
| 1250 | movaps %xmm3, %xmm5 | ||
| 1251 | paddd %xmm6, %xmm5 | ||
| 1252 | movups %xmm5, -64+16*3(%rsp) | ||
| 1097 | # 68 | 1253 | # 68 |
| 1098 | movl -32+4*1(%rsp), %esi # W[(n+13) & 15] | ||
| 1099 | xorl %r12d, %esi # ^W[(n+8) & 15] | ||
| 1100 | xorl -32+4*6(%rsp), %esi # ^W[(n+2) & 15] | ||
| 1101 | xorl -32+4*4(%rsp), %esi # ^W[n & 15] | ||
| 1102 | roll %esi # | ||
| 1103 | movl %esi, -32+4*4(%rsp) # store to W[n & 15] | ||
| 1104 | movl %ebp, %edi # c | 1254 | movl %ebp, %edi # c |
| 1105 | xorl %eax, %edi # ^d | 1255 | xorl %eax, %edi # ^d |
| 1106 | xorl %edx, %edi # ^b | 1256 | xorl %edx, %edi # ^b |
| 1107 | leal -0x359D3E2A(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] | 1257 | addl -64+4*4(%rsp), %ebx # e += RCONST + W[n & 15] |
| 1108 | addl %edi, %ebx # e += (c ^ d ^ b) | 1258 | addl %edi, %ebx # e += (c ^ d ^ b) |
| 1109 | movl %ecx, %esi # | 1259 | movl %ecx, %esi # |
| 1110 | roll $5, %esi # rotl32(a,5) | 1260 | roll $5, %esi # rotl32(a,5) |
| 1111 | addl %esi, %ebx # e += rotl32(a,5) | 1261 | addl %esi, %ebx # e += rotl32(a,5) |
| 1112 | rorl $2, %edx # b = rotl32(b,30) | 1262 | rorl $2, %edx # b = rotl32(b,30) |
| 1113 | # 69 | 1263 | # 69 |
| 1114 | movl -32+4*2(%rsp), %esi # W[(n+13) & 15] | ||
| 1115 | xorl %r13d, %esi # ^W[(n+8) & 15] | ||
| 1116 | xorl -32+4*7(%rsp), %esi # ^W[(n+2) & 15] | ||
| 1117 | xorl -32+4*5(%rsp), %esi # ^W[n & 15] | ||
| 1118 | roll %esi # | ||
| 1119 | movl %esi, -32+4*5(%rsp) # store to W[n & 15] | ||
| 1120 | movl %edx, %edi # c | 1264 | movl %edx, %edi # c |
| 1121 | xorl %ebp, %edi # ^d | 1265 | xorl %ebp, %edi # ^d |
| 1122 | xorl %ecx, %edi # ^b | 1266 | xorl %ecx, %edi # ^b |
| 1123 | leal -0x359D3E2A(%rax,%rsi), %eax # e += RCONST + W[n & 15] | 1267 | addl -64+4*5(%rsp), %eax # e += RCONST + W[n & 15] |
| 1124 | addl %edi, %eax # e += (c ^ d ^ b) | 1268 | addl %edi, %eax # e += (c ^ d ^ b) |
| 1125 | movl %ebx, %esi # | 1269 | movl %ebx, %esi # |
| 1126 | roll $5, %esi # rotl32(a,5) | 1270 | roll $5, %esi # rotl32(a,5) |
| 1127 | addl %esi, %eax # e += rotl32(a,5) | 1271 | addl %esi, %eax # e += rotl32(a,5) |
| 1128 | rorl $2, %ecx # b = rotl32(b,30) | 1272 | rorl $2, %ecx # b = rotl32(b,30) |
| 1129 | # 70 | 1273 | # 70 |
| 1130 | movl -32+4*3(%rsp), %esi # W[(n+13) & 15] | ||
| 1131 | xorl %r14d, %esi # ^W[(n+8) & 15] | ||
| 1132 | xorl %r8d, %esi # ^W[(n+2) & 15] | ||
| 1133 | xorl -32+4*6(%rsp), %esi # ^W[n & 15] | ||
| 1134 | roll %esi # | ||
| 1135 | movl %esi, -32+4*6(%rsp) # store to W[n & 15] | ||
| 1136 | movl %ecx, %edi # c | 1274 | movl %ecx, %edi # c |
| 1137 | xorl %edx, %edi # ^d | 1275 | xorl %edx, %edi # ^d |
| 1138 | xorl %ebx, %edi # ^b | 1276 | xorl %ebx, %edi # ^b |
| 1139 | leal -0x359D3E2A(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] | 1277 | addl -64+4*6(%rsp), %ebp # e += RCONST + W[n & 15] |
| 1140 | addl %edi, %ebp # e += (c ^ d ^ b) | 1278 | addl %edi, %ebp # e += (c ^ d ^ b) |
| 1141 | movl %eax, %esi # | 1279 | movl %eax, %esi # |
| 1142 | roll $5, %esi # rotl32(a,5) | 1280 | roll $5, %esi # rotl32(a,5) |
| 1143 | addl %esi, %ebp # e += rotl32(a,5) | 1281 | addl %esi, %ebp # e += rotl32(a,5) |
| 1144 | rorl $2, %ebx # b = rotl32(b,30) | 1282 | rorl $2, %ebx # b = rotl32(b,30) |
| 1145 | # 71 | 1283 | # 71 |
| 1146 | movl -32+4*4(%rsp), %esi # W[(n+13) & 15] | ||
| 1147 | xorl %r15d, %esi # ^W[(n+8) & 15] | ||
| 1148 | xorl %r9d, %esi # ^W[(n+2) & 15] | ||
| 1149 | xorl -32+4*7(%rsp), %esi # ^W[n & 15] | ||
| 1150 | roll %esi # | ||
| 1151 | movl %esi, -32+4*7(%rsp) # store to W[n & 15] | ||
| 1152 | movl %ebx, %edi # c | 1284 | movl %ebx, %edi # c |
| 1153 | xorl %ecx, %edi # ^d | 1285 | xorl %ecx, %edi # ^d |
| 1154 | xorl %eax, %edi # ^b | 1286 | xorl %eax, %edi # ^b |
| 1155 | leal -0x359D3E2A(%rdx,%rsi), %edx # e += RCONST + W[n & 15] | 1287 | addl -64+4*7(%rsp), %edx # e += RCONST + W[n & 15] |
| 1156 | addl %edi, %edx # e += (c ^ d ^ b) | 1288 | addl %edi, %edx # e += (c ^ d ^ b) |
| 1157 | movl %ebp, %esi # | 1289 | movl %ebp, %esi # |
| 1158 | roll $5, %esi # rotl32(a,5) | 1290 | roll $5, %esi # rotl32(a,5) |
| 1159 | addl %esi, %edx # e += rotl32(a,5) | 1291 | addl %esi, %edx # e += rotl32(a,5) |
| 1160 | rorl $2, %eax # b = rotl32(b,30) | 1292 | rorl $2, %eax # b = rotl32(b,30) |
| 1161 | # 72 | 1293 | # 72 |
| 1162 | xorl -32+4*5(%rsp), %r8d # W[n & 15] ^= W[(n+13) & 15] | ||
| 1163 | xorl -32+4*0(%rsp), %r8d # ^W[(n+8) & 15] | ||
| 1164 | xorl %r10d, %r8d # ^W[(n+2) & 15] | ||
| 1165 | roll %r8d # | ||
| 1166 | movl %eax, %edi # c | 1294 | movl %eax, %edi # c |
| 1167 | xorl %ebx, %edi # ^d | 1295 | xorl %ebx, %edi # ^d |
| 1168 | xorl %ebp, %edi # ^b | 1296 | xorl %ebp, %edi # ^b |
| 1169 | leal -0x359D3E2A(%rcx,%r8), %ecx # e += RCONST + W[n & 15] | 1297 | addl -64+4*8(%rsp), %ecx # e += RCONST + W[n & 15] |
| 1170 | addl %edi, %ecx # e += (c ^ d ^ b) | 1298 | addl %edi, %ecx # e += (c ^ d ^ b) |
| 1171 | movl %edx, %esi # | 1299 | movl %edx, %esi # |
| 1172 | roll $5, %esi # rotl32(a,5) | 1300 | roll $5, %esi # rotl32(a,5) |
| 1173 | addl %esi, %ecx # e += rotl32(a,5) | 1301 | addl %esi, %ecx # e += rotl32(a,5) |
| 1174 | rorl $2, %ebp # b = rotl32(b,30) | 1302 | rorl $2, %ebp # b = rotl32(b,30) |
| 1175 | # 73 | 1303 | # 73 |
| 1176 | xorl -32+4*6(%rsp), %r9d # W[n & 15] ^= W[(n+13) & 15] | ||
| 1177 | xorl -32+4*1(%rsp), %r9d # ^W[(n+8) & 15] | ||
| 1178 | xorl %r11d, %r9d # ^W[(n+2) & 15] | ||
| 1179 | roll %r9d # | ||
| 1180 | movl %ebp, %edi # c | 1304 | movl %ebp, %edi # c |
| 1181 | xorl %eax, %edi # ^d | 1305 | xorl %eax, %edi # ^d |
| 1182 | xorl %edx, %edi # ^b | 1306 | xorl %edx, %edi # ^b |
| 1183 | leal -0x359D3E2A(%rbx,%r9), %ebx # e += RCONST + W[n & 15] | 1307 | addl -64+4*9(%rsp), %ebx # e += RCONST + W[n & 15] |
| 1184 | addl %edi, %ebx # e += (c ^ d ^ b) | 1308 | addl %edi, %ebx # e += (c ^ d ^ b) |
| 1185 | movl %ecx, %esi # | 1309 | movl %ecx, %esi # |
| 1186 | roll $5, %esi # rotl32(a,5) | 1310 | roll $5, %esi # rotl32(a,5) |
| 1187 | addl %esi, %ebx # e += rotl32(a,5) | 1311 | addl %esi, %ebx # e += rotl32(a,5) |
| 1188 | rorl $2, %edx # b = rotl32(b,30) | 1312 | rorl $2, %edx # b = rotl32(b,30) |
| 1189 | # 74 | 1313 | # 74 |
| 1190 | xorl -32+4*7(%rsp), %r10d # W[n & 15] ^= W[(n+13) & 15] | ||
| 1191 | xorl -32+4*2(%rsp), %r10d # ^W[(n+8) & 15] | ||
| 1192 | xorl %r12d, %r10d # ^W[(n+2) & 15] | ||
| 1193 | roll %r10d # | ||
| 1194 | movl %edx, %edi # c | 1314 | movl %edx, %edi # c |
| 1195 | xorl %ebp, %edi # ^d | 1315 | xorl %ebp, %edi # ^d |
| 1196 | xorl %ecx, %edi # ^b | 1316 | xorl %ecx, %edi # ^b |
| 1197 | leal -0x359D3E2A(%rax,%r10), %eax # e += RCONST + W[n & 15] | 1317 | addl -64+4*10(%rsp), %eax # e += RCONST + W[n & 15] |
| 1198 | addl %edi, %eax # e += (c ^ d ^ b) | 1318 | addl %edi, %eax # e += (c ^ d ^ b) |
| 1199 | movl %ebx, %esi # | 1319 | movl %ebx, %esi # |
| 1200 | roll $5, %esi # rotl32(a,5) | 1320 | roll $5, %esi # rotl32(a,5) |
| 1201 | addl %esi, %eax # e += rotl32(a,5) | 1321 | addl %esi, %eax # e += rotl32(a,5) |
| 1202 | rorl $2, %ecx # b = rotl32(b,30) | 1322 | rorl $2, %ecx # b = rotl32(b,30) |
| 1203 | # 75 | 1323 | # 75 |
| 1204 | xorl %r8d, %r11d # W[n & 15] ^= W[(n+13) & 15] | ||
| 1205 | xorl -32+4*3(%rsp), %r11d # ^W[(n+8) & 15] | ||
| 1206 | xorl %r13d, %r11d # ^W[(n+2) & 15] | ||
| 1207 | roll %r11d # | ||
| 1208 | movl %ecx, %edi # c | 1324 | movl %ecx, %edi # c |
| 1209 | xorl %edx, %edi # ^d | 1325 | xorl %edx, %edi # ^d |
| 1210 | xorl %ebx, %edi # ^b | 1326 | xorl %ebx, %edi # ^b |
| 1211 | leal -0x359D3E2A(%rbp,%r11), %ebp # e += RCONST + W[n & 15] | 1327 | addl -64+4*11(%rsp), %ebp # e += RCONST + W[n & 15] |
| 1212 | addl %edi, %ebp # e += (c ^ d ^ b) | 1328 | addl %edi, %ebp # e += (c ^ d ^ b) |
| 1213 | movl %eax, %esi # | 1329 | movl %eax, %esi # |
| 1214 | roll $5, %esi # rotl32(a,5) | 1330 | roll $5, %esi # rotl32(a,5) |
| 1215 | addl %esi, %ebp # e += rotl32(a,5) | 1331 | addl %esi, %ebp # e += rotl32(a,5) |
| 1216 | rorl $2, %ebx # b = rotl32(b,30) | 1332 | rorl $2, %ebx # b = rotl32(b,30) |
| 1217 | # 76 | 1333 | # 76 |
| 1218 | xorl %r9d, %r12d # W[n & 15] ^= W[(n+13) & 15] | ||
| 1219 | xorl -32+4*4(%rsp), %r12d # ^W[(n+8) & 15] | ||
| 1220 | xorl %r14d, %r12d # ^W[(n+2) & 15] | ||
| 1221 | roll %r12d # | ||
| 1222 | movl %ebx, %edi # c | 1334 | movl %ebx, %edi # c |
| 1223 | xorl %ecx, %edi # ^d | 1335 | xorl %ecx, %edi # ^d |
| 1224 | xorl %eax, %edi # ^b | 1336 | xorl %eax, %edi # ^b |
| 1225 | leal -0x359D3E2A(%rdx,%r12), %edx # e += RCONST + W[n & 15] | 1337 | addl -64+4*12(%rsp), %edx # e += RCONST + W[n & 15] |
| 1226 | addl %edi, %edx # e += (c ^ d ^ b) | 1338 | addl %edi, %edx # e += (c ^ d ^ b) |
| 1227 | movl %ebp, %esi # | 1339 | movl %ebp, %esi # |
| 1228 | roll $5, %esi # rotl32(a,5) | 1340 | roll $5, %esi # rotl32(a,5) |
| 1229 | addl %esi, %edx # e += rotl32(a,5) | 1341 | addl %esi, %edx # e += rotl32(a,5) |
| 1230 | rorl $2, %eax # b = rotl32(b,30) | 1342 | rorl $2, %eax # b = rotl32(b,30) |
| 1231 | # 77 | 1343 | # 77 |
| 1232 | xorl %r10d, %r13d # W[n & 15] ^= W[(n+13) & 15] | ||
| 1233 | xorl -32+4*5(%rsp), %r13d # ^W[(n+8) & 15] | ||
| 1234 | xorl %r15d, %r13d # ^W[(n+2) & 15] | ||
| 1235 | roll %r13d # | ||
| 1236 | movl %eax, %edi # c | 1344 | movl %eax, %edi # c |
| 1237 | xorl %ebx, %edi # ^d | 1345 | xorl %ebx, %edi # ^d |
| 1238 | xorl %ebp, %edi # ^b | 1346 | xorl %ebp, %edi # ^b |
| 1239 | leal -0x359D3E2A(%rcx,%r13), %ecx # e += RCONST + W[n & 15] | 1347 | addl -64+4*13(%rsp), %ecx # e += RCONST + W[n & 15] |
| 1240 | addl %edi, %ecx # e += (c ^ d ^ b) | 1348 | addl %edi, %ecx # e += (c ^ d ^ b) |
| 1241 | movl %edx, %esi # | 1349 | movl %edx, %esi # |
| 1242 | roll $5, %esi # rotl32(a,5) | 1350 | roll $5, %esi # rotl32(a,5) |
| 1243 | addl %esi, %ecx # e += rotl32(a,5) | 1351 | addl %esi, %ecx # e += rotl32(a,5) |
| 1244 | rorl $2, %ebp # b = rotl32(b,30) | 1352 | rorl $2, %ebp # b = rotl32(b,30) |
| 1245 | # 78 | 1353 | # 78 |
| 1246 | xorl %r11d, %r14d # W[n & 15] ^= W[(n+13) & 15] | ||
| 1247 | xorl -32+4*6(%rsp), %r14d # ^W[(n+8) & 15] | ||
| 1248 | xorl -32+4*0(%rsp), %r14d # ^W[(n+2) & 15] | ||
| 1249 | roll %r14d # | ||
| 1250 | movl %ebp, %edi # c | 1354 | movl %ebp, %edi # c |
| 1251 | xorl %eax, %edi # ^d | 1355 | xorl %eax, %edi # ^d |
| 1252 | xorl %edx, %edi # ^b | 1356 | xorl %edx, %edi # ^b |
| 1253 | leal -0x359D3E2A(%rbx,%r14), %ebx # e += RCONST + W[n & 15] | 1357 | addl -64+4*14(%rsp), %ebx # e += RCONST + W[n & 15] |
| 1254 | addl %edi, %ebx # e += (c ^ d ^ b) | 1358 | addl %edi, %ebx # e += (c ^ d ^ b) |
| 1255 | movl %ecx, %esi # | 1359 | movl %ecx, %esi # |
| 1256 | roll $5, %esi # rotl32(a,5) | 1360 | roll $5, %esi # rotl32(a,5) |
| 1257 | addl %esi, %ebx # e += rotl32(a,5) | 1361 | addl %esi, %ebx # e += rotl32(a,5) |
| 1258 | rorl $2, %edx # b = rotl32(b,30) | 1362 | rorl $2, %edx # b = rotl32(b,30) |
| 1259 | # 79 | 1363 | # 79 |
| 1260 | xorl %r12d, %r15d # W[n & 15] ^= W[(n+13) & 15] | ||
| 1261 | xorl -32+4*7(%rsp), %r15d # ^W[(n+8) & 15] | ||
| 1262 | xorl -32+4*1(%rsp), %r15d # ^W[(n+2) & 15] | ||
| 1263 | roll %r15d # | ||
| 1264 | movl %edx, %edi # c | 1364 | movl %edx, %edi # c |
| 1265 | xorl %ebp, %edi # ^d | 1365 | xorl %ebp, %edi # ^d |
| 1266 | xorl %ecx, %edi # ^b | 1366 | xorl %ecx, %edi # ^b |
| 1267 | leal -0x359D3E2A(%rax,%r15), %eax # e += RCONST + W[n & 15] | 1367 | addl -64+4*15(%rsp), %eax # e += RCONST + W[n & 15] |
| 1268 | addl %edi, %eax # e += (c ^ d ^ b) | 1368 | addl %edi, %eax # e += (c ^ d ^ b) |
| 1269 | movl %ebx, %esi # | 1369 | movl %ebx, %esi # |
| 1270 | roll $5, %esi # rotl32(a,5) | 1370 | roll $5, %esi # rotl32(a,5) |
| @@ -1286,4 +1386,28 @@ sha1_process_block64: | |||
| 1286 | 1386 | ||
| 1287 | ret | 1387 | ret |
| 1288 | .size sha1_process_block64, .-sha1_process_block64 | 1388 | .size sha1_process_block64, .-sha1_process_block64 |
| 1389 | |||
| 1390 | .section .rodata.cst16.sha1const, "aM", @progbits, 16 | ||
| 1391 | .align 16 | ||
| 1392 | rconst0x5A827999: | ||
| 1393 | .long 0x5A827999 | ||
| 1394 | .long 0x5A827999 | ||
| 1395 | .long 0x5A827999 | ||
| 1396 | .long 0x5A827999 | ||
| 1397 | rconst0x6ED9EBA1: | ||
| 1398 | .long 0x6ED9EBA1 | ||
| 1399 | .long 0x6ED9EBA1 | ||
| 1400 | .long 0x6ED9EBA1 | ||
| 1401 | .long 0x6ED9EBA1 | ||
| 1402 | rconst0x8F1BBCDC: | ||
| 1403 | .long 0x8F1BBCDC | ||
| 1404 | .long 0x8F1BBCDC | ||
| 1405 | .long 0x8F1BBCDC | ||
| 1406 | .long 0x8F1BBCDC | ||
| 1407 | rconst0xCA62C1D6: | ||
| 1408 | .long 0xCA62C1D6 | ||
| 1409 | .long 0xCA62C1D6 | ||
| 1410 | .long 0xCA62C1D6 | ||
| 1411 | .long 0xCA62C1D6 | ||
| 1412 | |||
| 1289 | #endif | 1413 | #endif |
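Across all the rounds above the change is uniform: the old `leal RCONST(%rX,%rW)` form, which folded the round constant into an address calculation, becomes a single `addl -64+4*n(%rsp)`, because the PREP blocks have already stored RCONST + W[n] into that spill area. In C terms, one round step now amounts to the sketch below (reference only, not code from the patch; `wk` names the spilled RCONST + W[n & 15] value, and the rotation of the five working variables between steps is left to the caller):

```c
#include <stdint.h>

static inline uint32_t rotl32(uint32_t x, unsigned n)
{
	return (x << n) | (x >> (32 - n));
}

static uint32_t f_parity(uint32_t b, uint32_t c, uint32_t d)
{
	return b ^ c ^ d;                  /* rounds 20..39 and 60..79 */
}

static uint32_t f_maj(uint32_t b, uint32_t c, uint32_t d)
{
	return ((b | c) & d) | (b & c);    /* rounds 40..59, written as in the listing */
}

static void sha1_round_step(uint32_t a, uint32_t *b, uint32_t c, uint32_t d,
			    uint32_t *e, uint32_t f, uint32_t wk)
{
	*e += wk + f + rotl32(a, 5);	/* wk = RCONST + W[n & 15] from the spill area */
	*b = rotl32(*b, 30);		/* rorl $2 in the asm */
	/* the next step renames (a,b,c,d,e) <- (e,a,b,c,d) */
}
```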
diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh index 901896e6e..87c2d0800 100755 --- a/libbb/hash_md5_sha_x86-64.S.sh +++ b/libbb/hash_md5_sha_x86-64.S.sh | |||
| @@ -6,33 +6,103 @@ | |||
| 6 | # also contains the diff of the generated file. | 6 | # also contains the diff of the generated file. |
| 7 | exec >hash_md5_sha_x86-64.S | 7 | exec >hash_md5_sha_x86-64.S |
| 8 | 8 | ||
| 9 | # There is a way to use XMM registers (which always exist for x86-64!) for W[] | 9 | # Based on http://arctic.org/~dean/crypto/sha1.html. |
| 10 | # For example, if we load W as follows: | 10 | # ("This SHA1 implementation is public domain.") |
| 11 | # %xmm0: w[0x0] w[0x1] w[0x2] w[0x3] | 11 | # |
| 12 | # %xmm4: w[0x4] w[0x5] w[0x6] w[0x7] | 12 | # x86-64 has at least SSE2 vector insns always available. |
| 13 | # %xmm8: w[0x8] w[0x9] w[0xa] w[0xb] | 13 | # We can use them without any CPUID checks (and without a need |
| 14 | # %xmm12: w[0xc] w[0xd] w[0xe] w[0xf] | 14 | # for a fallback code if needed insns are not available). |
| 15 | # then the xor'ing operation to generate next W[0..3] is: | 15 | # This code uses them to calculate W[] ahead of time. |
| 16 | # movaps %xmm0, %xmmT2 | 16 | # |
| 17 | # palignr $0x8, %xmm4, %xmmT2 # form (w[0x2],w[0x3],w[0x4],w[0x5]) | 17 | # Unfortunately, results are passed from vector unit to |
| 18 | # # Right-shifts xmm4:xmmT2 by 8 bytes. Writes shifted result to xmmT2. SSSE3 insn. | 18 | # integer ALUs on the stack. MOVD/Q insns to move them directly |
| 19 | # movaps %xmm0, %xmmT13 | 19 | # from vector to integer registers are slower than store-to-load |
| 20 | # palignr $0x4,%xmm0,%xmmT13 # form (w[0xd],w[0xe],w[0xf],w[0x0]) | 20 | # forwarding in LSU (on Skylake at least). |
| 21 | # xmm0 = xmm0 ^ t2 ^ xmm8 ^ t13 | 21 | # |
| 22 | # xmm0 = rol32(xmm0,1) # no such insn, have to use pslld+psrld+or | 22 | # The win against a purely integer code is small on Skylake, |
| 23 | # and then results can be extracted for use: | 23 | # only about 7-8%. We offload about 1/3 of our operations to the vector unit. |
| 24 | # movd %xmm0, %esi # new W[0] | 24 | # It can do 4 ops at once in one 128-bit register, |
| 25 | # pextrd $1, %xmm0, %esi # new W[1] | 25 | # but we have to use x2 of them because of W[0] complication, |
| 26 | # # SSE4.1 insn. Can use EXTRACTPS (also SSE4.1) | 26 | # SSE2 has no "rotate each word by N bits" insns, |
| 27 | # pextrd $2, %xmm0, %esi # new W[2] | 27 | # moving data to/from vector unit is clunky, and Skylake |
| 28 | # pextrd $3, %xmm0, %esi # new W[3] | 28 | # has four integer ALUs unified with three vector ALUs, |
| 29 | # ... but this requires SSE4.1 and SSSE3, which are not universally available on x86-64. | 29 | # which makes pure integer code rather fast, and makes |
| 30 | # vector ops compete with integer ones. | ||
| 31 | # | ||
| 32 | # Zen3, with its separate vector ALUs, wins more, about 12%. | ||
| 33 | |||
| 34 | xmmT1="%xmm4" | ||
| 35 | xmmT2="%xmm5" | ||
| 36 | xmmRCONST="%xmm6" | ||
| 37 | T=`printf '\t'` | ||
| 38 | |||
| 39 | # SSE instructions are longer than 4 bytes on average. | ||
| 40 | # Intel CPUs (up to Tiger Lake at least) can't decode | ||
| 41 | # more than 16 bytes of code in one cycle. | ||
| 42 | # By interleaving SSE code and integer code | ||
| 43 | # we mostly achieve a situation where 16-byte decode fetch window | ||
| 44 | # contains 4 (or more) insns. | ||
| 45 | # | ||
| 46 | # However. On Skylake, there was no observed difference, | ||
| 47 | # but on Zen3, non-interleaved code is ~3% faster | ||
| 48 | # (822 Mb/s versus 795 Mb/s hashing speed). | ||
| 49 | # Off for now: | ||
| 50 | interleave=false | ||
| 51 | |||
| 52 | INTERLEAVE() { | ||
| 53 | $interleave || \ | ||
| 54 | { | ||
| 55 | # Generate non-interleaved code | ||
| 56 | # (it should work correctly too) | ||
| 57 | echo "$1" | ||
| 58 | echo "$2" | ||
| 59 | return | ||
| 60 | } | ||
| 61 | ( | ||
| 62 | echo "$1" | grep -v '^$' >"$0.temp1" | ||
| 63 | echo "$2" | grep -v '^$' >"$0.temp2" | ||
| 64 | exec 3<"$0.temp1" | ||
| 65 | exec 4<"$0.temp2" | ||
| 66 | IFS='' | ||
| 67 | while :; do | ||
| 68 | line1='' | ||
| 69 | line2='' | ||
| 70 | while :; do | ||
| 71 | read -r line1 <&3 | ||
| 72 | if test "${line1:0:1}" != "#" && test "${line1:0:2}" != "$T#"; then | ||
| 73 | break | ||
| 74 | fi | ||
| 75 | echo "$line1" | ||
| 76 | done | ||
| 77 | while :; do | ||
| 78 | read -r line2 <&4 | ||
| 79 | if test "${line2:0:4}" = "${T}lea"; then | ||
| 80 | # We use 7-8 byte long forms of LEA. | ||
| 81 | # Do not interleave them with SSE insns | ||
| 82 | # which are also long. | ||
| 83 | echo "$line2" | ||
| 84 | read -r line2 <&4 | ||
| 85 | echo "$line2" | ||
| 86 | continue | ||
| 87 | fi | ||
| 88 | if test "${line2:0:1}" != "#" && test "${line2:0:2}" != "$T#"; then | ||
| 89 | break | ||
| 90 | fi | ||
| 91 | echo "$line2" | ||
| 92 | done | ||
| 93 | test "$line1$line2" || break | ||
| 94 | echo "$line1" | ||
| 95 | echo "$line2" | ||
| 96 | done | ||
| 97 | rm "$0.temp1" "$0.temp2" | ||
| 98 | ) | ||
| 99 | } | ||
| 30 | 100 | ||
| 31 | echo \ | 101 | echo \ |
| 32 | '### Generated by hash_md5_sha_x86-64.S.sh ### | 102 | "### Generated by hash_md5_sha_x86-64.S.sh ### |
| 33 | 103 | ||
| 34 | #if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) | 104 | #if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) |
| 35 | .section .text.sha1_process_block64,"ax",@progbits | 105 | .section .text.sha1_process_block64,\"ax\",@progbits |
| 36 | .globl sha1_process_block64 | 106 | .globl sha1_process_block64 |
| 37 | .hidden sha1_process_block64 | 107 | .hidden sha1_process_block64 |
| 38 | .type sha1_process_block64, @function | 108 | .type sha1_process_block64, @function |
| @@ -51,16 +121,10 @@ sha1_process_block64: | |||
| 51 | # eax..edx: a..d | 121 | # eax..edx: a..d |
| 52 | # ebp: e | 122 | # ebp: e |
| 53 | # esi,edi: temps | 123 | # esi,edi: temps |
| 54 | # -32+4*n(%rsp),r8...r15: W[0..7,8..15] | 124 | # xmm0..xmm3: W[] |
| 55 | # (TODO: actually W[0..7] are used a bit more often, put _them_ into r8..r15?) | 125 | # xmm4,xmm5: temps |
| 56 | movl $3, %eax | 126 | # xmm6: current round constant |
| 57 | 1: | 127 | # -64(%rsp): area for passing RCONST + W[] from vector to integer units |
| 58 | movq (%rdi,%rax,8), %rsi | ||
| 59 | bswapq %rsi | ||
| 60 | rolq $32, %rsi | ||
| 61 | movq %rsi, -32(%rsp,%rax,8) | ||
| 62 | decl %eax | ||
| 63 | jns 1b | ||
| 64 | 128 | ||
| 65 | movl 80(%rdi), %eax # a = ctx->hash[0] | 129 | movl 80(%rdi), %eax # a = ctx->hash[0] |
| 66 | movl 84(%rdi), %ebx # b = ctx->hash[1] | 130 | movl 84(%rdi), %ebx # b = ctx->hash[1] |
| @@ -68,32 +132,120 @@ sha1_process_block64: | |||
| 68 | movl 92(%rdi), %edx # d = ctx->hash[3] | 132 | movl 92(%rdi), %edx # d = ctx->hash[3] |
| 69 | movl 96(%rdi), %ebp # e = ctx->hash[4] | 133 | movl 96(%rdi), %ebp # e = ctx->hash[4] |
| 70 | 134 | ||
| 135 | movaps rconst0x5A827999(%rip), $xmmRCONST | ||
| 136 | |||
| 137 | # For round 1, steps 0 and 8..15, we pass W[0,8..15] in esi,r8..r15 | ||
| 138 | # instead of spilling them to stack. | ||
| 139 | # (We lose parallelized addition of RCONST, but LEA | ||
| 140 | # can do two additions at once, so...) | ||
| 141 | movq 4*0(%rdi), %rsi | ||
| 142 | movq 4*2(%rdi), %r10 | ||
| 143 | bswapq %rsi | ||
| 144 | bswapq %r10 | ||
| 145 | rolq \$32, %rsi # rsi = W[1]:W[0] | ||
| 146 | rolq \$32, %r10 | ||
| 147 | movq %rsi, %xmm0 | ||
| 148 | movq %r10, $xmmT1 | ||
| 149 | punpcklqdq $xmmT1, %xmm0 # xmm0 = r10:rsi = (W[0],W[1],W[2],W[3]) | ||
| 150 | movaps %xmm0, $xmmT1 | ||
| 151 | paddd $xmmRCONST, $xmmT1 | ||
| 152 | movups $xmmT1, -64+4*0(%rsp) | ||
| 153 | |||
| 154 | movq 4*4(%rdi), %r8 | ||
| 155 | movq 4*6(%rdi), %r10 | ||
| 156 | bswapq %r8 | ||
| 157 | bswapq %r10 | ||
| 158 | rolq \$32, %r8 | ||
| 159 | rolq \$32, %r10 | ||
| 160 | movq %r8, %xmm1 | ||
| 161 | movq %r10, $xmmT1 | ||
| 162 | punpcklqdq $xmmT1, %xmm1 # xmm1 = r10:r8 = (W[4],W[5],W[6],W[7]) | ||
| 163 | movaps %xmm1, $xmmT1 | ||
| 164 | paddd $xmmRCONST, $xmmT1 | ||
| 165 | movups $xmmT1, -64+4*4(%rsp) | ||
| 166 | |||
| 71 | movq 4*8(%rdi), %r8 | 167 | movq 4*8(%rdi), %r8 |
| 72 | movq 4*10(%rdi), %r10 | 168 | movq 4*10(%rdi), %r10 |
| 73 | bswapq %r8 | 169 | bswapq %r8 |
| 74 | bswapq %r10 | 170 | bswapq %r10 |
| 171 | movl %r8d, %r9d # r9d = W[9] | ||
| 172 | rolq \$32, %r8 # r8 = W[9]:W[8] | ||
| 173 | movl %r10d, %r11d # r11d = W[11] | ||
| 174 | rolq \$32, %r10 # r10 = W[11]:W[10] | ||
| 175 | movq %r8, %xmm2 | ||
| 176 | movq %r10, $xmmT1 | ||
| 177 | punpcklqdq $xmmT1, %xmm2 # xmm2 = r10:r8 = (W[8],W[9],W[10],W[11]) | ||
| 178 | |||
| 75 | movq 4*12(%rdi), %r12 | 179 | movq 4*12(%rdi), %r12 |
| 76 | movq 4*14(%rdi), %r14 | 180 | movq 4*14(%rdi), %r14 |
| 77 | bswapq %r12 | 181 | bswapq %r12 |
| 78 | bswapq %r14 | 182 | bswapq %r14 |
| 79 | movl %r8d, %r9d | 183 | movl %r12d, %r13d # r13d = W[13] |
| 80 | shrq $32, %r8 | 184 | rolq \$32, %r12 # r12 = W[13]:W[12] |
| 81 | movl %r10d, %r11d | 185 | movl %r14d, %r15d # r15d = W[15] |
| 82 | shrq $32, %r10 | 186 | rolq \$32, %r14 # r14 = W[15]:W[14] |
| 83 | movl %r12d, %r13d | 187 | movq %r12, %xmm3 |
| 84 | shrq $32, %r12 | 188 | movq %r14, $xmmT1 |
| 85 | movl %r14d, %r15d | 189 | punpcklqdq $xmmT1, %xmm3 # xmm3 = r14:r12 = (W[12],W[13],W[14],W[15]) |
| 86 | shrq $32, %r14 | 190 | " |
| 87 | ' | 191 | |
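For readers following the generated prologue above: the block bytes are big-endian 32-bit words, and plain SSE2 has no pshufb, so the byte swap is done in general-purpose registers (bswapq + rolq) before the words are packed into an xmm register with movq/punpcklqdq. Below is a rough C intrinsics sketch of what one 16-byte quarter load does; the helper name and the standalone `p` pointer are illustrative, not part of the patch.

```c
#include <emmintrin.h>  /* SSE2 */
#include <stdint.h>
#include <string.h>

/* Illustrative only: pack big-endian words p[0..15] into one xmm as
 * (W[0],W[1],W[2],W[3]) with W[0] in the lowest 32-bit lane, the same
 * way the bswapq/rolq/movq/punpcklqdq sequence above does. */
static __m128i load_be_quarter(const uint8_t *p)
{
	uint64_t lo, hi;
	memcpy(&lo, p + 0, 8);
	memcpy(&hi, p + 8, 8);
	lo = __builtin_bswap64(lo);    /* bswapq (GCC builtin here): W[0]:W[1] */
	hi = __builtin_bswap64(hi);
	lo = (lo << 32) | (lo >> 32);  /* rolq $32: W[1]:W[0], W[0] in low half */
	hi = (hi << 32) | (hi >> 32);
	return _mm_unpacklo_epi64(_mm_cvtsi64_si128((long long)lo),
	                          _mm_cvtsi64_si128((long long)hi));
}
```

For W[0..7] the generated code additionally pre-adds the round constant (paddd) and spills the sums to -64(%rsp); W[8..15] stay in r8..r15 for round 1 and only live in xmm2/xmm3 for the later PREP steps.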
| 88 | W32() { | 192 | PREP() { |
| 89 | test "$1" || exit 1 | 193 | local xmmW0=$1 |
| 90 | test "$1" -lt 0 && exit 1 | 194 | local xmmW4=$2 |
| 91 | test "$1" -gt 15 && exit 1 | 195 | local xmmW8=$3 |
| 92 | test "$1" -lt 8 && echo "-32+4*$1(%rsp)" | 196 | local xmmW12=$4 |
| 93 | test "$1" -ge 8 && echo "%r${1}d" | 197 | # the above must be %xmm0..3 in some permutation |
| 198 | local dstmem=$5 | ||
| 199 | #W[0] = rol(W[13] ^ W[8] ^ W[2] ^ W[0], 1); | ||
| 200 | #W[1] = rol(W[14] ^ W[9] ^ W[3] ^ W[1], 1); | ||
| 201 | #W[2] = rol(W[15] ^ W[10] ^ W[4] ^ W[2], 1); | ||
| 202 | #W[3] = rol( 0 ^ W[11] ^ W[5] ^ W[3], 1); | ||
| 203 | #W[3] ^= rol(W[0], 1); | ||
| 204 | echo "# PREP $@ | ||
| 205 | movaps $xmmW12, $xmmT1 | ||
| 206 | psrldq \$4, $xmmT1 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
| 207 | |||
| 208 | pshufd \$0x4e, $xmmW0, $xmmT2 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
| 209 | punpcklqdq $xmmW4, $xmmT2 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
| 210 | |||
| 211 | xorps $xmmW8, $xmmW0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
| 212 | xorps $xmmT1, $xmmT2 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
| 213 | xorps $xmmT2, $xmmW0 # ^ | ||
| 214 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
| 215 | movaps $xmmW0, $xmmT2 | ||
| 216 | |||
| 217 | xorps $xmmT1, $xmmT1 # rol(W0,1): | ||
| 218 | pcmpgtd $xmmW0, $xmmT1 # 0xffffffff for elements <0 (lanes with the msb set) | ||
| 219 | paddd $xmmW0, $xmmW0 # shift left by 1 | ||
| 220 | psubd $xmmT1, $xmmW0 # add 1 to the lanes that had the msb set | ||
| 221 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
| 222 | |||
| 223 | pslldq \$12, $xmmT2 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
| 224 | movaps $xmmT2, $xmmT1 | ||
| 225 | pslld \$2, $xmmT2 | ||
| 226 | psrld \$30, $xmmT1 | ||
| 227 | # xorps $xmmT1, $xmmT2 # rol((0,0,0,unrotW[0]),2) | ||
| 228 | xorps $xmmT1, $xmmW0 # same result, but does not depend on/does not modify T2 | ||
| 229 | |||
| 230 | xorps $xmmT2, $xmmW0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
| 231 | " | ||
| 232 | # movq $xmmW0, %r8 # high latency (~6 cycles) | ||
| 233 | # movaps $xmmW0, $xmmT1 | ||
| 234 | # psrldq \$8, $xmmT1 # rshift by 8 bytes: move upper 64 bits to lower | ||
| 235 | # movq $xmmT1, %r10 # high latency | ||
| 236 | # movq %r8, %r9 | ||
| 237 | # movq %r10, %r11 | ||
| 238 | # shrq \$32, %r9 | ||
| 239 | # shrq \$32, %r11 | ||
| 240 | # ^^^ slower than passing the results on the stack (!!!) | ||
| 241 | echo " | ||
| 242 | movaps $xmmW0, $xmmT2 | ||
| 243 | paddd $xmmRCONST, $xmmT2 | ||
| 244 | movups $xmmT2, $dstmem | ||
| 245 | " | ||
| 94 | } | 246 | } |
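As a cross-check of the PREP() block, here is the same 4-at-a-time schedule update written with SSE2 intrinsics. This is a sketch for reference only; the function name is mine, and the convention (four schedule quarters in four vectors, the result replacing the oldest quarter) mirrors how the generator rotates %xmm0..%xmm3.

```c
#include <emmintrin.h>  /* SSE2 */
#include <stdint.h>

/* Sketch of one PREP step: given the current schedule quarters
 * (W[n..n+3]),(W[n+4..n+7]),(W[n+8..n+11]),(W[n+12..n+15]) in w0..w12,
 * return rol(W[n+13]^W[n+8]^W[n+2]^W[n], 1) etc. for the next four
 * words, including the lane-3 fix-up done by the last few insns above. */
static __m128i sha1_prep_sse2(__m128i w0, __m128i w4, __m128i w8, __m128i w12)
{
	__m128i t1, t2, msb;

	t1 = _mm_srli_si128(w12, 4);            /* ([13],[14],[15],0)  */
	t2 = _mm_shuffle_epi32(w0, 0x4e);       /* ([2],[3],[0],[1])   */
	t2 = _mm_unpacklo_epi64(t2, w4);        /* ([2],[3],[4],[5])   */
	w0 = _mm_xor_si128(w0, w8);
	t2 = _mm_xor_si128(t2, t1);
	w0 = _mm_xor_si128(w0, t2);             /* un-rotated W[0..3], W[3] incomplete */
	t2 = w0;                                /* keep un-rotated copy for the fix-up */

	/* rol(x,1) == (x << 1) + (x >> 31); the msb is extracted via signed compare */
	msb = _mm_cmpgt_epi32(_mm_setzero_si128(), w0);
	w0 = _mm_add_epi32(w0, w0);
	w0 = _mm_sub_epi32(w0, msb);

	/* W[3] ^= rol(W[0],1), i.e. xor rol(un-rotated W[0], 2) into lane 3 */
	t2 = _mm_slli_si128(t2, 12);            /* (0,0,0,unrot W[0]) */
	t1 = _mm_srli_epi32(t2, 30);
	t2 = _mm_slli_epi32(t2, 2);
	w0 = _mm_xor_si128(w0, t1);
	return _mm_xor_si128(w0, t2);
}
```

The commented-out movq-to-GPR alternative above is what this replaces: moving the four results from xmm to r8..r11 pays several cycles of xmm-to-integer latency, so the generator instead pre-adds RCONST and stores RCONST+W[t] to the stack slot given as $dstmem, where the scalar rounds pick the values up with a single addl.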
| 95 | 247 | ||
| 96 | # It's possible to interleave insns in rounds to mostly eliminate | 248 | # It's possible to interleave integer insns in rounds to mostly eliminate |
| 97 | # dependency chains, but this is likely to only help old Pentium-based | 249 | # dependency chains, but this is likely to only help old Pentium-based |
| 98 | # CPUs (ones without OOO, which can only simultaneously execute a pair | 250 | # CPUs (ones without OOO, which can only simultaneously execute a pair |
| 99 | # of _adjacent_ insns). | 251 | # of _adjacent_ insns). |
| @@ -107,21 +259,16 @@ local n0=$(((n+0) & 15)) | |||
| 107 | echo " | 259 | echo " |
| 108 | # $n | 260 | # $n |
| 109 | ";test $n0 = 0 && echo " | 261 | ";test $n0 = 0 && echo " |
| 110 | # W[0], already in %esi | 262 | leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] |
| 111 | ";test $n0 != 0 && test $n0 -lt 8 && echo " | 263 | ";test $n0 != 0 && test $n0 -lt 8 && echo " |
| 112 | movl `W32 $n0`, %esi # W[n] | 264 | addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n] |
| 113 | ";test $n0 -ge 8 && echo " | 265 | ";test $n0 -ge 8 && echo " |
| 114 | # W[n], in %r$n0 | 266 | leal $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n] |
| 115 | ";echo " | 267 | ";echo " |
| 116 | movl %e$c, %edi # c | 268 | movl %e$c, %edi # c |
| 117 | xorl %e$d, %edi # ^d | 269 | xorl %e$d, %edi # ^d |
| 118 | andl %e$b, %edi # &b | 270 | andl %e$b, %edi # &b |
| 119 | xorl %e$d, %edi # (((c ^ d) & b) ^ d) | 271 | xorl %e$d, %edi # (((c ^ d) & b) ^ d) |
| 120 | ";test $n0 -lt 8 && echo " | ||
| 121 | leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] | ||
| 122 | ";test $n0 -ge 8 && echo " | ||
| 123 | leal $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n] | ||
| 124 | ";echo " | ||
| 125 | addl %edi, %e$e # e += (((c ^ d) & b) ^ d) | 272 | addl %edi, %e$e # e += (((c ^ d) & b) ^ d) |
| 126 | movl %e$a, %esi # | 273 | movl %e$a, %esi # |
| 127 | roll \$5, %esi # rotl32(a,5) | 274 | roll \$5, %esi # rotl32(a,5) |
| @@ -138,28 +285,11 @@ local n2=$(((n+2) & 15)) | |||
| 138 | local n0=$(((n+0) & 15)) | 285 | local n0=$(((n+0) & 15)) |
| 139 | echo " | 286 | echo " |
| 140 | # $n | 287 | # $n |
| 141 | ";test $n0 -lt 8 && echo " | ||
| 142 | movl `W32 $n13`, %esi # W[(n+13) & 15] | ||
| 143 | xorl `W32 $n8`, %esi # ^W[(n+8) & 15] | ||
| 144 | xorl `W32 $n2`, %esi # ^W[(n+2) & 15] | ||
| 145 | xorl `W32 $n0`, %esi # ^W[n & 15] | ||
| 146 | roll %esi # | ||
| 147 | movl %esi, `W32 $n0` # store to W[n & 15] | ||
| 148 | ";test $n0 -ge 8 && echo " | ||
| 149 | xorl `W32 $n13`, `W32 $n0` # W[n & 15] ^= W[(n+13) & 15] | ||
| 150 | xorl `W32 $n8`, `W32 $n0` # ^W[(n+8) & 15] | ||
| 151 | xorl `W32 $n2`, `W32 $n0` # ^W[(n+2) & 15] | ||
| 152 | roll `W32 $n0` # | ||
| 153 | ";echo " | ||
| 154 | movl %e$c, %edi # c | 288 | movl %e$c, %edi # c |
| 155 | xorl %e$d, %edi # ^d | 289 | xorl %e$d, %edi # ^d |
| 156 | andl %e$b, %edi # &b | 290 | andl %e$b, %edi # &b |
| 157 | xorl %e$d, %edi # (((c ^ d) & b) ^ d) | 291 | xorl %e$d, %edi # (((c ^ d) & b) ^ d) |
| 158 | ";test $n0 -lt 8 && echo " | 292 | addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n & 15] |
| 159 | leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n & 15] | ||
| 160 | ";test $n0 -ge 8 && echo " | ||
| 161 | leal $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n & 15] | ||
| 162 | ";echo " | ||
| 163 | addl %edi, %e$e # e += (((c ^ d) & b) ^ d) | 293 | addl %edi, %e$e # e += (((c ^ d) & b) ^ d) |
| 164 | movl %e$a, %esi # | 294 | movl %e$a, %esi # |
| 165 | roll \$5, %esi # rotl32(a,5) | 295 | roll \$5, %esi # rotl32(a,5) |
| @@ -167,13 +297,6 @@ echo " | |||
| 167 | rorl \$2, %e$b # b = rotl32(b,30) | 297 | rorl \$2, %e$b # b = rotl32(b,30) |
| 168 | " | 298 | " |
| 169 | } | 299 | } |
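For reference, each RD1* invocation above expands to one plain SHA-1 round for t < 20; in C terms it is the step below (a model, not code from the patch). The generated variants differ only in where W[t] comes from: a lea with %esi/r8..r15, or an addl of the RCONST+W[t] value pre-added on the stack.

```c
#include <stdint.h>

#define ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))

/* One round-1 step (t = 0..19), matching the arithmetic of the RD1A/RD1B
 * bodies: f = ch(b,c,d) = (((c ^ d) & b) ^ d). */
static void sha1_rd1(uint32_t a, uint32_t *b, uint32_t c, uint32_t d,
                     uint32_t *e, uint32_t wt)
{
	*e += 0x5A827999 + wt + (((c ^ d) & *b) ^ d);
	*e += ROTL32(a, 5);
	*b = ROTL32(*b, 30);
}
```

The five-way rotation of the argument lists in the driver (`RD1A ax bx cx dx bp 0; RD1A bp ax bx cx dx 1; ...`) plays the role of the usual a=e, e=d, ... shuffling, so no register moves are emitted per round.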
| 170 | { | ||
| 171 | RCONST=0x5A827999 | ||
| 172 | RD1A ax bx cx dx bp 0; RD1A bp ax bx cx dx 1; RD1A dx bp ax bx cx 2; RD1A cx dx bp ax bx 3; RD1A bx cx dx bp ax 4 | ||
| 173 | RD1A ax bx cx dx bp 5; RD1A bp ax bx cx dx 6; RD1A dx bp ax bx cx 7; RD1A cx dx bp ax bx 8; RD1A bx cx dx bp ax 9 | ||
| 174 | RD1A ax bx cx dx bp 10; RD1A bp ax bx cx dx 11; RD1A dx bp ax bx cx 12; RD1A cx dx bp ax bx 13; RD1A bx cx dx bp ax 14 | ||
| 175 | RD1A ax bx cx dx bp 15; RD1B bp ax bx cx dx 16; RD1B dx bp ax bx cx 17; RD1B cx dx bp ax bx 18; RD1B bx cx dx bp ax 19 | ||
| 176 | } | grep -v '^$' | ||
| 177 | 300 | ||
| 178 | RD2() { | 301 | RD2() { |
| 179 | local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 | 302 | local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 |
| @@ -184,27 +307,10 @@ local n2=$(((n+2) & 15)) | |||
| 184 | local n0=$(((n+0) & 15)) | 307 | local n0=$(((n+0) & 15)) |
| 185 | echo " | 308 | echo " |
| 186 | # $n | 309 | # $n |
| 187 | ";test $n0 -lt 8 && echo " | ||
| 188 | movl `W32 $n13`, %esi # W[(n+13) & 15] | ||
| 189 | xorl `W32 $n8`, %esi # ^W[(n+8) & 15] | ||
| 190 | xorl `W32 $n2`, %esi # ^W[(n+2) & 15] | ||
| 191 | xorl `W32 $n0`, %esi # ^W[n & 15] | ||
| 192 | roll %esi # | ||
| 193 | movl %esi, `W32 $n0` # store to W[n & 15] | ||
| 194 | ";test $n0 -ge 8 && echo " | ||
| 195 | xorl `W32 $n13`, `W32 $n0` # W[n & 15] ^= W[(n+13) & 15] | ||
| 196 | xorl `W32 $n8`, `W32 $n0` # ^W[(n+8) & 15] | ||
| 197 | xorl `W32 $n2`, `W32 $n0` # ^W[(n+2) & 15] | ||
| 198 | roll `W32 $n0` # | ||
| 199 | ";echo " | ||
| 200 | movl %e$c, %edi # c | 310 | movl %e$c, %edi # c |
| 201 | xorl %e$d, %edi # ^d | 311 | xorl %e$d, %edi # ^d |
| 202 | xorl %e$b, %edi # ^b | 312 | xorl %e$b, %edi # ^b |
| 203 | ";test $n0 -lt 8 && echo " | 313 | addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n & 15] |
| 204 | leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n & 15] | ||
| 205 | ";test $n0 -ge 8 && echo " | ||
| 206 | leal $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n & 15] | ||
| 207 | ";echo " | ||
| 208 | addl %edi, %e$e # e += (c ^ d ^ b) | 314 | addl %edi, %e$e # e += (c ^ d ^ b) |
| 209 | movl %e$a, %esi # | 315 | movl %e$a, %esi # |
| 210 | roll \$5, %esi # rotl32(a,5) | 316 | roll \$5, %esi # rotl32(a,5) |
| @@ -212,13 +318,6 @@ echo " | |||
| 212 | rorl \$2, %e$b # b = rotl32(b,30) | 318 | rorl \$2, %e$b # b = rotl32(b,30) |
| 213 | " | 319 | " |
| 214 | } | 320 | } |
| 215 | { | ||
| 216 | RCONST=0x6ED9EBA1 | ||
| 217 | RD2 ax bx cx dx bp 20; RD2 bp ax bx cx dx 21; RD2 dx bp ax bx cx 22; RD2 cx dx bp ax bx 23; RD2 bx cx dx bp ax 24 | ||
| 218 | RD2 ax bx cx dx bp 25; RD2 bp ax bx cx dx 26; RD2 dx bp ax bx cx 27; RD2 cx dx bp ax bx 28; RD2 bx cx dx bp ax 29 | ||
| 219 | RD2 ax bx cx dx bp 30; RD2 bp ax bx cx dx 31; RD2 dx bp ax bx cx 32; RD2 cx dx bp ax bx 33; RD2 bx cx dx bp ax 34 | ||
| 220 | RD2 ax bx cx dx bp 35; RD2 bp ax bx cx dx 36; RD2 dx bp ax bx cx 37; RD2 cx dx bp ax bx 38; RD2 bx cx dx bp ax 39 | ||
| 221 | } | grep -v '^$' | ||
| 222 | 321 | ||
| 223 | RD3() { | 322 | RD3() { |
| 224 | local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 | 323 | local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 |
| @@ -235,53 +334,82 @@ echo " | |||
| 235 | andl %e$c, %esi # si: b & c | 334 | andl %e$c, %esi # si: b & c |
| 236 | andl %e$d, %edi # di: (b | c) & d | 335 | andl %e$d, %edi # di: (b | c) & d |
| 237 | orl %esi, %edi # ((b | c) & d) | (b & c) | 336 | orl %esi, %edi # ((b | c) & d) | (b & c) |
| 238 | ";test $n0 -lt 8 && echo " | ||
| 239 | movl `W32 $n13`, %esi # W[(n+13) & 15] | ||
| 240 | xorl `W32 $n8`, %esi # ^W[(n+8) & 15] | ||
| 241 | xorl `W32 $n2`, %esi # ^W[(n+2) & 15] | ||
| 242 | xorl `W32 $n0`, %esi # ^W[n & 15] | ||
| 243 | roll %esi # | ||
| 244 | movl %esi, `W32 $n0` # store to W[n & 15] | ||
| 245 | ";test $n0 -ge 8 && echo " | ||
| 246 | xorl `W32 $n13`, `W32 $n0` # W[n & 15] ^= W[(n+13) & 15] | ||
| 247 | xorl `W32 $n8`, `W32 $n0` # ^W[(n+8) & 15] | ||
| 248 | xorl `W32 $n2`, `W32 $n0` # ^W[(n+2) & 15] | ||
| 249 | roll `W32 $n0` # | ||
| 250 | ";echo " | ||
| 251 | addl %edi, %e$e # += ((b | c) & d) | (b & c) | 337 | addl %edi, %e$e # += ((b | c) & d) | (b & c) |
| 252 | ";test $n0 -lt 8 && echo " | 338 | addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n & 15] |
| 253 | leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n & 15] | ||
| 254 | ";test $n0 -ge 8 && echo " | ||
| 255 | leal $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n & 15] | ||
| 256 | ";echo " | ||
| 257 | movl %e$a, %esi # | 339 | movl %e$a, %esi # |
| 258 | roll \$5, %esi # rotl32(a,5) | 340 | roll \$5, %esi # rotl32(a,5) |
| 259 | addl %esi, %e$e # e += rotl32(a,5) | 341 | addl %esi, %e$e # e += rotl32(a,5) |
| 260 | rorl \$2, %e$b # b = rotl32(b,30) | 342 | rorl \$2, %e$b # b = rotl32(b,30) |
| 261 | " | 343 | " |
| 262 | } | 344 | } |
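RD2 and RD3 differ from the round-1 step only in the boolean function (and in RCONST); as a quick reference, again a C model rather than patch code:

```c
#include <stdint.h>

/* Boolean functions used by the three generated round bodies:
 *   RD1A/RD1B (t  0..19):         ch       = (((c ^ d) & b) ^ d)
 *   RD2       (t 20..39, 60..79): parity   = c ^ d ^ b
 *   RD3       (t 40..59):         majority = ((b | c) & d) | (b & c)
 * The rest of each step (e += K + W[t] + f; e += rotl32(a,5);
 * b = rotl32(b,30)) is identical. */
static uint32_t f_parity(uint32_t b, uint32_t c, uint32_t d)
{
	return c ^ d ^ b;
}

static uint32_t f_majority(uint32_t b, uint32_t c, uint32_t d)
{
	return ((b | c) & d) | (b & c);
}
```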
| 345 | |||
| 263 | { | 346 | { |
| 264 | #RCONST=0x8F1BBCDC "out of range for signed 32bit displacement" | 347 | # Round 1 |
| 265 | RCONST=-0x70E44324 | 348 | RCONST=0x5A827999 |
| 266 | RD3 ax bx cx dx bp 40; RD3 bp ax bx cx dx 41; RD3 dx bp ax bx cx 42; RD3 cx dx bp ax bx 43; RD3 bx cx dx bp ax 44 | 349 | RD1A ax bx cx dx bp 0; RD1A bp ax bx cx dx 1; RD1A dx bp ax bx cx 2; RD1A cx dx bp ax bx 3; |
| 267 | RD3 ax bx cx dx bp 45; RD3 bp ax bx cx dx 46; RD3 dx bp ax bx cx 47; RD3 cx dx bp ax bx 48; RD3 bx cx dx bp ax 49 | 350 | RD1A bx cx dx bp ax 4; RD1A ax bx cx dx bp 5; RD1A bp ax bx cx dx 6; RD1A dx bp ax bx cx 7; |
| 268 | RD3 ax bx cx dx bp 50; RD3 bp ax bx cx dx 51; RD3 dx bp ax bx cx 52; RD3 cx dx bp ax bx 53; RD3 bx cx dx bp ax 54 | 351 | a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` |
| 269 | RD3 ax bx cx dx bp 55; RD3 bp ax bx cx dx 56; RD3 dx bp ax bx cx 57; RD3 cx dx bp ax bx 58; RD3 bx cx dx bp ax 59 | 352 | b=`RD1A cx dx bp ax bx 8; RD1A bx cx dx bp ax 9; RD1A ax bx cx dx bp 10; RD1A bp ax bx cx dx 11;` |
| 270 | } | grep -v '^$' | 353 | INTERLEAVE "$a" "$b" |
| 354 | a=`echo " movaps rconst0x6ED9EBA1(%rip), $xmmRCONST" | ||
| 355 | PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` | ||
| 356 | b=`RD1A dx bp ax bx cx 12; RD1A cx dx bp ax bx 13; RD1A bx cx dx bp ax 14; RD1A ax bx cx dx bp 15;` | ||
| 357 | INTERLEAVE "$a" "$b" | ||
| 358 | a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` | ||
| 359 | b=`RD1B bp ax bx cx dx 16; RD1B dx bp ax bx cx 17; RD1B cx dx bp ax bx 18; RD1B bx cx dx bp ax 19;` | ||
| 360 | INTERLEAVE "$a" "$b" | ||
| 361 | |||
| 362 | # Round 2 | ||
| 363 | RCONST=0x6ED9EBA1 | ||
| 364 | a=`PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` | ||
| 365 | b=`RD2 ax bx cx dx bp 20; RD2 bp ax bx cx dx 21; RD2 dx bp ax bx cx 22; RD2 cx dx bp ax bx 23;` | ||
| 366 | INTERLEAVE "$a" "$b" | ||
| 367 | a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` | ||
| 368 | b=`RD2 bx cx dx bp ax 24; RD2 ax bx cx dx bp 25; RD2 bp ax bx cx dx 26; RD2 dx bp ax bx cx 27;` | ||
| 369 | INTERLEAVE "$a" "$b" | ||
| 370 | a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` | ||
| 371 | b=`RD2 cx dx bp ax bx 28; RD2 bx cx dx bp ax 29; RD2 ax bx cx dx bp 30; RD2 bp ax bx cx dx 31;` | ||
| 372 | INTERLEAVE "$a" "$b" | ||
| 373 | a=`echo " movaps rconst0x8F1BBCDC(%rip), $xmmRCONST" | ||
| 374 | PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` | ||
| 375 | b=`RD2 dx bp ax bx cx 32; RD2 cx dx bp ax bx 33; RD2 bx cx dx bp ax 34; RD2 ax bx cx dx bp 35;` | ||
| 376 | INTERLEAVE "$a" "$b" | ||
| 377 | a=`PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` | ||
| 378 | b=`RD2 bp ax bx cx dx 36; RD2 dx bp ax bx cx 37; RD2 cx dx bp ax bx 38; RD2 bx cx dx bp ax 39;` | ||
| 379 | INTERLEAVE "$a" "$b" | ||
| 380 | |||
| 381 | # Round 3 | ||
| 382 | RCONST=0x8F1BBCDC | ||
| 383 | a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` | ||
| 384 | b=`RD3 ax bx cx dx bp 40; RD3 bp ax bx cx dx 41; RD3 dx bp ax bx cx 42; RD3 cx dx bp ax bx 43;` | ||
| 385 | INTERLEAVE "$a" "$b" | ||
| 386 | a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` | ||
| 387 | b=`RD3 bx cx dx bp ax 44; RD3 ax bx cx dx bp 45; RD3 bp ax bx cx dx 46; RD3 dx bp ax bx cx 47;` | ||
| 388 | INTERLEAVE "$a" "$b" | ||
| 389 | a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` | ||
| 390 | b=`RD3 cx dx bp ax bx 48; RD3 bx cx dx bp ax 49; RD3 ax bx cx dx bp 50; RD3 bp ax bx cx dx 51;` | ||
| 391 | INTERLEAVE "$a" "$b" | ||
| 392 | a=`echo " movaps rconst0xCA62C1D6(%rip), $xmmRCONST" | ||
| 393 | PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` | ||
| 394 | b=`RD3 dx bp ax bx cx 52; RD3 cx dx bp ax bx 53; RD3 bx cx dx bp ax 54; RD3 ax bx cx dx bp 55;` | ||
| 395 | INTERLEAVE "$a" "$b" | ||
| 396 | a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` | ||
| 397 | b=`RD3 bp ax bx cx dx 56; RD3 dx bp ax bx cx 57; RD3 cx dx bp ax bx 58; RD3 bx cx dx bp ax 59;` | ||
| 398 | INTERLEAVE "$a" "$b" | ||
| 271 | 399 | ||
| 272 | # Round 4 has the same logic as round 2, only n and RCONST are different | 400 | # Round 4 has the same logic as round 2, only n and RCONST are different |
| 273 | { | 401 | RCONST=0xCA62C1D6 |
| 274 | #RCONST=0xCA62C1D6 "out of range for signed 32bit displacement" | 402 | a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` |
| 275 | RCONST=-0x359D3E2A | 403 | b=`RD2 ax bx cx dx bp 60; RD2 bp ax bx cx dx 61; RD2 dx bp ax bx cx 62; RD2 cx dx bp ax bx 63;` |
| 276 | RD2 ax bx cx dx bp 60; RD2 bp ax bx cx dx 61; RD2 dx bp ax bx cx 62; RD2 cx dx bp ax bx 63; RD2 bx cx dx bp ax 64 | 404 | INTERLEAVE "$a" "$b" |
| 277 | RD2 ax bx cx dx bp 65; RD2 bp ax bx cx dx 66; RD2 dx bp ax bx cx 67; RD2 cx dx bp ax bx 68; RD2 bx cx dx bp ax 69 | 405 | a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` |
| 278 | RD2 ax bx cx dx bp 70; RD2 bp ax bx cx dx 71; RD2 dx bp ax bx cx 72; RD2 cx dx bp ax bx 73; RD2 bx cx dx bp ax 74 | 406 | b=`RD2 bx cx dx bp ax 64; RD2 ax bx cx dx bp 65; RD2 bp ax bx cx dx 66; RD2 dx bp ax bx cx 67;` |
| 279 | RD2 ax bx cx dx bp 75; RD2 bp ax bx cx dx 76; RD2 dx bp ax bx cx 77; RD2 cx dx bp ax bx 78; RD2 bx cx dx bp ax 79 | 407 | INTERLEAVE "$a" "$b" |
| 280 | # Note: new W[n&15] values generated in last 3 iterations | 408 | a=`PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` |
| 281 | # (W[13,14,15]) are unused after each of these iterations. | 409 | b=`RD2 cx dx bp ax bx 68; RD2 bx cx dx bp ax 69; RD2 ax bx cx dx bp 70; RD2 bp ax bx cx dx 71;` |
| 282 | # Since we use r8..r15 for W[8..15], this does not matter. | 410 | INTERLEAVE "$a" "$b" |
| 283 | # If we switch to e.g. using r8..r15 for W[0..7], then saving of W[13,14,15] | 411 | RD2 dx bp ax bx cx 72; RD2 cx dx bp ax bx 73; RD2 bx cx dx bp ax 74; RD2 ax bx cx dx bp 75; |
| 284 | # (the "movl %esi, `W32 $n0`" insn) is a dead store and can be removed. | 412 | RD2 bp ax bx cx dx 76; RD2 dx bp ax bx cx 77; RD2 cx dx bp ax bx 78; RD2 bx cx dx bp ax 79; |
| 285 | } | grep -v '^$' | 413 | } | grep -v '^$' |
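One detail implied by the removed lines: the old generator folded RCONST into each scalar step as a LEA displacement, which is sign-extended, hence the -0x70E44324 / -0x359D3E2A workarounds for 0x8F1BBCDC and 0xCA62C1D6. In the new code the constant reaches rounds 2..4 through $xmmRCONST and PREP's paddd, so the positive values can be kept. A minimal sketch of that pre-add, with a hypothetical helper name:

```c
#include <emmintrin.h>
#include <stdint.h>

/* Store K+W[t..t+3] into one 16-byte stack slot, as PREP's tail does
 * with paddd/movups; the scalar rounds then need only "addl slot, %e". */
static void spill_k_plus_w(uint32_t slot[4], __m128i w, uint32_t k)
{
	__m128i k4 = _mm_set1_epi32((int32_t)k);  /* one row of the rconst* table below */
	_mm_storeu_si128((__m128i *)slot, _mm_add_epi32(w, k4));
}
```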
| 286 | 414 | ||
| 287 | echo " | 415 | echo " |
| @@ -300,4 +428,28 @@ echo " | |||
| 300 | 428 | ||
| 301 | ret | 429 | ret |
| 302 | .size sha1_process_block64, .-sha1_process_block64 | 430 | .size sha1_process_block64, .-sha1_process_block64 |
| 431 | |||
| 432 | .section .rodata.cst16.sha1const, \"aM\", @progbits, 16 | ||
| 433 | .align 16 | ||
| 434 | rconst0x5A827999: | ||
| 435 | .long 0x5A827999 | ||
| 436 | .long 0x5A827999 | ||
| 437 | .long 0x5A827999 | ||
| 438 | .long 0x5A827999 | ||
| 439 | rconst0x6ED9EBA1: | ||
| 440 | .long 0x6ED9EBA1 | ||
| 441 | .long 0x6ED9EBA1 | ||
| 442 | .long 0x6ED9EBA1 | ||
| 443 | .long 0x6ED9EBA1 | ||
| 444 | rconst0x8F1BBCDC: | ||
| 445 | .long 0x8F1BBCDC | ||
| 446 | .long 0x8F1BBCDC | ||
| 447 | .long 0x8F1BBCDC | ||
| 448 | .long 0x8F1BBCDC | ||
| 449 | rconst0xCA62C1D6: | ||
| 450 | .long 0xCA62C1D6 | ||
| 451 | .long 0xCA62C1D6 | ||
| 452 | .long 0xCA62C1D6 | ||
| 453 | .long 0xCA62C1D6 | ||
| 454 | |||
| 303 | #endif" | 455 | #endif" |
