aboutsummaryrefslogtreecommitdiff
path: root/libbb/hash_md5_sha256_x86-32_shaNI.S
diff options
context:
space:
mode:
Diffstat (limited to 'libbb/hash_md5_sha256_x86-32_shaNI.S')
-rw-r--r--libbb/hash_md5_sha256_x86-32_shaNI.S92
1 files changed, 48 insertions, 44 deletions
diff --git a/libbb/hash_md5_sha256_x86-32_shaNI.S b/libbb/hash_md5_sha256_x86-32_shaNI.S
index aa68193bd..3905bad9a 100644
--- a/libbb/hash_md5_sha256_x86-32_shaNI.S
+++ b/libbb/hash_md5_sha256_x86-32_shaNI.S
@@ -4,7 +4,7 @@
4// We use shorter insns, even though they are for "wrong" 4// We use shorter insns, even though they are for "wrong"
5// data type (fp, not int). 5// data type (fp, not int).
6// For Intel, there is no penalty for doing it at all 6// For Intel, there is no penalty for doing it at all
7// (CPUs which do have such penalty do not support SHA1 insns). 7// (CPUs which do have such penalty do not support SHA insns).
8// For AMD, the penalty is one extra cycle 8// For AMD, the penalty is one extra cycle
9// (allegedly: I failed to find measurable difference). 9// (allegedly: I failed to find measurable difference).
10 10
@@ -15,6 +15,10 @@
15//#define shuf128_32 pshufd 15//#define shuf128_32 pshufd
16#define shuf128_32 shufps 16#define shuf128_32 shufps
17 17
18// pshufb and palignr are SSSE3 insns.
19// We do not check SSSE3 in cpuid,
20// all SHA-capable CPUs support it as well.
21
18 .section .text.sha256_process_block64_shaNI, "ax", @progbits 22 .section .text.sha256_process_block64_shaNI, "ax", @progbits
19 .globl sha256_process_block64_shaNI 23 .globl sha256_process_block64_shaNI
20 .hidden sha256_process_block64_shaNI 24 .hidden sha256_process_block64_shaNI
@@ -39,12 +43,13 @@
39 .balign 8 # allow decoders to fetch at least 2 first insns 43 .balign 8 # allow decoders to fetch at least 2 first insns
40sha256_process_block64_shaNI: 44sha256_process_block64_shaNI:
41 45
42 movu128 76+0*16(%eax), XMMTMP /* DCBA (msb-to-lsb: 3,2,1,0) */ 46 movu128 76+0*16(%eax), XMMTMP /* ABCD (little-endian dword order) */
43 movu128 76+1*16(%eax), STATE1 /* HGFE */ 47 movu128 76+1*16(%eax), STATE1 /* EFGH */
44/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ 48/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */
45 mova128 STATE1, STATE0 49 mova128 STATE1, STATE0
46 shufps SHUF(1,0,1,0), XMMTMP, STATE0 /* ABEF */ 50 /* --- -------------- ABCD -- EFGH */
47 shufps SHUF(3,2,3,2), XMMTMP, STATE1 /* CDGH */ 51 shufps SHUF(1,0,1,0), XMMTMP, STATE0 /* FEBA */
52 shufps SHUF(3,2,3,2), XMMTMP, STATE1 /* HGDC */
48 53
49/* XMMTMP holds flip mask from here... */ 54/* XMMTMP holds flip mask from here... */
50 mova128 PSHUFFLE_BSWAP32_FLIP_MASK, XMMTMP 55 mova128 PSHUFFLE_BSWAP32_FLIP_MASK, XMMTMP
@@ -55,18 +60,18 @@ sha256_process_block64_shaNI:
55 pshufb XMMTMP, MSG 60 pshufb XMMTMP, MSG
56 mova128 MSG, MSGTMP0 61 mova128 MSG, MSGTMP0
57 paddd 0*16-8*16(SHA256CONSTANTS), MSG 62 paddd 0*16-8*16(SHA256CONSTANTS), MSG
58 sha256rnds2 STATE0, STATE1 63 sha256rnds2 MSG, STATE0, STATE1
59 shuf128_32 $0x0E, MSG, MSG 64 shuf128_32 $0x0E, MSG, MSG
60 sha256rnds2 STATE1, STATE0 65 sha256rnds2 MSG, STATE1, STATE0
61 66
62 /* Rounds 4-7 */ 67 /* Rounds 4-7 */
63 movu128 1*16(DATA_PTR), MSG 68 movu128 1*16(DATA_PTR), MSG
64 pshufb XMMTMP, MSG 69 pshufb XMMTMP, MSG
65 mova128 MSG, MSGTMP1 70 mova128 MSG, MSGTMP1
66 paddd 1*16-8*16(SHA256CONSTANTS), MSG 71 paddd 1*16-8*16(SHA256CONSTANTS), MSG
67 sha256rnds2 STATE0, STATE1 72 sha256rnds2 MSG, STATE0, STATE1
68 shuf128_32 $0x0E, MSG, MSG 73 shuf128_32 $0x0E, MSG, MSG
69 sha256rnds2 STATE1, STATE0 74 sha256rnds2 MSG, STATE1, STATE0
70 sha256msg1 MSGTMP1, MSGTMP0 75 sha256msg1 MSGTMP1, MSGTMP0
71 76
72 /* Rounds 8-11 */ 77 /* Rounds 8-11 */
@@ -74,9 +79,9 @@ sha256_process_block64_shaNI:
74 pshufb XMMTMP, MSG 79 pshufb XMMTMP, MSG
75 mova128 MSG, MSGTMP2 80 mova128 MSG, MSGTMP2
76 paddd 2*16-8*16(SHA256CONSTANTS), MSG 81 paddd 2*16-8*16(SHA256CONSTANTS), MSG
77 sha256rnds2 STATE0, STATE1 82 sha256rnds2 MSG, STATE0, STATE1
78 shuf128_32 $0x0E, MSG, MSG 83 shuf128_32 $0x0E, MSG, MSG
79 sha256rnds2 STATE1, STATE0 84 sha256rnds2 MSG, STATE1, STATE0
80 sha256msg1 MSGTMP2, MSGTMP1 85 sha256msg1 MSGTMP2, MSGTMP1
81 86
82 /* Rounds 12-15 */ 87 /* Rounds 12-15 */
@@ -85,159 +90,158 @@ sha256_process_block64_shaNI:
85/* ...to here */ 90/* ...to here */
86 mova128 MSG, MSGTMP3 91 mova128 MSG, MSGTMP3
87 paddd 3*16-8*16(SHA256CONSTANTS), MSG 92 paddd 3*16-8*16(SHA256CONSTANTS), MSG
88 sha256rnds2 STATE0, STATE1 93 sha256rnds2 MSG, STATE0, STATE1
89 mova128 MSGTMP3, XMMTMP 94 mova128 MSGTMP3, XMMTMP
90 palignr $4, MSGTMP2, XMMTMP 95 palignr $4, MSGTMP2, XMMTMP
91 paddd XMMTMP, MSGTMP0 96 paddd XMMTMP, MSGTMP0
92 sha256msg2 MSGTMP3, MSGTMP0 97 sha256msg2 MSGTMP3, MSGTMP0
93 shuf128_32 $0x0E, MSG, MSG 98 shuf128_32 $0x0E, MSG, MSG
94 sha256rnds2 STATE1, STATE0 99 sha256rnds2 MSG, STATE1, STATE0
95 sha256msg1 MSGTMP3, MSGTMP2 100 sha256msg1 MSGTMP3, MSGTMP2
96 101
97 /* Rounds 16-19 */ 102 /* Rounds 16-19 */
98 mova128 MSGTMP0, MSG 103 mova128 MSGTMP0, MSG
99 paddd 4*16-8*16(SHA256CONSTANTS), MSG 104 paddd 4*16-8*16(SHA256CONSTANTS), MSG
100 sha256rnds2 STATE0, STATE1 105 sha256rnds2 MSG, STATE0, STATE1
101 mova128 MSGTMP0, XMMTMP 106 mova128 MSGTMP0, XMMTMP
102 palignr $4, MSGTMP3, XMMTMP 107 palignr $4, MSGTMP3, XMMTMP
103 paddd XMMTMP, MSGTMP1 108 paddd XMMTMP, MSGTMP1
104 sha256msg2 MSGTMP0, MSGTMP1 109 sha256msg2 MSGTMP0, MSGTMP1
105 shuf128_32 $0x0E, MSG, MSG 110 shuf128_32 $0x0E, MSG, MSG
106 sha256rnds2 STATE1, STATE0 111 sha256rnds2 MSG, STATE1, STATE0
107 sha256msg1 MSGTMP0, MSGTMP3 112 sha256msg1 MSGTMP0, MSGTMP3
108 113
109 /* Rounds 20-23 */ 114 /* Rounds 20-23 */
110 mova128 MSGTMP1, MSG 115 mova128 MSGTMP1, MSG
111 paddd 5*16-8*16(SHA256CONSTANTS), MSG 116 paddd 5*16-8*16(SHA256CONSTANTS), MSG
112 sha256rnds2 STATE0, STATE1 117 sha256rnds2 MSG, STATE0, STATE1
113 mova128 MSGTMP1, XMMTMP 118 mova128 MSGTMP1, XMMTMP
114 palignr $4, MSGTMP0, XMMTMP 119 palignr $4, MSGTMP0, XMMTMP
115 paddd XMMTMP, MSGTMP2 120 paddd XMMTMP, MSGTMP2
116 sha256msg2 MSGTMP1, MSGTMP2 121 sha256msg2 MSGTMP1, MSGTMP2
117 shuf128_32 $0x0E, MSG, MSG 122 shuf128_32 $0x0E, MSG, MSG
118 sha256rnds2 STATE1, STATE0 123 sha256rnds2 MSG, STATE1, STATE0
119 sha256msg1 MSGTMP1, MSGTMP0 124 sha256msg1 MSGTMP1, MSGTMP0
120 125
121 /* Rounds 24-27 */ 126 /* Rounds 24-27 */
122 mova128 MSGTMP2, MSG 127 mova128 MSGTMP2, MSG
123 paddd 6*16-8*16(SHA256CONSTANTS), MSG 128 paddd 6*16-8*16(SHA256CONSTANTS), MSG
124 sha256rnds2 STATE0, STATE1 129 sha256rnds2 MSG, STATE0, STATE1
125 mova128 MSGTMP2, XMMTMP 130 mova128 MSGTMP2, XMMTMP
126 palignr $4, MSGTMP1, XMMTMP 131 palignr $4, MSGTMP1, XMMTMP
127 paddd XMMTMP, MSGTMP3 132 paddd XMMTMP, MSGTMP3
128 sha256msg2 MSGTMP2, MSGTMP3 133 sha256msg2 MSGTMP2, MSGTMP3
129 shuf128_32 $0x0E, MSG, MSG 134 shuf128_32 $0x0E, MSG, MSG
130 sha256rnds2 STATE1, STATE0 135 sha256rnds2 MSG, STATE1, STATE0
131 sha256msg1 MSGTMP2, MSGTMP1 136 sha256msg1 MSGTMP2, MSGTMP1
132 137
133 /* Rounds 28-31 */ 138 /* Rounds 28-31 */
134 mova128 MSGTMP3, MSG 139 mova128 MSGTMP3, MSG
135 paddd 7*16-8*16(SHA256CONSTANTS), MSG 140 paddd 7*16-8*16(SHA256CONSTANTS), MSG
136 sha256rnds2 STATE0, STATE1 141 sha256rnds2 MSG, STATE0, STATE1
137 mova128 MSGTMP3, XMMTMP 142 mova128 MSGTMP3, XMMTMP
138 palignr $4, MSGTMP2, XMMTMP 143 palignr $4, MSGTMP2, XMMTMP
139 paddd XMMTMP, MSGTMP0 144 paddd XMMTMP, MSGTMP0
140 sha256msg2 MSGTMP3, MSGTMP0 145 sha256msg2 MSGTMP3, MSGTMP0
141 shuf128_32 $0x0E, MSG, MSG 146 shuf128_32 $0x0E, MSG, MSG
142 sha256rnds2 STATE1, STATE0 147 sha256rnds2 MSG, STATE1, STATE0
143 sha256msg1 MSGTMP3, MSGTMP2 148 sha256msg1 MSGTMP3, MSGTMP2
144 149
145 /* Rounds 32-35 */ 150 /* Rounds 32-35 */
146 mova128 MSGTMP0, MSG 151 mova128 MSGTMP0, MSG
147 paddd 8*16-8*16(SHA256CONSTANTS), MSG 152 paddd 8*16-8*16(SHA256CONSTANTS), MSG
148 sha256rnds2 STATE0, STATE1 153 sha256rnds2 MSG, STATE0, STATE1
149 mova128 MSGTMP0, XMMTMP 154 mova128 MSGTMP0, XMMTMP
150 palignr $4, MSGTMP3, XMMTMP 155 palignr $4, MSGTMP3, XMMTMP
151 paddd XMMTMP, MSGTMP1 156 paddd XMMTMP, MSGTMP1
152 sha256msg2 MSGTMP0, MSGTMP1 157 sha256msg2 MSGTMP0, MSGTMP1
153 shuf128_32 $0x0E, MSG, MSG 158 shuf128_32 $0x0E, MSG, MSG
154 sha256rnds2 STATE1, STATE0 159 sha256rnds2 MSG, STATE1, STATE0
155 sha256msg1 MSGTMP0, MSGTMP3 160 sha256msg1 MSGTMP0, MSGTMP3
156 161
157 /* Rounds 36-39 */ 162 /* Rounds 36-39 */
158 mova128 MSGTMP1, MSG 163 mova128 MSGTMP1, MSG
159 paddd 9*16-8*16(SHA256CONSTANTS), MSG 164 paddd 9*16-8*16(SHA256CONSTANTS), MSG
160 sha256rnds2 STATE0, STATE1 165 sha256rnds2 MSG, STATE0, STATE1
161 mova128 MSGTMP1, XMMTMP 166 mova128 MSGTMP1, XMMTMP
162 palignr $4, MSGTMP0, XMMTMP 167 palignr $4, MSGTMP0, XMMTMP
163 paddd XMMTMP, MSGTMP2 168 paddd XMMTMP, MSGTMP2
164 sha256msg2 MSGTMP1, MSGTMP2 169 sha256msg2 MSGTMP1, MSGTMP2
165 shuf128_32 $0x0E, MSG, MSG 170 shuf128_32 $0x0E, MSG, MSG
166 sha256rnds2 STATE1, STATE0 171 sha256rnds2 MSG, STATE1, STATE0
167 sha256msg1 MSGTMP1, MSGTMP0 172 sha256msg1 MSGTMP1, MSGTMP0
168 173
169 /* Rounds 40-43 */ 174 /* Rounds 40-43 */
170 mova128 MSGTMP2, MSG 175 mova128 MSGTMP2, MSG
171 paddd 10*16-8*16(SHA256CONSTANTS), MSG 176 paddd 10*16-8*16(SHA256CONSTANTS), MSG
172 sha256rnds2 STATE0, STATE1 177 sha256rnds2 MSG, STATE0, STATE1
173 mova128 MSGTMP2, XMMTMP 178 mova128 MSGTMP2, XMMTMP
174 palignr $4, MSGTMP1, XMMTMP 179 palignr $4, MSGTMP1, XMMTMP
175 paddd XMMTMP, MSGTMP3 180 paddd XMMTMP, MSGTMP3
176 sha256msg2 MSGTMP2, MSGTMP3 181 sha256msg2 MSGTMP2, MSGTMP3
177 shuf128_32 $0x0E, MSG, MSG 182 shuf128_32 $0x0E, MSG, MSG
178 sha256rnds2 STATE1, STATE0 183 sha256rnds2 MSG, STATE1, STATE0
179 sha256msg1 MSGTMP2, MSGTMP1 184 sha256msg1 MSGTMP2, MSGTMP1
180 185
181 /* Rounds 44-47 */ 186 /* Rounds 44-47 */
182 mova128 MSGTMP3, MSG 187 mova128 MSGTMP3, MSG
183 paddd 11*16-8*16(SHA256CONSTANTS), MSG 188 paddd 11*16-8*16(SHA256CONSTANTS), MSG
184 sha256rnds2 STATE0, STATE1 189 sha256rnds2 MSG, STATE0, STATE1
185 mova128 MSGTMP3, XMMTMP 190 mova128 MSGTMP3, XMMTMP
186 palignr $4, MSGTMP2, XMMTMP 191 palignr $4, MSGTMP2, XMMTMP
187 paddd XMMTMP, MSGTMP0 192 paddd XMMTMP, MSGTMP0
188 sha256msg2 MSGTMP3, MSGTMP0 193 sha256msg2 MSGTMP3, MSGTMP0
189 shuf128_32 $0x0E, MSG, MSG 194 shuf128_32 $0x0E, MSG, MSG
190 sha256rnds2 STATE1, STATE0 195 sha256rnds2 MSG, STATE1, STATE0
191 sha256msg1 MSGTMP3, MSGTMP2 196 sha256msg1 MSGTMP3, MSGTMP2
192 197
193 /* Rounds 48-51 */ 198 /* Rounds 48-51 */
194 mova128 MSGTMP0, MSG 199 mova128 MSGTMP0, MSG
195 paddd 12*16-8*16(SHA256CONSTANTS), MSG 200 paddd 12*16-8*16(SHA256CONSTANTS), MSG
196 sha256rnds2 STATE0, STATE1 201 sha256rnds2 MSG, STATE0, STATE1
197 mova128 MSGTMP0, XMMTMP 202 mova128 MSGTMP0, XMMTMP
198 palignr $4, MSGTMP3, XMMTMP 203 palignr $4, MSGTMP3, XMMTMP
199 paddd XMMTMP, MSGTMP1 204 paddd XMMTMP, MSGTMP1
200 sha256msg2 MSGTMP0, MSGTMP1 205 sha256msg2 MSGTMP0, MSGTMP1
201 shuf128_32 $0x0E, MSG, MSG 206 shuf128_32 $0x0E, MSG, MSG
202 sha256rnds2 STATE1, STATE0 207 sha256rnds2 MSG, STATE1, STATE0
203 sha256msg1 MSGTMP0, MSGTMP3 208 sha256msg1 MSGTMP0, MSGTMP3
204 209
205 /* Rounds 52-55 */ 210 /* Rounds 52-55 */
206 mova128 MSGTMP1, MSG 211 mova128 MSGTMP1, MSG
207 paddd 13*16-8*16(SHA256CONSTANTS), MSG 212 paddd 13*16-8*16(SHA256CONSTANTS), MSG
208 sha256rnds2 STATE0, STATE1 213 sha256rnds2 MSG, STATE0, STATE1
209 mova128 MSGTMP1, XMMTMP 214 mova128 MSGTMP1, XMMTMP
210 palignr $4, MSGTMP0, XMMTMP 215 palignr $4, MSGTMP0, XMMTMP
211 paddd XMMTMP, MSGTMP2 216 paddd XMMTMP, MSGTMP2
212 sha256msg2 MSGTMP1, MSGTMP2 217 sha256msg2 MSGTMP1, MSGTMP2
213 shuf128_32 $0x0E, MSG, MSG 218 shuf128_32 $0x0E, MSG, MSG
214 sha256rnds2 STATE1, STATE0 219 sha256rnds2 MSG, STATE1, STATE0
215 220
216 /* Rounds 56-59 */ 221 /* Rounds 56-59 */
217 mova128 MSGTMP2, MSG 222 mova128 MSGTMP2, MSG
218 paddd 14*16-8*16(SHA256CONSTANTS), MSG 223 paddd 14*16-8*16(SHA256CONSTANTS), MSG
219 sha256rnds2 STATE0, STATE1 224 sha256rnds2 MSG, STATE0, STATE1
220 mova128 MSGTMP2, XMMTMP 225 mova128 MSGTMP2, XMMTMP
221 palignr $4, MSGTMP1, XMMTMP 226 palignr $4, MSGTMP1, XMMTMP
222 paddd XMMTMP, MSGTMP3 227 paddd XMMTMP, MSGTMP3
223 sha256msg2 MSGTMP2, MSGTMP3 228 sha256msg2 MSGTMP2, MSGTMP3
224 shuf128_32 $0x0E, MSG, MSG 229 shuf128_32 $0x0E, MSG, MSG
225 sha256rnds2 STATE1, STATE0 230 sha256rnds2 MSG, STATE1, STATE0
226 231
227 /* Rounds 60-63 */ 232 /* Rounds 60-63 */
228 mova128 MSGTMP3, MSG 233 mova128 MSGTMP3, MSG
229 paddd 15*16-8*16(SHA256CONSTANTS), MSG 234 paddd 15*16-8*16(SHA256CONSTANTS), MSG
230 sha256rnds2 STATE0, STATE1 235 sha256rnds2 MSG, STATE0, STATE1
231 shuf128_32 $0x0E, MSG, MSG 236 shuf128_32 $0x0E, MSG, MSG
232 sha256rnds2 STATE1, STATE0 237 sha256rnds2 MSG, STATE1, STATE0
233 238
234 /* Write hash values back in the correct order */ 239 /* Write hash values back in the correct order */
235 /* STATE0: ABEF (msb-to-lsb: 3,2,1,0) */
236 /* STATE1: CDGH */
237 mova128 STATE0, XMMTMP 240 mova128 STATE0, XMMTMP
238/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ 241/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */
239 shufps SHUF(3,2,3,2), STATE1, STATE0 /* DCBA */ 242 /* --- -------------- HGDC -- FEBA */
240 shufps SHUF(1,0,1,0), STATE1, XMMTMP /* HGFE */ 243 shufps SHUF(3,2,3,2), STATE1, STATE0 /* ABCD */
244 shufps SHUF(1,0,1,0), STATE1, XMMTMP /* EFGH */
241 /* add current hash values to previous ones */ 245 /* add current hash values to previous ones */
242 movu128 76+1*16(%eax), STATE1 246 movu128 76+1*16(%eax), STATE1
243 paddd XMMTMP, STATE1 247 paddd XMMTMP, STATE1
@@ -250,7 +254,7 @@ sha256_process_block64_shaNI:
250 .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI 254 .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI
251 255
252 .section .rodata.cst256.K256, "aM", @progbits, 256 256 .section .rodata.cst256.K256, "aM", @progbits, 256
253 .balign 16 257 .balign 16
254K256: 258K256:
255 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 259 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
256 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 260 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
@@ -270,8 +274,8 @@ K256:
270 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 274 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
271 275
272 .section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16 276 .section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16
273 .balign 16 277 .balign 16
274PSHUFFLE_BSWAP32_FLIP_MASK: 278PSHUFFLE_BSWAP32_FLIP_MASK:
275 .octa 0x0c0d0e0f08090a0b0405060700010203 279 .octa 0x0c0d0e0f08090a0b0405060700010203
276 280
277#endif 281#endif