about summary refs log tree commit diff
path: root/Asm/x86/XzCrc64Opt.asm
diff options
context:
space:
mode:
author Igor Pavlov <87184205+ip7z@users.noreply.github.com> 2024-05-14 00:00:00 +0000
committer Igor Pavlov <87184205+ip7z@users.noreply.github.com> 2024-05-15 23:55:04 +0500
commit fc662341e6f85da78ada0e443f6116b978f79f22 (patch)
tree 1be1cc402a7a9cbc18d4eeea6b141354c2d559e3 /Asm/x86/XzCrc64Opt.asm
parent 5b39dc76f1bc82f941d5c800ab9f34407a06b53a (diff)
download 7zip-24.05.tar.gz
7zip-24.05.tar.bz2
7zip-24.05.zip
24.0524.05
Diffstat (limited to '')
-rw-r--r-- Asm/x86/XzCrc64Opt.asm 632
1 files changed, 458 insertions, 174 deletions
diff --git a/Asm/x86/XzCrc64Opt.asm b/Asm/x86/XzCrc64Opt.asm
index ad22cc2..283424c 100644
--- a/Asm/x86/XzCrc64Opt.asm
+++ b/Asm/x86/XzCrc64Opt.asm
@@ -1,113 +1,231 @@
1; XzCrc64Opt.asm -- CRC64 calculation : optimized version 1; XzCrc64Opt.asm -- CRC64 calculation : optimized version
2; 2021-02-06 : Igor Pavlov : Public domain 2; 2023-12-08 : Igor Pavlov : Public domain
3 3
4include 7zAsm.asm 4include 7zAsm.asm
5 5
6MY_ASM_START 6MY_ASM_START
7 7
8NUM_WORDS equ 3
9
10if (NUM_WORDS lt 1) or (NUM_WORDS gt 64)
11.err <num_words_IS_INCORRECT>
12endif
13
14NUM_SKIP_BYTES equ ((NUM_WORDS - 2) * 4)
15
16
17MOVZXLO macro dest:req, src:req
18 movzx dest, @CatStr(src, _L)
19endm
20
21MOVZXHI macro dest:req, src:req
22 movzx dest, @CatStr(src, _H)
23endm
24
25
8ifdef x64 26ifdef x64
9 27
10rD equ r9 28rD equ r11
11rN equ r10 29rN equ r10
12rT equ r5 30rT equ r9
13num_VAR equ r8 31
14 32CRC_OP macro op:req, dest:req, src:req, t:req
15SRCDAT4 equ dword ptr [rD + rN * 1] 33 op dest, QWORD PTR [rT + @CatStr(src, _R) * 8 + 0800h * (t)]
34endm
16 35
17CRC_XOR macro dest:req, src:req, t:req 36CRC_XOR macro dest:req, src:req, t:req
18 xor dest, QWORD PTR [rT + src * 8 + 0800h * t] 37 CRC_OP xor, dest, src, t
38endm
39
40CRC_MOV macro dest:req, src:req, t:req
41 CRC_OP mov, dest, src, t
19endm 42endm
20 43
21CRC1b macro 44CRC1b macro
22 movzx x6, BYTE PTR [rD] 45 movzx x6, BYTE PTR [rD]
23 inc rD 46 inc rD
24 movzx x3, x0_L 47 MOVZXLO x3, x0
25 xor x6, x3 48 xor x6, x3
26 shr r0, 8 49 shr r0, 8
27 CRC_XOR r0, r6, 0 50 CRC_XOR r0, x6, 0
28 dec rN 51 dec rN
29endm
30
31MY_PROLOG macro crc_end:req
32 ifdef ABI_LINUX
33 MY_PUSH_2_REGS
34 else
35 MY_PUSH_4_REGS
36 endif
37 mov r0, REG_ABI_PARAM_0
38 mov rN, REG_ABI_PARAM_2
39 mov rT, REG_ABI_PARAM_3
40 mov rD, REG_ABI_PARAM_1
41 test rN, rN
42 jz crc_end
43 @@:
44 test rD, 3
45 jz @F
46 CRC1b
47 jnz @B
48 @@:
49 cmp rN, 8
50 jb crc_end
51 add rN, rD
52 mov num_VAR, rN
53 sub rN, 4
54 and rN, NOT 3
55 sub rD, rN
56 mov x1, SRCDAT4
57 xor r0, r1
58 add rN, 4
59endm
60
61MY_EPILOG macro crc_end:req
62 sub rN, 4
63 mov x1, SRCDAT4
64 xor r0, r1
65 mov rD, rN
66 mov rN, num_VAR
67 sub rN, rD
68 crc_end:
69 test rN, rN
70 jz @F
71 CRC1b
72 jmp crc_end
73 @@:
74 ifdef ABI_LINUX
75 MY_POP_2_REGS
76 else
77 MY_POP_4_REGS
78 endif
79endm 52endm
80 53
81MY_PROC XzCrc64UpdateT4, 4 54
82 MY_PROLOG crc_end_4 55; ALIGN_MASK is 3 or 7 bytes alignment:
83 align 16 56ALIGN_MASK equ (7 - (NUM_WORDS and 1) * 4)
84 main_loop_4: 57
85 mov x1, SRCDAT4 58if NUM_WORDS eq 1
86 movzx x2, x0_L 59
87 movzx x3, x0_H 60src_rN_offset equ 4
88 shr r0, 16 61; + 4 for prefetching next 4-bytes after current iteration
89 movzx x6, x0_L 62NUM_BYTES_LIMIT equ (NUM_WORDS * 4 + 4)
90 movzx x7, x0_H 63SRCDAT4 equ DWORD PTR [rN + rD * 1]
91 shr r0, 16 64
92 CRC_XOR r1, r2, 3 65XOR_NEXT macro
93 CRC_XOR r0, r3, 2 66 mov x1, [rD]
94 CRC_XOR r1, r6, 1 67 xor r0, r1
95 CRC_XOR r0, r7, 0 68endm
96 xor r0, r1 69
97 70else ; NUM_WORDS > 1
98 add rD, 4 71
99 jnz main_loop_4 72src_rN_offset equ 8
100 73; + 8 for prefetching next 8-bytes after current iteration
101 MY_EPILOG crc_end_4 74NUM_BYTES_LIMIT equ (NUM_WORDS * 4 + 8)
75
76XOR_NEXT macro
77 xor r0, QWORD PTR [rD] ; 64-bit read, can be unaligned
78endm
79
80; 32-bit or 64-bit
81LOAD_SRC_MULT4 macro dest:req, word_index:req
82 mov dest, [rN + rD * 1 + 4 * (word_index) - src_rN_offset];
83endm
84
85endif
86
87
88
89MY_PROC @CatStr(XzCrc64UpdateT, %(NUM_WORDS * 4)), 4
90 MY_PUSH_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
91
92 mov r0, REG_ABI_PARAM_0 ; r0 <- r1 / r7
93 mov rD, REG_ABI_PARAM_1 ; r11 <- r2 / r6
94 mov rN, REG_ABI_PARAM_2 ; r10 <- r8 / r2
95if (IS_LINUX gt 0)
96 mov rT, REG_ABI_PARAM_3 ; r9 <- r9 / r1
97endif
98
99 cmp rN, NUM_BYTES_LIMIT + ALIGN_MASK
100 jb crc_end
101@@:
102 test rD, ALIGN_MASK
103 jz @F
104 CRC1b
105 jmp @B
106@@:
107 XOR_NEXT
108 lea rN, [rD + rN * 1 - (NUM_BYTES_LIMIT - 1)]
109 sub rD, rN
110 add rN, src_rN_offset
111
112align 16
113@@:
114
115if NUM_WORDS eq 1
116
117 mov x1, x0
118 shr x1, 8
119 MOVZXLO x3, x1
120 MOVZXLO x2, x0
121 shr x1, 8
122 shr r0, 32
123 xor x0, SRCDAT4
124 CRC_XOR r0, x2, 3
125 CRC_XOR r0, x3, 2
126 MOVZXLO x2, x1
127 shr x1, 8
128 CRC_XOR r0, x2, 1
129 CRC_XOR r0, x1, 0
130
131else ; NUM_WORDS > 1
132
133if NUM_WORDS ne 2
134 k = 2
135 while k lt NUM_WORDS
136
137 LOAD_SRC_MULT4 x1, k
138 crc_op1 textequ <xor>
139
140 if k eq 2
141 if (NUM_WORDS and 1)
142 LOAD_SRC_MULT4 x7, NUM_WORDS ; aligned 32-bit
143 LOAD_SRC_MULT4 x6, NUM_WORDS + 1 ; aligned 32-bit
144 shl r6, 32
145 else
146 LOAD_SRC_MULT4 r6, NUM_WORDS ; aligned 64-bit
147 crc_op1 textequ <mov>
148 endif
149 endif
150 table = 4 * (NUM_WORDS - 1 - k)
151 MOVZXLO x3, x1
152 CRC_OP crc_op1, r7, x3, 3 + table
153 MOVZXHI x3, x1
154 shr x1, 16
155 CRC_XOR r6, x3, 2 + table
156 MOVZXLO x3, x1
157 shr x1, 8
158 CRC_XOR r7, x3, 1 + table
159 CRC_XOR r6, x1, 0 + table
160 k = k + 1
161 endm
162 crc_op2 textequ <xor>
163
164else ; NUM_WORDS == 2
165 LOAD_SRC_MULT4 r6, NUM_WORDS ; aligned 64-bit
166 crc_op2 textequ <mov>
167endif ; NUM_WORDS == 2
168
169 MOVZXHI x3, x0
170 MOVZXLO x2, x0
171 mov r1, r0
172 shr r1, 32
173 shr x0, 16
174 CRC_XOR r6, x2, NUM_SKIP_BYTES + 7
175 CRC_OP crc_op2, r7, x3, NUM_SKIP_BYTES + 6
176 MOVZXLO x2, x0
177 MOVZXHI x5, x1
178 MOVZXLO x3, x1
179 shr x0, 8
180 shr x1, 16
181 CRC_XOR r7, x2, NUM_SKIP_BYTES + 5
182 CRC_XOR r6, x3, NUM_SKIP_BYTES + 3
183 CRC_XOR r7, x0, NUM_SKIP_BYTES + 4
184 CRC_XOR r6, x5, NUM_SKIP_BYTES + 2
185 MOVZXLO x2, x1
186 shr x1, 8
187 CRC_XOR r7, x2, NUM_SKIP_BYTES + 1
188 CRC_MOV r0, x1, NUM_SKIP_BYTES + 0
189 xor r0, r6
190 xor r0, r7
191
192endif ; NUM_WORDS > 1
193 add rD, NUM_WORDS * 4
194 jnc @B
195
196 sub rN, src_rN_offset
197 add rD, rN
198 XOR_NEXT
199 add rN, NUM_BYTES_LIMIT - 1
200 sub rN, rD
201
202crc_end:
203 test rN, rN
204 jz func_end
205@@:
206 CRC1b
207 jnz @B
208func_end:
209 MY_POP_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
102MY_ENDP 210MY_ENDP
103 211
212
213
104else 214else
215; ==================================================================
105; x86 (32-bit) 216; x86 (32-bit)
106 217
107rD equ r1 218rD equ r7
108rN equ r7 219rN equ r1
109rT equ r5 220rT equ r5
110 221
222xA equ x6
223xA_R equ r6
224
225ifdef x64
226 num_VAR equ r8
227else
228
111crc_OFFS equ (REG_SIZE * 5) 229crc_OFFS equ (REG_SIZE * 5)
112 230
113if (IS_CDECL gt 0) or (IS_LINUX gt 0) 231if (IS_CDECL gt 0) or (IS_LINUX gt 0)
@@ -133,107 +251,273 @@ else
133 table_VAR equ [r4 + table_OFFS] 251 table_VAR equ [r4 + table_OFFS]
134 num_VAR equ table_VAR 252 num_VAR equ table_VAR
135endif 253endif
254endif ; x64
255
256SRCDAT4 equ DWORD PTR [rN + rD * 1]
136 257
137SRCDAT4 equ dword ptr [rD + rN * 1] 258CRC_1 macro op:req, dest:req, src:req, t:req, word_index:req
259 op dest, DWORD PTR [rT + @CatStr(src, _R) * 8 + 0800h * (t) + (word_index) * 4]
260endm
138 261
139CRC macro op0:req, op1:req, dest0:req, dest1:req, src:req, t:req 262CRC macro op0:req, op1:req, dest0:req, dest1:req, src:req, t:req
140 op0 dest0, DWORD PTR [rT + src * 8 + 0800h * t] 263 CRC_1 op0, dest0, src, t, 0
141 op1 dest1, DWORD PTR [rT + src * 8 + 0800h * t + 4] 264 CRC_1 op1, dest1, src, t, 1
142endm 265endm
143 266
144CRC_XOR macro dest0:req, dest1:req, src:req, t:req 267CRC_XOR macro dest0:req, dest1:req, src:req, t:req
145 CRC xor, xor, dest0, dest1, src, t 268 CRC xor, xor, dest0, dest1, src, t
146endm 269endm
147 270
148 271
149CRC1b macro 272CRC1b macro
150 movzx x6, BYTE PTR [rD] 273 movzx xA, BYTE PTR [rD]
151 inc rD 274 inc rD
152 movzx x3, x0_L 275 MOVZXLO x3, x0
153 xor x6, x3 276 xor xA, x3
154 shrd r0, r2, 8 277 shrd x0, x2, 8
155 shr r2, 8 278 shr x2, 8
156 CRC_XOR r0, r2, r6, 0 279 CRC_XOR x0, x2, xA, 0
157 dec rN 280 dec rN
158endm 281endm
159 282
160MY_PROLOG macro crc_end:req 283
161 MY_PUSH_4_REGS 284MY_PROLOG_BASE macro
162 285 MY_PUSH_4_REGS
163 if (IS_CDECL gt 0) or (IS_LINUX gt 0) 286ifdef x64
164 proc_numParams = proc_numParams + 2 ; for ABI_LINUX 287 mov r0, REG_ABI_PARAM_0 ; r0 <- r1 / r7
165 mov rN, [r4 + size_OFFS] 288 mov rT, REG_ABI_PARAM_3 ; r5 <- r9 / r1
166 mov rD, [r4 + data_OFFS] 289 mov rN, REG_ABI_PARAM_2 ; r1 <- r8 / r2
290 mov rD, REG_ABI_PARAM_1 ; r7 <- r2 / r6
291 mov r2, r0
292 shr r2, 32
293 mov x0, x0
294else
295 if (IS_CDECL gt 0) or (IS_LINUX gt 0)
296 proc_numParams = proc_numParams + 2 ; for ABI_LINUX
297 mov rN, [r4 + size_OFFS]
298 mov rD, [r4 + data_OFFS]
299 else
300 mov rD, REG_ABI_PARAM_0 ; r7 <- r1 : (data)
301 mov rN, REG_ABI_PARAM_1 ; r1 <- r2 : (size)
302 endif
303 mov x0, [r4 + crc_OFFS]
304 mov x2, [r4 + crc_OFFS + 4]
305 mov rT, table_VAR
306endif
307endm
308
309
310MY_EPILOG_BASE macro crc_end:req, func_end:req
311crc_end:
312 test rN, rN
313 jz func_end
314@@:
315 CRC1b
316 jnz @B
317func_end:
318ifdef x64
319 shl r2, 32
320 xor r0, r2
321endif
322 MY_POP_4_REGS
323endm
324
325
326; ALIGN_MASK is 3 or 7 bytes alignment:
327ALIGN_MASK equ (7 - (NUM_WORDS and 1) * 4)
328
329if (NUM_WORDS eq 1)
330
331NUM_BYTES_LIMIT_T4 equ (NUM_WORDS * 4 + 4)
332
333MY_PROC @CatStr(XzCrc64UpdateT, %(NUM_WORDS * 4)), 5
334 MY_PROLOG_BASE
335
336 cmp rN, NUM_BYTES_LIMIT_T4 + ALIGN_MASK
337 jb crc_end_4
338@@:
339 test rD, ALIGN_MASK
340 jz @F
341 CRC1b
342 jmp @B
343@@:
344 xor x0, [rD]
345 lea rN, [rD + rN * 1 - (NUM_BYTES_LIMIT_T4 - 1)]
346 sub rD, rN
347 add rN, 4
348
349 MOVZXLO xA, x0
350align 16
351@@:
352 mov x3, SRCDAT4
353 xor x3, x2
354 shr x0, 8
355 CRC xor, mov, x3, x2, xA, 3
356 MOVZXLO xA, x0
357 shr x0, 8
358 ; MOVZXHI xA, x0
359 ; shr x0, 16
360 CRC_XOR x3, x2, xA, 2
361
362 MOVZXLO xA, x0
363 shr x0, 8
364 CRC_XOR x3, x2, xA, 1
365 CRC_XOR x3, x2, x0, 0
366 MOVZXLO xA, x3
367 mov x0, x3
368
369 add rD, 4
370 jnc @B
371
372 sub rN, 4
373 add rD, rN
374 xor x0, [rD]
375 add rN, NUM_BYTES_LIMIT_T4 - 1
376 sub rN, rD
377 MY_EPILOG_BASE crc_end_4, func_end_4
378MY_ENDP
379
380else ; NUM_WORDS > 1
381
382SHR_X macro x, imm
383 shr x, imm
384endm
385
386
387ITER_1 macro v0, v1, a, off
388 MOVZXLO xA, a
389 SHR_X a, 8
390 CRC_XOR v0, v1, xA, off
391endm
392
393
394ITER_4 macro v0, v1, a, off
395if 0 eq 0
396 ITER_1 v0, v1, a, off + 3
397 ITER_1 v0, v1, a, off + 2
398 ITER_1 v0, v1, a, off + 1
399 CRC_XOR v0, v1, a, off
400elseif 0 eq 0
401 MOVZXLO xA, a
402 CRC_XOR v0, v1, xA, off + 3
403 mov xA, a
404 ror a, 16 ; 32-bit ror
405 shr xA, 24
406 CRC_XOR v0, v1, xA, off
407 MOVZXLO xA, a
408 SHR_X a, 24
409 CRC_XOR v0, v1, xA, off + 1
410 CRC_XOR v0, v1, a, off + 2
411else
412 ; MOVZXHI provides smaller code, but MOVZX_HI_BYTE is not fast instruction
413 MOVZXLO xA, a
414 CRC_XOR v0, v1, xA, off + 3
415 MOVZXHI xA, a
416 SHR_X a, 16
417 CRC_XOR v0, v1, xA, off + 2
418 MOVZXLO xA, a
419 SHR_X a, 8
420 CRC_XOR v0, v1, xA, off + 1
421 CRC_XOR v0, v1, a, off
422endif
423endm
424
425
426
427ITER_1_PAIR macro v0, v1, a0, a1, off
428 ITER_1 v0, v1, a0, off + 4
429 ITER_1 v0, v1, a1, off
430endm
431
432src_rD_offset equ 8
433STEP_SIZE equ (NUM_WORDS * 4)
434
435ITER_12_NEXT macro op, index, v0, v1
436 op v0, DWORD PTR [rD + (index + 1) * STEP_SIZE - src_rD_offset]
437 op v1, DWORD PTR [rD + (index + 1) * STEP_SIZE + 4 - src_rD_offset]
438endm
439
440ITER_12 macro index, a0, a1, v0, v1
441
442 if NUM_SKIP_BYTES eq 0
443 ITER_12_NEXT mov, index, v0, v1
167 else 444 else
168 mov rN, r2 445 k = 0
446 while k lt NUM_SKIP_BYTES
447 movzx xA, BYTE PTR [rD + (index) * STEP_SIZE + k + 8 - src_rD_offset]
448 if k eq 0
449 CRC mov, mov, v0, v1, xA, NUM_SKIP_BYTES - 1 - k
450 else
451 CRC_XOR v0, v1, xA, NUM_SKIP_BYTES - 1 - k
452 endif
453 k = k + 1
454 endm
455 ITER_12_NEXT xor, index, v0, v1
169 endif 456 endif
170 457
171 mov x0, [r4 + crc_OFFS] 458if 0 eq 0
172 mov x2, [r4 + crc_OFFS + 4] 459 ITER_4 v0, v1, a0, NUM_SKIP_BYTES + 4
173 mov rT, table_VAR 460 ITER_4 v0, v1, a1, NUM_SKIP_BYTES
174 test rN, rN 461else ; interleave version is faster/slower for different processors
175 jz crc_end 462 ITER_1_PAIR v0, v1, a0, a1, NUM_SKIP_BYTES + 3
176 @@: 463 ITER_1_PAIR v0, v1, a0, a1, NUM_SKIP_BYTES + 2
177 test rD, 3 464 ITER_1_PAIR v0, v1, a0, a1, NUM_SKIP_BYTES + 1
178 jz @F 465 CRC_XOR v0, v1, a0, NUM_SKIP_BYTES + 4
179 CRC1b 466 CRC_XOR v0, v1, a1, NUM_SKIP_BYTES
180 jnz @B 467endif
181 @@: 468endm
182 cmp rN, 8 469
183 jb crc_end 470; we use (UNROLL_CNT > 1) to reduce read ports pressure (num_VAR reads)
184 add rN, rD 471UNROLL_CNT equ (2 * 1)
185 472NUM_BYTES_LIMIT equ (STEP_SIZE * UNROLL_CNT + 8)
186 mov num_VAR, rN 473
187 474MY_PROC @CatStr(XzCrc64UpdateT, %(NUM_WORDS * 4)), 5
188 sub rN, 4 475 MY_PROLOG_BASE
189 and rN, NOT 3 476
190 sub rD, rN 477 cmp rN, NUM_BYTES_LIMIT + ALIGN_MASK
191 xor r0, SRCDAT4 478 jb crc_end_12
192 add rN, 4 479@@:
193endm 480 test rD, ALIGN_MASK
194 481 jz @F
195MY_EPILOG macro crc_end:req 482 CRC1b
196 sub rN, 4 483 jmp @B
197 xor r0, SRCDAT4 484@@:
198 485 xor x0, [rD]
199 mov rD, rN 486 xor x2, [rD + 4]
200 mov rN, num_VAR 487 add rD, src_rD_offset
201 sub rN, rD 488 lea rN, [rD + rN * 1 - (NUM_BYTES_LIMIT - 1)]
202 crc_end: 489 mov num_VAR, rN
203 test rN, rN 490
204 jz @F 491align 16
205 CRC1b 492@@:
206 jmp crc_end 493 i = 0
207 @@: 494 rept UNROLL_CNT
208 MY_POP_4_REGS 495 if (i and 1) eq 0
209endm 496 ITER_12 i, x0, x2, x1, x3
210 497 else
211MY_PROC XzCrc64UpdateT4, 5 498 ITER_12 i, x1, x3, x0, x2
212 MY_PROLOG crc_end_4 499 endif
213 movzx x6, x0_L 500 i = i + 1
214 align 16 501 endm
215 main_loop_4: 502
216 mov r3, SRCDAT4 503 if (UNROLL_CNT and 1)
217 xor r3, r2 504 mov x0, x1
218 505 mov x2, x3
219 CRC xor, mov, r3, r2, r6, 3 506 endif
220 movzx x6, x0_H 507 add rD, STEP_SIZE * UNROLL_CNT
221 shr r0, 16 508 cmp rD, num_VAR
222 CRC_XOR r3, r2, r6, 2 509 jb @B
223 510
224 movzx x6, x0_L 511 mov rN, num_VAR
225 movzx x0, x0_H 512 add rN, NUM_BYTES_LIMIT - 1
226 CRC_XOR r3, r2, r6, 1 513 sub rN, rD
227 CRC_XOR r3, r2, r0, 0 514 sub rD, src_rD_offset
228 movzx x6, x3_L 515 xor x0, [rD]
229 mov r0, r3 516 xor x2, [rD + 4]
230 517
231 add rD, 4 518 MY_EPILOG_BASE crc_end_12, func_end_12
232 jnz main_loop_4
233
234 MY_EPILOG crc_end_4
235MY_ENDP 519MY_ENDP
236 520
521endif ; (NUM_WORDS > 1)
237endif ; ! x64 522endif ; ! x64
238
239end 523end