about summary refs log tree commit diff
path: root/Asm/x86
diff options
context:
space:
mode:
author Igor Pavlov <87184205+ip7z@users.noreply.github.com> 2024-05-14 00:00:00 +0000
committer Igor Pavlov <87184205+ip7z@users.noreply.github.com> 2024-05-15 23:55:04 +0500
commit fc662341e6f85da78ada0e443f6116b978f79f22 (patch)
tree 1be1cc402a7a9cbc18d4eeea6b141354c2d559e3 /Asm/x86
parent 5b39dc76f1bc82f941d5c800ab9f34407a06b53a (diff)
download 7zip-fc662341e6f85da78ada0e443f6116b978f79f22.tar.gz
7zip-fc662341e6f85da78ada0e443f6116b978f79f22.tar.bz2
7zip-fc662341e6f85da78ada0e443f6116b978f79f22.zip
24.0524.05
Diffstat (limited to 'Asm/x86')
-rw-r--r-- Asm/x86/7zAsm.asm 68
-rw-r--r-- Asm/x86/7zCrcOpt.asm 352
-rw-r--r-- Asm/x86/XzCrc64Opt.asm 632
3 files changed, 733 insertions, 319 deletions
diff --git a/Asm/x86/7zAsm.asm b/Asm/x86/7zAsm.asm
index 19c40da..8910d16 100644
--- a/Asm/x86/7zAsm.asm
+++ b/Asm/x86/7zAsm.asm
@@ -1,5 +1,5 @@
1; 7zAsm.asm -- ASM macros 1; 7zAsm.asm -- ASM macros
2; 2022-05-16 : Igor Pavlov : Public domain 2; 2023-12-08 : Igor Pavlov : Public domain
3 3
4 4
5; UASM can require these changes 5; UASM can require these changes
@@ -43,7 +43,7 @@ else
43endif 43endif
44endif 44endif
45 45
46OPTION PROLOGUE:NONE 46OPTION PROLOGUE:NONE
47OPTION EPILOGUE:NONE 47OPTION EPILOGUE:NONE
48 48
49MY_ASM_START macro 49MY_ASM_START macro
@@ -121,10 +121,29 @@ endif
121 x2_H equ DH 121 x2_H equ DH
122 x3_H equ BH 122 x3_H equ BH
123 123
124; r0_L equ AL
125; r1_L equ CL
126; r2_L equ DL
127; r3_L equ BL
128
129; r0_H equ AH
130; r1_H equ CH
131; r2_H equ DH
132; r3_H equ BH
133
134
124ifdef x64 135ifdef x64
125 x5_L equ BPL 136 x5_L equ BPL
126 x6_L equ SIL 137 x6_L equ SIL
127 x7_L equ DIL 138 x7_L equ DIL
139 x8_L equ r8b
140 x9_L equ r9b
141 x10_L equ r10b
142 x11_L equ r11b
143 x12_L equ r12b
144 x13_L equ r13b
145 x14_L equ r14b
146 x15_L equ r15b
128 147
129 r0 equ RAX 148 r0 equ RAX
130 r1 equ RCX 149 r1 equ RCX
@@ -153,6 +172,22 @@ else
153 r7 equ x7 172 r7 equ x7
154endif 173endif
155 174
175 x0_R equ r0
176 x1_R equ r1
177 x2_R equ r2
178 x3_R equ r3
179 x4_R equ r4
180 x5_R equ r5
181 x6_R equ r6
182 x7_R equ r7
183 x8_R equ r8
184 x9_R equ r9
185 x10_R equ r10
186 x11_R equ r11
187 x12_R equ r12
188 x13_R equ r13
189 x14_R equ r14
190 x15_R equ r15
156 191
157ifdef x64 192ifdef x64
158ifdef ABI_LINUX 193ifdef ABI_LINUX
@@ -200,6 +235,14 @@ REG_ABI_PARAM_0 equ REG_PARAM_0
200REG_ABI_PARAM_1_x equ REG_PARAM_1_x 235REG_ABI_PARAM_1_x equ REG_PARAM_1_x
201REG_ABI_PARAM_1 equ REG_PARAM_1 236REG_ABI_PARAM_1 equ REG_PARAM_1
202 237
238MY_PUSH_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11 macro
239 MY_PUSH_4_REGS
240endm
241
242MY_POP_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11 macro
243 MY_POP_4_REGS
244endm
245
203else 246else
204; x64 247; x64
205 248
@@ -261,12 +304,25 @@ endm
261endif ; IS_LINUX 304endif ; IS_LINUX
262 305
263 306
264MY_PUSH_PRESERVED_ABI_REGS macro 307MY_PUSH_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11 macro
265 if (IS_LINUX gt 0) 308 if (IS_LINUX gt 0)
266 MY_PUSH_2_REGS 309 MY_PUSH_2_REGS
267 else 310 else
268 MY_PUSH_4_REGS 311 MY_PUSH_4_REGS
269 endif 312 endif
313endm
314
315MY_POP_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11 macro
316 if (IS_LINUX gt 0)
317 MY_POP_2_REGS
318 else
319 MY_POP_4_REGS
320 endif
321endm
322
323
324MY_PUSH_PRESERVED_ABI_REGS macro
325 MY_PUSH_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
270 push r12 326 push r12
271 push r13 327 push r13
272 push r14 328 push r14
@@ -279,11 +335,7 @@ MY_POP_PRESERVED_ABI_REGS macro
279 pop r14 335 pop r14
280 pop r13 336 pop r13
281 pop r12 337 pop r12
282 if (IS_LINUX gt 0) 338 MY_POP_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
283 MY_POP_2_REGS
284 else
285 MY_POP_4_REGS
286 endif
287endm 339endm
288 340
289endif ; x64 341endif ; x64
diff --git a/Asm/x86/7zCrcOpt.asm b/Asm/x86/7zCrcOpt.asm
index 0fee206..c5de808 100644
--- a/Asm/x86/7zCrcOpt.asm
+++ b/Asm/x86/7zCrcOpt.asm
@@ -1,180 +1,258 @@
1; 7zCrcOpt.asm -- CRC32 calculation : optimized version 1; 7zCrcOpt.asm -- CRC32 calculation : optimized version
2; 2021-02-07 : Igor Pavlov : Public domain 2; 2023-12-08 : Igor Pavlov : Public domain
3 3
4include 7zAsm.asm 4include 7zAsm.asm
5 5
6MY_ASM_START 6MY_ASM_START
7 7
8rD equ r2 8NUM_WORDS equ 3
9rN equ r7 9UNROLL_CNT equ 2
10rT equ r5
11 10
12ifdef x64 11if (NUM_WORDS lt 1) or (NUM_WORDS gt 64)
13 num_VAR equ r8 12.err <NUM_WORDS_IS_INCORRECT>
14 table_VAR equ r9 13endif
15else 14if (UNROLL_CNT lt 1)
16 if (IS_CDECL gt 0) 15.err <UNROLL_CNT_IS_INCORRECT>
17 crc_OFFS equ (REG_SIZE * 5)
18 data_OFFS equ (REG_SIZE + crc_OFFS)
19 size_OFFS equ (REG_SIZE + data_OFFS)
20 else
21 size_OFFS equ (REG_SIZE * 5)
22 endif
23 table_OFFS equ (REG_SIZE + size_OFFS)
24 num_VAR equ [r4 + size_OFFS]
25 table_VAR equ [r4 + table_OFFS]
26endif 16endif
27 17
28SRCDAT equ rD + rN * 1 + 4 * 18rD equ r2
19rD_x equ x2
20rN equ r7
21rT equ r5
22
23ifndef x64
24 if (IS_CDECL gt 0)
25 crc_OFFS equ (REG_SIZE * 5)
26 data_OFFS equ (REG_SIZE + crc_OFFS)
27 size_OFFS equ (REG_SIZE + data_OFFS)
28 else
29 size_OFFS equ (REG_SIZE * 5)
30 endif
31 table_OFFS equ (REG_SIZE + size_OFFS)
32endif
33
34; rN + rD is same speed as rD, but we reduce one instruction in loop
35SRCDAT_1 equ rN + rD * 1 + 1 *
36SRCDAT_4 equ rN + rD * 1 + 4 *
29 37
30CRC macro op:req, dest:req, src:req, t:req 38CRC macro op:req, dest:req, src:req, t:req
31 op dest, DWORD PTR [rT + src * 4 + 0400h * t] 39 op dest, dword ptr [rT + @CatStr(src, _R) * 4 + 0400h * (t)]
32endm 40endm
33 41
34CRC_XOR macro dest:req, src:req, t:req 42CRC_XOR macro dest:req, src:req, t:req
35 CRC xor, dest, src, t 43 CRC xor, dest, src, t
36endm 44endm
37 45
38CRC_MOV macro dest:req, src:req, t:req 46CRC_MOV macro dest:req, src:req, t:req
39 CRC mov, dest, src, t 47 CRC mov, dest, src, t
48endm
49
50MOVZXLO macro dest:req, src:req
51 movzx dest, @CatStr(src, _L)
52endm
53
54MOVZXHI macro dest:req, src:req
55 movzx dest, @CatStr(src, _H)
40endm 56endm
41 57
58; movzx x0, x0_L - is slow in some cpus (ivb), if same register for src and dest
59; movzx x3, x0_L sometimes is 0 cycles latency (not always)
60; movzx x3, x0_L sometimes is 0.5 cycles latency
61; movzx x3, x0_H is 2 cycles latency in some cpus
62
42CRC1b macro 63CRC1b macro
43 movzx x6, BYTE PTR [rD] 64 movzx x6, byte ptr [rD]
44 inc rD 65 MOVZXLO x3, x0
45 movzx x3, x0_L 66 inc rD
46 xor x6, x3 67 shr x0, 8
47 shr x0, 8 68 xor x6, x3
48 CRC xor, x0, r6, 0 69 CRC_XOR x0, x6, 0
49 dec rN 70 dec rN
71endm
72
73LOAD_1 macro dest:req, t:req, iter:req, index:req
74 movzx dest, byte ptr [SRCDAT_1 (4 * (NUM_WORDS - 1 - t + iter * NUM_WORDS) + index)]
75endm
76
77LOAD_2 macro dest:req, t:req, iter:req, index:req
78 movzx dest, word ptr [SRCDAT_1 (4 * (NUM_WORDS - 1 - t + iter * NUM_WORDS) + index)]
79endm
80
81CRC_QUAD macro nn, t:req, iter:req
82ifdef x64
83 ; paired memory loads give 1-3% speed gain, but it uses more registers
84 LOAD_2 x3, t, iter, 0
85 LOAD_2 x9, t, iter, 2
86 MOVZXLO x6, x3
87 shr x3, 8
88 CRC_XOR nn, x6, t * 4 + 3
89 MOVZXLO x6, x9
90 shr x9, 8
91 CRC_XOR nn, x3, t * 4 + 2
92 CRC_XOR nn, x6, t * 4 + 1
93 CRC_XOR nn, x9, t * 4 + 0
94elseif 0
95 LOAD_2 x3, t, iter, 0
96 MOVZXLO x6, x3
97 shr x3, 8
98 CRC_XOR nn, x6, t * 4 + 3
99 CRC_XOR nn, x3, t * 4 + 2
100 LOAD_2 x3, t, iter, 2
101 MOVZXLO x6, x3
102 shr x3, 8
103 CRC_XOR nn, x6, t * 4 + 1
104 CRC_XOR nn, x3, t * 4 + 0
105elseif 0
106 LOAD_1 x3, t, iter, 0
107 LOAD_1 x6, t, iter, 1
108 CRC_XOR nn, x3, t * 4 + 3
109 CRC_XOR nn, x6, t * 4 + 2
110 LOAD_1 x3, t, iter, 2
111 LOAD_1 x6, t, iter, 3
112 CRC_XOR nn, x3, t * 4 + 1
113 CRC_XOR nn, x6, t * 4 + 0
114else
115 ; 32-bit load is better if there is only one read port (core2)
116 ; but that code can be slower if there are 2 read ports (snb)
117 mov x3, dword ptr [SRCDAT_1 (4 * (NUM_WORDS - 1 - t + iter * NUM_WORDS) + 0)]
118 MOVZXLO x6, x3
119 CRC_XOR nn, x6, t * 4 + 3
120 MOVZXHI x6, x3
121 shr x3, 16
122 CRC_XOR nn, x6, t * 4 + 2
123 MOVZXLO x6, x3
124 shr x3, 8
125 CRC_XOR nn, x6, t * 4 + 1
126 CRC_XOR nn, x3, t * 4 + 0
127endif
50endm 128endm
51 129
52MY_PROLOG macro crc_end:req
53 130
131LAST equ (4 * (NUM_WORDS - 1))
132
133CRC_ITER macro qq, nn, iter
134 mov nn, [SRCDAT_4 (NUM_WORDS * (1 + iter))]
135
136 i = 0
137 rept NUM_WORDS - 1
138 CRC_QUAD nn, i, iter
139 i = i + 1
140 endm
141
142 MOVZXLO x6, qq
143 mov x3, qq
144 shr x3, 24
145 CRC_XOR nn, x6, LAST + 3
146 CRC_XOR nn, x3, LAST + 0
147 ror qq, 16
148 MOVZXLO x6, qq
149 shr qq, 24
150 CRC_XOR nn, x6, LAST + 1
151if ((UNROLL_CNT and 1) eq 1) and (iter eq (UNROLL_CNT - 1))
152 CRC_MOV qq, qq, LAST + 2
153 xor qq, nn
154else
155 CRC_XOR nn, qq, LAST + 2
156endif
157endm
158
159
160; + 4 for prefetching next 4-bytes after current iteration
161NUM_BYTES_LIMIT equ (NUM_WORDS * 4 * UNROLL_CNT + 4)
162ALIGN_MASK equ 3
163
164
165; MY_PROC @CatStr(CrcUpdateT, 12), 4
166MY_PROC @CatStr(CrcUpdateT, %(NUM_WORDS * 4)), 4
167 MY_PUSH_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
54 ifdef x64 168 ifdef x64
169 mov x0, REG_ABI_PARAM_0_x ; x0 = x1(win) / x7(linux)
170 mov rT, REG_ABI_PARAM_3 ; r5 = r9(win) / x1(linux)
171 mov rN, REG_ABI_PARAM_2 ; r7 = r8(win) / r2(linux)
172 ; mov rD, REG_ABI_PARAM_1 ; r2 = r2(win)
55 if (IS_LINUX gt 0) 173 if (IS_LINUX gt 0)
56 MY_PUSH_2_REGS
57 mov x0, REG_ABI_PARAM_0_x ; x0 = x7
58 mov rT, REG_ABI_PARAM_3 ; r5 = r1
59 mov rN, REG_ABI_PARAM_2 ; r7 = r2
60 mov rD, REG_ABI_PARAM_1 ; r2 = r6 174 mov rD, REG_ABI_PARAM_1 ; r2 = r6
61 else
62 MY_PUSH_4_REGS
63 mov x0, REG_ABI_PARAM_0_x ; x0 = x1
64 mov rT, REG_ABI_PARAM_3 ; r5 = r9
65 mov rN, REG_ABI_PARAM_2 ; r7 = r8
66 ; mov rD, REG_ABI_PARAM_1 ; r2 = r2
67 endif 175 endif
68 else 176 else
69 MY_PUSH_4_REGS
70 if (IS_CDECL gt 0) 177 if (IS_CDECL gt 0)
71 mov x0, [r4 + crc_OFFS] 178 mov x0, [r4 + crc_OFFS]
72 mov rD, [r4 + data_OFFS] 179 mov rD, [r4 + data_OFFS]
73 else 180 else
74 mov x0, REG_ABI_PARAM_0_x 181 mov x0, REG_ABI_PARAM_0_x
75 endif 182 endif
76 mov rN, num_VAR 183 mov rN, [r4 + size_OFFS]
77 mov rT, table_VAR 184 mov rT, [r4 + table_OFFS]
78 endif 185 endif
79 186
80 test rN, rN 187 cmp rN, NUM_BYTES_LIMIT + ALIGN_MASK
81 jz crc_end 188 jb crc_end
82 @@: 189@@:
83 test rD, 7 190 test rD_x, ALIGN_MASK ; test rD, ALIGN_MASK
84 jz @F 191 jz @F
85 CRC1b 192 CRC1b
86 jnz @B 193 jmp @B
87 @@: 194@@:
88 cmp rN, 16 195 xor x0, dword ptr [rD]
89 jb crc_end 196 lea rN, [rD + rN * 1 - (NUM_BYTES_LIMIT - 1)]
90 add rN, rD 197 sub rD, rN
91 mov num_VAR, rN
92 sub rN, 8
93 and rN, NOT 7
94 sub rD, rN
95 xor x0, [SRCDAT 0]
96endm
97 198
98MY_EPILOG macro crc_end:req 199align 16
99 xor x0, [SRCDAT 0] 200@@:
100 mov rD, rN 201unr_index = 0
101 mov rN, num_VAR 202while unr_index lt UNROLL_CNT
102 sub rN, rD 203 if (unr_index and 1) eq 0
103 crc_end: 204 CRC_ITER x0, x1, unr_index
104 test rN, rN 205 else
105 jz @F 206 CRC_ITER x1, x0, unr_index
106 CRC1b 207 endif
107 jmp crc_end 208 unr_index = unr_index + 1
108 @@:
109 if (IS_X64 gt 0) and (IS_LINUX gt 0)
110 MY_POP_2_REGS
111 else
112 MY_POP_4_REGS
113 endif
114endm 209endm
115 210
116MY_PROC CrcUpdateT8, 4 211 add rD, NUM_WORDS * 4 * UNROLL_CNT
117 MY_PROLOG crc_end_8 212 jnc @B
118 mov x1, [SRCDAT 1] 213
119 align 16 214if 0
120 main_loop_8: 215 ; byte verson
121 mov x6, [SRCDAT 2] 216 add rD, rN
122 movzx x3, x1_L 217 xor x0, dword ptr [rD]
123 CRC_XOR x6, r3, 3 218 add rN, NUM_BYTES_LIMIT - 1
124 movzx x3, x1_H 219else
125 CRC_XOR x6, r3, 2 220 ; 4-byte version
126 shr x1, 16 221 add rN, 4 * NUM_WORDS * UNROLL_CNT
127 movzx x3, x1_L 222 sub rD, 4 * NUM_WORDS * UNROLL_CNT
128 movzx x1, x1_H 223@@:
129 CRC_XOR x6, r3, 1 224 MOVZXLO x3, x0
130 movzx x3, x0_L 225 MOVZXHI x1, x0
131 CRC_XOR x6, r1, 0 226 shr x0, 16
132 227 MOVZXLO x6, x0
133 mov x1, [SRCDAT 3] 228 shr x0, 8
134 CRC_XOR x6, r3, 7 229 CRC_MOV x0, x0, 0
135 movzx x3, x0_H 230 CRC_XOR x0, x3, 3
136 shr x0, 16 231 CRC_XOR x0, x1, 2
137 CRC_XOR x6, r3, 6 232 CRC_XOR x0, x6, 1
138 movzx x3, x0_L 233
139 CRC_XOR x6, r3, 5 234 add rD, 4
140 movzx x3, x0_H 235if (NUM_WORDS * UNROLL_CNT) ne 1
141 CRC_MOV x0, r3, 4 236 jc @F
142 xor x0, x6 237 xor x0, [SRCDAT_4 0]
143 add rD, 8 238 jmp @B
144 jnz main_loop_8 239@@:
145 240endif
146 MY_EPILOG crc_end_8 241 add rD, rN
147MY_ENDP 242 add rN, 4 - 1
243
244endif
245
246 sub rN, rD
247crc_end:
248 test rN, rN
249 jz func_end
250@@:
251 CRC1b
252 jnz @B
148 253
149MY_PROC CrcUpdateT4, 4 254func_end:
150 MY_PROLOG crc_end_4 255 MY_POP_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
151 align 16
152 main_loop_4:
153 movzx x1, x0_L
154 movzx x3, x0_H
155 shr x0, 16
156 movzx x6, x0_H
157 and x0, 0FFh
158 CRC_MOV x1, r1, 3
159 xor x1, [SRCDAT 1]
160 CRC_XOR x1, r3, 2
161 CRC_XOR x1, r6, 0
162 CRC_XOR x1, r0, 1
163
164 movzx x0, x1_L
165 movzx x3, x1_H
166 shr x1, 16
167 movzx x6, x1_H
168 and x1, 0FFh
169 CRC_MOV x0, r0, 3
170 xor x0, [SRCDAT 2]
171 CRC_XOR x0, r3, 2
172 CRC_XOR x0, r6, 0
173 CRC_XOR x0, r1, 1
174 add rD, 8
175 jnz main_loop_4
176
177 MY_EPILOG crc_end_4
178MY_ENDP 256MY_ENDP
179 257
180end 258end
diff --git a/Asm/x86/XzCrc64Opt.asm b/Asm/x86/XzCrc64Opt.asm
index ad22cc2..283424c 100644
--- a/Asm/x86/XzCrc64Opt.asm
+++ b/Asm/x86/XzCrc64Opt.asm
@@ -1,113 +1,231 @@
1; XzCrc64Opt.asm -- CRC64 calculation : optimized version 1; XzCrc64Opt.asm -- CRC64 calculation : optimized version
2; 2021-02-06 : Igor Pavlov : Public domain 2; 2023-12-08 : Igor Pavlov : Public domain
3 3
4include 7zAsm.asm 4include 7zAsm.asm
5 5
6MY_ASM_START 6MY_ASM_START
7 7
8NUM_WORDS equ 3
9
10if (NUM_WORDS lt 1) or (NUM_WORDS gt 64)
11.err <num_words_IS_INCORRECT>
12endif
13
14NUM_SKIP_BYTES equ ((NUM_WORDS - 2) * 4)
15
16
17MOVZXLO macro dest:req, src:req
18 movzx dest, @CatStr(src, _L)
19endm
20
21MOVZXHI macro dest:req, src:req
22 movzx dest, @CatStr(src, _H)
23endm
24
25
8ifdef x64 26ifdef x64
9 27
10rD equ r9 28rD equ r11
11rN equ r10 29rN equ r10
12rT equ r5 30rT equ r9
13num_VAR equ r8 31
14 32CRC_OP macro op:req, dest:req, src:req, t:req
15SRCDAT4 equ dword ptr [rD + rN * 1] 33 op dest, QWORD PTR [rT + @CatStr(src, _R) * 8 + 0800h * (t)]
34endm
16 35
17CRC_XOR macro dest:req, src:req, t:req 36CRC_XOR macro dest:req, src:req, t:req
18 xor dest, QWORD PTR [rT + src * 8 + 0800h * t] 37 CRC_OP xor, dest, src, t
38endm
39
40CRC_MOV macro dest:req, src:req, t:req
41 CRC_OP mov, dest, src, t
19endm 42endm
20 43
21CRC1b macro 44CRC1b macro
22 movzx x6, BYTE PTR [rD] 45 movzx x6, BYTE PTR [rD]
23 inc rD 46 inc rD
24 movzx x3, x0_L 47 MOVZXLO x3, x0
25 xor x6, x3 48 xor x6, x3
26 shr r0, 8 49 shr r0, 8
27 CRC_XOR r0, r6, 0 50 CRC_XOR r0, x6, 0
28 dec rN 51 dec rN
29endm
30
31MY_PROLOG macro crc_end:req
32 ifdef ABI_LINUX
33 MY_PUSH_2_REGS
34 else
35 MY_PUSH_4_REGS
36 endif
37 mov r0, REG_ABI_PARAM_0
38 mov rN, REG_ABI_PARAM_2
39 mov rT, REG_ABI_PARAM_3
40 mov rD, REG_ABI_PARAM_1
41 test rN, rN
42 jz crc_end
43 @@:
44 test rD, 3
45 jz @F
46 CRC1b
47 jnz @B
48 @@:
49 cmp rN, 8
50 jb crc_end
51 add rN, rD
52 mov num_VAR, rN
53 sub rN, 4
54 and rN, NOT 3
55 sub rD, rN
56 mov x1, SRCDAT4
57 xor r0, r1
58 add rN, 4
59endm
60
61MY_EPILOG macro crc_end:req
62 sub rN, 4
63 mov x1, SRCDAT4
64 xor r0, r1
65 mov rD, rN
66 mov rN, num_VAR
67 sub rN, rD
68 crc_end:
69 test rN, rN
70 jz @F
71 CRC1b
72 jmp crc_end
73 @@:
74 ifdef ABI_LINUX
75 MY_POP_2_REGS
76 else
77 MY_POP_4_REGS
78 endif
79endm 52endm
80 53
81MY_PROC XzCrc64UpdateT4, 4 54
82 MY_PROLOG crc_end_4 55; ALIGN_MASK is 3 or 7 bytes alignment:
83 align 16 56ALIGN_MASK equ (7 - (NUM_WORDS and 1) * 4)
84 main_loop_4: 57
85 mov x1, SRCDAT4 58if NUM_WORDS eq 1
86 movzx x2, x0_L 59
87 movzx x3, x0_H 60src_rN_offset equ 4
88 shr r0, 16 61; + 4 for prefetching next 4-bytes after current iteration
89 movzx x6, x0_L 62NUM_BYTES_LIMIT equ (NUM_WORDS * 4 + 4)
90 movzx x7, x0_H 63SRCDAT4 equ DWORD PTR [rN + rD * 1]
91 shr r0, 16 64
92 CRC_XOR r1, r2, 3 65XOR_NEXT macro
93 CRC_XOR r0, r3, 2 66 mov x1, [rD]
94 CRC_XOR r1, r6, 1 67 xor r0, r1
95 CRC_XOR r0, r7, 0 68endm
96 xor r0, r1 69
97 70else ; NUM_WORDS > 1
98 add rD, 4 71
99 jnz main_loop_4 72src_rN_offset equ 8
100 73; + 8 for prefetching next 8-bytes after current iteration
101 MY_EPILOG crc_end_4 74NUM_BYTES_LIMIT equ (NUM_WORDS * 4 + 8)
75
76XOR_NEXT macro
77 xor r0, QWORD PTR [rD] ; 64-bit read, can be unaligned
78endm
79
80; 32-bit or 64-bit
81LOAD_SRC_MULT4 macro dest:req, word_index:req
82 mov dest, [rN + rD * 1 + 4 * (word_index) - src_rN_offset];
83endm
84
85endif
86
87
88
89MY_PROC @CatStr(XzCrc64UpdateT, %(NUM_WORDS * 4)), 4
90 MY_PUSH_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
91
92 mov r0, REG_ABI_PARAM_0 ; r0 <- r1 / r7
93 mov rD, REG_ABI_PARAM_1 ; r11 <- r2 / r6
94 mov rN, REG_ABI_PARAM_2 ; r10 <- r8 / r2
95if (IS_LINUX gt 0)
96 mov rT, REG_ABI_PARAM_3 ; r9 <- r9 / r1
97endif
98
99 cmp rN, NUM_BYTES_LIMIT + ALIGN_MASK
100 jb crc_end
101@@:
102 test rD, ALIGN_MASK
103 jz @F
104 CRC1b
105 jmp @B
106@@:
107 XOR_NEXT
108 lea rN, [rD + rN * 1 - (NUM_BYTES_LIMIT - 1)]
109 sub rD, rN
110 add rN, src_rN_offset
111
112align 16
113@@:
114
115if NUM_WORDS eq 1
116
117 mov x1, x0
118 shr x1, 8
119 MOVZXLO x3, x1
120 MOVZXLO x2, x0
121 shr x1, 8
122 shr r0, 32
123 xor x0, SRCDAT4
124 CRC_XOR r0, x2, 3
125 CRC_XOR r0, x3, 2
126 MOVZXLO x2, x1
127 shr x1, 8
128 CRC_XOR r0, x2, 1
129 CRC_XOR r0, x1, 0
130
131else ; NUM_WORDS > 1
132
133if NUM_WORDS ne 2
134 k = 2
135 while k lt NUM_WORDS
136
137 LOAD_SRC_MULT4 x1, k
138 crc_op1 textequ <xor>
139
140 if k eq 2
141 if (NUM_WORDS and 1)
142 LOAD_SRC_MULT4 x7, NUM_WORDS ; aligned 32-bit
143 LOAD_SRC_MULT4 x6, NUM_WORDS + 1 ; aligned 32-bit
144 shl r6, 32
145 else
146 LOAD_SRC_MULT4 r6, NUM_WORDS ; aligned 64-bit
147 crc_op1 textequ <mov>
148 endif
149 endif
150 table = 4 * (NUM_WORDS - 1 - k)
151 MOVZXLO x3, x1
152 CRC_OP crc_op1, r7, x3, 3 + table
153 MOVZXHI x3, x1
154 shr x1, 16
155 CRC_XOR r6, x3, 2 + table
156 MOVZXLO x3, x1
157 shr x1, 8
158 CRC_XOR r7, x3, 1 + table
159 CRC_XOR r6, x1, 0 + table
160 k = k + 1
161 endm
162 crc_op2 textequ <xor>
163
164else ; NUM_WORDS == 2
165 LOAD_SRC_MULT4 r6, NUM_WORDS ; aligned 64-bit
166 crc_op2 textequ <mov>
167endif ; NUM_WORDS == 2
168
169 MOVZXHI x3, x0
170 MOVZXLO x2, x0
171 mov r1, r0
172 shr r1, 32
173 shr x0, 16
174 CRC_XOR r6, x2, NUM_SKIP_BYTES + 7
175 CRC_OP crc_op2, r7, x3, NUM_SKIP_BYTES + 6
176 MOVZXLO x2, x0
177 MOVZXHI x5, x1
178 MOVZXLO x3, x1
179 shr x0, 8
180 shr x1, 16
181 CRC_XOR r7, x2, NUM_SKIP_BYTES + 5
182 CRC_XOR r6, x3, NUM_SKIP_BYTES + 3
183 CRC_XOR r7, x0, NUM_SKIP_BYTES + 4
184 CRC_XOR r6, x5, NUM_SKIP_BYTES + 2
185 MOVZXLO x2, x1
186 shr x1, 8
187 CRC_XOR r7, x2, NUM_SKIP_BYTES + 1
188 CRC_MOV r0, x1, NUM_SKIP_BYTES + 0
189 xor r0, r6
190 xor r0, r7
191
192endif ; NUM_WORDS > 1
193 add rD, NUM_WORDS * 4
194 jnc @B
195
196 sub rN, src_rN_offset
197 add rD, rN
198 XOR_NEXT
199 add rN, NUM_BYTES_LIMIT - 1
200 sub rN, rD
201
202crc_end:
203 test rN, rN
204 jz func_end
205@@:
206 CRC1b
207 jnz @B
208func_end:
209 MY_POP_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
102MY_ENDP 210MY_ENDP
103 211
212
213
104else 214else
215; ==================================================================
105; x86 (32-bit) 216; x86 (32-bit)
106 217
107rD equ r1 218rD equ r7
108rN equ r7 219rN equ r1
109rT equ r5 220rT equ r5
110 221
222xA equ x6
223xA_R equ r6
224
225ifdef x64
226 num_VAR equ r8
227else
228
111crc_OFFS equ (REG_SIZE * 5) 229crc_OFFS equ (REG_SIZE * 5)
112 230
113if (IS_CDECL gt 0) or (IS_LINUX gt 0) 231if (IS_CDECL gt 0) or (IS_LINUX gt 0)
@@ -133,107 +251,273 @@ else
133 table_VAR equ [r4 + table_OFFS] 251 table_VAR equ [r4 + table_OFFS]
134 num_VAR equ table_VAR 252 num_VAR equ table_VAR
135endif 253endif
254endif ; x64
255
256SRCDAT4 equ DWORD PTR [rN + rD * 1]
136 257
137SRCDAT4 equ dword ptr [rD + rN * 1] 258CRC_1 macro op:req, dest:req, src:req, t:req, word_index:req
259 op dest, DWORD PTR [rT + @CatStr(src, _R) * 8 + 0800h * (t) + (word_index) * 4]
260endm
138 261
139CRC macro op0:req, op1:req, dest0:req, dest1:req, src:req, t:req 262CRC macro op0:req, op1:req, dest0:req, dest1:req, src:req, t:req
140 op0 dest0, DWORD PTR [rT + src * 8 + 0800h * t] 263 CRC_1 op0, dest0, src, t, 0
141 op1 dest1, DWORD PTR [rT + src * 8 + 0800h * t + 4] 264 CRC_1 op1, dest1, src, t, 1
142endm 265endm
143 266
144CRC_XOR macro dest0:req, dest1:req, src:req, t:req 267CRC_XOR macro dest0:req, dest1:req, src:req, t:req
145 CRC xor, xor, dest0, dest1, src, t 268 CRC xor, xor, dest0, dest1, src, t
146endm 269endm
147 270
148 271
149CRC1b macro 272CRC1b macro
150 movzx x6, BYTE PTR [rD] 273 movzx xA, BYTE PTR [rD]
151 inc rD 274 inc rD
152 movzx x3, x0_L 275 MOVZXLO x3, x0
153 xor x6, x3 276 xor xA, x3
154 shrd r0, r2, 8 277 shrd x0, x2, 8
155 shr r2, 8 278 shr x2, 8
156 CRC_XOR r0, r2, r6, 0 279 CRC_XOR x0, x2, xA, 0
157 dec rN 280 dec rN
158endm 281endm
159 282
160MY_PROLOG macro crc_end:req 283
161 MY_PUSH_4_REGS 284MY_PROLOG_BASE macro
162 285 MY_PUSH_4_REGS
163 if (IS_CDECL gt 0) or (IS_LINUX gt 0) 286ifdef x64
164 proc_numParams = proc_numParams + 2 ; for ABI_LINUX 287 mov r0, REG_ABI_PARAM_0 ; r0 <- r1 / r7
165 mov rN, [r4 + size_OFFS] 288 mov rT, REG_ABI_PARAM_3 ; r5 <- r9 / r1
166 mov rD, [r4 + data_OFFS] 289 mov rN, REG_ABI_PARAM_2 ; r1 <- r8 / r2
290 mov rD, REG_ABI_PARAM_1 ; r7 <- r2 / r6
291 mov r2, r0
292 shr r2, 32
293 mov x0, x0
294else
295 if (IS_CDECL gt 0) or (IS_LINUX gt 0)
296 proc_numParams = proc_numParams + 2 ; for ABI_LINUX
297 mov rN, [r4 + size_OFFS]
298 mov rD, [r4 + data_OFFS]
299 else
300 mov rD, REG_ABI_PARAM_0 ; r7 <- r1 : (data)
301 mov rN, REG_ABI_PARAM_1 ; r1 <- r2 : (size)
302 endif
303 mov x0, [r4 + crc_OFFS]
304 mov x2, [r4 + crc_OFFS + 4]
305 mov rT, table_VAR
306endif
307endm
308
309
310MY_EPILOG_BASE macro crc_end:req, func_end:req
311crc_end:
312 test rN, rN
313 jz func_end
314@@:
315 CRC1b
316 jnz @B
317func_end:
318ifdef x64
319 shl r2, 32
320 xor r0, r2
321endif
322 MY_POP_4_REGS
323endm
324
325
326; ALIGN_MASK is 3 or 7 bytes alignment:
327ALIGN_MASK equ (7 - (NUM_WORDS and 1) * 4)
328
329if (NUM_WORDS eq 1)
330
331NUM_BYTES_LIMIT_T4 equ (NUM_WORDS * 4 + 4)
332
333MY_PROC @CatStr(XzCrc64UpdateT, %(NUM_WORDS * 4)), 5
334 MY_PROLOG_BASE
335
336 cmp rN, NUM_BYTES_LIMIT_T4 + ALIGN_MASK
337 jb crc_end_4
338@@:
339 test rD, ALIGN_MASK
340 jz @F
341 CRC1b
342 jmp @B
343@@:
344 xor x0, [rD]
345 lea rN, [rD + rN * 1 - (NUM_BYTES_LIMIT_T4 - 1)]
346 sub rD, rN
347 add rN, 4
348
349 MOVZXLO xA, x0
350align 16
351@@:
352 mov x3, SRCDAT4
353 xor x3, x2
354 shr x0, 8
355 CRC xor, mov, x3, x2, xA, 3
356 MOVZXLO xA, x0
357 shr x0, 8
358 ; MOVZXHI xA, x0
359 ; shr x0, 16
360 CRC_XOR x3, x2, xA, 2
361
362 MOVZXLO xA, x0
363 shr x0, 8
364 CRC_XOR x3, x2, xA, 1
365 CRC_XOR x3, x2, x0, 0
366 MOVZXLO xA, x3
367 mov x0, x3
368
369 add rD, 4
370 jnc @B
371
372 sub rN, 4
373 add rD, rN
374 xor x0, [rD]
375 add rN, NUM_BYTES_LIMIT_T4 - 1
376 sub rN, rD
377 MY_EPILOG_BASE crc_end_4, func_end_4
378MY_ENDP
379
380else ; NUM_WORDS > 1
381
382SHR_X macro x, imm
383 shr x, imm
384endm
385
386
387ITER_1 macro v0, v1, a, off
388 MOVZXLO xA, a
389 SHR_X a, 8
390 CRC_XOR v0, v1, xA, off
391endm
392
393
394ITER_4 macro v0, v1, a, off
395if 0 eq 0
396 ITER_1 v0, v1, a, off + 3
397 ITER_1 v0, v1, a, off + 2
398 ITER_1 v0, v1, a, off + 1
399 CRC_XOR v0, v1, a, off
400elseif 0 eq 0
401 MOVZXLO xA, a
402 CRC_XOR v0, v1, xA, off + 3
403 mov xA, a
404 ror a, 16 ; 32-bit ror
405 shr xA, 24
406 CRC_XOR v0, v1, xA, off
407 MOVZXLO xA, a
408 SHR_X a, 24
409 CRC_XOR v0, v1, xA, off + 1
410 CRC_XOR v0, v1, a, off + 2
411else
412 ; MOVZXHI provides smaller code, but MOVZX_HI_BYTE is not fast instruction
413 MOVZXLO xA, a
414 CRC_XOR v0, v1, xA, off + 3
415 MOVZXHI xA, a
416 SHR_X a, 16
417 CRC_XOR v0, v1, xA, off + 2
418 MOVZXLO xA, a
419 SHR_X a, 8
420 CRC_XOR v0, v1, xA, off + 1
421 CRC_XOR v0, v1, a, off
422endif
423endm
424
425
426
427ITER_1_PAIR macro v0, v1, a0, a1, off
428 ITER_1 v0, v1, a0, off + 4
429 ITER_1 v0, v1, a1, off
430endm
431
432src_rD_offset equ 8
433STEP_SIZE equ (NUM_WORDS * 4)
434
435ITER_12_NEXT macro op, index, v0, v1
436 op v0, DWORD PTR [rD + (index + 1) * STEP_SIZE - src_rD_offset]
437 op v1, DWORD PTR [rD + (index + 1) * STEP_SIZE + 4 - src_rD_offset]
438endm
439
440ITER_12 macro index, a0, a1, v0, v1
441
442 if NUM_SKIP_BYTES eq 0
443 ITER_12_NEXT mov, index, v0, v1
167 else 444 else
168 mov rN, r2 445 k = 0
446 while k lt NUM_SKIP_BYTES
447 movzx xA, BYTE PTR [rD + (index) * STEP_SIZE + k + 8 - src_rD_offset]
448 if k eq 0
449 CRC mov, mov, v0, v1, xA, NUM_SKIP_BYTES - 1 - k
450 else
451 CRC_XOR v0, v1, xA, NUM_SKIP_BYTES - 1 - k
452 endif
453 k = k + 1
454 endm
455 ITER_12_NEXT xor, index, v0, v1
169 endif 456 endif
170 457
171 mov x0, [r4 + crc_OFFS] 458if 0 eq 0
172 mov x2, [r4 + crc_OFFS + 4] 459 ITER_4 v0, v1, a0, NUM_SKIP_BYTES + 4
173 mov rT, table_VAR 460 ITER_4 v0, v1, a1, NUM_SKIP_BYTES
174 test rN, rN 461else ; interleave version is faster/slower for different processors
175 jz crc_end 462 ITER_1_PAIR v0, v1, a0, a1, NUM_SKIP_BYTES + 3
176 @@: 463 ITER_1_PAIR v0, v1, a0, a1, NUM_SKIP_BYTES + 2
177 test rD, 3 464 ITER_1_PAIR v0, v1, a0, a1, NUM_SKIP_BYTES + 1
178 jz @F 465 CRC_XOR v0, v1, a0, NUM_SKIP_BYTES + 4
179 CRC1b 466 CRC_XOR v0, v1, a1, NUM_SKIP_BYTES
180 jnz @B 467endif
181 @@: 468endm
182 cmp rN, 8 469
183 jb crc_end 470; we use (UNROLL_CNT > 1) to reduce read ports pressure (num_VAR reads)
184 add rN, rD 471UNROLL_CNT equ (2 * 1)
185 472NUM_BYTES_LIMIT equ (STEP_SIZE * UNROLL_CNT + 8)
186 mov num_VAR, rN 473
187 474MY_PROC @CatStr(XzCrc64UpdateT, %(NUM_WORDS * 4)), 5
188 sub rN, 4 475 MY_PROLOG_BASE
189 and rN, NOT 3 476
190 sub rD, rN 477 cmp rN, NUM_BYTES_LIMIT + ALIGN_MASK
191 xor r0, SRCDAT4 478 jb crc_end_12
192 add rN, 4 479@@:
193endm 480 test rD, ALIGN_MASK
194 481 jz @F
195MY_EPILOG macro crc_end:req 482 CRC1b
196 sub rN, 4 483 jmp @B
197 xor r0, SRCDAT4 484@@:
198 485 xor x0, [rD]
199 mov rD, rN 486 xor x2, [rD + 4]
200 mov rN, num_VAR 487 add rD, src_rD_offset
201 sub rN, rD 488 lea rN, [rD + rN * 1 - (NUM_BYTES_LIMIT - 1)]
202 crc_end: 489 mov num_VAR, rN
203 test rN, rN 490
204 jz @F 491align 16
205 CRC1b 492@@:
206 jmp crc_end 493 i = 0
207 @@: 494 rept UNROLL_CNT
208 MY_POP_4_REGS 495 if (i and 1) eq 0
209endm 496 ITER_12 i, x0, x2, x1, x3
210 497 else
211MY_PROC XzCrc64UpdateT4, 5 498 ITER_12 i, x1, x3, x0, x2
212 MY_PROLOG crc_end_4 499 endif
213 movzx x6, x0_L 500 i = i + 1
214 align 16 501 endm
215 main_loop_4: 502
216 mov r3, SRCDAT4 503 if (UNROLL_CNT and 1)
217 xor r3, r2 504 mov x0, x1
218 505 mov x2, x3
219 CRC xor, mov, r3, r2, r6, 3 506 endif
220 movzx x6, x0_H 507 add rD, STEP_SIZE * UNROLL_CNT
221 shr r0, 16 508 cmp rD, num_VAR
222 CRC_XOR r3, r2, r6, 2 509 jb @B
223 510
224 movzx x6, x0_L 511 mov rN, num_VAR
225 movzx x0, x0_H 512 add rN, NUM_BYTES_LIMIT - 1
226 CRC_XOR r3, r2, r6, 1 513 sub rN, rD
227 CRC_XOR r3, r2, r0, 0 514 sub rD, src_rD_offset
228 movzx x6, x3_L 515 xor x0, [rD]
229 mov r0, r3 516 xor x2, [rD + 4]
230 517
231 add rD, 4 518 MY_EPILOG_BASE crc_end_12, func_end_12
232 jnz main_loop_4
233
234 MY_EPILOG crc_end_4
235MY_ENDP 519MY_ENDP
236 520
521endif ; (NUM_WORDS > 1)
237endif ; ! x64 522endif ; ! x64
238
239end 523end