diff options
author | Igor Pavlov <87184205+ip7z@users.noreply.github.com> | 2024-05-14 00:00:00 +0000 |
---|---|---|
committer | Igor Pavlov <87184205+ip7z@users.noreply.github.com> | 2024-05-15 23:55:04 +0500 |
commit | fc662341e6f85da78ada0e443f6116b978f79f22 (patch) | |
tree | 1be1cc402a7a9cbc18d4eeea6b141354c2d559e3 /Asm/x86/7zCrcOpt.asm | |
parent | 5b39dc76f1bc82f941d5c800ab9f34407a06b53a (diff) | |
download | 7zip-24.05.tar.gz 7zip-24.05.tar.bz2 7zip-24.05.zip |
24.0524.05
Diffstat (limited to 'Asm/x86/7zCrcOpt.asm')
-rw-r--r-- | Asm/x86/7zCrcOpt.asm | 352 |
1 files changed, 215 insertions, 137 deletions
diff --git a/Asm/x86/7zCrcOpt.asm b/Asm/x86/7zCrcOpt.asm index 0fee206..c5de808 100644 --- a/Asm/x86/7zCrcOpt.asm +++ b/Asm/x86/7zCrcOpt.asm | |||
@@ -1,180 +1,258 @@ | |||
1 | ; 7zCrcOpt.asm -- CRC32 calculation : optimized version | 1 | ; 7zCrcOpt.asm -- CRC32 calculation : optimized version |
2 | ; 2021-02-07 : Igor Pavlov : Public domain | 2 | ; 2023-12-08 : Igor Pavlov : Public domain |
3 | 3 | ||
4 | include 7zAsm.asm | 4 | include 7zAsm.asm |
5 | 5 | ||
6 | MY_ASM_START | 6 | MY_ASM_START |
7 | 7 | ||
8 | rD equ r2 | 8 | NUM_WORDS equ 3 |
9 | rN equ r7 | 9 | UNROLL_CNT equ 2 |
10 | rT equ r5 | ||
11 | 10 | ||
12 | ifdef x64 | 11 | if (NUM_WORDS lt 1) or (NUM_WORDS gt 64) |
13 | num_VAR equ r8 | 12 | .err <NUM_WORDS_IS_INCORRECT> |
14 | table_VAR equ r9 | 13 | endif |
15 | else | 14 | if (UNROLL_CNT lt 1) |
16 | if (IS_CDECL gt 0) | 15 | .err <UNROLL_CNT_IS_INCORRECT> |
17 | crc_OFFS equ (REG_SIZE * 5) | ||
18 | data_OFFS equ (REG_SIZE + crc_OFFS) | ||
19 | size_OFFS equ (REG_SIZE + data_OFFS) | ||
20 | else | ||
21 | size_OFFS equ (REG_SIZE * 5) | ||
22 | endif | ||
23 | table_OFFS equ (REG_SIZE + size_OFFS) | ||
24 | num_VAR equ [r4 + size_OFFS] | ||
25 | table_VAR equ [r4 + table_OFFS] | ||
26 | endif | 16 | endif |
27 | 17 | ||
28 | SRCDAT equ rD + rN * 1 + 4 * | 18 | rD equ r2 |
19 | rD_x equ x2 | ||
20 | rN equ r7 | ||
21 | rT equ r5 | ||
22 | |||
23 | ifndef x64 | ||
24 | if (IS_CDECL gt 0) | ||
25 | crc_OFFS equ (REG_SIZE * 5) | ||
26 | data_OFFS equ (REG_SIZE + crc_OFFS) | ||
27 | size_OFFS equ (REG_SIZE + data_OFFS) | ||
28 | else | ||
29 | size_OFFS equ (REG_SIZE * 5) | ||
30 | endif | ||
31 | table_OFFS equ (REG_SIZE + size_OFFS) | ||
32 | endif | ||
33 | |||
34 | ; rN + rD is same speed as rD, but we reduce one instruction in loop | ||
35 | SRCDAT_1 equ rN + rD * 1 + 1 * | ||
36 | SRCDAT_4 equ rN + rD * 1 + 4 * | ||
29 | 37 | ||
30 | CRC macro op:req, dest:req, src:req, t:req | 38 | CRC macro op:req, dest:req, src:req, t:req |
31 | op dest, DWORD PTR [rT + src * 4 + 0400h * t] | 39 | op dest, dword ptr [rT + @CatStr(src, _R) * 4 + 0400h * (t)] |
32 | endm | 40 | endm |
33 | 41 | ||
34 | CRC_XOR macro dest:req, src:req, t:req | 42 | CRC_XOR macro dest:req, src:req, t:req |
35 | CRC xor, dest, src, t | 43 | CRC xor, dest, src, t |
36 | endm | 44 | endm |
37 | 45 | ||
38 | CRC_MOV macro dest:req, src:req, t:req | 46 | CRC_MOV macro dest:req, src:req, t:req |
39 | CRC mov, dest, src, t | 47 | CRC mov, dest, src, t |
48 | endm | ||
49 | |||
50 | MOVZXLO macro dest:req, src:req | ||
51 | movzx dest, @CatStr(src, _L) | ||
52 | endm | ||
53 | |||
54 | MOVZXHI macro dest:req, src:req | ||
55 | movzx dest, @CatStr(src, _H) | ||
40 | endm | 56 | endm |
41 | 57 | ||
58 | ; movzx x0, x0_L - is slow in some cpus (ivb), if same register for src and dest | ||
59 | ; movzx x3, x0_L sometimes is 0 cycles latency (not always) | ||
60 | ; movzx x3, x0_L sometimes is 0.5 cycles latency | ||
61 | ; movzx x3, x0_H is 2 cycles latency in some cpus | ||
62 | |||
42 | CRC1b macro | 63 | CRC1b macro |
43 | movzx x6, BYTE PTR [rD] | 64 | movzx x6, byte ptr [rD] |
44 | inc rD | 65 | MOVZXLO x3, x0 |
45 | movzx x3, x0_L | 66 | inc rD |
46 | xor x6, x3 | 67 | shr x0, 8 |
47 | shr x0, 8 | 68 | xor x6, x3 |
48 | CRC xor, x0, r6, 0 | 69 | CRC_XOR x0, x6, 0 |
49 | dec rN | 70 | dec rN |
71 | endm | ||
72 | |||
73 | LOAD_1 macro dest:req, t:req, iter:req, index:req | ||
74 | movzx dest, byte ptr [SRCDAT_1 (4 * (NUM_WORDS - 1 - t + iter * NUM_WORDS) + index)] | ||
75 | endm | ||
76 | |||
77 | LOAD_2 macro dest:req, t:req, iter:req, index:req | ||
78 | movzx dest, word ptr [SRCDAT_1 (4 * (NUM_WORDS - 1 - t + iter * NUM_WORDS) + index)] | ||
79 | endm | ||
80 | |||
81 | CRC_QUAD macro nn, t:req, iter:req | ||
82 | ifdef x64 | ||
83 | ; paired memory loads give 1-3% speed gain, but it uses more registers | ||
84 | LOAD_2 x3, t, iter, 0 | ||
85 | LOAD_2 x9, t, iter, 2 | ||
86 | MOVZXLO x6, x3 | ||
87 | shr x3, 8 | ||
88 | CRC_XOR nn, x6, t * 4 + 3 | ||
89 | MOVZXLO x6, x9 | ||
90 | shr x9, 8 | ||
91 | CRC_XOR nn, x3, t * 4 + 2 | ||
92 | CRC_XOR nn, x6, t * 4 + 1 | ||
93 | CRC_XOR nn, x9, t * 4 + 0 | ||
94 | elseif 0 | ||
95 | LOAD_2 x3, t, iter, 0 | ||
96 | MOVZXLO x6, x3 | ||
97 | shr x3, 8 | ||
98 | CRC_XOR nn, x6, t * 4 + 3 | ||
99 | CRC_XOR nn, x3, t * 4 + 2 | ||
100 | LOAD_2 x3, t, iter, 2 | ||
101 | MOVZXLO x6, x3 | ||
102 | shr x3, 8 | ||
103 | CRC_XOR nn, x6, t * 4 + 1 | ||
104 | CRC_XOR nn, x3, t * 4 + 0 | ||
105 | elseif 0 | ||
106 | LOAD_1 x3, t, iter, 0 | ||
107 | LOAD_1 x6, t, iter, 1 | ||
108 | CRC_XOR nn, x3, t * 4 + 3 | ||
109 | CRC_XOR nn, x6, t * 4 + 2 | ||
110 | LOAD_1 x3, t, iter, 2 | ||
111 | LOAD_1 x6, t, iter, 3 | ||
112 | CRC_XOR nn, x3, t * 4 + 1 | ||
113 | CRC_XOR nn, x6, t * 4 + 0 | ||
114 | else | ||
115 | ; 32-bit load is better if there is only one read port (core2) | ||
116 | ; but that code can be slower if there are 2 read ports (snb) | ||
117 | mov x3, dword ptr [SRCDAT_1 (4 * (NUM_WORDS - 1 - t + iter * NUM_WORDS) + 0)] | ||
118 | MOVZXLO x6, x3 | ||
119 | CRC_XOR nn, x6, t * 4 + 3 | ||
120 | MOVZXHI x6, x3 | ||
121 | shr x3, 16 | ||
122 | CRC_XOR nn, x6, t * 4 + 2 | ||
123 | MOVZXLO x6, x3 | ||
124 | shr x3, 8 | ||
125 | CRC_XOR nn, x6, t * 4 + 1 | ||
126 | CRC_XOR nn, x3, t * 4 + 0 | ||
127 | endif | ||
50 | endm | 128 | endm |
51 | 129 | ||
52 | MY_PROLOG macro crc_end:req | ||
53 | 130 | ||
131 | LAST equ (4 * (NUM_WORDS - 1)) | ||
132 | |||
133 | CRC_ITER macro qq, nn, iter | ||
134 | mov nn, [SRCDAT_4 (NUM_WORDS * (1 + iter))] | ||
135 | |||
136 | i = 0 | ||
137 | rept NUM_WORDS - 1 | ||
138 | CRC_QUAD nn, i, iter | ||
139 | i = i + 1 | ||
140 | endm | ||
141 | |||
142 | MOVZXLO x6, qq | ||
143 | mov x3, qq | ||
144 | shr x3, 24 | ||
145 | CRC_XOR nn, x6, LAST + 3 | ||
146 | CRC_XOR nn, x3, LAST + 0 | ||
147 | ror qq, 16 | ||
148 | MOVZXLO x6, qq | ||
149 | shr qq, 24 | ||
150 | CRC_XOR nn, x6, LAST + 1 | ||
151 | if ((UNROLL_CNT and 1) eq 1) and (iter eq (UNROLL_CNT - 1)) | ||
152 | CRC_MOV qq, qq, LAST + 2 | ||
153 | xor qq, nn | ||
154 | else | ||
155 | CRC_XOR nn, qq, LAST + 2 | ||
156 | endif | ||
157 | endm | ||
158 | |||
159 | |||
160 | ; + 4 for prefetching next 4-bytes after current iteration | ||
161 | NUM_BYTES_LIMIT equ (NUM_WORDS * 4 * UNROLL_CNT + 4) | ||
162 | ALIGN_MASK equ 3 | ||
163 | |||
164 | |||
165 | ; MY_PROC @CatStr(CrcUpdateT, 12), 4 | ||
166 | MY_PROC @CatStr(CrcUpdateT, %(NUM_WORDS * 4)), 4 | ||
167 | MY_PUSH_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11 | ||
54 | ifdef x64 | 168 | ifdef x64 |
169 | mov x0, REG_ABI_PARAM_0_x ; x0 = x1(win) / x7(linux) | ||
170 | mov rT, REG_ABI_PARAM_3 ; r5 = r9(win) / r1(linux) | ||
171 | mov rN, REG_ABI_PARAM_2 ; r7 = r8(win) / r2(linux) | ||
172 | ; mov rD, REG_ABI_PARAM_1 ; r2 = r2(win) | ||
55 | if (IS_LINUX gt 0) | 173 | if (IS_LINUX gt 0) |
56 | MY_PUSH_2_REGS | ||
57 | mov x0, REG_ABI_PARAM_0_x ; x0 = x7 | ||
58 | mov rT, REG_ABI_PARAM_3 ; r5 = r1 | ||
59 | mov rN, REG_ABI_PARAM_2 ; r7 = r2 | ||
60 | mov rD, REG_ABI_PARAM_1 ; r2 = r6 | 174 | mov rD, REG_ABI_PARAM_1 ; r2 = r6 |
61 | else | ||
62 | MY_PUSH_4_REGS | ||
63 | mov x0, REG_ABI_PARAM_0_x ; x0 = x1 | ||
64 | mov rT, REG_ABI_PARAM_3 ; r5 = r9 | ||
65 | mov rN, REG_ABI_PARAM_2 ; r7 = r8 | ||
66 | ; mov rD, REG_ABI_PARAM_1 ; r2 = r2 | ||
67 | endif | 175 | endif |
68 | else | 176 | else |
69 | MY_PUSH_4_REGS | ||
70 | if (IS_CDECL gt 0) | 177 | if (IS_CDECL gt 0) |
71 | mov x0, [r4 + crc_OFFS] | 178 | mov x0, [r4 + crc_OFFS] |
72 | mov rD, [r4 + data_OFFS] | 179 | mov rD, [r4 + data_OFFS] |
73 | else | 180 | else |
74 | mov x0, REG_ABI_PARAM_0_x | 181 | mov x0, REG_ABI_PARAM_0_x |
75 | endif | 182 | endif |
76 | mov rN, num_VAR | 183 | mov rN, [r4 + size_OFFS] |
77 | mov rT, table_VAR | 184 | mov rT, [r4 + table_OFFS] |
78 | endif | 185 | endif |
79 | 186 | ||
80 | test rN, rN | 187 | cmp rN, NUM_BYTES_LIMIT + ALIGN_MASK |
81 | jz crc_end | 188 | jb crc_end |
82 | @@: | 189 | @@: |
83 | test rD, 7 | 190 | test rD_x, ALIGN_MASK ; test rD, ALIGN_MASK |
84 | jz @F | 191 | jz @F |
85 | CRC1b | 192 | CRC1b |
86 | jnz @B | 193 | jmp @B |
87 | @@: | 194 | @@: |
88 | cmp rN, 16 | 195 | xor x0, dword ptr [rD] |
89 | jb crc_end | 196 | lea rN, [rD + rN * 1 - (NUM_BYTES_LIMIT - 1)] |
90 | add rN, rD | 197 | sub rD, rN |
91 | mov num_VAR, rN | ||
92 | sub rN, 8 | ||
93 | and rN, NOT 7 | ||
94 | sub rD, rN | ||
95 | xor x0, [SRCDAT 0] | ||
96 | endm | ||
97 | 198 | ||
98 | MY_EPILOG macro crc_end:req | 199 | align 16 |
99 | xor x0, [SRCDAT 0] | 200 | @@: |
100 | mov rD, rN | 201 | unr_index = 0 |
101 | mov rN, num_VAR | 202 | while unr_index lt UNROLL_CNT |
102 | sub rN, rD | 203 | if (unr_index and 1) eq 0 |
103 | crc_end: | 204 | CRC_ITER x0, x1, unr_index |
104 | test rN, rN | 205 | else |
105 | jz @F | 206 | CRC_ITER x1, x0, unr_index |
106 | CRC1b | 207 | endif |
107 | jmp crc_end | 208 | unr_index = unr_index + 1 |
108 | @@: | ||
109 | if (IS_X64 gt 0) and (IS_LINUX gt 0) | ||
110 | MY_POP_2_REGS | ||
111 | else | ||
112 | MY_POP_4_REGS | ||
113 | endif | ||
114 | endm | 209 | endm |
115 | 210 | ||
116 | MY_PROC CrcUpdateT8, 4 | 211 | add rD, NUM_WORDS * 4 * UNROLL_CNT |
117 | MY_PROLOG crc_end_8 | 212 | jnc @B |
118 | mov x1, [SRCDAT 1] | 213 | |
119 | align 16 | 214 | if 0 |
120 | main_loop_8: | 215 | ; byte version |
121 | mov x6, [SRCDAT 2] | 216 | add rD, rN |
122 | movzx x3, x1_L | 217 | xor x0, dword ptr [rD] |
123 | CRC_XOR x6, r3, 3 | 218 | add rN, NUM_BYTES_LIMIT - 1 |
124 | movzx x3, x1_H | 219 | else |
125 | CRC_XOR x6, r3, 2 | 220 | ; 4-byte version |
126 | shr x1, 16 | 221 | add rN, 4 * NUM_WORDS * UNROLL_CNT |
127 | movzx x3, x1_L | 222 | sub rD, 4 * NUM_WORDS * UNROLL_CNT |
128 | movzx x1, x1_H | 223 | @@: |
129 | CRC_XOR x6, r3, 1 | 224 | MOVZXLO x3, x0 |
130 | movzx x3, x0_L | 225 | MOVZXHI x1, x0 |
131 | CRC_XOR x6, r1, 0 | 226 | shr x0, 16 |
132 | 227 | MOVZXLO x6, x0 | |
133 | mov x1, [SRCDAT 3] | 228 | shr x0, 8 |
134 | CRC_XOR x6, r3, 7 | 229 | CRC_MOV x0, x0, 0 |
135 | movzx x3, x0_H | 230 | CRC_XOR x0, x3, 3 |
136 | shr x0, 16 | 231 | CRC_XOR x0, x1, 2 |
137 | CRC_XOR x6, r3, 6 | 232 | CRC_XOR x0, x6, 1 |
138 | movzx x3, x0_L | 233 | |
139 | CRC_XOR x6, r3, 5 | 234 | add rD, 4 |
140 | movzx x3, x0_H | 235 | if (NUM_WORDS * UNROLL_CNT) ne 1 |
141 | CRC_MOV x0, r3, 4 | 236 | jc @F |
142 | xor x0, x6 | 237 | xor x0, [SRCDAT_4 0] |
143 | add rD, 8 | 238 | jmp @B |
144 | jnz main_loop_8 | 239 | @@: |
145 | 240 | endif | |
146 | MY_EPILOG crc_end_8 | 241 | add rD, rN |
147 | MY_ENDP | 242 | add rN, 4 - 1 |
243 | |||
244 | endif | ||
245 | |||
246 | sub rN, rD | ||
247 | crc_end: | ||
248 | test rN, rN | ||
249 | jz func_end | ||
250 | @@: | ||
251 | CRC1b | ||
252 | jnz @B | ||
148 | 253 | ||
149 | MY_PROC CrcUpdateT4, 4 | 254 | func_end: |
150 | MY_PROLOG crc_end_4 | 255 | MY_POP_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11 |
151 | align 16 | ||
152 | main_loop_4: | ||
153 | movzx x1, x0_L | ||
154 | movzx x3, x0_H | ||
155 | shr x0, 16 | ||
156 | movzx x6, x0_H | ||
157 | and x0, 0FFh | ||
158 | CRC_MOV x1, r1, 3 | ||
159 | xor x1, [SRCDAT 1] | ||
160 | CRC_XOR x1, r3, 2 | ||
161 | CRC_XOR x1, r6, 0 | ||
162 | CRC_XOR x1, r0, 1 | ||
163 | |||
164 | movzx x0, x1_L | ||
165 | movzx x3, x1_H | ||
166 | shr x1, 16 | ||
167 | movzx x6, x1_H | ||
168 | and x1, 0FFh | ||
169 | CRC_MOV x0, r0, 3 | ||
170 | xor x0, [SRCDAT 2] | ||
171 | CRC_XOR x0, r3, 2 | ||
172 | CRC_XOR x0, r6, 0 | ||
173 | CRC_XOR x0, r1, 1 | ||
174 | add rD, 8 | ||
175 | jnz main_loop_4 | ||
176 | |||
177 | MY_EPILOG crc_end_4 | ||
178 | MY_ENDP | 256 | MY_ENDP |
179 | 257 | ||
180 | end | 258 | end |