aboutsummaryrefslogtreecommitdiff
path: root/Asm/x86/7zCrcOpt.asm
diff options
context:
space:
mode:
authorIgor Pavlov <87184205+ip7z@users.noreply.github.com>2024-05-14 00:00:00 +0000
committerIgor Pavlov <87184205+ip7z@users.noreply.github.com>2024-05-15 23:55:04 +0500
commitfc662341e6f85da78ada0e443f6116b978f79f22 (patch)
tree1be1cc402a7a9cbc18d4eeea6b141354c2d559e3 /Asm/x86/7zCrcOpt.asm
parent5b39dc76f1bc82f941d5c800ab9f34407a06b53a (diff)
download7zip-24.05.tar.gz
7zip-24.05.tar.bz2
7zip-24.05.zip
24.0524.05
Diffstat (limited to 'Asm/x86/7zCrcOpt.asm')
-rw-r--r--Asm/x86/7zCrcOpt.asm352
1 files changed, 215 insertions, 137 deletions
diff --git a/Asm/x86/7zCrcOpt.asm b/Asm/x86/7zCrcOpt.asm
index 0fee206..c5de808 100644
--- a/Asm/x86/7zCrcOpt.asm
+++ b/Asm/x86/7zCrcOpt.asm
@@ -1,180 +1,258 @@
1; 7zCrcOpt.asm -- CRC32 calculation : optimized version 1; 7zCrcOpt.asm -- CRC32 calculation : optimized version
2; 2021-02-07 : Igor Pavlov : Public domain 2; 2023-12-08 : Igor Pavlov : Public domain
3 3
4include 7zAsm.asm 4include 7zAsm.asm
5 5
6MY_ASM_START 6MY_ASM_START
7 7
8rD equ r2 8NUM_WORDS equ 3
9rN equ r7 9UNROLL_CNT equ 2
10rT equ r5
11 10
12ifdef x64 11if (NUM_WORDS lt 1) or (NUM_WORDS gt 64)
13 num_VAR equ r8 12.err <NUM_WORDS_IS_INCORRECT>
14 table_VAR equ r9 13endif
15else 14if (UNROLL_CNT lt 1)
16 if (IS_CDECL gt 0) 15.err <UNROLL_CNT_IS_INCORRECT>
17 crc_OFFS equ (REG_SIZE * 5)
18 data_OFFS equ (REG_SIZE + crc_OFFS)
19 size_OFFS equ (REG_SIZE + data_OFFS)
20 else
21 size_OFFS equ (REG_SIZE * 5)
22 endif
23 table_OFFS equ (REG_SIZE + size_OFFS)
24 num_VAR equ [r4 + size_OFFS]
25 table_VAR equ [r4 + table_OFFS]
26endif 16endif
27 17
28SRCDAT equ rD + rN * 1 + 4 * 18rD equ r2
19rD_x equ x2
20rN equ r7
21rT equ r5
22
23ifndef x64
24 if (IS_CDECL gt 0)
25 crc_OFFS equ (REG_SIZE * 5)
26 data_OFFS equ (REG_SIZE + crc_OFFS)
27 size_OFFS equ (REG_SIZE + data_OFFS)
28 else
29 size_OFFS equ (REG_SIZE * 5)
30 endif
31 table_OFFS equ (REG_SIZE + size_OFFS)
32endif
33
34; rN + rD is same speed as rD, but we reduce one instruction in loop
35SRCDAT_1 equ rN + rD * 1 + 1 *
36SRCDAT_4 equ rN + rD * 1 + 4 *
29 37
30CRC macro op:req, dest:req, src:req, t:req 38CRC macro op:req, dest:req, src:req, t:req
31 op dest, DWORD PTR [rT + src * 4 + 0400h * t] 39 op dest, dword ptr [rT + @CatStr(src, _R) * 4 + 0400h * (t)]
32endm 40endm
33 41
34CRC_XOR macro dest:req, src:req, t:req 42CRC_XOR macro dest:req, src:req, t:req
35 CRC xor, dest, src, t 43 CRC xor, dest, src, t
36endm 44endm
37 45
38CRC_MOV macro dest:req, src:req, t:req 46CRC_MOV macro dest:req, src:req, t:req
39 CRC mov, dest, src, t 47 CRC mov, dest, src, t
48endm
49
50MOVZXLO macro dest:req, src:req
51 movzx dest, @CatStr(src, _L)
52endm
53
54MOVZXHI macro dest:req, src:req
55 movzx dest, @CatStr(src, _H)
40endm 56endm
41 57
58; movzx x0, x0_L is slow on some CPUs (ivb) when src and dest are the same register
59; movzx x3, x0_L sometimes has 0 cycles of latency (not always)
60; movzx x3, x0_L sometimes has 0.5 cycles of latency
61; movzx x3, x0_H has 2 cycles of latency on some CPUs
62
42CRC1b macro 63CRC1b macro
43 movzx x6, BYTE PTR [rD] 64 movzx x6, byte ptr [rD]
44 inc rD 65 MOVZXLO x3, x0
45 movzx x3, x0_L 66 inc rD
46 xor x6, x3 67 shr x0, 8
47 shr x0, 8 68 xor x6, x3
48 CRC xor, x0, r6, 0 69 CRC_XOR x0, x6, 0
49 dec rN 70 dec rN
71endm
72
73LOAD_1 macro dest:req, t:req, iter:req, index:req
74 movzx dest, byte ptr [SRCDAT_1 (4 * (NUM_WORDS - 1 - t + iter * NUM_WORDS) + index)]
75endm
76
77LOAD_2 macro dest:req, t:req, iter:req, index:req
78 movzx dest, word ptr [SRCDAT_1 (4 * (NUM_WORDS - 1 - t + iter * NUM_WORDS) + index)]
79endm
80
81CRC_QUAD macro nn, t:req, iter:req
82ifdef x64
83 ; paired memory loads give 1-3% speed gain, but it uses more registers
84 LOAD_2 x3, t, iter, 0
85 LOAD_2 x9, t, iter, 2
86 MOVZXLO x6, x3
87 shr x3, 8
88 CRC_XOR nn, x6, t * 4 + 3
89 MOVZXLO x6, x9
90 shr x9, 8
91 CRC_XOR nn, x3, t * 4 + 2
92 CRC_XOR nn, x6, t * 4 + 1
93 CRC_XOR nn, x9, t * 4 + 0
94elseif 0
95 LOAD_2 x3, t, iter, 0
96 MOVZXLO x6, x3
97 shr x3, 8
98 CRC_XOR nn, x6, t * 4 + 3
99 CRC_XOR nn, x3, t * 4 + 2
100 LOAD_2 x3, t, iter, 2
101 MOVZXLO x6, x3
102 shr x3, 8
103 CRC_XOR nn, x6, t * 4 + 1
104 CRC_XOR nn, x3, t * 4 + 0
105elseif 0
106 LOAD_1 x3, t, iter, 0
107 LOAD_1 x6, t, iter, 1
108 CRC_XOR nn, x3, t * 4 + 3
109 CRC_XOR nn, x6, t * 4 + 2
110 LOAD_1 x3, t, iter, 2
111 LOAD_1 x6, t, iter, 3
112 CRC_XOR nn, x3, t * 4 + 1
113 CRC_XOR nn, x6, t * 4 + 0
114else
115 ; 32-bit load is better if there is only one read port (core2)
116 ; but that code can be slower if there are 2 read ports (snb)
117 mov x3, dword ptr [SRCDAT_1 (4 * (NUM_WORDS - 1 - t + iter * NUM_WORDS) + 0)]
118 MOVZXLO x6, x3
119 CRC_XOR nn, x6, t * 4 + 3
120 MOVZXHI x6, x3
121 shr x3, 16
122 CRC_XOR nn, x6, t * 4 + 2
123 MOVZXLO x6, x3
124 shr x3, 8
125 CRC_XOR nn, x6, t * 4 + 1
126 CRC_XOR nn, x3, t * 4 + 0
127endif
50endm 128endm
51 129
52MY_PROLOG macro crc_end:req
53 130
131LAST equ (4 * (NUM_WORDS - 1))
132
133CRC_ITER macro qq, nn, iter
134 mov nn, [SRCDAT_4 (NUM_WORDS * (1 + iter))]
135
136 i = 0
137 rept NUM_WORDS - 1
138 CRC_QUAD nn, i, iter
139 i = i + 1
140 endm
141
142 MOVZXLO x6, qq
143 mov x3, qq
144 shr x3, 24
145 CRC_XOR nn, x6, LAST + 3
146 CRC_XOR nn, x3, LAST + 0
147 ror qq, 16
148 MOVZXLO x6, qq
149 shr qq, 24
150 CRC_XOR nn, x6, LAST + 1
151if ((UNROLL_CNT and 1) eq 1) and (iter eq (UNROLL_CNT - 1))
152 CRC_MOV qq, qq, LAST + 2
153 xor qq, nn
154else
155 CRC_XOR nn, qq, LAST + 2
156endif
157endm
158
159
160; + 4 for prefetching next 4-bytes after current iteration
161NUM_BYTES_LIMIT equ (NUM_WORDS * 4 * UNROLL_CNT + 4)
162ALIGN_MASK equ 3
163
164
165; MY_PROC @CatStr(CrcUpdateT, 12), 4
166MY_PROC @CatStr(CrcUpdateT, %(NUM_WORDS * 4)), 4
167 MY_PUSH_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
54 ifdef x64 168 ifdef x64
169 mov x0, REG_ABI_PARAM_0_x ; x0 = x1(win) / x7(linux)
170 mov rT, REG_ABI_PARAM_3 ; r5 = r9(win) / r1(linux)
171 mov rN, REG_ABI_PARAM_2 ; r7 = r8(win) / r2(linux)
172 ; mov rD, REG_ABI_PARAM_1 ; r2 = r2(win)
55 if (IS_LINUX gt 0) 173 if (IS_LINUX gt 0)
56 MY_PUSH_2_REGS
57 mov x0, REG_ABI_PARAM_0_x ; x0 = x7
58 mov rT, REG_ABI_PARAM_3 ; r5 = r1
59 mov rN, REG_ABI_PARAM_2 ; r7 = r2
60 mov rD, REG_ABI_PARAM_1 ; r2 = r6 174 mov rD, REG_ABI_PARAM_1 ; r2 = r6
61 else
62 MY_PUSH_4_REGS
63 mov x0, REG_ABI_PARAM_0_x ; x0 = x1
64 mov rT, REG_ABI_PARAM_3 ; r5 = r9
65 mov rN, REG_ABI_PARAM_2 ; r7 = r8
66 ; mov rD, REG_ABI_PARAM_1 ; r2 = r2
67 endif 175 endif
68 else 176 else
69 MY_PUSH_4_REGS
70 if (IS_CDECL gt 0) 177 if (IS_CDECL gt 0)
71 mov x0, [r4 + crc_OFFS] 178 mov x0, [r4 + crc_OFFS]
72 mov rD, [r4 + data_OFFS] 179 mov rD, [r4 + data_OFFS]
73 else 180 else
74 mov x0, REG_ABI_PARAM_0_x 181 mov x0, REG_ABI_PARAM_0_x
75 endif 182 endif
76 mov rN, num_VAR 183 mov rN, [r4 + size_OFFS]
77 mov rT, table_VAR 184 mov rT, [r4 + table_OFFS]
78 endif 185 endif
79 186
80 test rN, rN 187 cmp rN, NUM_BYTES_LIMIT + ALIGN_MASK
81 jz crc_end 188 jb crc_end
82 @@: 189@@:
83 test rD, 7 190 test rD_x, ALIGN_MASK ; test rD, ALIGN_MASK
84 jz @F 191 jz @F
85 CRC1b 192 CRC1b
86 jnz @B 193 jmp @B
87 @@: 194@@:
88 cmp rN, 16 195 xor x0, dword ptr [rD]
89 jb crc_end 196 lea rN, [rD + rN * 1 - (NUM_BYTES_LIMIT - 1)]
90 add rN, rD 197 sub rD, rN
91 mov num_VAR, rN
92 sub rN, 8
93 and rN, NOT 7
94 sub rD, rN
95 xor x0, [SRCDAT 0]
96endm
97 198
98MY_EPILOG macro crc_end:req 199align 16
99 xor x0, [SRCDAT 0] 200@@:
100 mov rD, rN 201unr_index = 0
101 mov rN, num_VAR 202while unr_index lt UNROLL_CNT
102 sub rN, rD 203 if (unr_index and 1) eq 0
103 crc_end: 204 CRC_ITER x0, x1, unr_index
104 test rN, rN 205 else
105 jz @F 206 CRC_ITER x1, x0, unr_index
106 CRC1b 207 endif
107 jmp crc_end 208 unr_index = unr_index + 1
108 @@:
109 if (IS_X64 gt 0) and (IS_LINUX gt 0)
110 MY_POP_2_REGS
111 else
112 MY_POP_4_REGS
113 endif
114endm 209endm
115 210
116MY_PROC CrcUpdateT8, 4 211 add rD, NUM_WORDS * 4 * UNROLL_CNT
117 MY_PROLOG crc_end_8 212 jnc @B
118 mov x1, [SRCDAT 1] 213
119 align 16 214if 0
120 main_loop_8: 215 ; byte verson
121 mov x6, [SRCDAT 2] 216 add rD, rN
122 movzx x3, x1_L 217 xor x0, dword ptr [rD]
123 CRC_XOR x6, r3, 3 218 add rN, NUM_BYTES_LIMIT - 1
124 movzx x3, x1_H 219else
125 CRC_XOR x6, r3, 2 220 ; 4-byte version
126 shr x1, 16 221 add rN, 4 * NUM_WORDS * UNROLL_CNT
127 movzx x3, x1_L 222 sub rD, 4 * NUM_WORDS * UNROLL_CNT
128 movzx x1, x1_H 223@@:
129 CRC_XOR x6, r3, 1 224 MOVZXLO x3, x0
130 movzx x3, x0_L 225 MOVZXHI x1, x0
131 CRC_XOR x6, r1, 0 226 shr x0, 16
132 227 MOVZXLO x6, x0
133 mov x1, [SRCDAT 3] 228 shr x0, 8
134 CRC_XOR x6, r3, 7 229 CRC_MOV x0, x0, 0
135 movzx x3, x0_H 230 CRC_XOR x0, x3, 3
136 shr x0, 16 231 CRC_XOR x0, x1, 2
137 CRC_XOR x6, r3, 6 232 CRC_XOR x0, x6, 1
138 movzx x3, x0_L 233
139 CRC_XOR x6, r3, 5 234 add rD, 4
140 movzx x3, x0_H 235if (NUM_WORDS * UNROLL_CNT) ne 1
141 CRC_MOV x0, r3, 4 236 jc @F
142 xor x0, x6 237 xor x0, [SRCDAT_4 0]
143 add rD, 8 238 jmp @B
144 jnz main_loop_8 239@@:
145 240endif
146 MY_EPILOG crc_end_8 241 add rD, rN
147MY_ENDP 242 add rN, 4 - 1
243
244endif
245
246 sub rN, rD
247crc_end:
248 test rN, rN
249 jz func_end
250@@:
251 CRC1b
252 jnz @B
148 253
149MY_PROC CrcUpdateT4, 4 254func_end:
150 MY_PROLOG crc_end_4 255 MY_POP_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
151 align 16
152 main_loop_4:
153 movzx x1, x0_L
154 movzx x3, x0_H
155 shr x0, 16
156 movzx x6, x0_H
157 and x0, 0FFh
158 CRC_MOV x1, r1, 3
159 xor x1, [SRCDAT 1]
160 CRC_XOR x1, r3, 2
161 CRC_XOR x1, r6, 0
162 CRC_XOR x1, r0, 1
163
164 movzx x0, x1_L
165 movzx x3, x1_H
166 shr x1, 16
167 movzx x6, x1_H
168 and x1, 0FFh
169 CRC_MOV x0, r0, 3
170 xor x0, [SRCDAT 2]
171 CRC_XOR x0, r3, 2
172 CRC_XOR x0, r6, 0
173 CRC_XOR x0, r1, 1
174 add rD, 8
175 jnz main_loop_4
176
177 MY_EPILOG crc_end_4
178MY_ENDP 256MY_ENDP
179 257
180end 258end