author    Igor Pavlov <87184205+ip7z@users.noreply.github.com>  2021-12-27 00:00:00 +0000
committer Igor Pavlov <87184205+ip7z@users.noreply.github.com>  2022-03-18 15:35:13 +0500
commit    f19f813537c7aea1c20749c914e756b54a9c3cf5 (patch)
tree      816ba62ca7c0fa19f2eb46d9e9d6f7dd7c3a744d /Asm/x86
parent    98e06a519b63b81986abe76d28887f6984a7732b (diff)
tag: 21.07
Diffstat (limited to 'Asm/x86')
-rw-r--r--  Asm/x86/7zAsm.asm       284
-rw-r--r--  Asm/x86/7zCrcOpt.asm    180
-rw-r--r--  Asm/x86/AesOpt.asm      742
-rw-r--r--  Asm/x86/LzFindOpt.asm   513
-rw-r--r--  Asm/x86/LzmaDecOpt.asm 1303
-rw-r--r--  Asm/x86/Sha1Opt.asm     263
-rw-r--r--  Asm/x86/Sha256Opt.asm   263
-rw-r--r--  Asm/x86/XzCrc64Opt.asm  239
8 files changed, 3787 insertions, 0 deletions
diff --git a/Asm/x86/7zAsm.asm b/Asm/x86/7zAsm.asm
new file mode 100644
index 0000000..6275bb7
--- /dev/null
+++ b/Asm/x86/7zAsm.asm
@@ -0,0 +1,284 @@
1; 7zAsm.asm -- ASM macros
2; 2021-12-25 : Igor Pavlov : Public domain
3
4
5ifdef @wordsize
6; @wordsize is defined only in JWASM and ASMC and is not defined in MASM
7; @wordsize eq 8 for 64-bit x64
8; @wordsize eq 2 for 32-bit x86
9if @wordsize eq 8
10 x64 equ 1
11endif
12else
13ifdef RAX
14 x64 equ 1
15endif
16endif
17
18
19ifdef x64
20 IS_X64 equ 1
21else
22 IS_X64 equ 0
23endif
24
25ifdef ABI_LINUX
26 IS_LINUX equ 1
27else
28 IS_LINUX equ 0
29endif
30
31ifndef x64
32; Use ABI_CDECL for x86 (32-bit) only
33; if ABI_CDECL is not defined, we use fastcall abi
34ifdef ABI_CDECL
35 IS_CDECL equ 1
36else
37 IS_CDECL equ 0
38endif
39endif
40
41OPTION PROLOGUE:NONE
42OPTION EPILOGUE:NONE
43
44MY_ASM_START macro
45 ifdef x64
46 .code
47 else
48 .386
49 .model flat
50 _TEXT$00 SEGMENT PARA PUBLIC 'CODE'
51 endif
52endm
53
54MY_PROC macro name:req, numParams:req
55 align 16
56 proc_numParams = numParams
57 if (IS_X64 gt 0)
58 proc_name equ name
59 elseif (IS_LINUX gt 0)
60 proc_name equ name
61 elseif (IS_CDECL gt 0)
62 proc_name equ @CatStr(_,name)
63 else
64 proc_name equ @CatStr(@,name,@, %numParams * 4)
65 endif
66 proc_name PROC
67endm
68
69MY_ENDP macro
70 if (IS_X64 gt 0)
71 ret
72 elseif (IS_CDECL gt 0)
73 ret
74 elseif (proc_numParams LT 3)
75 ret
76 else
77 ret (proc_numParams - 2) * 4
78 endif
79 proc_name ENDP
80endm
81
82
83ifdef x64
84 REG_SIZE equ 8
85 REG_LOGAR_SIZE equ 3
86else
87 REG_SIZE equ 4
88 REG_LOGAR_SIZE equ 2
89endif
90
91 x0 equ EAX
92 x1 equ ECX
93 x2 equ EDX
94 x3 equ EBX
95 x4 equ ESP
96 x5 equ EBP
97 x6 equ ESI
98 x7 equ EDI
99
100 x0_W equ AX
101 x1_W equ CX
102 x2_W equ DX
103 x3_W equ BX
104
105 x5_W equ BP
106 x6_W equ SI
107 x7_W equ DI
108
109 x0_L equ AL
110 x1_L equ CL
111 x2_L equ DL
112 x3_L equ BL
113
114 x0_H equ AH
115 x1_H equ CH
116 x2_H equ DH
117 x3_H equ BH
118
119ifdef x64
120 x5_L equ BPL
121 x6_L equ SIL
122 x7_L equ DIL
123
124 r0 equ RAX
125 r1 equ RCX
126 r2 equ RDX
127 r3 equ RBX
128 r4 equ RSP
129 r5 equ RBP
130 r6 equ RSI
131 r7 equ RDI
132 x8 equ r8d
133 x9 equ r9d
134 x10 equ r10d
135 x11 equ r11d
136 x12 equ r12d
137 x13 equ r13d
138 x14 equ r14d
139 x15 equ r15d
140else
141 r0 equ x0
142 r1 equ x1
143 r2 equ x2
144 r3 equ x3
145 r4 equ x4
146 r5 equ x5
147 r6 equ x6
148 r7 equ x7
149endif
150
151
152ifdef x64
153ifdef ABI_LINUX
154
155MY_PUSH_2_REGS macro
156 push r3
157 push r5
158endm
159
160MY_POP_2_REGS macro
161 pop r5
162 pop r3
163endm
164
165endif
166endif
167
168
169MY_PUSH_4_REGS macro
170 push r3
171 push r5
172 push r6
173 push r7
174endm
175
176MY_POP_4_REGS macro
177 pop r7
178 pop r6
179 pop r5
180 pop r3
181endm
182
183
184; for fastcall and for WIN-x64
185REG_PARAM_0_x equ x1
186REG_PARAM_0 equ r1
187REG_PARAM_1_x equ x2
188REG_PARAM_1 equ r2
189
190ifndef x64
191; for x86-fastcall
192
193REG_ABI_PARAM_0_x equ REG_PARAM_0_x
194REG_ABI_PARAM_0 equ REG_PARAM_0
195REG_ABI_PARAM_1_x equ REG_PARAM_1_x
196REG_ABI_PARAM_1 equ REG_PARAM_1
197
198else
199; x64
200
201if (IS_LINUX eq 0)
202
203; for WIN-x64:
204REG_PARAM_2_x equ x8
205REG_PARAM_2 equ r8
206REG_PARAM_3 equ r9
207
208REG_ABI_PARAM_0_x equ REG_PARAM_0_x
209REG_ABI_PARAM_0 equ REG_PARAM_0
210REG_ABI_PARAM_1_x equ REG_PARAM_1_x
211REG_ABI_PARAM_1 equ REG_PARAM_1
212REG_ABI_PARAM_2_x equ REG_PARAM_2_x
213REG_ABI_PARAM_2 equ REG_PARAM_2
214REG_ABI_PARAM_3 equ REG_PARAM_3
215
216else
217; for LINUX-x64:
218REG_LINUX_PARAM_0_x equ x7
219REG_LINUX_PARAM_0 equ r7
220REG_LINUX_PARAM_1_x equ x6
221REG_LINUX_PARAM_1 equ r6
222REG_LINUX_PARAM_2 equ r2
223REG_LINUX_PARAM_3 equ r1
224REG_LINUX_PARAM_4_x equ x8
225REG_LINUX_PARAM_4 equ r8
226REG_LINUX_PARAM_5 equ r9
227
228REG_ABI_PARAM_0_x equ REG_LINUX_PARAM_0_x
229REG_ABI_PARAM_0 equ REG_LINUX_PARAM_0
230REG_ABI_PARAM_1_x equ REG_LINUX_PARAM_1_x
231REG_ABI_PARAM_1 equ REG_LINUX_PARAM_1
232REG_ABI_PARAM_2 equ REG_LINUX_PARAM_2
233REG_ABI_PARAM_3 equ REG_LINUX_PARAM_3
234REG_ABI_PARAM_4_x equ REG_LINUX_PARAM_4_x
235REG_ABI_PARAM_4 equ REG_LINUX_PARAM_4
236REG_ABI_PARAM_5 equ REG_LINUX_PARAM_5
237
238MY_ABI_LINUX_TO_WIN_2 macro
239 mov r2, r6
240 mov r1, r7
241endm
242
243MY_ABI_LINUX_TO_WIN_3 macro
244 mov r8, r2
245 mov r2, r6
246 mov r1, r7
247endm
248
249MY_ABI_LINUX_TO_WIN_4 macro
250 mov r9, r1
251 mov r8, r2
252 mov r2, r6
253 mov r1, r7
254endm
255
256endif ; IS_LINUX
257
258
259MY_PUSH_PRESERVED_ABI_REGS macro
260 if (IS_LINUX gt 0)
261 MY_PUSH_2_REGS
262 else
263 MY_PUSH_4_REGS
264 endif
265 push r12
266 push r13
267 push r14
268 push r15
269endm
270
271
272MY_POP_PRESERVED_ABI_REGS macro
273 pop r15
274 pop r14
275 pop r13
276 pop r12
277 if (IS_LINUX gt 0)
278 MY_POP_2_REGS
279 else
280 MY_POP_4_REGS
281 endif
282endm
283
284endif ; x64
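
Note: MY_PROC / MY_ENDP above hide the symbol-decoration and stack-cleanup
differences between the supported ABIs. An illustrative sketch of what that
means on the C side, using CrcUpdateT8 (declared in 7zCrcOpt.asm below with
"MY_PROC CrcUpdateT8, 4"); the exact prototype lives in the 7-Zip C sources,
so treat the types here as assumptions:

    /* x64 and Linux builds: undecorated symbol "CrcUpdateT8".
       x86 + ABI_CDECL:      symbol "_CrcUpdateT8".
       x86 fastcall default: symbol "@CrcUpdateT8@16" (4 params * 4 bytes);
       the first two parameters arrive in ECX/EDX, and MY_ENDP's
       "ret (numParams - 2) * 4" pops the remaining stack parameters. */
    #include <stddef.h>
    #ifdef _M_IX86
    unsigned __fastcall CrcUpdateT8(unsigned v, const void *data,
                                    size_t size, const unsigned *table);
    #else
    unsigned CrcUpdateT8(unsigned v, const void *data,
                         size_t size, const unsigned *table);
    #endif
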
diff --git a/Asm/x86/7zCrcOpt.asm b/Asm/x86/7zCrcOpt.asm
new file mode 100644
index 0000000..0fee206
--- /dev/null
+++ b/Asm/x86/7zCrcOpt.asm
@@ -0,0 +1,180 @@
1; 7zCrcOpt.asm -- CRC32 calculation : optimized version
2; 2021-02-07 : Igor Pavlov : Public domain
3
4include 7zAsm.asm
5
6MY_ASM_START
7
8rD equ r2
9rN equ r7
10rT equ r5
11
12ifdef x64
13 num_VAR equ r8
14 table_VAR equ r9
15else
16 if (IS_CDECL gt 0)
17 crc_OFFS equ (REG_SIZE * 5)
18 data_OFFS equ (REG_SIZE + crc_OFFS)
19 size_OFFS equ (REG_SIZE + data_OFFS)
20 else
21 size_OFFS equ (REG_SIZE * 5)
22 endif
23 table_OFFS equ (REG_SIZE + size_OFFS)
24 num_VAR equ [r4 + size_OFFS]
25 table_VAR equ [r4 + table_OFFS]
26endif
27
28SRCDAT equ rD + rN * 1 + 4 *
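; (the trailing "4 *" above is completed by the macro argument at each
;  use site: [SRCDAT n] expands to [rD + rN * 1 + 4 * n])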
29
30CRC macro op:req, dest:req, src:req, t:req
31 op dest, DWORD PTR [rT + src * 4 + 0400h * t]
32endm
33
34CRC_XOR macro dest:req, src:req, t:req
35 CRC xor, dest, src, t
36endm
37
38CRC_MOV macro dest:req, src:req, t:req
39 CRC mov, dest, src, t
40endm
41
42CRC1b macro
43 movzx x6, BYTE PTR [rD]
44 inc rD
45 movzx x3, x0_L
46 xor x6, x3
47 shr x0, 8
48 CRC xor, x0, r6, 0
49 dec rN
50endm
51
52MY_PROLOG macro crc_end:req
53
54 ifdef x64
55 if (IS_LINUX gt 0)
56 MY_PUSH_2_REGS
57 mov x0, REG_ABI_PARAM_0_x ; x0 = x7
58 mov rT, REG_ABI_PARAM_3 ; r5 = r1
59 mov rN, REG_ABI_PARAM_2 ; r7 = r2
60 mov rD, REG_ABI_PARAM_1 ; r2 = r6
61 else
62 MY_PUSH_4_REGS
63 mov x0, REG_ABI_PARAM_0_x ; x0 = x1
64 mov rT, REG_ABI_PARAM_3 ; r5 = r9
65 mov rN, REG_ABI_PARAM_2 ; r7 = r8
66 ; mov rD, REG_ABI_PARAM_1 ; r2 = r2
67 endif
68 else
69 MY_PUSH_4_REGS
70 if (IS_CDECL gt 0)
71 mov x0, [r4 + crc_OFFS]
72 mov rD, [r4 + data_OFFS]
73 else
74 mov x0, REG_ABI_PARAM_0_x
75 endif
76 mov rN, num_VAR
77 mov rT, table_VAR
78 endif
79
80 test rN, rN
81 jz crc_end
82 @@:
83 test rD, 7
84 jz @F
85 CRC1b
86 jnz @B
87 @@:
88 cmp rN, 16
89 jb crc_end
90 add rN, rD
91 mov num_VAR, rN
92 sub rN, 8
93 and rN, NOT 7
94 sub rD, rN
95 xor x0, [SRCDAT 0]
96endm
97
98MY_EPILOG macro crc_end:req
99 xor x0, [SRCDAT 0]
100 mov rD, rN
101 mov rN, num_VAR
102 sub rN, rD
103 crc_end:
104 test rN, rN
105 jz @F
106 CRC1b
107 jmp crc_end
108 @@:
109 if (IS_X64 gt 0) and (IS_LINUX gt 0)
110 MY_POP_2_REGS
111 else
112 MY_POP_4_REGS
113 endif
114endm
115
116MY_PROC CrcUpdateT8, 4
117 MY_PROLOG crc_end_8
118 mov x1, [SRCDAT 1]
119 align 16
120 main_loop_8:
121 mov x6, [SRCDAT 2]
122 movzx x3, x1_L
123 CRC_XOR x6, r3, 3
124 movzx x3, x1_H
125 CRC_XOR x6, r3, 2
126 shr x1, 16
127 movzx x3, x1_L
128 movzx x1, x1_H
129 CRC_XOR x6, r3, 1
130 movzx x3, x0_L
131 CRC_XOR x6, r1, 0
132
133 mov x1, [SRCDAT 3]
134 CRC_XOR x6, r3, 7
135 movzx x3, x0_H
136 shr x0, 16
137 CRC_XOR x6, r3, 6
138 movzx x3, x0_L
139 CRC_XOR x6, r3, 5
140 movzx x3, x0_H
141 CRC_MOV x0, r3, 4
142 xor x0, x6
143 add rD, 8
144 jnz main_loop_8
145
146 MY_EPILOG crc_end_8
147MY_ENDP
148
149MY_PROC CrcUpdateT4, 4
150 MY_PROLOG crc_end_4
151 align 16
152 main_loop_4:
153 movzx x1, x0_L
154 movzx x3, x0_H
155 shr x0, 16
156 movzx x6, x0_H
157 and x0, 0FFh
158 CRC_MOV x1, r1, 3
159 xor x1, [SRCDAT 1]
160 CRC_XOR x1, r3, 2
161 CRC_XOR x1, r6, 0
162 CRC_XOR x1, r0, 1
163
164 movzx x0, x1_L
165 movzx x3, x1_H
166 shr x1, 16
167 movzx x6, x1_H
168 and x1, 0FFh
169 CRC_MOV x0, r0, 3
170 xor x0, [SRCDAT 2]
171 CRC_XOR x0, r3, 2
172 CRC_XOR x0, r6, 0
173 CRC_XOR x0, r1, 1
174 add rD, 8
175 jnz main_loop_4
176
177 MY_EPILOG crc_end_4
178MY_ENDP
179
180end
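
Note: the "0400h * t" stride in the CRC macro selects one of eight 256-entry
DWORD tables, so CrcUpdateT8 above is the classic slicing-by-8 scheme. A
minimal C sketch of the same idea (illustrative only: table generation, the
pointer-alignment prolog, and the negative-index loop trick of the asm are
omitted):

    #include <stddef.h>
    #include <stdint.h>

    uint32_t crc_update_t8(uint32_t crc, const uint8_t *p, size_t n,
                           const uint32_t table[8][256])
    {
      for (; n >= 8; n -= 8, p += 8)
      {
        /* fold the running CRC into the first 32-bit word */
        uint32_t a = crc ^ ((uint32_t)p[0] | ((uint32_t)p[1] << 8)
                          | ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24));
        uint32_t b = (uint32_t)p[4] | ((uint32_t)p[5] << 8)
                   | ((uint32_t)p[6] << 16) | ((uint32_t)p[7] << 24);
        crc = table[7][a & 0xFF] ^ table[6][(a >> 8) & 0xFF]
            ^ table[5][(a >> 16) & 0xFF] ^ table[4][a >> 24]
            ^ table[3][b & 0xFF] ^ table[2][(b >> 8) & 0xFF]
            ^ table[1][(b >> 16) & 0xFF] ^ table[0][b >> 24];
      }
      for (; n != 0; n--, p++)           /* byte tail, as in CRC1b */
        crc = table[0][(crc ^ *p) & 0xFF] ^ (crc >> 8);
      return crc;
    }
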
diff --git a/Asm/x86/AesOpt.asm b/Asm/x86/AesOpt.asm
new file mode 100644
index 0000000..84bf897
--- /dev/null
+++ b/Asm/x86/AesOpt.asm
@@ -0,0 +1,742 @@
1; AesOpt.asm -- AES optimized code for x86 AES hardware instructions
2; 2021-12-25 : Igor Pavlov : Public domain
3
4include 7zAsm.asm
5
6ifdef __ASMC__
7 use_vaes_256 equ 1
8else
9ifdef ymm0
10 use_vaes_256 equ 1
11endif
12endif
13
14
15ifdef use_vaes_256
16 ECHO "++ VAES 256"
17else
18 ECHO "-- NO VAES 256"
19endif
20
21ifdef x64
22 ECHO "x86-64"
23else
24 ECHO "x86"
25if (IS_CDECL gt 0)
26 ECHO "ABI : CDECL"
27else
28 ECHO "ABI : no CDECL : FASTCALL"
29endif
30endif
31
32if (IS_LINUX gt 0)
33 ECHO "ABI : LINUX"
34else
35 ECHO "ABI : WINDOWS"
36endif
37
38MY_ASM_START
39
40ifndef x64
41 .686
42 .xmm
43endif
44
45
46; MY_ALIGN EQU ALIGN(64)
47MY_ALIGN EQU ; (defined as empty: no extra alignment)
48
49SEG_ALIGN EQU MY_ALIGN
50
51MY_SEG_PROC macro name:req, numParams:req
52 ; seg_name equ @CatStr(_TEXT$, name)
53 ; seg_name SEGMENT SEG_ALIGN 'CODE'
54 MY_PROC name, numParams
55endm
56
57MY_SEG_ENDP macro
58 ; seg_name ENDS
59endm
60
61
62NUM_AES_KEYS_MAX equ 15
63
64; the number of push operators in function PROLOG
65if (IS_LINUX eq 0) or (IS_X64 eq 0)
66num_regs_push equ 2
67stack_param_offset equ (REG_SIZE * (1 + num_regs_push))
68endif
69
70ifdef x64
71 num_param equ REG_ABI_PARAM_2
72else
73 if (IS_CDECL gt 0)
74 ; size_t size
75 ; void * data
76 ; UInt32 * aes
77 ; ret-ip <- (r4)
78 aes_OFFS equ (stack_param_offset)
79 data_OFFS equ (REG_SIZE + aes_OFFS)
80 size_OFFS equ (REG_SIZE + data_OFFS)
81 num_param equ [r4 + size_OFFS]
82 else
83 num_param equ [r4 + stack_param_offset]
84 endif
85endif
86
87keys equ REG_PARAM_0 ; r1
88rD equ REG_PARAM_1 ; r2
89rN equ r0
90
91koffs_x equ x7
92koffs_r equ r7
93
94ksize_x equ x6
95ksize_r equ r6
96
97keys2 equ r3
98
99state equ xmm0
100key equ xmm0
101key_ymm equ ymm0
102key_ymm_n equ 0
103
104ifdef x64
105 ways = 11
106else
107 ways = 4
108endif
109
110ways_start_reg equ 1
111
112iv equ @CatStr(xmm, %(ways_start_reg + ways))
113iv_ymm equ @CatStr(ymm, %(ways_start_reg + ways))
114
115
116WOP macro op, op2
117 i = 0
118 rept ways
119 op @CatStr(xmm, %(ways_start_reg + i)), op2
120 i = i + 1
121 endm
122endm
123
124
125ifndef ABI_LINUX
126ifdef x64
127
128; we use 32 bytes of home space in stack in WIN64-x64
129NUM_HOME_MM_REGS equ (32 / 16)
130; we preserve xmm registers starting from xmm6 in WIN64-x64
131MM_START_SAVE_REG equ 6
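; note: this save/restore exists because the WIN64 ABI makes xmm6-xmm15
; callee-saved, while the SysV (Linux) x86-64 ABI treats all xmm registers
; as volatile - hence the "ifndef ABI_LINUX" guard above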
132
133SAVE_XMM macro num_used_mm_regs:req
134 num_save_mm_regs = num_used_mm_regs - MM_START_SAVE_REG
135 if num_save_mm_regs GT 0
136 num_save_mm_regs2 = num_save_mm_regs - NUM_HOME_MM_REGS
137 ; RSP is (16*x + 8) after entering the function in WIN64-x64
138 stack_offset = 16 * num_save_mm_regs2 + (stack_param_offset mod 16)
139
140 i = 0
141 rept num_save_mm_regs
142
143 if i eq NUM_HOME_MM_REGS
144 sub r4, stack_offset
145 endif
146
147 if i lt NUM_HOME_MM_REGS
148 movdqa [r4 + stack_param_offset + i * 16], @CatStr(xmm, %(MM_START_SAVE_REG + i))
149 else
150 movdqa [r4 + (i - NUM_HOME_MM_REGS) * 16], @CatStr(xmm, %(MM_START_SAVE_REG + i))
151 endif
152
153 i = i + 1
154 endm
155 endif
156endm
157
158RESTORE_XMM macro num_used_mm_regs:req
159 if num_save_mm_regs GT 0
160 i = 0
161 if num_save_mm_regs2 GT 0
162 rept num_save_mm_regs2
163 movdqa @CatStr(xmm, %(MM_START_SAVE_REG + NUM_HOME_MM_REGS + i)), [r4 + i * 16]
164 i = i + 1
165 endm
166 add r4, stack_offset
167 endif
168
169 num_low_regs = num_save_mm_regs - i
170 i = 0
171 rept num_low_regs
172 movdqa @CatStr(xmm, %(MM_START_SAVE_REG + i)), [r4 + stack_param_offset + i * 16]
173 i = i + 1
174 endm
175 endif
176endm
177
178endif ; x64
179endif ; ABI_LINUX
180
181
182MY_PROLOG macro num_used_mm_regs:req
183 ; num_regs_push: must be equal to the number of push operators
184 ; push r3
185 ; push r5
186 if (IS_LINUX eq 0) or (IS_X64 eq 0)
187 push r6
188 push r7
189 endif
190
191 mov rN, num_param ; don't move it; num_param can use stack pointer (r4)
192
193 if (IS_X64 eq 0)
194 if (IS_CDECL gt 0)
195 mov rD, [r4 + data_OFFS]
196 mov keys, [r4 + aes_OFFS]
197 endif
198 elseif (IS_LINUX gt 0)
199 MY_ABI_LINUX_TO_WIN_2
200 endif
201
202
203 ifndef ABI_LINUX
204 ifdef x64
205 SAVE_XMM num_used_mm_regs
206 endif
207 endif
208
209 mov ksize_x, [keys + 16]
210 shl ksize_x, 5
211endm
212
213
214MY_EPILOG macro
215 ifndef ABI_LINUX
216 ifdef x64
217 RESTORE_XMM num_save_mm_regs
218 endif
219 endif
220
221 if (IS_LINUX eq 0) or (IS_X64 eq 0)
222 pop r7
223 pop r6
224 endif
225 ; pop r5
226 ; pop r3
227 MY_ENDP
228endm
229
230
231OP_KEY macro op:req, offs:req
232 op state, [keys + offs]
233endm
234
235
236WOP_KEY macro op:req, offs:req
237 movdqa key, [keys + offs]
238 WOP op, key
239endm
240
241
242; ---------- AES-CBC Decode ----------
243
244
245XOR_WITH_DATA macro reg, _ppp_
246 pxor reg, [rD + i * 16]
247endm
248
249WRITE_TO_DATA macro reg, _ppp_
250 movdqa [rD + i * 16], reg
251endm
252
253
254; state0 equ @CatStr(xmm, %(ways_start_reg))
255
256key0 equ @CatStr(xmm, %(ways_start_reg + ways + 1))
257key0_ymm equ @CatStr(ymm, %(ways_start_reg + ways + 1))
258
259key_last equ @CatStr(xmm, %(ways_start_reg + ways + 2))
260key_last_ymm equ @CatStr(ymm, %(ways_start_reg + ways + 2))
261key_last_ymm_n equ (ways_start_reg + ways + 2)
262
263NUM_CBC_REGS equ (ways_start_reg + ways + 3)
264
265
266MY_SEG_PROC AesCbc_Decode_HW, 3
267
268 AesCbc_Decode_HW_start::
269 MY_PROLOG NUM_CBC_REGS
270
271 AesCbc_Decode_HW_start_2::
272 movdqa iv, [keys]
273 add keys, 32
274
275 movdqa key0, [keys + 1 * ksize_r]
276 movdqa key_last, [keys]
277 sub ksize_x, 16
278
279 jmp check2
280 align 16
281 nextBlocks2:
282 WOP movdqa, [rD + i * 16]
283 mov koffs_x, ksize_x
284 ; WOP_KEY pxor, ksize_r + 16
285 WOP pxor, key0
286 ; align 16
287 @@:
288 WOP_KEY aesdec, 1 * koffs_r
289 sub koffs_r, 16
290 jnz @B
291 ; WOP_KEY aesdeclast, 0
292 WOP aesdeclast, key_last
293
294 pxor @CatStr(xmm, %(ways_start_reg)), iv
295 i = 1
296 rept ways - 1
297 pxor @CatStr(xmm, %(ways_start_reg + i)), [rD + i * 16 - 16]
298 i = i + 1
299 endm
300 movdqa iv, [rD + ways * 16 - 16]
301 WOP WRITE_TO_DATA
302
303 add rD, ways * 16
304 AesCbc_Decode_HW_start_3::
305 check2:
306 sub rN, ways
307 jnc nextBlocks2
308 add rN, ways
309
310 sub ksize_x, 16
311
312 jmp check
313 nextBlock:
314 movdqa state, [rD]
315 mov koffs_x, ksize_x
316 ; OP_KEY pxor, 1 * ksize_r + 32
317 pxor state, key0
318 ; movdqa state0, [rD]
319 ; movdqa state, key0
320 ; pxor state, state0
321 @@:
322 OP_KEY aesdec, 1 * koffs_r + 16
323 OP_KEY aesdec, 1 * koffs_r
324 sub koffs_r, 32
325 jnz @B
326 OP_KEY aesdec, 16
327 ; OP_KEY aesdeclast, 0
328 aesdeclast state, key_last
329
330 pxor state, iv
331 movdqa iv, [rD]
332 ; movdqa iv, state0
333 movdqa [rD], state
334
335 add rD, 16
336 check:
337 sub rN, 1
338 jnc nextBlock
339
340 movdqa [keys - 32], iv
341MY_EPILOG
342
343
344
345
346; ---------- AVX ----------
347
348
349AVX__WOP_n macro op
350 i = 0
351 rept ways
352 op (ways_start_reg + i)
353 i = i + 1
354 endm
355endm
356
357AVX__WOP macro op
358 i = 0
359 rept ways
360 op @CatStr(ymm, %(ways_start_reg + i))
361 i = i + 1
362 endm
363endm
364
365
366AVX__WOP_KEY macro op:req, offs:req
367 vmovdqa key_ymm, ymmword ptr [keys2 + offs]
368 AVX__WOP_n op
369endm
370
371
372AVX__CBC_START macro reg
373 ; vpxor reg, key_ymm, ymmword ptr [rD + 32 * i]
374 vpxor reg, key0_ymm, ymmword ptr [rD + 32 * i]
375endm
376
377AVX__CBC_END macro reg
378 if i eq 0
379 vpxor reg, reg, iv_ymm
380 else
381 vpxor reg, reg, ymmword ptr [rD + i * 32 - 16]
382 endif
383endm
384
385
386AVX__WRITE_TO_DATA macro reg
387 vmovdqu ymmword ptr [rD + 32 * i], reg
388endm
389
390AVX__XOR_WITH_DATA macro reg
391 vpxor reg, reg, ymmword ptr [rD + 32 * i]
392endm
393
394AVX__CTR_START macro reg
395 vpaddq iv_ymm, iv_ymm, one_ymm
396 ; vpxor reg, iv_ymm, key_ymm
397 vpxor reg, iv_ymm, key0_ymm
398endm
399
400
401MY_VAES_INSTR_2 macro cmd, dest, a1, a2
402 db 0c4H
403 db 2 + 040H + 020h * (1 - (a2) / 8) + 080h * (1 - (dest) / 8)
404 db 5 + 8 * ((not (a1)) and 15)
405 db cmd
406 db 0c0H + 8 * ((dest) and 7) + ((a2) and 7)
407endm
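; note: the bytes above hand-assemble the 3-byte VEX form of the 256-bit
; VAES instructions, for assemblers that don't know the VAES mnemonics:
; 0C4h selects the 3-byte VEX prefix; the second byte holds map 0F38h plus
; the inverted R/B register-extension bits taken from (dest) and (a2); the
; third byte encodes W=0, vvvv = NOT (a1) (the first source), L=1 (256-bit)
; and pp = 66h; then come the opcode (0DCh..0DFh, see below) and a
; register-register ModRM byte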
408
409MY_VAES_INSTR macro cmd, dest, a
410 MY_VAES_INSTR_2 cmd, dest, dest, a
411endm
412
413MY_vaesenc macro dest, a
414 MY_VAES_INSTR 0dcH, dest, a
415endm
416MY_vaesenclast macro dest, a
417 MY_VAES_INSTR 0ddH, dest, a
418endm
419MY_vaesdec macro dest, a
420 MY_VAES_INSTR 0deH, dest, a
421endm
422MY_vaesdeclast macro dest, a
423 MY_VAES_INSTR 0dfH, dest, a
424endm
425
426
427AVX__VAES_DEC macro reg
428 MY_vaesdec reg, key_ymm_n
429endm
430
431AVX__VAES_DEC_LAST_key_last macro reg
432 ; MY_vaesdeclast reg, key_ymm_n
433 MY_vaesdeclast reg, key_last_ymm_n
434endm
435
436AVX__VAES_ENC macro reg
437 MY_vaesenc reg, key_ymm_n
438endm
439
440AVX__VAES_ENC_LAST macro reg
441 MY_vaesenclast reg, key_ymm_n
442endm
443
444AVX__vinserti128_TO_HIGH macro dest, src
445 vinserti128 dest, dest, src, 1
446endm
447
448
449MY_PROC AesCbc_Decode_HW_256, 3
450 ifdef use_vaes_256
451 MY_PROLOG NUM_CBC_REGS
452
453 cmp rN, ways * 2
454 jb AesCbc_Decode_HW_start_2
455
456 vmovdqa iv, xmmword ptr [keys]
457 add keys, 32
458
459 vbroadcasti128 key0_ymm, xmmword ptr [keys + 1 * ksize_r]
460 vbroadcasti128 key_last_ymm, xmmword ptr [keys]
461 sub ksize_x, 16
462 mov koffs_x, ksize_x
463 add ksize_x, ksize_x
464
465 AVX_STACK_SUB = ((NUM_AES_KEYS_MAX + 1 - 2) * 32)
466 push keys2
467 sub r4, AVX_STACK_SUB
468 ; sub r4, 32
469 ; sub r4, ksize_r
470 ; lea keys2, [r4 + 32]
471 mov keys2, r4
472 and keys2, -32
473 broad:
474 vbroadcasti128 key_ymm, xmmword ptr [keys + 1 * koffs_r]
475 vmovdqa ymmword ptr [keys2 + koffs_r * 2], key_ymm
476 sub koffs_r, 16
477 ; jnc broad
478 jnz broad
479
480 sub rN, ways * 2
481
482 align 16
483 avx_cbcdec_nextBlock2:
484 mov koffs_x, ksize_x
485 ; AVX__WOP_KEY AVX__CBC_START, 1 * koffs_r + 32
486 AVX__WOP AVX__CBC_START
487 @@:
488 AVX__WOP_KEY AVX__VAES_DEC, 1 * koffs_r
489 sub koffs_r, 32
490 jnz @B
491 ; AVX__WOP_KEY AVX__VAES_DEC_LAST, 0
492 AVX__WOP_n AVX__VAES_DEC_LAST_key_last
493
494 AVX__vinserti128_TO_HIGH iv_ymm, xmmword ptr [rD]
495 AVX__WOP AVX__CBC_END
496
497 vmovdqa iv, xmmword ptr [rD + ways * 32 - 16]
498 AVX__WOP AVX__WRITE_TO_DATA
499
500 add rD, ways * 32
501 sub rN, ways * 2
502 jnc avx_cbcdec_nextBlock2
503 add rN, ways * 2
504
505 shr ksize_x, 1
506
507 ; lea r4, [r4 + 1 * ksize_r + 32]
508 add r4, AVX_STACK_SUB
509 pop keys2
510
511 vzeroupper
512 jmp AesCbc_Decode_HW_start_3
513 else
514 jmp AesCbc_Decode_HW_start
515 endif
516MY_ENDP
517MY_SEG_ENDP
518
519
520
521
522; ---------- AES-CBC Encode ----------
523
524e0 equ xmm1
525
526CENC_START_KEY equ 2
527CENC_NUM_REG_KEYS equ (3 * 2)
528; last_key equ @CatStr(xmm, %(CENC_START_KEY + CENC_NUM_REG_KEYS))
529
530MY_SEG_PROC AesCbc_Encode_HW, 3
531 MY_PROLOG (CENC_START_KEY + CENC_NUM_REG_KEYS + 0)
532
533 movdqa state, [keys]
534 add keys, 32
535
536 i = 0
537 rept CENC_NUM_REG_KEYS
538 movdqa @CatStr(xmm, %(CENC_START_KEY + i)), [keys + i * 16]
539 i = i + 1
540 endm
541
542 add keys, ksize_r
543 neg ksize_r
544 add ksize_r, (16 * CENC_NUM_REG_KEYS)
545 ; movdqa last_key, [keys]
546 jmp check_e
547
548 align 16
549 nextBlock_e:
550 movdqa e0, [rD]
551 mov koffs_r, ksize_r
552 pxor e0, @CatStr(xmm, %(CENC_START_KEY))
553 pxor state, e0
554
555 i = 1
556 rept (CENC_NUM_REG_KEYS - 1)
557 aesenc state, @CatStr(xmm, %(CENC_START_KEY + i))
558 i = i + 1
559 endm
560
561 @@:
562 OP_KEY aesenc, 1 * koffs_r
563 OP_KEY aesenc, 1 * koffs_r + 16
564 add koffs_r, 32
565 jnz @B
566 OP_KEY aesenclast, 0
567 ; aesenclast state, last_key
568
569 movdqa [rD], state
570 add rD, 16
571 check_e:
572 sub rN, 1
573 jnc nextBlock_e
574
575 ; movdqa [keys - 32], state
576 movdqa [keys + 1 * ksize_r - (16 * CENC_NUM_REG_KEYS) - 32], state
577MY_EPILOG
578MY_SEG_ENDP
579
580
581
582; ---------- AES-CTR ----------
583
584ifdef x64
585 ; ways = 11
586endif
587
588
589one equ @CatStr(xmm, %(ways_start_reg + ways + 1))
590one_ymm equ @CatStr(ymm, %(ways_start_reg + ways + 1))
591key0 equ @CatStr(xmm, %(ways_start_reg + ways + 2))
592key0_ymm equ @CatStr(ymm, %(ways_start_reg + ways + 2))
593NUM_CTR_REGS equ (ways_start_reg + ways + 3)
594
595INIT_CTR macro reg, _ppp_
596 paddq iv, one
597 movdqa reg, iv
598endm
599
600
601MY_SEG_PROC AesCtr_Code_HW, 3
602 Ctr_start::
603 MY_PROLOG NUM_CTR_REGS
604
605 Ctr_start_2::
606 movdqa iv, [keys]
607 add keys, 32
608 movdqa key0, [keys]
609
610 add keys, ksize_r
611 neg ksize_r
612 add ksize_r, 16
613
614 Ctr_start_3::
615 mov koffs_x, 1
616 movd one, koffs_x
617 jmp check2_c
618
619 align 16
620 nextBlocks2_c:
621 WOP INIT_CTR, 0
622 mov koffs_r, ksize_r
623 ; WOP_KEY pxor, 1 * koffs_r -16
624 WOP pxor, key0
625 @@:
626 WOP_KEY aesenc, 1 * koffs_r
627 add koffs_r, 16
628 jnz @B
629 WOP_KEY aesenclast, 0
630
631 WOP XOR_WITH_DATA
632 WOP WRITE_TO_DATA
633 add rD, ways * 16
634 check2_c:
635 sub rN, ways
636 jnc nextBlocks2_c
637 add rN, ways
638
639 sub keys, 16
640 add ksize_r, 16
641
642 jmp check_c
643
644 ; align 16
645 nextBlock_c:
646 paddq iv, one
647 ; movdqa state, [keys + 1 * koffs_r - 16]
648 movdqa state, key0
649 mov koffs_r, ksize_r
650 pxor state, iv
651
652 @@:
653 OP_KEY aesenc, 1 * koffs_r
654 OP_KEY aesenc, 1 * koffs_r + 16
655 add koffs_r, 32
656 jnz @B
657 OP_KEY aesenc, 0
658 OP_KEY aesenclast, 16
659
660 pxor state, [rD]
661 movdqa [rD], state
662 add rD, 16
663 check_c:
664 sub rN, 1
665 jnc nextBlock_c
666
667 ; movdqa [keys - 32], iv
668 movdqa [keys + 1 * ksize_r - 16 - 32], iv
669MY_EPILOG
670
671
672MY_PROC AesCtr_Code_HW_256, 3
673 ifdef use_vaes_256
674 MY_PROLOG NUM_CTR_REGS
675
676 cmp rN, ways * 2
677 jb Ctr_start_2
678
679 vbroadcasti128 iv_ymm, xmmword ptr [keys]
680 add keys, 32
681 vbroadcasti128 key0_ymm, xmmword ptr [keys]
682 mov koffs_x, 1
683 vmovd one, koffs_x
684 vpsubq iv_ymm, iv_ymm, one_ymm
685 vpaddq one, one, one
686 AVX__vinserti128_TO_HIGH one_ymm, one
687
688 add keys, ksize_r
689 sub ksize_x, 16
690 neg ksize_r
691 mov koffs_r, ksize_r
692 add ksize_r, ksize_r
693
694 AVX_STACK_SUB = ((NUM_AES_KEYS_MAX + 1 - 1) * 32)
695 push keys2
696 lea keys2, [r4 - 32]
697 sub r4, AVX_STACK_SUB
698 and keys2, -32
699 vbroadcasti128 key_ymm, xmmword ptr [keys]
700 vmovdqa ymmword ptr [keys2], key_ymm
701 @@:
702 vbroadcasti128 key_ymm, xmmword ptr [keys + 1 * koffs_r]
703 vmovdqa ymmword ptr [keys2 + koffs_r * 2], key_ymm
704 add koffs_r, 16
705 jnz @B
706
707 sub rN, ways * 2
708
709 align 16
710 avx_ctr_nextBlock2:
711 mov koffs_r, ksize_r
712 AVX__WOP AVX__CTR_START
713 ; AVX__WOP_KEY AVX__CTR_START, 1 * koffs_r - 32
714 @@:
715 AVX__WOP_KEY AVX__VAES_ENC, 1 * koffs_r
716 add koffs_r, 32
717 jnz @B
718 AVX__WOP_KEY AVX__VAES_ENC_LAST, 0
719
720 AVX__WOP AVX__XOR_WITH_DATA
721 AVX__WOP AVX__WRITE_TO_DATA
722
723 add rD, ways * 32
724 sub rN, ways * 2
725 jnc avx_ctr_nextBlock2
726 add rN, ways * 2
727
728 vextracti128 iv, iv_ymm, 1
729 sar ksize_r, 1
730
731 add r4, AVX_STACK_SUB
732 pop keys2
733
734 vzeroupper
735 jmp Ctr_start_3
736 else
737 jmp Ctr_start
738 endif
739MY_ENDP
740MY_SEG_ENDP
741
742end
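
Note: for reference, the CBC-decode chaining that AesCbc_Decode_HW implements,
shown one block at a time with AES-NI intrinsics. This is a simplified sketch:
it assumes w[0..numRounds] already holds the expanded decryption round keys,
the multi-block "ways" unrolling and the VAES-256 path of the asm are omitted,
and the names here are not from the 7-Zip sources.

    #include <stddef.h>
    #include <wmmintrin.h>   /* _mm_aesdec_si128 / _mm_aesdeclast_si128 */

    void cbc_decrypt(__m128i *data, size_t numBlocks,
                     const __m128i *w, unsigned numRounds, __m128i *ivp)
    {
      __m128i iv = *ivp;
      for (size_t i = 0; i < numBlocks; i++)
      {
        __m128i c = data[i];                /* ciphertext is the next iv */
        __m128i s = _mm_xor_si128(c, w[0]);
        for (unsigned r = 1; r < numRounds; r++)
          s = _mm_aesdec_si128(s, w[r]);    /* one AESDEC per middle round */
        s = _mm_aesdeclast_si128(s, w[numRounds]);
        data[i] = _mm_xor_si128(s, iv);     /* CBC: xor previous ciphertext */
        iv = c;
      }
      *ivp = iv;                            /* persist iv, as the asm does */
    }
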
diff --git a/Asm/x86/LzFindOpt.asm b/Asm/x86/LzFindOpt.asm
new file mode 100644
index 0000000..42e10bd
--- /dev/null
+++ b/Asm/x86/LzFindOpt.asm
@@ -0,0 +1,513 @@
1; LzFindOpt.asm -- ASM version of GetMatchesSpecN_2() function
2; 2021-07-21: Igor Pavlov : Public domain
3;
4
5ifndef x64
6; x64=1
7; .err <x64_IS_REQUIRED>
8endif
9
10include 7zAsm.asm
11
12MY_ASM_START
13
14_TEXT$LZFINDOPT SEGMENT ALIGN(64) 'CODE'
15
16MY_ALIGN macro num:req
17 align num
18endm
19
20MY_ALIGN_32 macro
21 MY_ALIGN 32
22endm
23
24MY_ALIGN_64 macro
25 MY_ALIGN 64
26endm
27
28
29t0_L equ x0_L
30t0_x equ x0
31t0 equ r0
32t1_x equ x3
33t1 equ r3
34
35cp_x equ t1_x
36cp_r equ t1
37m equ x5
38m_r equ r5
39len_x equ x6
40len equ r6
41diff_x equ x7
42diff equ r7
43len0 equ r10
44len1_x equ x11
45len1 equ r11
46maxLen_x equ x12
47maxLen equ r12
48d equ r13
49ptr0 equ r14
50ptr1 equ r15
51
52d_lim equ m_r
53cycSize equ len_x
54hash_lim equ len0
55delta1_x equ len1_x
56delta1_r equ len1
57delta_x equ maxLen_x
58delta_r equ maxLen
59hash equ ptr0
60src equ ptr1
61
62
63
64if (IS_LINUX gt 0)
65
66; r1 r2 r8 r9 : win64
67; r7 r6 r2 r1 r8 r9 : linux
68
69lenLimit equ r8
70lenLimit_x equ x8
71; pos_r equ r2
72pos equ x2
73cur equ r1
74son equ r9
75
76else
77
78lenLimit equ REG_ABI_PARAM_2
79lenLimit_x equ REG_ABI_PARAM_2_x
80pos equ REG_ABI_PARAM_1_x
81cur equ REG_ABI_PARAM_0
82son equ REG_ABI_PARAM_3
83
84endif
85
86
87if (IS_LINUX gt 0)
88 maxLen_OFFS equ (REG_SIZE * (6 + 1))
89else
90 cutValue_OFFS equ (REG_SIZE * (8 + 1 + 4))
91 d_OFFS equ (REG_SIZE + cutValue_OFFS)
92 maxLen_OFFS equ (REG_SIZE + d_OFFS)
93endif
94 hash_OFFS equ (REG_SIZE + maxLen_OFFS)
95 limit_OFFS equ (REG_SIZE + hash_OFFS)
96 size_OFFS equ (REG_SIZE + limit_OFFS)
97 cycPos_OFFS equ (REG_SIZE + size_OFFS)
98 cycSize_OFFS equ (REG_SIZE + cycPos_OFFS)
99 posRes_OFFS equ (REG_SIZE + cycSize_OFFS)
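; note: these offsets are applied to r0, which holds RSP right after
; MY_PUSH_PRESERVED_ABI_REGS (see below): WIN64 = 8 pushed registers
; + return address + 4 register-parameter home slots, so cutValue is the
; first stack argument at REG_SIZE * (8 + 1 + 4); LINUX pushes only 6
; registers and passes the first 6 arguments in registers, so maxLen is
; the first stack argument there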
100
101if (IS_LINUX gt 0)
102else
103 cutValue_PAR equ [r0 + cutValue_OFFS]
104 d_PAR equ [r0 + d_OFFS]
105endif
106 maxLen_PAR equ [r0 + maxLen_OFFS]
107 hash_PAR equ [r0 + hash_OFFS]
108 limit_PAR equ [r0 + limit_OFFS]
109 size_PAR equ [r0 + size_OFFS]
110 cycPos_PAR equ [r0 + cycPos_OFFS]
111 cycSize_PAR equ [r0 + cycSize_OFFS]
112 posRes_PAR equ [r0 + posRes_OFFS]
113
114
115 cutValue_VAR equ DWORD PTR [r4 + 8 * 0]
116 cutValueCur_VAR equ DWORD PTR [r4 + 8 * 0 + 4]
117 cycPos_VAR equ DWORD PTR [r4 + 8 * 1 + 0]
118 cycSize_VAR equ DWORD PTR [r4 + 8 * 1 + 4]
119 hash_VAR equ QWORD PTR [r4 + 8 * 2]
120 limit_VAR equ QWORD PTR [r4 + 8 * 3]
121 size_VAR equ QWORD PTR [r4 + 8 * 4]
122 distances equ QWORD PTR [r4 + 8 * 5]
123 maxLen_VAR equ QWORD PTR [r4 + 8 * 6]
124
125 Old_RSP equ QWORD PTR [r4 + 8 * 7]
126 LOCAL_SIZE equ 8 * 8
127
128COPY_VAR_32 macro dest_var, src_var
129 mov x3, src_var
130 mov dest_var, x3
131endm
132
133COPY_VAR_64 macro dest_var, src_var
134 mov r3, src_var
135 mov dest_var, r3
136endm
137
138
139; MY_ALIGN_64
140MY_PROC GetMatchesSpecN_2, 13
141MY_PUSH_PRESERVED_ABI_REGS
142 mov r0, RSP
143 lea r3, [r0 - LOCAL_SIZE]
144 and r3, -64
145 mov RSP, r3
146 mov Old_RSP, r0
147
148if (IS_LINUX gt 0)
149 mov d, REG_ABI_PARAM_5 ; r13 = r9
150 mov cutValue_VAR, REG_ABI_PARAM_4_x ; = r8
151 mov son, REG_ABI_PARAM_3 ; r9 = r1
152 mov r8, REG_ABI_PARAM_2 ; r8 = r2
153 mov pos, REG_ABI_PARAM_1_x ; r2 = x6
154 mov r1, REG_ABI_PARAM_0 ; r1 = r7
155else
156 COPY_VAR_32 cutValue_VAR, cutValue_PAR
157 mov d, d_PAR
158endif
159
160 COPY_VAR_64 limit_VAR, limit_PAR
161
162 mov hash_lim, size_PAR
163 mov size_VAR, hash_lim
164
165 mov cp_x, cycPos_PAR
166 mov hash, hash_PAR
167
168 mov cycSize, cycSize_PAR
169 mov cycSize_VAR, cycSize
170
171 ; we want cur in (rcx). So we change the cur and lenLimit variables
172 sub lenLimit, cur
173 neg lenLimit_x
174 inc lenLimit_x
175
176 mov t0_x, maxLen_PAR
177 sub t0, lenLimit
178 mov maxLen_VAR, t0
179
180 jmp main_loop
181
182MY_ALIGN_64
183fill_empty:
184 ; ptr0 = *ptr1 = kEmptyHashValue;
185 mov QWORD PTR [ptr1], 0
186 inc pos
187 inc cp_x
188 mov DWORD PTR [d - 4], 0
189 cmp d, limit_VAR
190 jae fin
191 cmp hash, hash_lim
192 je fin
193
194; MY_ALIGN_64
195main_loop:
196 ; UInt32 delta = *hash++;
197 mov diff_x, [hash] ; delta
198 add hash, 4
199 ; mov cycPos_VAR, cp_x
200
201 inc cur
202 add d, 4
203 mov m, pos
204 sub m, diff_x; ; matchPos
205
206 ; CLzRef *ptr1 = son + ((size_t)(pos) << 1) - CYC_TO_POS_OFFSET * 2;
207 lea ptr1, [son + 8 * cp_r]
208 ; mov cycSize, cycSize_VAR
209 cmp pos, cycSize
210 jb directMode ; if (pos < cycSize_VAR)
211
212 ; CYC MODE
213
214 cmp diff_x, cycSize
215 jae fill_empty ; if (delta >= cycSize_VAR)
216
217 xor t0_x, t0_x
218 mov cycPos_VAR, cp_x
219 sub cp_x, diff_x
220 ; jae prepare_for_tree_loop
221 ; add cp_x, cycSize
222 cmovb t0_x, cycSize
223 add cp_x, t0_x ; cp_x += (cycPos < delta ? cycSize : 0)
224 jmp prepare_for_tree_loop
225
226
227directMode:
228 cmp diff_x, pos
229 je fill_empty ; if (delta == pos)
230 jae fin_error ; if (delta >= pos)
231
232 mov cycPos_VAR, cp_x
233 mov cp_x, m
234
235prepare_for_tree_loop:
236 mov len0, lenLimit
237 mov hash_VAR, hash
238 ; CLzRef *ptr0 = son + ((size_t)(pos) << 1) - CYC_TO_POS_OFFSET * 2 + 1;
239 lea ptr0, [ptr1 + 4]
240 ; UInt32 *_distances = ++d;
241 mov distances, d
242
243 neg len0
244 mov len1, len0
245
246 mov t0_x, cutValue_VAR
247 mov maxLen, maxLen_VAR
248 mov cutValueCur_VAR, t0_x
249
250MY_ALIGN_32
251tree_loop:
252 neg diff
253 mov len, len0
254 cmp len1, len0
255 cmovb len, len1 ; len = (len1 < len0 ? len1 : len0);
256 add diff, cur
257
258 mov t0_x, [son + cp_r * 8] ; prefetch
259 movzx t0_x, BYTE PTR [diff + 1 * len]
260 lea cp_r, [son + cp_r * 8]
261 cmp [cur + 1 * len], t0_L
262 je matched_1
263
264 jb left_0
265
266 mov [ptr1], m
267 mov m, [cp_r + 4]
268 lea ptr1, [cp_r + 4]
269 sub diff, cur ; FIX32
270 jmp next_node
271
272MY_ALIGN_32
273left_0:
274 mov [ptr0], m
275 mov m, [cp_r]
276 mov ptr0, cp_r
277 sub diff, cur ; FIX32
278 ; jmp next_node
279
280; ------------ NEXT NODE ------------
281; MY_ALIGN_32
282next_node:
283 mov cycSize, cycSize_VAR
284 dec cutValueCur_VAR
285 je finish_tree
286
287 add diff_x, pos ; prev_match = pos + diff
288 cmp m, diff_x
289 jae fin_error ; if (new_match >= prev_match)
290
291 mov diff_x, pos
292 sub diff_x, m ; delta = pos - new_match
293 cmp pos, cycSize
294 jae cyc_mode_2 ; if (pos >= cycSize)
295
296 mov cp_x, m
297 test m, m
298 jne tree_loop ; if (m != 0)
299
300finish_tree:
301 ; ptr0 = *ptr1 = kEmptyHashValue;
302 mov DWORD PTR [ptr0], 0
303 mov DWORD PTR [ptr1], 0
304
305 inc pos
306
307 ; _distances[-1] = (UInt32)(d - _distances);
308 mov t0, distances
309 mov t1, d
310 sub t1, t0
311 shr t1_x, 2
312 mov [t0 - 4], t1_x
313
314 cmp d, limit_VAR
315 jae fin ; if (d >= limit)
316
317 mov cp_x, cycPos_VAR
318 mov hash, hash_VAR
319 mov hash_lim, size_VAR
320 inc cp_x
321 cmp hash, hash_lim
322 jne main_loop ; if (hash != size)
323 jmp fin
324
325
326MY_ALIGN_32
327cyc_mode_2:
328 cmp diff_x, cycSize
329 jae finish_tree ; if (delta >= cycSize)
330
331 mov cp_x, cycPos_VAR
332 xor t0_x, t0_x
333 sub cp_x, diff_x ; cp_x = cycPos - delta
334 cmovb t0_x, cycSize
335 add cp_x, t0_x ; cp_x += (cycPos < delta ? cycSize : 0)
336 jmp tree_loop
337
338
339MY_ALIGN_32
340matched_1:
341
342 inc len
343 ; cmp len_x, lenLimit_x
344 je short lenLimit_reach
345 movzx t0_x, BYTE PTR [diff + 1 * len]
346 cmp [cur + 1 * len], t0_L
347 jne mismatch
348
349
350MY_ALIGN_32
351match_loop:
352 ; while (++len != lenLimit && cur[len] == diff[len]) ;
353
354 inc len
355 ; cmp len_x, lenLimit_x
356 je short lenLimit_reach
357 movzx t0_x, BYTE PTR [diff + 1 * len]
358 cmp BYTE PTR [cur + 1 * len], t0_L
359 je match_loop
360
361mismatch:
362 jb left_2
363
364 mov [ptr1], m
365 mov m, [cp_r + 4]
366 lea ptr1, [cp_r + 4]
367 mov len1, len
368
369 jmp max_update
370
371MY_ALIGN_32
372left_2:
373 mov [ptr0], m
374 mov m, [cp_r]
375 mov ptr0, cp_r
376 mov len0, len
377
378max_update:
379 sub diff, cur ; restore diff
380
381 cmp maxLen, len
382 jae next_node
383
384 mov maxLen, len
385 add len, lenLimit
386 mov [d], len_x
387 mov t0_x, diff_x
388 not t0_x
389 mov [d + 4], t0_x
390 add d, 8
391
392 jmp next_node
393
394
395
396MY_ALIGN_32
397lenLimit_reach:
398
399 mov delta_r, cur
400 sub delta_r, diff
401 lea delta1_r, [delta_r - 1]
402
403 mov t0_x, [cp_r]
404 mov [ptr1], t0_x
405 mov t0_x, [cp_r + 4]
406 mov [ptr0], t0_x
407
408 mov [d], lenLimit_x
409 mov [d + 4], delta1_x
410 add d, 8
411
412 ; _distances[-1] = (UInt32)(d - _distances);
413 mov t0, distances
414 mov t1, d
415 sub t1, t0
416 shr t1_x, 2
417 mov [t0 - 4], t1_x
418
419 mov hash, hash_VAR
420 mov hash_lim, size_VAR
421
422 inc pos
423 mov cp_x, cycPos_VAR
424 inc cp_x
425
426 mov d_lim, limit_VAR
427 mov cycSize, cycSize_VAR
428 ; if (hash == size || *hash != delta || lenLimit[diff] != lenLimit[0] || d >= limit)
429 ; break;
430 cmp hash, hash_lim
431 je fin
432 cmp d, d_lim
433 jae fin
434 cmp delta_x, [hash]
435 jne main_loop
436 movzx t0_x, BYTE PTR [diff]
437 cmp [cur], t0_L
438 jne main_loop
439
440 ; jmp main_loop ; bypass for debug
441
442 mov cycPos_VAR, cp_x
443 shl len, 3 ; cycSize * 8
444 sub diff, cur ; restore diff
445 xor t0_x, t0_x
446 cmp cp_x, delta_x ; cmp (cycPos_VAR, delta)
447 lea cp_r, [son + 8 * cp_r] ; dest
448 lea src, [cp_r + 8 * diff]
449 cmovb t0, len ; t0 = (cycPos_VAR < delta ? cycSize * 8 : 0)
450 add src, t0
451 add len, son ; len = son + cycSize * 8
452
453
454MY_ALIGN_32
455long_loop:
456 add hash, 4
457
458 ; *(UInt64 *)(void *)ptr = ((const UInt64 *)(const void *)ptr)[diff];
459
460 mov t0, [src]
461 add src, 8
462 mov [cp_r], t0
463 add cp_r, 8
464 cmp src, len
465 cmove src, son ; if end of (son) buffer is reached, we wrap to begin
466
467 mov DWORD PTR [d], 2
468 mov [d + 4], lenLimit_x
469 mov [d + 8], delta1_x
470 add d, 12
471
472 inc cur
473
474 cmp hash, hash_lim
475 je long_footer
476 cmp delta_x, [hash]
477 jne long_footer
478 movzx t0_x, BYTE PTR [diff + 1 * cur]
479 cmp [cur], t0_L
480 jne long_footer
481 cmp d, d_lim
482 jb long_loop
483
484long_footer:
485 sub cp_r, son
486 shr cp_r, 3
487 add pos, cp_x
488 sub pos, cycPos_VAR
489 mov cycSize, cycSize_VAR
490
491 cmp d, d_lim
492 jae fin
493 cmp hash, hash_lim
494 jne main_loop
495 jmp fin
496
497
498
499fin_error:
500 xor d, d
501
502fin:
503 mov RSP, Old_RSP
504 mov t0, [r4 + posRes_OFFS]
505 mov [t0], pos
506 mov r0, d
507
508MY_POP_PRESERVED_ABI_REGS
509MY_ENDP
510
511_TEXT$LZFINDOPT ENDS
512
513end
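
Note: tree_loop above is the inner step of the binary-tree match finder:
son[] stores a pair of child links per cyclic position, and ptr0/ptr1
remember the tree slots into which the current position gets spliced. A
greatly simplified C sketch of one pruning step (hypothetical names;
match-length bookkeeping, cutValue, and the CYC MODE wrap are omitted):

    #include <stddef.h>
    #include <stdint.h>

    typedef uint32_t CLzRef;

    /* cur = current data, pb = match candidate's data, len = agreed prefix */
    static uint32_t BtStep(CLzRef *son, size_t cyclicPos, uint32_t curMatch,
                           const uint8_t *cur, const uint8_t *pb, unsigned len,
                           CLzRef **ptr0, CLzRef **ptr1)
    {
      CLzRef *pair = son + ((size_t)cyclicPos << 1);
      if (pb[len] < cur[len])
      {
        **ptr1 = curMatch;      /* splice current position into this slot */
        *ptr1 = pair + 1;
        return pair[1];         /* descend into that subtree */
      }
      **ptr0 = curMatch;
      *ptr0 = pair;
      return pair[0];
    }
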
diff --git a/Asm/x86/LzmaDecOpt.asm b/Asm/x86/LzmaDecOpt.asm
new file mode 100644
index 0000000..f2818e7
--- /dev/null
+++ b/Asm/x86/LzmaDecOpt.asm
@@ -0,0 +1,1303 @@
1; LzmaDecOpt.asm -- ASM version of LzmaDec_DecodeReal_3() function
2; 2021-02-23: Igor Pavlov : Public domain
3;
4; 3 is the code compatibility version of the LzmaDec_DecodeReal_*()
5; function, used for checking at link time.
6; This code is tightly coupled with LzmaDec_TryDummy()
7; and with other functions in the LzmaDec.c file.
8; The CLzmaDec structure, the (probs) array layout, and the input/output of
9; LzmaDec_DecodeReal_*() must be identical in both versions (C / ASM).
10
11ifndef x64
12; x64=1
13; .err <x64_IS_REQUIRED>
14endif
15
16include 7zAsm.asm
17
18MY_ASM_START
19
20_TEXT$LZMADECOPT SEGMENT ALIGN(64) 'CODE'
21
22MY_ALIGN macro num:req
23 align num
24endm
25
26MY_ALIGN_16 macro
27 MY_ALIGN 16
28endm
29
30MY_ALIGN_32 macro
31 MY_ALIGN 32
32endm
33
34MY_ALIGN_64 macro
35 MY_ALIGN 64
36endm
37
38
39; _LZMA_SIZE_OPT equ 1
40
41; _LZMA_PROB32 equ 1
42
43ifdef _LZMA_PROB32
44 PSHIFT equ 2
45 PLOAD macro dest, mem
46 mov dest, dword ptr [mem]
47 endm
48 PSTORE macro src, mem
49 mov dword ptr [mem], src
50 endm
51else
52 PSHIFT equ 1
53 PLOAD macro dest, mem
54 movzx dest, word ptr [mem]
55 endm
56 PSTORE macro src, mem
57 mov word ptr [mem], @CatStr(src, _W)
58 endm
59endif
60
61PMULT equ (1 SHL PSHIFT)
62PMULT_HALF equ (1 SHL (PSHIFT - 1))
63PMULT_2 equ (1 SHL (PSHIFT + 1))
64
65kMatchSpecLen_Error_Data equ (1 SHL 9)
66
67; x0 range
68; x1 pbPos / (prob) TREE
69; x2 probBranch / prm (MATCHED) / pbPos / cnt
70; x3 sym
71;====== r4 === RSP
72; x5 cod
73; x6 t1 NORM_CALC / probs_state / dist
74; x7 t0 NORM_CALC / prob2 IF_BIT_1
75; x8 state
76; x9 match (MATCHED) / sym2 / dist2 / lpMask_reg
77; x10 kBitModelTotal_reg
78; r11 probs
79; x12 offs (MATCHED) / dic / len_temp
80; x13 processedPos
81; x14 bit (MATCHED) / dicPos
82; r15 buf
83
84
85cod equ x5
86cod_L equ x5_L
87range equ x0
88state equ x8
89state_R equ r8
90buf equ r15
91processedPos equ x13
92kBitModelTotal_reg equ x10
93
94probBranch equ x2
95probBranch_R equ r2
96probBranch_W equ x2_W
97
98pbPos equ x1
99pbPos_R equ r1
100
101cnt equ x2
102cnt_R equ r2
103
104lpMask_reg equ x9
105dicPos equ r14
106
107sym equ x3
108sym_R equ r3
109sym_L equ x3_L
110
111probs equ r11
112dic equ r12
113
114t0 equ x7
115t0_W equ x7_W
116t0_R equ r7
117
118prob2 equ t0
119prob2_W equ t0_W
120
121t1 equ x6
122t1_R equ r6
123
124probs_state equ t1
125probs_state_R equ t1_R
126
127prm equ r2
128match equ x9
129match_R equ r9
130offs equ x12
131offs_R equ r12
132bit equ x14
133bit_R equ r14
134
135sym2 equ x9
136sym2_R equ r9
137
138len_temp equ x12
139
140dist equ sym
141dist2 equ x9
142
143
144
145kNumBitModelTotalBits equ 11
146kBitModelTotal equ (1 SHL kNumBitModelTotalBits)
147kNumMoveBits equ 5
148kBitModelOffset equ ((1 SHL kNumMoveBits) - 1)
149kTopValue equ (1 SHL 24)
150
151NORM_2 macro
152 ; movzx t0, BYTE PTR [buf]
153 shl cod, 8
154 mov cod_L, BYTE PTR [buf]
155 shl range, 8
156 ; or cod, t0
157 inc buf
158endm
159
160
161NORM macro
162 cmp range, kTopValue
163 jae SHORT @F
164 NORM_2
165@@:
166endm
167
168
169; ---------- Branch MACROS ----------
170
171UPDATE_0 macro probsArray:req, probOffset:req, probDisp:req
172 mov prob2, kBitModelTotal_reg
173 sub prob2, probBranch
174 shr prob2, kNumMoveBits
175 add probBranch, prob2
176 PSTORE probBranch, probOffset * 1 + probsArray + probDisp * PMULT
177endm
178
179
180UPDATE_1 macro probsArray:req, probOffset:req, probDisp:req
181 sub prob2, range
182 sub cod, range
183 mov range, prob2
184 mov prob2, probBranch
185 shr probBranch, kNumMoveBits
186 sub prob2, probBranch
187 PSTORE prob2, probOffset * 1 + probsArray + probDisp * PMULT
188endm
189
190
191CMP_COD macro probsArray:req, probOffset:req, probDisp:req
192 PLOAD probBranch, probOffset * 1 + probsArray + probDisp * PMULT
193 NORM
194 mov prob2, range
195 shr range, kNumBitModelTotalBits
196 imul range, probBranch
197 cmp cod, range
198endm
199
200
201IF_BIT_1_NOUP macro probsArray:req, probOffset:req, probDisp:req, toLabel:req
202 CMP_COD probsArray, probOffset, probDisp
203 jae toLabel
204endm
205
206
207IF_BIT_1 macro probsArray:req, probOffset:req, probDisp:req, toLabel:req
208 IF_BIT_1_NOUP probsArray, probOffset, probDisp, toLabel
209 UPDATE_0 probsArray, probOffset, probDisp
210endm
211
212
213IF_BIT_0_NOUP macro probsArray:req, probOffset:req, probDisp:req, toLabel:req
214 CMP_COD probsArray, probOffset, probDisp
215 jb toLabel
216endm
217
218
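; The macros above form the standard LZMA binary range decoder with 11-bit
; probabilities (kNumBitModelTotalBits) and a 5-bit adaptation shift
; (kNumMoveBits). One bit decode written out in C (an illustrative sketch;
; the CMOV macros below compute the same result branch-free):
;
; if (range < kTopValue) { range <<= 8; cod = (cod << 8) | *buf++; }
; bound = (range >> 11) * prob;
; if (cod < bound) { range = bound; prob += (2048 - prob) >> 5; bit = 0; }
; else { range -= bound; cod -= bound; prob -= prob >> 5; bit = 1; }
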
219; ---------- CMOV MACROS ----------
220
221NORM_CALC macro prob:req
222 NORM
223 mov t0, range
224 shr range, kNumBitModelTotalBits
225 imul range, prob
226 sub t0, range
227 mov t1, cod
228 sub cod, range
229endm
230
231
232PUP macro prob:req, probPtr:req
233 sub t0, prob
234 ; only sar works for both 16/32 bit prob modes
235 sar t0, kNumMoveBits
236 add t0, prob
237 PSTORE t0, probPtr
238endm
239
240
241PUP_SUB macro prob:req, probPtr:req, symSub:req
242 sbb sym, symSub
243 PUP prob, probPtr
244endm
245
246
247PUP_COD macro prob:req, probPtr:req, symSub:req
248 mov t0, kBitModelOffset
249 cmovb cod, t1
250 mov t1, sym
251 cmovb t0, kBitModelTotal_reg
252 PUP_SUB prob, probPtr, symSub
253endm
254
255
256BIT_0 macro prob:req, probNext:req
257 PLOAD prob, probs + 1 * PMULT
258 PLOAD probNext, probs + 1 * PMULT_2
259
260 NORM_CALC prob
261
262 cmovae range, t0
263 PLOAD t0, probs + 1 * PMULT_2 + PMULT
264 cmovae probNext, t0
265 mov t0, kBitModelOffset
266 cmovb cod, t1
267 cmovb t0, kBitModelTotal_reg
268 mov sym, 2
269 PUP_SUB prob, probs + 1 * PMULT, 0 - 1
270endm
271
272
273BIT_1 macro prob:req, probNext:req
274 PLOAD probNext, probs + sym_R * PMULT_2
275 add sym, sym
276
277 NORM_CALC prob
278
279 cmovae range, t0
280 PLOAD t0, probs + sym_R * PMULT + PMULT
281 cmovae probNext, t0
282 PUP_COD prob, probs + t1_R * PMULT_HALF, 0 - 1
283endm
284
285
286BIT_2 macro prob:req, symSub:req
287 add sym, sym
288
289 NORM_CALC prob
290
291 cmovae range, t0
292 PUP_COD prob, probs + t1_R * PMULT_HALF, symSub
293endm
294
295
296; ---------- MATCHED LITERAL ----------
297
298LITM_0 macro
299 mov offs, 256 * PMULT
300 shl match, (PSHIFT + 1)
301 mov bit, offs
302 and bit, match
303 PLOAD x1, probs + 256 * PMULT + bit_R * 1 + 1 * PMULT
304 lea prm, [probs + 256 * PMULT + bit_R * 1 + 1 * PMULT]
305 ; lea prm, [probs + 256 * PMULT + 1 * PMULT]
306 ; add prm, bit_R
307 xor offs, bit
308 add match, match
309
310 NORM_CALC x1
311
312 cmovae offs, bit
313 mov bit, match
314 cmovae range, t0
315 mov t0, kBitModelOffset
316 cmovb cod, t1
317 cmovb t0, kBitModelTotal_reg
318 mov sym, 0
319 PUP_SUB x1, prm, -2-1
320endm
321
322
323LITM macro
324 and bit, offs
325 lea prm, [probs + offs_R * 1]
326 add prm, bit_R
327 PLOAD x1, prm + sym_R * PMULT
328 xor offs, bit
329 add sym, sym
330 add match, match
331
332 NORM_CALC x1
333
334 cmovae offs, bit
335 mov bit, match
336 cmovae range, t0
337 PUP_COD x1, prm + t1_R * PMULT_HALF, - 1
338endm
339
340
341LITM_2 macro
342 and bit, offs
343 lea prm, [probs + offs_R * 1]
344 add prm, bit_R
345 PLOAD x1, prm + sym_R * PMULT
346 add sym, sym
347
348 NORM_CALC x1
349
350 cmovae range, t0
351 PUP_COD x1, prm + t1_R * PMULT_HALF, 256 - 1
352endm
353
354
355; ---------- REVERSE BITS ----------
356
357REV_0 macro prob:req, probNext:req
358 ; PLOAD prob, probs + 1 * PMULT
359 ; lea sym2_R, [probs + 2 * PMULT]
360 ; PLOAD probNext, probs + 2 * PMULT
361 PLOAD probNext, sym2_R
362
363 NORM_CALC prob
364
365 cmovae range, t0
366 PLOAD t0, probs + 3 * PMULT
367 cmovae probNext, t0
368 cmovb cod, t1
369 mov t0, kBitModelOffset
370 cmovb t0, kBitModelTotal_reg
371 lea t1_R, [probs + 3 * PMULT]
372 cmovae sym2_R, t1_R
373 PUP prob, probs + 1 * PMULT
374endm
375
376
377REV_1 macro prob:req, probNext:req, step:req
378 add sym2_R, step * PMULT
379 PLOAD probNext, sym2_R
380
381 NORM_CALC prob
382
383 cmovae range, t0
384 PLOAD t0, sym2_R + step * PMULT
385 cmovae probNext, t0
386 cmovb cod, t1
387 mov t0, kBitModelOffset
388 cmovb t0, kBitModelTotal_reg
389 lea t1_R, [sym2_R + step * PMULT]
390 cmovae sym2_R, t1_R
391 PUP prob, t1_R - step * PMULT_2
392endm
393
394
395REV_2 macro prob:req, step:req
396 sub sym2_R, probs
397 shr sym2, PSHIFT
398 or sym, sym2
399
400 NORM_CALC prob
401
402 cmovae range, t0
403 lea t0, [sym - step]
404 cmovb sym, t0
405 cmovb cod, t1
406 mov t0, kBitModelOffset
407 cmovb t0, kBitModelTotal_reg
408 PUP prob, probs + sym2_R * PMULT
409endm
410
411
412REV_1_VAR macro prob:req
413 PLOAD prob, sym_R
414 mov probs, sym_R
415 add sym_R, sym2_R
416
417 NORM_CALC prob
418
419 cmovae range, t0
420 lea t0_R, [sym_R + 1 * sym2_R]
421 cmovae sym_R, t0_R
422 mov t0, kBitModelOffset
423 cmovb cod, t1
424 ; mov t1, kBitModelTotal
425 ; cmovb t0, t1
426 cmovb t0, kBitModelTotal_reg
427 add sym2, sym2
428 PUP prob, probs
429endm
430
431
432
433
434LIT_PROBS macro lpMaskParam:req
435 ; prob += (UInt32)3 * ((((processedPos << 8) + dic[(dicPos == 0 ? dicBufSize : dicPos) - 1]) & lpMask) << lc);
436 mov t0, processedPos
437 shl t0, 8
438 add sym, t0
439 and sym, lpMaskParam
440 add probs_state_R, pbPos_R
441 mov x1, LOC lc2
442 lea sym, dword ptr[sym_R + 2 * sym_R]
443 add probs, Literal * PMULT
444 shl sym, x1_L
445 add probs, sym_R
446 UPDATE_0 probs_state_R, 0, IsMatch
447 inc processedPos
448endm
449
450
451
452kNumPosBitsMax equ 4
453kNumPosStatesMax equ (1 SHL kNumPosBitsMax)
454
455kLenNumLowBits equ 3
456kLenNumLowSymbols equ (1 SHL kLenNumLowBits)
457kLenNumHighBits equ 8
458kLenNumHighSymbols equ (1 SHL kLenNumHighBits)
459kNumLenProbs equ (2 * kLenNumLowSymbols * kNumPosStatesMax + kLenNumHighSymbols)
460
461LenLow equ 0
462LenChoice equ LenLow
463LenChoice2 equ (LenLow + kLenNumLowSymbols)
464LenHigh equ (LenLow + 2 * kLenNumLowSymbols * kNumPosStatesMax)
465
466kNumStates equ 12
467kNumStates2 equ 16
468kNumLitStates equ 7
469
470kStartPosModelIndex equ 4
471kEndPosModelIndex equ 14
472kNumFullDistances equ (1 SHL (kEndPosModelIndex SHR 1))
473
474kNumPosSlotBits equ 6
475kNumLenToPosStates equ 4
476
477kNumAlignBits equ 4
478kAlignTableSize equ (1 SHL kNumAlignBits)
479
480kMatchMinLen equ 2
481kMatchSpecLenStart equ (kMatchMinLen + kLenNumLowSymbols * 2 + kLenNumHighSymbols)
482
483kStartOffset equ 1664
484SpecPos equ (-kStartOffset)
485IsRep0Long equ (SpecPos + kNumFullDistances)
486RepLenCoder equ (IsRep0Long + (kNumStates2 SHL kNumPosBitsMax))
487LenCoder equ (RepLenCoder + kNumLenProbs)
488IsMatch equ (LenCoder + kNumLenProbs)
489kAlign equ (IsMatch + (kNumStates2 SHL kNumPosBitsMax))
490IsRep equ (kAlign + kAlignTableSize)
491IsRepG0 equ (IsRep + kNumStates)
492IsRepG1 equ (IsRepG0 + kNumStates)
493IsRepG2 equ (IsRepG1 + kNumStates)
494PosSlot equ (IsRepG2 + kNumStates)
495Literal equ (PosSlot + (kNumLenToPosStates SHL kNumPosSlotBits))
496NUM_BASE_PROBS equ (Literal + kStartOffset)
497
498if kAlign ne 0
499 .err <Stop_Compiling_Bad_LZMA_kAlign>
500endif
501
502if NUM_BASE_PROBS ne 1984
503 .err <Stop_Compiling_Bad_LZMA_PROBS>
504endif
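
; note: the asm works from the pre-biased pointer probs_1664
; (= probs + kStartOffset entries), so the group bases above are signed
; displacements around it; the two .err checks pin this layout to the
; one used by the C decoder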
505
506
507PTR_FIELD equ dq ?
508
509CLzmaDec_Asm struct
510 lc db ?
511 lp db ?
512 pb db ?
513 _pad_ db ?
514 dicSize dd ?
515
516 probs_Spec PTR_FIELD
517 probs_1664 PTR_FIELD
518 dic_Spec PTR_FIELD
519 dicBufSize PTR_FIELD
520 dicPos_Spec PTR_FIELD
521 buf_Spec PTR_FIELD
522
523 range_Spec dd ?
524 code_Spec dd ?
525 processedPos_Spec dd ?
526 checkDicSize dd ?
527 rep0 dd ?
528 rep1 dd ?
529 rep2 dd ?
530 rep3 dd ?
531 state_Spec dd ?
532 remainLen dd ?
533CLzmaDec_Asm ends
534
535
536CLzmaDec_Asm_Loc struct
537 OLD_RSP PTR_FIELD
538 lzmaPtr PTR_FIELD
539 _pad0_ PTR_FIELD
540 _pad1_ PTR_FIELD
541 _pad2_ PTR_FIELD
542 dicBufSize PTR_FIELD
543 probs_Spec PTR_FIELD
544 dic_Spec PTR_FIELD
545
546 limit PTR_FIELD
547 bufLimit PTR_FIELD
548 lc2 dd ?
549 lpMask dd ?
550 pbMask dd ?
551 checkDicSize dd ?
552
553 _pad_ dd ?
554 remainLen dd ?
555 dicPos_Spec PTR_FIELD
556 rep0 dd ?
557 rep1 dd ?
558 rep2 dd ?
559 rep3 dd ?
560CLzmaDec_Asm_Loc ends
561
562
563GLOB_2 equ [sym_R].CLzmaDec_Asm.
564GLOB equ [r1].CLzmaDec_Asm.
565LOC_0 equ [r0].CLzmaDec_Asm_Loc.
566LOC equ [RSP].CLzmaDec_Asm_Loc.
567
568
569COPY_VAR macro name
570 mov t0, GLOB_2 name
571 mov LOC_0 name, t0
572endm
573
574
575RESTORE_VAR macro name
576 mov t0, LOC name
577 mov GLOB name, t0
578endm
579
580
581
582IsMatchBranch_Pre macro reg
583 ; prob = probs + IsMatch + (state << kNumPosBitsMax) + posState;
584 mov pbPos, LOC pbMask
585 and pbPos, processedPos
586 shl pbPos, (kLenNumLowBits + 1 + PSHIFT)
587 lea probs_state_R, [probs + 1 * state_R]
588endm
589
590
591IsMatchBranch macro reg
592 IsMatchBranch_Pre
593 IF_BIT_1 probs_state_R, pbPos_R, IsMatch, IsMatch_label
594endm
595
596
597CheckLimits macro reg
598 cmp buf, LOC bufLimit
599 jae fin_OK
600 cmp dicPos, LOC limit
601 jae fin_OK
602endm
603
604
605
606; RSP is (16x + 8) bytes aligned in WIN64-x64
607; LocalSize equ ((((SIZEOF CLzmaDec_Asm_Loc) + 7) / 16 * 16) + 8)
608
609PARAM_lzma equ REG_ABI_PARAM_0
610PARAM_limit equ REG_ABI_PARAM_1
611PARAM_bufLimit equ REG_ABI_PARAM_2
612
613; MY_ALIGN_64
614MY_PROC LzmaDec_DecodeReal_3, 3
615MY_PUSH_PRESERVED_ABI_REGS
616
617 lea r0, [RSP - (SIZEOF CLzmaDec_Asm_Loc)]
618 and r0, -128
619 mov r5, RSP
620 mov RSP, r0
621 mov LOC_0 Old_RSP, r5
622 mov LOC_0 lzmaPtr, PARAM_lzma
623
624 mov LOC_0 remainLen, 0 ; remainLen must be ZERO
625
626 mov LOC_0 bufLimit, PARAM_bufLimit
627 mov sym_R, PARAM_lzma ; CLzmaDec_Asm_Loc pointer for GLOB_2
628 mov dic, GLOB_2 dic_Spec
629 add PARAM_limit, dic
630 mov LOC_0 limit, PARAM_limit
631
632 COPY_VAR(rep0)
633 COPY_VAR(rep1)
634 COPY_VAR(rep2)
635 COPY_VAR(rep3)
636
637 mov dicPos, GLOB_2 dicPos_Spec
638 add dicPos, dic
639 mov LOC_0 dicPos_Spec, dicPos
640 mov LOC_0 dic_Spec, dic
641
642 mov x1_L, GLOB_2 pb
643 mov t0, 1
644 shl t0, x1_L
645 dec t0
646 mov LOC_0 pbMask, t0
647
648 ; unsigned pbMask = ((unsigned)1 << (p->prop.pb)) - 1;
649 ; unsigned lc = p->prop.lc;
650 ; unsigned lpMask = ((unsigned)0x100 << p->prop.lp) - ((unsigned)0x100 >> lc);
651
652 mov x1_L, GLOB_2 lc
653 mov x2, 100h
654 mov t0, x2
655 shr x2, x1_L
656 ; inc x1
657 add x1_L, PSHIFT
658 mov LOC_0 lc2, x1
659 mov x1_L, GLOB_2 lp
660 shl t0, x1_L
661 sub t0, x2
662 mov LOC_0 lpMask, t0
663 mov lpMask_reg, t0
664
665 ; mov probs, GLOB_2 probs_Spec
666 ; add probs, kStartOffset SHL PSHIFT
667 mov probs, GLOB_2 probs_1664
668 mov LOC_0 probs_Spec, probs
669
670 mov t0_R, GLOB_2 dicBufSize
671 mov LOC_0 dicBufSize, t0_R
672
673 mov x1, GLOB_2 checkDicSize
674 mov LOC_0 checkDicSize, x1
675
676 mov processedPos, GLOB_2 processedPos_Spec
677
678 mov state, GLOB_2 state_Spec
679 shl state, PSHIFT
680
681 mov buf, GLOB_2 buf_Spec
682 mov range, GLOB_2 range_Spec
683 mov cod, GLOB_2 code_Spec
684 mov kBitModelTotal_reg, kBitModelTotal
685 xor sym, sym
686
687 ; if (processedPos != 0 || checkDicSize != 0)
688 or x1, processedPos
689 jz @f
690
691 add t0_R, dic
692 cmp dicPos, dic
693 cmovnz t0_R, dicPos
694 movzx sym, byte ptr[t0_R - 1]
695
696@@:
697 IsMatchBranch_Pre
698 cmp state, 4 * PMULT
699 jb lit_end
700 cmp state, kNumLitStates * PMULT
701 jb lit_matched_end
702 jmp lz_end
703
704
705
706
707; ---------- LITERAL ----------
708MY_ALIGN_64
709lit_start:
710 xor state, state
711lit_start_2:
712 LIT_PROBS lpMask_reg
713
714 ifdef _LZMA_SIZE_OPT
715
716 PLOAD x1, probs + 1 * PMULT
717 mov sym, 1
718MY_ALIGN_16
719lit_loop:
720 BIT_1 x1, x2
721 mov x1, x2
722 cmp sym, 127
723 jbe lit_loop
724
725 else
726
727 BIT_0 x1, x2
728 BIT_1 x2, x1
729 BIT_1 x1, x2
730 BIT_1 x2, x1
731 BIT_1 x1, x2
732 BIT_1 x2, x1
733 BIT_1 x1, x2
734
735 endif
736
737 BIT_2 x2, 256 - 1
738
739 ; mov dic, LOC dic_Spec
740 mov probs, LOC probs_Spec
741 IsMatchBranch_Pre
742 mov byte ptr[dicPos], sym_L
743 inc dicPos
744
745 CheckLimits
746lit_end:
747 IF_BIT_0_NOUP probs_state_R, pbPos_R, IsMatch, lit_start
748
749 ; jmp IsMatch_label
750
751; ---------- MATCHES ----------
752; MY_ALIGN_32
753IsMatch_label:
754 UPDATE_1 probs_state_R, pbPos_R, IsMatch
755 IF_BIT_1 probs_state_R, 0, IsRep, IsRep_label
756
757 add probs, LenCoder * PMULT
758 add state, kNumStates * PMULT
759
760; ---------- LEN DECODE ----------
761len_decode:
762 mov len_temp, 8 - 1 - kMatchMinLen
763 IF_BIT_0_NOUP probs, 0, 0, len_mid_0
764 UPDATE_1 probs, 0, 0
765 add probs, (1 SHL (kLenNumLowBits + PSHIFT))
766 mov len_temp, -1 - kMatchMinLen
767 IF_BIT_0_NOUP probs, 0, 0, len_mid_0
768 UPDATE_1 probs, 0, 0
769 add probs, LenHigh * PMULT - (1 SHL (kLenNumLowBits + PSHIFT))
770 mov sym, 1
771 PLOAD x1, probs + 1 * PMULT
772
773MY_ALIGN_32
774len8_loop:
775 BIT_1 x1, x2
776 mov x1, x2
777 cmp sym, 64
778 jb len8_loop
779
780 mov len_temp, (kLenNumHighSymbols - kLenNumLowSymbols * 2) - 1 - kMatchMinLen
781 jmp short len_mid_2 ; "short" is forced here because MASM, unlike other assemblers, doesn't optimize this jump to the short form
782
783MY_ALIGN_32
784len_mid_0:
785 UPDATE_0 probs, 0, 0
786 add probs, pbPos_R
787 BIT_0 x2, x1
788len_mid_2:
789 BIT_1 x1, x2
790 BIT_2 x2, len_temp
791 mov probs, LOC probs_Spec
792 cmp state, kNumStates * PMULT
793 jb copy_match
794
795
796; ---------- DECODE DISTANCE ----------
797 ; probs + PosSlot + ((len < kNumLenToPosStates ? len : kNumLenToPosStates - 1) << kNumPosSlotBits);
798
799 mov t0, 3 + kMatchMinLen
800 cmp sym, 3 + kMatchMinLen
801 cmovb t0, sym
802 add probs, PosSlot * PMULT - (kMatchMinLen SHL (kNumPosSlotBits + PSHIFT))
803 shl t0, (kNumPosSlotBits + PSHIFT)
804 add probs, t0_R
805
806 ; sym = Len
807 ; mov LOC remainLen, sym
808 mov len_temp, sym
809
810 ifdef _LZMA_SIZE_OPT
811
812 PLOAD x1, probs + 1 * PMULT
813 mov sym, 1
814MY_ALIGN_16
815slot_loop:
816 BIT_1 x1, x2
817 mov x1, x2
818 cmp sym, 32
819 jb slot_loop
820
821 else
822
823 BIT_0 x1, x2
824 BIT_1 x2, x1
825 BIT_1 x1, x2
826 BIT_1 x2, x1
827 BIT_1 x1, x2
828
829 endif
830
831 mov x1, sym
832 BIT_2 x2, 64-1
833
834 and sym, 3
835 mov probs, LOC probs_Spec
836 cmp x1, 32 + kEndPosModelIndex / 2
837 jb short_dist
838
839 ; unsigned numDirectBits = (unsigned)(((distance >> 1) - 1));
840 sub x1, (32 + 1 + kNumAlignBits)
841 ; distance = (2 | (distance & 1));
842 or sym, 2
843 PLOAD x2, probs + 1 * PMULT
844 shl sym, kNumAlignBits + 1
845 lea sym2_R, [probs + 2 * PMULT]
846
847 jmp direct_norm
848 ; lea t1, [sym_R + (1 SHL kNumAlignBits)]
849 ; cmp range, kTopValue
850 ; jb direct_norm
851
852; ---------- DIRECT DISTANCE ----------
853MY_ALIGN_32
854direct_loop:
855 shr range, 1
856 mov t0, cod
857 sub cod, range
858 cmovs cod, t0
859 cmovns sym, t1
860
861 comment ~
862 sub cod, range
863 mov x2, cod
864 sar x2, 31
865 lea sym, dword ptr [r2 + sym_R * 2 + 1]
866 and x2, range
867 add cod, x2
868 ~
869 dec x1
870 je direct_end
871
872 add sym, sym
873direct_norm:
874 lea t1, [sym_R + (1 SHL kNumAlignBits)]
875 cmp range, kTopValue
876 jae near ptr direct_loop
877 ; the longer "near ptr" encoding above keeps this code 32-byte aligned
878 NORM_2
879 jmp direct_loop
880
881MY_ALIGN_32
882direct_end:
883 ; prob = + kAlign;
884 ; distance <<= kNumAlignBits;
885 REV_0 x2, x1
886 REV_1 x1, x2, 2
887 REV_1 x2, x1, 4
888 REV_2 x1, 8
889
890decode_dist_end:
891
892 ; if (distance >= (checkDicSize == 0 ? processedPos: checkDicSize))
893
894 mov t1, LOC rep0
895 mov x1, LOC rep1
896 mov x2, LOC rep2
897
898 mov t0, LOC checkDicSize
899 test t0, t0
900 cmove t0, processedPos
901 cmp sym, t0
902 jae end_of_payload
903 ; jmp end_of_payload ; for debug
904
905 ; rep3 = rep2;
906 ; rep2 = rep1;
907 ; rep1 = rep0;
908 ; rep0 = distance + 1;
909
910 inc sym
911 mov LOC rep0, sym
912 ; mov sym, LOC remainLen
913 mov sym, len_temp
914 mov LOC rep1, t1
915 mov LOC rep2, x1
916 mov LOC rep3, x2
917
918 ; state = (state < kNumStates + kNumLitStates) ? kNumLitStates : kNumLitStates + 3;
919 cmp state, (kNumStates + kNumLitStates) * PMULT
920 mov state, kNumLitStates * PMULT
921 mov t0, (kNumLitStates + 3) * PMULT
922 cmovae state, t0
923
924
925; ---------- COPY MATCH ----------
926copy_match:
927
928 ; len += kMatchMinLen;
929 ; add sym, kMatchMinLen
930
931 ; if ((rem = limit - dicPos) == 0)
932 ; {
933 ; p->dicPos = dicPos;
934 ; return SZ_ERROR_DATA;
935 ; }
936 mov cnt_R, LOC limit
937 sub cnt_R, dicPos
938 jz fin_dicPos_LIMIT
939
940 ; curLen = ((rem < len) ? (unsigned)rem : len);
941 cmp cnt_R, sym_R
942 ; cmovae cnt_R, sym_R ; 64-bit
943 cmovae cnt, sym ; 32-bit
944
945 mov dic, LOC dic_Spec
946 mov x1, LOC rep0
947
948 mov t0_R, dicPos
949 add dicPos, cnt_R
950 ; processedPos += curLen;
951 add processedPos, cnt
952 ; len -= curLen;
953 sub sym, cnt
954 mov LOC remainLen, sym
955
956 sub t0_R, dic
957
958 ; pos = dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0);
959 sub t0_R, r1
960 jae @f
961
962 mov r1, LOC dicBufSize
963 add t0_R, r1
964 sub r1, t0_R
965 cmp cnt_R, r1
966 ja copy_match_cross
967@@:
968 ; if (curLen <= dicBufSize - pos)
969
970; ---------- COPY MATCH FAST ----------
971 ; Byte *dest = dic + dicPos;
972 ; mov r1, dic
973 ; ptrdiff_t src = (ptrdiff_t)pos - (ptrdiff_t)dicPos;
974 ; sub t0_R, dicPos
975 ; dicPos += curLen;
976
977 ; const Byte *lim = dest + curLen;
978 add t0_R, dic
979 movzx sym, byte ptr[t0_R]
980 add t0_R, cnt_R
981 neg cnt_R
982 ; lea r1, [dicPos - 1]
983copy_common:
984 dec dicPos
985 ; cmp LOC rep0, 1
986 ; je rep0Label
987
988 ; t0_R - src_lim
989 ; r1 - dest_lim - 1
990 ; cnt_R - (-cnt)
991
992 IsMatchBranch_Pre
993 inc cnt_R
994 jz copy_end
995MY_ALIGN_16
996@@:
997 mov byte ptr[cnt_R * 1 + dicPos], sym_L
998 movzx sym, byte ptr[cnt_R * 1 + t0_R]
999 inc cnt_R
1000 jnz @b
1001
1002copy_end:
1003lz_end_match:
1004 mov byte ptr[dicPos], sym_L
1005 inc dicPos
1006
1007 ; IsMatchBranch_Pre
1008 CheckLimits
1009lz_end:
1010 IF_BIT_1_NOUP probs_state_R, pbPos_R, IsMatch, IsMatch_label
1011
1012
1013
1014; ---------- LITERAL MATCHED ----------
1015
1016 LIT_PROBS LOC lpMask
1017
1018 ; matchByte = dic[dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0)];
1019 mov x1, LOC rep0
1020 ; mov dic, LOC dic_Spec
1021 mov LOC dicPos_Spec, dicPos
1022
1023 ; state -= (state < 10) ? 3 : 6;
1024 lea t0, [state_R - 6 * PMULT]
1025 sub state, 3 * PMULT
1026 cmp state, 7 * PMULT
1027 cmovae state, t0
1028
1029 sub dicPos, dic
1030 sub dicPos, r1
1031 jae @f
1032 add dicPos, LOC dicBufSize
1033@@:
1034 comment ~
1035 xor t0, t0
1036 sub dicPos, r1
1037 cmovb t0_R, LOC dicBufSize
1038 ~
1039
1040 movzx match, byte ptr[dic + dicPos * 1]
1041
1042 ifdef _LZMA_SIZE_OPT
1043
1044 mov offs, 256 * PMULT
1045 shl match, (PSHIFT + 1)
1046 mov bit, match
1047 mov sym, 1
1048MY_ALIGN_16
1049litm_loop:
1050 LITM
1051 cmp sym, 256
1052 jb litm_loop
1053 sub sym, 256
1054
1055 else
1056
1057 LITM_0
1058 LITM
1059 LITM
1060 LITM
1061 LITM
1062 LITM
1063 LITM
1064 LITM_2
1065
1066 endif
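; matched-literal decoding sketched in C (simplified from LzmaDec.c;
; DecodeBit stands for one range-decoder bit step, and the LITM chain
; above performs eight such steps with PMULT-scaled indices):
;   unsigned offs = 0x100, symbol = 1;
;   do {
;     matchByte <<= 1;
;     unsigned bit = offs & matchByte;
;     unsigned b = DecodeBit(prob + offs + bit + symbol);
;     symbol = symbol * 2 + b;
;     if (b != (bit != 0)) offs = 0;  // prefixes diverged: plain tree
;   } while (symbol < 0x100);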
1067
1068 mov probs, LOC probs_Spec
1069 IsMatchBranch_Pre
1070 ; mov dic, LOC dic_Spec
1071 mov dicPos, LOC dicPos_Spec
1072 mov byte ptr[dicPos], sym_L
1073 inc dicPos
1074
1075 CheckLimits
1076lit_matched_end:
1077 IF_BIT_1_NOUP probs_state_R, pbPos_R, IsMatch, IsMatch_label
1078 ; IsMatchBranch
1079 mov lpMask_reg, LOC lpMask
1080 sub state, 3 * PMULT
1081 jmp lit_start_2
1082
1083
1084
1085; ---------- REP 0 LITERAL ----------
1086MY_ALIGN_32
1087IsRep0Short_label:
1088 UPDATE_0 probs_state_R, pbPos_R, IsRep0Long
1089
1090 ; dic[dicPos] = dic[dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0)];
1091 mov dic, LOC dic_Spec
1092 mov t0_R, dicPos
1093 mov probBranch, LOC rep0
1094 sub t0_R, dic
1095
1096 sub probs, RepLenCoder * PMULT
1097
1098 ; state = state < kNumLitStates ? 9 : 11;
1099 or state, 1 * PMULT
1100
1101 ; the caller rules out the (dicPos >= limit) case for REP_SHORT,
1102 ; so the following (dicPos == limit) check is not needed here:
1103 ; cmp dicPos, LOC limit
1104 ; jae fin_dicPos_LIMIT_REP_SHORT
1105
1106 inc processedPos
1107
1108 IsMatchBranch_Pre
1109
1110; xor sym, sym
1111; sub t0_R, probBranch_R
1112; cmovb sym_R, LOC dicBufSize
1113; add t0_R, sym_R
1114 sub t0_R, probBranch_R
1115 jae @f
1116 add t0_R, LOC dicBufSize
1117@@:
1118 movzx sym, byte ptr[dic + t0_R * 1]
1119 jmp lz_end_match
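; a C sketch of the fetch above (the branch handles the circular
; dictionary wrap; the store itself happens at lz_end_match):
;   SizeT pos = dicPos - rep0;
;   if (dicPos < rep0) pos += dicBufSize;
;   dic[dicPos] = dic[pos];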
1120
1121
1122MY_ALIGN_32
1123IsRep_label:
1124 UPDATE_1 probs_state_R, 0, IsRep
1125
1126 ; The (checkDicSize == 0 && processedPos == 0) case was checked before in LzmaDec.c with kBadRepCode.
1127 ; So we don't check it here.
1128
1129 ; mov t0, processedPos
1130 ; or t0, LOC checkDicSize
1131 ; jz fin_ERROR_2
1132
1133 ; state = state < kNumLitStates ? 8 : 11;
1134 cmp state, kNumLitStates * PMULT
1135 mov state, 8 * PMULT
1136 mov probBranch, 11 * PMULT
1137 cmovae state, probBranch
1138
1139 ; prob = probs + RepLenCoder;
1140 add probs, RepLenCoder * PMULT
1141
1142 IF_BIT_1 probs_state_R, 0, IsRepG0, IsRepG0_label
1143 IF_BIT_0_NOUP probs_state_R, pbPos_R, IsRep0Long, IsRep0Short_label
1144 UPDATE_1 probs_state_R, pbPos_R, IsRep0Long
1145 jmp len_decode
1146
1147MY_ALIGN_32
1148IsRepG0_label:
1149 UPDATE_1 probs_state_R, 0, IsRepG0
1150 mov dist2, LOC rep0
1151 mov dist, LOC rep1
1152 mov LOC rep1, dist2
1153
1154 IF_BIT_1 probs_state_R, 0, IsRepG1, IsRepG1_label
1155 mov LOC rep0, dist
1156 jmp len_decode
1157
1158; MY_ALIGN_32
1159IsRepG1_label:
1160 UPDATE_1 probs_state_R, 0, IsRepG1
1161 mov dist2, LOC rep2
1162 mov LOC rep2, dist
1163
1164 IF_BIT_1 probs_state_R, 0, IsRepG2, IsRepG2_label
1165 mov LOC rep0, dist2
1166 jmp len_decode
1167
1168; MY_ALIGN_32
1169IsRepG2_label:
1170 UPDATE_1 probs_state_R, 0, IsRepG2
1171 mov dist, LOC rep3
1172 mov LOC rep3, dist2
1173 mov LOC rep0, dist
1174 jmp len_decode
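; the three labels above rotate the rep-distance history; in C (bit()
; stands for decoding one bit with the named probability):
;   if (!bit(IsRepG0)) distance = rep0;
;   else if (!bit(IsRepG1)) { distance = rep1; rep1 = rep0; }
;   else if (!bit(IsRepG2)) { distance = rep2; rep2 = rep1; rep1 = rep0; }
;   else { distance = rep3; rep3 = rep2; rep2 = rep1; rep1 = rep0; }
;   rep0 = distance;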
1175
1176
1177
1178; ---------- SPEC SHORT DISTANCE ----------
1179
1180MY_ALIGN_32
1181short_dist:
1182 sub x1, 32 + 1
1183 jbe decode_dist_end
1184 or sym, 2
1185 shl sym, x1_L
1186 lea sym_R, [probs + sym_R * PMULT + SpecPos * PMULT + 1 * PMULT]
1187 mov sym2, PMULT ; step
1188MY_ALIGN_32
1189spec_loop:
1190 REV_1_VAR x2
1191 dec x1
1192 jnz spec_loop
1193
1194 mov probs, LOC probs_Spec
1195 sub sym, sym2
1196 sub sym, SpecPos * PMULT
1197 sub sym_R, probs
1198 shr sym, PSHIFT
1199
1200 jmp decode_dist_end
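; the spec_loop above is a reverse bit tree over the SpecPos probs; in
; LzmaDec.c terms (REV_BIT_VAR decodes one bit and folds it into
; distance at weight m):
;   distance = (2 | (posSlot & 1)) << numDirectBits;
;   prob = probs + SpecPos;
;   UInt32 m = 1;
;   distance++;
;   do { REV_BIT_VAR(prob, distance, m); } while (--numDirectBits);
;   distance -= m;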
1201
1202
1203; ---------- COPY MATCH CROSS ----------
1204copy_match_cross:
1205 ; t0_R - src pos
1206 ; r1 - len to dicBufSize
1207 ; cnt_R - total copy len
1208
1209 mov t1_R, t0_R ; srcPos
1210 mov t0_R, dic
1211 mov r1, LOC dicBufSize ;
1212 neg cnt_R
1213@@:
1214 movzx sym, byte ptr[t1_R * 1 + t0_R]
1215 inc t1_R
1216 mov byte ptr[cnt_R * 1 + dicPos], sym_L
1217 inc cnt_R
1218 cmp t1_R, r1
1219 jne @b
1220
1221 movzx sym, byte ptr[t0_R]
1222 sub t0_R, cnt_R
1223 jmp copy_common
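; cross-boundary copy sketched in C: copy byte by byte up to the end of
; the circular buffer, then restart the source at dic[0]; sym is
; preloaded with dic[0] and t0_R rebased so the copy_common loop's
; negative index continues from the buffer start:
;   while (srcPos != dicBufSize) *dst++ = dic[srcPos++];
;   srcPos = 0;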
1224
1225
1226
1227
1228; fin_dicPos_LIMIT_REP_SHORT:
1229 ; mov sym, 1
1230
1231fin_dicPos_LIMIT:
1232 mov LOC remainLen, sym
1233 jmp fin_OK
1234 ; For a stricter mode we could stop decoding with an error:
1235 ; mov sym, 1
1236 ; jmp fin
1237
1238
1239fin_ERROR_MATCH_DIST:
1240
1241 ; rep3 = rep2;
1242 ; rep2 = rep1;
1243 ; rep1 = rep0;
1244 ; rep0 = distance + 1;
1245
1246 add len_temp, kMatchSpecLen_Error_Data
1247 mov LOC remainLen, len_temp
1248
1249 mov LOC rep0, sym
1250 mov LOC rep1, t1
1251 mov LOC rep2, x1
1252 mov LOC rep3, x2
1253
1254 ; state = (state < kNumStates + kNumLitStates) ? kNumLitStates : kNumLitStates + 3;
1255 cmp state, (kNumStates + kNumLitStates) * PMULT
1256 mov state, kNumLitStates * PMULT
1257 mov t0, (kNumLitStates + 3) * PMULT
1258 cmovae state, t0
1259
1260 ; jmp fin_OK
1261 mov sym, 1
1262 jmp fin
1263
1264end_of_payload:
1265 inc sym
1266 jnz fin_ERROR_MATCH_DIST
1267
1268 mov LOC remainLen, kMatchSpecLenStart
1269 sub state, kNumStates * PMULT
1270
1271fin_OK:
1272 xor sym, sym
1273
1274fin:
1275 NORM
1276
1277 mov r1, LOC lzmaPtr
1278
1279 sub dicPos, LOC dic_Spec
1280 mov GLOB dicPos_Spec, dicPos
1281 mov GLOB buf_Spec, buf
1282 mov GLOB range_Spec, range
1283 mov GLOB code_Spec, cod
1284 shr state, PSHIFT
1285 mov GLOB state_Spec, state
1286 mov GLOB processedPos_Spec, processedPos
1287
1288 RESTORE_VAR(remainLen)
1289 RESTORE_VAR(rep0)
1290 RESTORE_VAR(rep1)
1291 RESTORE_VAR(rep2)
1292 RESTORE_VAR(rep3)
1293
1294 mov x0, sym
1295
1296 mov RSP, LOC Old_RSP
1297
1298MY_POP_PRESERVED_ABI_REGS
1299MY_ENDP
1300
1301_TEXT$LZMADECOPT ENDS
1302
1303end
diff --git a/Asm/x86/Sha1Opt.asm b/Asm/x86/Sha1Opt.asm
new file mode 100644
index 0000000..3495fd1
--- /dev/null
+++ b/Asm/x86/Sha1Opt.asm
@@ -0,0 +1,263 @@
1; Sha1Opt.asm -- SHA-1 optimized code for x86 SHA-1 hardware instructions
2; 2021-03-10 : Igor Pavlov : Public domain
3
4include 7zAsm.asm
5
6MY_ASM_START
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23CONST SEGMENT
24
25align 16
26Reverse_Endian_Mask db 15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49CONST ENDS
50
51; _TEXT$SHA1OPT SEGMENT 'CODE'
52
53ifndef x64
54 .686
55 .xmm
56endif
57
58ifdef x64
59 rNum equ REG_ABI_PARAM_2
60 if (IS_LINUX eq 0)
61 LOCAL_SIZE equ (16 * 2)
62 endif
63else
64 rNum equ r0
65 LOCAL_SIZE equ (16 * 1)
66endif
67
68rState equ REG_ABI_PARAM_0
69rData equ REG_ABI_PARAM_1
70
71
72MY_sha1rnds4 macro a1, a2, imm
73 db 0fH, 03aH, 0ccH, (0c0H + a1 * 8 + a2), imm
74endm
75
76MY_SHA_INSTR macro cmd, a1, a2
77 db 0fH, 038H, cmd, (0c0H + a1 * 8 + a2)
78endm
79
80cmd_sha1nexte equ 0c8H
81cmd_sha1msg1 equ 0c9H
82cmd_sha1msg2 equ 0caH
83
84MY_sha1nexte macro a1, a2
85 MY_SHA_INSTR cmd_sha1nexte, a1, a2
86endm
87
88MY_sha1msg1 macro a1, a2
89 MY_SHA_INSTR cmd_sha1msg1, a1, a2
90endm
91
92MY_sha1msg2 macro a1, a2
93 MY_SHA_INSTR cmd_sha1msg2, a1, a2
94endm
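; the macros above hand-assemble the SHA-1 instructions as raw bytes so
; the file also builds with assemblers that do not know the SHA
; extensions; the ModRM byte 0C0H + a1 * 8 + a2 selects register-direct
; xmm(a1), xmm(a2). For example, "sha1rnds4 xmm2, xmm0, 0" encodes as:
;   db 0fH, 03aH, 0ccH, 0d0H, 0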
95
96MY_PROLOG macro
97 ifdef x64
98 if (IS_LINUX eq 0)
99 movdqa [r4 + 8], xmm6
100 movdqa [r4 + 8 + 16], xmm7
101 sub r4, LOCAL_SIZE + 8
102 movdqa [r4 ], xmm8
103 movdqa [r4 + 16], xmm9
104 endif
105 else ; x86
106 if (IS_CDECL gt 0)
107 mov rState, [r4 + REG_SIZE * 1]
108 mov rData, [r4 + REG_SIZE * 2]
109 mov rNum, [r4 + REG_SIZE * 3]
110 else ; fastcall
111 mov rNum, [r4 + REG_SIZE * 1]
112 endif
113 push r5
114 mov r5, r4
115 and r4, -16
116 sub r4, LOCAL_SIZE
117 endif
118endm
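; note: the Win64 ABI makes xmm6-xmm15 callee-saved; xmm6/xmm7 are
; spilled into the caller's 32-byte shadow space ([r4 + 8] is 16-byte
; aligned at entry, so movdqa is safe), and xmm8/xmm9 into fresh stack
; space. The x86 path instead realigns r4 to 16 bytes for the [r4] slot.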
119
120MY_EPILOG macro
121 ifdef x64
122 if (IS_LINUX eq 0)
123 movdqa xmm8, [r4]
124 movdqa xmm9, [r4 + 16]
125 add r4, LOCAL_SIZE + 8
126 movdqa xmm6, [r4 + 8]
127 movdqa xmm7, [r4 + 8 + 16]
128 endif
129 else ; x86
130 mov r4, r5
131 pop r5
132 endif
133 MY_ENDP
134endm
135
136
137e0_N equ 0
138e1_N equ 1
139abcd_N equ 2
140e0_save_N equ 3
141w_regs equ 4
142
143e0 equ @CatStr(xmm, %e0_N)
144e1 equ @CatStr(xmm, %e1_N)
145abcd equ @CatStr(xmm, %abcd_N)
146e0_save equ @CatStr(xmm, %e0_save_N)
147
148
149ifdef x64
150 abcd_save equ xmm8
151 mask2 equ xmm9
152else
153 abcd_save equ [r4]
154 mask2 equ e1
155endif
156
157LOAD_MASK macro
158 movdqa mask2, XMMWORD PTR Reverse_Endian_Mask
159endm
160
161LOAD_W macro k:req
162 movdqu @CatStr(xmm, %(w_regs + k)), [rData + (16 * (k))]
163 pshufb @CatStr(xmm, %(w_regs + k)), mask2
164endm
165
166
167; pre2 can be 2 or 3 (3 is recommended)
168pre2 equ 3
169pre1 equ (pre2 + 1)
170
171NUM_ROUNDS4 equ 20
172
173RND4 macro k
174 movdqa @CatStr(xmm, %(e0_N + ((k + 1) mod 2))), abcd
175 MY_sha1rnds4 abcd_N, (e0_N + (k mod 2)), k / 5
176
177 nextM = (w_regs + ((k + 1) mod 4))
178
179 if (k EQ NUM_ROUNDS4 - 1)
180 nextM = e0_save_N
181 endif
182
183 MY_sha1nexte (e0_N + ((k + 1) mod 2)), nextM
184
185 if (k GE (4 - pre2)) AND (k LT (NUM_ROUNDS4 - pre2))
186 pxor @CatStr(xmm, %(w_regs + ((k + pre2) mod 4))), @CatStr(xmm, %(w_regs + ((k + pre2 - 2) mod 4)))
187 endif
188
189 if (k GE (4 - pre1)) AND (k LT (NUM_ROUNDS4 - pre1))
190 MY_sha1msg1 (w_regs + ((k + pre1) mod 4)), (w_regs + ((k + pre1 - 3) mod 4))
191 endif
192
193 if (k GE (4 - pre2)) AND (k LT (NUM_ROUNDS4 - pre2))
194 MY_sha1msg2 (w_regs + ((k + pre2) mod 4)), (w_regs + ((k + pre2 - 1) mod 4))
195 endif
196endm
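; one RND4 step sketched with C intrinsics (an illustration of the data
; flow, not the exact register scheduling):
;   __m128i old_abcd = abcd;                 // saved into e[(k + 1) % 2]
;   abcd = _mm_sha1rnds4_epu32(abcd, e[k % 2], k / 5);  // 4 rounds, f(k/5)
;   e[(k + 1) % 2] = _mm_sha1nexte_epu32(old_abcd, w[(k + 1) % 4]);
;   // on the last step the saved initial E (e0_save) is folded in
;   // instead of w[]; sha1msg1/pxor/sha1msg2 run pre1/pre2 steps ahead
;   // to build the message schedule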
197
198
199REVERSE_STATE macro
200 ; abcd ; dcba
201 ; e0 ; 000e
202 pshufd abcd, abcd, 01bH ; abcd
203 pshufd e0, e0, 01bH ; e000
204endm
205
206
207
208
209
210MY_PROC Sha1_UpdateBlocks_HW, 3
211 MY_PROLOG
212
213 cmp rNum, 0
214 je end_c
215
216 movdqu abcd, [rState] ; dcba
217 movd e0, dword ptr [rState + 16] ; 000e
218
219 REVERSE_STATE
220
221 ifdef x64
222 LOAD_MASK
223 endif
224
225 align 16
226 nextBlock:
227 movdqa abcd_save, abcd
228 movdqa e0_save, e0
229
230 ifndef x64
231 LOAD_MASK
232 endif
233
234 LOAD_W 0
235 LOAD_W 1
236 LOAD_W 2
237 LOAD_W 3
238
239 paddd e0, @CatStr(xmm, %(w_regs))
240 k = 0
241 rept NUM_ROUNDS4
242 RND4 k
243 k = k + 1
244 endm
245
246 paddd abcd, abcd_save
247
248
249 add rData, 64
250 sub rNum, 1
251 jnz nextBlock
252
253 REVERSE_STATE
254
255 movdqu [rState], abcd
256 movd dword ptr [rState + 16], e0
257
258 end_c:
259MY_EPILOG
260
261; _TEXT$SHA1OPT ENDS
262
263end
diff --git a/Asm/x86/Sha256Opt.asm b/Asm/x86/Sha256Opt.asm
new file mode 100644
index 0000000..5d02c90
--- /dev/null
+++ b/Asm/x86/Sha256Opt.asm
@@ -0,0 +1,263 @@
1; Sha256Opt.asm -- SHA-256 optimized code for x86 SHA-256 hardware instructions
2; 2021-03-10 : Igor Pavlov : Public domain
3
4include 7zAsm.asm
5
6MY_ASM_START
7
8; .data
9; public K
10
11; we can use the external SHA256_K_ARRAY defined in Sha256.c,
12; but we must guarantee that SHA256_K_ARRAY is 16-byte aligned
13
14COMMENT @
15ifdef x64
16K_CONST equ SHA256_K_ARRAY
17else
18K_CONST equ _SHA256_K_ARRAY
19endif
20EXTRN K_CONST:xmmword
21@
22
23CONST SEGMENT
24
25align 16
26Reverse_Endian_Mask db 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12
27
28; COMMENT @
29align 16
30K_CONST \
31DD 0428a2f98H, 071374491H, 0b5c0fbcfH, 0e9b5dba5H
32DD 03956c25bH, 059f111f1H, 0923f82a4H, 0ab1c5ed5H
33DD 0d807aa98H, 012835b01H, 0243185beH, 0550c7dc3H
34DD 072be5d74H, 080deb1feH, 09bdc06a7H, 0c19bf174H
35DD 0e49b69c1H, 0efbe4786H, 00fc19dc6H, 0240ca1ccH
36DD 02de92c6fH, 04a7484aaH, 05cb0a9dcH, 076f988daH
37DD 0983e5152H, 0a831c66dH, 0b00327c8H, 0bf597fc7H
38DD 0c6e00bf3H, 0d5a79147H, 006ca6351H, 014292967H
39DD 027b70a85H, 02e1b2138H, 04d2c6dfcH, 053380d13H
40DD 0650a7354H, 0766a0abbH, 081c2c92eH, 092722c85H
41DD 0a2bfe8a1H, 0a81a664bH, 0c24b8b70H, 0c76c51a3H
42DD 0d192e819H, 0d6990624H, 0f40e3585H, 0106aa070H
43DD 019a4c116H, 01e376c08H, 02748774cH, 034b0bcb5H
44DD 0391c0cb3H, 04ed8aa4aH, 05b9cca4fH, 0682e6ff3H
45DD 0748f82eeH, 078a5636fH, 084c87814H, 08cc70208H
46DD 090befffaH, 0a4506cebH, 0bef9a3f7H, 0c67178f2H
47; @
48
49CONST ENDS
50
51; _TEXT$SHA256OPT SEGMENT 'CODE'
52
53ifndef x64
54 .686
55 .xmm
56endif
57
58ifdef x64
59 rNum equ REG_ABI_PARAM_2
60 if (IS_LINUX eq 0)
61 LOCAL_SIZE equ (16 * 2)
62 endif
63else
64 rNum equ r0
65 LOCAL_SIZE equ (16 * 1)
66endif
67
68rState equ REG_ABI_PARAM_0
69rData equ REG_ABI_PARAM_1
70
71
72
73
74
75
76MY_SHA_INSTR macro cmd, a1, a2
77 db 0fH, 038H, cmd, (0c0H + a1 * 8 + a2)
78endm
79
80cmd_sha256rnds2 equ 0cbH
81cmd_sha256msg1 equ 0ccH
82cmd_sha256msg2 equ 0cdH
83
84MY_sha256rnds2 macro a1, a2
85 MY_SHA_INSTR cmd_sha256rnds2, a1, a2
86endm
87
88MY_sha256msg1 macro a1, a2
89 MY_SHA_INSTR cmd_sha256msg1, a1, a2
90endm
91
92MY_sha256msg2 macro a1, a2
93 MY_SHA_INSTR cmd_sha256msg2, a1, a2
94endm
95
96MY_PROLOG macro
97 ifdef x64
98 if (IS_LINUX eq 0)
99 movdqa [r4 + 8], xmm6
100 movdqa [r4 + 8 + 16], xmm7
101 sub r4, LOCAL_SIZE + 8
102 movdqa [r4 ], xmm8
103 movdqa [r4 + 16], xmm9
104 endif
105 else ; x86
106 if (IS_CDECL gt 0)
107 mov rState, [r4 + REG_SIZE * 1]
108 mov rData, [r4 + REG_SIZE * 2]
109 mov rNum, [r4 + REG_SIZE * 3]
110 else ; fastcall
111 mov rNum, [r4 + REG_SIZE * 1]
112 endif
113 push r5
114 mov r5, r4
115 and r4, -16
116 sub r4, LOCAL_SIZE
117 endif
118endm
119
120MY_EPILOG macro
121 ifdef x64
122 if (IS_LINUX eq 0)
123 movdqa xmm8, [r4]
124 movdqa xmm9, [r4 + 16]
125 add r4, LOCAL_SIZE + 8
126 movdqa xmm6, [r4 + 8]
127 movdqa xmm7, [r4 + 8 + 16]
128 endif
129 else ; x86
130 mov r4, r5
131 pop r5
132 endif
133 MY_ENDP
134endm
135
136
137msg equ xmm0
138tmp equ xmm0
139state0_N equ 2
140state1_N equ 3
141w_regs equ 4
142
143
144state1_save equ xmm1
145state0 equ @CatStr(xmm, %state0_N)
146state1 equ @CatStr(xmm, %state1_N)
147
148
149ifdef x64
150 state0_save equ xmm8
151 mask2 equ xmm9
152else
153 state0_save equ [r4]
154 mask2 equ xmm0
155endif
156
157LOAD_MASK macro
158 movdqa mask2, XMMWORD PTR Reverse_Endian_Mask
159endm
160
161LOAD_W macro k:req
162 movdqu @CatStr(xmm, %(w_regs + k)), [rData + (16 * (k))]
163 pshufb @CatStr(xmm, %(w_regs + k)), mask2
164endm
165
166
167; pre1 <= 4 && pre2 >= 1 && pre1 > pre2 && (pre1 - pre2) <= 1
168pre1 equ 3
169pre2 equ 2
170
171
172
173RND4 macro k
174 movdqa msg, xmmword ptr [K_CONST + (k) * 16]
175 paddd msg, @CatStr(xmm, %(w_regs + ((k + 0) mod 4)))
176 MY_sha256rnds2 state0_N, state1_N
177 pshufd msg, msg, 0eH
178
179 if (k GE (4 - pre1)) AND (k LT (16 - pre1))
180 ; w4[0] = msg1(w4[-4], w4[-3])
181 MY_sha256msg1 (w_regs + ((k + pre1) mod 4)), (w_regs + ((k + pre1 - 3) mod 4))
182 endif
183
184 MY_sha256rnds2 state1_N, state0_N
185
186 if (k GE (4 - pre2)) AND (k LT (16 - pre2))
187 movdqa tmp, @CatStr(xmm, %(w_regs + ((k + pre2 - 1) mod 4)))
188 palignr tmp, @CatStr(xmm, %(w_regs + ((k + pre2 - 2) mod 4))), 4
189 paddd @CatStr(xmm, %(w_regs + ((k + pre2) mod 4))), tmp
190 ; w4[0] = msg2(w4[0], w4[-1])
191 MY_sha256msg2 %(w_regs + ((k + pre2) mod 4)), %(w_regs + ((k + pre2 - 1) mod 4))
192 endif
193endm
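; one RND4 step (4 rounds) sketched with C intrinsics; at entry state0
; holds cdgh and state1 holds abef (see REVERSE_STATE), and the roles
; ping-pong through the two rnds2 calls:
;   __m128i m = _mm_add_epi32(((const __m128i *)K_CONST)[k], w[k % 4]);
;   state0 = _mm_sha256rnds2_epu32(state0, state1, m); // rounds 4k, 4k+1
;   m = _mm_shuffle_epi32(m, 0x0e);                    // upper two dwords
;   state1 = _mm_sha256rnds2_epu32(state1, state0, m); // rounds 4k+2, 4k+3
;   // sha256msg1 and palignr/paddd/sha256msg2 run pre1/pre2 steps ahead
;   // to extend the message schedule w[]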
194
195
196
197
198
199REVERSE_STATE macro
200 ; state0 ; dcba
201 ; state1 ; hgfe
202 pshufd tmp, state0, 01bH ; abcd
203 pshufd state0, state1, 01bH ; efgh
204 movdqa state1, state0 ; efgh
205 punpcklqdq state0, tmp ; cdgh
206 punpckhqdq state1, tmp ; abef
207endm
208
209
210MY_PROC Sha256_UpdateBlocks_HW, 3
211 MY_PROLOG
212
213 cmp rNum, 0
214 je end_c
215
216 movdqu state0, [rState] ; dcba
217 movdqu state1, [rState + 16] ; hgfe
218
219 REVERSE_STATE
220
221 ifdef x64
222 LOAD_MASK
223 endif
224
225 align 16
226 nextBlock:
227 movdqa state0_save, state0
228 movdqa state1_save, state1
229
230 ifndef x64
231 LOAD_MASK
232 endif
233
234 LOAD_W 0
235 LOAD_W 1
236 LOAD_W 2
237 LOAD_W 3
238
239
240 k = 0
241 rept 16
242 RND4 k
243 k = k + 1
244 endm
245
246 paddd state0, state0_save
247 paddd state1, state1_save
248
249 add rData, 64
250 sub rNum, 1
251 jnz nextBlock
252
253 REVERSE_STATE
254
255 movdqu [rState], state0
256 movdqu [rState + 16], state1
257
258 end_c:
259MY_EPILOG
260
261; _TEXT$SHA256OPT ENDS
262
263end
diff --git a/Asm/x86/XzCrc64Opt.asm b/Asm/x86/XzCrc64Opt.asm
new file mode 100644
index 0000000..ad22cc2
--- /dev/null
+++ b/Asm/x86/XzCrc64Opt.asm
@@ -0,0 +1,239 @@
1; XzCrc64Opt.asm -- CRC64 calculation : optimized version
2; 2021-02-06 : Igor Pavlov : Public domain
3
4include 7zAsm.asm
5
6MY_ASM_START
7
8ifdef x64
9
10rD equ r9
11rN equ r10
12rT equ r5
13num_VAR equ r8
14
15SRCDAT4 equ dword ptr [rD + rN * 1]
16
17CRC_XOR macro dest:req, src:req, t:req
18 xor dest, QWORD PTR [rT + src * 8 + 0800h * t]
19endm
20
21CRC1b macro
22 movzx x6, BYTE PTR [rD]
23 inc rD
24 movzx x3, x0_L
25 xor x6, x3
26 shr r0, 8
27 CRC_XOR r0, r6, 0
28 dec rN
29endm
30
31MY_PROLOG macro crc_end:req
32 ifdef ABI_LINUX
33 MY_PUSH_2_REGS
34 else
35 MY_PUSH_4_REGS
36 endif
37 mov r0, REG_ABI_PARAM_0
38 mov rN, REG_ABI_PARAM_2
39 mov rT, REG_ABI_PARAM_3
40 mov rD, REG_ABI_PARAM_1
41 test rN, rN
42 jz crc_end
43 @@:
44 test rD, 3
45 jz @F
46 CRC1b
47 jnz @B
48 @@:
49 cmp rN, 8
50 jb crc_end
51 add rN, rD
52 mov num_VAR, rN
53 sub rN, 4
54 and rN, NOT 3
55 sub rD, rN
56 mov x1, SRCDAT4
57 xor r0, r1
58 add rN, 4
59endm
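; setup sketch: rN ends up pointing just past the last aligned 4-byte
; word and rD holds a negative offset, so "add rD, 4 / jnz" in the main
; loop needs no separate counter; the first word is folded into the CRC
; here, and MY_EPILOG's extra xor un-folds the one word the loop reads
; ahead before the byte-wise tail.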
60
61MY_EPILOG macro crc_end:req
62 sub rN, 4
63 mov x1, SRCDAT4
64 xor r0, r1
65 mov rD, rN
66 mov rN, num_VAR
67 sub rN, rD
68 crc_end:
69 test rN, rN
70 jz @F
71 CRC1b
72 jmp crc_end
73 @@:
74 ifdef ABI_LINUX
75 MY_POP_2_REGS
76 else
77 MY_POP_4_REGS
78 endif
79endm
80
81MY_PROC XzCrc64UpdateT4, 4
82 MY_PROLOG crc_end_4
83 align 16
84 main_loop_4:
85 mov x1, SRCDAT4
86 movzx x2, x0_L
87 movzx x3, x0_H
88 shr r0, 16
89 movzx x6, x0_L
90 movzx x7, x0_H
91 shr r0, 16
92 CRC_XOR r1, r2, 3
93 CRC_XOR r0, r3, 2
94 CRC_XOR r1, r6, 1
95 CRC_XOR r0, r7, 0
96 xor r0, r1
97
98 add rD, 4
99 jnz main_loop_4
100
101 MY_EPILOG crc_end_4
102MY_ENDP
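; slicing-by-4 sketched in C (rT is viewed as 4 interleaved 256-entry
; UInt64 tables, 0x800 bytes apart, matching "0800h * t" in CRC_XOR):
;   UInt32 d = (UInt32)crc;              // data word was folded in already
;   crc = (crc >> 32)
;       ^ t3[(Byte)d]         ^ t2[(Byte)(d >> 8)]
;       ^ t1[(Byte)(d >> 16)] ^ t0[(Byte)(d >> 24)];
;   crc ^= *(const UInt32 *)p; p += 4;   // fold the next word in advance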
103
104else
105; x86 (32-bit)
106
107rD equ r1
108rN equ r7
109rT equ r5
110
111crc_OFFS equ (REG_SIZE * 5)
112
113if (IS_CDECL gt 0) or (IS_LINUX gt 0)
114 ; cdecl or (GNU fastcall) stack:
115 ; (UInt32 *) table
116 ; size_t size
117 ; void * data
118 ; (UInt64) crc
119 ; ret-ip <-(r4)
120 data_OFFS equ (8 + crc_OFFS)
121 size_OFFS equ (REG_SIZE + data_OFFS)
122 table_OFFS equ (REG_SIZE + size_OFFS)
123 num_VAR equ [r4 + size_OFFS]
124 table_VAR equ [r4 + table_OFFS]
125else
126 ; Windows fastcall:
127 ; r1 = data, r2 = size
128 ; stack:
129 ; (UInt32 *) table
130 ; (UInt64) crc
131 ; ret-ip <-(r4)
132 table_OFFS equ (8 + crc_OFFS)
133 table_VAR equ [r4 + table_OFFS]
134 num_VAR equ table_VAR
135endif
136
137SRCDAT4 equ dword ptr [rD + rN * 1]
138
139CRC macro op0:req, op1:req, dest0:req, dest1:req, src:req, t:req
140 op0 dest0, DWORD PTR [rT + src * 8 + 0800h * t]
141 op1 dest1, DWORD PTR [rT + src * 8 + 0800h * t + 4]
142endm
143
144CRC_XOR macro dest0:req, dest1:req, src:req, t:req
145 CRC xor, xor, dest0, dest1, src, t
146endm
147
148
149CRC1b macro
150 movzx x6, BYTE PTR [rD]
151 inc rD
152 movzx x3, x0_L
153 xor x6, x3
154 shrd r0, r2, 8
155 shr r2, 8
156 CRC_XOR r0, r2, r6, 0
157 dec rN
158endm
159
160MY_PROLOG macro crc_end:req
161 MY_PUSH_4_REGS
162
163 if (IS_CDECL gt 0) or (IS_LINUX gt 0)
164 proc_numParams = proc_numParams + 2 ; for ABI_LINUX
165 mov rN, [r4 + size_OFFS]
166 mov rD, [r4 + data_OFFS]
167 else
168 mov rN, r2
169 endif
170
171 mov x0, [r4 + crc_OFFS]
172 mov x2, [r4 + crc_OFFS + 4]
173 mov rT, table_VAR
174 test rN, rN
175 jz crc_end
176 @@:
177 test rD, 3
178 jz @F
179 CRC1b
180 jnz @B
181 @@:
182 cmp rN, 8
183 jb crc_end
184 add rN, rD
185
186 mov num_VAR, rN
187
188 sub rN, 4
189 and rN, NOT 3
190 sub rD, rN
191 xor r0, SRCDAT4
192 add rN, 4
193endm
194
195MY_EPILOG macro crc_end:req
196 sub rN, 4
197 xor r0, SRCDAT4
198
199 mov rD, rN
200 mov rN, num_VAR
201 sub rN, rD
202 crc_end:
203 test rN, rN
204 jz @F
205 CRC1b
206 jmp crc_end
207 @@:
208 MY_POP_4_REGS
209endm
210
211MY_PROC XzCrc64UpdateT4, 5
212 MY_PROLOG crc_end_4
213 movzx x6, x0_L
214 align 16
215 main_loop_4:
216 mov r3, SRCDAT4
217 xor r3, r2
218
219 CRC xor, mov, r3, r2, r6, 3
220 movzx x6, x0_H
221 shr r0, 16
222 CRC_XOR r3, r2, r6, 2
223
224 movzx x6, x0_L
225 movzx x0, x0_H
226 CRC_XOR r3, r2, r6, 1
227 CRC_XOR r3, r2, r0, 0
228 movzx x6, x3_L
229 mov r0, r3
230
231 add rD, 4
232 jnz main_loop_4
233
234 MY_EPILOG crc_end_4
235MY_ENDP
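; the 32-bit build implements the same slicing-by-4 recurrence with the
; 64-bit CRC held in the r0:r2 pair: the CRC macro reads each table
; entry as two 32-bit halves, and shrd/shr in CRC1b replace the single
; 64-bit shift of the x64 version.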
236
237endif ; ! x64
238
239end