author    Igor Pavlov <87184205+ip7z@users.noreply.github.com>  2021-12-27 00:00:00 +0000
committer Igor Pavlov <87184205+ip7z@users.noreply.github.com>  2022-03-18 15:35:13 +0500
commit    f19f813537c7aea1c20749c914e756b54a9c3cf5 (patch)
tree      816ba62ca7c0fa19f2eb46d9e9d6f7dd7c3a744d /Asm
parent    98e06a519b63b81986abe76d28887f6984a7732b (diff)
tag: 21.07
Diffstat (limited to 'Asm')
-rw-r--r--  Asm/arm/7zCrcOpt.asm      100
-rw-r--r--  Asm/arm64/7zAsm.S         181
-rw-r--r--  Asm/arm64/LzmaDecOpt.S   1487
-rw-r--r--  Asm/x86/7zAsm.asm         284
-rw-r--r--  Asm/x86/7zCrcOpt.asm      180
-rw-r--r--  Asm/x86/AesOpt.asm        742
-rw-r--r--  Asm/x86/LzFindOpt.asm     513
-rw-r--r--  Asm/x86/LzmaDecOpt.asm   1303
-rw-r--r--  Asm/x86/Sha1Opt.asm       263
-rw-r--r--  Asm/x86/Sha256Opt.asm     263
-rw-r--r--  Asm/x86/XzCrc64Opt.asm    239
11 files changed, 5555 insertions, 0 deletions
diff --git a/Asm/arm/7zCrcOpt.asm b/Asm/arm/7zCrcOpt.asm
new file mode 100644
index 0000000..6001d8e
--- /dev/null
+++ b/Asm/arm/7zCrcOpt.asm
@@ -0,0 +1,100 @@
1 CODE32
2
3 EXPORT |CrcUpdateT4@16|
4
5 AREA |.text|, CODE, ARM
6
7 MACRO
8 CRC32_STEP_1
9
10 ldrb r4, [r1], #1
11 subs r2, r2, #1
12 eor r4, r4, r0
13 and r4, r4, #0xFF
14 ldr r4, [r3, +r4, lsl #2]
15 eor r0, r4, r0, lsr #8
16
17 MEND
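; Note (annotation, not part of the original commit): CRC32_STEP_1 is the
; classic table-driven byte step, equivalent to the C expression
;   crc = table[(crc ^ *data++) & 0xFF] ^ (crc >> 8);
; with r0 = crc, r1 = data, r2 = remaining size, r3 = table base.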
18
19
20 MACRO
21 CRC32_STEP_4 $STREAM_WORD
22
23 eor r7, r7, r8
24 eor r7, r7, r9
25 eor r0, r0, r7
26 eor r0, r0, $STREAM_WORD
27 ldr $STREAM_WORD, [r1], #4
28
29 and r7, r0, #0xFF
30 and r8, r0, #0xFF00
31 and r9, r0, #0xFF0000
32 and r0, r0, #0xFF000000
33
34 ldr r7, [r6, +r7, lsl #2]
35 ldr r8, [r5, +r8, lsr #6]
36 ldr r9, [r4, +r9, lsr #14]
37 ldr r0, [r3, +r0, lsr #22]
38
39 MEND
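; Note (annotation, not part of the original commit): CRC32_STEP_4 is one
; word of a slicing-by-4 CRC. The 32-bit accumulator is split into its four
; bytes and each byte indexes its own 1 KB table (r3, r4, r5, r6 are the
; table bases, 0x400 bytes apart); "lsr #6 / #14 / #22" turns a byte field
; at bit 8 / 16 / 24 into a word-scaled (x4) table offset.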
40
41
42|CrcUpdateT4@16| PROC
43
44 stmdb sp!, {r4-r11, lr}
45 cmp r2, #0
46 beq |$fin|
47
48|$v1|
49 tst r1, #7
50 beq |$v2|
51 CRC32_STEP_1
52 bne |$v1|
53
54|$v2|
55 cmp r2, #16
56 blo |$v3|
57
58 ldr r10, [r1], #4
59 ldr r11, [r1], #4
60
61 add r4, r3, #0x400
62 add r5, r3, #0x800
63 add r6, r3, #0xC00
64
65 mov r7, #0
66 mov r8, #0
67 mov r9, #0
68
69 sub r2, r2, #16
70
71|$loop|
72 ; pld [r1, #0x40]
73
74 CRC32_STEP_4 r10
75 CRC32_STEP_4 r11
76
77 subs r2, r2, #8
78 bhs |$loop|
79
80 sub r1, r1, #8
81 add r2, r2, #16
82
83 eor r7, r7, r8
84 eor r7, r7, r9
85 eor r0, r0, r7
86
87|$v3|
88 cmp r2, #0
89 beq |$fin|
90
91|$v4|
92 CRC32_STEP_1
93 bne |$v4|
94
95|$fin|
96 ldmia sp!, {r4-r11, pc}
97
98|CrcUpdateT4@16| ENDP
99
100 END
diff --git a/Asm/arm64/7zAsm.S b/Asm/arm64/7zAsm.S
new file mode 100644
index 0000000..12e950b
--- /dev/null
+++ b/Asm/arm64/7zAsm.S
@@ -0,0 +1,181 @@
1// 7zAsm.S -- ASM macros for arm64
2// 2021-04-25 : Igor Pavlov : Public domain
3
4#define r0 x0
5#define r1 x1
6#define r2 x2
7#define r3 x3
8#define r4 x4
9#define r5 x5
10#define r6 x6
11#define r7 x7
12#define r8 x8
13#define r9 x9
14#define r10 x10
15#define r11 x11
16#define r12 x12
17#define r13 x13
18#define r14 x14
19#define r15 x15
20#define r16 x16
21#define r17 x17
22#define r18 x18
23#define r19 x19
24#define r20 x20
25#define r21 x21
26#define r22 x22
27#define r23 x23
28#define r24 x24
29#define r25 x25
30#define r26 x26
31#define r27 x27
32#define r28 x28
33#define r29 x29
34#define r30 x30
35
36#define REG_ABI_PARAM_0 r0
37#define REG_ABI_PARAM_1 r1
38#define REG_ABI_PARAM_2 r2
39
40
41.macro p2_add reg:req, param:req
42 add \reg, \reg, \param
43.endm
44
45.macro p2_sub reg:req, param:req
46 sub \reg, \reg, \param
47.endm
48
49.macro p2_sub_s reg:req, param:req
50 subs \reg, \reg, \param
51.endm
52
53.macro p2_and reg:req, param:req
54 and \reg, \reg, \param
55.endm
56
57.macro xor reg:req, param:req
58 eor \reg, \reg, \param
59.endm
60
61.macro or reg:req, param:req
62 orr \reg, \reg, \param
63.endm
64
65.macro shl reg:req, param:req
66 lsl \reg, \reg, \param
67.endm
68
69.macro shr reg:req, param:req
70 lsr \reg, \reg, \param
71.endm
72
73.macro sar reg:req, param:req
74 asr \reg, \reg, \param
75.endm
76
77.macro p1_neg reg:req
78 neg \reg, \reg
79.endm
80
81.macro dec reg:req
82 sub \reg, \reg, 1
83.endm
84
85.macro dec_s reg:req
86 subs \reg, \reg, 1
87.endm
88
89.macro inc reg:req
90 add \reg, \reg, 1
91.endm
92
93.macro inc_s reg:req
94 adds \reg, \reg, 1
95.endm
96
97
98.macro imul reg:req, param:req
99 mul \reg, \reg, \param
100.endm
101
102/*
103arm64 and arm use an inverted carry flag (relative to x86) after subs/cmp instructions:
104 arm64-arm : x86
105 b.lo / b.cc : jb / jc
106 b.hs / b.cs : jae / jnc
107*/
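// Note (annotation, not in the original source): for example, after
//   cmp w0, w1
// arm64 sets C when there is no borrow (w0 >= w1 unsigned), so b.hs is
// taken; x86 sets CF on borrow, so the matching branch is jae/jnc.
// The jb/jae/... macros below hide that difference.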
108
109.macro jmp lab:req
110 b \lab
111.endm
112
113.macro je lab:req
114 b.eq \lab
115.endm
116
117.macro jz lab:req
118 b.eq \lab
119.endm
120
121.macro jnz lab:req
122 b.ne \lab
123.endm
124
125.macro jne lab:req
126 b.ne \lab
127.endm
128
129.macro jb lab:req
130 b.lo \lab
131.endm
132
133.macro jbe lab:req
134 b.ls \lab
135.endm
136
137.macro ja lab:req
138 b.hi \lab
139.endm
140
141.macro jae lab:req
142 b.hs \lab
143.endm
144
145
146.macro cmove dest:req, srcTrue:req
147 csel \dest, \srcTrue, \dest, eq
148.endm
149
150.macro cmovne dest:req, srcTrue:req
151 csel \dest, \srcTrue, \dest, ne
152.endm
153
154.macro cmovs dest:req, srcTrue:req
155 csel \dest, \srcTrue, \dest, mi
156.endm
157
158.macro cmovns dest:req, srcTrue:req
159 csel \dest, \srcTrue, \dest, pl
160.endm
161
162.macro cmovb dest:req, srcTrue:req
163 csel \dest, \srcTrue, \dest, lo
164.endm
165
166.macro cmovae dest:req, srcTrue:req
167 csel \dest, \srcTrue, \dest, hs
168.endm
169
170
171.macro MY_ALIGN_16 macro
172 .p2align 4,, (1 << 4) - 1
173.endm
174
175.macro MY_ALIGN_32 macro
176 .p2align 5,, (1 << 5) - 1
177.endm
178
179.macro MY_ALIGN_64 macro
180 .p2align 6,, (1 << 6) - 1
181.endm
diff --git a/Asm/arm64/LzmaDecOpt.S b/Asm/arm64/LzmaDecOpt.S
new file mode 100644
index 0000000..10dc473
--- /dev/null
+++ b/Asm/arm64/LzmaDecOpt.S
@@ -0,0 +1,1487 @@
1// LzmaDecOpt.S -- ARM64-ASM version of LzmaDec_DecodeReal_3() function
2// 2021-04-25 : Igor Pavlov : Public domain
3
4/*
5; 3 is the code compatibility version of the LzmaDec_DecodeReal_*()
6; function, used for a check at link time.
7; That code is tightly coupled with LzmaDec_TryDummy()
8; and with other functions in the LzmaDec.c file.
9; The CLzmaDec structure, the (probs) array layout, and the input and
10; output of LzmaDec_DecodeReal_*() must be identical in both versions (C / ASM).
11*/
12
13
14#include "7zAsm.S"
15
16 // .arch armv8-a
17 // .file "LzmaDecOpt.c"
18 .text
19 .align 2
20 .p2align 4,,15
21#ifdef __APPLE__
22 .globl _LzmaDec_DecodeReal_3
23#else
24 .global LzmaDec_DecodeReal_3
25#endif
26 // .type LzmaDec_DecodeReal_3, %function
27
28// #define _LZMA_SIZE_OPT 1
29
30#define LZMA_USE_4BYTES_FILL 1
31// #define LZMA_USE_2BYTES_COPY 1
32// #define LZMA_USE_CMOV_LZ_WRAP 1
33// #define _LZMA_PROB32 1
34
35#define MY_ALIGN_FOR_ENTRY MY_ALIGN_32
36#define MY_ALIGN_FOR_LOOP MY_ALIGN_32
37#define MY_ALIGN_FOR_LOOP_16 MY_ALIGN_16
38
39#ifdef _LZMA_PROB32
40 .equ PSHIFT , 2
41 .macro PLOAD dest:req, mem:req
42 ldr \dest, [\mem]
43 .endm
44 .macro PLOAD_PREINDEXED dest:req, mem:req, offset:req
45 ldr \dest, [\mem, \offset]!
46 .endm
47 .macro PLOAD_2 dest:req, mem1:req, mem2:req
48 ldr \dest, [\mem1, \mem2]
49 .endm
50 .macro PLOAD_LSL dest:req, mem1:req, mem2:req
51 ldr \dest, [\mem1, \mem2, lsl #PSHIFT]
52 .endm
53 .macro PSTORE src:req, mem:req
54 str \src, [\mem]
55 .endm
56 .macro PSTORE_2 src:req, mem1:req, mem2:req
57 str \src, [\mem1, \mem2]
58 .endm
59 .macro PSTORE_LSL src:req, mem1:req, mem2:req
60 str \src, [\mem1, \mem2, lsl #PSHIFT]
61 .endm
62 .macro PSTORE_LSL_M1 src:req, mem1:req, mem2:req, temp_reg:req
 63 // you must check that temp_reg is a free register when this macro is used
64 add \temp_reg, \mem1, \mem2
65 str \src, [\temp_reg, \mem2]
66 .endm
67#else
68 // .equ PSHIFT , 1
69 #define PSHIFT 1
70 .macro PLOAD dest:req, mem:req
71 ldrh \dest, [\mem]
72 .endm
73 .macro PLOAD_PREINDEXED dest:req, mem:req, offset:req
74 ldrh \dest, [\mem, \offset]!
75 .endm
76 .macro PLOAD_2 dest:req, mem1:req, mem2:req
77 ldrh \dest, [\mem1, \mem2]
78 .endm
79 .macro PLOAD_LSL dest:req, mem1:req, mem2:req
80 ldrh \dest, [\mem1, \mem2, lsl #PSHIFT]
81 .endm
82 .macro PSTORE src:req, mem:req
83 strh \src, [\mem]
84 .endm
85 .macro PSTORE_2 src:req, mem1:req, mem2:req
86 strh \src, [\mem1, \mem2]
87 .endm
88 .macro PSTORE_LSL src:req, mem1:req, mem2:req
89 strh \src, [\mem1, \mem2, lsl #PSHIFT]
90 .endm
91 .macro PSTORE_LSL_M1 src:req, mem1:req, mem2:req, temp_reg:req
92 strh \src, [\mem1, \mem2]
93 .endm
94#endif
95
96.equ PMULT , (1 << PSHIFT)
97.equ PMULT_2 , (2 << PSHIFT)
98
99.equ kMatchSpecLen_Error_Data , (1 << 9)
100
101# x7 t0 : NORM_CALC : prob2 (IF_BIT_1)
102# x6 t1 : NORM_CALC : probs_state
103# x8 t2 : (LITM) temp : (TREE) temp
104# x4 t3 : (LITM) bit : (TREE) temp : UPDATE_0/UPDATE_0 temp
105# x10 t4 : (LITM) offs : (TREE) probs_PMULT : numBits
106# x9 t5 : (LITM) match : sym2 (ShortDist)
107# x1 t6 : (LITM) litm_prob : (TREE) prob_reg : pbPos
108# x2 t7 : (LITM) prm : probBranch : cnt
109# x3 sym : dist
110# x12 len
111# x0 range
112# x5 cod
113
114
115#define range w0
116
117// t6
118#define pbPos w1
119#define pbPos_R r1
120#define prob_reg w1
121#define litm_prob prob_reg
122
123// t7
124#define probBranch w2
125#define cnt w2
126#define cnt_R r2
127#define prm r2
128
129#define sym w3
130#define sym_R r3
131#define dist sym
132
133#define t3 w4
134#define bit w4
135#define bit_R r4
136#define update_temp_reg r4
137
138#define cod w5
139
140#define t1 w6
141#define t1_R r6
142#define probs_state t1_R
143
144#define t0 w7
145#define t0_R r7
146#define prob2 t0
147
148#define t2 w8
149#define t2_R r8
150
151// t5
152#define match w9
153#define sym2 w9
154#define sym2_R r9
155
156#define t4 w10
157#define t4_R r10
158
159#define offs w10
160#define offs_R r10
161
162#define probs r11
163
164#define len w12
165#define len_R x12
166
167#define state w13
168#define state_R r13
169
170#define dicPos r14
171#define buf r15
172#define bufLimit r16
173#define dicBufSize r17
174
175#define limit r19
176#define rep0 w20
177#define rep0_R r20
178#define rep1 w21
179#define rep2 w22
180#define rep3 w23
181#define dic r24
182#define probs_IsMatch r25
183#define probs_Spec r26
184#define checkDicSize w27
185#define processedPos w28
186#define pbMask w29
187#define lc2_lpMask w30
188
189
190.equ kNumBitModelTotalBits , 11
191.equ kBitModelTotal , (1 << kNumBitModelTotalBits)
192.equ kNumMoveBits , 5
193.equ kBitModelOffset , (kBitModelTotal - (1 << kNumMoveBits) + 1)
194
195.macro NORM_2 macro
196 ldrb t0, [buf], 1
197 shl range, 8
198 orr cod, t0, cod, lsl 8
199 /*
200 mov t0, cod
201 ldrb cod, [buf], 1
202 shl range, 8
203 bfi cod, t0, #8, #24
204 */
205.endm
206
207.macro TEST_HIGH_BYTE_range macro
208 tst range, 0xFF000000
209.endm
210
211.macro NORM macro
212 TEST_HIGH_BYTE_range
213 jnz 1f
214 NORM_2
2151:
216.endm
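// Note (annotation, not in the original source): NORM is the usual range
// coder renormalization; equivalent C (with kTopValue = 1 << 24):
//   if (range < kTopValue) { range <<= 8; code = (code << 8) | *buf++; }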
217
218
219# ---------- Branch MACROS ----------
220
221.macro UPDATE_0__0
222 sub prob2, probBranch, kBitModelOffset
223.endm
224
225.macro UPDATE_0__1
226 sub probBranch, probBranch, prob2, asr #(kNumMoveBits)
227.endm
228
229.macro UPDATE_0__2 probsArray:req, probOffset:req, probDisp:req
230 .if \probDisp == 0
231 PSTORE_2 probBranch, \probsArray, \probOffset
232 .elseif \probOffset == 0
233 PSTORE_2 probBranch, \probsArray, \probDisp * PMULT
234 .else
235 .error "unsupported"
236 // add update_temp_reg, \probsArray, \probOffset
237 PSTORE_2 probBranch, update_temp_reg, \probDisp * PMULT
238 .endif
239.endm
240
241.macro UPDATE_0 probsArray:req, probOffset:req, probDisp:req
242 UPDATE_0__0
243 UPDATE_0__1
244 UPDATE_0__2 \probsArray, \probOffset, \probDisp
245.endm
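// Note (annotation, not in the original source): UPDATE_0 is the standard
// probability increase for a decoded 0 bit,
//   prob += (kBitModelTotal - prob) >> kNumMoveBits;
// rewritten as prob -= (prob - kBitModelOffset) >> kNumMoveBits, which is
// bit-identical because kBitModelOffset absorbs the rounding of the
// arithmetic shift.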
246
247
248.macro UPDATE_1 probsArray:req, probOffset:req, probDisp:req
249 // sub cod, cod, prob2
250 // sub range, range, prob2
251 p2_sub cod, range
252 sub range, prob2, range
253 sub prob2, probBranch, probBranch, lsr #(kNumMoveBits)
254 .if \probDisp == 0
255 PSTORE_2 prob2, \probsArray, \probOffset
256 .elseif \probOffset == 0
257 PSTORE_2 prob2, \probsArray, \probDisp * PMULT
258 .else
259 .error "unsupported"
260 // add update_temp_reg, \probsArray, \probOffset
261 PSTORE_2 prob2, update_temp_reg, \probDisp * PMULT
262 .endif
263.endm
264
265
266.macro CMP_COD_BASE
267 NORM
268 // lsr prob2, range, kNumBitModelTotalBits
269 // imul prob2, probBranch
270 // cmp cod, prob2
271 mov prob2, range
272 shr range, kNumBitModelTotalBits
273 imul range, probBranch
274 cmp cod, range
275.endm
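// Note (annotation, not in the original source): this is the bound test
// from LzmaDec.c:
//   bound = (range >> kNumBitModelTotalBits) * prob; if (code < bound) ...
// The bound is formed in the range register itself (the bit-0 path keeps
// range = bound), and the old range is saved in prob2 so that UPDATE_1
// can compute range - bound for the bit-1 path.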
276
277.macro CMP_COD_1 probsArray:req
278 PLOAD probBranch, \probsArray
279 CMP_COD_BASE
280.endm
281
282.macro CMP_COD_3 probsArray:req, probOffset:req, probDisp:req
283 .if \probDisp == 0
284 PLOAD_2 probBranch, \probsArray, \probOffset
285 .elseif \probOffset == 0
286 PLOAD_2 probBranch, \probsArray, \probDisp * PMULT
287 .else
288 .error "unsupported"
289 add update_temp_reg, \probsArray, \probOffset
290 PLOAD_2 probBranch, update_temp_reg, \probDisp * PMULT
291 .endif
292 CMP_COD_BASE
293.endm
294
295
296.macro IF_BIT_1_NOUP probsArray:req, probOffset:req, probDisp:req, toLabel:req
297 CMP_COD_3 \probsArray, \probOffset, \probDisp
298 jae \toLabel
299.endm
300
301
302.macro IF_BIT_1 probsArray:req, probOffset:req, probDisp:req, toLabel:req
303 IF_BIT_1_NOUP \probsArray, \probOffset, \probDisp, \toLabel
304 UPDATE_0 \probsArray, \probOffset, \probDisp
305.endm
306
307
308.macro IF_BIT_0_NOUP probsArray:req, probOffset:req, probDisp:req, toLabel:req
309 CMP_COD_3 \probsArray, \probOffset, \probDisp
310 jb \toLabel
311.endm
312
313.macro IF_BIT_0_NOUP_1 probsArray:req, toLabel:req
314 CMP_COD_1 \probsArray
315 jb \toLabel
316.endm
317
318
319# ---------- CMOV MACROS ----------
320
321.macro NORM_LSR
322 NORM
323 lsr t0, range, #kNumBitModelTotalBits
324.endm
325
326.macro COD_RANGE_SUB
327 subs t1, cod, t0
328 p2_sub range, t0
329.endm
330
331.macro RANGE_IMUL prob:req
332 imul t0, \prob
333.endm
334
335.macro NORM_CALC prob:req
336 NORM_LSR
337 RANGE_IMUL \prob
338 COD_RANGE_SUB
339.endm
340
341.macro CMOV_range
342 cmovb range, t0
343.endm
344
345.macro CMOV_code
346 cmovae cod, t1
347.endm
348
349.macro CMOV_code_Model_Pre prob:req
350 sub t0, \prob, kBitModelOffset
351 CMOV_code
352 cmovae t0, \prob
353.endm
354
355
356.macro PUP_BASE_2 prob:req, dest_reg:req
357 # only sar works for both 16/32 bit prob modes
358 sub \dest_reg, \prob, \dest_reg, asr #(kNumMoveBits)
359.endm
360
361.macro PUP prob:req, probPtr:req, mem2:req
362 PUP_BASE_2 \prob, t0
363 PSTORE_2 t0, \probPtr, \mem2
364.endm
365
366
367
368#define probs_PMULT t4_R
369
370.macro BIT_01
371 add probs_PMULT, probs, PMULT
372.endm
373
374
375.macro BIT_0_R prob:req
376 PLOAD_2 \prob, probs, 1 * PMULT
377 NORM_LSR
378 sub t3, \prob, kBitModelOffset
379 RANGE_IMUL \prob
380 PLOAD_2 t2, probs, 1 * PMULT_2
381 COD_RANGE_SUB
382 CMOV_range
383 cmovae t3, \prob
384 PLOAD_2 t0, probs, 1 * PMULT_2 + PMULT
385 PUP_BASE_2 \prob, t3
386 csel \prob, t2, t0, lo
387 CMOV_code
388 mov sym, 2
389 PSTORE_2 t3, probs, 1 * PMULT
390 adc sym, sym, wzr
391 BIT_01
392.endm
393
394.macro BIT_1_R prob:req
395 NORM_LSR
396 p2_add sym, sym
397 sub t3, \prob, kBitModelOffset
398 RANGE_IMUL \prob
399 PLOAD_LSL t2, probs, sym_R
400 COD_RANGE_SUB
401 CMOV_range
402 cmovae t3, \prob
403 PLOAD_LSL t0, probs_PMULT, sym_R
404 PUP_BASE_2 \prob, t3
405 csel \prob, t2, t0, lo
406 CMOV_code
407 PSTORE_LSL_M1 t3, probs, sym_R, t2_R
408 adc sym, sym, wzr
409.endm
410
411
412.macro BIT_2_R prob:req
413 NORM_LSR
414 p2_add sym, sym
415 sub t3, \prob, kBitModelOffset
416 RANGE_IMUL \prob
417 COD_RANGE_SUB
418 CMOV_range
419 cmovae t3, \prob
420 CMOV_code
421 PUP_BASE_2 \prob, t3
422 PSTORE_LSL_M1 t3, probs, sym_R, t2_R
423 adc sym, sym, wzr
424.endm
425
426
427# ---------- MATCHED LITERAL ----------
428
429.macro LITM_0 macro
430 shl match, (PSHIFT + 1)
431 and bit, match, 256 * PMULT
432 add prm, probs, 256 * PMULT + 1 * PMULT
433 p2_add match, match
434 p2_add prm, bit_R
435 eor offs, bit, 256 * PMULT
436 PLOAD litm_prob, prm
437
438 NORM_LSR
439 sub t2, litm_prob, kBitModelOffset
440 RANGE_IMUL litm_prob
441 COD_RANGE_SUB
442 cmovae offs, bit
443 CMOV_range
444 and bit, match, offs
445 cmovae t2, litm_prob
446 CMOV_code
447 mov sym, 2
448 PUP_BASE_2 litm_prob, t2
449 PSTORE t2, prm
450 add prm, probs, offs_R
451 adc sym, sym, wzr
452.endm
453
454.macro LITM macro
455 p2_add prm, bit_R
456 xor offs, bit
457 PLOAD_LSL litm_prob, prm, sym_R
458
459 NORM_LSR
460 p2_add match, match
461 sub t2, litm_prob, kBitModelOffset
462 RANGE_IMUL litm_prob
463 COD_RANGE_SUB
464 cmovae offs, bit
465 CMOV_range
466 and bit, match, offs
467 cmovae t2, litm_prob
468 CMOV_code
469 PUP_BASE_2 litm_prob, t2
470 PSTORE_LSL t2, prm, sym_R
471 add prm, probs, offs_R
472 adc sym, sym, sym
473.endm
474
475
476.macro LITM_2 macro
477 p2_add prm, bit_R
478 PLOAD_LSL litm_prob, prm, sym_R
479
480 NORM_LSR
481 sub t2, litm_prob, kBitModelOffset
482 RANGE_IMUL litm_prob
483 COD_RANGE_SUB
484 CMOV_range
485 cmovae t2, litm_prob
486 CMOV_code
487 PUP_BASE_2 litm_prob, t2
488 PSTORE_LSL t2, prm, sym_R
489 adc sym, sym, sym
490.endm
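// Note (annotation, not in the original source): LITM_0 / LITM / LITM_2
// are the unrolled "matched literal" decoder from LzmaDec.c. The
// probability index mixes the next bit of the match byte into the symbol
// index, and offs (0x100, scaled by PMULT) stays set only while the
// decoded bits still agree with the match byte; after the first mismatch
// decoding continues as a plain literal.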
491
492
493# ---------- REVERSE BITS ----------
494
495.macro REV_0 prob:req
496 NORM_CALC \prob
497 CMOV_range
498 PLOAD t2, sym2_R
499 PLOAD_2 t3, probs, 3 * PMULT
500 CMOV_code_Model_Pre \prob
501 add t1_R, probs, 3 * PMULT
502 cmovae sym2_R, t1_R
503 PUP \prob, probs, 1 * PMULT
504 csel \prob, t2, t3, lo
505.endm
506
507
508.macro REV_1 prob:req, step:req
509 NORM_LSR
510 PLOAD_PREINDEXED t2, sym2_R, (\step * PMULT)
511 RANGE_IMUL \prob
512 COD_RANGE_SUB
513 CMOV_range
514 PLOAD_2 t3, sym2_R, (\step * PMULT)
515 sub t0, \prob, kBitModelOffset
516 CMOV_code
517 add t1_R, sym2_R, \step * PMULT
518 cmovae t0, \prob
519 cmovae sym2_R, t1_R
520 PUP_BASE_2 \prob, t0
521 csel \prob, t2, t3, lo
522 PSTORE_2 t0, t1_R, 0 - \step * PMULT_2
523.endm
524
525
526.macro REV_2 prob:req, step:req
527 sub t1_R, sym2_R, probs
528 NORM_LSR
529 orr sym, sym, t1, lsr #PSHIFT
530 RANGE_IMUL \prob
531 COD_RANGE_SUB
532 sub t2, sym, \step
533 CMOV_range
534 cmovb sym, t2
535 CMOV_code_Model_Pre \prob
536 PUP \prob, sym2_R, 0
537.endm
538
539
540.macro REV_1_VAR prob:req
541 PLOAD \prob, sym_R
542 mov probs, sym_R
543 p2_add sym_R, sym2_R
544 NORM_LSR
545 add t2_R, sym_R, sym2_R
546 RANGE_IMUL \prob
547 COD_RANGE_SUB
548 cmovae sym_R, t2_R
549 CMOV_range
550 CMOV_code_Model_Pre \prob
551 p2_add sym2, sym2
552 PUP \prob, probs, 0
553.endm
554
555
556.macro add_big dest:req, src:req, param:req
557 .if (\param) < (1 << 12)
558 add \dest, \src, \param
559 .else
560 #ifndef _LZMA_PROB32
561 .error "unexpected add_big expansion"
562 #endif
563 add \dest, \src, (\param) / 2
564 add \dest, \dest, (\param) - (\param) / 2
565 .endif
566.endm
567
568.macro sub_big dest:req, src:req, param:req
569 .if (\param) < (1 << 12)
570 sub \dest, \src, \param
571 .else
572 #ifndef _LZMA_PROB32
573 .error "unexpected sub_big expansion"
574 #endif
575 sub \dest, \src, (\param) / 2
576 sub \dest, \dest, (\param) - (\param) / 2
577 .endif
578.endm
579
580
581.macro SET_probs offset:req
582 // add_big probs, probs_Spec, (\offset) * PMULT
583 add probs, probs_IsMatch, ((\offset) - IsMatch) * PMULT
584.endm
585
586
587.macro LIT_PROBS
588 add sym, sym, processedPos, lsl 8
589 inc processedPos
590 UPDATE_0__0
591 shl sym, lc2_lpMask
592 SET_probs Literal
593 p2_and sym, lc2_lpMask
594 // p2_add probs_state, pbPos_R
595 p2_add probs, sym_R
596 UPDATE_0__1
597 add probs, probs, sym_R, lsl 1
598 UPDATE_0__2 probs_state, pbPos_R, 0
599.endm
600
601
602
603.equ kNumPosBitsMax , 4
604.equ kNumPosStatesMax , (1 << kNumPosBitsMax)
605
606.equ kLenNumLowBits , 3
607.equ kLenNumLowSymbols , (1 << kLenNumLowBits)
608.equ kLenNumHighBits , 8
609.equ kLenNumHighSymbols , (1 << kLenNumHighBits)
610.equ kNumLenProbs , (2 * kLenNumLowSymbols * kNumPosStatesMax + kLenNumHighSymbols)
611
612.equ LenLow , 0
613.equ LenChoice , LenLow
614.equ LenChoice2 , (LenLow + kLenNumLowSymbols)
615.equ LenHigh , (LenLow + 2 * kLenNumLowSymbols * kNumPosStatesMax)
616
617.equ kNumStates , 12
618.equ kNumStates2 , 16
619.equ kNumLitStates , 7
620
621.equ kStartPosModelIndex , 4
622.equ kEndPosModelIndex , 14
623.equ kNumFullDistances , (1 << (kEndPosModelIndex >> 1))
624
625.equ kNumPosSlotBits , 6
626.equ kNumLenToPosStates , 4
627
628.equ kNumAlignBits , 4
629.equ kAlignTableSize , (1 << kNumAlignBits)
630
631.equ kMatchMinLen , 2
632.equ kMatchSpecLenStart , (kMatchMinLen + kLenNumLowSymbols * 2 + kLenNumHighSymbols)
633
634// .equ kStartOffset , 1408
635.equ kStartOffset , 0
636.equ SpecPos , (-kStartOffset)
637.equ IsRep0Long , (SpecPos + kNumFullDistances)
638.equ RepLenCoder , (IsRep0Long + (kNumStates2 << kNumPosBitsMax))
639.equ LenCoder , (RepLenCoder + kNumLenProbs)
640.equ IsMatch , (LenCoder + kNumLenProbs)
641.equ kAlign , (IsMatch + (kNumStates2 << kNumPosBitsMax))
642.equ IsRep , (kAlign + kAlignTableSize)
643.equ IsRepG0 , (IsRep + kNumStates)
644.equ IsRepG1 , (IsRepG0 + kNumStates)
645.equ IsRepG2 , (IsRepG1 + kNumStates)
646.equ PosSlot , (IsRepG2 + kNumStates)
647.equ Literal , (PosSlot + (kNumLenToPosStates << kNumPosSlotBits))
648.equ NUM_BASE_PROBS , (Literal + kStartOffset)
649
650.if kStartOffset != 0 // && IsMatch != 0
651 .error "Stop_Compiling_Bad_StartOffset"
652.endif
653
654.if NUM_BASE_PROBS != 1984
655 .error "Stop_Compiling_Bad_LZMA_PROBS"
656.endif
657
658.equ offset_lc , 0
659.equ offset_lp , 1
660.equ offset_pb , 2
661.equ offset_dicSize , 4
662.equ offset_probs , 4 + offset_dicSize
663.equ offset_probs_1664 , 8 + offset_probs
664.equ offset_dic , 8 + offset_probs_1664
665.equ offset_dicBufSize , 8 + offset_dic
666.equ offset_dicPos , 8 + offset_dicBufSize
667.equ offset_buf , 8 + offset_dicPos
668.equ offset_range , 8 + offset_buf
669.equ offset_code , 4 + offset_range
670.equ offset_processedPos , 4 + offset_code
671.equ offset_checkDicSize , 4 + offset_processedPos
672.equ offset_rep0 , 4 + offset_checkDicSize
673.equ offset_rep1 , 4 + offset_rep0
674.equ offset_rep2 , 4 + offset_rep1
675.equ offset_rep3 , 4 + offset_rep2
676.equ offset_state , 4 + offset_rep3
677.equ offset_remainLen , 4 + offset_state
678.equ offset_TOTAL_SIZE , 4 + offset_remainLen
679
680.if offset_TOTAL_SIZE != 96
681 .error "Incorrect offset_TOTAL_SIZE"
682.endif
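// Note (annotation, not in the original source; field layout inferred from
// the offsets above, so treat it as a sketch): the offset_* values mirror
// the start of CLzmaDec, roughly
//   Byte lc, lp, pb, pad; UInt32 dicSize; CLzmaProb *probs, *probs_1664;
//   Byte *dic; SizeT dicBufSize, dicPos; const Byte *buf;
//   UInt32 range, code, processedPos, checkDicSize, reps[4], state, remainLen;
// i.e. 96 bytes on LP64, which the .if above verifies at assembly time.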
683
684
685.macro IsMatchBranch_Pre
686 # prob = probs + IsMatch + (state << kNumPosBitsMax) + posState;
687 and pbPos, pbMask, processedPos, lsl #(kLenNumLowBits + 1 + PSHIFT)
688 add probs_state, probs_IsMatch, state_R
689.endm
690
691
692/*
693.macro IsMatchBranch
694 IsMatchBranch_Pre
695 IF_BIT_1 probs_state, pbPos_R, (IsMatch - IsMatch), IsMatch_label
696.endm
697*/
698
699.macro CheckLimits
700 cmp buf, bufLimit
701 jae fin_OK
702 cmp dicPos, limit
703 jae fin_OK
704.endm
705
706#define CheckLimits_lit CheckLimits
707/*
708.macro CheckLimits_lit
709 cmp buf, bufLimit
710 jae fin_OK_lit
711 cmp dicPos, limit
712 jae fin_OK_lit
713.endm
714*/
715
716
717#define PARAM_lzma REG_ABI_PARAM_0
718#define PARAM_limit REG_ABI_PARAM_1
719#define PARAM_bufLimit REG_ABI_PARAM_2
720
721
722.macro LOAD_LZMA_VAR reg:req, struct_offs:req
723 ldr \reg, [PARAM_lzma, \struct_offs]
724.endm
725
726.macro LOAD_LZMA_BYTE reg:req, struct_offs:req
727 ldrb \reg, [PARAM_lzma, \struct_offs]
728.endm
729
730.macro LOAD_LZMA_PAIR reg0:req, reg1:req, struct_offs:req
731 ldp \reg0, \reg1, [PARAM_lzma, \struct_offs]
732.endm
733
734
735LzmaDec_DecodeReal_3:
736_LzmaDec_DecodeReal_3:
737/*
738.LFB0:
739 .cfi_startproc
740*/
741
742 stp x19, x20, [sp, -128]!
743 stp x21, x22, [sp, 16]
744 stp x23, x24, [sp, 32]
745 stp x25, x26, [sp, 48]
746 stp x27, x28, [sp, 64]
747 stp x29, x30, [sp, 80]
748
749 str PARAM_lzma, [sp, 120]
750
751 mov bufLimit, PARAM_bufLimit
752 mov limit, PARAM_limit
753
754 LOAD_LZMA_PAIR dic, dicBufSize, offset_dic
755 LOAD_LZMA_PAIR dicPos, buf, offset_dicPos
756 LOAD_LZMA_PAIR rep0, rep1, offset_rep0
757 LOAD_LZMA_PAIR rep2, rep3, offset_rep2
758
759 mov t0, 1 << (kLenNumLowBits + 1 + PSHIFT)
760 LOAD_LZMA_BYTE pbMask, offset_pb
761 p2_add limit, dic
762 mov len, wzr // we could set it in all required branches instead
763 lsl pbMask, t0, pbMask
764 p2_add dicPos, dic
765 p2_sub pbMask, t0
766
767 LOAD_LZMA_BYTE lc2_lpMask, offset_lc
768 mov t0, 256 << PSHIFT
769 LOAD_LZMA_BYTE t1, offset_lp
770 p2_add t1, lc2_lpMask
771 p2_sub lc2_lpMask, (256 << PSHIFT) - PSHIFT
772 shl t0, t1
773 p2_add lc2_lpMask, t0
774
775 LOAD_LZMA_VAR probs_Spec, offset_probs
776 LOAD_LZMA_VAR checkDicSize, offset_checkDicSize
777 LOAD_LZMA_VAR processedPos, offset_processedPos
778 LOAD_LZMA_VAR state, offset_state
779 // range is r0 : this load must be last; don't move it
780 LOAD_LZMA_PAIR range, cod, offset_range
781 mov sym, wzr
782 shl state, PSHIFT
783
784 add_big probs_IsMatch, probs_Spec, ((IsMatch - SpecPos) << PSHIFT)
785
786 // if (processedPos != 0 || checkDicSize != 0)
787 orr t0, checkDicSize, processedPos
788 cbz t0, 1f
789 add t0_R, dicBufSize, dic
790 cmp dicPos, dic
791 cmovne t0_R, dicPos
792 ldrb sym, [t0_R, -1]
7931:
794 IsMatchBranch_Pre
795 cmp state, 4 * PMULT
796 jb lit_end
797 cmp state, kNumLitStates * PMULT
798 jb lit_matched_end
799 jmp lz_end
800
801
802
803#define BIT_0 BIT_0_R prob_reg
804#define BIT_1 BIT_1_R prob_reg
805#define BIT_2 BIT_2_R prob_reg
806
807# ---------- LITERAL ----------
808MY_ALIGN_64
809lit_start:
810 mov state, wzr
811lit_start_2:
812 LIT_PROBS
813
814 #ifdef _LZMA_SIZE_OPT
815
816 PLOAD_2 prob_reg, probs, 1 * PMULT
817 mov sym, 1
818 BIT_01
819MY_ALIGN_FOR_LOOP
820lit_loop:
821 BIT_1
822 tbz sym, 7, lit_loop
823
824 #else
825
826 BIT_0
827 BIT_1
828 BIT_1
829 BIT_1
830 BIT_1
831 BIT_1
832 BIT_1
833
834 #endif
835
836 BIT_2
837 IsMatchBranch_Pre
838 strb sym, [dicPos], 1
839 p2_and sym, 255
840
841 CheckLimits_lit
842lit_end:
843 IF_BIT_0_NOUP probs_state, pbPos_R, (IsMatch - IsMatch), lit_start
844
845 # jmp IsMatch_label
846
847
848#define FLAG_STATE_BITS (4 + PSHIFT)
849
850# ---------- MATCHES ----------
851# MY_ALIGN_FOR_ENTRY
852IsMatch_label:
853 UPDATE_1 probs_state, pbPos_R, (IsMatch - IsMatch)
854 IF_BIT_1 probs_state, 0, (IsRep - IsMatch), IsRep_label
855
856 SET_probs LenCoder
857 or state, (1 << FLAG_STATE_BITS)
858
859# ---------- LEN DECODE ----------
860len_decode:
861 mov len, 8 - kMatchMinLen
862 IF_BIT_0_NOUP_1 probs, len_mid_0
863 UPDATE_1 probs, 0, 0
864 p2_add probs, (1 << (kLenNumLowBits + PSHIFT))
865 mov len, 0 - kMatchMinLen
866 IF_BIT_0_NOUP_1 probs, len_mid_0
867 UPDATE_1 probs, 0, 0
868 p2_add probs, LenHigh * PMULT - (1 << (kLenNumLowBits + PSHIFT))
869
870 #if 0 == 1
871 BIT_0
872 BIT_1
873 BIT_1
874 BIT_1
875 BIT_1
876 BIT_1
877 #else
878 PLOAD_2 prob_reg, probs, 1 * PMULT
879 mov sym, 1
880 BIT_01
881MY_ALIGN_FOR_LOOP
882len8_loop:
883 BIT_1
884 tbz sym, 6, len8_loop
885 #endif
886
887 mov len, (kLenNumHighSymbols - kLenNumLowSymbols * 2) - kMatchMinLen
888 jmp len_mid_2
889
890MY_ALIGN_FOR_ENTRY
891len_mid_0:
892 UPDATE_0 probs, 0, 0
893 p2_add probs, pbPos_R
894 BIT_0
895len_mid_2:
896 BIT_1
897 BIT_2
898 sub len, sym, len
899 tbz state, FLAG_STATE_BITS, copy_match
900
901# ---------- DECODE DISTANCE ----------
902 // probs + PosSlot + ((len < kNumLenToPosStates ? len : kNumLenToPosStates - 1) << kNumPosSlotBits);
903
904 mov t0, 3 + kMatchMinLen
905 cmp len, 3 + kMatchMinLen
906 cmovb t0, len
907 SET_probs PosSlot - (kMatchMinLen << (kNumPosSlotBits))
908 add probs, probs, t0_R, lsl #(kNumPosSlotBits + PSHIFT)
909
910 #ifdef _LZMA_SIZE_OPT
911
912 PLOAD_2 prob_reg, probs, 1 * PMULT
913 mov sym, 1
914 BIT_01
915MY_ALIGN_FOR_LOOP
916slot_loop:
917 BIT_1
918 tbz sym, 5, slot_loop
919
920 #else
921
922 BIT_0
923 BIT_1
924 BIT_1
925 BIT_1
926 BIT_1
927
928 #endif
929
930 #define numBits t4
931 mov numBits, sym
932 BIT_2
933 // we need only low bits
934 p2_and sym, 3
935 cmp numBits, 32 + kEndPosModelIndex / 2
936 jb short_dist
937
938 SET_probs kAlign
939
940 # unsigned numDirectBits = (unsigned)(((distance >> 1) - 1));
941 p2_sub numBits, (32 + 1 + kNumAlignBits)
942 # distance = (2 | (distance & 1));
943 or sym, 2
944 PLOAD_2 prob_reg, probs, 1 * PMULT
945 add sym2_R, probs, 2 * PMULT
946
947# ---------- DIRECT DISTANCE ----------
948
949.macro DIRECT_1
950 shr range, 1
951 subs t0, cod, range
952 p2_add sym, sym
953 // add t1, sym, 1
954 csel cod, cod, t0, mi
955 csinc sym, sym, sym, mi
956 // csel sym, t1, sym, pl
957 // adc sym, sym, sym // not 100% compatible for "corruption-allowed" LZMA streams
958 dec_s numBits
959 je direct_end
960.endm
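// Note (annotation, not in the original source): DIRECT_1 decodes one
// direct (equiprobable) bit. Equivalent C:
//   range >>= 1; distance <<= 1;
//   if (code >= range) { code -= range; distance |= 1; }
// The csel/csinc pair keeps it branchless; the commented adc variant is
// rejected above as not fully compatible with corrupted streams.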
961
962 #ifdef _LZMA_SIZE_OPT
963
964 jmp direct_norm
965MY_ALIGN_FOR_ENTRY
966direct_loop:
967 DIRECT_1
968direct_norm:
969 TEST_HIGH_BYTE_range
970 jnz direct_loop
971 NORM_2
972 jmp direct_loop
973
974 #else
975
976.macro DIRECT_2
977 TEST_HIGH_BYTE_range
978 jz direct_unroll
979 DIRECT_1
980.endm
981
982 DIRECT_2
983 DIRECT_2
984 DIRECT_2
985 DIRECT_2
986 DIRECT_2
987 DIRECT_2
988 DIRECT_2
989 DIRECT_2
990
991direct_unroll:
992 NORM_2
993 DIRECT_1
994 DIRECT_1
995 DIRECT_1
996 DIRECT_1
997 DIRECT_1
998 DIRECT_1
999 DIRECT_1
1000 DIRECT_1
1001 jmp direct_unroll
1002
1003 #endif
1004
1005MY_ALIGN_FOR_ENTRY
1006direct_end:
1007 shl sym, kNumAlignBits
1008 REV_0 prob_reg
1009 REV_1 prob_reg, 2
1010 REV_1 prob_reg, 4
1011 REV_2 prob_reg, 8
1012
1013decode_dist_end:
1014
1015 // if (distance >= (checkDicSize == 0 ? processedPos: checkDicSize))
1016
1017 tst checkDicSize, checkDicSize
1018 csel t0, processedPos, checkDicSize, eq
1019 cmp sym, t0
1020 jae end_of_payload
1021 // jmp end_of_payload # for debug
1022
1023 mov rep3, rep2
1024 mov rep2, rep1
1025 mov rep1, rep0
1026 add rep0, sym, 1
1027
1028.macro STATE_UPDATE_FOR_MATCH
1029 // state = (state < kNumStates + kNumLitStates) ? kNumLitStates : kNumLitStates + 3;
1030 // cmp state, (kNumStates + kNumLitStates) * PMULT
1031 cmp state, kNumLitStates * PMULT + (1 << FLAG_STATE_BITS)
1032 mov state, kNumLitStates * PMULT
1033 mov t0, (kNumLitStates + 3) * PMULT
1034 cmovae state, t0
1035.endm
1036 STATE_UPDATE_FOR_MATCH
1037
1038# ---------- COPY MATCH ----------
1039copy_match:
1040
1041 // if ((rem = limit - dicPos) == 0) break // return SZ_ERROR_DATA;
1042 subs cnt_R, limit, dicPos
1043 // jz fin_dicPos_LIMIT
1044 jz fin_OK
1045
1046 // curLen = ((rem < len) ? (unsigned)rem : len);
1047 cmp cnt_R, len_R
1048 cmovae cnt, len
1049
1050 sub t0_R, dicPos, dic
1051 p2_add dicPos, cnt_R
1052 p2_add processedPos, cnt
1053 p2_sub len, cnt
1054
1055 // pos = dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0);
1056 p2_sub_s t0_R, rep0_R
1057 jae 1f
1058
1059 cmn t0_R, cnt_R
1060 p2_add t0_R, dicBufSize
1061 ja copy_match_cross
10621:
1063# ---------- COPY MATCH FAST ----------
1064 # t0_R : src_pos
1065 p2_add t0_R, dic
1066 ldrb sym, [t0_R]
1067 p2_add t0_R, cnt_R
1068 p1_neg cnt_R
1069
1070copy_common:
1071 dec dicPos
1072
1073 # dicPos : (ptr_to_last_dest_BYTE)
1074 # t0_R : (src_lim)
1075 # cnt_R : (-curLen)
1076
1077 IsMatchBranch_Pre
1078
1079 inc_s cnt_R
1080 jz copy_end
1081
1082 cmp rep0, 1
1083 je copy_match_0
1084
1085 #ifdef LZMA_USE_2BYTES_COPY
1086 strb sym, [dicPos, cnt_R]
1087 dec dicPos
1088 # dicPos : (ptr_to_last_dest_16bitWORD)
1089 p2_and cnt_R, -2
1090 ldrh sym, [t0_R, cnt_R]
1091 adds cnt_R, cnt_R, 2
1092 jz 2f
1093MY_ALIGN_FOR_LOOP
10941:
1095 /*
1096 strh sym, [dicPos, cnt_R]
1097 ldrh sym, [t0_R, cnt_R]
1098 adds cnt_R, cnt_R, 2
1099 jz 2f
1100 */
1101
1102 strh sym, [dicPos, cnt_R]
1103 ldrh sym, [t0_R, cnt_R]
1104 adds cnt_R, cnt_R, 2
1105 jnz 1b
11062:
1107
1108 /*
1109 // for universal little/big endian code, but slow
1110 strh sym, [dicPos]
1111 inc dicPos
1112 ldrb sym, [t0_R, -1]
1113 */
1114
1115 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
1116 // we should improve big-endian detection for other compilers
1117 // for big-endian we need to reverse the bytes
1118 rev16 sym, sym
1119 #endif
1120
1121 // (sym) must represent as little-endian here:
1122 strb sym, [dicPos], 1
1123 shr sym, 8
1124
1125 #else
1126
1127MY_ALIGN_FOR_LOOP
11281:
1129 strb sym, [dicPos, cnt_R]
1130 ldrb sym, [t0_R, cnt_R]
1131 inc_s cnt_R
1132 jz copy_end
1133
1134 strb sym, [dicPos, cnt_R]
1135 ldrb sym, [t0_R, cnt_R]
1136 inc_s cnt_R
1137 jnz 1b
1138 #endif
1139
1140copy_end:
1141lz_end_match:
1142 strb sym, [dicPos], 1
1143
1144 # IsMatchBranch_Pre
1145 CheckLimits
1146lz_end:
1147 IF_BIT_1_NOUP probs_state, pbPos_R, (IsMatch - IsMatch), IsMatch_label
1148
1149
1150
1151# ---------- LITERAL MATCHED ----------
1152
1153 LIT_PROBS
1154
1155 // matchByte = dic[dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0)];
1156
1157 sub t0_R, dicPos, dic
1158 p2_sub_s t0_R, rep0_R
1159
1160 #ifdef LZMA_USE_CMOV_LZ_WRAP
1161 add t1_R, t0_R, dicBufSize
1162 cmovb t0_R, t1_R
1163 #else
1164 jae 1f
1165 p2_add t0_R, dicBufSize
11661:
1167 #endif
1168
1169 ldrb match, [dic, t0_R]
1170
1171 // state -= (state < 10) ? 3 : 6;
1172 sub sym, state, 6 * PMULT
1173 cmp state, 10 * PMULT
1174 p2_sub state, 3 * PMULT
1175 cmovae state, sym
1176
1177 #ifdef _LZMA_SIZE_OPT
1178
1179 mov offs, 256 * PMULT
1180 shl match, (PSHIFT + 1)
1181 mov sym, 1
1182 and bit, match, offs
1183 add prm, probs, offs_R
1184
1185MY_ALIGN_FOR_LOOP
1186litm_loop:
1187 LITM
1188 tbz sym, 8, litm_loop
1189
1190 #else
1191
1192 LITM_0
1193 LITM
1194 LITM
1195 LITM
1196 LITM
1197 LITM
1198 LITM
1199 LITM_2
1200
1201 #endif
1202
1203 IsMatchBranch_Pre
1204 strb sym, [dicPos], 1
1205 p2_and sym, 255
1206
1207 // mov len, wzr // LITM uses the same register (len / offs), so we clear it
1208 CheckLimits_lit
1209lit_matched_end:
1210 IF_BIT_1_NOUP probs_state, pbPos_R, (IsMatch - IsMatch), IsMatch_label
1211 # IsMatchBranch
1212 p2_sub state, 3 * PMULT
1213 jmp lit_start_2
1214
1215
1216
1217# ---------- REP 0 LITERAL ----------
1218MY_ALIGN_FOR_ENTRY
1219IsRep0Short_label:
1220 UPDATE_0 probs_state, pbPos_R, 0
1221
1222 // dic[dicPos] = dic[dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0)];
1223 sub t0_R, dicPos, dic
1224
1225 // state = state < kNumLitStates ? 9 : 11;
1226 or state, 1 * PMULT
1227
1228 # the caller doesn't allow the (dicPos >= limit) case for REP_SHORT
1229 # so we don't need the following (dicPos == limit) check here:
1230 # cmp dicPos, limit
1231 # jae fin_dicPos_LIMIT_REP_SHORT
1232 # // jmp fin_dicPos_LIMIT_REP_SHORT // for testing/debug purposes
1233
1234 inc processedPos
1235
1236 IsMatchBranch_Pre
1237
1238 p2_sub_s t0_R, rep0_R
1239 #ifdef LZMA_USE_CMOV_LZ_WRAP
1240 add sym_R, t0_R, dicBufSize
1241 cmovb t0_R, sym_R
1242 #else
1243 jae 1f
1244 p2_add t0_R, dicBufSize
12451:
1246 #endif
1247
1248 ldrb sym, [dic, t0_R]
1249 // mov len, wzr
1250 jmp lz_end_match
1251
1252MY_ALIGN_FOR_ENTRY
1253IsRep_label:
1254 UPDATE_1 probs_state, 0, (IsRep - IsMatch)
1255
1256 # The (checkDicSize == 0 && processedPos == 0) case was checked before in LzmaDec.c with kBadRepCode.
1257 # So we don't check it here.
1258
1259 # mov t0, processedPos
1260 # or t0, checkDicSize
1261 # jz fin_ERROR_2
1262
1263 // state = state < kNumLitStates ? 8 : 11;
1264 cmp state, kNumLitStates * PMULT
1265 mov state, 8 * PMULT
1266 mov probBranch, 11 * PMULT
1267 cmovae state, probBranch
1268
1269 SET_probs RepLenCoder
1270
1271 IF_BIT_1 probs_state, 0, (IsRepG0 - IsMatch), IsRepG0_label
1272 sub_big probs_state, probs_state, (IsMatch - IsRep0Long) << PSHIFT
1273 IF_BIT_0_NOUP probs_state, pbPos_R, 0, IsRep0Short_label
1274 UPDATE_1 probs_state, pbPos_R, 0
1275 jmp len_decode
1276
1277MY_ALIGN_FOR_ENTRY
1278IsRepG0_label:
1279 UPDATE_1 probs_state, 0, (IsRepG0 - IsMatch)
1280 IF_BIT_1 probs_state, 0, (IsRepG1 - IsMatch), IsRepG1_label
1281 mov dist, rep1
1282 mov rep1, rep0
1283 mov rep0, dist
1284 jmp len_decode
1285
1286# MY_ALIGN_FOR_ENTRY
1287IsRepG1_label:
1288 UPDATE_1 probs_state, 0, (IsRepG1 - IsMatch)
1289 IF_BIT_1 probs_state, 0, (IsRepG2 - IsMatch), IsRepG2_label
1290 mov dist, rep2
1291 mov rep2, rep1
1292 mov rep1, rep0
1293 mov rep0, dist
1294 jmp len_decode
1295
1296# MY_ALIGN_FOR_ENTRY
1297IsRepG2_label:
1298 UPDATE_1 probs_state, 0, (IsRepG2 - IsMatch)
1299 mov dist, rep3
1300 mov rep3, rep2
1301 mov rep2, rep1
1302 mov rep1, rep0
1303 mov rep0, dist
1304 jmp len_decode
1305
1306
1307
1308# ---------- SPEC SHORT DISTANCE ----------
1309
1310MY_ALIGN_FOR_ENTRY
1311short_dist:
1312 p2_sub_s numBits, 32 + 1
1313 jbe decode_dist_end
1314 or sym, 2
1315 shl sym, numBits
1316 add sym_R, probs_Spec, sym_R, lsl #PSHIFT
1317 p2_add sym_R, SpecPos * PMULT + 1 * PMULT
1318 mov sym2, PMULT // # step
1319MY_ALIGN_FOR_LOOP
1320spec_loop:
1321 REV_1_VAR prob_reg
1322 dec_s numBits
1323 jnz spec_loop
1324
1325 p2_add sym2_R, probs_Spec
1326 .if SpecPos != 0
1327 p2_add sym2_R, SpecPos * PMULT
1328 .endif
1329 p2_sub sym_R, sym2_R
1330 shr sym, PSHIFT
1331
1332 jmp decode_dist_end
1333
1334
1335
1336# ---------- COPY MATCH 0 ----------
1337MY_ALIGN_FOR_ENTRY
1338copy_match_0:
1339 #ifdef LZMA_USE_4BYTES_FILL
1340 strb sym, [dicPos, cnt_R]
1341 inc_s cnt_R
1342 jz copy_end
1343
1344 strb sym, [dicPos, cnt_R]
1345 inc_s cnt_R
1346 jz copy_end
1347
1348 strb sym, [dicPos, cnt_R]
1349 inc_s cnt_R
1350 jz copy_end
1351
1352 orr t3, sym, sym, lsl 8
1353 p2_and cnt_R, -4
1354 orr t3, t3, t3, lsl 16
1355MY_ALIGN_FOR_LOOP_16
13561:
1357 /*
1358 str t3, [dicPos, cnt_R]
1359 adds cnt_R, cnt_R, 4
1360 jz 2f
1361 */
1362
1363 str t3, [dicPos, cnt_R]
1364 adds cnt_R, cnt_R, 4
1365 jnz 1b
13662:
1367 // p2_and sym, 255
1368 #else
1369
1370MY_ALIGN_FOR_LOOP
13711:
1372 strb sym, [dicPos, cnt_R]
1373 inc_s cnt_R
1374 jz copy_end
1375
1376 strb sym, [dicPos, cnt_R]
1377 inc_s cnt_R
1378 jnz 1b
1379 #endif
1380
1381 jmp copy_end
1382
1383
1384# ---------- COPY MATCH CROSS ----------
1385copy_match_cross:
1386 # t0_R - src pos
1387 # cnt_R - total copy len
1388
1389 p1_neg cnt_R
13901:
1391 ldrb sym, [dic, t0_R]
1392 inc t0_R
1393 strb sym, [dicPos, cnt_R]
1394 inc cnt_R
1395 cmp t0_R, dicBufSize
1396 jne 1b
1397
1398 ldrb sym, [dic]
1399 sub t0_R, dic, cnt_R
1400 jmp copy_common
1401
1402
1403
1404
1405/*
1406fin_dicPos_LIMIT_REP_SHORT:
1407 mov len, 1
1408 jmp fin_OK
1409*/
1410
1411/*
1412fin_dicPos_LIMIT:
1413 jmp fin_OK
1414 # For more strict mode we can stop decoding with error
1415 # mov sym, 1
1416 # jmp fin
1417*/
1418
1419fin_ERROR_MATCH_DIST:
1420 # rep0 = distance + 1;
1421 p2_add len, kMatchSpecLen_Error_Data
1422 mov rep3, rep2
1423 mov rep2, rep1
1424 mov rep1, rep0
1425 mov rep0, sym
1426 STATE_UPDATE_FOR_MATCH
1427 # jmp fin_OK
1428 mov sym, 1
1429 jmp fin
1430
1431end_of_payload:
1432 inc_s sym
1433 jnz fin_ERROR_MATCH_DIST
1434
1435 mov len, kMatchSpecLenStart
1436 xor state, (1 << FLAG_STATE_BITS)
1437 jmp fin_OK
1438
1439/*
1440fin_OK_lit:
1441 mov len, wzr
1442*/
1443
1444fin_OK:
1445 mov sym, wzr
1446
1447fin:
1448 NORM
1449
1450 #define fin_lzma_reg t0_R
1451
1452 .macro STORE_LZMA_VAR reg:req, struct_offs:req
1453 str \reg, [fin_lzma_reg, \struct_offs]
1454 .endm
1455
1456 .macro STORE_LZMA_PAIR reg0:req, reg1:req, struct_offs:req
1457 stp \reg0, \reg1, [fin_lzma_reg, \struct_offs]
1458 .endm
1459
1460 ldr fin_lzma_reg, [sp, 120]
1461 p2_sub dicPos, dic
1462 shr state, PSHIFT
1463
1464 STORE_LZMA_PAIR dicPos, buf, offset_dicPos
1465 STORE_LZMA_PAIR range, cod, offset_range
1466 STORE_LZMA_VAR processedPos, offset_processedPos
1467 STORE_LZMA_PAIR rep0, rep1, offset_rep0
1468 STORE_LZMA_PAIR rep2, rep3, offset_rep2
1469 STORE_LZMA_PAIR state, len, offset_state
1470
1471 mov w0, sym
1472
1473 ldp x29, x30, [sp, 80]
1474 ldp x27, x28, [sp, 64]
1475 ldp x25, x26, [sp, 48]
1476 ldp x23, x24, [sp, 32]
1477 ldp x21, x22, [sp, 16]
1478 ldp x19, x20, [sp], 128
1479
1480 ret
1481/*
1482 .cfi_endproc
1483.LFE0:
1484 .size LzmaDec_DecodeReal_3, .-LzmaDec_DecodeReal_3
1485 .ident "TAG_LZMA"
1486 .section .note.GNU-stack,"",@progbits
1487*/
diff --git a/Asm/x86/7zAsm.asm b/Asm/x86/7zAsm.asm
new file mode 100644
index 0000000..6275bb7
--- /dev/null
+++ b/Asm/x86/7zAsm.asm
@@ -0,0 +1,284 @@
1; 7zAsm.asm -- ASM macros
2; 2021-12-25 : Igor Pavlov : Public domain
3
4
5ifdef @wordsize
6; @wordsize is defined only in JWASM and ASMC and is not defined in MASM
7; @wordsize eq 8 for 64-bit x64
8; @wordsize eq 4 for 32-bit x86
9if @wordsize eq 8
10 x64 equ 1
11endif
12else
13ifdef RAX
14 x64 equ 1
15endif
16endif
17
18
19ifdef x64
20 IS_X64 equ 1
21else
22 IS_X64 equ 0
23endif
24
25ifdef ABI_LINUX
26 IS_LINUX equ 1
27else
28 IS_LINUX equ 0
29endif
30
31ifndef x64
32; Use ABI_CDECL for x86 (32-bit) only
33; if ABI_CDECL is not defined, we use fastcall abi
34ifdef ABI_CDECL
35 IS_CDECL equ 1
36else
37 IS_CDECL equ 0
38endif
39endif
40
41OPTION PROLOGUE:NONE
42OPTION EPILOGUE:NONE
43
44MY_ASM_START macro
45 ifdef x64
46 .code
47 else
48 .386
49 .model flat
50 _TEXT$00 SEGMENT PARA PUBLIC 'CODE'
51 endif
52endm
53
54MY_PROC macro name:req, numParams:req
55 align 16
56 proc_numParams = numParams
57 if (IS_X64 gt 0)
58 proc_name equ name
59 elseif (IS_LINUX gt 0)
60 proc_name equ name
61 elseif (IS_CDECL gt 0)
62 proc_name equ @CatStr(_,name)
63 else
64 proc_name equ @CatStr(@,name,@, %numParams * 4)
65 endif
66 proc_name PROC
67endm
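; Note (annotation, not part of the original commit): MY_PROC emits the
; ABI-decorated symbol. Under 32-bit fastcall, for example,
;   MY_PROC CrcUpdateT4, 4
; produces @CrcUpdateT4@16 (4 params * 4 bytes); cdecl would produce
; _CrcUpdateT4, and x64 / Linux keep the plain name. MY_ENDP's
; "ret (proc_numParams - 2) * 4" below then pops the stack arguments that
; did not fit in the two fastcall register parameters.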
68
69MY_ENDP macro
70 if (IS_X64 gt 0)
71 ret
72 elseif (IS_CDECL gt 0)
73 ret
74 elseif (proc_numParams LT 3)
75 ret
76 else
77 ret (proc_numParams - 2) * 4
78 endif
79 proc_name ENDP
80endm
81
82
83ifdef x64
84 REG_SIZE equ 8
85 REG_LOGAR_SIZE equ 3
86else
87 REG_SIZE equ 4
88 REG_LOGAR_SIZE equ 2
89endif
90
91 x0 equ EAX
92 x1 equ ECX
93 x2 equ EDX
94 x3 equ EBX
95 x4 equ ESP
96 x5 equ EBP
97 x6 equ ESI
98 x7 equ EDI
99
100 x0_W equ AX
101 x1_W equ CX
102 x2_W equ DX
103 x3_W equ BX
104
105 x5_W equ BP
106 x6_W equ SI
107 x7_W equ DI
108
109 x0_L equ AL
110 x1_L equ CL
111 x2_L equ DL
112 x3_L equ BL
113
114 x0_H equ AH
115 x1_H equ CH
116 x2_H equ DH
117 x3_H equ BH
118
119ifdef x64
120 x5_L equ BPL
121 x6_L equ SIL
122 x7_L equ DIL
123
124 r0 equ RAX
125 r1 equ RCX
126 r2 equ RDX
127 r3 equ RBX
128 r4 equ RSP
129 r5 equ RBP
130 r6 equ RSI
131 r7 equ RDI
132 x8 equ r8d
133 x9 equ r9d
134 x10 equ r10d
135 x11 equ r11d
136 x12 equ r12d
137 x13 equ r13d
138 x14 equ r14d
139 x15 equ r15d
140else
141 r0 equ x0
142 r1 equ x1
143 r2 equ x2
144 r3 equ x3
145 r4 equ x4
146 r5 equ x5
147 r6 equ x6
148 r7 equ x7
149endif
150
151
152ifdef x64
153ifdef ABI_LINUX
154
155MY_PUSH_2_REGS macro
156 push r3
157 push r5
158endm
159
160MY_POP_2_REGS macro
161 pop r5
162 pop r3
163endm
164
165endif
166endif
167
168
169MY_PUSH_4_REGS macro
170 push r3
171 push r5
172 push r6
173 push r7
174endm
175
176MY_POP_4_REGS macro
177 pop r7
178 pop r6
179 pop r5
180 pop r3
181endm
182
183
184; for fastcall and for WIN-x64
185REG_PARAM_0_x equ x1
186REG_PARAM_0 equ r1
187REG_PARAM_1_x equ x2
188REG_PARAM_1 equ r2
189
190ifndef x64
191; for x86-fastcall
192
193REG_ABI_PARAM_0_x equ REG_PARAM_0_x
194REG_ABI_PARAM_0 equ REG_PARAM_0
195REG_ABI_PARAM_1_x equ REG_PARAM_1_x
196REG_ABI_PARAM_1 equ REG_PARAM_1
197
198else
199; x64
200
201if (IS_LINUX eq 0)
202
203; for WIN-x64:
204REG_PARAM_2_x equ x8
205REG_PARAM_2 equ r8
206REG_PARAM_3 equ r9
207
208REG_ABI_PARAM_0_x equ REG_PARAM_0_x
209REG_ABI_PARAM_0 equ REG_PARAM_0
210REG_ABI_PARAM_1_x equ REG_PARAM_1_x
211REG_ABI_PARAM_1 equ REG_PARAM_1
212REG_ABI_PARAM_2_x equ REG_PARAM_2_x
213REG_ABI_PARAM_2 equ REG_PARAM_2
214REG_ABI_PARAM_3 equ REG_PARAM_3
215
216else
217; for LINUX-x64:
218REG_LINUX_PARAM_0_x equ x7
219REG_LINUX_PARAM_0 equ r7
220REG_LINUX_PARAM_1_x equ x6
221REG_LINUX_PARAM_1 equ r6
222REG_LINUX_PARAM_2 equ r2
223REG_LINUX_PARAM_3 equ r1
224REG_LINUX_PARAM_4_x equ x8
225REG_LINUX_PARAM_4 equ r8
226REG_LINUX_PARAM_5 equ r9
227
228REG_ABI_PARAM_0_x equ REG_LINUX_PARAM_0_x
229REG_ABI_PARAM_0 equ REG_LINUX_PARAM_0
230REG_ABI_PARAM_1_x equ REG_LINUX_PARAM_1_x
231REG_ABI_PARAM_1 equ REG_LINUX_PARAM_1
232REG_ABI_PARAM_2 equ REG_LINUX_PARAM_2
233REG_ABI_PARAM_3 equ REG_LINUX_PARAM_3
234REG_ABI_PARAM_4_x equ REG_LINUX_PARAM_4_x
235REG_ABI_PARAM_4 equ REG_LINUX_PARAM_4
236REG_ABI_PARAM_5 equ REG_LINUX_PARAM_5
237
238MY_ABI_LINUX_TO_WIN_2 macro
239 mov r2, r6
240 mov r1, r7
241endm
242
243MY_ABI_LINUX_TO_WIN_3 macro
244 mov r8, r2
245 mov r2, r6
246 mov r1, r7
247endm
248
249MY_ABI_LINUX_TO_WIN_4 macro
250 mov r9, r1
251 mov r8, r2
252 mov r2, r6
253 mov r1, r7
254endm
255
256endif ; IS_LINUX
257
258
259MY_PUSH_PRESERVED_ABI_REGS macro
260 if (IS_LINUX gt 0)
261 MY_PUSH_2_REGS
262 else
263 MY_PUSH_4_REGS
264 endif
265 push r12
266 push r13
267 push r14
268 push r15
269endm
270
271
272MY_POP_PRESERVED_ABI_REGS macro
273 pop r15
274 pop r14
275 pop r13
276 pop r12
277 if (IS_LINUX gt 0)
278 MY_POP_2_REGS
279 else
280 MY_POP_4_REGS
281 endif
282endm
283
284endif ; x64
diff --git a/Asm/x86/7zCrcOpt.asm b/Asm/x86/7zCrcOpt.asm
new file mode 100644
index 0000000..0fee206
--- /dev/null
+++ b/Asm/x86/7zCrcOpt.asm
@@ -0,0 +1,180 @@
1; 7zCrcOpt.asm -- CRC32 calculation : optimized version
2; 2021-02-07 : Igor Pavlov : Public domain
3
4include 7zAsm.asm
5
6MY_ASM_START
7
8rD equ r2
9rN equ r7
10rT equ r5
11
12ifdef x64
13 num_VAR equ r8
14 table_VAR equ r9
15else
16 if (IS_CDECL gt 0)
17 crc_OFFS equ (REG_SIZE * 5)
18 data_OFFS equ (REG_SIZE + crc_OFFS)
19 size_OFFS equ (REG_SIZE + data_OFFS)
20 else
21 size_OFFS equ (REG_SIZE * 5)
22 endif
23 table_OFFS equ (REG_SIZE + size_OFFS)
24 num_VAR equ [r4 + size_OFFS]
25 table_VAR equ [r4 + table_OFFS]
26endif
27
28SRCDAT equ rD + rN * 1 + 4 *
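; Note (annotation, not part of the original commit): SRCDAT is a textual
; operand prefix, so "[SRCDAT k]" expands to [rD + rN * 1 + 4 * k] -- the
; k-th dword of the current 8-byte block (see MY_PROLOG below for how
; rD / rN are set up).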
29
30CRC macro op:req, dest:req, src:req, t:req
31 op dest, DWORD PTR [rT + src * 4 + 0400h * t]
32endm
33
34CRC_XOR macro dest:req, src:req, t:req
35 CRC xor, dest, src, t
36endm
37
38CRC_MOV macro dest:req, src:req, t:req
39 CRC mov, dest, src, t
40endm
41
42CRC1b macro
43 movzx x6, BYTE PTR [rD]
44 inc rD
45 movzx x3, x0_L
46 xor x6, x3
47 shr x0, 8
48 CRC xor, x0, r6, 0
49 dec rN
50endm
51
52MY_PROLOG macro crc_end:req
53
54 ifdef x64
55 if (IS_LINUX gt 0)
56 MY_PUSH_2_REGS
57 mov x0, REG_ABI_PARAM_0_x ; x0 = x7
58 mov rT, REG_ABI_PARAM_3 ; r5 = r1
59 mov rN, REG_ABI_PARAM_2 ; r7 = r2
60 mov rD, REG_ABI_PARAM_1 ; r2 = r6
61 else
62 MY_PUSH_4_REGS
63 mov x0, REG_ABI_PARAM_0_x ; x0 = x1
64 mov rT, REG_ABI_PARAM_3 ; r5 = r9
65 mov rN, REG_ABI_PARAM_2 ; r7 = r8
66 ; mov rD, REG_ABI_PARAM_1 ; r2 = r2
67 endif
68 else
69 MY_PUSH_4_REGS
70 if (IS_CDECL gt 0)
71 mov x0, [r4 + crc_OFFS]
72 mov rD, [r4 + data_OFFS]
73 else
74 mov x0, REG_ABI_PARAM_0_x
75 endif
76 mov rN, num_VAR
77 mov rT, table_VAR
78 endif
79
80 test rN, rN
81 jz crc_end
82 @@:
83 test rD, 7
84 jz @F
85 CRC1b
86 jnz @B
87 @@:
88 cmp rN, 16
89 jb crc_end
90 add rN, rD
91 mov num_VAR, rN
92 sub rN, 8
93 and rN, NOT 7
94 sub rD, rN
95 xor x0, [SRCDAT 0]
96endm
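; Note (annotation, not part of the original commit): after this prolog,
; rN holds the 8-byte-aligned end of the fast region and rD holds
; (data - rN), a negative byte offset. The main loops read
; [rD + rN + 4 * k] and step with "add rD, 8 / jnz", so a single register
; serves as both the moving data pointer and the loop counter that
; reaches zero at the limit.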
97
98MY_EPILOG macro crc_end:req
99 xor x0, [SRCDAT 0]
100 mov rD, rN
101 mov rN, num_VAR
102 sub rN, rD
103 crc_end:
104 test rN, rN
105 jz @F
106 CRC1b
107 jmp crc_end
108 @@:
109 if (IS_X64 gt 0) and (IS_LINUX gt 0)
110 MY_POP_2_REGS
111 else
112 MY_POP_4_REGS
113 endif
114endm
115
116MY_PROC CrcUpdateT8, 4
117 MY_PROLOG crc_end_8
118 mov x1, [SRCDAT 1]
119 align 16
120 main_loop_8:
121 mov x6, [SRCDAT 2]
122 movzx x3, x1_L
123 CRC_XOR x6, r3, 3
124 movzx x3, x1_H
125 CRC_XOR x6, r3, 2
126 shr x1, 16
127 movzx x3, x1_L
128 movzx x1, x1_H
129 CRC_XOR x6, r3, 1
130 movzx x3, x0_L
131 CRC_XOR x6, r1, 0
132
133 mov x1, [SRCDAT 3]
134 CRC_XOR x6, r3, 7
135 movzx x3, x0_H
136 shr x0, 16
137 CRC_XOR x6, r3, 6
138 movzx x3, x0_L
139 CRC_XOR x6, r3, 5
140 movzx x3, x0_H
141 CRC_MOV x0, r3, 4
142 xor x0, x6
143 add rD, 8
144 jnz main_loop_8
145
146 MY_EPILOG crc_end_8
147MY_ENDP
148
149MY_PROC CrcUpdateT4, 4
150 MY_PROLOG crc_end_4
151 align 16
152 main_loop_4:
153 movzx x1, x0_L
154 movzx x3, x0_H
155 shr x0, 16
156 movzx x6, x0_H
157 and x0, 0FFh
158 CRC_MOV x1, r1, 3
159 xor x1, [SRCDAT 1]
160 CRC_XOR x1, r3, 2
161 CRC_XOR x1, r6, 0
162 CRC_XOR x1, r0, 1
163
164 movzx x0, x1_L
165 movzx x3, x1_H
166 shr x1, 16
167 movzx x6, x1_H
168 and x1, 0FFh
169 CRC_MOV x0, r0, 3
170 xor x0, [SRCDAT 2]
171 CRC_XOR x0, r3, 2
172 CRC_XOR x0, r6, 0
173 CRC_XOR x0, r1, 1
174 add rD, 8
175 jnz main_loop_4
176
177 MY_EPILOG crc_end_4
178MY_ENDP
179
180end
diff --git a/Asm/x86/AesOpt.asm b/Asm/x86/AesOpt.asm
new file mode 100644
index 0000000..84bf897
--- /dev/null
+++ b/Asm/x86/AesOpt.asm
@@ -0,0 +1,742 @@
1; AesOpt.asm -- AES optimized code for x86 AES hardware instructions
2; 2021-12-25 : Igor Pavlov : Public domain
3
4include 7zAsm.asm
5
6ifdef __ASMC__
7 use_vaes_256 equ 1
8else
9ifdef ymm0
10 use_vaes_256 equ 1
11endif
12endif
13
14
15ifdef use_vaes_256
16 ECHO "++ VAES 256"
17else
18 ECHO "-- NO VAES 256"
19endif
20
21ifdef x64
22 ECHO "x86-64"
23else
24 ECHO "x86"
25if (IS_CDECL gt 0)
26 ECHO "ABI : CDECL"
27else
28 ECHO "ABI : no CDECL : FASTCALL"
29endif
30endif
31
32if (IS_LINUX gt 0)
33 ECHO "ABI : LINUX"
34else
35 ECHO "ABI : WINDOWS"
36endif
37
38MY_ASM_START
39
40ifndef x64
41 .686
42 .xmm
43endif
44
45
46; MY_ALIGN EQU ALIGN(64)
47MY_ALIGN EQU
48
49SEG_ALIGN EQU MY_ALIGN
50
51MY_SEG_PROC macro name:req, numParams:req
52 ; seg_name equ @CatStr(_TEXT$, name)
53 ; seg_name SEGMENT SEG_ALIGN 'CODE'
54 MY_PROC name, numParams
55endm
56
57MY_SEG_ENDP macro
58 ; seg_name ENDS
59endm
60
61
62NUM_AES_KEYS_MAX equ 15
63
64; the number of push operations in the function PROLOG
65if (IS_LINUX eq 0) or (IS_X64 eq 0)
66num_regs_push equ 2
67stack_param_offset equ (REG_SIZE * (1 + num_regs_push))
68endif
69
70ifdef x64
71 num_param equ REG_ABI_PARAM_2
72else
73 if (IS_CDECL gt 0)
74 ; size_t size
75 ; void * data
76 ; UInt32 * aes
77 ; ret-ip <- (r4)
78 aes_OFFS equ (stack_param_offset)
79 data_OFFS equ (REG_SIZE + aes_OFFS)
80 size_OFFS equ (REG_SIZE + data_OFFS)
81 num_param equ [r4 + size_OFFS]
82 else
83 num_param equ [r4 + stack_param_offset]
84 endif
85endif
86
87keys equ REG_PARAM_0 ; r1
88rD equ REG_PARAM_1 ; r2
89rN equ r0
90
91koffs_x equ x7
92koffs_r equ r7
93
94ksize_x equ x6
95ksize_r equ r6
96
97keys2 equ r3
98
99state equ xmm0
100key equ xmm0
101key_ymm equ ymm0
102key_ymm_n equ 0
103
104ifdef x64
105 ways = 11
106else
107 ways = 4
108endif
109
110ways_start_reg equ 1
111
112iv equ @CatStr(xmm, %(ways_start_reg + ways))
113iv_ymm equ @CatStr(ymm, %(ways_start_reg + ways))
114
115
116WOP macro op, op2
117 i = 0
118 rept ways
119 op @CatStr(xmm, %(ways_start_reg + i)), op2
120 i = i + 1
121 endm
122endm
123
124
125ifndef ABI_LINUX
126ifdef x64
127
128; we use 32 bytes of home space in stack in WIN64-x64
129NUM_HOME_MM_REGS equ (32 / 16)
130; we preserve xmm registers starting from xmm6 in WIN64-x64
131MM_START_SAVE_REG equ 6
132
133SAVE_XMM macro num_used_mm_regs:req
134 num_save_mm_regs = num_used_mm_regs - MM_START_SAVE_REG
135 if num_save_mm_regs GT 0
136 num_save_mm_regs2 = num_save_mm_regs - NUM_HOME_MM_REGS
137 ; RSP is (16*x + 8) after entering the function in WIN64-x64
138 stack_offset = 16 * num_save_mm_regs2 + (stack_param_offset mod 16)
139
140 i = 0
141 rept num_save_mm_regs
142
143 if i eq NUM_HOME_MM_REGS
144 sub r4, stack_offset
145 endif
146
147 if i lt NUM_HOME_MM_REGS
148 movdqa [r4 + stack_param_offset + i * 16], @CatStr(xmm, %(MM_START_SAVE_REG + i))
149 else
150 movdqa [r4 + (i - NUM_HOME_MM_REGS) * 16], @CatStr(xmm, %(MM_START_SAVE_REG + i))
151 endif
152
153 i = i + 1
154 endm
155 endif
156endm
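; Note (annotation, not part of the original commit): the extra
; (stack_param_offset mod 16) = 8 in stack_offset keeps r4 16-byte aligned
; for movdqa: RSP is 16*n + 8 at entry in WIN64 and the two pushes in
; MY_PROLOG preserve that alignment mod 16.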
157
158RESTORE_XMM macro num_used_mm_regs:req
159 if num_save_mm_regs GT 0
160 i = 0
161 if num_save_mm_regs2 GT 0
162 rept num_save_mm_regs2
163 movdqa @CatStr(xmm, %(MM_START_SAVE_REG + NUM_HOME_MM_REGS + i)), [r4 + i * 16]
164 i = i + 1
165 endm
166 add r4, stack_offset
167 endif
168
169 num_low_regs = num_save_mm_regs - i
170 i = 0
171 rept num_low_regs
172 movdqa @CatStr(xmm, %(MM_START_SAVE_REG + i)), [r4 + stack_param_offset + i * 16]
173 i = i + 1
174 endm
175 endif
176endm
177
178endif ; x64
179endif ; ABI_LINUX
180
181
182MY_PROLOG macro num_used_mm_regs:req
 183 ; num_regs_push: must be equal to the number of push operations
184 ; push r3
185 ; push r5
186 if (IS_LINUX eq 0) or (IS_X64 eq 0)
187 push r6
188 push r7
189 endif
190
191 mov rN, num_param ; don't move it; num_param can use stack pointer (r4)
192
193 if (IS_X64 eq 0)
194 if (IS_CDECL gt 0)
195 mov rD, [r4 + data_OFFS]
196 mov keys, [r4 + aes_OFFS]
197 endif
198 elseif (IS_LINUX gt 0)
199 MY_ABI_LINUX_TO_WIN_2
200 endif
201
202
203 ifndef ABI_LINUX
204 ifdef x64
205 SAVE_XMM num_used_mm_regs
206 endif
207 endif
208
209 mov ksize_x, [keys + 16]
210 shl ksize_x, 5
211endm
212
213
214MY_EPILOG macro
215 ifndef ABI_LINUX
216 ifdef x64
217 RESTORE_XMM num_save_mm_regs
218 endif
219 endif
220
221 if (IS_LINUX eq 0) or (IS_X64 eq 0)
222 pop r7
223 pop r6
224 endif
225 ; pop r5
226 ; pop r3
227 MY_ENDP
228endm
229
230
231OP_KEY macro op:req, offs:req
232 op state, [keys + offs]
233endm
234
235
236WOP_KEY macro op:req, offs:req
237 movdqa key, [keys + offs]
238 WOP op, key
239endm
240
241
242; ---------- AES-CBC Decode ----------
243
244
245XOR_WITH_DATA macro reg, _ppp_
246 pxor reg, [rD + i * 16]
247endm
248
249WRITE_TO_DATA macro reg, _ppp_
250 movdqa [rD + i * 16], reg
251endm
252
253
254; state0 equ @CatStr(xmm, %(ways_start_reg))
255
256key0 equ @CatStr(xmm, %(ways_start_reg + ways + 1))
257key0_ymm equ @CatStr(ymm, %(ways_start_reg + ways + 1))
258
259key_last equ @CatStr(xmm, %(ways_start_reg + ways + 2))
260key_last_ymm equ @CatStr(ymm, %(ways_start_reg + ways + 2))
261key_last_ymm_n equ (ways_start_reg + ways + 2)
262
263NUM_CBC_REGS equ (ways_start_reg + ways + 3)
264
265
266MY_SEG_PROC AesCbc_Decode_HW, 3
267
268 AesCbc_Decode_HW_start::
269 MY_PROLOG NUM_CBC_REGS
270
271 AesCbc_Decode_HW_start_2::
272 movdqa iv, [keys]
273 add keys, 32
274
275 movdqa key0, [keys + 1 * ksize_r]
276 movdqa key_last, [keys]
277 sub ksize_x, 16
278
279 jmp check2
280 align 16
281 nextBlocks2:
282 WOP movdqa, [rD + i * 16]
283 mov koffs_x, ksize_x
284 ; WOP_KEY pxor, ksize_r + 16
285 WOP pxor, key0
286 ; align 16
287 @@:
288 WOP_KEY aesdec, 1 * koffs_r
289 sub koffs_r, 16
290 jnz @B
291 ; WOP_KEY aesdeclast, 0
292 WOP aesdeclast, key_last
293
294 pxor @CatStr(xmm, %(ways_start_reg)), iv
295 i = 1
296 rept ways - 1
297 pxor @CatStr(xmm, %(ways_start_reg + i)), [rD + i * 16 - 16]
298 i = i + 1
299 endm
300 movdqa iv, [rD + ways * 16 - 16]
301 WOP WRITE_TO_DATA
302
303 add rD, ways * 16
304 AesCbc_Decode_HW_start_3::
305 check2:
306 sub rN, ways
307 jnc nextBlocks2
308 add rN, ways
309
310 sub ksize_x, 16
311
312 jmp check
313 nextBlock:
314 movdqa state, [rD]
315 mov koffs_x, ksize_x
316 ; OP_KEY pxor, 1 * ksize_r + 32
317 pxor state, key0
318 ; movdqa state0, [rD]
319 ; movdqa state, key0
320 ; pxor state, state0
321 @@:
322 OP_KEY aesdec, 1 * koffs_r + 16
323 OP_KEY aesdec, 1 * koffs_r
324 sub koffs_r, 32
325 jnz @B
326 OP_KEY aesdec, 16
327 ; OP_KEY aesdeclast, 0
328 aesdeclast state, key_last
329
330 pxor state, iv
331 movdqa iv, [rD]
332 ; movdqa iv, state0
333 movdqa [rD], state
334
335 add rD, 16
336 check:
337 sub rN, 1
338 jnc nextBlock
339
340 movdqa [keys - 32], iv
341MY_EPILOG
342
343
344
345
346; ---------- AVX ----------
347
348
349AVX__WOP_n macro op
350 i = 0
351 rept ways
352 op (ways_start_reg + i)
353 i = i + 1
354 endm
355endm
356
357AVX__WOP macro op
358 i = 0
359 rept ways
360 op @CatStr(ymm, %(ways_start_reg + i))
361 i = i + 1
362 endm
363endm
364
365
366AVX__WOP_KEY macro op:req, offs:req
367 vmovdqa key_ymm, ymmword ptr [keys2 + offs]
368 AVX__WOP_n op
369endm
370
371
372AVX__CBC_START macro reg
373 ; vpxor reg, key_ymm, ymmword ptr [rD + 32 * i]
374 vpxor reg, key0_ymm, ymmword ptr [rD + 32 * i]
375endm
376
377AVX__CBC_END macro reg
378 if i eq 0
379 vpxor reg, reg, iv_ymm
380 else
381 vpxor reg, reg, ymmword ptr [rD + i * 32 - 16]
382 endif
383endm
384
385
386AVX__WRITE_TO_DATA macro reg
387 vmovdqu ymmword ptr [rD + 32 * i], reg
388endm
389
390AVX__XOR_WITH_DATA macro reg
391 vpxor reg, reg, ymmword ptr [rD + 32 * i]
392endm
393
394AVX__CTR_START macro reg
395 vpaddq iv_ymm, iv_ymm, one_ymm
396 ; vpxor reg, iv_ymm, key_ymm
397 vpxor reg, iv_ymm, key0_ymm
398endm
399
400
401MY_VAES_INSTR_2 macro cmd, dest, a1, a2
402 db 0c4H
403 db 2 + 040H + 020h * (1 - (a2) / 8) + 080h * (1 - (dest) / 8)
404 db 5 + 8 * ((not (a1)) and 15)
405 db cmd
406 db 0c0H + 8 * ((dest) and 7) + ((a2) and 7)
407endm
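; Note (annotation, not part of the original commit): the db sequence above
; hand-assembles a 3-byte VEX prefix (0C4h), opcode and ModRM for the
; 256-bit VAES instructions (vaesenc / vaesenclast / vaesdec /
; vaesdeclast ymm, ymm, ymm), so the file still builds with assemblers
; that lack VAES mnemonics for ymm operands.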
408
409MY_VAES_INSTR macro cmd, dest, a
410 MY_VAES_INSTR_2 cmd, dest, dest, a
411endm
412
413MY_vaesenc macro dest, a
414 MY_VAES_INSTR 0dcH, dest, a
415endm
416MY_vaesenclast macro dest, a
417 MY_VAES_INSTR 0ddH, dest, a
418endm
419MY_vaesdec macro dest, a
420 MY_VAES_INSTR 0deH, dest, a
421endm
422MY_vaesdeclast macro dest, a
423 MY_VAES_INSTR 0dfH, dest, a
424endm
425
426
427AVX__VAES_DEC macro reg
428 MY_vaesdec reg, key_ymm_n
429endm
430
431AVX__VAES_DEC_LAST_key_last macro reg
432 ; MY_vaesdeclast reg, key_ymm_n
433 MY_vaesdeclast reg, key_last_ymm_n
434endm
435
436AVX__VAES_ENC macro reg
437 MY_vaesenc reg, key_ymm_n
438endm
439
440AVX__VAES_ENC_LAST macro reg
441 MY_vaesenclast reg, key_ymm_n
442endm
443
444AVX__vinserti128_TO_HIGH macro dest, src
445 vinserti128 dest, dest, src, 1
446endm
447
448
449MY_PROC AesCbc_Decode_HW_256, 3
450 ifdef use_vaes_256
451 MY_PROLOG NUM_CBC_REGS
452
453 cmp rN, ways * 2
454 jb AesCbc_Decode_HW_start_2
455
456 vmovdqa iv, xmmword ptr [keys]
457 add keys, 32
458
459 vbroadcasti128 key0_ymm, xmmword ptr [keys + 1 * ksize_r]
460 vbroadcasti128 key_last_ymm, xmmword ptr [keys]
461 sub ksize_x, 16
462 mov koffs_x, ksize_x
463 add ksize_x, ksize_x
464
465 AVX_STACK_SUB = ((NUM_AES_KEYS_MAX + 1 - 2) * 32)
466 push keys2
467 sub r4, AVX_STACK_SUB
468 ; sub r4, 32
469 ; sub r4, ksize_r
470 ; lea keys2, [r4 + 32]
471 mov keys2, r4
472 and keys2, -32
473 broad:
474 vbroadcasti128 key_ymm, xmmword ptr [keys + 1 * koffs_r]
475 vmovdqa ymmword ptr [keys2 + koffs_r * 2], key_ymm
476 sub koffs_r, 16
477 ; jnc broad
478 jnz broad
479
480 sub rN, ways * 2
481
482 align 16
483 avx_cbcdec_nextBlock2:
484 mov koffs_x, ksize_x
485 ; AVX__WOP_KEY AVX__CBC_START, 1 * koffs_r + 32
486 AVX__WOP AVX__CBC_START
487 @@:
488 AVX__WOP_KEY AVX__VAES_DEC, 1 * koffs_r
489 sub koffs_r, 32
490 jnz @B
491 ; AVX__WOP_KEY AVX__VAES_DEC_LAST, 0
492 AVX__WOP_n AVX__VAES_DEC_LAST_key_last
493
494 AVX__vinserti128_TO_HIGH iv_ymm, xmmword ptr [rD]
495 AVX__WOP AVX__CBC_END
496
497 vmovdqa iv, xmmword ptr [rD + ways * 32 - 16]
498 AVX__WOP AVX__WRITE_TO_DATA
499
500 add rD, ways * 32
501 sub rN, ways * 2
502 jnc avx_cbcdec_nextBlock2
503 add rN, ways * 2
504
505 shr ksize_x, 1
506
507 ; lea r4, [r4 + 1 * ksize_r + 32]
508 add r4, AVX_STACK_SUB
509 pop keys2
510
511 vzeroupper
512 jmp AesCbc_Decode_HW_start_3
513 else
514 jmp AesCbc_Decode_HW_start
515 endif
516MY_ENDP
517MY_SEG_ENDP
518
519
520
521
522; ---------- AES-CBC Encode ----------
523
524e0 equ xmm1
525
526CENC_START_KEY equ 2
527CENC_NUM_REG_KEYS equ (3 * 2)
528; last_key equ @CatStr(xmm, %(CENC_START_KEY + CENC_NUM_REG_KEYS))
529
530MY_SEG_PROC AesCbc_Encode_HW, 3
531 MY_PROLOG (CENC_START_KEY + CENC_NUM_REG_KEYS + 0)
532
533 movdqa state, [keys]
534 add keys, 32
535
536 i = 0
537 rept CENC_NUM_REG_KEYS
538 movdqa @CatStr(xmm, %(CENC_START_KEY + i)), [keys + i * 16]
539 i = i + 1
540 endm
541
542 add keys, ksize_r
543 neg ksize_r
544 add ksize_r, (16 * CENC_NUM_REG_KEYS)
545 ; movdqa last_key, [keys]
546 jmp check_e
547
548 align 16
549 nextBlock_e:
550 movdqa e0, [rD]
551 mov koffs_r, ksize_r
552 pxor e0, @CatStr(xmm, %(CENC_START_KEY))
553 pxor state, e0
554
555 i = 1
556 rept (CENC_NUM_REG_KEYS - 1)
557 aesenc state, @CatStr(xmm, %(CENC_START_KEY + i))
558 i = i + 1
559 endm
560
561 @@:
562 OP_KEY aesenc, 1 * koffs_r
563 OP_KEY aesenc, 1 * koffs_r + 16
564 add koffs_r, 32
565 jnz @B
566 OP_KEY aesenclast, 0
567 ; aesenclast state, last_key
568
569 movdqa [rD], state
570 add rD, 16
571 check_e:
572 sub rN, 1
573 jnc nextBlock_e
574
575 ; movdqa [keys - 32], state
576 movdqa [keys + 1 * ksize_r - (16 * CENC_NUM_REG_KEYS) - 32], state
577MY_EPILOG
578MY_SEG_ENDP
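; For reference, a rough C-intrinsics sketch of the CBC-encode loop above
; (illustrative names; keys[0] is the whitening key, nr the last round index):
;   __m128i t = _mm_xor_si128(data[j], keys[0]);            // pxor e0, key0
;   state = _mm_xor_si128(state, t);                        // CBC chaining
;   for (i = 1; i < nr; i++)
;     state = _mm_aesenc_si128(state, keys[i]);             // aesenc chain
;   state = _mm_aesenclast_si128(state, keys[nr]);          // aesenclast
;   data[j] = state;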
579
580
581
582; ---------- AES-CTR ----------
583
584ifdef x64
585 ; ways = 11
586endif
587
588
589one equ @CatStr(xmm, %(ways_start_reg + ways + 1))
590one_ymm equ @CatStr(ymm, %(ways_start_reg + ways + 1))
591key0 equ @CatStr(xmm, %(ways_start_reg + ways + 2))
592key0_ymm equ @CatStr(ymm, %(ways_start_reg + ways + 2))
593NUM_CTR_REGS equ (ways_start_reg + ways + 3)
594
595INIT_CTR macro reg, _ppp_
596 paddq iv, one
597 movdqa reg, iv
598endm
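; For reference, a rough C-intrinsics sketch of one CTR block (illustrative
; names; keys[0] = key0 is the whitening key, nr the last round index):
;   __m128i ctr = iv = _mm_add_epi64(iv, one);              // INIT_CTR
;   __m128i s   = _mm_xor_si128(ctr, keys[0]);              // WOP pxor, key0
;   for (i = 1; i < nr; i++)
;     s = _mm_aesenc_si128(s, keys[i]);                     // WOP_KEY aesenc
;   s = _mm_aesenclast_si128(s, keys[nr]);                  // WOP_KEY aesenclast
;   data[j] = _mm_xor_si128(data[j], s);                    // XOR_WITH_DATA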
599
600
601MY_SEG_PROC AesCtr_Code_HW, 3
602 Ctr_start::
603 MY_PROLOG NUM_CTR_REGS
604
605 Ctr_start_2::
606 movdqa iv, [keys]
607 add keys, 32
608 movdqa key0, [keys]
609
610 add keys, ksize_r
611 neg ksize_r
612 add ksize_r, 16
613
614 Ctr_start_3::
615 mov koffs_x, 1
616 movd one, koffs_x
617 jmp check2_c
618
619 align 16
620 nextBlocks2_c:
621 WOP INIT_CTR, 0
622 mov koffs_r, ksize_r
623 ; WOP_KEY pxor, 1 * koffs_r -16
624 WOP pxor, key0
625 @@:
626 WOP_KEY aesenc, 1 * koffs_r
627 add koffs_r, 16
628 jnz @B
629 WOP_KEY aesenclast, 0
630
631 WOP XOR_WITH_DATA
632 WOP WRITE_TO_DATA
633 add rD, ways * 16
634 check2_c:
635 sub rN, ways
636 jnc nextBlocks2_c
637 add rN, ways
638
639 sub keys, 16
640 add ksize_r, 16
641
642 jmp check_c
643
644 ; align 16
645 nextBlock_c:
646 paddq iv, one
647 ; movdqa state, [keys + 1 * koffs_r - 16]
648 movdqa state, key0
649 mov koffs_r, ksize_r
650 pxor state, iv
651
652 @@:
653 OP_KEY aesenc, 1 * koffs_r
654 OP_KEY aesenc, 1 * koffs_r + 16
655 add koffs_r, 32
656 jnz @B
657 OP_KEY aesenc, 0
658 OP_KEY aesenclast, 16
659
660 pxor state, [rD]
661 movdqa [rD], state
662 add rD, 16
663 check_c:
664 sub rN, 1
665 jnc nextBlock_c
666
667 ; movdqa [keys - 32], iv
668 movdqa [keys + 1 * ksize_r - 16 - 32], iv
669MY_EPILOG
670
671
672MY_PROC AesCtr_Code_HW_256, 3
673 ifdef use_vaes_256
674 MY_PROLOG NUM_CTR_REGS
675
676 cmp rN, ways * 2
677 jb Ctr_start_2
678
679 vbroadcasti128 iv_ymm, xmmword ptr [keys]
680 add keys, 32
681 vbroadcasti128 key0_ymm, xmmword ptr [keys]
682 mov koffs_x, 1
683 vmovd one, koffs_x
684 vpsubq iv_ymm, iv_ymm, one_ymm
685 vpaddq one, one, one
686 AVX__vinserti128_TO_HIGH one_ymm, one
687
688 add keys, ksize_r
689 sub ksize_x, 16
690 neg ksize_r
691 mov koffs_r, ksize_r
692 add ksize_r, ksize_r
693
694 AVX_STACK_SUB = ((NUM_AES_KEYS_MAX + 1 - 1) * 32)
695 push keys2
696 lea keys2, [r4 - 32]
697 sub r4, AVX_STACK_SUB
698 and keys2, -32
699 vbroadcasti128 key_ymm, xmmword ptr [keys]
700 vmovdqa ymmword ptr [keys2], key_ymm
701 @@:
702 vbroadcasti128 key_ymm, xmmword ptr [keys + 1 * koffs_r]
703 vmovdqa ymmword ptr [keys2 + koffs_r * 2], key_ymm
704 add koffs_r, 16
705 jnz @B
706
707 sub rN, ways * 2
708
709 align 16
710 avx_ctr_nextBlock2:
711 mov koffs_r, ksize_r
712 AVX__WOP AVX__CTR_START
713 ; AVX__WOP_KEY AVX__CTR_START, 1 * koffs_r - 32
714 @@:
715 AVX__WOP_KEY AVX__VAES_ENC, 1 * koffs_r
716 add koffs_r, 32
717 jnz @B
718 AVX__WOP_KEY AVX__VAES_ENC_LAST, 0
719
720 AVX__WOP AVX__XOR_WITH_DATA
721 AVX__WOP AVX__WRITE_TO_DATA
722
723 add rD, ways * 32
724 sub rN, ways * 2
725 jnc avx_ctr_nextBlock2
726 add rN, ways * 2
727
728 vextracti128 iv, iv_ymm, 1
729 sar ksize_r, 1
730
731 add r4, AVX_STACK_SUB
732 pop keys2
733
734 vzeroupper
735 jmp Ctr_start_3
736 else
737 jmp Ctr_start
738 endif
739MY_ENDP
740MY_SEG_ENDP
741
742end
diff --git a/Asm/x86/LzFindOpt.asm b/Asm/x86/LzFindOpt.asm
new file mode 100644
index 0000000..42e10bd
--- /dev/null
+++ b/Asm/x86/LzFindOpt.asm
@@ -0,0 +1,513 @@
1; LzFindOpt.asm -- ASM version of GetMatchesSpecN_2() function
2; 2021-07-21: Igor Pavlov : Public domain
3;
4
5ifndef x64
6; x64=1
7; .err <x64_IS_REQUIRED>
8endif
9
10include 7zAsm.asm
11
12MY_ASM_START
13
14_TEXT$LZFINDOPT SEGMENT ALIGN(64) 'CODE'
15
16MY_ALIGN macro num:req
17 align num
18endm
19
20MY_ALIGN_32 macro
21 MY_ALIGN 32
22endm
23
24MY_ALIGN_64 macro
25 MY_ALIGN 64
26endm
27
28
29t0_L equ x0_L
30t0_x equ x0
31t0 equ r0
32t1_x equ x3
33t1 equ r3
34
35cp_x equ t1_x
36cp_r equ t1
37m equ x5
38m_r equ r5
39len_x equ x6
40len equ r6
41diff_x equ x7
42diff equ r7
43len0 equ r10
44len1_x equ x11
45len1 equ r11
46maxLen_x equ x12
47maxLen equ r12
48d equ r13
49ptr0 equ r14
50ptr1 equ r15
51
52d_lim equ m_r
53cycSize equ len_x
54hash_lim equ len0
55delta1_x equ len1_x
56delta1_r equ len1
57delta_x equ maxLen_x
58delta_r equ maxLen
59hash equ ptr0
60src equ ptr1
61
62
63
64if (IS_LINUX gt 0)
65
66; r1 r2 r8 r9 : win64
67; r7 r6 r2 r1 r8 r9 : linux
68
69lenLimit equ r8
70lenLimit_x equ x8
71; pos_r equ r2
72pos equ x2
73cur equ r1
74son equ r9
75
76else
77
78lenLimit equ REG_ABI_PARAM_2
79lenLimit_x equ REG_ABI_PARAM_2_x
80pos equ REG_ABI_PARAM_1_x
81cur equ REG_ABI_PARAM_0
82son equ REG_ABI_PARAM_3
83
84endif
85
86
87if (IS_LINUX gt 0)
88 maxLen_OFFS equ (REG_SIZE * (6 + 1))
89else
90 cutValue_OFFS equ (REG_SIZE * (8 + 1 + 4))
91 d_OFFS equ (REG_SIZE + cutValue_OFFS)
92 maxLen_OFFS equ (REG_SIZE + d_OFFS)
93endif
94 hash_OFFS equ (REG_SIZE + maxLen_OFFS)
95 limit_OFFS equ (REG_SIZE + hash_OFFS)
96 size_OFFS equ (REG_SIZE + limit_OFFS)
97 cycPos_OFFS equ (REG_SIZE + size_OFFS)
98 cycSize_OFFS equ (REG_SIZE + cycPos_OFFS)
99 posRes_OFFS equ (REG_SIZE + cycSize_OFFS)
100
101if (IS_LINUX gt 0)
102else
103 cutValue_PAR equ [r0 + cutValue_OFFS]
104 d_PAR equ [r0 + d_OFFS]
105endif
106 maxLen_PAR equ [r0 + maxLen_OFFS]
107 hash_PAR equ [r0 + hash_OFFS]
108 limit_PAR equ [r0 + limit_OFFS]
109 size_PAR equ [r0 + size_OFFS]
110 cycPos_PAR equ [r0 + cycPos_OFFS]
111 cycSize_PAR equ [r0 + cycSize_OFFS]
112 posRes_PAR equ [r0 + posRes_OFFS]
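; Note (inferred from the *_OFFS chain above, not an exact C prototype):
; after the register parameters (cur, pos, lenLimit, son), the trailing stack
; parameters arrive in this order:
;   (Win64 only:) cutValue, d, then: maxLen, hash, limit, size,
;   cycPos, cycSize, posRes
; on Linux, cutValue and d are passed in registers, so maxLen is the first
; stack parameter; this accounts for all 13 parameters of GetMatchesSpecN_2.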
113
114
115 cutValue_VAR equ DWORD PTR [r4 + 8 * 0]
116 cutValueCur_VAR equ DWORD PTR [r4 + 8 * 0 + 4]
117 cycPos_VAR equ DWORD PTR [r4 + 8 * 1 + 0]
118 cycSize_VAR equ DWORD PTR [r4 + 8 * 1 + 4]
119 hash_VAR equ QWORD PTR [r4 + 8 * 2]
120 limit_VAR equ QWORD PTR [r4 + 8 * 3]
121 size_VAR equ QWORD PTR [r4 + 8 * 4]
122 distances equ QWORD PTR [r4 + 8 * 5]
123 maxLen_VAR equ QWORD PTR [r4 + 8 * 6]
124
125 Old_RSP equ QWORD PTR [r4 + 8 * 7]
126 LOCAL_SIZE equ 8 * 8
127
128COPY_VAR_32 macro dest_var, src_var
129 mov x3, src_var
130 mov dest_var, x3
131endm
132
133COPY_VAR_64 macro dest_var, src_var
134 mov r3, src_var
135 mov dest_var, r3
136endm
137
138
139; MY_ALIGN_64
140MY_PROC GetMatchesSpecN_2, 13
141MY_PUSH_PRESERVED_ABI_REGS
142 mov r0, RSP
143 lea r3, [r0 - LOCAL_SIZE]
144 and r3, -64
145 mov RSP, r3
146 mov Old_RSP, r0
147
148if (IS_LINUX gt 0)
149 mov d, REG_ABI_PARAM_5 ; r13 = r9
150 mov cutValue_VAR, REG_ABI_PARAM_4_x ; = r8
151 mov son, REG_ABI_PARAM_3 ; r9 = r1
152 mov r8, REG_ABI_PARAM_2 ; r8 = r2
153 mov pos, REG_ABI_PARAM_1_x ; r2 = x6
154 mov r1, REG_ABI_PARAM_0 ; r1 = r7
155else
156 COPY_VAR_32 cutValue_VAR, cutValue_PAR
157 mov d, d_PAR
158endif
159
160 COPY_VAR_64 limit_VAR, limit_PAR
161
162 mov hash_lim, size_PAR
163 mov size_VAR, hash_lim
164
165 mov cp_x, cycPos_PAR
166 mov hash, hash_PAR
167
168 mov cycSize, cycSize_PAR
169 mov cycSize_VAR, cycSize
170
171 ; we want cur in (rcx). So we change the cur and lenLimit variables
172 sub lenLimit, cur
173 neg lenLimit_x
174 inc lenLimit_x
175
176 mov t0_x, maxLen_PAR
177 sub t0, lenLimit
178 mov maxLen_VAR, t0
179
180 jmp main_loop
181
182MY_ALIGN_64
183fill_empty:
184 ; ptr0 = *ptr1 = kEmptyHashValue;
185 mov QWORD PTR [ptr1], 0
186 inc pos
187 inc cp_x
188 mov DWORD PTR [d - 4], 0
189 cmp d, limit_VAR
190 jae fin
191 cmp hash, hash_lim
192 je fin
193
194; MY_ALIGN_64
195main_loop:
196 ; UInt32 delta = *hash++;
197 mov diff_x, [hash] ; delta
198 add hash, 4
199 ; mov cycPos_VAR, cp_x
200
201 inc cur
202 add d, 4
203 mov m, pos
204 sub m, diff_x ; matchPos
205
206 ; CLzRef *ptr1 = son + ((size_t)(pos) << 1) - CYC_TO_POS_OFFSET * 2;
207 lea ptr1, [son + 8 * cp_r]
208 ; mov cycSize, cycSize_VAR
209 cmp pos, cycSize
210 jb directMode ; if (pos < cycSize_VAR)
211
212 ; CYC MODE
213
214 cmp diff_x, cycSize
215 jae fill_empty ; if (delta >= cycSize_VAR)
216
217 xor t0_x, t0_x
218 mov cycPos_VAR, cp_x
219 sub cp_x, diff_x
220 ; jae prepare_for_tree_loop
221 ; add cp_x, cycSize
222 cmovb t0_x, cycSize
223 add cp_x, t0_x ; cp_x += (cycPos < delta ? cycSize : 0)
224 jmp prepare_for_tree_loop
225
226
227directMode:
228 cmp diff_x, pos
229 je fill_empty ; if (delta == pos)
230 jae fin_error ; if (delta >= pos)
231
232 mov cycPos_VAR, cp_x
233 mov cp_x, m
234
235prepare_for_tree_loop:
236 mov len0, lenLimit
237 mov hash_VAR, hash
238 ; CLzRef *ptr0 = son + ((size_t)(pos) << 1) - CYC_TO_POS_OFFSET * 2 + 1;
239 lea ptr0, [ptr1 + 4]
240 ; UInt32 *_distances = ++d;
241 mov distances, d
242
243 neg len0
244 mov len1, len0
245
246 mov t0_x, cutValue_VAR
247 mov maxLen, maxLen_VAR
248 mov cutValueCur_VAR, t0_x
249
250MY_ALIGN_32
251tree_loop:
252 neg diff
253 mov len, len0
254 cmp len1, len0
255 cmovb len, len1 ; len = (len1 < len0 ? len1 : len0);
256 add diff, cur
257
258 mov t0_x, [son + cp_r * 8] ; prefetch
259 movzx t0_x, BYTE PTR [diff + 1 * len]
260 lea cp_r, [son + cp_r * 8]
261 cmp [cur + 1 * len], t0_L
262 je matched_1
263
264 jb left_0
265
266 mov [ptr1], m
267 mov m, [cp_r + 4]
268 lea ptr1, [cp_r + 4]
269 sub diff, cur ; FIX32
270 jmp next_node
271
272MY_ALIGN_32
273left_0:
274 mov [ptr0], m
275 mov m, [cp_r]
276 mov ptr0, cp_r
277 sub diff, cur ; FIX32
278 ; jmp next_node
279
280; ------------ NEXT NODE ------------
281; MY_ALIGN_32
282next_node:
283 mov cycSize, cycSize_VAR
284 dec cutValueCur_VAR
285 je finish_tree
286
287 add diff_x, pos ; prev_match = pos + diff
288 cmp m, diff_x
289 jae fin_error ; if (new_match >= prev_match)
290
291 mov diff_x, pos
292 sub diff_x, m ; delta = pos - new_match
293 cmp pos, cycSize
294 jae cyc_mode_2 ; if (pos >= cycSize)
295
296 mov cp_x, m
297 test m, m
298 jne tree_loop ; if (m != 0)
299
300finish_tree:
301 ; ptr0 = *ptr1 = kEmptyHashValue;
302 mov DWORD PTR [ptr0], 0
303 mov DWORD PTR [ptr1], 0
304
305 inc pos
306
307 ; _distances[-1] = (UInt32)(d - _distances);
308 mov t0, distances
309 mov t1, d
310 sub t1, t0
311 shr t1_x, 2
312 mov [t0 - 4], t1_x
313
314 cmp d, limit_VAR
315 jae fin ; if (d >= limit)
316
317 mov cp_x, cycPos_VAR
318 mov hash, hash_VAR
319 mov hash_lim, size_VAR
320 inc cp_x
321 cmp hash, hash_lim
322 jne main_loop ; if (hash != size)
323 jmp fin
324
325
326MY_ALIGN_32
327cyc_mode_2:
328 cmp diff_x, cycSize
329 jae finish_tree ; if (delta >= cycSize)
330
331 mov cp_x, cycPos_VAR
332 xor t0_x, t0_x
333 sub cp_x, diff_x ; cp_x = cycPos - delta
334 cmovb t0_x, cycSize
335 add cp_x, t0_x ; cp_x += (cycPos < delta ? cycSize : 0)
336 jmp tree_loop
337
338
339MY_ALIGN_32
340matched_1:
341
342 inc len
343 ; cmp len_x, lenLimit_x
344 je short lenLimit_reach
345 movzx t0_x, BYTE PTR [diff + 1 * len]
346 cmp [cur + 1 * len], t0_L
347 jne mismatch
348
349
350MY_ALIGN_32
351match_loop:
352 ; while (++len != lenLimit && len[diff] == len[cur]);
353
354 inc len
355 ; cmp len_x, lenLimit_x
356 je short lenLimit_reach
357 movzx t0_x, BYTE PTR [diff + 1 * len]
358 cmp BYTE PTR [cur + 1 * len], t0_L
359 je match_loop
360
361mismatch:
362 jb left_2
363
364 mov [ptr1], m
365 mov m, [cp_r + 4]
366 lea ptr1, [cp_r + 4]
367 mov len1, len
368
369 jmp max_update
370
371MY_ALIGN_32
372left_2:
373 mov [ptr0], m
374 mov m, [cp_r]
375 mov ptr0, cp_r
376 mov len0, len
377
378max_update:
379 sub diff, cur ; restore diff
380
381 cmp maxLen, len
382 jae next_node
383
384 mov maxLen, len
385 add len, lenLimit
386 mov [d], len_x
387 mov t0_x, diff_x
388 not t0_x
389 mov [d + 4], t0_x
390 add d, 8
391
392 jmp next_node
393
394
395
396MY_ALIGN_32
397lenLimit_reach:
398
399 mov delta_r, cur
400 sub delta_r, diff
401 lea delta1_r, [delta_r - 1]
402
403 mov t0_x, [cp_r]
404 mov [ptr1], t0_x
405 mov t0_x, [cp_r + 4]
406 mov [ptr0], t0_x
407
408 mov [d], lenLimit_x
409 mov [d + 4], delta1_x
410 add d, 8
411
412 ; _distances[-1] = (UInt32)(d - _distances);
413 mov t0, distances
414 mov t1, d
415 sub t1, t0
416 shr t1_x, 2
417 mov [t0 - 4], t1_x
418
419 mov hash, hash_VAR
420 mov hash_lim, size_VAR
421
422 inc pos
423 mov cp_x, cycPos_VAR
424 inc cp_x
425
426 mov d_lim, limit_VAR
427 mov cycSize, cycSize_VAR
428 ; if (hash == size || *hash != delta || lenLimit[diff] != lenLimit[0] || d >= limit)
429 ; break;
430 cmp hash, hash_lim
431 je fin
432 cmp d, d_lim
433 jae fin
434 cmp delta_x, [hash]
435 jne main_loop
436 movzx t0_x, BYTE PTR [diff]
437 cmp [cur], t0_L
438 jne main_loop
439
440 ; jmp main_loop ; bypass for debug
441
442 mov cycPos_VAR, cp_x
443 shl len, 3 ; cycSize * 8
444 sub diff, cur ; restore diff
445 xor t0_x, t0_x
446 cmp cp_x, delta_x ; cmp (cycPos_VAR, delta)
447 lea cp_r, [son + 8 * cp_r] ; dest
448 lea src, [cp_r + 8 * diff]
449 cmovb t0, len ; t0 = (cycPos_VAR < delta ? cycSize * 8 : 0)
450 add src, t0
451 add len, son ; len = son + cycSize * 8
452
453
454MY_ALIGN_32
455long_loop:
456 add hash, 4
457
458 ; *(UInt64 *)(void *)ptr = ((const UInt64 *)(const void *)ptr)[diff];
459
460 mov t0, [src]
461 add src, 8
462 mov [cp_r], t0
463 add cp_r, 8
464 cmp src, len
465 cmove src, son ; if the end of the (son) buffer is reached, we wrap to the beginning
466
467 mov DWORD PTR [d], 2
468 mov [d + 4], lenLimit_x
469 mov [d + 8], delta1_x
470 add d, 12
471
472 inc cur
473
474 cmp hash, hash_lim
475 je long_footer
476 cmp delta_x, [hash]
477 jne long_footer
478 movzx t0_x, BYTE PTR [diff + 1 * cur]
479 cmp [cur], t0_L
480 jne long_footer
481 cmp d, d_lim
482 jb long_loop
483
484long_footer:
485 sub cp_r, son
486 shr cp_r, 3
487 add pos, cp_x
488 sub pos, cycPos_VAR
489 mov cycSize, cycSize_VAR
490
491 cmp d, d_lim
492 jae fin
493 cmp hash, hash_lim
494 jne main_loop
495 jmp fin
496
497
498
499fin_error:
500 xor d, d
501
502fin:
503 mov RSP, Old_RSP
504 mov t0, [r4 + posRes_OFFS]
505 mov [t0], pos
506 mov r0, d
507
508MY_POP_PRESERVED_ABI_REGS
509MY_ENDP
510
511_TEXT$LZFINDOPT ENDS
512
513end
diff --git a/Asm/x86/LzmaDecOpt.asm b/Asm/x86/LzmaDecOpt.asm
new file mode 100644
index 0000000..f2818e7
--- /dev/null
+++ b/Asm/x86/LzmaDecOpt.asm
@@ -0,0 +1,1303 @@
1; LzmaDecOpt.asm -- ASM version of LzmaDec_DecodeReal_3() function
2; 2021-02-23: Igor Pavlov : Public domain
3;
4; The "3" is the code compatibility version of the LzmaDec_DecodeReal_*()
5; function, used for a check at link time.
6; This code is tightly coupled with LzmaDec_TryDummy()
7; and with other functions in the LzmaDec.c file.
8; The CLzmaDec structure, the (probs) array layout, and the input and output
9; of LzmaDec_DecodeReal_*() must be identical in both versions (C / ASM).
10
11ifndef x64
12; x64=1
13; .err <x64_IS_REQUIRED>
14endif
15
16include 7zAsm.asm
17
18MY_ASM_START
19
20_TEXT$LZMADECOPT SEGMENT ALIGN(64) 'CODE'
21
22MY_ALIGN macro num:req
23 align num
24endm
25
26MY_ALIGN_16 macro
27 MY_ALIGN 16
28endm
29
30MY_ALIGN_32 macro
31 MY_ALIGN 32
32endm
33
34MY_ALIGN_64 macro
35 MY_ALIGN 64
36endm
37
38
39; _LZMA_SIZE_OPT equ 1
40
41; _LZMA_PROB32 equ 1
42
43ifdef _LZMA_PROB32
44 PSHIFT equ 2
45 PLOAD macro dest, mem
46 mov dest, dword ptr [mem]
47 endm
48 PSTORE macro src, mem
49 mov dword ptr [mem], src
50 endm
51else
52 PSHIFT equ 1
53 PLOAD macro dest, mem
54 movzx dest, word ptr [mem]
55 endm
56 PSTORE macro src, mem
57 mov word ptr [mem], @CatStr(src, _W)
58 endm
59endif
60
61PMULT equ (1 SHL PSHIFT)
62PMULT_HALF equ (1 SHL (PSHIFT - 1))
63PMULT_2 equ (1 SHL (PSHIFT + 1))
64
65kMatchSpecLen_Error_Data equ (1 SHL 9)
66
67; x0 range
68; x1 pbPos / (prob) TREE
69; x2 probBranch / prm (MATCHED) / pbPos / cnt
70; x3 sym
71;====== r4 === RSP
72; x5 cod
73; x6 t1 NORM_CALC / probs_state / dist
74; x7 t0 NORM_CALC / prob2 IF_BIT_1
75; x8 state
76; x9 match (MATCHED) / sym2 / dist2 / lpMask_reg
77; x10 kBitModelTotal_reg
78; r11 probs
79; x12 offs (MATCHED) / dic / len_temp
80; x13 processedPos
81; x14 bit (MATCHED) / dicPos
82; r15 buf
83
84
85cod equ x5
86cod_L equ x5_L
87range equ x0
88state equ x8
89state_R equ r8
90buf equ r15
91processedPos equ x13
92kBitModelTotal_reg equ x10
93
94probBranch equ x2
95probBranch_R equ r2
96probBranch_W equ x2_W
97
98pbPos equ x1
99pbPos_R equ r1
100
101cnt equ x2
102cnt_R equ r2
103
104lpMask_reg equ x9
105dicPos equ r14
106
107sym equ x3
108sym_R equ r3
109sym_L equ x3_L
110
111probs equ r11
112dic equ r12
113
114t0 equ x7
115t0_W equ x7_W
116t0_R equ r7
117
118prob2 equ t0
119prob2_W equ t0_W
120
121t1 equ x6
122t1_R equ r6
123
124probs_state equ t1
125probs_state_R equ t1_R
126
127prm equ r2
128match equ x9
129match_R equ r9
130offs equ x12
131offs_R equ r12
132bit equ x14
133bit_R equ r14
134
135sym2 equ x9
136sym2_R equ r9
137
138len_temp equ x12
139
140dist equ sym
141dist2 equ x9
142
143
144
145kNumBitModelTotalBits equ 11
146kBitModelTotal equ (1 SHL kNumBitModelTotalBits)
147kNumMoveBits equ 5
148kBitModelOffset equ ((1 SHL kNumMoveBits) - 1)
149kTopValue equ (1 SHL 24)
150
151NORM_2 macro
152 ; movzx t0, BYTE PTR [buf]
153 shl cod, 8
154 mov cod_L, BYTE PTR [buf]
155 shl range, 8
156 ; or cod, t0
157 inc buf
158endm
159
160
161NORM macro
162 cmp range, kTopValue
163 jae SHORT @F
164 NORM_2
165@@:
166endm
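; For reference, NORM / NORM_2 are the usual range-coder normalization;
; a rough C equivalent (LzmaDec.c-style names):
;   if (range < kTopValue) { range <<= 8; code = (code << 8) | *buf++; }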
167
168
169; ---------- Branch MACROS ----------
170
171UPDATE_0 macro probsArray:req, probOffset:req, probDisp:req
172 mov prob2, kBitModelTotal_reg
173 sub prob2, probBranch
174 shr prob2, kNumMoveBits
175 add probBranch, prob2
176 PSTORE probBranch, probOffset * 1 + probsArray + probDisp * PMULT
177endm
178
179
180UPDATE_1 macro probsArray:req, probOffset:req, probDisp:req
181 sub prob2, range
182 sub cod, range
183 mov range, prob2
184 mov prob2, probBranch
185 shr probBranch, kNumMoveBits
186 sub prob2, probBranch
187 PSTORE prob2, probOffset * 1 + probsArray + probDisp * PMULT
188endm
189
190
191CMP_COD macro probsArray:req, probOffset:req, probDisp:req
192 PLOAD probBranch, probOffset * 1 + probsArray + probDisp * PMULT
193 NORM
194 mov prob2, range
195 shr range, kNumBitModelTotalBits
196 imul range, probBranch
197 cmp cod, range
198endm
199
200
201IF_BIT_1_NOUP macro probsArray:req, probOffset:req, probDisp:req, toLabel:req
202 CMP_COD probsArray, probOffset, probDisp
203 jae toLabel
204endm
205
206
207IF_BIT_1 macro probsArray:req, probOffset:req, probDisp:req, toLabel:req
208 IF_BIT_1_NOUP probsArray, probOffset, probDisp, toLabel
209 UPDATE_0 probsArray, probOffset, probDisp
210endm
211
212
213IF_BIT_0_NOUP macro probsArray:req, probOffset:req, probDisp:req, toLabel:req
214 CMP_COD probsArray, probOffset, probDisp
215 jb toLabel
216endm
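; For reference, a rough C equivalent of one branch-style bit decode
; (CMP_COD + UPDATE_0 / UPDATE_1; LzmaDec.c-style names):
;   UInt32 bound = (range >> kNumBitModelTotalBits) * *prob;
;   if (code < bound)   // bit = 0 (the IF_BIT_0 path)
;   { range = bound; *prob += (kBitModelTotal - *prob) >> kNumMoveBits; }
;   else                // bit = 1 (the IF_BIT_1 path)
;   { range -= bound; code -= bound; *prob -= *prob >> kNumMoveBits; }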
217
218
219; ---------- CMOV MACROS ----------
220
221NORM_CALC macro prob:req
222 NORM
223 mov t0, range
224 shr range, kNumBitModelTotalBits
225 imul range, prob
226 sub t0, range
227 mov t1, cod
228 sub cod, range
229endm
230
231
232PUP macro prob:req, probPtr:req
233 sub t0, prob
234 ; only sar works for both 16/32 bit prob modes
235 sar t0, kNumMoveBits
236 add t0, prob
237 PSTORE t0, probPtr
238endm
239
240
241PUP_SUB macro prob:req, probPtr:req, symSub:req
242 sbb sym, symSub
243 PUP prob, probPtr
244endm
245
246
247PUP_COD macro prob:req, probPtr:req, symSub:req
248 mov t0, kBitModelOffset
249 cmovb cod, t1
250 mov t1, sym
251 cmovb t0, kBitModelTotal_reg
252 PUP_SUB prob, probPtr, symSub
253endm
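; Rough sketch of the CMOV scheme: NORM_CALC computes both outcomes up front
; (t0 = range - bound, t1 = old code, code -= bound), and the macros above
; then select with cmovae/cmovb instead of branching:
;   bit   = (old_code >= bound);
;   range = bit ? (range - bound) : bound;
;   code  = bit ? code : old_code;
;   *prob += ((bit ? kBitModelOffset : kBitModelTotal) - *prob) >> kNumMoveBits;
; the signed shift with the kBitModelOffset bias reproduces the usual
; "*prob -= *prob >> kNumMoveBits" update for the bit = 1 case.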
254
255
256BIT_0 macro prob:req, probNext:req
257 PLOAD prob, probs + 1 * PMULT
258 PLOAD probNext, probs + 1 * PMULT_2
259
260 NORM_CALC prob
261
262 cmovae range, t0
263 PLOAD t0, probs + 1 * PMULT_2 + PMULT
264 cmovae probNext, t0
265 mov t0, kBitModelOffset
266 cmovb cod, t1
267 cmovb t0, kBitModelTotal_reg
268 mov sym, 2
269 PUP_SUB prob, probs + 1 * PMULT, 0 - 1
270endm
271
272
273BIT_1 macro prob:req, probNext:req
274 PLOAD probNext, probs + sym_R * PMULT_2
275 add sym, sym
276
277 NORM_CALC prob
278
279 cmovae range, t0
280 PLOAD t0, probs + sym_R * PMULT + PMULT
281 cmovae probNext, t0
282 PUP_COD prob, probs + t1_R * PMULT_HALF, 0 - 1
283endm
284
285
286BIT_2 macro prob:req, symSub:req
287 add sym, sym
288
289 NORM_CALC prob
290
291 cmovae range, t0
292 PUP_COD prob, probs + t1_R * PMULT_HALF, symSub
293endm
294
295
296; ---------- MATCHED LITERAL ----------
297
298LITM_0 macro
299 mov offs, 256 * PMULT
300 shl match, (PSHIFT + 1)
301 mov bit, offs
302 and bit, match
303 PLOAD x1, probs + 256 * PMULT + bit_R * 1 + 1 * PMULT
304 lea prm, [probs + 256 * PMULT + bit_R * 1 + 1 * PMULT]
305 ; lea prm, [probs + 256 * PMULT + 1 * PMULT]
306 ; add prm, bit_R
307 xor offs, bit
308 add match, match
309
310 NORM_CALC x1
311
312 cmovae offs, bit
313 mov bit, match
314 cmovae range, t0
315 mov t0, kBitModelOffset
316 cmovb cod, t1
317 cmovb t0, kBitModelTotal_reg
318 mov sym, 0
319 PUP_SUB x1, prm, -2-1
320endm
321
322
323LITM macro
324 and bit, offs
325 lea prm, [probs + offs_R * 1]
326 add prm, bit_R
327 PLOAD x1, prm + sym_R * PMULT
328 xor offs, bit
329 add sym, sym
330 add match, match
331
332 NORM_CALC x1
333
334 cmovae offs, bit
335 mov bit, match
336 cmovae range, t0
337 PUP_COD x1, prm + t1_R * PMULT_HALF, - 1
338endm
339
340
341LITM_2 macro
342 and bit, offs
343 lea prm, [probs + offs_R * 1]
344 add prm, bit_R
345 PLOAD x1, prm + sym_R * PMULT
346 add sym, sym
347
348 NORM_CALC x1
349
350 cmovae range, t0
351 PUP_COD x1, prm + t1_R * PMULT_HALF, 256 - 1
352endm
353
354
355; ---------- REVERSE BITS ----------
356
357REV_0 macro prob:req, probNext:req
358 ; PLOAD prob, probs + 1 * PMULT
359 ; lea sym2_R, [probs + 2 * PMULT]
360 ; PLOAD probNext, probs + 2 * PMULT
361 PLOAD probNext, sym2_R
362
363 NORM_CALC prob
364
365 cmovae range, t0
366 PLOAD t0, probs + 3 * PMULT
367 cmovae probNext, t0
368 cmovb cod, t1
369 mov t0, kBitModelOffset
370 cmovb t0, kBitModelTotal_reg
371 lea t1_R, [probs + 3 * PMULT]
372 cmovae sym2_R, t1_R
373 PUP prob, probs + 1 * PMULT
374endm
375
376
377REV_1 macro prob:req, probNext:req, step:req
378 add sym2_R, step * PMULT
379 PLOAD probNext, sym2_R
380
381 NORM_CALC prob
382
383 cmovae range, t0
384 PLOAD t0, sym2_R + step * PMULT
385 cmovae probNext, t0
386 cmovb cod, t1
387 mov t0, kBitModelOffset
388 cmovb t0, kBitModelTotal_reg
389 lea t1_R, [sym2_R + step * PMULT]
390 cmovae sym2_R, t1_R
391 PUP prob, t1_R - step * PMULT_2
392endm
393
394
395REV_2 macro prob:req, step:req
396 sub sym2_R, probs
397 shr sym2, PSHIFT
398 or sym, sym2
399
400 NORM_CALC prob
401
402 cmovae range, t0
403 lea t0, [sym - step]
404 cmovb sym, t0
405 cmovb cod, t1
406 mov t0, kBitModelOffset
407 cmovb t0, kBitModelTotal_reg
408 PUP prob, probs + sym2_R * PMULT
409endm
410
411
412REV_1_VAR macro prob:req
413 PLOAD prob, sym_R
414 mov probs, sym_R
415 add sym_R, sym2_R
416
417 NORM_CALC prob
418
419 cmovae range, t0
420 lea t0_R, [sym_R + 1 * sym2_R]
421 cmovae sym_R, t0_R
422 mov t0, kBitModelOffset
423 cmovb cod, t1
424 ; mov t1, kBitModelTotal
425 ; cmovb t0, t1
426 cmovb t0, kBitModelTotal_reg
427 add sym2, sym2
428 PUP prob, probs
429endm
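; For reference, REV_0 / REV_1 / REV_2 / REV_1_VAR walk a reverse bit tree;
; a rough C equivalent (LzmaDec.c style; DecodeBit stands for one CMOV decode):
;   m = 1; sym = 0;
;   for (i = 0; i < numBits; i++)
;   { unsigned bit = DecodeBit(probs + m); m = (m << 1) + bit; sym |= bit << i; }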
430
431
432
433
434LIT_PROBS macro lpMaskParam:req
435 ; prob += (UInt32)3 * ((((processedPos << 8) + dic[(dicPos == 0 ? dicBufSize : dicPos) - 1]) & lpMask) << lc);
436 mov t0, processedPos
437 shl t0, 8
438 add sym, t0
439 and sym, lpMaskParam
440 add probs_state_R, pbPos_R
441 mov x1, LOC lc2
442 lea sym, dword ptr[sym_R + 2 * sym_R]
443 add probs, Literal * PMULT
444 shl sym, x1_L
445 add probs, sym_R
446 UPDATE_0 probs_state_R, 0, IsMatch
447 inc processedPos
448endm
449
450
451
452kNumPosBitsMax equ 4
453kNumPosStatesMax equ (1 SHL kNumPosBitsMax)
454
455kLenNumLowBits equ 3
456kLenNumLowSymbols equ (1 SHL kLenNumLowBits)
457kLenNumHighBits equ 8
458kLenNumHighSymbols equ (1 SHL kLenNumHighBits)
459kNumLenProbs equ (2 * kLenNumLowSymbols * kNumPosStatesMax + kLenNumHighSymbols)
460
461LenLow equ 0
462LenChoice equ LenLow
463LenChoice2 equ (LenLow + kLenNumLowSymbols)
464LenHigh equ (LenLow + 2 * kLenNumLowSymbols * kNumPosStatesMax)
465
466kNumStates equ 12
467kNumStates2 equ 16
468kNumLitStates equ 7
469
470kStartPosModelIndex equ 4
471kEndPosModelIndex equ 14
472kNumFullDistances equ (1 SHL (kEndPosModelIndex SHR 1))
473
474kNumPosSlotBits equ 6
475kNumLenToPosStates equ 4
476
477kNumAlignBits equ 4
478kAlignTableSize equ (1 SHL kNumAlignBits)
479
480kMatchMinLen equ 2
481kMatchSpecLenStart equ (kMatchMinLen + kLenNumLowSymbols * 2 + kLenNumHighSymbols)
482
483kStartOffset equ 1664
484SpecPos equ (-kStartOffset)
485IsRep0Long equ (SpecPos + kNumFullDistances)
486RepLenCoder equ (IsRep0Long + (kNumStates2 SHL kNumPosBitsMax))
487LenCoder equ (RepLenCoder + kNumLenProbs)
488IsMatch equ (LenCoder + kNumLenProbs)
489kAlign equ (IsMatch + (kNumStates2 SHL kNumPosBitsMax))
490IsRep equ (kAlign + kAlignTableSize)
491IsRepG0 equ (IsRep + kNumStates)
492IsRepG1 equ (IsRepG0 + kNumStates)
493IsRepG2 equ (IsRepG1 + kNumStates)
494PosSlot equ (IsRepG2 + kNumStates)
495Literal equ (PosSlot + (kNumLenToPosStates SHL kNumPosSlotBits))
496NUM_BASE_PROBS equ (Literal + kStartOffset)
497
498if kAlign ne 0
499 .err <Stop_Compiling_Bad_LZMA_kAlign>
500endif
501
502if NUM_BASE_PROBS ne 1984
503 .err <Stop_Compiling_Bad_LZMA_PROBS>
504endif
505
506
507PTR_FIELD equ dq ?
508
509CLzmaDec_Asm struct
510 lc db ?
511 lp db ?
512 pb db ?
513 _pad_ db ?
514 dicSize dd ?
515
516 probs_Spec PTR_FIELD
517 probs_1664 PTR_FIELD
518 dic_Spec PTR_FIELD
519 dicBufSize PTR_FIELD
520 dicPos_Spec PTR_FIELD
521 buf_Spec PTR_FIELD
522
523 range_Spec dd ?
524 code_Spec dd ?
525 processedPos_Spec dd ?
526 checkDicSize dd ?
527 rep0 dd ?
528 rep1 dd ?
529 rep2 dd ?
530 rep3 dd ?
531 state_Spec dd ?
532 remainLen dd ?
533CLzmaDec_Asm ends
534
535
536CLzmaDec_Asm_Loc struct
537 OLD_RSP PTR_FIELD
538 lzmaPtr PTR_FIELD
539 _pad0_ PTR_FIELD
540 _pad1_ PTR_FIELD
541 _pad2_ PTR_FIELD
542 dicBufSize PTR_FIELD
543 probs_Spec PTR_FIELD
544 dic_Spec PTR_FIELD
545
546 limit PTR_FIELD
547 bufLimit PTR_FIELD
548 lc2 dd ?
549 lpMask dd ?
550 pbMask dd ?
551 checkDicSize dd ?
552
553 _pad_ dd ?
554 remainLen dd ?
555 dicPos_Spec PTR_FIELD
556 rep0 dd ?
557 rep1 dd ?
558 rep2 dd ?
559 rep3 dd ?
560CLzmaDec_Asm_Loc ends
561
562
563GLOB_2 equ [sym_R].CLzmaDec_Asm.
564GLOB equ [r1].CLzmaDec_Asm.
565LOC_0 equ [r0].CLzmaDec_Asm_Loc.
566LOC equ [RSP].CLzmaDec_Asm_Loc.
567
568
569COPY_VAR macro name
570 mov t0, GLOB_2 name
571 mov LOC_0 name, t0
572endm
573
574
575RESTORE_VAR macro name
576 mov t0, LOC name
577 mov GLOB name, t0
578endm
579
580
581
582IsMatchBranch_Pre macro reg
583 ; prob = probs + IsMatch + (state << kNumPosBitsMax) + posState;
584 mov pbPos, LOC pbMask
585 and pbPos, processedPos
586 shl pbPos, (kLenNumLowBits + 1 + PSHIFT)
587 lea probs_state_R, [probs + 1 * state_R]
588endm
589
590
591IsMatchBranch macro reg
592 IsMatchBranch_Pre
593 IF_BIT_1 probs_state_R, pbPos_R, IsMatch, IsMatch_label
594endm
595
596
597CheckLimits macro reg
598 cmp buf, LOC bufLimit
599 jae fin_OK
600 cmp dicPos, LOC limit
601 jae fin_OK
602endm
603
604
605
606; RSP is (16x + 8) bytes aligned in WIN64-x64
607; LocalSize equ ((((SIZEOF CLzmaDec_Asm_Loc) + 7) / 16 * 16) + 8)
608
609PARAM_lzma equ REG_ABI_PARAM_0
610PARAM_limit equ REG_ABI_PARAM_1
611PARAM_bufLimit equ REG_ABI_PARAM_2
612
613; MY_ALIGN_64
614MY_PROC LzmaDec_DecodeReal_3, 3
615MY_PUSH_PRESERVED_ABI_REGS
616
617 lea r0, [RSP - (SIZEOF CLzmaDec_Asm_Loc)]
618 and r0, -128
619 mov r5, RSP
620 mov RSP, r0
621 mov LOC_0 Old_RSP, r5
622 mov LOC_0 lzmaPtr, PARAM_lzma
623
624 mov LOC_0 remainLen, 0 ; remainLen must be ZERO
625
626 mov LOC_0 bufLimit, PARAM_bufLimit
627 mov sym_R, PARAM_lzma ; CLzmaDec_Asm_Loc pointer for GLOB_2
628 mov dic, GLOB_2 dic_Spec
629 add PARAM_limit, dic
630 mov LOC_0 limit, PARAM_limit
631
632 COPY_VAR(rep0)
633 COPY_VAR(rep1)
634 COPY_VAR(rep2)
635 COPY_VAR(rep3)
636
637 mov dicPos, GLOB_2 dicPos_Spec
638 add dicPos, dic
639 mov LOC_0 dicPos_Spec, dicPos
640 mov LOC_0 dic_Spec, dic
641
642 mov x1_L, GLOB_2 pb
643 mov t0, 1
644 shl t0, x1_L
645 dec t0
646 mov LOC_0 pbMask, t0
647
648 ; unsigned pbMask = ((unsigned)1 << (p->prop.pb)) - 1;
649 ; unsigned lc = p->prop.lc;
650 ; unsigned lpMask = ((unsigned)0x100 << p->prop.lp) - ((unsigned)0x100 >> lc);
651
652 mov x1_L, GLOB_2 lc
653 mov x2, 100h
654 mov t0, x2
655 shr x2, x1_L
656 ; inc x1
657 add x1_L, PSHIFT
658 mov LOC_0 lc2, x1
659 mov x1_L, GLOB_2 lp
660 shl t0, x1_L
661 sub t0, x2
662 mov LOC_0 lpMask, t0
663 mov lpMask_reg, t0
664
665 ; mov probs, GLOB_2 probs_Spec
666 ; add probs, kStartOffset SHL PSHIFT
667 mov probs, GLOB_2 probs_1664
668 mov LOC_0 probs_Spec, probs
669
670 mov t0_R, GLOB_2 dicBufSize
671 mov LOC_0 dicBufSize, t0_R
672
673 mov x1, GLOB_2 checkDicSize
674 mov LOC_0 checkDicSize, x1
675
676 mov processedPos, GLOB_2 processedPos_Spec
677
678 mov state, GLOB_2 state_Spec
679 shl state, PSHIFT
680
681 mov buf, GLOB_2 buf_Spec
682 mov range, GLOB_2 range_Spec
683 mov cod, GLOB_2 code_Spec
684 mov kBitModelTotal_reg, kBitModelTotal
685 xor sym, sym
686
687 ; if (processedPos != 0 || checkDicSize != 0)
688 or x1, processedPos
689 jz @f
690
691 add t0_R, dic
692 cmp dicPos, dic
693 cmovnz t0_R, dicPos
694 movzx sym, byte ptr[t0_R - 1]
695
696@@:
697 IsMatchBranch_Pre
698 cmp state, 4 * PMULT
699 jb lit_end
700 cmp state, kNumLitStates * PMULT
701 jb lit_matched_end
702 jmp lz_end
703
704
705
706
707; ---------- LITERAL ----------
708MY_ALIGN_64
709lit_start:
710 xor state, state
711lit_start_2:
712 LIT_PROBS lpMask_reg
713
714 ifdef _LZMA_SIZE_OPT
715
716 PLOAD x1, probs + 1 * PMULT
717 mov sym, 1
718MY_ALIGN_16
719lit_loop:
720 BIT_1 x1, x2
721 mov x1, x2
722 cmp sym, 127
723 jbe lit_loop
724
725 else
726
727 BIT_0 x1, x2
728 BIT_1 x2, x1
729 BIT_1 x1, x2
730 BIT_1 x2, x1
731 BIT_1 x1, x2
732 BIT_1 x2, x1
733 BIT_1 x1, x2
734
735 endif
736
737 BIT_2 x2, 256 - 1
738
739 ; mov dic, LOC dic_Spec
740 mov probs, LOC probs_Spec
741 IsMatchBranch_Pre
742 mov byte ptr[dicPos], sym_L
743 inc dicPos
744
745 CheckLimits
746lit_end:
747 IF_BIT_0_NOUP probs_state_R, pbPos_R, IsMatch, lit_start
748
749 ; jmp IsMatch_label
750
751; ---------- MATCHES ----------
752; MY_ALIGN_32
753IsMatch_label:
754 UPDATE_1 probs_state_R, pbPos_R, IsMatch
755 IF_BIT_1 probs_state_R, 0, IsRep, IsRep_label
756
757 add probs, LenCoder * PMULT
758 add state, kNumStates * PMULT
759
760; ---------- LEN DECODE ----------
761len_decode:
762 mov len_temp, 8 - 1 - kMatchMinLen
763 IF_BIT_0_NOUP probs, 0, 0, len_mid_0
764 UPDATE_1 probs, 0, 0
765 add probs, (1 SHL (kLenNumLowBits + PSHIFT))
766 mov len_temp, -1 - kMatchMinLen
767 IF_BIT_0_NOUP probs, 0, 0, len_mid_0
768 UPDATE_1 probs, 0, 0
769 add probs, LenHigh * PMULT - (1 SHL (kLenNumLowBits + PSHIFT))
770 mov sym, 1
771 PLOAD x1, probs + 1 * PMULT
772
773MY_ALIGN_32
774len8_loop:
775 BIT_1 x1, x2
776 mov x1, x2
777 cmp sym, 64
778 jb len8_loop
779
780 mov len_temp, (kLenNumHighSymbols - kLenNumLowSymbols * 2) - 1 - kMatchMinLen
781 jmp short len_mid_2 ; we use "short" here because MASM doesn't optimize this jump as other assemblers do
782
783MY_ALIGN_32
784len_mid_0:
785 UPDATE_0 probs, 0, 0
786 add probs, pbPos_R
787 BIT_0 x2, x1
788len_mid_2:
789 BIT_1 x1, x2
790 BIT_2 x2, len_temp
791 mov probs, LOC probs_Spec
792 cmp state, kNumStates * PMULT
793 jb copy_match
794
795
796; ---------- DECODE DISTANCE ----------
797 ; probs + PosSlot + ((len < kNumLenToPosStates ? len : kNumLenToPosStates - 1) << kNumPosSlotBits);
798
799 mov t0, 3 + kMatchMinLen
800 cmp sym, 3 + kMatchMinLen
801 cmovb t0, sym
802 add probs, PosSlot * PMULT - (kMatchMinLen SHL (kNumPosSlotBits + PSHIFT))
803 shl t0, (kNumPosSlotBits + PSHIFT)
804 add probs, t0_R
805
806 ; sym = Len
807 ; mov LOC remainLen, sym
808 mov len_temp, sym
809
810 ifdef _LZMA_SIZE_OPT
811
812 PLOAD x1, probs + 1 * PMULT
813 mov sym, 1
814MY_ALIGN_16
815slot_loop:
816 BIT_1 x1, x2
817 mov x1, x2
818 cmp sym, 32
819 jb slot_loop
820
821 else
822
823 BIT_0 x1, x2
824 BIT_1 x2, x1
825 BIT_1 x1, x2
826 BIT_1 x2, x1
827 BIT_1 x1, x2
828
829 endif
830
831 mov x1, sym
832 BIT_2 x2, 64-1
833
834 and sym, 3
835 mov probs, LOC probs_Spec
836 cmp x1, 32 + kEndPosModelIndex / 2
837 jb short_dist
838
839 ; unsigned numDirectBits = (unsigned)(((distance >> 1) - 1));
840 sub x1, (32 + 1 + kNumAlignBits)
841 ; distance = (2 | (distance & 1));
842 or sym, 2
843 PLOAD x2, probs + 1 * PMULT
844 shl sym, kNumAlignBits + 1
845 lea sym2_R, [probs + 2 * PMULT]
846
847 jmp direct_norm
848 ; lea t1, [sym_R + (1 SHL kNumAlignBits)]
849 ; cmp range, kTopValue
850 ; jb direct_norm
851
852; ---------- DIRECT DISTANCE ----------
853MY_ALIGN_32
854direct_loop:
855 shr range, 1
856 mov t0, cod
857 sub cod, range
858 cmovs cod, t0
859 cmovns sym, t1
860
861 comment ~
862 sub cod, range
863 mov x2, cod
864 sar x2, 31
865 lea sym, dword ptr [r2 + sym_R * 2 + 1]
866 and x2, range
867 add cod, x2
868 ~
869 dec x1
870 je direct_end
871
872 add sym, sym
873direct_norm:
874 lea t1, [sym_R + (1 SHL kNumAlignBits)]
875 cmp range, kTopValue
876 jae near ptr direct_loop
877 ; the "near ptr" in the jump above forces the long encoding, keeping this code 32-byte aligned
878 NORM_2
879 jmp direct_loop
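; For reference, one direct-bits step in C (LzmaDec.c style):
;   range >>= 1;
;   code -= range;
;   t = 0 - ((UInt32)code >> 31);   // t = 0 if code stayed non-negative, else -1
;   code += range & t;
;   distance = (distance << 1) + (t + 1);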
880
881MY_ALIGN_32
882direct_end:
883 ; prob = + kAlign;
884 ; distance <<= kNumAlignBits;
885 REV_0 x2, x1
886 REV_1 x1, x2, 2
887 REV_1 x2, x1, 4
888 REV_2 x1, 8
889
890decode_dist_end:
891
892 ; if (distance >= (checkDicSize == 0 ? processedPos: checkDicSize))
893
894 mov t1, LOC rep0
895 mov x1, LOC rep1
896 mov x2, LOC rep2
897
898 mov t0, LOC checkDicSize
899 test t0, t0
900 cmove t0, processedPos
901 cmp sym, t0
902 jae end_of_payload
903 ; jmp end_of_payload ; for debug
904
905 ; rep3 = rep2;
906 ; rep2 = rep1;
907 ; rep1 = rep0;
908 ; rep0 = distance + 1;
909
910 inc sym
911 mov LOC rep0, sym
912 ; mov sym, LOC remainLen
913 mov sym, len_temp
914 mov LOC rep1, t1
915 mov LOC rep2, x1
916 mov LOC rep3, x2
917
918 ; state = (state < kNumStates + kNumLitStates) ? kNumLitStates : kNumLitStates + 3;
919 cmp state, (kNumStates + kNumLitStates) * PMULT
920 mov state, kNumLitStates * PMULT
921 mov t0, (kNumLitStates + 3) * PMULT
922 cmovae state, t0
923
924
925; ---------- COPY MATCH ----------
926copy_match:
927
928 ; len += kMatchMinLen;
929 ; add sym, kMatchMinLen
930
931 ; if ((rem = limit - dicPos) == 0)
932 ; {
933 ; p->dicPos = dicPos;
934 ; return SZ_ERROR_DATA;
935 ; }
936 mov cnt_R, LOC limit
937 sub cnt_R, dicPos
938 jz fin_dicPos_LIMIT
939
940 ; curLen = ((rem < len) ? (unsigned)rem : len);
941 cmp cnt_R, sym_R
942 ; cmovae cnt_R, sym_R ; 64-bit
943 cmovae cnt, sym ; 32-bit
944
945 mov dic, LOC dic_Spec
946 mov x1, LOC rep0
947
948 mov t0_R, dicPos
949 add dicPos, cnt_R
950 ; processedPos += curLen;
951 add processedPos, cnt
952 ; len -= curLen;
953 sub sym, cnt
954 mov LOC remainLen, sym
955
956 sub t0_R, dic
957
958 ; pos = dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0);
959 sub t0_R, r1
960 jae @f
961
962 mov r1, LOC dicBufSize
963 add t0_R, r1
964 sub r1, t0_R
965 cmp cnt_R, r1
966 ja copy_match_cross
967@@:
968 ; if (curLen <= dicBufSize - pos)
969
970; ---------- COPY MATCH FAST ----------
971 ; Byte *dest = dic + dicPos;
972 ; mov r1, dic
973 ; ptrdiff_t src = (ptrdiff_t)pos - (ptrdiff_t)dicPos;
974 ; sub t0_R, dicPos
975 ; dicPos += curLen;
976
977 ; const Byte *lim = dest + curLen;
978 add t0_R, dic
979 movzx sym, byte ptr[t0_R]
980 add t0_R, cnt_R
981 neg cnt_R
982 ; lea r1, [dicPos - 1]
983copy_common:
984 dec dicPos
985 ; cmp LOC rep0, 1
986 ; je rep0Label
987
988 ; t0_R - src_lim
989 ; r1 - dest_lim - 1
990 ; cnt_R - (-cnt)
991
992 IsMatchBranch_Pre
993 inc cnt_R
994 jz copy_end
995MY_ALIGN_16
996@@:
997 mov byte ptr[cnt_R * 1 + dicPos], sym_L
998 movzx sym, byte ptr[cnt_R * 1 + t0_R]
999 inc cnt_R
1000 jnz @b
1001
1002copy_end:
1003lz_end_match:
1004 mov byte ptr[dicPos], sym_L
1005 inc dicPos
1006
1007 ; IsMatchBranch_Pre
1008 CheckLimits
1009lz_end:
1010 IF_BIT_1_NOUP probs_state_R, pbPos_R, IsMatch, IsMatch_label
1011
1012
1013
1014; ---------- LITERAL MATCHED ----------
1015
1016 LIT_PROBS LOC lpMask
1017
1018 ; matchByte = dic[dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0)];
1019 mov x1, LOC rep0
1020 ; mov dic, LOC dic_Spec
1021 mov LOC dicPos_Spec, dicPos
1022
1023 ; state -= (state < 10) ? 3 : 6;
1024 lea t0, [state_R - 6 * PMULT]
1025 sub state, 3 * PMULT
1026 cmp state, 7 * PMULT
1027 cmovae state, t0
1028
1029 sub dicPos, dic
1030 sub dicPos, r1
1031 jae @f
1032 add dicPos, LOC dicBufSize
1033@@:
1034 comment ~
1035 xor t0, t0
1036 sub dicPos, r1
1037 cmovb t0_R, LOC dicBufSize
1038 ~
1039
1040 movzx match, byte ptr[dic + dicPos * 1]
1041
1042 ifdef _LZMA_SIZE_OPT
1043
1044 mov offs, 256 * PMULT
1045 shl match, (PSHIFT + 1)
1046 mov bit, match
1047 mov sym, 1
1048MY_ALIGN_16
1049litm_loop:
1050 LITM
1051 cmp sym, 256
1052 jb litm_loop
1053 sub sym, 256
1054
1055 else
1056
1057 LITM_0
1058 LITM
1059 LITM
1060 LITM
1061 LITM
1062 LITM
1063 LITM
1064 LITM_2
1065
1066 endif
1067
1068 mov probs, LOC probs_Spec
1069 IsMatchBranch_Pre
1070 ; mov dic, LOC dic_Spec
1071 mov dicPos, LOC dicPos_Spec
1072 mov byte ptr[dicPos], sym_L
1073 inc dicPos
1074
1075 CheckLimits
1076lit_matched_end:
1077 IF_BIT_1_NOUP probs_state_R, pbPos_R, IsMatch, IsMatch_label
1078 ; IsMatchBranch
1079 mov lpMask_reg, LOC lpMask
1080 sub state, 3 * PMULT
1081 jmp lit_start_2
1082
1083
1084
1085; ---------- REP 0 LITERAL ----------
1086MY_ALIGN_32
1087IsRep0Short_label:
1088 UPDATE_0 probs_state_R, pbPos_R, IsRep0Long
1089
1090 ; dic[dicPos] = dic[dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0)];
1091 mov dic, LOC dic_Spec
1092 mov t0_R, dicPos
1093 mov probBranch, LOC rep0
1094 sub t0_R, dic
1095
1096 sub probs, RepLenCoder * PMULT
1097
1098 ; state = state < kNumLitStates ? 9 : 11;
1099 or state, 1 * PMULT
1100
1101 ; the caller doesn't allow the (dicPos >= limit) case for REP_SHORT,
1102 ; so we don't need the following (dicPos == limit) check here:
1103 ; cmp dicPos, LOC limit
1104 ; jae fin_dicPos_LIMIT_REP_SHORT
1105
1106 inc processedPos
1107
1108 IsMatchBranch_Pre
1109
1110; xor sym, sym
1111; sub t0_R, probBranch_R
1112; cmovb sym_R, LOC dicBufSize
1113; add t0_R, sym_R
1114 sub t0_R, probBranch_R
1115 jae @f
1116 add t0_R, LOC dicBufSize
1117@@:
1118 movzx sym, byte ptr[dic + t0_R * 1]
1119 jmp lz_end_match
1120
1121
1122MY_ALIGN_32
1123IsRep_label:
1124 UPDATE_1 probs_state_R, 0, IsRep
1125
1126 ; The (checkDicSize == 0 && processedPos == 0) case was checked before in LzmaDec.c with kBadRepCode.
1127 ; So we don't check it here.
1128
1129 ; mov t0, processedPos
1130 ; or t0, LOC checkDicSize
1131 ; jz fin_ERROR_2
1132
1133 ; state = state < kNumLitStates ? 8 : 11;
1134 cmp state, kNumLitStates * PMULT
1135 mov state, 8 * PMULT
1136 mov probBranch, 11 * PMULT
1137 cmovae state, probBranch
1138
1139 ; prob = probs + RepLenCoder;
1140 add probs, RepLenCoder * PMULT
1141
1142 IF_BIT_1 probs_state_R, 0, IsRepG0, IsRepG0_label
1143 IF_BIT_0_NOUP probs_state_R, pbPos_R, IsRep0Long, IsRep0Short_label
1144 UPDATE_1 probs_state_R, pbPos_R, IsRep0Long
1145 jmp len_decode
1146
1147MY_ALIGN_32
1148IsRepG0_label:
1149 UPDATE_1 probs_state_R, 0, IsRepG0
1150 mov dist2, LOC rep0
1151 mov dist, LOC rep1
1152 mov LOC rep1, dist2
1153
1154 IF_BIT_1 probs_state_R, 0, IsRepG1, IsRepG1_label
1155 mov LOC rep0, dist
1156 jmp len_decode
1157
1158; MY_ALIGN_32
1159IsRepG1_label:
1160 UPDATE_1 probs_state_R, 0, IsRepG1
1161 mov dist2, LOC rep2
1162 mov LOC rep2, dist
1163
1164 IF_BIT_1 probs_state_R, 0, IsRepG2, IsRepG2_label
1165 mov LOC rep0, dist2
1166 jmp len_decode
1167
1168; MY_ALIGN_32
1169IsRepG2_label:
1170 UPDATE_1 probs_state_R, 0, IsRepG2
1171 mov dist, LOC rep3
1172 mov LOC rep3, dist2
1173 mov LOC rep0, dist
1174 jmp len_decode
1175
1176
1177
1178; ---------- SPEC SHORT DISTANCE ----------
1179
1180MY_ALIGN_32
1181short_dist:
1182 sub x1, 32 + 1
1183 jbe decode_dist_end
1184 or sym, 2
1185 shl sym, x1_L
1186 lea sym_R, [probs + sym_R * PMULT + SpecPos * PMULT + 1 * PMULT]
1187 mov sym2, PMULT ; step
1188MY_ALIGN_32
1189spec_loop:
1190 REV_1_VAR x2
1191 dec x1
1192 jnz spec_loop
1193
1194 mov probs, LOC probs_Spec
1195 sub sym, sym2
1196 sub sym, SpecPos * PMULT
1197 sub sym_R, probs
1198 shr sym, PSHIFT
1199
1200 jmp decode_dist_end
1201
1202
1203; ---------- COPY MATCH CROSS ----------
1204copy_match_cross:
1205 ; t0_R - src pos
1206 ; r1 - len to dicBufSize
1207 ; cnt_R - total copy len
1208
1209 mov t1_R, t0_R ; srcPos
1210 mov t0_R, dic
1211 mov r1, LOC dicBufSize ;
1212 neg cnt_R
1213@@:
1214 movzx sym, byte ptr[t1_R * 1 + t0_R]
1215 inc t1_R
1216 mov byte ptr[cnt_R * 1 + dicPos], sym_L
1217 inc cnt_R
1218 cmp t1_R, r1
1219 jne @b
1220
1221 movzx sym, byte ptr[t0_R]
1222 sub t0_R, cnt_R
1223 jmp copy_common
1224
1225
1226
1227
1228; fin_dicPos_LIMIT_REP_SHORT:
1229 ; mov sym, 1
1230
1231fin_dicPos_LIMIT:
1232 mov LOC remainLen, sym
1233 jmp fin_OK
1234 ; In a stricter mode we could stop decoding with an error:
1235 ; mov sym, 1
1236 ; jmp fin
1237
1238
1239fin_ERROR_MATCH_DIST:
1240
1241 ; rep3 = rep2;
1242 ; rep2 = rep1;
1243 ; rep1 = rep0;
1244 ; rep0 = distance + 1;
1245
1246 add len_temp, kMatchSpecLen_Error_Data
1247 mov LOC remainLen, len_temp
1248
1249 mov LOC rep0, sym
1250 mov LOC rep1, t1
1251 mov LOC rep2, x1
1252 mov LOC rep3, x2
1253
1254 ; state = (state < kNumStates + kNumLitStates) ? kNumLitStates : kNumLitStates + 3;
1255 cmp state, (kNumStates + kNumLitStates) * PMULT
1256 mov state, kNumLitStates * PMULT
1257 mov t0, (kNumLitStates + 3) * PMULT
1258 cmovae state, t0
1259
1260 ; jmp fin_OK
1261 mov sym, 1
1262 jmp fin
1263
1264end_of_payload:
1265 inc sym
1266 jnz fin_ERROR_MATCH_DIST
1267
1268 mov LOC remainLen, kMatchSpecLenStart
1269 sub state, kNumStates * PMULT
1270
1271fin_OK:
1272 xor sym, sym
1273
1274fin:
1275 NORM
1276
1277 mov r1, LOC lzmaPtr
1278
1279 sub dicPos, LOC dic_Spec
1280 mov GLOB dicPos_Spec, dicPos
1281 mov GLOB buf_Spec, buf
1282 mov GLOB range_Spec, range
1283 mov GLOB code_Spec, cod
1284 shr state, PSHIFT
1285 mov GLOB state_Spec, state
1286 mov GLOB processedPos_Spec, processedPos
1287
1288 RESTORE_VAR(remainLen)
1289 RESTORE_VAR(rep0)
1290 RESTORE_VAR(rep1)
1291 RESTORE_VAR(rep2)
1292 RESTORE_VAR(rep3)
1293
1294 mov x0, sym
1295
1296 mov RSP, LOC Old_RSP
1297
1298MY_POP_PRESERVED_ABI_REGS
1299MY_ENDP
1300
1301_TEXT$LZMADECOPT ENDS
1302
1303end
diff --git a/Asm/x86/Sha1Opt.asm b/Asm/x86/Sha1Opt.asm
new file mode 100644
index 0000000..3495fd1
--- /dev/null
+++ b/Asm/x86/Sha1Opt.asm
@@ -0,0 +1,263 @@
1; Sha1Opt.asm -- SHA-1 code optimized for the x86 SHA-1 hardware instructions
2; 2021-03-10 : Igor Pavlov : Public domain
3
4include 7zAsm.asm
5
6MY_ASM_START
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23CONST SEGMENT
24
25align 16
26Reverse_Endian_Mask db 15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49CONST ENDS
50
51; _TEXT$SHA1OPT SEGMENT 'CODE'
52
53ifndef x64
54 .686
55 .xmm
56endif
57
58ifdef x64
59 rNum equ REG_ABI_PARAM_2
60 if (IS_LINUX eq 0)
61 LOCAL_SIZE equ (16 * 2)
62 endif
63else
64 rNum equ r0
65 LOCAL_SIZE equ (16 * 1)
66endif
67
68rState equ REG_ABI_PARAM_0
69rData equ REG_ABI_PARAM_1
70
71
72MY_sha1rnds4 macro a1, a2, imm
73 db 0fH, 03aH, 0ccH, (0c0H + a1 * 8 + a2), imm
74endm
75
76MY_SHA_INSTR macro cmd, a1, a2
77 db 0fH, 038H, cmd, (0c0H + a1 * 8 + a2)
78endm
79
80cmd_sha1nexte equ 0c8H
81cmd_sha1msg1 equ 0c9H
82cmd_sha1msg2 equ 0caH
83
84MY_sha1nexte macro a1, a2
85 MY_SHA_INSTR cmd_sha1nexte, a1, a2
86endm
87
88MY_sha1msg1 macro a1, a2
89 MY_SHA_INSTR cmd_sha1msg1, a1, a2
90endm
91
92MY_sha1msg2 macro a1, a2
93 MY_SHA_INSTR cmd_sha1msg2, a1, a2
94endm
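; The db encodings above emit the SHA-NI instructions directly
; (NP 0F 3A CC /r ib = sha1rnds4; NP 0F 38 C8/C9/CA /r = sha1nexte/msg1/msg2)
; for assemblers that predate SHA support. Intrinsic equivalents, for reference:
;   MY_sha1rnds4 a, b, i  ->  a = _mm_sha1rnds4_epu32(a, b, i);
;   MY_sha1nexte a, b     ->  a = _mm_sha1nexte_epu32(a, b);
;   MY_sha1msg1  a, b     ->  a = _mm_sha1msg1_epu32(a, b);
;   MY_sha1msg2  a, b     ->  a = _mm_sha1msg2_epu32(a, b);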
95
96MY_PROLOG macro
97 ifdef x64
98 if (IS_LINUX eq 0)
99 movdqa [r4 + 8], xmm6
100 movdqa [r4 + 8 + 16], xmm7
101 sub r4, LOCAL_SIZE + 8
102 movdqa [r4 ], xmm8
103 movdqa [r4 + 16], xmm9
104 endif
105 else ; x86
106 if (IS_CDECL gt 0)
107 mov rState, [r4 + REG_SIZE * 1]
108 mov rData, [r4 + REG_SIZE * 2]
109 mov rNum, [r4 + REG_SIZE * 3]
110 else ; fastcall
111 mov rNum, [r4 + REG_SIZE * 1]
112 endif
113 push r5
114 mov r5, r4
115 and r4, -16
116 sub r4, LOCAL_SIZE
117 endif
118endm
119
120MY_EPILOG macro
121 ifdef x64
122 if (IS_LINUX eq 0)
123 movdqa xmm8, [r4]
124 movdqa xmm9, [r4 + 16]
125 add r4, LOCAL_SIZE + 8
126 movdqa xmm6, [r4 + 8]
127 movdqa xmm7, [r4 + 8 + 16]
128 endif
129 else ; x86
130 mov r4, r5
131 pop r5
132 endif
133 MY_ENDP
134endm
135
136
137e0_N equ 0
138e1_N equ 1
139abcd_N equ 2
140e0_save_N equ 3
141w_regs equ 4
142
143e0 equ @CatStr(xmm, %e0_N)
144e1 equ @CatStr(xmm, %e1_N)
145abcd equ @CatStr(xmm, %abcd_N)
146e0_save equ @CatStr(xmm, %e0_save_N)
147
148
149ifdef x64
150 abcd_save equ xmm8
151 mask2 equ xmm9
152else
153 abcd_save equ [r4]
154 mask2 equ e1
155endif
156
157LOAD_MASK macro
158 movdqa mask2, XMMWORD PTR Reverse_Endian_Mask
159endm
160
161LOAD_W macro k:req
162 movdqu @CatStr(xmm, %(w_regs + k)), [rData + (16 * (k))]
163 pshufb @CatStr(xmm, %(w_regs + k)), mask2
164endm
165
166
167 ; pre2 can be 2 or 3 (3 is recommended)
168pre2 equ 3
169pre1 equ (pre2 + 1)
170
171NUM_ROUNDS4 equ 20
172
173RND4 macro k
174 movdqa @CatStr(xmm, %(e0_N + ((k + 1) mod 2))), abcd
175 MY_sha1rnds4 abcd_N, (e0_N + (k mod 2)), k / 5
176
177 nextM = (w_regs + ((k + 1) mod 4))
178
179 if (k EQ NUM_ROUNDS4 - 1)
180 nextM = e0_save_N
181 endif
182
183 MY_sha1nexte (e0_N + ((k + 1) mod 2)), nextM
184
185 if (k GE (4 - pre2)) AND (k LT (NUM_ROUNDS4 - pre2))
186 pxor @CatStr(xmm, %(w_regs + ((k + pre2) mod 4))), @CatStr(xmm, %(w_regs + ((k + pre2 - 2) mod 4)))
187 endif
188
189 if (k GE (4 - pre1)) AND (k LT (NUM_ROUNDS4 - pre1))
190 MY_sha1msg1 (w_regs + ((k + pre1) mod 4)), (w_regs + ((k + pre1 - 3) mod 4))
191 endif
192
193 if (k GE (4 - pre2)) AND (k LT (NUM_ROUNDS4 - pre2))
194 MY_sha1msg2 (w_regs + ((k + pre2) mod 4)), (w_regs + ((k + pre2 - 1) mod 4))
195 endif
196endm
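; For reference, a rough intrinsics sketch of one RND4 step (illustrative
; names; e0/e1 alternate as e_cur/e_next with k):
;   e_next = abcd;                                     // saved a, rotated by sha1nexte
;   abcd   = _mm_sha1rnds4_epu32(abcd, e_cur, k / 5);  // 4 rounds, f-function k/5
;   e_next = _mm_sha1nexte_epu32(e_next, w[(k + 1) & 3]);
; interleaved with the pipelined message schedule:
;   w_new = _mm_sha1msg2_epu32(
;             _mm_xor_si128(_mm_sha1msg1_epu32(w0, w1), w2), w3);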
197
198
199REVERSE_STATE macro
200 ; abcd ; dcba
201 ; e0 ; 000e
202 pshufd abcd, abcd, 01bH ; abcd
203 pshufd e0, e0, 01bH ; e000
204endm
205
206
207
208
209
210MY_PROC Sha1_UpdateBlocks_HW, 3
211 MY_PROLOG
212
213 cmp rNum, 0
214 je end_c
215
216 movdqu abcd, [rState] ; dcba
217 movd e0, dword ptr [rState + 16] ; 000e
218
219 REVERSE_STATE
220
221 ifdef x64
222 LOAD_MASK
223 endif
224
225 align 16
226 nextBlock:
227 movdqa abcd_save, abcd
228 movdqa e0_save, e0
229
230 ifndef x64
231 LOAD_MASK
232 endif
233
234 LOAD_W 0
235 LOAD_W 1
236 LOAD_W 2
237 LOAD_W 3
238
239 paddd e0, @CatStr(xmm, %(w_regs))
240 k = 0
241 rept NUM_ROUNDS4
242 RND4 k
243 k = k + 1
244 endm
245
246 paddd abcd, abcd_save
247
248
249 add rData, 64
250 sub rNum, 1
251 jnz nextBlock
252
253 REVERSE_STATE
254
255 movdqu [rState], abcd
256 movd dword ptr [rState + 16], e0
257
258 end_c:
259MY_EPILOG
260
261; _TEXT$SHA1OPT ENDS
262
263end
diff --git a/Asm/x86/Sha256Opt.asm b/Asm/x86/Sha256Opt.asm
new file mode 100644
index 0000000..5d02c90
--- /dev/null
+++ b/Asm/x86/Sha256Opt.asm
@@ -0,0 +1,263 @@
1; Sha256Opt.asm -- SHA-256 code optimized for the x86 SHA-256 hardware instructions
2; 2021-03-10 : Igor Pavlov : Public domain
3
4include 7zAsm.asm
5
6MY_ASM_START
7
8; .data
9; public K
10
11; we can use the external SHA256_K_ARRAY defined in Sha256.c,
12; but we must guarantee that SHA256_K_ARRAY is aligned to 16 bytes
13
14COMMENT @
15ifdef x64
16K_CONST equ SHA256_K_ARRAY
17else
18K_CONST equ _SHA256_K_ARRAY
19endif
20EXTRN K_CONST:xmmword
21@
22
23CONST SEGMENT
24
25align 16
26Reverse_Endian_Mask db 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12
27
28; COMMENT @
29align 16
30K_CONST \
31DD 0428a2f98H, 071374491H, 0b5c0fbcfH, 0e9b5dba5H
32DD 03956c25bH, 059f111f1H, 0923f82a4H, 0ab1c5ed5H
33DD 0d807aa98H, 012835b01H, 0243185beH, 0550c7dc3H
34DD 072be5d74H, 080deb1feH, 09bdc06a7H, 0c19bf174H
35DD 0e49b69c1H, 0efbe4786H, 00fc19dc6H, 0240ca1ccH
36DD 02de92c6fH, 04a7484aaH, 05cb0a9dcH, 076f988daH
37DD 0983e5152H, 0a831c66dH, 0b00327c8H, 0bf597fc7H
38DD 0c6e00bf3H, 0d5a79147H, 006ca6351H, 014292967H
39DD 027b70a85H, 02e1b2138H, 04d2c6dfcH, 053380d13H
40DD 0650a7354H, 0766a0abbH, 081c2c92eH, 092722c85H
41DD 0a2bfe8a1H, 0a81a664bH, 0c24b8b70H, 0c76c51a3H
42DD 0d192e819H, 0d6990624H, 0f40e3585H, 0106aa070H
43DD 019a4c116H, 01e376c08H, 02748774cH, 034b0bcb5H
44DD 0391c0cb3H, 04ed8aa4aH, 05b9cca4fH, 0682e6ff3H
45DD 0748f82eeH, 078a5636fH, 084c87814H, 08cc70208H
46DD 090befffaH, 0a4506cebH, 0bef9a3f7H, 0c67178f2H
47; @
48
49CONST ENDS
50
51; _TEXT$SHA256OPT SEGMENT 'CODE'
52
53ifndef x64
54 .686
55 .xmm
56endif
57
58ifdef x64
59 rNum equ REG_ABI_PARAM_2
60 if (IS_LINUX eq 0)
61 LOCAL_SIZE equ (16 * 2)
62 endif
63else
64 rNum equ r0
65 LOCAL_SIZE equ (16 * 1)
66endif
67
68rState equ REG_ABI_PARAM_0
69rData equ REG_ABI_PARAM_1
70
71
72
73
74
75
76MY_SHA_INSTR macro cmd, a1, a2
77 db 0fH, 038H, cmd, (0c0H + a1 * 8 + a2)
78endm
79
80cmd_sha256rnds2 equ 0cbH
81cmd_sha256msg1 equ 0ccH
82cmd_sha256msg2 equ 0cdH
83
84MY_sha256rnds2 macro a1, a2
85 MY_SHA_INSTR cmd_sha256rnds2, a1, a2
86endm
87
88MY_sha256msg1 macro a1, a2
89 MY_SHA_INSTR cmd_sha256msg1, a1, a2
90endm
91
92MY_sha256msg2 macro a1, a2
93 MY_SHA_INSTR cmd_sha256msg2, a1, a2
94endm
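; Intrinsic equivalents of the hand-encoded instructions above, for reference
; (SHA256RNDS2 = NP 0F 38 CB /r, with XMM0 as the implicit W+K operand):
;   MY_sha256rnds2 a, b  ->  a = _mm_sha256rnds2_epu32(a, b, msg /* xmm0 */);
;   MY_sha256msg1  a, b  ->  a = _mm_sha256msg1_epu32(a, b);
;   MY_sha256msg2  a, b  ->  a = _mm_sha256msg2_epu32(a, b);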
95
96MY_PROLOG macro
97 ifdef x64
98 if (IS_LINUX eq 0)
99 movdqa [r4 + 8], xmm6
100 movdqa [r4 + 8 + 16], xmm7
101 sub r4, LOCAL_SIZE + 8
102 movdqa [r4 ], xmm8
103 movdqa [r4 + 16], xmm9
104 endif
105 else ; x86
106 if (IS_CDECL gt 0)
107 mov rState, [r4 + REG_SIZE * 1]
108 mov rData, [r4 + REG_SIZE * 2]
109 mov rNum, [r4 + REG_SIZE * 3]
110 else ; fastcall
111 mov rNum, [r4 + REG_SIZE * 1]
112 endif
113 push r5
114 mov r5, r4
115 and r4, -16
116 sub r4, LOCAL_SIZE
117 endif
118endm
119
120MY_EPILOG macro
121 ifdef x64
122 if (IS_LINUX eq 0)
123 movdqa xmm8, [r4]
124 movdqa xmm9, [r4 + 16]
125 add r4, LOCAL_SIZE + 8
126 movdqa xmm6, [r4 + 8]
127 movdqa xmm7, [r4 + 8 + 16]
128 endif
129 else ; x86
130 mov r4, r5
131 pop r5
132 endif
133 MY_ENDP
134endm
135
136
137msg equ xmm0
138tmp equ xmm0
139state0_N equ 2
140state1_N equ 3
141w_regs equ 4
142
143
144state1_save equ xmm1
145state0 equ @CatStr(xmm, %state0_N)
146state1 equ @CatStr(xmm, %state1_N)
147
148
149ifdef x64
150 state0_save equ xmm8
151 mask2 equ xmm9
152else
153 state0_save equ [r4]
154 mask2 equ xmm0
155endif
156
157LOAD_MASK macro
158 movdqa mask2, XMMWORD PTR Reverse_Endian_Mask
159endm
160
161LOAD_W macro k:req
162 movdqu @CatStr(xmm, %(w_regs + k)), [rData + (16 * (k))]
163 pshufb @CatStr(xmm, %(w_regs + k)), mask2
164endm
165
166
167; pre1 <= 4 && pre2 >= 1 && pre1 > pre2 && (pre1 - pre2) <= 1
168pre1 equ 3
169pre2 equ 2
170
171
172
173RND4 macro k
174 movdqa msg, xmmword ptr [K_CONST + (k) * 16]
175 paddd msg, @CatStr(xmm, %(w_regs + ((k + 0) mod 4)))
176 MY_sha256rnds2 state0_N, state1_N
177 pshufd msg, msg, 0eH
178
179 if (k GE (4 - pre1)) AND (k LT (16 - pre1))
180 ; w4[0] = msg1(w4[-4], w4[-3])
181 MY_sha256msg1 (w_regs + ((k + pre1) mod 4)), (w_regs + ((k + pre1 - 3) mod 4))
182 endif
183
184 MY_sha256rnds2 state1_N, state0_N
185
186 if (k GE (4 - pre2)) AND (k LT (16 - pre2))
187 movdqa tmp, @CatStr(xmm, %(w_regs + ((k + pre2 - 1) mod 4)))
188 palignr tmp, @CatStr(xmm, %(w_regs + ((k + pre2 - 2) mod 4))), 4
189 paddd @CatStr(xmm, %(w_regs + ((k + pre2) mod 4))), tmp
190 ; w4[0] = msg2(w4[0], w4[-1])
191 MY_sha256msg2 %(w_regs + ((k + pre2) mod 4)), %(w_regs + ((k + pre2 - 1) mod 4))
192 endif
193endm
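; For reference, a rough intrinsics sketch of one RND4 step (illustrative names):
;   msg    = _mm_add_epi32(w[k & 3], K[k]);               // W + K for 4 rounds
;   state0 = _mm_sha256rnds2_epu32(state0, state1, msg);  // rounds 4k, 4k+1
;   msg    = _mm_shuffle_epi32(msg, 0x0e);                // high half of W + K
;   state1 = _mm_sha256rnds2_epu32(state1, state0, msg);  // rounds 4k+2, 4k+3
; interleaved with the message schedule:
;   w_new = _mm_sha256msg2_epu32(
;             _mm_add_epi32(_mm_sha256msg1_epu32(w0, w1),
;                           _mm_alignr_epi8(w3, w2, 4)), w3);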
194
195
196
197
198
199REVERSE_STATE macro
200 ; state0 ; dcba
201 ; state1 ; hgfe
202 pshufd tmp, state0, 01bH ; abcd
203 pshufd state0, state1, 01bH ; efgh
204 movdqa state1, state0 ; efgh
205 punpcklqdq state0, tmp ; cdgh
206 punpckhqdq state1, tmp ; abef
207endm
208
209
210MY_PROC Sha256_UpdateBlocks_HW, 3
211 MY_PROLOG
212
213 cmp rNum, 0
214 je end_c
215
216 movdqu state0, [rState] ; dcba
217 movdqu state1, [rState + 16] ; hgfe
218
219 REVERSE_STATE
220
221 ifdef x64
222 LOAD_MASK
223 endif
224
225 align 16
226 nextBlock:
227 movdqa state0_save, state0
228 movdqa state1_save, state1
229
230 ifndef x64
231 LOAD_MASK
232 endif
233
234 LOAD_W 0
235 LOAD_W 1
236 LOAD_W 2
237 LOAD_W 3
238
239
240 k = 0
241 rept 16
242 RND4 k
243 k = k + 1
244 endm
245
246 paddd state0, state0_save
247 paddd state1, state1_save
248
249 add rData, 64
250 sub rNum, 1
251 jnz nextBlock
252
253 REVERSE_STATE
254
255 movdqu [rState], state0
256 movdqu [rState + 16], state1
257
258 end_c:
259MY_EPILOG
260
261; _TEXT$SHA256OPT ENDS
262
263end
diff --git a/Asm/x86/XzCrc64Opt.asm b/Asm/x86/XzCrc64Opt.asm
new file mode 100644
index 0000000..ad22cc2
--- /dev/null
+++ b/Asm/x86/XzCrc64Opt.asm
@@ -0,0 +1,239 @@
1; XzCrc64Opt.asm -- CRC64 calculation : optimized version
2; 2021-02-06 : Igor Pavlov : Public domain
3
4include 7zAsm.asm
5
6MY_ASM_START
7
8ifdef x64
9
10rD equ r9
11rN equ r10
12rT equ r5
13num_VAR equ r8
14
15SRCDAT4 equ dword ptr [rD + rN * 1]
16
17CRC_XOR macro dest:req, src:req, t:req
18 xor dest, QWORD PTR [rT + src * 8 + 0800h * t]
19endm
20
21CRC1b macro
22 movzx x6, BYTE PTR [rD]
23 inc rD
24 movzx x3, x0_L
25 xor x6, x3
26 shr r0, 8
27 CRC_XOR r0, r6, 0
28 dec rN
29endm
30
31MY_PROLOG macro crc_end:req
32 ifdef ABI_LINUX
33 MY_PUSH_2_REGS
34 else
35 MY_PUSH_4_REGS
36 endif
37 mov r0, REG_ABI_PARAM_0
38 mov rN, REG_ABI_PARAM_2
39 mov rT, REG_ABI_PARAM_3
40 mov rD, REG_ABI_PARAM_1
41 test rN, rN
42 jz crc_end
43 @@:
44 test rD, 3
45 jz @F
46 CRC1b
47 jnz @B
48 @@:
49 cmp rN, 8
50 jb crc_end
51 add rN, rD
52 mov num_VAR, rN
53 sub rN, 4
54 and rN, NOT 3
55 sub rD, rN
56 mov x1, SRCDAT4
57 xor r0, r1
58 add rN, 4
59endm
60
61MY_EPILOG macro crc_end:req
62 sub rN, 4
63 mov x1, SRCDAT4
64 xor r0, r1
65 mov rD, rN
66 mov rN, num_VAR
67 sub rN, rD
68 crc_end:
69 test rN, rN
70 jz @F
71 CRC1b
72 jmp crc_end
73 @@:
74 ifdef ABI_LINUX
75 MY_POP_2_REGS
76 else
77 MY_POP_4_REGS
78 endif
79endm
80
81MY_PROC XzCrc64UpdateT4, 4
82 MY_PROLOG crc_end_4
83 align 16
84 main_loop_4:
85 mov x1, SRCDAT4
86 movzx x2, x0_L
87 movzx x3, x0_H
88 shr r0, 16
89 movzx x6, x0_L
90 movzx x7, x0_H
91 shr r0, 16
92 CRC_XOR r1, r2, 3
93 CRC_XOR r0, r3, 2
94 CRC_XOR r1, r6, 1
95 CRC_XOR r0, r7, 0
96 xor r0, r1
97
98 add rD, 4
99 jnz main_loop_4
100
101 MY_EPILOG crc_end_4
102MY_ENDP
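; For reference, main_loop_4 above is slicing-by-4 CRC64; roughly, in C
; (the data dword was pre-xored into the low half of the CRC by the previous
; iteration / MY_PROLOG, and each table slice holds 0x100 UInt64 entries):
;   UInt32 d = (UInt32)crc;             // low 32 bits, already data ^ crc
;   crc = next4bytes
;       ^ table[0x300 + ( d        & 0xFF)]
;       ^ table[0x200 + ((d >>  8) & 0xFF)]
;       ^ table[0x100 + ((d >> 16) & 0xFF)]
;       ^ table[0x000 + ( d >> 24        )]
;       ^ (crc >> 32);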
103
104else
105; x86 (32-bit)
106
107rD equ r1
108rN equ r7
109rT equ r5
110
111crc_OFFS equ (REG_SIZE * 5)
112
113if (IS_CDECL gt 0) or (IS_LINUX gt 0)
114 ; cdecl or (GNU fastcall) stack:
115 ; (UInt32 *) table
116 ; size_t size
117 ; void * data
118 ; (UInt64) crc
119 ; ret-ip <-(r4)
120 data_OFFS equ (8 + crc_OFFS)
121 size_OFFS equ (REG_SIZE + data_OFFS)
122 table_OFFS equ (REG_SIZE + size_OFFS)
123 num_VAR equ [r4 + size_OFFS]
124 table_VAR equ [r4 + table_OFFS]
125else
126 ; Windows fastcall:
127 ; r1 = data, r2 = size
128 ; stack:
129 ; (UInt32 *) table
130 ; (UInt64) crc
131 ; ret-ip <-(r4)
132 table_OFFS equ (8 + crc_OFFS)
133 table_VAR equ [r4 + table_OFFS]
134 num_VAR equ table_VAR
135endif
136
137SRCDAT4 equ dword ptr [rD + rN * 1]
138
139CRC macro op0:req, op1:req, dest0:req, dest1:req, src:req, t:req
140 op0 dest0, DWORD PTR [rT + src * 8 + 0800h * t]
141 op1 dest1, DWORD PTR [rT + src * 8 + 0800h * t + 4]
142endm
143
144CRC_XOR macro dest0:req, dest1:req, src:req, t:req
145 CRC xor, xor, dest0, dest1, src, t
146endm
147
148
149CRC1b macro
150 movzx x6, BYTE PTR [rD]
151 inc rD
152 movzx x3, x0_L
153 xor x6, x3
154 shrd r0, r2, 8
155 shr r2, 8
156 CRC_XOR r0, r2, r6, 0
157 dec rN
158endm
159
160MY_PROLOG macro crc_end:req
161 MY_PUSH_4_REGS
162
163 if (IS_CDECL gt 0) or (IS_LINUX gt 0)
164 proc_numParams = proc_numParams + 2 ; for ABI_LINUX
165 mov rN, [r4 + size_OFFS]
166 mov rD, [r4 + data_OFFS]
167 else
168 mov rN, r2
169 endif
170
171 mov x0, [r4 + crc_OFFS]
172 mov x2, [r4 + crc_OFFS + 4]
173 mov rT, table_VAR
174 test rN, rN
175 jz crc_end
176 @@:
177 test rD, 3
178 jz @F
179 CRC1b
180 jnz @B
181 @@:
182 cmp rN, 8
183 jb crc_end
184 add rN, rD
185
186 mov num_VAR, rN
187
188 sub rN, 4
189 and rN, NOT 3
190 sub rD, rN
191 xor r0, SRCDAT4
192 add rN, 4
193endm
194
195MY_EPILOG macro crc_end:req
196 sub rN, 4
197 xor r0, SRCDAT4
198
199 mov rD, rN
200 mov rN, num_VAR
201 sub rN, rD
202 crc_end:
203 test rN, rN
204 jz @F
205 CRC1b
206 jmp crc_end
207 @@:
208 MY_POP_4_REGS
209endm
210
211MY_PROC XzCrc64UpdateT4, 5
212 MY_PROLOG crc_end_4
213 movzx x6, x0_L
214 align 16
215 main_loop_4:
216 mov r3, SRCDAT4
217 xor r3, r2
218
219 CRC xor, mov, r3, r2, r6, 3
220 movzx x6, x0_H
221 shr r0, 16
222 CRC_XOR r3, r2, r6, 2
223
224 movzx x6, x0_L
225 movzx x0, x0_H
226 CRC_XOR r3, r2, r6, 1
227 CRC_XOR r3, r2, r0, 0
228 movzx x6, x3_L
229 mov r0, r3
230
231 add rD, 4
232 jnz main_loop_4
233
234 MY_EPILOG crc_end_4
235MY_ENDP
236
237endif ; ! x64
238
239end