summaryrefslogtreecommitdiff
path: root/contrib/amd64
diff options
context:
space:
mode:
authorMark Adler <madler@alumni.caltech.edu>2011-09-09 23:26:40 -0700
committerMark Adler <madler@alumni.caltech.edu>2011-09-09 23:26:40 -0700
commitf6194ef39af5864f792412460c354cc339dde7d1 (patch)
tree5ea1e6849128e9b2194c66ee3d82afa36b4ac07c /contrib/amd64
parent639be997883d9016baaf46017a2802b2ce1698bd (diff)
downloadzlib-1.2.3.4.tar.gz
zlib-1.2.3.4.tar.bz2
zlib-1.2.3.4.zip
zlib 1.2.3.4v1.2.3.4
Diffstat (limited to 'contrib/amd64')
-rw-r--r--contrib/amd64/amd64-match.S357
1 files changed, 357 insertions, 0 deletions
diff --git a/contrib/amd64/amd64-match.S b/contrib/amd64/amd64-match.S
new file mode 100644
index 0000000..b3bf1ac
--- /dev/null
+++ b/contrib/amd64/amd64-match.S
@@ -0,0 +1,357 @@
1/*
2 * match.S -- optimized version of longest_match()
3 * based on the similar work by Gilles Vollant, and Brian Raiter, written 1998
4 *
5 * This is free software; you can redistribute it and/or modify it
6 * under the terms of the BSD License. Use by owners of Che Guevarra
7 * parafernalia is prohibited, where possible, and highly discouraged
8 * elsewhere.
9 */
10
11#ifndef NO_UNDERLINE
12# define match_init _match_init
13# define longest_match _longest_match
14#endif
15
16#define scanend ebx
17#define scanendw bx
18#define chainlenwmask edx /* high word: current chain len low word: s->wmask */
19#define curmatch rsi
20#define curmatchd esi
21#define windowbestlen r8
22#define scanalign r9
23#define scanalignd r9d
24#define window r10
25#define bestlen r11
26#define bestlend r11d
27#define scanstart r12d
28#define scanstartw r12w
29#define scan r13
30#define nicematch r14d
31#define limit r15
32#define limitd r15d
33#define prev rcx
34
35/*
36 * The 258 is a "magic number, not a parameter -- changing it
37 * breaks the hell loose
38 */
39#define MAX_MATCH (258)
40#define MIN_MATCH (3)
41#define MIN_LOOKAHEAD (MAX_MATCH + MIN_MATCH + 1)
42#define MAX_MATCH_8 ((MAX_MATCH + 7) & ~7)
43
44/* stack frame offsets */
45#define LocalVarsSize (112)
46#define _chainlenwmask ( 8-LocalVarsSize)(%rsp)
47#define _windowbestlen (16-LocalVarsSize)(%rsp)
48#define save_r14 (24-LocalVarsSize)(%rsp)
49#define save_rsi (32-LocalVarsSize)(%rsp)
50#define save_rbx (40-LocalVarsSize)(%rsp)
51#define save_r12 (56-LocalVarsSize)(%rsp)
52#define save_r13 (64-LocalVarsSize)(%rsp)
53#define save_r15 (80-LocalVarsSize)(%rsp)
54
55/*
56 * On AMD64 the first argument of a function (in our case -- the pointer to
57 * deflate_state structure) is passed in %rdi, hence our offsets below are
58 * all off of that.
59 */
60#ifndef STRUCT_OFFSET
61# define STRUCT_OFFSET (0)
62#endif
63#define dsWSize ( 56 + STRUCT_OFFSET)(%rdi)
64#define dsWMask ( 64 + STRUCT_OFFSET)(%rdi)
65#define dsWindow ( 72 + STRUCT_OFFSET)(%rdi)
66#define dsPrev ( 88 + STRUCT_OFFSET)(%rdi)
67#define dsMatchLen (136 + STRUCT_OFFSET)(%rdi)
68#define dsPrevMatch (140 + STRUCT_OFFSET)(%rdi)
69#define dsStrStart (148 + STRUCT_OFFSET)(%rdi)
70#define dsMatchStart (152 + STRUCT_OFFSET)(%rdi)
71#define dsLookahead (156 + STRUCT_OFFSET)(%rdi)
72#define dsPrevLen (160 + STRUCT_OFFSET)(%rdi)
73#define dsMaxChainLen (164 + STRUCT_OFFSET)(%rdi)
74#define dsGoodMatch (180 + STRUCT_OFFSET)(%rdi)
75#define dsNiceMatch (184 + STRUCT_OFFSET)(%rdi)
76
77.globl match_init, longest_match
78
79.text
80
81/* uInt longest_match(deflate_state *deflatestate, IPos curmatch) */
82
83longest_match:
84/*
85 * Retrieve the function arguments. %curmatch will hold cur_match
86 * throughout the entire function (passed via rsi on amd64).
87 * rdi will hold the pointer to the deflate_state (first arg on amd64)
88 */
89 mov %rsi, save_rsi
90 mov %rbx, save_rbx
91 mov %r12, save_r12
92 mov %r13, save_r13
93 mov %r14, save_r14
94 mov %r15, save_r15
95
96/* uInt wmask = s->w_mask; */
97/* unsigned chain_length = s->max_chain_length; */
98/* if (s->prev_length >= s->good_match) { */
99/* chain_length >>= 2; */
100/* } */
101
102 movl dsPrevLen, %eax
103 movl dsGoodMatch, %ebx
104 cmpl %ebx, %eax
105 movl dsWMask, %eax
106 movl dsMaxChainLen, %chainlenwmask
107 jl LastMatchGood
108 shrl $2, %chainlenwmask
109LastMatchGood:
110
111/* chainlen is decremented once beforehand so that the function can */
112/* use the sign flag instead of the zero flag for the exit test. */
113/* It is then shifted into the high word, to make room for the wmask */
114/* value, which it will always accompany. */
115
116 decl %chainlenwmask
117 shll $16, %chainlenwmask
118 orl %eax, %chainlenwmask
119
120/* if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; */
121
122 movl dsNiceMatch, %eax
123 movl dsLookahead, %ebx
124 cmpl %eax, %ebx
125 jl LookaheadLess
126 movl %eax, %ebx
127LookaheadLess: movl %ebx, %nicematch
128
129/* register Bytef *scan = s->window + s->strstart; */
130
131 mov dsWindow, %window
132 movl dsStrStart, %limitd
133 lea (%limit, %window), %scan
134
135/* Determine how many bytes the scan ptr is off from being */
136/* dword-aligned. */
137
138 mov %scan, %scanalign
139 negl %scanalignd
140 andl $3, %scanalignd
141
142/* IPos limit = s->strstart > (IPos)MAX_DIST(s) ? */
143/* s->strstart - (IPos)MAX_DIST(s) : NIL; */
144
145 movl dsWSize, %eax
146 subl $MIN_LOOKAHEAD, %eax
147 xorl %ecx, %ecx
148 subl %eax, %limitd
149 cmovng %ecx, %limitd
150
151/* int best_len = s->prev_length; */
152
153 movl dsPrevLen, %bestlend
154
155/* Store the sum of s->window + best_len in %windowbestlen locally, and in memory. */
156
157 lea (%window, %bestlen), %windowbestlen
158 mov %windowbestlen, _windowbestlen
159
160/* register ush scan_start = *(ushf*)scan; */
161/* register ush scan_end = *(ushf*)(scan+best_len-1); */
162/* Posf *prev = s->prev; */
163
164 movzwl (%scan), %scanstart
165 movzwl -1(%scan, %bestlen), %scanend
166 mov dsPrev, %prev
167
168/* Jump into the main loop. */
169
170 movl %chainlenwmask, _chainlenwmask
171 jmp LoopEntry
172
173.balign 16
174
175/* do {
176 * match = s->window + cur_match;
177 * if (*(ushf*)(match+best_len-1) != scan_end ||
178 * *(ushf*)match != scan_start) continue;
179 * [...]
180 * } while ((cur_match = prev[cur_match & wmask]) > limit
181 * && --chain_length != 0);
182 *
183 * Here is the inner loop of the function. The function will spend the
184 * majority of its time in this loop, and majority of that time will
185 * be spent in the first ten instructions.
186 */
187LookupLoop:
188 andl %chainlenwmask, %curmatchd
189 movzwl (%prev, %curmatch, 2), %curmatchd
190 cmpl %limitd, %curmatchd
191 jbe LeaveNow
192 subl $0x00010000, %chainlenwmask
193 js LeaveNow
194LoopEntry: cmpw -1(%windowbestlen, %curmatch), %scanendw
195 jne LookupLoop
196 cmpw %scanstartw, (%window, %curmatch)
197 jne LookupLoop
198
199/* Store the current value of chainlen. */
200 movl %chainlenwmask, _chainlenwmask
201
202/* %scan is the string under scrutiny, and %prev to the string we */
203/* are hoping to match it up with. In actuality, %esi and %edi are */
204/* both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and %edx is */
205/* initialized to -(MAX_MATCH_8 - scanalign). */
206
207 mov $(-MAX_MATCH_8), %rdx
208 lea (%curmatch, %window), %windowbestlen
209 lea MAX_MATCH_8(%windowbestlen, %scanalign), %windowbestlen
210 lea MAX_MATCH_8(%scan, %scanalign), %prev
211
212/* the prefetching below makes very little difference... */
213 prefetcht1 (%windowbestlen, %rdx)
214 prefetcht1 (%prev, %rdx)
215
216/*
217 * Test the strings for equality, 8 bytes at a time. At the end,
218 * adjust %rdx so that it is offset to the exact byte that mismatched.
219 *
220 * It should be confessed that this loop usually does not represent
221 * much of the total running time. Replacing it with a more
222 * straightforward "rep cmpsb" would not drastically degrade
223 * performance -- unrolling it, for example, makes no difference.
224 */
225#undef USE_SSE /* works, but is 6-7% slower, than non-SSE... */
226LoopCmps:
227#ifdef USE_SSE
228 /* Preload the SSE registers */
229 movdqu (%windowbestlen, %rdx), %xmm1
230 movdqu (%prev, %rdx), %xmm2
231 pcmpeqb %xmm2, %xmm1
232 movdqu 16(%windowbestlen, %rdx), %xmm3
233 movdqu 16(%prev, %rdx), %xmm4
234 pcmpeqb %xmm4, %xmm3
235 movdqu 32(%windowbestlen, %rdx), %xmm5
236 movdqu 32(%prev, %rdx), %xmm6
237 pcmpeqb %xmm6, %xmm5
238 movdqu 48(%windowbestlen, %rdx), %xmm7
239 movdqu 48(%prev, %rdx), %xmm8
240 pcmpeqb %xmm8, %xmm7
241
242 /* Check the comparisions' results */
243 pmovmskb %xmm1, %rax
244 notw %ax
245 bsfw %ax, %ax
246 jnz LeaveLoopCmps
247 add $16, %rdx
248 pmovmskb %xmm3, %rax
249 notw %ax
250 bsfw %ax, %ax
251 jnz LeaveLoopCmps
252 add $16, %rdx
253 pmovmskb %xmm5, %rax
254 notw %ax
255 bsfw %ax, %ax
256 jnz LeaveLoopCmps
257 add $16, %rdx
258 pmovmskb %xmm7, %rax
259 notw %ax
260 bsfw %ax, %ax
261 jnz LeaveLoopCmps
262 add $16, %rdx
263 jmp LoopCmps
264LeaveLoopCmps: add %rax, %rdx
265#else
266 mov (%windowbestlen, %rdx), %rax
267 xor (%prev, %rdx), %rax
268 jnz LeaveLoopCmps
269 add $8, %rdx
270 jnz LoopCmps
271 jmp LenMaximum
272# if 0
273/*
274 * This three-liner is tantalizingly simple, but bsf is a slow instruction,
275 * and the complicated alternative down below is quite a bit faster. Sad...
276 */
277LeaveLoopCmps: bsf %rax, %rax /* find the first non-zero bit */
278 shrl $3, %eax /* divide by 8 to get the byte */
279 add %rax, %rdx
280# else
281LeaveLoopCmps: testl $0xFFFFFFFF, %eax /* Check the first 4 bytes */
282 jnz Check16
283 add $4, %rdx
284 shr $32, %rax
285Check16: testw $0xFFFF, %ax
286 jnz LenLower
287 add $2, %rdx
288 shrl $16, %eax
289LenLower: subb $1, %al
290 adc $0, %rdx
291# endif
292#endif
293
294/* Calculate the length of the match. If it is longer than MAX_MATCH, */
295/* then automatically accept it as the best possible match and leave. */
296
297 lea (%prev, %rdx), %rax
298 sub %scan, %rax
299 cmpl $MAX_MATCH, %eax
300 jge LenMaximum
301
302/* If the length of the match is not longer than the best match we */
303/* have so far, then forget it and return to the lookup loop. */
304
305 cmpl %bestlend, %eax
306 jg LongerMatch
307 mov _windowbestlen, %windowbestlen
308 mov dsPrev, %prev
309 movl _chainlenwmask, %edx
310 jmp LookupLoop
311
312/* s->match_start = cur_match; */
313/* best_len = len; */
314/* if (len >= nice_match) break; */
315/* scan_end = *(ushf*)(scan+best_len-1); */
316
317LongerMatch:
318 movl %eax, %bestlend
319 movl %curmatchd, dsMatchStart
320 cmpl %nicematch, %eax
321 jge LeaveNow
322
323 lea (%window, %bestlen), %windowbestlen
324 mov %windowbestlen, _windowbestlen
325
326 movzwl -1(%scan, %rax), %scanend
327 mov dsPrev, %prev
328 movl _chainlenwmask, %chainlenwmask
329 jmp LookupLoop
330
331/* Accept the current string, with the maximum possible length. */
332
333LenMaximum:
334 movl $MAX_MATCH, %bestlend
335 movl %curmatchd, dsMatchStart
336
337/* if ((uInt)best_len <= s->lookahead) return (uInt)best_len; */
338/* return s->lookahead; */
339
340LeaveNow:
341 movl dsLookahead, %eax
342 cmpl %eax, %bestlend
343 cmovngl %bestlend, %eax
344LookaheadRet:
345
346/* Restore the registers and return from whence we came. */
347
348 mov save_rsi, %rsi
349 mov save_rbx, %rbx
350 mov save_r12, %r12
351 mov save_r13, %r13
352 mov save_r14, %r14
353 mov save_r15, %r15
354
355 ret
356
357match_init: ret