diff options
Diffstat (limited to 'Asm')
-rw-r--r-- | Asm/x86/Sort.asm | 860 |
1 files changed, 860 insertions, 0 deletions
diff --git a/Asm/x86/Sort.asm b/Asm/x86/Sort.asm new file mode 100644 index 0000000..517c615 --- /dev/null +++ b/Asm/x86/Sort.asm | |||
@@ -0,0 +1,860 @@ | |||
; Sort.asm -- ASM version of HeapSort() function
; Igor Pavlov : Public domain

; 7zAsm.asm provides the portable register names (r0/x0...), MY_PROC/MY_ENDP,
; ABI helper macros and the IS_LINUX / x64 configuration symbols used below.
include ../../../../Asm/x86/7zAsm.asm

MY_ASM_START
7 | |||
; Optionally place the code in a dedicated 64-byte-aligned segment.
; Both branches below are commented out, so Z7_SORT_ASM_USE_SEGMENT stays
; undefined unless it is supplied on the assembler command line.
ifndef Z7_SORT_ASM_USE_SEGMENT
if (IS_LINUX gt 0)
  ; Z7_SORT_ASM_USE_SEGMENT equ 1
else
  ; Z7_SORT_ASM_USE_SEGMENT equ 1
endif
endif

ifdef Z7_SORT_ASM_USE_SEGMENT
_TEXT$Z7_SORT SEGMENT ALIGN(64) 'CODE'
; segment is 64-byte aligned, so any requested alignment can be honored
MY_ALIGN macro num:req
        align num
endm
else
MY_ALIGN macro num:req
        ; We expect that ".text" is aligned for 16-bytes.
        ; So we don't need large alignment inside our function.
        align 16
endm
endif
28 | |||
29 | |||
; Convenience wrappers over MY_ALIGN; with Z7_SORT_ASM_USE_SEGMENT undefined
; all three collapse to 16-byte alignment (see MY_ALIGN above).
MY_ALIGN_16 macro
        MY_ALIGN 16
endm

MY_ALIGN_32 macro
        MY_ALIGN 32
endm

MY_ALIGN_64 macro
        MY_ALIGN 64
endm
41 | |||
ifdef x64
; ------------ x86 64-bit version ------------

NUM_PREFETCH_LEVELS equ 3 ; to prefetch 1x 64-bytes line (is good for most cases)
; NUM_PREFETCH_LEVELS equ 4 ; to prefetch 2x 64-bytes lines (better for big arrays)

; Register role aliases (r*/x* are 64/32-bit views from 7zAsm.asm).
; The heap is 1-based: children of node k are nodes 2k and 2k+1.

acc equ x0          ; scratch accumulator for SORT
k equ r0            ; current parent node index
k_x equ x0          ; 32-bit view of k

p equ r1            ; pointer to the array of 32-bit items

s equ r2            ; child node index (s == 2*k, +1 after selecting right child)
s_x equ x2          ; 32-bit view of s

a0 equ x3
t0 equ a0           ; left-child value (p)[s]

a3 equ x5
qq equ a3           ; value being sifted down the heap

a1 equ x6
t1 equ a1           ; right-child value (p)[s + 1]
t1_r equ r6         ; 64-bit scratch overlapping t1

a2 equ x7
t2 equ a2           ; extra temporary (STEP_2, build phase)

i equ r8            ; node index in the heap-build loops
e0 equ x8           ; cached (p)[0]

e1 equ x9           ; cached (p)[1]

num_last equ r10    ; index of the last item
num_last_x equ x10  ; 32-bit view of num_last

next4_lim equ r11   ; (num_last + 1) / 4 : first k whose grandchildren may not exist
pref_lim equ r12    ; num_last >> (NUM_PREFETCH_LEVELS + 1) : prefetch cutoff
79 | |||
80 | |||
81 | |||
; Sort a pair of registers ascending: b0 <- min, b1 <- max (branch-free).
; CF from the cmp drives both cmov's; temp_reg is clobbered.
SORT_2_WITH_TEMP_REG macro b0, b1, temp_reg
        mov temp_reg, b0
        cmp b0, b1
        cmovae b0, b1 ; min
        cmovae b1, temp_reg ; max
endm

; Same pair-sort using acc (x0) as the temporary.
SORT macro b0, b1
        SORT_2_WITH_TEMP_REG b0, b1, acc
endm

; Array items are dwords: element (index) lives at [p + 4 * index].
LOAD macro dest:req, index:req
        mov dest, [p + 4 * index]
endm

STORE macro reg:req, index:req
        mov [p + 4 * index], reg
endm
100 | |||
101 | |||
; One 64-byte line is prefetched per level for NUM_PREFETCH_LEVELS <= 3;
; deeper lookahead needs proportionally more lines.
if (NUM_PREFETCH_LEVELS gt 3)
num_prefetches equ (1 SHL (NUM_PREFETCH_LEVELS - 3))
else
num_prefetches equ 1
endif

; Prefetch the cache line(s) that will be touched when the sift-down
; reaches (p + offs). Does not change registers or flags.
PREFETCH_OP macro offs
cur_offset = 7 * 4 ; it's average offset in 64-bytes cache line.
; cur_offset = 0 ; we can use zero offset, if we are sure that array is aligned for 64-bytes.
rept num_prefetches
if 1
        prefetcht0 byte ptr [p + offs + cur_offset]
else
        ; disabled alternative: plain load instead of prefetch
        ; NOTE(review): pref_x is not defined anywhere in this file -- this
        ; branch would not assemble as-is; dead code under "if 1".
        mov pref_x, dword ptr [p + offs + cur_offset]
endif
cur_offset = cur_offset + 64
endm
endm
120 | |||
; Prefetch the descendant of node k that is NUM_PREFETCH_LEVELS levels down.
; Descendants of node k at level L start at index k << L; the extra "shl 3"
; converts a dword index times 2 into the byte offset used by PREFETCH_OP.
; NOTE: k is destroyed here; callers (STEP_1/STEP_2) reload k from s right
; after the prefetch.
PREFETCH_MY macro
if 1
if 1
        shl k, NUM_PREFETCH_LEVELS + 3
else
        ; we delay prefetch instruction to improve main loads
        shl k, NUM_PREFETCH_LEVELS
        shl k, 3
        ; shl k, 0
endif
        PREFETCH_OP k
elseif 1
        ; disabled alternative: keep the scaling inside the address expression
        shl k, 3
        PREFETCH_OP k * (1 SHL NUM_PREFETCH_LEVELS) ; change it
endif
endm
137 | |||
138 | |||
; One sift-down level (heap-build flavor).
; Entry invariant: s == 2*k, t0 == (p)[s], t1 == (p)[s + 1], qq = sifted value.
; CF from "cmp t0, t1" selects the larger child: cmovb takes t1,
; and "adc s, 0" bumps s to the right child's index.
; Jumps to exit_label when qq >= max child (heap property restored);
; exit_label code must finish with STORE qq, k.
STEP_1 macro exit_label, prefetch_macro
use_cmov_1 equ 1 ; set 1 for cmov, but it's slower in some cases
                 ; set 0 for LOAD after adc s, 0
        cmp t0, t1
if use_cmov_1
        cmovb t0, t1
        ; STORE t0, k
endif
        adc s, 0
if use_cmov_1 eq 0
        LOAD t0, s
endif
        cmp qq, t0
        jae exit_label
if 1 ; use_cmov_1 eq 0
        ; move the winning child up into the parent slot
        STORE t0, k
endif
        prefetch_macro
        ; preload both children of node s for the next iteration:
        ; [p + s * 8] == (p)[2*s], [p + s * 8 + 4] == (p)[2*s + 1]
        mov t0, [p + s * 8]
        mov t1, [p + s * 8 + 4]
        mov k, s
        add s, s ; slower for some cpus
        ; lea s, dword ptr [s + s] ; slower for some cpus
        ; shl s, 1 ; faster for some cpus
        ; lea s, dword ptr [s * 2] ; faster for some cpus
rept 0 ; 1000 for debug : 0 for normal
        ; number of calls in generate_stage : ~0.6 of number of items
        shl k, 0
endm
endm
169 | |||
170 | |||
; One sift-down level (main sorting-loop flavor).
; Same contract as STEP_1 (s == 2*k, t0/t1 = children, qq = sifted value),
; but the next level's children are loaded for BOTH candidate subtrees and
; cmov-selected, hiding the load latency behind the child comparison.
; The plain mov's do not touch flags, so CF from "cmp t0, t1" is still
; live for the cmovb pair and the "adc s, 0".
STEP_2 macro exit_label, prefetch_macro
use_cmov_2 equ 0 ; set 1 for cmov, but it's slower in some cases
                 ; set 0 for LOAD after adc s, 0
        cmp t0, t1
if use_cmov_2
        mov t2, t0
        cmovb t2, t1
        ; STORE t2, k
endif
        ; children of the left subtree (node 2s) ...
        mov t0, [p + s * 8]
        mov t1, [p + s * 8 + 4]
        ; ... replaced by children of the right subtree (node 2s + 2) if t0 < t1
        cmovb t0, [p + s * 8 + 8]
        cmovb t1, [p + s * 8 + 12]
        adc s, 0
if use_cmov_2 eq 0
        LOAD t2, s ; t2 = larger child value
endif
        cmp qq, t2
        jae exit_label
if 1 ; use_cmov_2 eq 0
        STORE t2, k
endif
        prefetch_macro
        mov k, s
        ; add s, s
        ; lea s, [s + s]
        shl s, 1
        ; lea s, [s * 2]
endm
200 | |||
201 | |||
; Sift qq down from node k until the max-heap property holds, then store it.
; STEP         : STEP_1 or STEP_2 (one level of descent)
; use_prefetch : 1 -> prefetch NUM_PREFETCH_LEVELS levels ahead via PREFETCH_MY
; num_unrolls  : number of unrolled STEP iterations before the main loop
; Entry: k = start node, s == k * 2, t0 == (p)[s], t1 == (p)[s + 1],
;        qq = value being sifted, next4_lim/pref_lim set by the caller.
MOVE_SMALLEST_UP macro STEP, use_prefetch, num_unrolls
        LOCAL exit_1, exit_2, leaves, opt_loop, last_nodes

        ; s == k * 2
        ; t0 == (p)[s]
        ; t1 == (p)[s + 1]
        cmp k, next4_lim
        jae leaves

rept num_unrolls
        STEP exit_2
        cmp k, next4_lim
        jae leaves
endm

if use_prefetch
        ; main loop runs with prefetching until k reaches pref_lim,
        ; i.e. while nodes NUM_PREFETCH_LEVELS below k still exist
prefetch_macro equ PREFETCH_MY
pref_lim_2 equ pref_lim
        ; lea pref_lim, dword ptr [num_last + 1]
        ; shr pref_lim, NUM_PREFETCH_LEVELS + 1
        cmp k, pref_lim_2
        jae last_nodes
else
prefetch_macro equ
pref_lim_2 equ next4_lim
endif

MY_ALIGN_16
opt_loop:
        STEP exit_2, prefetch_macro
        cmp k, pref_lim_2
        jb opt_loop

last_nodes:
        ; k >= pref_lim_2
        ; 2 cases are possible:
        ;   case-1: num_after_prefetch_levels == 0 && next4_lim = pref_lim_2
        ;   case-2: num_after_prefetch_levels == NUM_PREFETCH_LEVELS - 1 &&
        ;           next4_lim = pref_lim_2 / (NUM_PREFETCH_LEVELS - 1)
if use_prefetch
        ; finish the remaining (at most NUM_PREFETCH_LEVELS - 1) levels
        ; without prefetching
yyy = NUM_PREFETCH_LEVELS - 1
while yyy
yyy = yyy - 1
        STEP exit_2
if yyy
        cmp k, next4_lim
        jae leaves
endif
endm
endif

leaves:
        ; k >= next4_lim == (num_last + 1) / 4 must be provided by previous code.
        ; we have 2 nodes in (s) level : always
        ; we can have some nodes in (s * 2) level : low probability case
        ; we have no nodes in (s * 4) level
        ; s == k * 2
        ; t0 == (p)[s]
        ; t1 == (p)[s + 1]
        cmp t0, t1
        cmovb t0, t1
        adc s, 0
        STORE t0, k

        ; t0 == (p)[s]
        ; s / 2 == k : (s) is index of max item from (p)[k * 2], (p)[k * 2 + 1]
        ; we have 3 possible cases here:
        ;   s * 2 > num_last  : (s) node has no childs
        ;   s * 2 == num_last : (s) node has 1 leaf child that is last item of array
        ;   s * 2 < num_last  : (s) node has 2 leaf childs. We provide (s * 4 > num_last)
        ; we check for (s * 2 > num_last) before "cmp qq, t0" check, because
        ; we will replace conditional jump with cmov instruction later.
        lea t1_r, dword ptr [s + s]
        cmp t1_r, num_last
        ja exit_1 ; if (s * 2 > num_last), we have no childs : it's high probability branch

        ; it's low probability branch
        ; s * 2 <= num_last
        cmp qq, t0
        jae exit_2

        ; qq < t0, so we go to next level
        ; we check 1 or 2 childs in next level
        mov t0, [p + s * 8]
        mov k, s
        mov s, t1_r
        cmp t1_r, num_last
        je @F ; (s == num_last) means that we have single child in tree

        ; (s < num_last) : so we must read both childs and select max of them.
        mov t1, [p + k * 8 + 4]
        cmp t0, t1
        cmovb t0, t1
        adc s, 0
@@:
        STORE t0, k
exit_1:
        ; t0 == (p)[s], s / 2 == k : (s) is index of max item from (p)[k * 2], (p)[k * 2 + 1]
        cmp qq, t0
        cmovb k, s ; descend once more (branch-free) if qq < child
exit_2:
        STORE qq, k ; final resting slot for the sifted value
endm
305 | |||
306 | |||
307 | |||
308 | |||
ifdef Z7_SORT_ASM_USE_SEGMENT
        ; MY_ALIGN_64
else
MY_ALIGN_16
endif

; HeapSort(p, size) -- sorts an array of 32-bit items in ascending order.
; 2 arguments:
;   REG_ABI_PARAM_0 = p    (pointer to array of dwords)
;   REG_ABI_PARAM_1 = size (number of items)
; Uses a 1-based implicit max-heap: children of node k are 2k and 2k+1.
; Presumably matches C prototype HeapSort(UInt32 *p, size_t size) -- TODO
; confirm against the C header.
MY_PROC HeapSort, 2

if (IS_LINUX gt 0)
        mov p, REG_ABI_PARAM_0 ; r1 <- r7 : linux
endif
        ; on win64 p (r1 == rcx) already holds param 0, only size is moved
        mov num_last, REG_ABI_PARAM_1 ; r10 <- r6 : linux
                                      ; r10 <- r2 : win64
        cmp num_last, 2
        jb end_1 ; 0 or 1 items : nothing to sort, no registers pushed yet

        ; MY_PUSH_PRESERVED_ABI_REGS
        MY_PUSH_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
        push r12 ; pref_lim lives in r12

        cmp num_last, 4
        ja sort_5

        ; ---- sizes 2..4 : branch-free sorting network ----
        LOAD a0, 0
        LOAD a1, 1
        SORT a0, a1
        cmp num_last, 3
        jb end_2 ; size == 2

        LOAD a2, 2
        je sort_3 ; size == 3

        ; size == 4
        LOAD a3, 3
        SORT a2, a3
        SORT a1, a3
        STORE a3, 3
sort_3:
        SORT a0, a2
        SORT a1, a2
        STORE a2, 2
        jmp end_2

sort_5:
        ; (num_last > 4) is required here
        ; if (num_last >= 6) : we will use optimized loop for leaf nodes loop_down_1
        mov next4_lim, num_last
        shr next4_lim, 2

        ; ---- heap-build phase ----
        dec num_last        ; num_last = index of last item
        mov k, num_last
        shr k, 1            ; k = last parent node
        mov i, num_last
        shr i, 2            ; i = last node that still has grandchildren
        test num_last, 1
        jnz size_even

        ; ODD number of items. So we compare parent with single child
        LOAD t1, num_last
        LOAD t0, k
        SORT_2_WITH_TEMP_REG t1, t0, t2
        STORE t1, num_last
        STORE t0, k
        dec k

size_even:
        cmp k, i
        jbe loop_down ; jump for num_last == 4 case

if 0 ; 1 for debug
        mov r15, k
        mov r14d, 1 ; 100
loop_benchmark:
endif
        ; optimized loop for leaf nodes:
        ; children of node k preloaded : (p)[2k], (p)[2k + 1]
        mov t0, [p + k * 8]
        mov t1, [p + k * 8 + 4]

MY_ALIGN_16
loop_down_1:
        ; we compare parent with max of childs:
        ; lea s, dword ptr [2 * k]
        mov s, k
        cmp t0, t1
        cmovb t0, t1
        adc s, s ; s = index of max child (2k or 2k + 1, via CF)
        LOAD t2, k ; t2 = parent value
        STORE t0, k ; tentatively move max child up
        cmp t2, t0
        cmovae s, k ; parent already >= child : write parent back to itself
        dec k
        ; we preload next items before STORE operation for calculated address
        mov t0, [p + k * 8]
        mov t1, [p + k * 8 + 4]
        STORE t2, s
        cmp k, i
        jne loop_down_1

if 0 ; 1 for debug
        mov k, r15
        dec r14d
        jnz loop_benchmark
        ; jmp end_debug
endif

MY_ALIGN_16
loop_down:
        ; full sift-down for nodes that have grandchildren
        mov t0, [p + i * 8]
        mov t1, [p + i * 8 + 4]
        LOAD qq, i
        mov k, i
        lea s, dword ptr [i + i]
        ; jmp end_debug
DOWN_use_prefetch equ 0
DOWN_num_unrolls equ 0
        MOVE_SMALLEST_UP STEP_1, DOWN_use_prefetch, DOWN_num_unrolls
        sub i, 1
        jnb loop_down ; loop until i wraps below 0

        ; ---- extraction phase : heap is built now ----
        ; jmp end_debug
        LOAD e0, 0 ; cache current max
        LOAD e1, 1
 
        ; below this size the simple main_loop_sort_5 path is used
LEVEL_3_LIMIT equ 8 ; 8 is default, but 7 also can work

        cmp num_last, LEVEL_3_LIMIT + 1
        jb main_loop_sort_5

MY_ALIGN_16
main_loop_sort:
        ; num_last > LEVEL_3_LIMIT
        ; p[size--] = p[0];
        LOAD qq, num_last   ; qq = last item, to be sifted from the root
        STORE e0, num_last  ; place current max at the end
        mov e0, e1

        ; refresh the per-iteration sift limits
        mov next4_lim, num_last
        shr next4_lim, 2
        mov pref_lim, num_last
        shr pref_lim, NUM_PREFETCH_LEVELS + 1

        dec num_last
if 0 ; 1 for debug
        ; that optional optimization can improve the performance, if there are identical items in array
        ;   3 times improvement : if all items in array are identical
        ;   20% improvement : if items are different for 1 bit only
        ;   1-10% improvement : if items are different for (2+) bits
        ;   no gain : if items are different
        cmp qq, e1
        jae next_iter_main
endif
        ; resolve level 2 (nodes 2,3) and preload level 3 by hand,
        ; then let MOVE_SMALLEST_UP/STEP_2 continue from node k
        LOAD e1, 2
        LOAD t0, 3
        mov k_x, 2
        cmp e1, t0
        cmovb e1, t0
        mov t0, [p + 4 * (4 + 0)]
        mov t1, [p + 4 * (4 + 1)]
        cmovb t0, [p + 4 * (4 + 2)]
        cmovb t1, [p + 4 * (4 + 3)]
        adc k_x, 0
        ; (qq <= e1), because the tree is correctly sorted
        ; also here we could check (qq >= e1) or (qq == e1) for faster exit
        lea s, dword ptr [k + k]
MAIN_use_prefetch equ 1
MAIN_num_unrolls equ 0
        MOVE_SMALLEST_UP STEP_2, MAIN_use_prefetch, MAIN_num_unrolls

next_iter_main:
        cmp num_last, LEVEL_3_LIMIT
        jne main_loop_sort

        ; num_last == LEVEL_3_LIMIT
main_loop_sort_5:
        ; 4 <= num_last <= LEVEL_3_LIMIT : tree has at most 3 levels,
        ; so the whole sift fits in the straight-line code below
        ; p[size--] = p[0];
        LOAD qq, num_last
        STORE e0, num_last
        mov e0, e1
        dec num_last_x

        LOAD e1, 2
        LOAD t0, 3
        mov k_x, 2
        cmp e1, t0
        cmovb e1, t0
        adc k_x, 0 ; k = index of max of nodes 2, 3

        lea s_x, dword ptr [k * 2]
        cmp s_x, num_last_x
        ja exit_2 ; node k has no children

        mov t0, [p + k * 8]
        je exit_1 ; s == num_last : single child

        ; s < num_last : two children, pick the max
        mov t1, [p + k * 8 + 4]
        cmp t0, t1
        cmovb t0, t1
        adc s_x, 0
exit_1:
        STORE t0, k
        cmp qq, t0
        cmovb k_x, s_x
exit_2:
        STORE qq, k
        cmp num_last_x, 3
        jne main_loop_sort_5

        ; num_last == 3 (real_size == 4) : final 4 items
        LOAD a0, 2
        LOAD a1, 3
        STORE e1, 2
        STORE e0, 3
        SORT a0, a1
end_2:
        STORE a0, 0
        STORE a1, 1
; end_debug:
        ; MY_POP_PRESERVED_ABI_REGS
        pop r12
        MY_POP_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
end_1:
MY_ENDP
532 | |||
533 | |||
534 | |||
else
; ------------ x86 32-bit ------------

ifdef x64
IS_CDECL = 0
endif

; Register role aliases for the 32-bit build.
; NOTE(review): in this version several indices appear pre-scaled
; (s == 2 * node, i == 2 * node), so byte addresses use [p + idx * 2]
; for the node value and [p + idx * 4] for its children -- see loop_down.

acc equ x0          ; scratch accumulator
k equ r0            ; current parent node index
k_x equ acc

p equ r1            ; pointer to the array of 32-bit items

num_last equ r2     ; index of the last item
num_last_x equ x2

a0 equ x3
t0 equ a0           ; child value temporary

a3 equ x5
i equ r5            ; heap-build loop counter
e0 equ a3           ; cached (p)[0]

a1 equ x6
qq equ a1           ; value being sifted down

a2 equ x7
s equ r7            ; child index (pre-scaled)
s_x equ a2
564 | |||
565 | |||
; Sort a pair of registers ascending: b0 <- min, b1 <- max.
; Branchy version (32-bit build has too few registers for the cmov form);
; xchg is used instead of a 3-mov swap through acc.
SORT macro b0, b1
        cmp b1, b0
        jae @F
if 1
        xchg b0, b1
else
        mov acc, b0
        mov b0, b1 ; min
        mov b1, acc ; max
endif
@@:
endm

; Array items are dwords: element (index) lives at [p + 4 * index].
LOAD macro dest:req, index:req
        mov dest, [p + 4 * index]
endm

STORE macro reg:req, index:req
        mov [p + 4 * index], reg
endm
586 | |||
587 | |||
; One sift-down level, branch-free child selection.
; Entry invariant: s == 2*k, so (p)[s] is at [p + k * 8] (dword items).
; CF from the child comparison bumps s to the larger child via adc.
; Jumps to exit_label when qq >= max child; exit_label must STORE qq, k.
STEP_1 macro exit_label
        mov t0, [p + k * 8]         ; t0 = (p)[2k]
        cmp t0, [p + k * 8 + 4]     ; compare against (p)[2k + 1]
        adc s, 0                    ; s = index of larger child
        LOAD t0, s                  ; t0 = larger child value
        STORE t0, k ; we look-ahead the store for the most expected branch
        cmp qq, t0
        jae exit_label
        ; STORE t0, k ; use if the look-ahead store above is removed
        mov k, s
        add s, s
        ; lea s, dword ptr [s + s]
        ; shl s, 1
        ; lea s, dword ptr [s * 2]
endm

; Same step using a conditional branch instead of adc/cmov child selection;
; faster for large arrays (branch predictor prefetches the taken path).
STEP_BRANCH macro exit_label
        mov t0, [p + k * 8]
        cmp t0, [p + k * 8 + 4]
        jae @F
        inc s
        mov t0, [p + k * 8 + 4]
@@:
        cmp qq, t0
        jae exit_label
        STORE t0, k
        mov k, s
        add s, s
endm
617 | |||
618 | |||
619 | |||
; Sift qq down from node k and store it at its final slot (32-bit build).
; STEP        : STEP_1 or STEP_BRANCH (one level of descent)
; num_unrolls : unrolled iterations before the aligned main loop
; exit_2      : caller-supplied label name for the common exit; it is
;               DEFINED here (bottom of the macro) so the caller's prologue
;               code can jump to it too.
; Entry: s == k * 2, qq = sifted value.
MOVE_SMALLEST_UP macro STEP, num_unrolls, exit_2
        LOCAL leaves, opt_loop, single

        ; s == k * 2
rept num_unrolls
        cmp s, num_last
        jae leaves
        STEP_1 exit_2
endm
        cmp s, num_last
        jb opt_loop

leaves:
        ; (s >= num_last)
        jne exit_2 ; s > num_last : node k has no children
single:
        ; (s == num_last) : single child, it is the last item
        mov t0, [p + k * 8] ; t0 = (p)[s] (s == 2k)
        cmp qq, t0
        jae exit_2
        STORE t0, k
        mov k, s
        jmp exit_2

MY_ALIGN_16
opt_loop:
        STEP exit_2
        cmp s, num_last
        jb opt_loop
        je single
exit_2:
        STORE qq, k
endm
653 | |||
654 | |||
655 | |||
656 | |||
ifdef Z7_SORT_ASM_USE_SEGMENT
        ; MY_ALIGN_64
else
MY_ALIGN_16
endif

; HeapSort(p, size) -- 32-bit build; sorts an array of dwords ascending.
; 2 arguments: p = array pointer, size = number of items.
; Arguments arrive in registers (fastcall-style) unless IS_CDECL, in which
; case they are read from the stack relative to r4 (esp).
MY_PROC HeapSort, 2
ifdef x64
if (IS_LINUX gt 0)
        mov num_last, REG_ABI_PARAM_1 ; r2 <- r6 : linux
        mov p, REG_ABI_PARAM_0 ; r1 <- r7 : linux
endif
elseif (IS_CDECL gt 0)
        mov num_last, [r4 + REG_SIZE * 2]
        mov p, [r4 + REG_SIZE * 1]
endif
        cmp num_last, 2
        jb end_1 ; 0 or 1 items : nothing to sort
        MY_PUSH_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11

        cmp num_last, 4
        ja sort_5

        ; ---- sizes 2..4 : small sorting network ----
        LOAD a0, 0
        LOAD a1, 1
        SORT a0, a1
        cmp num_last, 3
        jb end_2 ; size == 2

        LOAD a2, 2
        je sort_3 ; size == 3

        ; size == 4
        LOAD a3, 3
        SORT a2, a3
        SORT a1, a3
        STORE a3, 3
sort_3:
        SORT a0, a2
        SORT a1, a2
        STORE a2, 2
        jmp end_2

sort_5:
        ; num_last > 4
        ; i runs pre-scaled (i == 2 * node) and steps by 2 per parent
        lea i, dword ptr [num_last - 2]
        dec num_last
        test i, 1
        jz loop_down

        ; single child : compare last parent with its only (last) child
        ; [p + num_last * 4] == child, [p + num_last * 2] == parent (scaled)
        mov t0, [p + num_last * 4]
        mov qq, [p + num_last * 2]
        dec i
        cmp qq, t0
        jae loop_down

        ; swap parent and child
        mov [p + num_last * 2], t0
        mov [p + num_last * 4], qq

MY_ALIGN_16
loop_down:
        ; ---- heap-build phase ----
        ; [p + i * 4] / [p + i * 4 + 4] = children, [p + i * 2] = parent
        mov t0, [p + i * 4]
        cmp t0, [p + i * 4 + 4]
        mov k, i
        mov qq, [p + i * 2]
        adc k, 0 ; k = index of larger child (pre-scaled, via CF)
        LOAD t0, k
        cmp qq, t0
        jae down_next ; parent already >= both children
        mov [p + i * 2], t0 ; move larger child up
        lea s, dword ptr [k + k]

DOWN_num_unrolls equ 0
        MOVE_SMALLEST_UP STEP_1, DOWN_num_unrolls, down_exit_label
down_next:
        sub i, 2
        jnb loop_down
        ; jmp end_debug

        ; ---- extraction phase : heap is built now ----
        LOAD e0, 0 ; cache current max

MY_ALIGN_16
main_loop_sort:
        ; num_last > 3
        ; resolve the first level (nodes 2, 3) inline, then sift
        mov t0, [p + 2 * 4]
        cmp t0, [p + 3 * 4]
        LOAD qq, num_last   ; qq = last item, to be sifted from the root
        STORE e0, num_last  ; place current max at the end
        LOAD e0, 1
        mov s_x, 2
        mov k_x, 1
        adc s, 0 ; s = index of max of nodes 2, 3 (CF from cmp above)
        LOAD t0, s
        dec num_last
        cmp qq, t0
        jae main_exit_label
        STORE t0, 1
        mov k, s
        add s, s
if 1
        ; for branch data prefetch mode :
        ; it's faster for large arrays : larger than (1 << 13) items.
MAIN_num_unrolls equ 10
STEP_LOOP equ STEP_BRANCH
else
MAIN_num_unrolls equ 0
STEP_LOOP equ STEP_1
endif

        MOVE_SMALLEST_UP STEP_LOOP, MAIN_num_unrolls, main_exit_label

        ; jmp end_debug
        cmp num_last, 3
        jne main_loop_sort

        ; num_last == 3 (real_size == 4) : final 4 items
        LOAD a0, 2
        LOAD a1, 3
        LOAD a2, 1
        STORE e0, 3 ; e0 is alias for a3
        STORE a2, 2
        SORT a0, a1
end_2:
        STORE a0, 0
        STORE a1, 1
; end_debug:
        MY_POP_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
end_1:
MY_ENDP
786 | |||
787 | endif | ||
788 | |||
789 | ifdef Z7_SORT_ASM_USE_SEGMENT | ||
790 | _TEXT$Z7_SORT ENDS | ||
791 | endif | ||
792 | |||
793 | if 0 | ||
794 | LEA_IS_D8 (R64) [R2 * 4 + 16] | ||
795 | Lat : TP | ||
796 | 2 : 1 : adl-e | ||
797 | 2 : 3 p056 adl-p | ||
798 | 1 : 2 : p15 hsw-rocket | ||
799 | 1 : 2 : p01 snb-ivb | ||
800 | 1 : 1 : p1 conroe-wsm | ||
801 | 1 : 4 : zen3,zen4 | ||
802 | 2 : 4 : zen1,zen2 | ||
803 | |||
804 | LEA_B_IS (R64) [R2 + R3 * 4] | ||
805 | Lat : TP | ||
806 | 1 : 1 : adl-e | ||
807 | 2 : 3 p056 adl-p | ||
808 | 1 : 2 : p15 hsw-rocket | ||
809 | 1 : 2 : p01 snb-ivb | ||
810 | 1 : 1 : p1 nhm-wsm | ||
811 | 1 : 1 : p0 conroe-wsm | ||
812 | 1 : 4 : zen3,zen4 | ||
813 | 2 :2,4 : zen1,zen2 | ||
814 | |||
815 | LEA_B_IS_D8 (R64) [R2 + R3 * 4 + 16] | ||
816 | Lat : TP | ||
817 | 2 : 1 : adl-e | ||
818 | 2 : 3 p056 adl-p | ||
819 | 1 : 2 : p15 ice-rocket | ||
820 | 3 : 1 : p1/p15 hsw-rocket | ||
821 | 3 : 1 : p01 snb-ivb | ||
822 | 1 : 1 : p1 nhm-wsm | ||
823 | 1 : 1 : p0 conroe-wsm | ||
824 | 2,1 : 2 : zen3,zen4 | ||
825 | 2 : 2 : zen1,zen2 | ||
826 | |||
827 | CMOVB (R64, R64) | ||
828 | Lat : TP | ||
829 | 1,2 : 2 : adl-e | ||
830 | 1 : 2 p06 adl-p | ||
831 | 1 : 2 : p06 bwd-rocket | ||
832 | 1,2 : 2 : p0156+p06 hsw | ||
833 | 1,2 :1.5 : p015+p05 snb-ivb | ||
834 | 1,2 : 1 : p015+p05 nhm | ||
835 | 1 : 1 : 2*p015 conroe | ||
836 | 1 : 2 : zen3,zen4 | ||
837 | 1 : 4 : zen1,zen2 | ||
838 | |||
839 | ADC (R64, 0) | ||
840 | Lat : TP | ||
841 | 1,2 : 2 : adl-e | ||
842 | 1 : 2 p06 adl-p | ||
843 | 1 : 2 : p06 bwd-rocket | ||
844 | 1 :1.5 : p0156+p06 hsw | ||
845 | 1 :1.5 : p015+p05 snb-ivb | ||
2 : 1 : 2*p015 conroe-wsm
847 | 1 : 2 : zen1,zen2,zen3,zen4 | ||
848 | |||
849 | PREFETCHNTA : fetch data into non-temporal cache close to the processor, minimizing cache pollution. | ||
850 | L1 : Pentium3 | ||
851 | L2 : NetBurst | ||
852 | L1, not L2: Core duo, Core 2, Atom processors | ||
853 | L1, not L2, may fetch into L3 with fast replacement: Nehalem, Westmere, Sandy Bridge, ... | ||
854 | NEHALEM: Fills L1/L3, L1 LRU is not updated | ||
855 | L3 with fast replacement: Xeon Processors based on Nehalem, Westmere, Sandy Bridge, ... | ||
856 | PREFETCHT0 : fetch data into all cache levels. | ||
857 | PREFETCHT1 : fetch data into L2 and L3 | ||
858 | endif | ||
859 | |||
860 | end | ||