Diffstat (limited to 'Asm')
-rw-r--r--  Asm/x86/Sort.asm  860
1 file changed, 860 insertions, 0 deletions
diff --git a/Asm/x86/Sort.asm b/Asm/x86/Sort.asm
new file mode 100644
index 0000000..517c615
--- /dev/null
+++ b/Asm/x86/Sort.asm
@@ -0,0 +1,860 @@
; Sort.asm -- ASM version of HeapSort() function
; Igor Pavlov : Public domain

include 7zAsm.asm

MY_ASM_START

ifndef Z7_SORT_ASM_USE_SEGMENT
if (IS_LINUX gt 0)
        ; Z7_SORT_ASM_USE_SEGMENT equ 1
else
        ; Z7_SORT_ASM_USE_SEGMENT equ 1
endif
endif

ifdef Z7_SORT_ASM_USE_SEGMENT
_TEXT$Z7_SORT SEGMENT ALIGN(64) 'CODE'
MY_ALIGN macro num:req
        align num
endm
else
MY_ALIGN macro num:req
        ; We expect that ".text" is aligned to 16 bytes.
        ; So we don't need large alignment inside our function.
        align 16
endm
endif


MY_ALIGN_16 macro
        MY_ALIGN 16
endm

MY_ALIGN_32 macro
        MY_ALIGN 32
endm

MY_ALIGN_64 macro
        MY_ALIGN 64
endm

ifdef x64

NUM_PREFETCH_LEVELS equ 3 ; prefetch 1x 64-byte line (good for most cases)
; NUM_PREFETCH_LEVELS equ 4 ; prefetch 2x 64-byte lines (better for big arrays)

acc equ x0
k equ r0
k_x equ x0

p equ r1

s equ r2
s_x equ x2

a0 equ x3
t0 equ a0

a3 equ x5
qq equ a3

a1 equ x6
t1 equ a1
t1_r equ r6

a2 equ x7
t2 equ a2

i equ r8
e0 equ x8

e1 equ x9

num_last equ r10
num_last_x equ x10

next4_lim equ r11
pref_lim equ r12



SORT_2_WITH_TEMP_REG macro b0, b1, temp_reg
        mov     temp_reg, b0
        cmp     b0, b1
        cmovae  b0, b1       ; min
        cmovae  b1, temp_reg ; max
endm

SORT macro b0, b1
        SORT_2_WITH_TEMP_REG b0, b1, acc
endm
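
; SORT_2_WITH_TEMP_REG is a branchless compare-exchange of two registers.
; A C-like sketch of the same operation (illustrative only):
;   UInt32 t = b0;
;   if (b0 >= b1) { b0 = b1; b1 = t; } ; the two assignments correspond to the two cmovae above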

LOAD macro dest:req, index:req
        mov     dest, [p + 4 * index]
endm

STORE macro reg:req, index:req
        mov     [p + 4 * index], reg
endm


if (NUM_PREFETCH_LEVELS gt 3)
  num_prefetches equ (1 SHL (NUM_PREFETCH_LEVELS - 3))
else
  num_prefetches equ 1
endif

PREFETCH_OP macro offs
        cur_offset = 7 * 4 ; the average offset within a 64-byte cache line.
        ; cur_offset = 0 ; we can use a zero offset, if we are sure that the array is aligned to 64 bytes.
    rept num_prefetches
      if 1
        prefetcht0 byte ptr [p + offs + cur_offset]
      else
        mov     pref_x, dword ptr [p + offs + cur_offset]
      endif
        cur_offset = cur_offset + 64
    endm
endm

PREFETCH_MY macro
if 1
  if 1
        shl     k, NUM_PREFETCH_LEVELS + 3
  else
        ; we delay the prefetch instruction to improve the main loads
        shl     k, NUM_PREFETCH_LEVELS
        shl     k, 3
        ; shl   k, 0
  endif
        PREFETCH_OP k
elseif 1
        shl     k, 3
        PREFETCH_OP k * (1 SHL NUM_PREFETCH_LEVELS) ; change it
endif
endm
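
; Address math for PREFETCH_MY: the descendant of node k that lies
; NUM_PREFETCH_LEVELS levels below it has index (k << NUM_PREFETCH_LEVELS),
; and its child pair starts at byte address
;   p + 8 * (k << NUM_PREFETCH_LEVELS) == p + (k << (NUM_PREFETCH_LEVELS + 3))
; which is why the single "shl k, NUM_PREFETCH_LEVELS + 3" computes the
; prefetch offset. Clobbering k is safe: the STEP macros reassign k from s
; right after the prefetch.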


STEP_1 macro exit_label, prefetch_macro
use_cmov_1 equ 1 ; set 1 for cmov, but it's slower in some cases
                 ; set 0 for LOAD after "adc s, 0"
        cmp     t0, t1
  if use_cmov_1
        cmovb   t0, t1
        ; STORE t0, k
  endif
        adc     s, 0
  if use_cmov_1 eq 0
        LOAD    t0, s
  endif
        cmp     qq, t0
        jae     exit_label
  if 1 ; use_cmov_1 eq 0
        STORE   t0, k
  endif
        prefetch_macro
        mov     t0, [p + s * 8]
        mov     t1, [p + s * 8 + 4]
        mov     k, s
        add     s, s                 ; slower for some cpus
        ; lea   s, dword ptr [s + s] ; slower for some cpus
        ; shl   s, 1                 ; faster for some cpus
        ; lea   s, dword ptr [s * 2] ; faster for some cpus
  rept 0 ; 1000 for debug : 0 for normal
        ; number of calls in generate_stage : ~0.6 of the number of items
        shl     k, 0
  endm
endm
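
; One STEP_1 call is one level of the sift-down. A C-like sketch of the
; branchless path (use_cmov_1 == 1; illustrative only):
;   s += (p[s] < p[s + 1]);           ; cmp + adc : index of the larger child
;   if (qq >= p[s]) goto exit_label;  ; the item already fits here
;   p[k] = p[s];                      ; move the larger child up
;   t0 = p[s * 2]; t1 = p[s * 2 + 1]; ; preload the next child pair
;   k = s; s *= 2;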


STEP_2 macro exit_label, prefetch_macro
use_cmov_2 equ 0 ; set 1 for cmov, but it's slower in some cases
                 ; set 0 for LOAD after "adc s, 0"
        cmp     t0, t1
  if use_cmov_2
        mov     t2, t0
        cmovb   t2, t1
        ; STORE t2, k
  endif
        mov     t0, [p + s * 8]
        mov     t1, [p + s * 8 + 4]
        cmovb   t0, [p + s * 8 + 8]
        cmovb   t1, [p + s * 8 + 12]
        adc     s, 0
  if use_cmov_2 eq 0
        LOAD    t2, s
  endif
        cmp     qq, t2
        jae     exit_label
  if 1 ; use_cmov_2 eq 0
        STORE   t2, k
  endif
        prefetch_macro
        mov     k, s
        ; add   s, s
        ; lea   s, [s + s]
        shl     s, 1
        ; lea   s, [s * 2]
endm
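
; STEP_2 selects the next child pair with cmov loads instead of waiting on
; an address dependency. A C-like sketch (hypothetical local "right";
; illustrative only):
;   int right = (p[s] < p[s + 1]);  ; flags from "cmp t0, t1"
;   t0 = p[2 * (s + right)];        ; cmovb loads pick the winner's child pair
;   t1 = p[2 * (s + right) + 1];
;   s += right;
;   if (qq >= p[s]) goto exit_label;
;   p[k] = p[s]; k = s; s *= 2;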


MOVE_SMALLEST_UP macro STEP, use_prefetch, num_unrolls
        LOCAL exit_1, exit_2, leaves, opt_loop, last_nodes

        ; s == k * 2
        ; t0 == (p)[s]
        ; t1 == (p)[s + 1]
        cmp     k, next4_lim
        jae     leaves

  rept num_unrolls
        STEP    exit_2
        cmp     k, next4_lim
        jae     leaves
  endm

  if use_prefetch
        prefetch_macro equ PREFETCH_MY
        pref_lim_2 equ pref_lim
        ; lea   pref_lim, dword ptr [num_last + 1]
        ; shr   pref_lim, NUM_PREFETCH_LEVELS + 1
        cmp     k, pref_lim_2
        jae     last_nodes
  else
        prefetch_macro equ
        pref_lim_2 equ next4_lim
  endif

MY_ALIGN_16
opt_loop:
        STEP    exit_2, prefetch_macro
        cmp     k, pref_lim_2
        jb      opt_loop

last_nodes:
        ; k >= pref_lim_2
        ; 2 cases are possible:
        ; case-1: num_after_prefetch_levels == 0 && next4_lim == pref_lim_2
        ; case-2: num_after_prefetch_levels == NUM_PREFETCH_LEVELS - 1 &&
        ;         next4_lim = pref_lim_2 / (NUM_PREFETCH_LEVELS - 1)
  if use_prefetch
        yyy = NUM_PREFETCH_LEVELS - 1
    while yyy
        yyy = yyy - 1
        STEP    exit_2
      if yyy
        cmp     k, next4_lim
        jae     leaves
      endif
    endm
  endif

leaves:
        ; k >= next4_lim == (num_last + 1) / 4 must be provided by the previous code.
        ; we always have 2 nodes in the (s) level
        ; we can have some nodes in the (s * 2) level : low-probability case
        ; we have no nodes in the (s * 4) level
        ; s == k * 2
        ; t0 == (p)[s]
        ; t1 == (p)[s + 1]
        cmp     t0, t1
        cmovb   t0, t1
        adc     s, 0
        STORE   t0, k

        ; t0 == (p)[s]
        ; s / 2 == k : (s) is the index of the max item from (p)[k * 2], (p)[k * 2 + 1]
        ; we have 3 possible cases here:
        ;   s * 2 >  num_last : the (s) node has no children
        ;   s * 2 == num_last : the (s) node has 1 leaf child that is the last item of the array
        ;   s * 2 <  num_last : the (s) node has 2 leaf children. We guarantee (s * 4 > num_last)
        ; we check for (s * 2 > num_last) before the "cmp qq, t0" check, because
        ; we will replace the conditional jump with a cmov instruction later.
        lea     t1_r, dword ptr [s + s]
        cmp     t1_r, num_last
        ja      exit_1 ; if (s * 2 > num_last), we have no children : it's the high-probability branch

        ; it's the low-probability branch
        ; s * 2 <= num_last
        cmp     qq, t0
        jae     exit_2

        ; qq < t0, so we go to the next level
        ; we check 1 or 2 children in the next level
        mov     t0, [p + s * 8]
        mov     k, s
        mov     s, t1_r
        cmp     t1_r, num_last
        je      @F ; (s == num_last) means that we have a single child in the tree

        ; (s < num_last) : so we must read both children and select the max of them.
        mov     t1, [p + k * 8 + 4]
        cmp     t0, t1
        cmovb   t0, t1
        adc     s, 0
@@:
        STORE   t0, k
exit_1:
        ; t0 == (p)[s], s / 2 == k : (s) is the index of the max item from (p)[k * 2], (p)[k * 2 + 1]
        cmp     qq, t0
        cmovb   k, s
exit_2:
        STORE   qq, k
endm
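
; For reference: the whole sift-down above corresponds to the portable C loop
; below (a sketch in the spirit of 7-Zip's Sort.c, without the prefetch,
; unroll, and leaf-level specializations):
;   for (;;) {
;     size_t s = k * 2;
;     if (s > num_last) break;
;     if (s < num_last && p[s + 1] > p[s]) s++;
;     if (qq >= p[s]) break;
;     p[k] = p[s]; k = s;
;   }
;   p[k] = qq;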




ifdef Z7_SORT_ASM_USE_SEGMENT
; MY_ALIGN_64
else
  MY_ALIGN_16
endif

MY_PROC HeapSort, 2

if (IS_LINUX gt 0)
        mov     p, REG_ABI_PARAM_0        ; r1 <- r7 : linux
endif
        mov     num_last, REG_ABI_PARAM_1 ; r10 <- r6 : linux
                                          ; r10 <- r2 : win64
        cmp     num_last, 2
        jb      end_1

        ; MY_PUSH_PRESERVED_ABI_REGS
        MY_PUSH_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
        push    r12

        cmp     num_last, 4
        ja      sort_5

        LOAD    a0, 0
        LOAD    a1, 1
        SORT    a0, a1
        cmp     num_last, 3
        jb      end_2

        LOAD    a2, 2
        je      sort_3

        LOAD    a3, 3
        SORT    a2, a3
        SORT    a1, a3
        STORE   a3, 3
sort_3:
        SORT    a0, a2
        SORT    a1, a2
        STORE   a2, 2
        jmp     end_2
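
; The (num_last <= 4) path above is a small sorting network:
;   4 items : comparators (0,1) (2,3) (1,3) (0,2) (1,2)
;   3 items : comparators (0,1) (0,2) (1,2)
; a0 and a1 are stored by the shared end_2 tail below.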

sort_5:
        ; (num_last > 4) is required here
        ; if (num_last >= 6) : we will use the optimized loop for leaf nodes : loop_down_1
        mov     next4_lim, num_last
        shr     next4_lim, 2

        dec     num_last
        mov     k, num_last
        shr     k, 1
        mov     i, num_last
        shr     i, 2
        test    num_last, 1
        jnz     size_even

        ; ODD number of items. So we compare the parent with its single child
        LOAD    t1, num_last
        LOAD    t0, k
        SORT_2_WITH_TEMP_REG t1, t0, t2
        STORE   t1, num_last
        STORE   t0, k
        dec     k

size_even:
        cmp     k, i
        jbe     loop_down ; jump for the (num_last == 4) case

if 0 ; 1 for debug
        mov     r15, k
        mov     r14d, 1 ; 100
loop_benchmark:
endif
        ; optimized loop for leaf nodes:
        mov     t0, [p + k * 8]
        mov     t1, [p + k * 8 + 4]

MY_ALIGN_16
loop_down_1:
        ; we compare the parent with the max of its children:
        ; lea   s, dword ptr [2 * k]
        mov     s, k
        cmp     t0, t1
        cmovb   t0, t1
        adc     s, s
        LOAD    t2, k
        STORE   t0, k
        cmp     t2, t0
        cmovae  s, k
        dec     k
        ; we preload the next items before the STORE operation to the calculated address
        mov     t0, [p + k * 8]
        mov     t1, [p + k * 8 + 4]
        STORE   t2, s
        cmp     k, i
        jne     loop_down_1

if 0 ; 1 for debug
        mov     k, r15
        dec     r14d
        jnz     loop_benchmark
        ; jmp end_debug
endif
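
; One loop_down_1 iteration in C-like form (heapify for parents whose children
; are leaves; hypothetical locals "max"/"par"; the preloads for the next
; iteration are omitted here):
;   size_t s = k * 2 + (p[k * 2] < p[k * 2 + 1]); ; index of the larger child
;   UInt32 max = p[s], par = p[k];
;   p[k] = max;            ; store the larger child at the parent
;   if (par >= max) s = k; ; undo : the parent was already the max
;   p[s] = par;
;   k--;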

MY_ALIGN_16
loop_down:
        mov     t0, [p + i * 8]
        mov     t1, [p + i * 8 + 4]
        LOAD    qq, i
        mov     k, i
        lea     s, dword ptr [i + i]
        ; jmp end_debug
        DOWN_use_prefetch equ 0
        DOWN_num_unrolls equ 0
        MOVE_SMALLEST_UP STEP_1, DOWN_use_prefetch, DOWN_num_unrolls
        sub     i, 1
        jnb     loop_down

        ; jmp end_debug
        LOAD    e0, 0
        LOAD    e1, 1

        LEVEL_3_LIMIT equ 8 ; 8 is the default, but 7 also can work

        cmp     num_last, LEVEL_3_LIMIT + 1
        jb      main_loop_sort_5

MY_ALIGN_16
main_loop_sort:
        ; num_last > LEVEL_3_LIMIT
        ; p[size--] = p[0];
        LOAD    qq, num_last
        STORE   e0, num_last
        mov     e0, e1

        mov     next4_lim, num_last
        shr     next4_lim, 2
        mov     pref_lim, num_last
        shr     pref_lim, NUM_PREFETCH_LEVELS + 1

        dec     num_last
if 0 ; 1 for debug
        ; this optional optimization can improve the performance, if there are identical items in the array:
        ;   3x improvement    : if all items in the array are identical
        ;   20% improvement   : if items differ in 1 bit only
        ;   1-10% improvement : if items differ in (2+) bits
        ;   no gain           : if all items are different
        cmp     qq, e1
        jae     next_iter_main
endif
        LOAD    e1, 2
        LOAD    t0, 3
        mov     k_x, 2
        cmp     e1, t0
        cmovb   e1, t0
        mov     t0, [p + 4 * (4 + 0)]
        mov     t1, [p + 4 * (4 + 1)]
        cmovb   t0, [p + 4 * (4 + 2)]
        cmovb   t1, [p + 4 * (4 + 3)]
        adc     k_x, 0
        ; (qq <= e1), because the tree is correctly sorted
        ; also here we could check (qq >= e1) or (qq == e1) for a faster exit
        lea     s, dword ptr [k + k]
        MAIN_use_prefetch equ 1
        MAIN_num_unrolls equ 0
        MOVE_SMALLEST_UP STEP_2, MAIN_use_prefetch, MAIN_num_unrolls
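
; Each main_loop_sort iteration above, in C-like form (e0/e1 cache p[0]/p[1],
; which never touch memory inside the loop; illustrative only):
;   qq = p[num_last]; p[num_last] = e0; ; move the current max to its final place
;   e0 = e1;                            ; the old p[1] becomes the new front item
;   k  = 2 + (p[2] < p[3]);             ; cmp / cmovb / adc sequence
;   e1 = p[k];                          ; the new second item
;   t0 = p[2 * k]; t1 = p[2 * k + 1];   ; child pair selected with cmov loads
;   s  = 2 * k;                         ; then MOVE_SMALLEST_UP sifts qq down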

next_iter_main:
        cmp     num_last, LEVEL_3_LIMIT
        jne     main_loop_sort

        ; num_last == LEVEL_3_LIMIT
main_loop_sort_5:
        ; 4 <= num_last <= LEVEL_3_LIMIT
        ; p[size--] = p[0];
        LOAD    qq, num_last
        STORE   e0, num_last
        mov     e0, e1
        dec     num_last_x

        LOAD    e1, 2
        LOAD    t0, 3
        mov     k_x, 2
        cmp     e1, t0
        cmovb   e1, t0
        adc     k_x, 0

        lea     s_x, dword ptr [k * 2]
        cmp     s_x, num_last_x
        ja      exit_2

        mov     t0, [p + k * 8]
        je      exit_1

        ; s < num_last
        mov     t1, [p + k * 8 + 4]
        cmp     t0, t1
        cmovb   t0, t1
        adc     s_x, 0
exit_1:
        STORE   t0, k
        cmp     qq, t0
        cmovb   k_x, s_x
exit_2:
        STORE   qq, k
        cmp     num_last_x, 3
        jne     main_loop_sort_5

        ; num_last == 3 (real_size == 4)
        LOAD    a0, 2
        LOAD    a1, 3
        STORE   e1, 2
        STORE   e0, 3
        SORT    a0, a1
end_2:
        STORE   a0, 0
        STORE   a1, 1
; end_debug:
        ; MY_POP_PRESERVED_ABI_REGS
        pop     r12
        MY_POP_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
end_1:
MY_ENDP



else
; ------------ x86 32-bit ------------

ifdef x64
IS_CDECL = 0
endif

acc equ x0
k equ r0
k_x equ acc

p equ r1

num_last equ r2
num_last_x equ x2

a0 equ x3
t0 equ a0

a3 equ x5
i equ r5
e0 equ a3

a1 equ x6
qq equ a1

a2 equ x7
s equ r7
s_x equ a2


SORT macro b0, b1
        cmp     b1, b0
        jae     @F
  if 1
        xchg    b0, b1
  else
        mov     acc, b0
        mov     b0, b1  ; min
        mov     b1, acc ; max
  endif
@@:
endm

LOAD macro dest:req, index:req
        mov     dest, [p + 4 * index]
endm

STORE macro reg:req, index:req
        mov     [p + 4 * index], reg
endm


STEP_1 macro exit_label
        mov     t0, [p + k * 8]
        cmp     t0, [p + k * 8 + 4]
        adc     s, 0
        LOAD    t0, s
        STORE   t0, k ; we store ahead for the most expected branch
        cmp     qq, t0
        jae     exit_label
        ; STORE t0, k ; use if
        mov     k, s
        add     s, s
        ; lea   s, dword ptr [s + s]
        ; shl   s, 1
        ; lea   s, dword ptr [s * 2]
endm

STEP_BRANCH macro exit_label
        mov     t0, [p + k * 8]
        cmp     t0, [p + k * 8 + 4]
        jae     @F
        inc     s
        mov     t0, [p + k * 8 + 4]
@@:
        cmp     qq, t0
        jae     exit_label
        STORE   t0, k
        mov     k, s
        add     s, s
endm
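
; STEP_1 and STEP_BRANCH are the branchless (cmp/adc) and branchy variants of
; the same sift-down level. Note that the STORE in STEP_1 is speculative:
; p[k] is overwritten with the larger child even when "jae exit_label" is then
; taken; that is safe only because the exit label stores qq back to p[k].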



MOVE_SMALLEST_UP macro STEP, num_unrolls, exit_2
        LOCAL leaves, opt_loop, single

        ; s == k * 2
  rept num_unrolls
        cmp     s, num_last
        jae     leaves
        STEP_1  exit_2
  endm
        cmp     s, num_last
        jb      opt_loop

leaves:
        ; (s >= num_last)
        jne     exit_2
single:
        ; (s == num_last)
        mov     t0, [p + k * 8]
        cmp     qq, t0
        jae     exit_2
        STORE   t0, k
        mov     k, s
        jmp     exit_2

MY_ALIGN_16
opt_loop:
        STEP    exit_2
        cmp     s, num_last
        jb      opt_loop
        je      single
exit_2:
        STORE   qq, k
endm
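
; Loop-exit handling: after opt_loop, (s >= num_last). (s > num_last) means
; node k has no children left, so we exit. (s == num_last) means exactly one
; child (the last array item), which "single" compares once more before
; exit_2 finally stores qq at its place.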




ifdef Z7_SORT_ASM_USE_SEGMENT
; MY_ALIGN_64
else
  MY_ALIGN_16
endif

MY_PROC HeapSort, 2
  ifdef x64
    if (IS_LINUX gt 0)
        mov     num_last, REG_ABI_PARAM_1 ; r2 <- r6 : linux
        mov     p, REG_ABI_PARAM_0        ; r1 <- r7 : linux
    endif
  elseif (IS_CDECL gt 0)
        mov     num_last, [r4 + REG_SIZE * 2]
        mov     p, [r4 + REG_SIZE * 1]
  endif
        cmp     num_last, 2
        jb      end_1
        MY_PUSH_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11

        cmp     num_last, 4
        ja      sort_5

        LOAD    a0, 0
        LOAD    a1, 1
        SORT    a0, a1
        cmp     num_last, 3
        jb      end_2

        LOAD    a2, 2
        je      sort_3

        LOAD    a3, 3
        SORT    a2, a3
        SORT    a1, a3
        STORE   a3, 3
sort_3:
        SORT    a0, a2
        SORT    a1, a2
        STORE   a2, 2
        jmp     end_2

sort_5:
        ; num_last > 4
        lea     i, dword ptr [num_last - 2]
        dec     num_last
        test    i, 1
        jz      loop_down

        ; single child
        mov     t0, [p + num_last * 4]
        mov     qq, [p + num_last * 2]
        dec     i
        cmp     qq, t0
        jae     loop_down

        mov     [p + num_last * 2], t0
        mov     [p + num_last * 4], qq

MY_ALIGN_16
loop_down:
        mov     t0, [p + i * 4]
        cmp     t0, [p + i * 4 + 4]
        mov     k, i
        mov     qq, [p + i * 2]
        adc     k, 0
        LOAD    t0, k
        cmp     qq, t0
        jae     down_next
        mov     [p + i * 2], t0
        lea     s, dword ptr [k + k]

        DOWN_num_unrolls equ 0
        MOVE_SMALLEST_UP STEP_1, DOWN_num_unrolls, down_exit_label
down_next:
        sub     i, 2
        jnb     loop_down
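
; Note: in this build loop, i runs over even values and encodes 2 * parent :
; [p + i * 2] addresses the parent item (element i / 2) and [p + i * 4] its
; child pair (elements i, i + 1), so no separate parent index is needed.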
        ; jmp end_debug

        LOAD    e0, 0

MY_ALIGN_16
main_loop_sort:
        ; num_last > 3
        mov     t0, [p + 2 * 4]
        cmp     t0, [p + 3 * 4]
        LOAD    qq, num_last
        STORE   e0, num_last
        LOAD    e0, 1
        mov     s_x, 2
        mov     k_x, 1
        adc     s, 0
        LOAD    t0, s
        dec     num_last
        cmp     qq, t0
        jae     main_exit_label
        STORE   t0, 1
        mov     k, s
        add     s, s
  if 1
        ; for branch mode with data prefetch :
        ; it's faster for large arrays : larger than (1 << 13) items.
        MAIN_num_unrolls equ 10
        STEP_LOOP equ STEP_BRANCH
  else
        MAIN_num_unrolls equ 0
        STEP_LOOP equ STEP_1
  endif

        MOVE_SMALLEST_UP STEP_LOOP, MAIN_num_unrolls, main_exit_label
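
; Each main_loop_sort iteration above, in C-like form (only e0 caches p[0]
; here; illustrative only):
;   qq = p[num_last]; p[num_last] = e0;   ; move the current max to its final place
;   e0 = p[1];
;   k  = 1; s = 2 + (p[2] < p[3]);        ; cmp + adc picks the larger child
;   if (qq >= p[s]) goto main_exit_label; ; the exit label stores qq at p[k]
;   p[1] = p[s]; k = s; s *= 2;           ; then MOVE_SMALLEST_UP continues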

        ; jmp end_debug
        cmp     num_last, 3
        jne     main_loop_sort

        ; num_last == 3 (real_size == 4)
        LOAD    a0, 2
        LOAD    a1, 3
        LOAD    a2, 1
        STORE   e0, 3 ; e0 is an alias for a3
        STORE   a2, 2
        SORT    a0, a1
end_2:
        STORE   a0, 0
        STORE   a1, 1
; end_debug:
        MY_POP_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
end_1:
MY_ENDP

endif

ifdef Z7_SORT_ASM_USE_SEGMENT
_TEXT$Z7_SORT ENDS
endif

if 0
LEA_IS_D8 (R64) [R2 * 4 + 16]
    Lat : TP
      2 : 1   :            adl-e
      2 : 3   : p056       adl-p
      1 : 2   : p15        hsw-rocket
      1 : 2   : p01        snb-ivb
      1 : 1   : p1         conroe-wsm
      1 : 4   :            zen3,zen4
      2 : 4   :            zen1,zen2

LEA_B_IS (R64) [R2 + R3 * 4]
    Lat : TP
      1 : 1   :            adl-e
      2 : 3   : p056       adl-p
      1 : 2   : p15        hsw-rocket
      1 : 2   : p01        snb-ivb
      1 : 1   : p1         nhm-wsm
      1 : 1   : p0         conroe-wsm
      1 : 4   :            zen3,zen4
      2 : 2,4 :            zen1,zen2

LEA_B_IS_D8 (R64) [R2 + R3 * 4 + 16]
    Lat : TP
      2 : 1   :            adl-e
      2 : 3   : p056       adl-p
      1 : 2   : p15        ice-rocket
      3 : 1   : p1/p15     hsw-rocket
      3 : 1   : p01        snb-ivb
      1 : 1   : p1         nhm-wsm
      1 : 1   : p0         conroe-wsm
    2,1 : 2   :            zen3,zen4
      2 : 2   :            zen1,zen2

CMOVB (R64, R64)
    Lat : TP
    1,2 : 2   :            adl-e
      1 : 2   : p06        adl-p
      1 : 2   : p06        bwd-rocket
    1,2 : 2   : p0156+p06  hsw
    1,2 : 1.5 : p015+p05   snb-ivb
    1,2 : 1   : p015+p05   nhm
      1 : 1   : 2*p015     conroe
      1 : 2   :            zen3,zen4
      1 : 4   :            zen1,zen2

ADC (R64, 0)
    Lat : TP
    1,2 : 2   :            adl-e
      1 : 2   : p06        adl-p
      1 : 2   : p06        bwd-rocket
      1 : 1.5 : p0156+p06  hsw
      1 : 1.5 : p015+p05   snb-ivb
      2 : 1   : 2*p015     conroe-wsm
      1 : 2   :            zen1,zen2,zen3,zen4

PREFETCHNTA : fetch data into non-temporal cache close to the processor, minimizing cache pollution.
    L1 : Pentium 3
    L2 : NetBurst
    L1, not L2 : Core Duo, Core 2, Atom processors
    L1, not L2, may fetch into L3 with fast replacement : Nehalem, Westmere, Sandy Bridge, ...
    NEHALEM : fills L1/L3, L1 LRU is not updated
    L3 with fast replacement : Xeon processors based on Nehalem, Westmere, Sandy Bridge, ...
PREFETCHT0 : fetch data into all cache levels.
PREFETCHT1 : fetch data into L2 and L3.
endif

end