diff options
Diffstat (limited to 'Asm/x86/Sort.asm')
| -rw-r--r-- | Asm/x86/Sort.asm | 860 |
1 files changed, 860 insertions, 0 deletions
diff --git a/Asm/x86/Sort.asm b/Asm/x86/Sort.asm new file mode 100644 index 0000000..517c615 --- /dev/null +++ b/Asm/x86/Sort.asm | |||
| @@ -0,0 +1,860 @@ | |||
| 1 | ; SortTest.asm -- ASM version of HeapSort() function | ||
| 2 | ; Igor Pavlov : Public domain | ||
| 3 | |||
| 4 | include ../../../../Asm/x86/7zAsm.asm | ||
| 5 | |||
| 6 | MY_ASM_START | ||
| 7 | |||
| 8 | ifndef Z7_SORT_ASM_USE_SEGMENT | ||
| 9 | if (IS_LINUX gt 0) | ||
| 10 | ; Z7_SORT_ASM_USE_SEGMENT equ 1 | ||
| 11 | else | ||
| 12 | ; Z7_SORT_ASM_USE_SEGMENT equ 1 | ||
| 13 | endif | ||
| 14 | endif | ||
| 15 | |||
| 16 | ifdef Z7_SORT_ASM_USE_SEGMENT | ||
| 17 | _TEXT$Z7_SORT SEGMENT ALIGN(64) 'CODE' | ||
| 18 | MY_ALIGN macro num:req | ||
| 19 | align num | ||
| 20 | endm | ||
| 21 | else | ||
| 22 | MY_ALIGN macro num:req | ||
| 23 | ; We expect that ".text" is aligned for 16-bytes. | ||
| 24 | ; So we don't need large alignment inside our function. | ||
| 25 | align 16 | ||
| 26 | endm | ||
| 27 | endif | ||
| 28 | |||
| 29 | |||
| 30 | MY_ALIGN_16 macro | ||
| 31 | MY_ALIGN 16 | ||
| 32 | endm | ||
| 33 | |||
| 34 | MY_ALIGN_32 macro | ||
| 35 | MY_ALIGN 32 | ||
| 36 | endm | ||
| 37 | |||
| 38 | MY_ALIGN_64 macro | ||
| 39 | MY_ALIGN 64 | ||
| 40 | endm | ||
| 41 | |||
| 42 | ifdef x64 | ||
| 43 | |||
| 44 | NUM_PREFETCH_LEVELS equ 3 ; to prefetch 1x 64-bytes line (is good for most cases) | ||
| 45 | ; NUM_PREFETCH_LEVELS equ 4 ; to prefetch 2x 64-bytes lines (better for big arrays) | ||
| 46 | |||
| 47 | acc equ x0 | ||
| 48 | k equ r0 | ||
| 49 | k_x equ x0 | ||
| 50 | |||
| 51 | p equ r1 | ||
| 52 | |||
| 53 | s equ r2 | ||
| 54 | s_x equ x2 | ||
| 55 | |||
| 56 | a0 equ x3 | ||
| 57 | t0 equ a0 | ||
| 58 | |||
| 59 | a3 equ x5 | ||
| 60 | qq equ a3 | ||
| 61 | |||
| 62 | a1 equ x6 | ||
| 63 | t1 equ a1 | ||
| 64 | t1_r equ r6 | ||
| 65 | |||
| 66 | a2 equ x7 | ||
| 67 | t2 equ a2 | ||
| 68 | |||
| 69 | i equ r8 | ||
| 70 | e0 equ x8 | ||
| 71 | |||
| 72 | e1 equ x9 | ||
| 73 | |||
| 74 | num_last equ r10 | ||
| 75 | num_last_x equ x10 | ||
| 76 | |||
| 77 | next4_lim equ r11 | ||
| 78 | pref_lim equ r12 | ||
| 79 | |||
| 80 | |||
| 81 | |||
| 82 | SORT_2_WITH_TEMP_REG macro b0, b1, temp_reg | ||
| 83 | mov temp_reg, b0 | ||
| 84 | cmp b0, b1 | ||
| 85 | cmovae b0, b1 ; min | ||
| 86 | cmovae b1, temp_reg ; max | ||
| 87 | endm | ||
| 88 | |||
| 89 | SORT macro b0, b1 | ||
| 90 | SORT_2_WITH_TEMP_REG b0, b1, acc | ||
| 91 | endm | ||
| 92 | |||
| 93 | LOAD macro dest:req, index:req | ||
| 94 | mov dest, [p + 4 * index] | ||
| 95 | endm | ||
| 96 | |||
| 97 | STORE macro reg:req, index:req | ||
| 98 | mov [p + 4 * index], reg | ||
| 99 | endm | ||
| 100 | |||
| 101 | |||
| 102 | if (NUM_PREFETCH_LEVELS gt 3) | ||
| 103 | num_prefetches equ (1 SHL (NUM_PREFETCH_LEVELS - 3)) | ||
| 104 | else | ||
| 105 | num_prefetches equ 1 | ||
| 106 | endif | ||
| 107 | |||
| 108 | PREFETCH_OP macro offs | ||
| 109 | cur_offset = 7 * 4 ; it's average offset in 64-bytes cache line. | ||
| 110 | ; cur_offset = 0 ; we can use zero offset, if we are sure that array is aligned for 64-bytes. | ||
| 111 | rept num_prefetches | ||
| 112 | if 1 | ||
| 113 | prefetcht0 byte ptr [p + offs + cur_offset] | ||
| 114 | else | ||
| 115 | mov pref_x, dword ptr [p + offs + cur_offset] | ||
| 116 | endif | ||
| 117 | cur_offset = cur_offset + 64 | ||
| 118 | endm | ||
| 119 | endm | ||
| 120 | |||
| 121 | PREFETCH_MY macro | ||
| 122 | if 1 | ||
| 123 | if 1 | ||
| 124 | shl k, NUM_PREFETCH_LEVELS + 3 | ||
| 125 | else | ||
| 126 | ; we delay prefetch instruction to improve main loads | ||
| 127 | shl k, NUM_PREFETCH_LEVELS | ||
| 128 | shl k, 3 | ||
| 129 | ; shl k, 0 | ||
| 130 | endif | ||
| 131 | PREFETCH_OP k | ||
| 132 | elseif 1 | ||
| 133 | shl k, 3 | ||
| 134 | PREFETCH_OP k * (1 SHL NUM_PREFETCH_LEVELS) ; change it | ||
| 135 | endif | ||
| 136 | endm | ||
| 137 | |||
| 138 | |||
| 139 | STEP_1 macro exit_label, prefetch_macro | ||
| 140 | use_cmov_1 equ 1 ; set 1 for cmov, but it's slower in some cases | ||
| 141 | ; set 0 for LOAD after adc s, 0 | ||
| 142 | cmp t0, t1 | ||
| 143 | if use_cmov_1 | ||
| 144 | cmovb t0, t1 | ||
| 145 | ; STORE t0, k | ||
| 146 | endif | ||
| 147 | adc s, 0 | ||
| 148 | if use_cmov_1 eq 0 | ||
| 149 | LOAD t0, s | ||
| 150 | endif | ||
| 151 | cmp qq, t0 | ||
| 152 | jae exit_label | ||
| 153 | if 1 ; use_cmov_1 eq 0 | ||
| 154 | STORE t0, k | ||
| 155 | endif | ||
| 156 | prefetch_macro | ||
| 157 | mov t0, [p + s * 8] | ||
| 158 | mov t1, [p + s * 8 + 4] | ||
| 159 | mov k, s | ||
| 160 | add s, s ; slower for some cpus | ||
| 161 | ; lea s, dword ptr [s + s] ; slower for some cpus | ||
| 162 | ; shl s, 1 ; faster for some cpus | ||
| 163 | ; lea s, dword ptr [s * 2] ; faster for some cpus | ||
| 164 | rept 0 ; 1000 for debug : 0 for normal | ||
| 165 | ; number of calls in generate_stage : ~0.6 of number of items | ||
| 166 | shl k, 0 | ||
| 167 | endm | ||
| 168 | endm | ||
| 169 | |||
| 170 | |||
| 171 | STEP_2 macro exit_label, prefetch_macro | ||
| 172 | use_cmov_2 equ 0 ; set 1 for cmov, but it's slower in some cases | ||
| 173 | ; set 0 for LOAD after adc s, 0 | ||
| 174 | cmp t0, t1 | ||
| 175 | if use_cmov_2 | ||
| 176 | mov t2, t0 | ||
| 177 | cmovb t2, t1 | ||
| 178 | ; STORE t2, k | ||
| 179 | endif | ||
| 180 | mov t0, [p + s * 8] | ||
| 181 | mov t1, [p + s * 8 + 4] | ||
| 182 | cmovb t0, [p + s * 8 + 8] | ||
| 183 | cmovb t1, [p + s * 8 + 12] | ||
| 184 | adc s, 0 | ||
| 185 | if use_cmov_2 eq 0 | ||
| 186 | LOAD t2, s | ||
| 187 | endif | ||
| 188 | cmp qq, t2 | ||
| 189 | jae exit_label | ||
| 190 | if 1 ; use_cmov_2 eq 0 | ||
| 191 | STORE t2, k | ||
| 192 | endif | ||
| 193 | prefetch_macro | ||
| 194 | mov k, s | ||
| 195 | ; add s, s | ||
| 196 | ; lea s, [s + s] | ||
| 197 | shl s, 1 | ||
| 198 | ; lea s, [s * 2] | ||
| 199 | endm | ||
| 200 | |||
| 201 | |||
| 202 | MOVE_SMALLEST_UP macro STEP, use_prefetch, num_unrolls | ||
| 203 | LOCAL exit_1, exit_2, leaves, opt_loop, last_nodes | ||
| 204 | |||
| 205 | ; s == k * 2 | ||
| 206 | ; t0 == (p)[s] | ||
| 207 | ; t1 == (p)[s + 1] | ||
| 208 | cmp k, next4_lim | ||
| 209 | jae leaves | ||
| 210 | |||
| 211 | rept num_unrolls | ||
| 212 | STEP exit_2 | ||
| 213 | cmp k, next4_lim | ||
| 214 | jae leaves | ||
| 215 | endm | ||
| 216 | |||
| 217 | if use_prefetch | ||
| 218 | prefetch_macro equ PREFETCH_MY | ||
| 219 | pref_lim_2 equ pref_lim | ||
| 220 | ; lea pref_lim, dword ptr [num_last + 1] | ||
| 221 | ; shr pref_lim, NUM_PREFETCH_LEVELS + 1 | ||
| 222 | cmp k, pref_lim_2 | ||
| 223 | jae last_nodes | ||
| 224 | else | ||
| 225 | prefetch_macro equ | ||
| 226 | pref_lim_2 equ next4_lim | ||
| 227 | endif | ||
| 228 | |||
| 229 | MY_ALIGN_16 | ||
| 230 | opt_loop: | ||
| 231 | STEP exit_2, prefetch_macro | ||
| 232 | cmp k, pref_lim_2 | ||
| 233 | jb opt_loop | ||
| 234 | |||
| 235 | last_nodes: | ||
| 236 | ; k >= pref_lim_2 | ||
| 237 | ; 2 cases are possible: | ||
| 238 | ; case-1: num_after_prefetch_levels == 0 && next4_lim = pref_lim_2 | ||
| 239 | ; case-2: num_after_prefetch_levels == NUM_PREFETCH_LEVELS - 1 && | ||
| 240 | ; next4_lim = pref_lim_2 / (NUM_PREFETCH_LEVELS - 1) | ||
| 241 | if use_prefetch | ||
| 242 | yyy = NUM_PREFETCH_LEVELS - 1 | ||
| 243 | while yyy | ||
| 244 | yyy = yyy - 1 | ||
| 245 | STEP exit_2 | ||
| 246 | if yyy | ||
| 247 | cmp k, next4_lim | ||
| 248 | jae leaves | ||
| 249 | endif | ||
| 250 | endm | ||
| 251 | endif | ||
| 252 | |||
| 253 | leaves: | ||
| 254 | ; k >= next4_lim == (num_last + 1) / 4 must be provided by previous code. | ||
| 255 | ; we have 2 nodes in (s) level : always | ||
| 256 | ; we can have some nodes in (s * 2) level : low probability case | ||
| 257 | ; we have no nodes in (s * 4) level | ||
| 258 | ; s == k * 2 | ||
| 259 | ; t0 == (p)[s] | ||
| 260 | ; t1 == (p)[s + 1] | ||
| 261 | cmp t0, t1 | ||
| 262 | cmovb t0, t1 | ||
| 263 | adc s, 0 | ||
| 264 | STORE t0, k | ||
| 265 | |||
| 266 | ; t0 == (p)[s] | ||
| 267 | ; s / 2 == k : (s) is index of max item from (p)[k * 2], (p)[k * 2 + 1] | ||
| 268 | ; we have 3 possible cases here: | ||
| 269 | ; s * 2 > num_last : (s) node has no childs | ||
| 270 | ; s * 2 == num_last : (s) node has 1 leaf child that is last item of array | ||
| 271 | ; s * 2 < num_last : (s) node has 2 leaf childs. We provide (s * 4 > num_last) | ||
| 272 | ; we check for (s * 2 > num_last) before "cmp qq, t0" check, because | ||
| 273 | ; we will replace conditional jump with cmov instruction later. | ||
| 274 | lea t1_r, dword ptr [s + s] | ||
| 275 | cmp t1_r, num_last | ||
| 276 | ja exit_1 ; if (s * 2 > num_last), we have no childs : it's high probability branch | ||
| 277 | |||
| 278 | ; it's low probability branch | ||
| 279 | ; s * 2 <= num_last | ||
| 280 | cmp qq, t0 | ||
| 281 | jae exit_2 | ||
| 282 | |||
| 283 | ; qq < t0, so we go to next level | ||
| 284 | ; we check 1 or 2 childs in next level | ||
| 285 | mov t0, [p + s * 8] | ||
| 286 | mov k, s | ||
| 287 | mov s, t1_r | ||
| 288 | cmp t1_r, num_last | ||
| 289 | je @F ; (s == num_last) means that we have single child in tree | ||
| 290 | |||
| 291 | ; (s < num_last) : so we must read both childs and select max of them. | ||
| 292 | mov t1, [p + k * 8 + 4] | ||
| 293 | cmp t0, t1 | ||
| 294 | cmovb t0, t1 | ||
| 295 | adc s, 0 | ||
| 296 | @@: | ||
| 297 | STORE t0, k | ||
| 298 | exit_1: | ||
| 299 | ; t0 == (p)[s], s / 2 == k : (s) is index of max item from (p)[k * 2], (p)[k * 2 + 1] | ||
| 300 | cmp qq, t0 | ||
| 301 | cmovb k, s | ||
| 302 | exit_2: | ||
| 303 | STORE qq, k | ||
| 304 | endm | ||
| 305 | |||
| 306 | |||
| 307 | |||
| 308 | |||
| 309 | ifdef Z7_SORT_ASM_USE_SEGMENT | ||
| 310 | ; MY_ALIGN_64 | ||
| 311 | else | ||
| 312 | MY_ALIGN_16 | ||
| 313 | endif | ||
| 314 | |||
| 315 | MY_PROC HeapSort, 2 | ||
| 316 | |||
| 317 | if (IS_LINUX gt 0) | ||
| 318 | mov p, REG_ABI_PARAM_0 ; r1 <- r7 : linux | ||
| 319 | endif | ||
| 320 | mov num_last, REG_ABI_PARAM_1 ; r10 <- r6 : linux | ||
| 321 | ; r10 <- r2 : win64 | ||
| 322 | cmp num_last, 2 | ||
| 323 | jb end_1 | ||
| 324 | |||
| 325 | ; MY_PUSH_PRESERVED_ABI_REGS | ||
| 326 | MY_PUSH_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11 | ||
| 327 | push r12 | ||
| 328 | |||
| 329 | cmp num_last, 4 | ||
| 330 | ja sort_5 | ||
| 331 | |||
| 332 | LOAD a0, 0 | ||
| 333 | LOAD a1, 1 | ||
| 334 | SORT a0, a1 | ||
| 335 | cmp num_last, 3 | ||
| 336 | jb end_2 | ||
| 337 | |||
| 338 | LOAD a2, 2 | ||
| 339 | je sort_3 | ||
| 340 | |||
| 341 | LOAD a3, 3 | ||
| 342 | SORT a2, a3 | ||
| 343 | SORT a1, a3 | ||
| 344 | STORE a3, 3 | ||
| 345 | sort_3: | ||
| 346 | SORT a0, a2 | ||
| 347 | SORT a1, a2 | ||
| 348 | STORE a2, 2 | ||
| 349 | jmp end_2 | ||
| 350 | |||
| 351 | sort_5: | ||
| 352 | ; (num_last > 4) is required here | ||
| 353 | ; if (num_last >= 6) : we will use optimized loop for leaf nodes loop_down_1 | ||
| 354 | mov next4_lim, num_last | ||
| 355 | shr next4_lim, 2 | ||
| 356 | |||
| 357 | dec num_last | ||
| 358 | mov k, num_last | ||
| 359 | shr k, 1 | ||
| 360 | mov i, num_last | ||
| 361 | shr i, 2 | ||
| 362 | test num_last, 1 | ||
| 363 | jnz size_even | ||
| 364 | |||
| 365 | ; ODD number of items. So we compare parent with single child | ||
| 366 | LOAD t1, num_last | ||
| 367 | LOAD t0, k | ||
| 368 | SORT_2_WITH_TEMP_REG t1, t0, t2 | ||
| 369 | STORE t1, num_last | ||
| 370 | STORE t0, k | ||
| 371 | dec k | ||
| 372 | |||
| 373 | size_even: | ||
| 374 | cmp k, i | ||
| 375 | jbe loop_down ; jump for num_last == 4 case | ||
| 376 | |||
| 377 | if 0 ; 1 for debug | ||
| 378 | mov r15, k | ||
| 379 | mov r14d, 1 ; 100 | ||
| 380 | loop_benchmark: | ||
| 381 | endif | ||
| 382 | ; optimized loop for leaf nodes: | ||
| 383 | mov t0, [p + k * 8] | ||
| 384 | mov t1, [p + k * 8 + 4] | ||
| 385 | |||
| 386 | MY_ALIGN_16 | ||
| 387 | loop_down_1: | ||
| 388 | ; we compare parent with max of childs: | ||
| 389 | ; lea s, dword ptr [2 * k] | ||
| 390 | mov s, k | ||
| 391 | cmp t0, t1 | ||
| 392 | cmovb t0, t1 | ||
| 393 | adc s, s | ||
| 394 | LOAD t2, k | ||
| 395 | STORE t0, k | ||
| 396 | cmp t2, t0 | ||
| 397 | cmovae s, k | ||
| 398 | dec k | ||
| 399 | ; we preload next items before STORE operation for calculated address | ||
| 400 | mov t0, [p + k * 8] | ||
| 401 | mov t1, [p + k * 8 + 4] | ||
| 402 | STORE t2, s | ||
| 403 | cmp k, i | ||
| 404 | jne loop_down_1 | ||
| 405 | |||
| 406 | if 0 ; 1 for debug | ||
| 407 | mov k, r15 | ||
| 408 | dec r14d | ||
| 409 | jnz loop_benchmark | ||
| 410 | ; jmp end_debug | ||
| 411 | endif | ||
| 412 | |||
| 413 | MY_ALIGN_16 | ||
| 414 | loop_down: | ||
| 415 | mov t0, [p + i * 8] | ||
| 416 | mov t1, [p + i * 8 + 4] | ||
| 417 | LOAD qq, i | ||
| 418 | mov k, i | ||
| 419 | lea s, dword ptr [i + i] | ||
| 420 | ; jmp end_debug | ||
| 421 | DOWN_use_prefetch equ 0 | ||
| 422 | DOWN_num_unrolls equ 0 | ||
| 423 | MOVE_SMALLEST_UP STEP_1, DOWN_use_prefetch, DOWN_num_unrolls | ||
| 424 | sub i, 1 | ||
| 425 | jnb loop_down | ||
| 426 | |||
| 427 | ; jmp end_debug | ||
| 428 | LOAD e0, 0 | ||
| 429 | LOAD e1, 1 | ||
| 430 | |||
| 431 | LEVEL_3_LIMIT equ 8 ; 8 is default, but 7 also can work | ||
| 432 | |||
| 433 | cmp num_last, LEVEL_3_LIMIT + 1 | ||
| 434 | jb main_loop_sort_5 | ||
| 435 | |||
| 436 | MY_ALIGN_16 | ||
| 437 | main_loop_sort: | ||
| 438 | ; num_last > LEVEL_3_LIMIT | ||
| 439 | ; p[size--] = p[0]; | ||
| 440 | LOAD qq, num_last | ||
| 441 | STORE e0, num_last | ||
| 442 | mov e0, e1 | ||
| 443 | |||
| 444 | mov next4_lim, num_last | ||
| 445 | shr next4_lim, 2 | ||
| 446 | mov pref_lim, num_last | ||
| 447 | shr pref_lim, NUM_PREFETCH_LEVELS + 1 | ||
| 448 | |||
| 449 | dec num_last | ||
| 450 | if 0 ; 1 for debug | ||
| 451 | ; that optional optimization can improve the performance, if there are identical items in array | ||
| 452 | ; 3 times improvement : if all items in array are identical | ||
| 453 | ; 20% improvement : if items are different for 1 bit only | ||
| 454 | ; 1-10% improvement : if items are different for (2+) bits | ||
| 455 | ; no gain : if items are different | ||
| 456 | cmp qq, e1 | ||
| 457 | jae next_iter_main | ||
| 458 | endif | ||
| 459 | LOAD e1, 2 | ||
| 460 | LOAD t0, 3 | ||
| 461 | mov k_x, 2 | ||
| 462 | cmp e1, t0 | ||
| 463 | cmovb e1, t0 | ||
| 464 | mov t0, [p + 4 * (4 + 0)] | ||
| 465 | mov t1, [p + 4 * (4 + 1)] | ||
| 466 | cmovb t0, [p + 4 * (4 + 2)] | ||
| 467 | cmovb t1, [p + 4 * (4 + 3)] | ||
| 468 | adc k_x, 0 | ||
| 469 | ; (qq <= e1), because the tree is correctly sorted | ||
| 470 | ; also here we could check (qq >= e1) or (qq == e1) for faster exit | ||
| 471 | lea s, dword ptr [k + k] | ||
| 472 | MAIN_use_prefetch equ 1 | ||
| 473 | MAIN_num_unrolls equ 0 | ||
| 474 | MOVE_SMALLEST_UP STEP_2, MAIN_use_prefetch, MAIN_num_unrolls | ||
| 475 | |||
| 476 | next_iter_main: | ||
| 477 | cmp num_last, LEVEL_3_LIMIT | ||
| 478 | jne main_loop_sort | ||
| 479 | |||
| 480 | ; num_last == LEVEL_3_LIMIT | ||
| 481 | main_loop_sort_5: | ||
| 482 | ; 4 <= num_last <= LEVEL_3_LIMIT | ||
| 483 | ; p[size--] = p[0]; | ||
| 484 | LOAD qq, num_last | ||
| 485 | STORE e0, num_last | ||
| 486 | mov e0, e1 | ||
| 487 | dec num_last_x | ||
| 488 | |||
| 489 | LOAD e1, 2 | ||
| 490 | LOAD t0, 3 | ||
| 491 | mov k_x, 2 | ||
| 492 | cmp e1, t0 | ||
| 493 | cmovb e1, t0 | ||
| 494 | adc k_x, 0 | ||
| 495 | |||
| 496 | lea s_x, dword ptr [k * 2] | ||
| 497 | cmp s_x, num_last_x | ||
| 498 | ja exit_2 | ||
| 499 | |||
| 500 | mov t0, [p + k * 8] | ||
| 501 | je exit_1 | ||
| 502 | |||
| 503 | ; s < num_last | ||
| 504 | mov t1, [p + k * 8 + 4] | ||
| 505 | cmp t0, t1 | ||
| 506 | cmovb t0, t1 | ||
| 507 | adc s_x, 0 | ||
| 508 | exit_1: | ||
| 509 | STORE t0, k | ||
| 510 | cmp qq, t0 | ||
| 511 | cmovb k_x, s_x | ||
| 512 | exit_2: | ||
| 513 | STORE qq, k | ||
| 514 | cmp num_last_x, 3 | ||
| 515 | jne main_loop_sort_5 | ||
| 516 | |||
| 517 | ; num_last == 3 (real_size == 4) | ||
| 518 | LOAD a0, 2 | ||
| 519 | LOAD a1, 3 | ||
| 520 | STORE e1, 2 | ||
| 521 | STORE e0, 3 | ||
| 522 | SORT a0, a1 | ||
| 523 | end_2: | ||
| 524 | STORE a0, 0 | ||
| 525 | STORE a1, 1 | ||
| 526 | ; end_debug: | ||
| 527 | ; MY_POP_PRESERVED_ABI_REGS | ||
| 528 | pop r12 | ||
| 529 | MY_POP_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11 | ||
| 530 | end_1: | ||
| 531 | MY_ENDP | ||
| 532 | |||
| 533 | |||
| 534 | |||
| 535 | else | ||
| 536 | ; ------------ x86 32-bit ------------ | ||
| 537 | |||
| 538 | ifdef x64 | ||
| 539 | IS_CDECL = 0 | ||
| 540 | endif | ||
| 541 | |||
| 542 | acc equ x0 | ||
| 543 | k equ r0 | ||
| 544 | k_x equ acc | ||
| 545 | |||
| 546 | p equ r1 | ||
| 547 | |||
| 548 | num_last equ r2 | ||
| 549 | num_last_x equ x2 | ||
| 550 | |||
| 551 | a0 equ x3 | ||
| 552 | t0 equ a0 | ||
| 553 | |||
| 554 | a3 equ x5 | ||
| 555 | i equ r5 | ||
| 556 | e0 equ a3 | ||
| 557 | |||
| 558 | a1 equ x6 | ||
| 559 | qq equ a1 | ||
| 560 | |||
| 561 | a2 equ x7 | ||
| 562 | s equ r7 | ||
| 563 | s_x equ a2 | ||
| 564 | |||
| 565 | |||
| 566 | SORT macro b0, b1 | ||
| 567 | cmp b1, b0 | ||
| 568 | jae @F | ||
| 569 | if 1 | ||
| 570 | xchg b0, b1 | ||
| 571 | else | ||
| 572 | mov acc, b0 | ||
| 573 | mov b0, b1 ; min | ||
| 574 | mov b1, acc ; max | ||
| 575 | endif | ||
| 576 | @@: | ||
| 577 | endm | ||
| 578 | |||
| 579 | LOAD macro dest:req, index:req | ||
| 580 | mov dest, [p + 4 * index] | ||
| 581 | endm | ||
| 582 | |||
| 583 | STORE macro reg:req, index:req | ||
| 584 | mov [p + 4 * index], reg | ||
| 585 | endm | ||
| 586 | |||
| 587 | |||
| 588 | STEP_1 macro exit_label | ||
| 589 | mov t0, [p + k * 8] | ||
| 590 | cmp t0, [p + k * 8 + 4] | ||
| 591 | adc s, 0 | ||
| 592 | LOAD t0, s | ||
| 593 | STORE t0, k ; we lookahed stooring for most expected branch | ||
| 594 | cmp qq, t0 | ||
| 595 | jae exit_label | ||
| 596 | ; STORE t0, k ; use if | ||
| 597 | mov k, s | ||
| 598 | add s, s | ||
| 599 | ; lea s, dword ptr [s + s] | ||
| 600 | ; shl s, 1 | ||
| 601 | ; lea s, dword ptr [s * 2] | ||
| 602 | endm | ||
| 603 | |||
| 604 | STEP_BRANCH macro exit_label | ||
| 605 | mov t0, [p + k * 8] | ||
| 606 | cmp t0, [p + k * 8 + 4] | ||
| 607 | jae @F | ||
| 608 | inc s | ||
| 609 | mov t0, [p + k * 8 + 4] | ||
| 610 | @@: | ||
| 611 | cmp qq, t0 | ||
| 612 | jae exit_label | ||
| 613 | STORE t0, k | ||
| 614 | mov k, s | ||
| 615 | add s, s | ||
| 616 | endm | ||
| 617 | |||
| 618 | |||
| 619 | |||
| 620 | MOVE_SMALLEST_UP macro STEP, num_unrolls, exit_2 | ||
| 621 | LOCAL leaves, opt_loop, single | ||
| 622 | |||
| 623 | ; s == k * 2 | ||
| 624 | rept num_unrolls | ||
| 625 | cmp s, num_last | ||
| 626 | jae leaves | ||
| 627 | STEP_1 exit_2 | ||
| 628 | endm | ||
| 629 | cmp s, num_last | ||
| 630 | jb opt_loop | ||
| 631 | |||
| 632 | leaves: | ||
| 633 | ; (s >= num_last) | ||
| 634 | jne exit_2 | ||
| 635 | single: | ||
| 636 | ; (s == num_last) | ||
| 637 | mov t0, [p + k * 8] | ||
| 638 | cmp qq, t0 | ||
| 639 | jae exit_2 | ||
| 640 | STORE t0, k | ||
| 641 | mov k, s | ||
| 642 | jmp exit_2 | ||
| 643 | |||
| 644 | MY_ALIGN_16 | ||
| 645 | opt_loop: | ||
| 646 | STEP exit_2 | ||
| 647 | cmp s, num_last | ||
| 648 | jb opt_loop | ||
| 649 | je single | ||
| 650 | exit_2: | ||
| 651 | STORE qq, k | ||
| 652 | endm | ||
| 653 | |||
| 654 | |||
| 655 | |||
| 656 | |||
| 657 | ifdef Z7_SORT_ASM_USE_SEGMENT | ||
| 658 | ; MY_ALIGN_64 | ||
| 659 | else | ||
| 660 | MY_ALIGN_16 | ||
| 661 | endif | ||
| 662 | |||
| 663 | MY_PROC HeapSort, 2 | ||
| 664 | ifdef x64 | ||
| 665 | if (IS_LINUX gt 0) | ||
| 666 | mov num_last, REG_ABI_PARAM_1 ; r2 <- r6 : linux | ||
| 667 | mov p, REG_ABI_PARAM_0 ; r1 <- r7 : linux | ||
| 668 | endif | ||
| 669 | elseif (IS_CDECL gt 0) | ||
| 670 | mov num_last, [r4 + REG_SIZE * 2] | ||
| 671 | mov p, [r4 + REG_SIZE * 1] | ||
| 672 | endif | ||
| 673 | cmp num_last, 2 | ||
| 674 | jb end_1 | ||
| 675 | MY_PUSH_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11 | ||
| 676 | |||
| 677 | cmp num_last, 4 | ||
| 678 | ja sort_5 | ||
| 679 | |||
| 680 | LOAD a0, 0 | ||
| 681 | LOAD a1, 1 | ||
| 682 | SORT a0, a1 | ||
| 683 | cmp num_last, 3 | ||
| 684 | jb end_2 | ||
| 685 | |||
| 686 | LOAD a2, 2 | ||
| 687 | je sort_3 | ||
| 688 | |||
| 689 | LOAD a3, 3 | ||
| 690 | SORT a2, a3 | ||
| 691 | SORT a1, a3 | ||
| 692 | STORE a3, 3 | ||
| 693 | sort_3: | ||
| 694 | SORT a0, a2 | ||
| 695 | SORT a1, a2 | ||
| 696 | STORE a2, 2 | ||
| 697 | jmp end_2 | ||
| 698 | |||
| 699 | sort_5: | ||
| 700 | ; num_last > 4 | ||
| 701 | lea i, dword ptr [num_last - 2] | ||
| 702 | dec num_last | ||
| 703 | test i, 1 | ||
| 704 | jz loop_down | ||
| 705 | |||
| 706 | ; single child | ||
| 707 | mov t0, [p + num_last * 4] | ||
| 708 | mov qq, [p + num_last * 2] | ||
| 709 | dec i | ||
| 710 | cmp qq, t0 | ||
| 711 | jae loop_down | ||
| 712 | |||
| 713 | mov [p + num_last * 2], t0 | ||
| 714 | mov [p + num_last * 4], qq | ||
| 715 | |||
| 716 | MY_ALIGN_16 | ||
| 717 | loop_down: | ||
| 718 | mov t0, [p + i * 4] | ||
| 719 | cmp t0, [p + i * 4 + 4] | ||
| 720 | mov k, i | ||
| 721 | mov qq, [p + i * 2] | ||
| 722 | adc k, 0 | ||
| 723 | LOAD t0, k | ||
| 724 | cmp qq, t0 | ||
| 725 | jae down_next | ||
| 726 | mov [p + i * 2], t0 | ||
| 727 | lea s, dword ptr [k + k] | ||
| 728 | |||
| 729 | DOWN_num_unrolls equ 0 | ||
| 730 | MOVE_SMALLEST_UP STEP_1, DOWN_num_unrolls, down_exit_label | ||
| 731 | down_next: | ||
| 732 | sub i, 2 | ||
| 733 | jnb loop_down | ||
| 734 | ; jmp end_debug | ||
| 735 | |||
| 736 | LOAD e0, 0 | ||
| 737 | |||
| 738 | MY_ALIGN_16 | ||
| 739 | main_loop_sort: | ||
| 740 | ; num_last > 3 | ||
| 741 | mov t0, [p + 2 * 4] | ||
| 742 | cmp t0, [p + 3 * 4] | ||
| 743 | LOAD qq, num_last | ||
| 744 | STORE e0, num_last | ||
| 745 | LOAD e0, 1 | ||
| 746 | mov s_x, 2 | ||
| 747 | mov k_x, 1 | ||
| 748 | adc s, 0 | ||
| 749 | LOAD t0, s | ||
| 750 | dec num_last | ||
| 751 | cmp qq, t0 | ||
| 752 | jae main_exit_label | ||
| 753 | STORE t0, 1 | ||
| 754 | mov k, s | ||
| 755 | add s, s | ||
| 756 | if 1 | ||
| 757 | ; for branch data prefetch mode : | ||
| 758 | ; it's faster for large arrays : larger than (1 << 13) items. | ||
| 759 | MAIN_num_unrolls equ 10 | ||
| 760 | STEP_LOOP equ STEP_BRANCH | ||
| 761 | else | ||
| 762 | MAIN_num_unrolls equ 0 | ||
| 763 | STEP_LOOP equ STEP_1 | ||
| 764 | endif | ||
| 765 | |||
| 766 | MOVE_SMALLEST_UP STEP_LOOP, MAIN_num_unrolls, main_exit_label | ||
| 767 | |||
| 768 | ; jmp end_debug | ||
| 769 | cmp num_last, 3 | ||
| 770 | jne main_loop_sort | ||
| 771 | |||
| 772 | ; num_last == 3 (real_size == 4) | ||
| 773 | LOAD a0, 2 | ||
| 774 | LOAD a1, 3 | ||
| 775 | LOAD a2, 1 | ||
| 776 | STORE e0, 3 ; e0 is alias for a3 | ||
| 777 | STORE a2, 2 | ||
| 778 | SORT a0, a1 | ||
| 779 | end_2: | ||
| 780 | STORE a0, 0 | ||
| 781 | STORE a1, 1 | ||
| 782 | ; end_debug: | ||
| 783 | MY_POP_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11 | ||
| 784 | end_1: | ||
| 785 | MY_ENDP | ||
| 786 | |||
| 787 | endif | ||
| 788 | |||
| 789 | ifdef Z7_SORT_ASM_USE_SEGMENT | ||
| 790 | _TEXT$Z7_SORT ENDS | ||
| 791 | endif | ||
| 792 | |||
| 793 | if 0 | ||
| 794 | LEA_IS_D8 (R64) [R2 * 4 + 16] | ||
| 795 | Lat : TP | ||
| 796 | 2 : 1 : adl-e | ||
| 797 | 2 : 3 p056 adl-p | ||
| 798 | 1 : 2 : p15 hsw-rocket | ||
| 799 | 1 : 2 : p01 snb-ivb | ||
| 800 | 1 : 1 : p1 conroe-wsm | ||
| 801 | 1 : 4 : zen3,zen4 | ||
| 802 | 2 : 4 : zen1,zen2 | ||
| 803 | |||
| 804 | LEA_B_IS (R64) [R2 + R3 * 4] | ||
| 805 | Lat : TP | ||
| 806 | 1 : 1 : adl-e | ||
| 807 | 2 : 3 p056 adl-p | ||
| 808 | 1 : 2 : p15 hsw-rocket | ||
| 809 | 1 : 2 : p01 snb-ivb | ||
| 810 | 1 : 1 : p1 nhm-wsm | ||
| 811 | 1 : 1 : p0 conroe-wsm | ||
| 812 | 1 : 4 : zen3,zen4 | ||
| 813 | 2 :2,4 : zen1,zen2 | ||
| 814 | |||
| 815 | LEA_B_IS_D8 (R64) [R2 + R3 * 4 + 16] | ||
| 816 | Lat : TP | ||
| 817 | 2 : 1 : adl-e | ||
| 818 | 2 : 3 p056 adl-p | ||
| 819 | 1 : 2 : p15 ice-rocket | ||
| 820 | 3 : 1 : p1/p15 hsw-rocket | ||
| 821 | 3 : 1 : p01 snb-ivb | ||
| 822 | 1 : 1 : p1 nhm-wsm | ||
| 823 | 1 : 1 : p0 conroe-wsm | ||
| 824 | 2,1 : 2 : zen3,zen4 | ||
| 825 | 2 : 2 : zen1,zen2 | ||
| 826 | |||
| 827 | CMOVB (R64, R64) | ||
| 828 | Lat : TP | ||
| 829 | 1,2 : 2 : adl-e | ||
| 830 | 1 : 2 p06 adl-p | ||
| 831 | 1 : 2 : p06 bwd-rocket | ||
| 832 | 1,2 : 2 : p0156+p06 hsw | ||
| 833 | 1,2 :1.5 : p015+p05 snb-ivb | ||
| 834 | 1,2 : 1 : p015+p05 nhm | ||
| 835 | 1 : 1 : 2*p015 conroe | ||
| 836 | 1 : 2 : zen3,zen4 | ||
| 837 | 1 : 4 : zen1,zen2 | ||
| 838 | |||
| 839 | ADC (R64, 0) | ||
| 840 | Lat : TP | ||
| 841 | 1,2 : 2 : adl-e | ||
| 842 | 1 : 2 p06 adl-p | ||
| 843 | 1 : 2 : p06 bwd-rocket | ||
| 844 | 1 :1.5 : p0156+p06 hsw | ||
| 845 | 1 :1.5 : p015+p05 snb-ivb | ||
| 846 | 2 : 1 : 2*p015 conroe-wstm | ||
| 847 | 1 : 2 : zen1,zen2,zen3,zen4 | ||
| 848 | |||
| 849 | PREFETCHNTA : fetch data into non-temporal cache close to the processor, minimizing cache pollution. | ||
| 850 | L1 : Pentium3 | ||
| 851 | L2 : NetBurst | ||
| 852 | L1, not L2: Core duo, Core 2, Atom processors | ||
| 853 | L1, not L2, may fetch into L3 with fast replacement: Nehalem, Westmere, Sandy Bridge, ... | ||
| 854 | NEHALEM: Fills L1/L3, L1 LRU is not updated | ||
| 855 | L3 with fast replacement: Xeon Processors based on Nehalem, Westmere, Sandy Bridge, ... | ||
| 856 | PREFETCHT0 : fetch data into all cache levels. | ||
| 857 | PREFETCHT1 : fetch data into L2 and L3 | ||
| 858 | endif | ||
| 859 | |||
| 860 | end | ||
