diff options
author | Mike Pall <mike> | 2016-11-29 19:30:40 +0100 |
---|---|---|
committer | Mike Pall <mike> | 2016-11-29 19:30:40 +0100 |
commit | 3ad2bbf58600f8ba2918b56b0a7ab305df19cfe5 (patch) | |
tree | 28771b8efec0975a95831ebcac212f7955166191 | |
parent | 6538c8a18711a6eb009def36050acd5f02e42aec (diff) | |
download | luajit-3ad2bbf58600f8ba2918b56b0a7ab305df19cfe5.tar.gz luajit-3ad2bbf58600f8ba2918b56b0a7ab305df19cfe5.tar.bz2 luajit-3ad2bbf58600f8ba2918b56b0a7ab305df19cfe5.zip |
ARM64: Make use of tbz/tbnz and cbz/cbnz.
Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com.
-rw-r--r-- | src/lj_asm_arm64.h | 83 | ||||
-rw-r--r-- | src/lj_emit_arm64.h | 19 | ||||
-rw-r--r-- | src/lj_target_arm64.h | 6 |
3 files changed, 91 insertions, 17 deletions
diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h index 19b3331d..eea957b5 100644 --- a/src/lj_asm_arm64.h +++ b/src/lj_asm_arm64.h | |||
@@ -84,6 +84,34 @@ static void asm_guardcc(ASMState *as, A64CC cc) | |||
84 | emit_cond_branch(as, cc, target); | 84 | emit_cond_branch(as, cc, target); |
85 | } | 85 | } |
86 | 86 | ||
87 | /* Emit test bit and branch (tbz/tbnz) to the exit stub of the current snapshot for a guard. The caller in asm_intcomp passes A64I_TBZ or A64I_TBNZ as ai; r is the tested register, bit the bit index. */ | ||
88 | static void asm_guardtnb(ASMState *as, A64Ins ai, Reg r, uint32_t bit) | ||
89 | { | ||
90 | MCode *target = asm_exitstub_addr(as, as->snapno); | ||
91 | MCode *p = as->mcp; | ||
92 | if (LJ_UNLIKELY(p == as->invmcp)) { /* This guard becomes the inverted loop branch. */ | ||
93 | as->loopinv = 1; /* Checked later by asm_loop_fixup(). */ | ||
94 | *p = A64I_B | ((target-p) & 0x03ffffffu); /* Unconditional b to the exit stub, written at p itself (mcp is not moved). */ | ||
95 | emit_tnb(as, ai^0x01000000u, r, bit, p-1); /* Bit 24 flips tbz <-> tbnz. Target p-1 is the new instruction's own slot, so the offset is 0 here; asm_loop_fixup() ORs in the real delta. */ | ||
96 | return; | ||
97 | } | ||
98 | emit_tnb(as, ai, r, bit, target); /* Normal case: branch straight to the exit stub. */ | ||
99 | } | ||
100 | |||
101 | /* Emit compare with zero and branch (cbz/cbnz) to the exit stub of the current snapshot for a guard. Callers pass A64I_CBZ or A64I_CBNZ (optionally with A64I_X) as ai; r is the register compared against zero. */ | ||
102 | static void asm_guardcnb(ASMState *as, A64Ins ai, Reg r) | ||
103 | { | ||
104 | MCode *target = asm_exitstub_addr(as, as->snapno); | ||
105 | MCode *p = as->mcp; | ||
106 | if (LJ_UNLIKELY(p == as->invmcp)) { /* This guard becomes the inverted loop branch. */ | ||
107 | as->loopinv = 1; /* Checked later by asm_loop_fixup(). */ | ||
108 | *p = A64I_B | ((target-p) & 0x03ffffffu); /* Unconditional b to the exit stub, written at p itself (mcp is not moved). */ | ||
109 | emit_cnb(as, ai^0x01000000u, r, p-1); /* Bit 24 flips cbz <-> cbnz. Target p-1 is the new instruction's own slot, so the offset is 0 here; asm_loop_fixup() ORs in the real delta. */ | ||
110 | return; | ||
111 | } | ||
112 | emit_cnb(as, ai, r, target); /* Normal case: branch straight to the exit stub. */ | ||
113 | } | ||
114 | |||
87 | /* -- Operand fusion ------------------------------------------------------ */ | 115 | /* -- Operand fusion ------------------------------------------------------ */ |
88 | 116 | ||
89 | /* Limit linear search to this distance. Avoids O(n^2) behavior. */ | 117 | /* Limit linear search to this distance. Avoids O(n^2) behavior. */ |
@@ -482,10 +510,9 @@ static void asm_strto(ASMState *as, IRIns *ir) | |||
482 | dest = ra_dest(as, ir, RSET_FPR); | 510 | dest = ra_dest(as, ir, RSET_FPR); |
483 | } | 511 | } |
484 | } | 512 | } |
485 | asm_guardcc(as, CC_EQ); | ||
486 | if (destused) | 513 | if (destused) |
487 | emit_lso(as, A64I_LDRd, (dest & 31), RID_SP, 0); | 514 | emit_lso(as, A64I_LDRd, (dest & 31), RID_SP, 0); |
488 | emit_n(as, (A64I_CMPw^A64I_K12)|A64F_U12(0), RID_RET); | 515 | asm_guardcnb(as, A64I_CBZ, RID_RET); |
489 | args[0] = ir->op1; /* GCstr *str */ | 516 | args[0] = ir->op1; /* GCstr *str */ |
490 | args[1] = ASMREF_TMP1; /* TValue *n */ | 517 | args[1] = ASMREF_TMP1; /* TValue *n */ |
491 | asm_gencall(as, ci, args); | 518 | asm_gencall(as, ci, args); |
@@ -1465,13 +1492,13 @@ static void asm_intcomp(ASMState *as, IRIns *ir) | |||
1465 | else if (cc > CC_NE) cc ^= 11; /* LO <-> HI, LS <-> HS */ | 1492 | else if (cc > CC_NE) cc ^= 11; /* LO <-> HI, LS <-> HS */ |
1466 | } | 1493 | } |
1467 | oldcc = cc; | 1494 | oldcc = cc; |
1468 | if (irref_isk(rref) && IR(rref)->i == 0) { | 1495 | if (irref_isk(rref) && get_k64val(IR(rref)) == 0) { |
1469 | IRIns *irl = IR(lref); | 1496 | IRIns *irl = IR(lref); |
1470 | if (cc == CC_GE) cc = CC_PL; | 1497 | if (cc == CC_GE) cc = CC_PL; |
1471 | else if (cc == CC_LT) cc = CC_MI; | 1498 | else if (cc == CC_LT) cc = CC_MI; |
1472 | else if (cc > CC_NE) goto notst; /* Other conds don't work with tst. */ | 1499 | else if (cc > CC_NE) goto nocombine; /* Other conds don't work with tst. */ |
1473 | cmpprev0 = (irl+1 == ir); | 1500 | cmpprev0 = (irl+1 == ir); |
1474 | /* Combine comp(BAND(left, right), 0) into tst left, right. */ | 1501 | /* Combine and-cmp-bcc into tbz/tbnz or and-cmp into tst. */ |
1475 | if (cmpprev0 && irl->o == IR_BAND && !ra_used(irl)) { | 1502 | if (cmpprev0 && irl->o == IR_BAND && !ra_used(irl)) { |
1476 | IRRef blref = irl->op1, brref = irl->op2; | 1503 | IRRef blref = irl->op1, brref = irl->op2; |
1477 | uint32_t m2 = 0; | 1504 | uint32_t m2 = 0; |
@@ -1480,10 +1507,13 @@ static void asm_intcomp(ASMState *as, IRIns *ir) | |||
1480 | Reg tmp = blref; blref = brref; brref = tmp; | 1507 | Reg tmp = blref; blref = brref; brref = tmp; |
1481 | } | 1508 | } |
1482 | if (irref_isk(brref)) { | 1509 | if (irref_isk(brref)) { |
1483 | /* NYI: use tbz/tbnz, if applicable. */ | 1510 | uint64_t k = get_k64val(IR(brref)); |
1484 | m2 = emit_isk13(IR(brref)->i, irt_is64(irl->t)); | 1511 | if (k && !(k & (k-1)) && (cc == CC_EQ || cc == CC_NE)) { |
1485 | if (!m2) | 1512 | asm_guardtnb(as, cc == CC_EQ ? A64I_TBZ : A64I_TBNZ, |
1486 | goto notst; /* Not beneficial if we miss a constant operand. */ | 1513 | ra_alloc1(as, blref, RSET_GPR), emit_ctz64(k)); |
1514 | return; | ||
1515 | } | ||
1516 | m2 = emit_isk13(k, irt_is64(irl->t)); | ||
1487 | } | 1517 | } |
1488 | bleft = ra_alloc1(as, blref, RSET_GPR); | 1518 | bleft = ra_alloc1(as, blref, RSET_GPR); |
1489 | ai = (irt_is64(irl->t) ? A64I_TSTx : A64I_TSTw); | 1519 | ai = (irt_is64(irl->t) ? A64I_TSTx : A64I_TSTw); |
@@ -1493,9 +1523,15 @@ static void asm_intcomp(ASMState *as, IRIns *ir) | |||
1493 | emit_n(as, ai^m2, bleft); | 1523 | emit_n(as, ai^m2, bleft); |
1494 | return; | 1524 | return; |
1495 | } | 1525 | } |
1496 | /* NYI: use cbz/cbnz for EQ/NE 0. */ | 1526 | if (cc == CC_EQ || cc == CC_NE) { |
1527 | /* Combine cmp-bcc into cbz/cbnz. */ | ||
1528 | ai = cc == CC_EQ ? A64I_CBZ : A64I_CBNZ; | ||
1529 | if (irt_is64(ir->t)) ai |= A64I_X; | ||
1530 | asm_guardcnb(as, ai, ra_alloc1(as, lref, RSET_GPR)); | ||
1531 | return; | ||
1532 | } | ||
1497 | } | 1533 | } |
1498 | notst: | 1534 | nocombine: |
1499 | left = ra_alloc1(as, lref, RSET_GPR); | 1535 | left = ra_alloc1(as, lref, RSET_GPR); |
1500 | m = asm_fuseopm(as, ai, rref, rset_exclude(RSET_GPR, left)); | 1536 | m = asm_fuseopm(as, ai, rref, rset_exclude(RSET_GPR, left)); |
1501 | asm_guardcc(as, cc); | 1537 | asm_guardcc(as, cc); |
@@ -1638,8 +1674,7 @@ static void asm_gc_check(ASMState *as) | |||
1638 | ra_evictset(as, RSET_SCRATCH); | 1674 | ra_evictset(as, RSET_SCRATCH); |
1639 | l_end = emit_label(as); | 1675 | l_end = emit_label(as); |
1640 | /* Exit trace if in GCSatomic or GCSfinalize. Avoids syncing GC objects. */ | 1676 | /* Exit trace if in GCSatomic or GCSfinalize. Avoids syncing GC objects. */ |
1641 | asm_guardcc(as, CC_NE); /* Assumes asm_snap_prep() already done. */ | 1677 | asm_guardcnb(as, A64I_CBNZ, RID_RET); /* Assumes asm_snap_prep() is done. */ |
1642 | emit_n(as, A64I_CMPx^A64I_K12, RID_RET); | ||
1643 | args[0] = ASMREF_TMP1; /* global_State *g */ | 1678 | args[0] = ASMREF_TMP1; /* global_State *g */ |
1644 | args[1] = ASMREF_TMP2; /* MSize steps */ | 1679 | args[1] = ASMREF_TMP2; /* MSize steps */ |
1645 | asm_gencall(as, ci, args); | 1680 | asm_gencall(as, ci, args); |
@@ -1666,10 +1701,10 @@ static void asm_loop_fixup(ASMState *as) | |||
1666 | MCode *p = as->mctop; | 1701 | MCode *p = as->mctop; |
1667 | MCode *target = as->mcp; | 1702 | MCode *target = as->mcp; |
1668 | if (as->loopinv) { /* Inverted loop branch? */ | 1703 | if (as->loopinv) { /* Inverted loop branch? */ |
1704 | uint32_t mask = (p[-2] & 0x7e000000) == 0x36000000 ? 0x3fffu : 0x7ffffu; | ||
1669 | ptrdiff_t delta = target - (p - 2); | 1705 | ptrdiff_t delta = target - (p - 2); |
1670 | lua_assert(((delta + 0x40000) >> 19) == 0); | 1706 | /* asm_guard* already inverted the bcc/tnb/cnb and patched the final b. */ |
1671 | /* asm_guardcc already inverted the b.cc and patched the final bl. */ | 1707 | p[-2] |= ((uint32_t)delta & mask) << 5; |
1672 | p[-2] |= ((uint32_t)delta & 0x7ffff) << 5; | ||
1673 | } else { | 1708 | } else { |
1674 | ptrdiff_t delta = target - (p - 1); | 1709 | ptrdiff_t delta = target - (p - 1); |
1675 | p[-1] = A64I_B | ((uint32_t)(delta) & 0x03ffffffu); | 1710 | p[-1] = A64I_B | ((uint32_t)(delta) & 0x03ffffffu); |
@@ -1795,18 +1830,32 @@ void lj_asm_patchexit(jit_State *J, GCtrace *T, ExitNo exitno, MCode *target) | |||
1795 | MCode *mcarea = lj_mcode_patch(J, p, 0); | 1830 | MCode *mcarea = lj_mcode_patch(J, p, 0); |
1796 | MCode *px = exitstub_trace_addr(T, exitno); | 1831 | MCode *px = exitstub_trace_addr(T, exitno); |
1797 | for (; p < pe; p++) { | 1832 | for (; p < pe; p++) { |
1798 | /* Look for bcc/b exitstub, replace with bcc/b target. */ | 1833 | /* Look for exitstub branch, replace with branch to target. */ |
1799 | uint32_t ins = *p; | 1834 | uint32_t ins = *p; |
1800 | if ((ins & 0xff000000u) == 0x54000000u && | 1835 | if ((ins & 0xff000000u) == 0x54000000u && |
1801 | ((ins ^ ((px-p)<<5)) & 0x00ffffe0u) == 0) { | 1836 | ((ins ^ ((px-p)<<5)) & 0x00ffffe0u) == 0) { |
1837 | /* Patch bcc exitstub. */ | ||
1802 | *p = (ins & 0xff00001fu) | (((target-p)<<5) & 0x00ffffe0u); | 1838 | *p = (ins & 0xff00001fu) | (((target-p)<<5) & 0x00ffffe0u); |
1803 | cend = p+1; | 1839 | cend = p+1; |
1804 | if (!cstart) cstart = p; | 1840 | if (!cstart) cstart = p; |
1805 | } else if ((ins & 0xfc000000u) == 0x14000000u && | 1841 | } else if ((ins & 0xfc000000u) == 0x14000000u && |
1806 | ((ins ^ (px-p)) & 0x03ffffffu) == 0) { | 1842 | ((ins ^ (px-p)) & 0x03ffffffu) == 0) { |
1843 | /* Patch b exitstub. */ | ||
1807 | *p = (ins & 0xfc000000u) | ((target-p) & 0x03ffffffu); | 1844 | *p = (ins & 0xfc000000u) | ((target-p) & 0x03ffffffu); |
1808 | cend = p+1; | 1845 | cend = p+1; |
1809 | if (!cstart) cstart = p; | 1846 | if (!cstart) cstart = p; |
1847 | } else if ((ins & 0x7e000000u) == 0x34000000u && | ||
1848 | ((ins ^ ((px-p)<<5)) & 0x00ffffe0u) == 0) { | ||
1849 | /* Patch cbz/cbnz exitstub. */ | ||
1850 | *p = (ins & 0xff00001f) | (((target-p)<<5) & 0x00ffffe0u); | ||
1851 | cend = p+1; | ||
1852 | if (!cstart) cstart = p; | ||
1853 | } else if ((ins & 0x7e000000u) == 0x36000000u && | ||
1854 | ((ins ^ ((px-p)<<5)) & 0x0007ffe0u) == 0) { | ||
1855 | /* Patch tbz/tbnz exitstub. */ | ||
1856 | *p = (ins & 0xfff8001fu) | (((target-p)<<5) & 0x0007ffe0u); | ||
1857 | cend = p+1; | ||
1858 | if (!cstart) cstart = p; | ||
1810 | } | 1859 | } |
1811 | } | 1860 | } |
1812 | lua_assert(cstart != NULL); | 1861 | lua_assert(cstart != NULL); |
diff --git a/src/lj_emit_arm64.h b/src/lj_emit_arm64.h index 52e75559..1eb14204 100644 --- a/src/lj_emit_arm64.h +++ b/src/lj_emit_arm64.h | |||
@@ -321,6 +321,25 @@ static void emit_branch(ASMState *as, A64Ins ai, MCode *target) | |||
321 | as->mcp = p; | 321 | as->mcp = p; |
322 | } | 322 | } |
323 | 323 | ||
324 | static void emit_tnb(ASMState *as, A64Ins ai, Reg r, uint32_t bit, MCode *target) | ||
325 | { /* Emit a tbz/tbnz (selected by ai) that tests the given bit of r and branches to target. The instruction is stored in the slot just below as->mcp. */ | ||
326 | MCode *p = as->mcp; | ||
327 | ptrdiff_t delta = target - (p - 1); /* Offset in MCode units, relative to the new instruction's own slot (p-1). */ | ||
328 | lua_assert(bit < 63 && ((delta + 0x2000) >> 14) == 0); /* Offset must satisfy -0x2000 <= delta < 0x2000 (signed 14 bit). NOTE(review): bit 63 is encodable by the lines below but rejected here -- confirm whether this is intended. */ | ||
329 | if (bit > 31) ai |= A64I_X; /* Bit numbers 32 and up are flagged with A64I_X; only the low 5 bits go into the bit field. */ | ||
330 | *--p = ai | A64F_BIT(bit & 31) | A64F_S14((uint32_t)delta & 0x3fffu) | r; | ||
331 | as->mcp = p; /* mcp moves down by one instruction slot. */ | ||
332 | } | ||
333 | |||
334 | static void emit_cnb(ASMState *as, A64Ins ai, Reg r, MCode *target) | ||
335 | { /* Emit a cbz/cbnz (selected by ai) that compares r with zero and branches to target. The instruction is stored in the slot just below as->mcp. */ | ||
336 | MCode *p = as->mcp; | ||
337 | ptrdiff_t delta = target - (p - 1); /* Offset in MCode units, relative to the new instruction's own slot (p-1). */ | ||
338 | lua_assert(((delta + 0x40000) >> 19) == 0); /* Offset must satisfy -0x40000 <= delta < 0x40000 (signed 19 bit). */ | ||
339 | *--p = ai | A64F_S19((uint32_t)delta & 0x7ffff) | r; | ||
340 | as->mcp = p; /* mcp moves down by one instruction slot. */ | ||
341 | } | ||
342 | |||
324 | #define emit_jmp(as, target) emit_branch(as, A64I_B, (target)) | 343 | #define emit_jmp(as, target) emit_branch(as, A64I_B, (target)) |
325 | 344 | ||
326 | static void emit_call(ASMState *as, void *target) | 345 | static void emit_call(ASMState *as, void *target) |
diff --git a/src/lj_target_arm64.h b/src/lj_target_arm64.h index 1cd02fe8..6c8771c6 100644 --- a/src/lj_target_arm64.h +++ b/src/lj_target_arm64.h | |||
@@ -127,7 +127,9 @@ static LJ_AINLINE uint32_t *exitstub_trace_addr_(uint32_t *p, uint32_t exitno) | |||
127 | #define A64F_U12(x) ((x) << 10) | 127 | #define A64F_U12(x) ((x) << 10) |
128 | #define A64F_S26(x) (x) | 128 | #define A64F_S26(x) (x) |
129 | #define A64F_S19(x) ((x) << 5) | 129 | #define A64F_S19(x) ((x) << 5) |
130 | #define A64F_S14(x) ((x) << 5) /* Branch offset field used by emit_tnb() for tbz/tbnz; the caller masks the value to 14 bits first. */ | ||
130 | #define A64F_S9(x) ((x) << 12) | 131 | #define A64F_S9(x) ((x) << 12) |
132 | #define A64F_BIT(x) ((x) << 19) /* Bit number field used by emit_tnb() for tbz/tbnz; the caller passes bit & 31. */ | ||
131 | #define A64F_SH(sh, x) (((sh) << 22) | ((x) << 10)) | 133 | #define A64F_SH(sh, x) (((sh) << 22) | ((x) << 10)) |
132 | #define A64F_EX(ex) (A64I_EX | ((ex) << 13)) | 134 | #define A64F_EX(ex) (A64I_EX | ((ex) << 13)) |
133 | #define A64F_EXSH(ex,x) (A64I_EX | ((ex) << 13) | ((x) << 10)) | 135 | #define A64F_EXSH(ex,x) (A64I_EX | ((ex) << 13) | ((x) << 10)) |
@@ -235,6 +237,10 @@ typedef enum A64Ins { | |||
235 | A64I_BL = 0x94000000, | 237 | A64I_BL = 0x94000000, |
236 | A64I_BR = 0xd61f0000, | 238 | A64I_BR = 0xd61f0000, |
237 | A64I_BLR = 0xd63f0000, | 239 | A64I_BLR = 0xd63f0000, |
240 | A64I_TBZ = 0x36000000, | ||
241 | A64I_TBNZ = 0x37000000, | ||
242 | A64I_CBZ = 0x34000000, | ||
243 | A64I_CBNZ = 0x35000000, | ||
238 | 244 | ||
239 | A64I_NOP = 0xd503201f, | 245 | A64I_NOP = 0xd503201f, |
240 | 246 | ||