author    Mike Pall <mike>    2011-02-02 02:29:37 +0100
committer Mike Pall <mike>    2011-02-02 02:29:37 +0100
commit    b613216efc7447dae645d8834e4d6f3185cd1bcc (patch)
tree      0859fed377f00ebeada70ba45d02496b7fb4a249 /src/lj_asm.c
parent    c539c0cac8f668e66a5ce9e5fd645cb45e3c5063 (diff)
download  luajit-b613216efc7447dae645d8834e4d6f3185cd1bcc.tar.gz
          luajit-b613216efc7447dae645d8834e4d6f3185cd1bcc.tar.bz2
          luajit-b613216efc7447dae645d8834e4d6f3185cd1bcc.zip
Add SPLIT pass to split 64 bit IR instructions for 32 bit CPUs.
Add generic HIOP instruction for extra backend functionality.
Add support for HIOP to x86 backend.
Use POWI for 64 bit integer x^k, too.
POWI is lowered to a call by SPLIT or the x64 backend.
Diffstat (limited to 'src/lj_asm.c')
-rw-r--r--  src/lj_asm.c  | 444
1 file changed, 363 insertions(+), 81 deletions(-)
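Note on the overall scheme: on 32 bit CPUs the new SPLIT pass rewrites each 64 bit IR instruction into an instruction on the low words followed by a HIOP on the high words, and asm_hiop() below turns the pair into e.g. an ADD/ADC or SUB/SBB sequence. This is also why the emit_loadi() change below refuses to zero a register with XOR near a HIOP: XOR would clear the carry flag that the ADC/SBB half still needs. A standalone C sketch (not LuaJIT code, names are illustrative) of how such a split add behaves:

    #include <stdint.h>
    #include <stdio.h>

    /* Model of a 64 bit add split into two 32 bit halves, as SPLIT plus
    ** asm_hiop() arrange it on x86: the loword add produces a carry that
    ** the hiword add must consume, which is what ADD/ADC do with the CPU
    ** carry flag.
    */
    static uint64_t add64_split(uint32_t alo, uint32_t ahi, uint32_t blo, uint32_t bhi)
    {
      uint32_t lo = alo + blo;          /* ADD: loword sum. */
      uint32_t carry = (lo < alo);      /* Carry out of the loword add. */
      uint32_t hi = ahi + bhi + carry;  /* ADC: hiword sum plus carry. */
      return ((uint64_t)hi << 32) | lo;
    }

    int main(void)
    {
      uint64_t a = 0x00000001ffffffffULL, b = 1;
      printf("%llx\n", (unsigned long long)add64_split(
        (uint32_t)a, (uint32_t)(a >> 32), (uint32_t)b, (uint32_t)(b >> 32)));
      return 0;  /* Prints 200000000. */
    }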
diff --git a/src/lj_asm.c b/src/lj_asm.c
index cc2ae597..441700d4 100644
--- a/src/lj_asm.c
+++ b/src/lj_asm.c
@@ -347,6 +347,20 @@ static void emit_addptr(ASMState *as, Reg r, int32_t ofs)
347 } 347 }
348} 348}
349 349
350/* op rm/mrm, i */
351static void emit_gmrmi(ASMState *as, x86Group xg, Reg rb, int32_t i)
352{
353 x86Op xo;
354 if (checki8(i)) {
355 emit_i8(as, i);
356 xo = XG_TOXOi8(xg);
357 } else {
358 emit_i32(as, i);
359 xo = XG_TOXOi(xg);
360 }
361 emit_mrm(as, xo, (Reg)(xg & 7) | (rb & REX_64), (rb & ~REX_64));
362}
363
350/* -- Emit moves ---------------------------------------------------------- */ 364/* -- Emit moves ---------------------------------------------------------- */
351 365
352/* mov [base+ofs], i */ 366/* mov [base+ofs], i */
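Note: the emit_gmrmi() helper added above picks the sign-extended imm8 encoding of a group-1 ALU instruction whenever the constant survives a round trip through int8_t, which saves three bytes per instruction over the imm32 form. A minimal sketch of that predicate (assumed to match what LuaJIT's checki8() tests):

    #include <stdint.h>

    /* imm8 operands of the short ALU encoding are sign-extended to 32 bits,
    ** so the short form is usable exactly for constants in -128..127. */
    static int fits_in_imm8(int32_t i)
    {
      return (int32_t)(int8_t)i == i;
    }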
@@ -371,7 +385,10 @@ static void emit_movmroi(ASMState *as, Reg base, int32_t ofs, int32_t i)
371/* mov r, i / xor r, r */ 385/* mov r, i / xor r, r */
372static void emit_loadi(ASMState *as, Reg r, int32_t i) 386static void emit_loadi(ASMState *as, Reg r, int32_t i)
373{ 387{
374 if (i == 0) { 388 /* XOR r,r is shorter, but modifies the flags. This is bad for HIOP. */
389 if (i == 0 && !(LJ_32 && (IR(as->curins)->o == IR_HIOP ||
390 (as->curins+1 < as->T->nins &&
391 IR(as->curins+1)->o == IR_HIOP)))) {
375 emit_rr(as, XO_ARITH(XOg_XOR), r, r); 392 emit_rr(as, XO_ARITH(XOg_XOR), r, r);
376 } else { 393 } else {
377 MCode *p = as->mcp; 394 MCode *p = as->mcp;
@@ -422,6 +439,19 @@ static void emit_loadn(ASMState *as, Reg r, cTValue *tv)
422/* Label for short jumps. */ 439/* Label for short jumps. */
423typedef MCode *MCLabel; 440typedef MCode *MCLabel;
424 441
442#if LJ_32 && LJ_HASFFI
443/* jmp short target */
444static void emit_sjmp(ASMState *as, MCLabel target)
445{
446 MCode *p = as->mcp;
447 ptrdiff_t delta = target - p;
448 lua_assert(delta == (int8_t)delta);
449 p[-1] = (MCode)(int8_t)delta;
450 p[-2] = XI_JMPs;
451 as->mcp = p - 2;
452}
453#endif
454
425/* jcc short target */ 455/* jcc short target */
426static void emit_sjcc(ASMState *as, int cc, MCLabel target) 456static void emit_sjcc(ASMState *as, int cc, MCLabel target)
427{ 457{
@@ -630,7 +660,7 @@ static Reg ra_rematk(ASMState *as, IRIns *ir)
630 } else if (ir->o == IR_KPRI) { /* REF_NIL stores ASMREF_L register. */ 660 } else if (ir->o == IR_KPRI) { /* REF_NIL stores ASMREF_L register. */
631 lua_assert(irt_isnil(ir->t)); 661 lua_assert(irt_isnil(ir->t));
632 emit_getgl(as, r, jit_L); 662 emit_getgl(as, r, jit_L);
633#if LJ_64 /* NYI: 32 bit register pairs. */ 663#if LJ_64
634 } else if (ir->o == IR_KINT64) { 664 } else if (ir->o == IR_KINT64) {
635 emit_loadu64(as, r, ir_kint64(ir)->u64); 665 emit_loadu64(as, r, ir_kint64(ir)->u64);
636#endif 666#endif
@@ -681,8 +711,7 @@ static Reg ra_releasetmp(ASMState *as, IRRef ref)
681#if LJ_64 711#if LJ_64
682#define REX_64IR(ir, r) ((r) + (irt_is64((ir)->t) ? REX_64 : 0)) 712#define REX_64IR(ir, r) ((r) + (irt_is64((ir)->t) ? REX_64 : 0))
683#else 713#else
684/* NYI: 32 bit register pairs. */ 714#define REX_64IR(ir, r) (r)
685#define REX_64IR(ir, r) check_exp(!irt_is64((ir)->t), (r))
686#endif 715#endif
687 716
688/* Generic move between two regs. */ 717/* Generic move between two regs. */
@@ -939,7 +968,7 @@ static void ra_left(ASMState *as, Reg dest, IRRef lref)
939 emit_loadn(as, dest, tv); 968 emit_loadn(as, dest, tv);
940 return; 969 return;
941 } 970 }
942#if LJ_64 /* NYI: 32 bit register pairs. */ 971#if LJ_64
943 } else if (ir->o == IR_KINT64) { 972 } else if (ir->o == IR_KINT64) {
944 emit_loadu64(as, dest, ir_kint64(ir)->u64); 973 emit_loadu64(as, dest, ir_kint64(ir)->u64);
945 return; 974 return;
@@ -1463,7 +1492,7 @@ static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args)
1463#endif 1492#endif
1464 if (r) { /* Argument is in a register. */ 1493 if (r) { /* Argument is in a register. */
1465 if (r < RID_MAX_GPR && ref < ASMREF_TMP1) { 1494 if (r < RID_MAX_GPR && ref < ASMREF_TMP1) {
1466#if LJ_64 /* NYI: 32 bit register pairs. */ 1495#if LJ_64
1467 if (ir->o == IR_KINT64) 1496 if (ir->o == IR_KINT64)
1468 emit_loadu64(as, r, ir_kint64(ir)->u64); 1497 emit_loadu64(as, r, ir_kint64(ir)->u64);
1469 else 1498 else
@@ -1519,7 +1548,7 @@ static void asm_setupresult(ASMState *as, IRIns *ir, const CCallInfo *ci)
1519 ra_evictset(as, drop); /* Evictions must be performed first. */ 1548 ra_evictset(as, drop); /* Evictions must be performed first. */
1520 if (ra_used(ir)) { 1549 if (ra_used(ir)) {
1521 if (irt_isfp(ir->t)) { 1550 if (irt_isfp(ir->t)) {
1522 int32_t ofs = sps_scale(ir->s); /* Use spill slot or temp slots. */ 1551 int32_t ofs = sps_scale(ir->s); /* Use spill slot or temp slots. */
1523#if LJ_64 1552#if LJ_64
1524 if ((ci->flags & CCI_CASTU64)) { 1553 if ((ci->flags & CCI_CASTU64)) {
1525 Reg dest = ir->r; 1554 Reg dest = ir->r;
@@ -1632,19 +1661,24 @@ static void asm_conv(ASMState *as, IRIns *ir)
1632 int stfp = (st == IRT_NUM || st == IRT_FLOAT); 1661 int stfp = (st == IRT_NUM || st == IRT_FLOAT);
1633 IRRef lref = ir->op1; 1662 IRRef lref = ir->op1;
1634 lua_assert(irt_type(ir->t) != st); 1663 lua_assert(irt_type(ir->t) != st);
1664 lua_assert(!(LJ_32 && (irt_isint64(ir->t) || st64))); /* Handled by SPLIT. */
1635 if (irt_isfp(ir->t)) { 1665 if (irt_isfp(ir->t)) {
1636 Reg dest = ra_dest(as, ir, RSET_FPR); 1666 Reg dest = ra_dest(as, ir, RSET_FPR);
1637 if (stfp) { /* FP to FP conversion. */ 1667 if (stfp) { /* FP to FP conversion. */
1638 Reg left = asm_fuseload(as, lref, RSET_FPR); 1668 Reg left = asm_fuseload(as, lref, RSET_FPR);
1639 emit_mrm(as, st == IRT_NUM ? XO_CVTSD2SS : XO_CVTSS2SD, dest, left); 1669 emit_mrm(as, st == IRT_NUM ? XO_CVTSD2SS : XO_CVTSS2SD, dest, left);
1640 if (left == dest) return; /* Avoid the XO_XORPS. */ 1670 if (left == dest) return; /* Avoid the XO_XORPS. */
1641#if LJ_32 1671 } else if (LJ_32 && st == IRT_U32) { /* U32 to FP conversion on x86. */
1642 } else if (st >= IRT_U32) { 1672 /* number = (2^52+2^51 .. u32) - (2^52+2^51) */
1643 /* NYI: 64 bit integer or uint32_t to number conversion. */ 1673 cTValue *k = lj_ir_k64_find(as->J, U64x(43380000,00000000));
1644 setintV(&as->J->errinfo, ir->o); 1674 Reg bias = ra_scratch(as, rset_exclude(RSET_FPR, dest));
1645 lj_trace_err_info(as->J, LJ_TRERR_NYIIR); 1675 if (irt_isfloat(ir->t))
1676 emit_rr(as, XO_CVTSD2SS, dest, dest);
1677 emit_rr(as, XO_SUBSD, dest, bias); /* Subtract 2^52+2^51 bias. */
1678 emit_rr(as, XO_XORPS, dest, bias); /* Merge bias and integer. */
1679 emit_loadn(as, bias, k);
1680 emit_mrm(as, XO_MOVD, dest, asm_fuseload(as, lref, RSET_GPR));
1646 return; 1681 return;
1647#endif
1648 } else { /* Integer to FP conversion. */ 1682 } else { /* Integer to FP conversion. */
1649 Reg left = (LJ_64 && (st == IRT_U32 || st == IRT_U64)) ? 1683 Reg left = (LJ_64 && (st == IRT_U32 || st == IRT_U64)) ?
1650 ra_alloc1(as, lref, RSET_GPR) : 1684 ra_alloc1(as, lref, RSET_GPR) :
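Note: the new LJ_32 U32-to-FP path above works around the missing unsigned SSE conversion with the 2^52+2^51 bias trick: merge the u32 into the low mantissa bits of the constant 0x4338000000000000 and subtract that constant again. A standalone C demonstration of why this yields the exact value (illustration only, not LuaJIT code); for IRT_FLOAT destinations the patch simply appends a CVTSD2SS, since every u32 is exactly representable as a double first.

    #include <stdint.h>
    #include <stdio.h>

    static double u32_to_double(uint32_t u)
    {
      union { uint64_t u64; double d; } x;
      /* 0x4338000000000000 is the double 2^52+2^51; ORing u into the low
      ** 32 mantissa bits gives exactly 2^52+2^51+u (the XORPS in the
      ** emitted code does the same merge, since those bits start as 0). */
      x.u64 = 0x4338000000000000ULL | u;
      return x.d - 6755399441055744.0;   /* Subtract the 2^52+2^51 bias. */
    }

    int main(void)
    {
      printf("%.1f\n", u32_to_double(0xffffffffu));  /* 4294967295.0 */
      return 0;
    }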
@@ -1663,41 +1697,47 @@ static void asm_conv(ASMState *as, IRIns *ir)
1663 emit_rr(as, XO_XORPS, dest, dest); /* Avoid partial register stall. */ 1697 emit_rr(as, XO_XORPS, dest, dest); /* Avoid partial register stall. */
1664 } else if (stfp) { /* FP to integer conversion. */ 1698 } else if (stfp) { /* FP to integer conversion. */
1665 if (irt_isguard(ir->t)) { 1699 if (irt_isguard(ir->t)) {
1666 lua_assert(!irt_is64(ir->t)); /* No support for checked 64 bit conv. */ 1700 /* Checked conversions are only supported from number to int. */
1701 lua_assert(irt_isint(ir->t) && st == IRT_NUM);
1667 asm_tointg(as, ir, ra_alloc1(as, lref, RSET_FPR)); 1702 asm_tointg(as, ir, ra_alloc1(as, lref, RSET_FPR));
1668#if LJ_32
1669 } else if (irt_isi64(ir->t) || irt_isu64(ir->t) || irt_isu32(ir->t)) {
1670 /* NYI: number to 64 bit integer or uint32_t conversion. */
1671 setintV(&as->J->errinfo, ir->o);
1672 lj_trace_err_info(as->J, LJ_TRERR_NYIIR);
1673#endif
1674 } else { 1703 } else {
1675 Reg dest = ra_dest(as, ir, RSET_GPR); 1704 Reg dest = ra_dest(as, ir, RSET_GPR);
1676 x86Op op = st == IRT_NUM ? 1705 x86Op op = st == IRT_NUM ?
1677 ((ir->op2 & IRCONV_TRUNC) ? XO_CVTTSD2SI : XO_CVTSD2SI) : 1706 ((ir->op2 & IRCONV_TRUNC) ? XO_CVTTSD2SI : XO_CVTSD2SI) :
1678 ((ir->op2 & IRCONV_TRUNC) ? XO_CVTTSS2SI : XO_CVTSS2SI); 1707 ((ir->op2 & IRCONV_TRUNC) ? XO_CVTTSS2SI : XO_CVTSS2SI);
1679 if (LJ_64 && irt_isu64(ir->t)) { 1708 if (LJ_32 && irt_isu32(ir->t)) { /* FP to U32 conversion on x86. */
1680 const void *k = lj_ir_k64_find(as->J, U64x(c3f00000,00000000)); 1709 /* u32 = (int32_t)(number - 2^31) + 2^31 */
1681 MCLabel l_end = emit_label(as); 1710 Reg tmp = ra_noreg(IR(lref)->r) ? ra_alloc1(as, lref, RSET_FPR) :
1682 Reg left = IR(lref)->r; 1711 ra_scratch(as, RSET_FPR);
1712 emit_gri(as, XG_ARITHi(XOg_ADD), dest, (int32_t)0x80000000);
1713 emit_rr(as, op, dest, tmp);
1714 if (st == IRT_NUM)
1715 emit_rma(as, XO_ADDSD, tmp,
1716 lj_ir_k64_find(as->J, U64x(c1e00000,00000000)));
1717 else
1718 emit_rma(as, XO_ADDSS, tmp,
1719 lj_ir_k64_find(as->J, U64x(00000000,cf000000)));
1720 ra_left(as, tmp, lref);
1721 } else if (LJ_64 && irt_isu64(ir->t)) {
1683 /* For inputs in [2^63,2^64-1] add -2^64 and convert again. */ 1722 /* For inputs in [2^63,2^64-1] add -2^64 and convert again. */
1684 if (ra_hasreg(left)) { 1723 Reg tmp = ra_noreg(IR(lref)->r) ? ra_alloc1(as, lref, RSET_FPR) :
1685 Reg tmpn = ra_scratch(as, rset_exclude(RSET_FPR, left)); 1724 ra_scratch(as, RSET_FPR);
1686 emit_rr(as, op, dest|REX_64, tmpn); 1725 MCLabel l_end = emit_label(as);
1687 emit_rr(as, st == IRT_NUM ? XO_ADDSD : XO_ADDSS, tmpn, left); 1726 emit_rr(as, op, dest|REX_64, tmp);
1688 emit_rma(as, st == IRT_NUM ? XMM_MOVRM(as) : XO_MOVSS, tmpn, k); 1727 if (st == IRT_NUM)
1689 } else { 1728 emit_rma(as, XO_ADDSD, tmp,
1690 left = ra_allocref(as, lref, RSET_FPR); 1729 lj_ir_k64_find(as->J, U64x(c3f00000,00000000)));
1691 emit_rr(as, op, dest|REX_64, left); 1730 else
1692 emit_rma(as, st == IRT_NUM ? XO_ADDSD : XO_ADDSS, left, k); 1731 emit_rma(as, XO_ADDSS, tmp,
1693 } 1732 lj_ir_k64_find(as->J, U64x(00000000,df800000)));
1694 emit_sjcc(as, CC_NS, l_end); 1733 emit_sjcc(as, CC_NS, l_end);
1695 emit_rr(as, XO_TEST, dest|REX_64, dest); /* Check if dest < 2^63. */ 1734 emit_rr(as, XO_TEST, dest|REX_64, dest); /* Check if dest < 2^63. */
1696 emit_rr(as, op, dest|REX_64, left); 1735 emit_rr(as, op, dest|REX_64, tmp);
1736 ra_left(as, tmp, lref);
1697 } else { 1737 } else {
1698 Reg left = asm_fuseload(as, lref, RSET_FPR); 1738 Reg left = asm_fuseload(as, lref, RSET_FPR);
1699 if (LJ_64 && irt_isu32(ir->t)) 1739 if (LJ_64 && irt_isu32(ir->t))
1700 emit_rr(as, XO_MOV, dest, dest); /* Zero upper 32 bits. */ 1740 emit_rr(as, XO_MOV, dest, dest); /* Zero hiword. */
1701 emit_mrm(as, op, 1741 emit_mrm(as, op,
1702 dest|((LJ_64 && 1742 dest|((LJ_64 &&
1703 (irt_is64(ir->t) || irt_isu32(ir->t))) ? REX_64 : 0), 1743 (irt_is64(ir->t) || irt_isu32(ir->t))) ? REX_64 : 0),
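Note: the FP-to-unsigned paths above work around CVT(T)SD2SI only producing signed results. For U32 on x86 the input is biased by -2^31 before the convert and 0x80000000 is added back as an integer; for U64 on x64 the convert is redone with a -2^64 bias whenever the first signed result comes out negative. A hedged C rendering of the arithmetic (the emitted code tests the sign of the converted result rather than the input, and it honors the IRCONV_TRUNC/round distinction; this sketch only mirrors the truncating instruction sequence):

    #include <stdint.h>

    static uint32_t double_to_u32_x86(double x)
    {
      /* Mirrors ADDSD -2^31; CVTTSD2SI; ADD 0x80000000. For integral
      ** 0 <= x < 2^32 this equals the mathematical value; fractional
      ** inputs round exactly the way the machine code does. */
      int32_t i = (int32_t)(x - 2147483648.0);
      return (uint32_t)i + 0x80000000u;
    }

    static uint64_t double_to_u64_x64(double x)
    {
      if (x >= 9223372036854775808.0)   /* x >= 2^63: bias by -2^64, wrap back. */
        return (uint64_t)(int64_t)(x - 18446744073709551616.0);
      return (uint64_t)(int64_t)x;      /* Plain signed conversion otherwise. */
    }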
@@ -1728,12 +1768,10 @@ static void asm_conv(ASMState *as, IRIns *ir)
1728 emit_mrm(as, op, dest, left); 1768 emit_mrm(as, op, dest, left);
1729 } 1769 }
1730 } else { /* 32/64 bit integer conversions. */ 1770 } else { /* 32/64 bit integer conversions. */
1731 if (irt_is64(ir->t)) { 1771 if (LJ_32) { /* Only need to handle 32/32 bit no-op (cast) on x86. */
1732#if LJ_32 1772 Reg dest = ra_dest(as, ir, RSET_GPR);
1733 /* NYI: conversion to 64 bit integers. */ 1773 ra_left(as, dest, lref); /* Do nothing, but may need to move regs. */
1734 setintV(&as->J->errinfo, ir->o); 1774 } else if (irt_is64(ir->t)) {
1735 lj_trace_err_info(as->J, LJ_TRERR_NYIIR);
1736#else
1737 Reg dest = ra_dest(as, ir, RSET_GPR); 1775 Reg dest = ra_dest(as, ir, RSET_GPR);
1738 if (st64 || !(ir->op2 & IRCONV_SEXT)) { 1776 if (st64 || !(ir->op2 & IRCONV_SEXT)) {
1739 /* 64/64 bit no-op (cast) or 32 to 64 bit zero extension. */ 1777 /* 64/64 bit no-op (cast) or 32 to 64 bit zero extension. */
@@ -1742,21 +1780,14 @@ static void asm_conv(ASMState *as, IRIns *ir)
1742 Reg left = asm_fuseload(as, lref, RSET_GPR); 1780 Reg left = asm_fuseload(as, lref, RSET_GPR);
1743 emit_mrm(as, XO_MOVSXd, dest|REX_64, left); 1781 emit_mrm(as, XO_MOVSXd, dest|REX_64, left);
1744 } 1782 }
1745#endif
1746 } else { 1783 } else {
1747 Reg dest = ra_dest(as, ir, RSET_GPR); 1784 Reg dest = ra_dest(as, ir, RSET_GPR);
1748 if (st64) { 1785 if (st64) {
1749#if LJ_32
1750 /* NYI: conversion from 64 bit integers. */
1751 setintV(&as->J->errinfo, ir->o);
1752 lj_trace_err_info(as->J, LJ_TRERR_NYIIR);
1753#else
1754 Reg left = asm_fuseload(as, lref, RSET_GPR); 1786 Reg left = asm_fuseload(as, lref, RSET_GPR);
1755 /* This is either a 32 bit reg/reg mov which zeroes the hi-32 bits 1787 /* This is either a 32 bit reg/reg mov which zeroes the hiword
1756 ** or a load of the lower 32 bits from a 64 bit address. 1788 ** or a load of the loword from a 64 bit address.
1757 */ 1789 */
1758 emit_mrm(as, XO_MOV, dest, left); 1790 emit_mrm(as, XO_MOV, dest, left);
1759#endif
1760 } else { /* 32/32 bit no-op (cast). */ 1791 } else { /* 32/32 bit no-op (cast). */
1761 ra_left(as, dest, lref); /* Do nothing, but may need to move regs. */ 1792 ra_left(as, dest, lref); /* Do nothing, but may need to move regs. */
1762 } 1793 }
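Note: with SPLIT handling every 64 bit case on x86, the integer CONVs left in this function are width changes on x64 plus the 32/32 bit no-op: zero extension is a plain 32 bit MOV (writing a 32 bit register clears the hiword), sign extension uses MOVSXD, and narrowing just reads the loword. In C terms:

    #include <stdint.h>

    static uint64_t zext32(uint32_t x) { return x; }            /* MOV r32, r/m32. */
    static int64_t  sext32(int32_t x)  { return x; }            /* MOVSXD r64, r/m32 (IRCONV_SEXT). */
    static uint32_t trunc64(uint64_t x){ return (uint32_t)x; }  /* 32 bit MOV of the loword. */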
@@ -1764,6 +1795,93 @@ static void asm_conv(ASMState *as, IRIns *ir)
1764 } 1795 }
1765} 1796}
1766 1797
1798#if LJ_32 && LJ_HASFFI
1799/* No SSE conversions to/from 64 bit on x86, so resort to ugly x87 code. */
1800
1801/* 64 bit integer to FP conversion in 32 bit mode. */
1802static void asm_conv_fp_int64(ASMState *as, IRIns *ir)
1803{
1804 Reg hi = ra_alloc1(as, ir->op1, RSET_GPR);
1805 Reg lo = ra_alloc1(as, (ir-1)->op1, rset_exclude(RSET_GPR, hi));
1806 int32_t ofs = sps_scale(ir->s); /* Use spill slot or temp slots. */
1807 Reg dest = ir->r;
1808 if (ra_hasreg(dest)) {
1809 ra_free(as, dest);
1810 ra_modified(as, dest);
1811 emit_rmro(as, irt_isnum(ir->t) ? XMM_MOVRM(as) : XO_MOVSS,
1812 dest, RID_ESP, ofs);
1813 }
1814 emit_rmro(as, irt_isnum(ir->t) ? XO_FSTPq : XO_FSTPd,
1815 irt_isnum(ir->t) ? XOg_FSTPq : XOg_FSTPd, RID_ESP, ofs);
1816 if (((ir-1)->op2 & IRCONV_SRCMASK) == IRT_U64) {
1817 /* For inputs in [2^63,2^64-1] add 2^64 to compensate. */
1818 MCLabel l_end = emit_label(as);
1819 emit_rma(as, XO_FADDq, XOg_FADDq,
1820 lj_ir_k64_find(as->J, U64x(43f00000,00000000)));
1821 emit_sjcc(as, CC_NS, l_end);
1822 emit_rr(as, XO_TEST, hi, hi); /* Check if u64 >= 2^63. */
1823 } else {
1824 lua_assert(((ir-1)->op2 & IRCONV_SRCMASK) == IRT_I64);
1825 }
1826 emit_rmro(as, XO_FILDq, XOg_FILDq, RID_ESP, 0);
1827 /* NYI: Avoid narrow-to-wide store-to-load forwarding stall. */
1828 emit_rmro(as, XO_MOVto, hi, RID_ESP, 4);
1829 emit_rmro(as, XO_MOVto, lo, RID_ESP, 0);
1830}
1831
1832/* FP to 64 bit integer conversion in 32 bit mode. */
1833static void asm_conv_int64_fp(ASMState *as, IRIns *ir)
1834{
1835 IRType st = (IRType)((ir-1)->op2 & IRCONV_SRCMASK);
1836 IRType dt = (((ir-1)->op2 & IRCONV_DSTMASK) >> IRCONV_DSH);
1837 Reg lo, hi;
1838 lua_assert(st == IRT_NUM || st == IRT_FLOAT);
1839 lua_assert(dt == IRT_I64 || dt == IRT_U64);
1840 lua_assert(((ir-1)->op2 & IRCONV_TRUNC));
1841 hi = ra_dest(as, ir, RSET_GPR);
1842 lo = ra_dest(as, ir-1, rset_exclude(RSET_GPR, hi));
1843 if (ra_used(ir-1)) emit_rmro(as, XO_MOV, lo, RID_ESP, 0);
1844 /* NYI: Avoid wide-to-narrow store-to-load forwarding stall. */
1845 if (!(as->flags & JIT_F_SSE3)) { /* Set FPU rounding mode to default. */
1846 emit_rmro(as, XO_FLDCW, XOg_FLDCW, RID_ESP, 4);
1847 emit_rmro(as, XO_MOVto, lo, RID_ESP, 4);
1848 emit_gri(as, XG_ARITHi(XOg_AND), lo, 0xf3ff);
1849 }
1850 if (dt == IRT_U64) {
1851 /* For inputs in [2^63,2^64-1] add -2^64 and convert again. */
1852 MCLabel l_pop, l_end = emit_label(as);
1853 emit_x87op(as, XI_FPOP);
1854 l_pop = emit_label(as);
1855 emit_sjmp(as, l_end);
1856 emit_rmro(as, XO_MOV, hi, RID_ESP, 4);
1857 if ((as->flags & JIT_F_SSE3))
1858 emit_rmro(as, XO_FISTTPq, XOg_FISTTPq, RID_ESP, 0);
1859 else
1860 emit_rmro(as, XO_FISTPq, XOg_FISTPq, RID_ESP, 0);
1861 emit_rma(as, XO_FADDq, XOg_FADDq,
1862 lj_ir_k64_find(as->J, U64x(c3f00000,00000000)));
1863 emit_sjcc(as, CC_NS, l_pop);
1864 emit_rr(as, XO_TEST, hi, hi); /* Check if out-of-range (2^63). */
1865 }
1866 emit_rmro(as, XO_MOV, hi, RID_ESP, 4);
1867 if ((as->flags & JIT_F_SSE3)) { /* Truncation is easy with SSE3. */
1868 emit_rmro(as, XO_FISTTPq, XOg_FISTTPq, RID_ESP, 0);
1869 } else { /* Otherwise set FPU rounding mode to truncate before the store. */
1870 emit_rmro(as, XO_FISTPq, XOg_FISTPq, RID_ESP, 0);
1871 emit_rmro(as, XO_FLDCW, XOg_FLDCW, RID_ESP, 0);
1872 emit_rmro(as, XO_MOVtow, lo, RID_ESP, 0);
1873 emit_rmro(as, XO_ARITHw(XOg_OR), lo, RID_ESP, 0);
1874 emit_loadi(as, lo, 0xc00);
1875 emit_rmro(as, XO_FNSTCW, XOg_FNSTCW, RID_ESP, 0);
1876 }
1877 if (dt == IRT_U64)
1878 emit_x87op(as, XI_FDUP);
1879 emit_mrm(as, st == IRT_NUM ? XO_FLDq : XO_FLDd,
1880 st == IRT_NUM ? XOg_FLDq: XOg_FLDd,
1881 asm_fuseload(as, ir->op1, RSET_EMPTY));
1882}
1883#endif
1884
1767static void asm_strto(ASMState *as, IRIns *ir) 1885static void asm_strto(ASMState *as, IRIns *ir)
1768{ 1886{
1769 /* Force a spill slot for the destination register (if any). */ 1887 /* Force a spill slot for the destination register (if any). */
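Note: two details of the x87 helpers above are worth spelling out. FILDq always loads a signed 64 bit integer, so an unsigned source with the top bit set comes out 2^64 too small and is compensated with an FADDq of 2^64 (the 0x43f0000000000000 constant); the reverse direction subtracts 2^64 before the store when the value is >= 2^63 and lets the 64 bit store wrap it back. Without SSE3's FISTTP, the code also ORs 0xc00 into the FPU control word, which sets both rounding-control bits and selects round-toward-zero, i.e. C-style truncation, before restoring the old control word. A C model of the compensation (illustrative only; the x87 sequence adds in extended precision, so the last bit can differ from this double-rounding model in rare cases):

    #include <stdint.h>

    static double u64_to_double(uint64_t u)
    {
      double d = (double)(int64_t)u;     /* What FILDq effectively computes. */
      if ((int64_t)u < 0)                /* TEST hi,hi / JNS skips this. */
        d += 18446744073709551616.0;     /* FADDq of 2^64. */
      return d;
    }

    static uint64_t double_to_u64(double d)
    {
      if (d >= 9223372036854775808.0)    /* >= 2^63: bias, convert, wrap. */
        return (uint64_t)(int64_t)(d - 18446744073709551616.0);
      return (uint64_t)(int64_t)d;       /* What FIST(T)Pq stores. */
    }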
@@ -2644,6 +2762,18 @@ static void asm_powi(ASMState *as, IRIns *ir)
2644 ra_left(as, RID_EAX, ir->op2); 2762 ra_left(as, RID_EAX, ir->op2);
2645} 2763}
2646 2764
2765#if LJ_64 && LJ_HASFFI
2766static void asm_arith64(ASMState *as, IRIns *ir, IRCallID id)
2767{
2768 const CCallInfo *ci = &lj_ir_callinfo[id];
2769 IRRef args[2];
2770 args[0] = ir->op1;
2771 args[1] = ir->op2;
2772 asm_setupresult(as, ir, ci);
2773 asm_gencall(as, ci, args);
2774}
2775#endif
2776
2647/* Find out whether swapping operands might be beneficial. */ 2777/* Find out whether swapping operands might be beneficial. */
2648static int swapops(ASMState *as, IRIns *ir) 2778static int swapops(ASMState *as, IRIns *ir)
2649{ 2779{
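Note: asm_arith64() just routes a 64 bit integer POWI to a two-argument C call (lj_carith_powi64 or lj_carith_powu64), matching the commit message's note that POWI is lowered to a call by SPLIT or the x64 backend. What such a callee computes is ordinary binary exponentiation; the sketch below only illustrates that and is not claimed to be the body of the LuaJIT helpers (in particular it ignores negative exponents, which the signed variant has to treat specially):

    #include <stdint.h>

    static uint64_t powu64_sketch(uint64_t x, uint64_t k)
    {
      uint64_t r = 1;
      while (k) {
        if (k & 1) r *= x;   /* Multiply in the current bit's contribution. */
        x *= x;              /* Square the base for the next bit. */
        k >>= 1;
      }
      return r;              /* Wraps mod 2^64, like C unsigned arithmetic. */
    }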
@@ -2877,12 +3007,30 @@ static void asm_bitshift(ASMState *as, IRIns *ir, x86Shift xs)
2877/* -- Comparisons --------------------------------------------------------- */ 3007/* -- Comparisons --------------------------------------------------------- */
2878 3008
2879/* Virtual flags for unordered FP comparisons. */ 3009/* Virtual flags for unordered FP comparisons. */
2880#define VCC_U 0x100 /* Unordered. */ 3010#define VCC_U 0x1000 /* Unordered. */
2881#define VCC_P 0x200 /* Needs extra CC_P branch. */ 3011#define VCC_P 0x2000 /* Needs extra CC_P branch. */
2882#define VCC_S 0x400 /* Swap avoids CC_P branch. */ 3012#define VCC_S 0x4000 /* Swap avoids CC_P branch. */
2883#define VCC_PS (VCC_P|VCC_S) 3013#define VCC_PS (VCC_P|VCC_S)
2884 3014
2885static void asm_comp_(ASMState *as, IRIns *ir, int cc) 3015/* Map of comparisons to flags. ORDER IR. */
3016#define COMPFLAGS(ci, cin, cu, cf) ((ci)+((cu)<<4)+((cin)<<8)+(cf))
3017static const uint16_t asm_compmap[IR_ABC+1] = {
3018 /* signed non-eq unsigned flags */
3019 /* LT */ COMPFLAGS(CC_GE, CC_G, CC_AE, VCC_PS),
3020 /* GE */ COMPFLAGS(CC_L, CC_L, CC_B, 0),
3021 /* LE */ COMPFLAGS(CC_G, CC_G, CC_A, VCC_PS),
3022 /* GT */ COMPFLAGS(CC_LE, CC_L, CC_BE, 0),
3023 /* ULT */ COMPFLAGS(CC_AE, CC_A, CC_AE, VCC_U),
3024 /* UGE */ COMPFLAGS(CC_B, CC_B, CC_B, VCC_U|VCC_PS),
3025 /* ULE */ COMPFLAGS(CC_A, CC_A, CC_A, VCC_U),
3026 /* UGT */ COMPFLAGS(CC_BE, CC_B, CC_BE, VCC_U|VCC_PS),
3027 /* EQ */ COMPFLAGS(CC_NE, CC_NE, CC_NE, VCC_P),
3028 /* NE */ COMPFLAGS(CC_E, CC_E, CC_E, VCC_U|VCC_P),
3029 /* ABC */ COMPFLAGS(CC_BE, CC_B, CC_BE, VCC_U|VCC_PS) /* Same as UGT. */
3030};
3031
3032/* FP and integer comparisons. */
3033static void asm_comp(ASMState *as, IRIns *ir, uint32_t cc)
2886{ 3034{
2887 if (irt_isnum(ir->t)) { 3035 if (irt_isnum(ir->t)) {
2888 IRRef lref = ir->op1; 3036 IRRef lref = ir->op1;
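Note: each asm_compmap[] entry packs four things into one uint16_t: the condition to guard on for an ordinary signed/FP compare (the inverse of the IR comparison, since guards branch out on failure) in the low nibble, the unsigned variant in bits 4-7, the hiword condition with the equality part stripped in bits 8-11 (new in this patch, which is why the VCC_* flags moved from 0x100/0x200/0x400 up to 0x1000/0x2000/0x4000), and the VCC flags on top. Accessors showing how asm_comp() and asm_comp_int64() read the fields (names are descriptive, not LuaJIT identifiers):

    #define VCC_U  0x1000  /* Unordered. */
    #define VCC_P  0x2000  /* Needs extra CC_P branch. */
    #define VCC_S  0x4000  /* Swap avoids CC_P branch. */

    #define CC_ORDINARY(cc)  ((cc) & 15)         /* Signed/FP guard condition. */
    #define CC_UNSIGNED(cc)  (((cc) >> 4) & 15)  /* Unsigned variant; used for the loword compare. */
    #define CC_HIWORD(cc)    (((cc) >> 8) & 15)  /* Hiword condition without equality. */
    #define CC_FLAGS(cc)     ((cc) & (VCC_U|VCC_P|VCC_S))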
@@ -3008,15 +3156,7 @@ static void asm_comp_(ASMState *as, IRIns *ir, int cc)
3008 if (irl+1 == ir) /* Referencing previous ins? */ 3156 if (irl+1 == ir) /* Referencing previous ins? */
3009 as->testmcp = as->mcp; /* Set flag to drop test r,r if possible. */ 3157 as->testmcp = as->mcp; /* Set flag to drop test r,r if possible. */
3010 } else { 3158 } else {
3011 x86Op xo; 3159 emit_gmrmi(as, XG_ARITHi(XOg_CMP), r64 + left, imm);
3012 if (checki8(imm)) {
3013 emit_i8(as, imm);
3014 xo = XO_ARITHi8;
3015 } else {
3016 emit_i32(as, imm);
3017 xo = XO_ARITHi;
3018 }
3019 emit_mrm(as, xo, r64 + XOg_CMP, left);
3020 } 3160 }
3021 } 3161 }
3022 } else { 3162 } else {
@@ -3028,8 +3168,133 @@ static void asm_comp_(ASMState *as, IRIns *ir, int cc)
3028 } 3168 }
3029} 3169}
3030 3170
3031#define asm_comp(as, ir, ci, cf, cu) \ 3171#if LJ_32 && LJ_HASFFI
3032 asm_comp_(as, ir, (ci)+((cf)<<4)+(cu)) 3172/* 64 bit integer comparisons in 32 bit mode. */
3173static void asm_comp_int64(ASMState *as, IRIns *ir)
3174{
3175 uint32_t cc = asm_compmap[(ir-1)->o];
3176 RegSet allow = RSET_GPR;
3177 Reg lefthi = RID_NONE, leftlo = RID_NONE;
3178 Reg righthi = RID_NONE, rightlo = RID_NONE;
3179 MCLabel l_around;
3180 x86ModRM mrm;
3181
3182 as->curins--; /* Skip loword ins. Avoids failing in noconflict(), too. */
3183
3184 /* Allocate/fuse hiword operands. */
3185 if (irref_isk(ir->op2)) {
3186 lefthi = asm_fuseload(as, ir->op1, allow);
3187 } else {
3188 lefthi = ra_alloc1(as, ir->op1, allow);
3189 righthi = asm_fuseload(as, ir->op2, allow);
3190 if (righthi == RID_MRM) {
3191 if (as->mrm.base != RID_NONE) rset_clear(allow, as->mrm.base);
3192 if (as->mrm.idx != RID_NONE) rset_clear(allow, as->mrm.idx);
3193 } else {
3194 rset_clear(allow, righthi);
3195 }
3196 }
3197 mrm = as->mrm; /* Save state for hiword instruction. */
3198
3199 /* Allocate/fuse loword operands. */
3200 if (irref_isk((ir-1)->op2)) {
3201 leftlo = asm_fuseload(as, (ir-1)->op1, allow);
3202 } else {
3203 leftlo = ra_alloc1(as, (ir-1)->op1, allow);
3204 rightlo = asm_fuseload(as, (ir-1)->op2, allow);
3205 if (rightlo == RID_MRM) {
3206 if (as->mrm.base != RID_NONE) rset_clear(allow, as->mrm.base);
3207 if (as->mrm.idx != RID_NONE) rset_clear(allow, as->mrm.idx);
3208 } else {
3209 rset_clear(allow, rightlo);
3210 }
3211 }
3212
3213 /* All register allocations must be performed _before_ this point. */
3214 l_around = emit_label(as);
3215 as->invmcp = as->testmcp = NULL; /* Cannot use these optimizations. */
3216
3217 /* Loword comparison and branch. */
3218 asm_guardcc(as, cc >> 4); /* Always use unsigned compare for loword. */
3219 if (ra_noreg(rightlo)) {
3220 int32_t imm = IR((ir-1)->op2)->i;
3221 if (imm == 0 && ((cc >> 4) & 0xa) != 0x2 && leftlo != RID_MRM)
3222 emit_rr(as, XO_TEST, leftlo, leftlo);
3223 else
3224 emit_gmrmi(as, XG_ARITHi(XOg_CMP), leftlo, imm);
3225 } else {
3226 emit_mrm(as, XO_CMP, leftlo, rightlo);
3227 }
3228
3229 /* Hiword comparison and branches. */
3230 if ((cc & 15) != CC_NE)
3231 emit_sjcc(as, CC_NE, l_around); /* Hiword unequal: skip loword compare. */
3232 if ((cc & 15) != CC_E)
3233 asm_guardcc(as, cc >> 8); /* Hiword compare without equality check. */
3234 as->mrm = mrm; /* Restore state. */
3235 if (ra_noreg(righthi)) {
3236 int32_t imm = IR(ir->op2)->i;
3237 if (imm == 0 && (cc & 0xa) != 0x2 && lefthi != RID_MRM)
3238 emit_rr(as, XO_TEST, lefthi, lefthi);
3239 else
3240 emit_gmrmi(as, XG_ARITHi(XOg_CMP), lefthi, imm);
3241 } else {
3242 emit_mrm(as, XO_CMP, lefthi, righthi);
3243 }
3244}
3245#endif
3246
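Note: asm_comp_int64() implements a 64 bit compare as two 32 bit compares: the hiwords are compared first with the signedness of the original operator but with equality stripped (bits 8-11 of the map entry), and only if they are equal does the unsigned loword compare decide. The decomposition in plain C, for signed less-than (illustration only, the emitted code guards and branches instead of returning):

    #include <stdint.h>

    static int lt64_split(int32_t ahi, uint32_t alo, int32_t bhi, uint32_t blo)
    {
      if (ahi != bhi)
        return ahi < bhi;   /* Hiword compare, signed, equality stripped. */
      return alo < blo;     /* Loword compare, always unsigned. */
    }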
3247/* -- Support for 64 bit ops in 32 bit mode ------------------------------- */
3248
3249/* Hiword op of a split 64 bit op. Previous op must be the loword op. */
3250static void asm_hiop(ASMState *as, IRIns *ir)
3251{
3252#if LJ_32 && LJ_HASFFI
3253 /* HIOP is marked as a store because it needs its own DCE logic. */
3254 int uselo = ra_used(ir-1), usehi = ra_used(ir); /* Loword/hiword used? */
3255 if (LJ_UNLIKELY(!(as->flags & JIT_F_OPT_DCE))) uselo = usehi = 1;
3256 if ((ir-1)->o == IR_CONV) { /* Conversions to/from 64 bit. */
3257 if (usehi || uselo) {
3258 if (irt_isfp(ir->t))
3259 asm_conv_fp_int64(as, ir);
3260 else
3261 asm_conv_int64_fp(as, ir);
3262 }
3263 as->curins--; /* Always skip the CONV. */
3264 return;
3265 } else if ((ir-1)->o <= IR_NE) { /* 64 bit integer comparisons. ORDER IR. */
3266 asm_comp_int64(as, ir);
3267 return;
3268 }
3269 if (!usehi) return; /* Skip unused hiword op for all remaining ops. */
3270 switch ((ir-1)->o) {
3271 case IR_ADD:
3272 asm_intarith(as, ir, uselo ? XOg_ADC : XOg_ADD);
3273 break;
3274 case IR_SUB:
3275 asm_intarith(as, ir, uselo ? XOg_SBB : XOg_SUB);
3276 break;
3277 case IR_NEG: {
3278 Reg dest = ra_dest(as, ir, RSET_GPR);
3279 emit_rr(as, XO_GROUP3, XOg_NEG, dest);
3280 if (uselo) {
3281 emit_i8(as, 0);
3282 emit_rr(as, XO_ARITHi8, XOg_ADC, dest);
3283 }
3284 ra_left(as, dest, ir->op1);
3285 break;
3286 }
3287 case IR_CALLN:
3288 ra_destreg(as, ir, RID_RETHI);
3289 if (!uselo)
3290 ra_allocref(as, ir->op1, RID2RSET(RID_RET)); /* Mark call as used. */
3291 break;
3292 default: lua_assert(0); break;
3293 }
3294#else
3295 UNUSED(as); UNUSED(ir); lua_assert(0); /* Unused on x64 or without FFI. */
3296#endif
3297}
3033 3298
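Note: for IR_NEG the hiword half emitted above pairs with the loword NEG so that the sequence ends up executing as NEG lo; ADC hi,0; NEG hi (the assembler emits code backwards). NEG of the loword sets the carry flag exactly when the loword was nonzero, and -(hi + carry) is the correct hiword of the 64 bit negation. A C model of the same arithmetic (not LuaJIT code):

    #include <stdint.h>

    static void neg64_split(uint32_t *lo, uint32_t *hi)
    {
      uint32_t carry = (*lo != 0);            /* CF after NEG lo. */
      *lo = (uint32_t)(0u - *lo);             /* NEG lo. */
      *hi = (uint32_t)(0u - (*hi + carry));   /* ADC hi,0 ; NEG hi. */
    }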
3034/* -- Stack handling ------------------------------------------------------ */ 3299/* -- Stack handling ------------------------------------------------------ */
3035 3300
@@ -3682,21 +3947,16 @@ static void asm_ir(ASMState *as, IRIns *ir)
3682 switch ((IROp)ir->o) { 3947 switch ((IROp)ir->o) {
3683 /* Miscellaneous ops. */ 3948 /* Miscellaneous ops. */
3684 case IR_LOOP: asm_loop(as); break; 3949 case IR_LOOP: asm_loop(as); break;
3685 case IR_NOP: break; 3950 case IR_NOP: lua_assert(!ra_used(ir)); break;
3686 case IR_PHI: asm_phi(as, ir); break; 3951 case IR_PHI: asm_phi(as, ir); break;
3952 case IR_HIOP: asm_hiop(as, ir); break;
3687 3953
3688 /* Guarded assertions. */ 3954 /* Guarded assertions. */
3689 case IR_LT: asm_comp(as, ir, CC_GE, CC_AE, VCC_PS); break; 3955 case IR_LT: case IR_GE: case IR_LE: case IR_GT:
3690 case IR_GE: asm_comp(as, ir, CC_L, CC_B, 0); break; 3956 case IR_ULT: case IR_UGE: case IR_ULE: case IR_UGT:
3691 case IR_LE: asm_comp(as, ir, CC_G, CC_A, VCC_PS); break; 3957 case IR_EQ: case IR_NE: case IR_ABC:
3692 case IR_GT: asm_comp(as, ir, CC_LE, CC_BE, 0); break; 3958 asm_comp(as, ir, asm_compmap[ir->o]);
3693 case IR_ULT: asm_comp(as, ir, CC_AE, CC_AE, VCC_U); break; 3959 break;
3694 case IR_UGE: asm_comp(as, ir, CC_B, CC_B, VCC_U|VCC_PS); break;
3695 case IR_ULE: asm_comp(as, ir, CC_A, CC_A, VCC_U); break;
3696 case IR_ABC:
3697 case IR_UGT: asm_comp(as, ir, CC_BE, CC_BE, VCC_U|VCC_PS); break;
3698 case IR_EQ: asm_comp(as, ir, CC_NE, CC_NE, VCC_P); break;
3699 case IR_NE: asm_comp(as, ir, CC_E, CC_E, VCC_U|VCC_P); break;
3700 3960
3701 case IR_RETF: asm_retf(as, ir); break; 3961 case IR_RETF: asm_retf(as, ir); break;
3702 3962
@@ -3744,7 +4004,15 @@ static void asm_ir(ASMState *as, IRIns *ir)
3744 case IR_FPMATH: case IR_ATAN2: case IR_LDEXP: 4004 case IR_FPMATH: case IR_ATAN2: case IR_LDEXP:
3745 asm_fpmath(as, ir); 4005 asm_fpmath(as, ir);
3746 break; 4006 break;
3747 case IR_POWI: asm_powi(as, ir); break; 4007 case IR_POWI:
4008#if LJ_64 && LJ_HASFFI
4009 if (!irt_isnum(ir->t))
4010 asm_arith64(as, ir, irt_isi64(ir->t) ? IRCALL_lj_carith_powi64 :
4011 IRCALL_lj_carith_powu64);
4012 else
4013#endif
4014 asm_powi(as, ir);
4015 break;
3748 4016
3749 /* Overflow-checking arithmetic ops. Note: don't use LEA here! */ 4017 /* Overflow-checking arithmetic ops. Note: don't use LEA here! */
3750 case IR_ADDOV: asm_intarith(as, ir, XOg_ADD); break; 4018 case IR_ADDOV: asm_intarith(as, ir, XOg_ADD); break;
@@ -3801,6 +4069,7 @@ static void asm_trace(ASMState *as)
3801{ 4069{
3802 for (as->curins--; as->curins > as->stopins; as->curins--) { 4070 for (as->curins--; as->curins > as->stopins; as->curins--) {
3803 IRIns *ir = IR(as->curins); 4071 IRIns *ir = IR(as->curins);
4072 lua_assert(!(LJ_32 && irt_isint64(ir->t))); /* Handled by SPLIT. */
3804 if (!ra_used(ir) && !ir_sideeff(ir) && (as->flags & JIT_F_OPT_DCE)) 4073 if (!ra_used(ir) && !ir_sideeff(ir) && (as->flags & JIT_F_OPT_DCE))
3805 continue; /* Dead-code elimination can be soooo easy. */ 4074 continue; /* Dead-code elimination can be soooo easy. */
3806 if (irt_isguard(ir->t)) 4075 if (irt_isguard(ir->t))
@@ -3864,11 +4133,10 @@ static void asm_setup_regsp(ASMState *as, GCtrace *T)
3864 case IR_CALLN: case IR_CALLL: case IR_CALLS: { 4133 case IR_CALLN: case IR_CALLL: case IR_CALLS: {
3865 const CCallInfo *ci = &lj_ir_callinfo[ir->op2]; 4134 const CCallInfo *ci = &lj_ir_callinfo[ir->op2];
3866#if LJ_64 4135#if LJ_64
3867 /* NYI: add stack slots for x64 calls with many args. */
3868 lua_assert(CCI_NARGS(ci) <= (LJ_ABI_WIN ? 4 : 6)); 4136 lua_assert(CCI_NARGS(ci) <= (LJ_ABI_WIN ? 4 : 6));
3869 ir->prev = REGSP_HINT(irt_isnum(ir->t) ? RID_FPRET : RID_RET); 4137 ir->prev = REGSP_HINT(irt_isnum(ir->t) ? RID_FPRET : RID_RET);
3870#else 4138#else
3871 /* NYI: not fastcall-aware, but doesn't matter (yet). */ 4139 lua_assert(!(ci->flags & CCI_FASTCALL) || CCI_NARGS(ci) <= 2);
3872 if (CCI_NARGS(ci) > (uint32_t)as->evenspill) /* Leave room for args. */ 4140 if (CCI_NARGS(ci) > (uint32_t)as->evenspill) /* Leave room for args. */
3873 as->evenspill = (int32_t)CCI_NARGS(ci); 4141 as->evenspill = (int32_t)CCI_NARGS(ci);
3874 ir->prev = REGSP_HINT(RID_RET); 4142 ir->prev = REGSP_HINT(RID_RET);
@@ -3878,6 +4146,12 @@ static void asm_setup_regsp(ASMState *as, GCtrace *T)
3878 (RSET_SCRATCH & ~RSET_FPR) : RSET_SCRATCH; 4146 (RSET_SCRATCH & ~RSET_FPR) : RSET_SCRATCH;
3879 continue; 4147 continue;
3880 } 4148 }
4149#if LJ_32 && LJ_HASFFI
4150 case IR_HIOP:
4151 if ((ir-1)->o == IR_CALLN)
4152 ir->prev = REGSP_HINT(RID_RETHI);
4153 break;
4154#endif
3881 /* C calls evict all scratch regs and return results in RID_RET. */ 4155 /* C calls evict all scratch regs and return results in RID_RET. */
3882 case IR_SNEW: case IR_NEWREF: 4156 case IR_SNEW: case IR_NEWREF:
3883#if !LJ_64 4157#if !LJ_64
@@ -3894,6 +4168,14 @@ static void asm_setup_regsp(ASMState *as, GCtrace *T)
3894 as->modset = RSET_SCRATCH; 4168 as->modset = RSET_SCRATCH;
3895 break; 4169 break;
3896 case IR_POWI: 4170 case IR_POWI:
4171#if LJ_64 && LJ_HASFFI
4172 if (!irt_isnum(ir->t)) {
4173 ir->prev = REGSP_HINT(RID_RET);
4174 if (inloop)
4175 as->modset |= (RSET_SCRATCH & RSET_GPR);
4176 continue;
4177 }
4178#endif
3897 ir->prev = REGSP_HINT(RID_XMM0); 4179 ir->prev = REGSP_HINT(RID_XMM0);
3898 if (inloop) 4180 if (inloop)
3899 as->modset |= RSET_RANGE(RID_XMM0, RID_XMM1+1)|RID2RSET(RID_EAX); 4181 as->modset |= RSET_RANGE(RID_XMM0, RID_XMM1+1)|RID2RSET(RID_EAX);