Diffstat (limited to 'src/lj_asm.c')
 src/lj_asm.c | 444
 1 file changed, 363 insertions(+), 81 deletions(-)
diff --git a/src/lj_asm.c b/src/lj_asm.c
index cc2ae597..441700d4 100644
--- a/src/lj_asm.c
+++ b/src/lj_asm.c
@@ -347,6 +347,20 @@ static void emit_addptr(ASMState *as, Reg r, int32_t ofs)
   }
 }
 
+/* op rm/mrm, i */
+static void emit_gmrmi(ASMState *as, x86Group xg, Reg rb, int32_t i)
+{
+  x86Op xo;
+  if (checki8(i)) {
+    emit_i8(as, i);
+    xo = XG_TOXOi8(xg);
+  } else {
+    emit_i32(as, i);
+    xo = XG_TOXOi(xg);
+  }
+  emit_mrm(as, xo, (Reg)(xg & 7) | (rb & REX_64), (rb & ~REX_64));
+}
+
 /* -- Emit moves ---------------------------------------------------------- */
 
 /* mov [base+ofs], i */
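The new emit_gmrmi helper folds the "pick imm8 or imm32" decision that some emitters previously repeated inline (see the CMP cleanup further down). x86 group-1 ALU instructions have a short form with a sign-extended 8 bit immediate and a long form with a 32 bit immediate, with the group index encoded in the reg field of the ModRM byte. A minimal standalone sketch of the same size decision, outside the JIT (the helper names are illustrative; 0x83/0x81 are the generic x86 opcodes, not LuaJIT identifiers):

    #include <stdint.h>

    /* Does the value survive a round trip through a sign-extended byte? */
    static int fits_i8(int32_t i) { return i == (int32_t)(int8_t)i; }

    /* Group-1 ALU op with immediate: 0x83 /r ib (short) vs. 0x81 /r id (long). */
    static uint8_t alu_imm_opcode(int32_t imm)
    {
      return fits_i8(imm) ? 0x83 : 0x81;  /* e.g. cmp eax, 5 -> 83 F8 05 */
    }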
@@ -371,7 +385,10 @@ static void emit_movmroi(ASMState *as, Reg base, int32_t ofs, int32_t i)
 /* mov r, i / xor r, r */
 static void emit_loadi(ASMState *as, Reg r, int32_t i)
 {
-  if (i == 0) {
+  /* XOR r,r is shorter, but modifies the flags. This is bad for HIOP. */
+  if (i == 0 && !(LJ_32 && (IR(as->curins)->o == IR_HIOP ||
+                            (as->curins+1 < as->T->nins &&
+                             IR(as->curins+1)->o == IR_HIOP)))) {
     emit_rr(as, XO_ARITH(XOg_XOR), r, r);
   } else {
     MCode *p = as->mcp;
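The guard around the XOR shortcut matters because HIOP pairs rely on EFLAGS surviving between the loword and hiword halves of a split 64 bit op: the loword ADD/SUB sets the carry flag and the hiword ADC/SBB consumes it, and since machine code is emitted backwards a constant load can land between the two. A scalar C sketch of that carry dependency (illustrative only, not LuaJIT code):

    #include <stdint.h>

    /* 64 bit add split into 32 bit halves, the way ADD/ADC are paired. */
    static uint64_t add64(uint32_t alo, uint32_t ahi, uint32_t blo, uint32_t bhi)
    {
      uint32_t lo = alo + blo;
      uint32_t carry = lo < alo;        /* Carry out of the loword add. */
      uint32_t hi = ahi + bhi + carry;  /* Hiword add must still see it. */
      return ((uint64_t)hi << 32) | lo;
    }

An XOR r,r emitted in between would clear the carry flag and silently corrupt the hiword result, which is why emit_loadi falls back to the longer MOV near a HIOP.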
@@ -422,6 +439,19 @@ static void emit_loadn(ASMState *as, Reg r, cTValue *tv)
 /* Label for short jumps. */
 typedef MCode *MCLabel;
 
+#if LJ_32 && LJ_HASFFI
+/* jmp short target */
+static void emit_sjmp(ASMState *as, MCLabel target)
+{
+  MCode *p = as->mcp;
+  ptrdiff_t delta = target - p;
+  lua_assert(delta == (int8_t)delta);
+  p[-1] = (MCode)(int8_t)delta;
+  p[-2] = XI_JMPs;
+  as->mcp = p - 2;
+}
+#endif
+
 /* jcc short target */
 static void emit_sjcc(ASMState *as, int cc, MCLabel target)
 {
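emit_sjmp is the unconditional counterpart of the existing emit_sjcc: it writes a two byte short jump (the XI_JMPs opcode byte followed by a signed 8 bit displacement, so a forward hop over 16 bytes of code encodes as EB 10) backwards into the code buffer, and the assertion checks that the target really is within rel8 range. It is only needed by the 32 bit FFI paths below, e.g. to skip the second conversion attempt in asm_conv_int64_fp.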
@@ -630,7 +660,7 @@ static Reg ra_rematk(ASMState *as, IRIns *ir)
   } else if (ir->o == IR_KPRI) {  /* REF_NIL stores ASMREF_L register. */
     lua_assert(irt_isnil(ir->t));
     emit_getgl(as, r, jit_L);
-#if LJ_64  /* NYI: 32 bit register pairs. */
+#if LJ_64
   } else if (ir->o == IR_KINT64) {
     emit_loadu64(as, r, ir_kint64(ir)->u64);
 #endif
@@ -681,8 +711,7 @@ static Reg ra_releasetmp(ASMState *as, IRRef ref)
 #if LJ_64
 #define REX_64IR(ir, r)  ((r) + (irt_is64((ir)->t) ? REX_64 : 0))
 #else
-/* NYI: 32 bit register pairs. */
-#define REX_64IR(ir, r)  check_exp(!irt_is64((ir)->t), (r))
+#define REX_64IR(ir, r)  (r)
 #endif
 
 /* Generic move between two regs. */
@@ -939,7 +968,7 @@ static void ra_left(ASMState *as, Reg dest, IRRef lref)
       emit_loadn(as, dest, tv);
       return;
     }
-#if LJ_64  /* NYI: 32 bit register pairs. */
+#if LJ_64
   } else if (ir->o == IR_KINT64) {
     emit_loadu64(as, dest, ir_kint64(ir)->u64);
     return;
@@ -1463,7 +1492,7 @@ static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args)
 #endif
     if (r) {  /* Argument is in a register. */
       if (r < RID_MAX_GPR && ref < ASMREF_TMP1) {
-#if LJ_64  /* NYI: 32 bit register pairs. */
+#if LJ_64
         if (ir->o == IR_KINT64)
           emit_loadu64(as, r, ir_kint64(ir)->u64);
         else
@@ -1519,7 +1548,7 @@ static void asm_setupresult(ASMState *as, IRIns *ir, const CCallInfo *ci)
   ra_evictset(as, drop);  /* Evictions must be performed first. */
   if (ra_used(ir)) {
     if (irt_isfp(ir->t)) {
       int32_t ofs = sps_scale(ir->s);  /* Use spill slot or temp slots. */
 #if LJ_64
       if ((ci->flags & CCI_CASTU64)) {
         Reg dest = ir->r;
@@ -1632,19 +1661,24 @@ static void asm_conv(ASMState *as, IRIns *ir)
   int stfp = (st == IRT_NUM || st == IRT_FLOAT);
   IRRef lref = ir->op1;
   lua_assert(irt_type(ir->t) != st);
+  lua_assert(!(LJ_32 && (irt_isint64(ir->t) || st64)));  /* Handled by SPLIT. */
   if (irt_isfp(ir->t)) {
     Reg dest = ra_dest(as, ir, RSET_FPR);
     if (stfp) {  /* FP to FP conversion. */
       Reg left = asm_fuseload(as, lref, RSET_FPR);
       emit_mrm(as, st == IRT_NUM ? XO_CVTSD2SS : XO_CVTSS2SD, dest, left);
       if (left == dest) return;  /* Avoid the XO_XORPS. */
-#if LJ_32
-    } else if (st >= IRT_U32) {
-      /* NYI: 64 bit integer or uint32_t to number conversion. */
-      setintV(&as->J->errinfo, ir->o);
-      lj_trace_err_info(as->J, LJ_TRERR_NYIIR);
+    } else if (LJ_32 && st == IRT_U32) {  /* U32 to FP conversion on x86. */
+      /* number = (2^52+2^51 .. u32) - (2^52+2^51) */
+      cTValue *k = lj_ir_k64_find(as->J, U64x(43380000,00000000));
+      Reg bias = ra_scratch(as, rset_exclude(RSET_FPR, dest));
+      if (irt_isfloat(ir->t))
+        emit_rr(as, XO_CVTSD2SS, dest, dest);
+      emit_rr(as, XO_SUBSD, dest, bias);  /* Subtract 2^52+2^51 bias. */
+      emit_rr(as, XO_XORPS, dest, bias);  /* Merge bias and integer. */
+      emit_loadn(as, bias, k);
+      emit_mrm(as, XO_MOVD, dest, asm_fuseload(as, lref, RSET_GPR));
       return;
-#endif
     } else {  /* Integer to FP conversion. */
       Reg left = (LJ_64 && (st == IRT_U32 || st == IRT_U64)) ?
                  ra_alloc1(as, lref, RSET_GPR) :
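The replacement for the old NYI branch uses the classic exponent-bias trick: MOVD places the u32 in the low word of an xmm register, XORPS merges it with the bit pattern of the double 2^52+2^51 (the 0x43380000,00000000 constant), which yields exactly 2^52+2^51+u32, and SUBSD removes the bias again; CVTSD2SS narrows the result when the destination is a float. The subtraction is exact because the ulp at that magnitude is 1 and any u32 fits well below 2^51. A portable C sketch of the same arithmetic, outside the JIT:

    #include <stdint.h>
    #include <string.h>

    /* u32 -> double via the 2^52+2^51 bias, mirroring MOVD/XORPS/SUBSD. */
    static double u32_to_double(uint32_t u)
    {
      const double bias = 6755399441055744.0;     /* 2^52 + 2^51 */
      uint64_t bits = 0x4338000000000000ULL | u;  /* Bias exponent, u in the low mantissa bits. */
      double d;
      memcpy(&d, &bits, sizeof(d));               /* Reinterpret the bits. */
      return d - bias;                            /* Exact: equals (double)u. */
    }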
@@ -1663,41 +1697,47 @@ static void asm_conv(ASMState *as, IRIns *ir)
     emit_rr(as, XO_XORPS, dest, dest);  /* Avoid partial register stall. */
   } else if (stfp) {  /* FP to integer conversion. */
     if (irt_isguard(ir->t)) {
-      lua_assert(!irt_is64(ir->t));  /* No support for checked 64 bit conv. */
+      /* Checked conversions are only supported from number to int. */
+      lua_assert(irt_isint(ir->t) && st == IRT_NUM);
       asm_tointg(as, ir, ra_alloc1(as, lref, RSET_FPR));
-#if LJ_32
-    } else if (irt_isi64(ir->t) || irt_isu64(ir->t) || irt_isu32(ir->t)) {
-      /* NYI: number to 64 bit integer or uint32_t conversion. */
-      setintV(&as->J->errinfo, ir->o);
-      lj_trace_err_info(as->J, LJ_TRERR_NYIIR);
-#endif
     } else {
       Reg dest = ra_dest(as, ir, RSET_GPR);
       x86Op op = st == IRT_NUM ?
                  ((ir->op2 & IRCONV_TRUNC) ? XO_CVTTSD2SI : XO_CVTSD2SI) :
                  ((ir->op2 & IRCONV_TRUNC) ? XO_CVTTSS2SI : XO_CVTSS2SI);
-      if (LJ_64 && irt_isu64(ir->t)) {
-        const void *k = lj_ir_k64_find(as->J, U64x(c3f00000,00000000));
-        MCLabel l_end = emit_label(as);
-        Reg left = IR(lref)->r;
+      if (LJ_32 && irt_isu32(ir->t)) {  /* FP to U32 conversion on x86. */
+        /* u32 = (int32_t)(number - 2^31) + 2^31 */
+        Reg tmp = ra_noreg(IR(lref)->r) ? ra_alloc1(as, lref, RSET_FPR) :
+                                          ra_scratch(as, RSET_FPR);
+        emit_gri(as, XG_ARITHi(XOg_ADD), dest, (int32_t)0x80000000);
+        emit_rr(as, op, dest, tmp);
+        if (st == IRT_NUM)
+          emit_rma(as, XO_ADDSD, tmp,
+                   lj_ir_k64_find(as->J, U64x(c1e00000,00000000)));
+        else
+          emit_rma(as, XO_ADDSS, tmp,
+                   lj_ir_k64_find(as->J, U64x(00000000,cf000000)));
+        ra_left(as, tmp, lref);
+      } else if (LJ_64 && irt_isu64(ir->t)) {
         /* For inputs in [2^63,2^64-1] add -2^64 and convert again. */
-        if (ra_hasreg(left)) {
-          Reg tmpn = ra_scratch(as, rset_exclude(RSET_FPR, left));
-          emit_rr(as, op, dest|REX_64, tmpn);
-          emit_rr(as, st == IRT_NUM ? XO_ADDSD : XO_ADDSS, tmpn, left);
-          emit_rma(as, st == IRT_NUM ? XMM_MOVRM(as) : XO_MOVSS, tmpn, k);
-        } else {
-          left = ra_allocref(as, lref, RSET_FPR);
-          emit_rr(as, op, dest|REX_64, left);
-          emit_rma(as, st == IRT_NUM ? XO_ADDSD : XO_ADDSS, left, k);
-        }
+        Reg tmp = ra_noreg(IR(lref)->r) ? ra_alloc1(as, lref, RSET_FPR) :
+                                          ra_scratch(as, RSET_FPR);
+        MCLabel l_end = emit_label(as);
+        emit_rr(as, op, dest|REX_64, tmp);
+        if (st == IRT_NUM)
+          emit_rma(as, XO_ADDSD, tmp,
+                   lj_ir_k64_find(as->J, U64x(c3f00000,00000000)));
+        else
+          emit_rma(as, XO_ADDSS, tmp,
+                   lj_ir_k64_find(as->J, U64x(00000000,df800000)));
         emit_sjcc(as, CC_NS, l_end);
         emit_rr(as, XO_TEST, dest|REX_64, dest);  /* Check if dest < 2^63. */
-        emit_rr(as, op, dest|REX_64, left);
+        emit_rr(as, op, dest|REX_64, tmp);
+        ra_left(as, tmp, lref);
       } else {
         Reg left = asm_fuseload(as, lref, RSET_FPR);
         if (LJ_64 && irt_isu32(ir->t))
-          emit_rr(as, XO_MOV, dest, dest);  /* Zero upper 32 bits. */
+          emit_rr(as, XO_MOV, dest, dest);  /* Zero hiword. */
         emit_mrm(as, op,
                  dest|((LJ_64 &&
                         (irt_is64(ir->t) || irt_isu32(ir->t))) ? REX_64 : 0),
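Both unsigned cases work around the fact that CVT(T)SD2SI only produces signed results. For U32 on x86 the input is shifted into signed range with ADDSD of -2^31 (the 0xc1e00000,00000000 constant), converted, and 0x80000000 is added back with a plain integer ADD; for U64 on x64, the emitted code converts once, tests the sign of the result (TEST/CC_NS), and for inputs at or above 2^63 adds -2^64 to the source and converts again. A scalar C sketch of both paths, with rounding-mode details set aside (the casts stand in for CVTTSD2SI):

    #include <stdint.h>

    /* FP -> u32 on x86: shift by 2^31 into signed range, convert, shift back. */
    static uint32_t double_to_u32(double n)
    {
      int32_t i = (int32_t)(n - 2147483648.0);  /* ADDSD -2^31, then CVTTSD2SI. */
      return (uint32_t)i + 0x80000000u;         /* ADD dest, 0x80000000. */
    }

    /* FP -> u64 on x64: values >= 2^63 are converted as n - 2^64. */
    static uint64_t double_to_u64(double n)
    {
      if (n < 9223372036854775808.0)            /* Below 2^63: plain convert. */
        return (uint64_t)(int64_t)n;
      return (uint64_t)(int64_t)(n - 18446744073709551616.0);
    }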
@@ -1728,12 +1768,10 @@ static void asm_conv(ASMState *as, IRIns *ir)
       emit_mrm(as, op, dest, left);
     }
   } else {  /* 32/64 bit integer conversions. */
-    if (irt_is64(ir->t)) {
-#if LJ_32
-      /* NYI: conversion to 64 bit integers. */
-      setintV(&as->J->errinfo, ir->o);
-      lj_trace_err_info(as->J, LJ_TRERR_NYIIR);
-#else
+    if (LJ_32) {  /* Only need to handle 32/32 bit no-op (cast) on x86. */
+      Reg dest = ra_dest(as, ir, RSET_GPR);
+      ra_left(as, dest, lref);  /* Do nothing, but may need to move regs. */
+    } else if (irt_is64(ir->t)) {
       Reg dest = ra_dest(as, ir, RSET_GPR);
       if (st64 || !(ir->op2 & IRCONV_SEXT)) {
         /* 64/64 bit no-op (cast) or 32 to 64 bit zero extension. */
@@ -1742,21 +1780,14 @@ static void asm_conv(ASMState *as, IRIns *ir)
         Reg left = asm_fuseload(as, lref, RSET_GPR);
         emit_mrm(as, XO_MOVSXd, dest|REX_64, left);
       }
-#endif
     } else {
       Reg dest = ra_dest(as, ir, RSET_GPR);
       if (st64) {
-#if LJ_32
-        /* NYI: conversion from 64 bit integers. */
-        setintV(&as->J->errinfo, ir->o);
-        lj_trace_err_info(as->J, LJ_TRERR_NYIIR);
-#else
         Reg left = asm_fuseload(as, lref, RSET_GPR);
-        /* This is either a 32 bit reg/reg mov which zeroes the hi-32 bits
-        ** or a load of the lower 32 bits from a 64 bit address.
+        /* This is either a 32 bit reg/reg mov which zeroes the hiword
+        ** or a load of the loword from a 64 bit address.
         */
         emit_mrm(as, XO_MOV, dest, left);
-#endif
       } else {  /* 32/32 bit no-op (cast). */
         ra_left(as, dest, lref);  /* Do nothing, but may need to move regs. */
       }
@@ -1764,6 +1795,93 @@ static void asm_conv(ASMState *as, IRIns *ir)
   }
 }
 
+#if LJ_32 && LJ_HASFFI
+/* No SSE conversions to/from 64 bit on x86, so resort to ugly x87 code. */
+
+/* 64 bit integer to FP conversion in 32 bit mode. */
+static void asm_conv_fp_int64(ASMState *as, IRIns *ir)
+{
+  Reg hi = ra_alloc1(as, ir->op1, RSET_GPR);
+  Reg lo = ra_alloc1(as, (ir-1)->op1, rset_exclude(RSET_GPR, hi));
+  int32_t ofs = sps_scale(ir->s);  /* Use spill slot or temp slots. */
+  Reg dest = ir->r;
+  if (ra_hasreg(dest)) {
+    ra_free(as, dest);
+    ra_modified(as, dest);
+    emit_rmro(as, irt_isnum(ir->t) ? XMM_MOVRM(as) : XO_MOVSS,
+              dest, RID_ESP, ofs);
+  }
+  emit_rmro(as, irt_isnum(ir->t) ? XO_FSTPq : XO_FSTPd,
+            irt_isnum(ir->t) ? XOg_FSTPq : XOg_FSTPd, RID_ESP, ofs);
+  if (((ir-1)->op2 & IRCONV_SRCMASK) == IRT_U64) {
+    /* For inputs in [2^63,2^64-1] add 2^64 to compensate. */
+    MCLabel l_end = emit_label(as);
+    emit_rma(as, XO_FADDq, XOg_FADDq,
+             lj_ir_k64_find(as->J, U64x(43f00000,00000000)));
+    emit_sjcc(as, CC_NS, l_end);
+    emit_rr(as, XO_TEST, hi, hi);  /* Check if u64 >= 2^63. */
+  } else {
+    lua_assert(((ir-1)->op2 & IRCONV_SRCMASK) == IRT_I64);
+  }
+  emit_rmro(as, XO_FILDq, XOg_FILDq, RID_ESP, 0);
+  /* NYI: Avoid narrow-to-wide store-to-load forwarding stall. */
+  emit_rmro(as, XO_MOVto, hi, RID_ESP, 4);
+  emit_rmro(as, XO_MOVto, lo, RID_ESP, 0);
+}
+
+/* FP to 64 bit integer conversion in 32 bit mode. */
+static void asm_conv_int64_fp(ASMState *as, IRIns *ir)
+{
+  IRType st = (IRType)((ir-1)->op2 & IRCONV_SRCMASK);
+  IRType dt = (((ir-1)->op2 & IRCONV_DSTMASK) >> IRCONV_DSH);
+  Reg lo, hi;
+  lua_assert(st == IRT_NUM || st == IRT_FLOAT);
+  lua_assert(dt == IRT_I64 || dt == IRT_U64);
+  lua_assert(((ir-1)->op2 & IRCONV_TRUNC));
+  hi = ra_dest(as, ir, RSET_GPR);
+  lo = ra_dest(as, ir-1, rset_exclude(RSET_GPR, hi));
+  if (ra_used(ir-1)) emit_rmro(as, XO_MOV, lo, RID_ESP, 0);
+  /* NYI: Avoid wide-to-narrow store-to-load forwarding stall. */
+  if (!(as->flags & JIT_F_SSE3)) {  /* Set FPU rounding mode to default. */
+    emit_rmro(as, XO_FLDCW, XOg_FLDCW, RID_ESP, 4);
+    emit_rmro(as, XO_MOVto, lo, RID_ESP, 4);
+    emit_gri(as, XG_ARITHi(XOg_AND), lo, 0xf3ff);
+  }
+  if (dt == IRT_U64) {
+    /* For inputs in [2^63,2^64-1] add -2^64 and convert again. */
+    MCLabel l_pop, l_end = emit_label(as);
+    emit_x87op(as, XI_FPOP);
+    l_pop = emit_label(as);
+    emit_sjmp(as, l_end);
+    emit_rmro(as, XO_MOV, hi, RID_ESP, 4);
+    if ((as->flags & JIT_F_SSE3))
+      emit_rmro(as, XO_FISTTPq, XOg_FISTTPq, RID_ESP, 0);
+    else
+      emit_rmro(as, XO_FISTPq, XOg_FISTPq, RID_ESP, 0);
+    emit_rma(as, XO_FADDq, XOg_FADDq,
+             lj_ir_k64_find(as->J, U64x(c3f00000,00000000)));
+    emit_sjcc(as, CC_NS, l_pop);
+    emit_rr(as, XO_TEST, hi, hi);  /* Check if out-of-range (2^63). */
+  }
+  emit_rmro(as, XO_MOV, hi, RID_ESP, 4);
+  if ((as->flags & JIT_F_SSE3)) {  /* Truncation is easy with SSE3. */
+    emit_rmro(as, XO_FISTTPq, XOg_FISTTPq, RID_ESP, 0);
+  } else {  /* Otherwise set FPU rounding mode to truncate before the store. */
+    emit_rmro(as, XO_FISTPq, XOg_FISTPq, RID_ESP, 0);
+    emit_rmro(as, XO_FLDCW, XOg_FLDCW, RID_ESP, 0);
+    emit_rmro(as, XO_MOVtow, lo, RID_ESP, 0);
+    emit_rmro(as, XO_ARITHw(XOg_OR), lo, RID_ESP, 0);
+    emit_loadi(as, lo, 0xc00);
+    emit_rmro(as, XO_FNSTCW, XOg_FNSTCW, RID_ESP, 0);
+  }
+  if (dt == IRT_U64)
+    emit_x87op(as, XI_FDUP);
+  emit_mrm(as, st == IRT_NUM ? XO_FLDq : XO_FLDd,
+           st == IRT_NUM ? XOg_FLDq: XOg_FLDd,
+           asm_fuseload(as, ir->op1, RSET_EMPTY));
+}
+#endif
+
 static void asm_strto(ASMState *as, IRIns *ir)
 {
   /* Force a spill slot for the destination register (if any). */
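SSE2 on x86 has no 64 bit integer converts (the 64 bit forms need REX.W), so these two helpers spill the value to the stack and go through x87: FILDq loads the 64 bit integer (always as signed, so unsigned inputs at or above 2^63 get 2^64 added back with FADDq afterwards), and FISTPq stores the 64 bit result, with FISTTPq used instead when SSE3 is available because it always truncates. Without SSE3 the control word is read with FNSTCW, ORed with 0xc00 to force round-toward-zero for the store, and afterwards the rounding field is cleared back to the default round-to-nearest (the AND with 0xf3ff). A scalar C sketch of the signed-load compensation on the int64-to-FP side (illustrative, not LuaJIT code):

    #include <stdint.h>

    /* u64 -> double: FILDq sees a signed value, so add 2^64 back if negative. */
    static double u64_to_double(uint64_t u)
    {
      double d = (double)(int64_t)u;   /* What the signed 64 bit load yields. */
      if ((int64_t)u < 0)              /* Matches the TEST hi,hi / CC_NS skip. */
        d += 18446744073709551616.0;   /* FADDq with 2^64 (0x43f00000,00000000). */
      return d;
    }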
@@ -2644,6 +2762,18 @@ static void asm_powi(ASMState *as, IRIns *ir)
   ra_left(as, RID_EAX, ir->op2);
 }
 
+#if LJ_64 && LJ_HASFFI
+static void asm_arith64(ASMState *as, IRIns *ir, IRCallID id)
+{
+  const CCallInfo *ci = &lj_ir_callinfo[id];
+  IRRef args[2];
+  args[0] = ir->op1;
+  args[1] = ir->op2;
+  asm_setupresult(as, ir, ci);
+  asm_gencall(as, ci, args);
+}
+#endif
+
 /* Find out whether swapping operands might be beneficial. */
 static int swapops(ASMState *as, IRIns *ir)
 {
@@ -2877,12 +3007,30 @@ static void asm_bitshift(ASMState *as, IRIns *ir, x86Shift xs)
 /* -- Comparisons --------------------------------------------------------- */
 
 /* Virtual flags for unordered FP comparisons. */
-#define VCC_U   0x100   /* Unordered. */
-#define VCC_P   0x200   /* Needs extra CC_P branch. */
-#define VCC_S   0x400   /* Swap avoids CC_P branch. */
+#define VCC_U   0x1000  /* Unordered. */
+#define VCC_P   0x2000  /* Needs extra CC_P branch. */
+#define VCC_S   0x4000  /* Swap avoids CC_P branch. */
 #define VCC_PS  (VCC_P|VCC_S)
 
-static void asm_comp_(ASMState *as, IRIns *ir, int cc)
+/* Map of comparisons to flags. ORDER IR. */
+#define COMPFLAGS(ci, cin, cu, cf)  ((ci)+((cu)<<4)+((cin)<<8)+(cf))
+static const uint16_t asm_compmap[IR_ABC+1] = {
+  /*              signed non-eq unsigned flags */
+  /* LT  */ COMPFLAGS(CC_GE, CC_G,  CC_AE, VCC_PS),
+  /* GE  */ COMPFLAGS(CC_L,  CC_L,  CC_B,  0),
+  /* LE  */ COMPFLAGS(CC_G,  CC_G,  CC_A,  VCC_PS),
+  /* GT  */ COMPFLAGS(CC_LE, CC_L,  CC_BE, 0),
+  /* ULT */ COMPFLAGS(CC_AE, CC_A,  CC_AE, VCC_U),
+  /* UGE */ COMPFLAGS(CC_B,  CC_B,  CC_B,  VCC_U|VCC_PS),
+  /* ULE */ COMPFLAGS(CC_A,  CC_A,  CC_A,  VCC_U),
+  /* UGT */ COMPFLAGS(CC_BE, CC_B,  CC_BE, VCC_U|VCC_PS),
+  /* EQ  */ COMPFLAGS(CC_NE, CC_NE, CC_NE, VCC_P),
+  /* NE  */ COMPFLAGS(CC_E,  CC_E,  CC_E,  VCC_U|VCC_P),
+  /* ABC */ COMPFLAGS(CC_BE, CC_B,  CC_BE, VCC_U|VCC_PS)  /* Same as UGT. */
+};
+
+/* FP and integer comparisons. */
+static void asm_comp(ASMState *as, IRIns *ir, uint32_t cc)
 {
   if (irt_isnum(ir->t)) {
     IRRef lref = ir->op1;
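The comparison dispatch is now table driven: per COMPFLAGS, each entry packs the (negated) signed/FP guard condition in bits 0-3, the unsigned condition in bits 4-7, the hiword condition without its equality part in bits 8-11, and the VCC_* flags above that, which is why VCC_U/VCC_P/VCC_S moved from 0x100/0x200/0x400 up to 0x1000/0x2000/0x4000. For example, the IR_LT entry COMPFLAGS(CC_GE, CC_G, CC_AE, VCC_PS) reads: guard with jge for signed operands, jae for the unsigned/loword compare, jg for the hiword of a split 64 bit compare, plus the swap-avoids-parity handling for FP. A sketch of how such an entry unpacks (illustrative macros matching the COMPFLAGS layout, not identifiers from the source):

    /* Unpack a packed asm_compmap entry (field layout per COMPFLAGS above). */
    #define CMAP_SIGNED(cc)    ((cc) & 15)         /* IR_LT -> CC_GE  */
    #define CMAP_UNSIGNED(cc)  (((cc) >> 4) & 15)  /* IR_LT -> CC_AE  */
    #define CMAP_NONEQ(cc)     (((cc) >> 8) & 15)  /* IR_LT -> CC_G   */
    #define CMAP_VCCFLAGS(cc)  ((cc) & 0xf000)     /* IR_LT -> VCC_PS */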
@@ -3008,15 +3156,7 @@ static void asm_comp_(ASMState *as, IRIns *ir, int cc)
         if (irl+1 == ir)  /* Referencing previous ins? */
           as->testmcp = as->mcp;  /* Set flag to drop test r,r if possible. */
       } else {
-        x86Op xo;
-        if (checki8(imm)) {
-          emit_i8(as, imm);
-          xo = XO_ARITHi8;
-        } else {
-          emit_i32(as, imm);
-          xo = XO_ARITHi;
-        }
-        emit_mrm(as, xo, r64 + XOg_CMP, left);
+        emit_gmrmi(as, XG_ARITHi(XOg_CMP), r64 + left, imm);
       }
     }
   } else {
@@ -3028,8 +3168,133 @@ static void asm_comp_(ASMState *as, IRIns *ir, int cc)
     }
   }
 }
 
-#define asm_comp(as, ir, ci, cf, cu) \
-  asm_comp_(as, ir, (ci)+((cf)<<4)+(cu))
+#if LJ_32 && LJ_HASFFI
+/* 64 bit integer comparisons in 32 bit mode. */
+static void asm_comp_int64(ASMState *as, IRIns *ir)
+{
+  uint32_t cc = asm_compmap[(ir-1)->o];
+  RegSet allow = RSET_GPR;
+  Reg lefthi = RID_NONE, leftlo = RID_NONE;
+  Reg righthi = RID_NONE, rightlo = RID_NONE;
+  MCLabel l_around;
+  x86ModRM mrm;
+
+  as->curins--;  /* Skip loword ins. Avoids failing in noconflict(), too. */
+
+  /* Allocate/fuse hiword operands. */
+  if (irref_isk(ir->op2)) {
+    lefthi = asm_fuseload(as, ir->op1, allow);
+  } else {
+    lefthi = ra_alloc1(as, ir->op1, allow);
+    righthi = asm_fuseload(as, ir->op2, allow);
+    if (righthi == RID_MRM) {
+      if (as->mrm.base != RID_NONE) rset_clear(allow, as->mrm.base);
+      if (as->mrm.idx != RID_NONE) rset_clear(allow, as->mrm.idx);
+    } else {
+      rset_clear(allow, righthi);
+    }
+  }
+  mrm = as->mrm;  /* Save state for hiword instruction. */
+
+  /* Allocate/fuse loword operands. */
+  if (irref_isk((ir-1)->op2)) {
+    leftlo = asm_fuseload(as, (ir-1)->op1, allow);
+  } else {
+    leftlo = ra_alloc1(as, (ir-1)->op1, allow);
+    rightlo = asm_fuseload(as, (ir-1)->op2, allow);
+    if (rightlo == RID_MRM) {
+      if (as->mrm.base != RID_NONE) rset_clear(allow, as->mrm.base);
+      if (as->mrm.idx != RID_NONE) rset_clear(allow, as->mrm.idx);
+    } else {
+      rset_clear(allow, rightlo);
+    }
+  }
+
+  /* All register allocations must be performed _before_ this point. */
+  l_around = emit_label(as);
+  as->invmcp = as->testmcp = NULL;  /* Cannot use these optimizations. */
+
+  /* Loword comparison and branch. */
+  asm_guardcc(as, cc >> 4);  /* Always use unsigned compare for loword. */
+  if (ra_noreg(rightlo)) {
+    int32_t imm = IR((ir-1)->op2)->i;
+    if (imm == 0 && ((cc >> 4) & 0xa) != 0x2 && leftlo != RID_MRM)
+      emit_rr(as, XO_TEST, leftlo, leftlo);
+    else
+      emit_gmrmi(as, XG_ARITHi(XOg_CMP), leftlo, imm);
+  } else {
+    emit_mrm(as, XO_CMP, leftlo, rightlo);
+  }
+
+  /* Hiword comparison and branches. */
+  if ((cc & 15) != CC_NE)
+    emit_sjcc(as, CC_NE, l_around);  /* Hiword unequal: skip loword compare. */
+  if ((cc & 15) != CC_E)
+    asm_guardcc(as, cc >> 8);  /* Hiword compare without equality check. */
+  as->mrm = mrm;  /* Restore state. */
+  if (ra_noreg(righthi)) {
+    int32_t imm = IR(ir->op2)->i;
+    if (imm == 0 && (cc & 0xa) != 0x2 && lefthi != RID_MRM)
+      emit_rr(as, XO_TEST, lefthi, lefthi);
+    else
+      emit_gmrmi(as, XG_ARITHi(XOg_CMP), lefthi, imm);
+  } else {
+    emit_mrm(as, XO_CMP, lefthi, righthi);
+  }
+}
+#endif
+
+/* -- Support for 64 bit ops in 32 bit mode ------------------------------- */
+
+/* Hiword op of a split 64 bit op. Previous op must be the loword op. */
+static void asm_hiop(ASMState *as, IRIns *ir)
+{
+#if LJ_32 && LJ_HASFFI
+  /* HIOP is marked as a store because it needs its own DCE logic. */
+  int uselo = ra_used(ir-1), usehi = ra_used(ir);  /* Loword/hiword used? */
+  if (LJ_UNLIKELY(!(as->flags & JIT_F_OPT_DCE))) uselo = usehi = 1;
+  if ((ir-1)->o == IR_CONV) {  /* Conversions to/from 64 bit. */
+    if (usehi || uselo) {
+      if (irt_isfp(ir->t))
+        asm_conv_fp_int64(as, ir);
+      else
+        asm_conv_int64_fp(as, ir);
+    }
+    as->curins--;  /* Always skip the CONV. */
+    return;
+  } else if ((ir-1)->o <= IR_NE) {  /* 64 bit integer comparisons. ORDER IR. */
+    asm_comp_int64(as, ir);
+    return;
+  }
+  if (!usehi) return;  /* Skip unused hiword op for all remaining ops. */
+  switch ((ir-1)->o) {
+  case IR_ADD:
+    asm_intarith(as, ir, uselo ? XOg_ADC : XOg_ADD);
+    break;
+  case IR_SUB:
+    asm_intarith(as, ir, uselo ? XOg_SBB : XOg_SUB);
+    break;
+  case IR_NEG: {
+    Reg dest = ra_dest(as, ir, RSET_GPR);
+    emit_rr(as, XO_GROUP3, XOg_NEG, dest);
+    if (uselo) {
+      emit_i8(as, 0);
+      emit_rr(as, XO_ARITHi8, XOg_ADC, dest);
+    }
+    ra_left(as, dest, ir->op1);
+    break;
+  }
+  case IR_CALLN:
+    ra_destreg(as, ir, RID_RETHI);
+    if (!uselo)
+      ra_allocref(as, ir->op1, RID2RSET(RID_RET));  /* Mark call as used. */
+    break;
+  default: lua_assert(0); break;
+  }
+#else
+  UNUSED(as); UNUSED(ir); lua_assert(0);  /* Unused on x64 or without FFI. */
+#endif
+}
 
 /* -- Stack handling ------------------------------------------------------ */
 
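asm_comp_int64 splits a 64 bit compare across the loword/HIOP pair: the hiword halves are compared first with the condition minus its equality part, a short jump skips the rest if the hiwords already differ, and only then are the loword halves compared, always unsigned. asm_hiop supplies the hiword half of the remaining split ops: ADC/SBB for 64 bit ADD/SUB, the adc-then-neg sequence for a 64 bit NEG, and the RID_RETHI register for 64 bit C call results. A scalar C sketch of the comparison order (illustrative only):

    #include <stdint.h>

    /* Split 64 bit signed '<': hiwords decide unless equal, lowords unsigned. */
    static int lt64(int32_t ahi, uint32_t alo, int32_t bhi, uint32_t blo)
    {
      if (ahi != bhi) return ahi < bhi;  /* Hiword compare, signed. */
      return alo < blo;                  /* Loword compare, always unsigned. */
    }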
@@ -3682,21 +3947,16 @@ static void asm_ir(ASMState *as, IRIns *ir)
   switch ((IROp)ir->o) {
   /* Miscellaneous ops. */
   case IR_LOOP: asm_loop(as); break;
-  case IR_NOP: break;
+  case IR_NOP: lua_assert(!ra_used(ir)); break;
   case IR_PHI: asm_phi(as, ir); break;
+  case IR_HIOP: asm_hiop(as, ir); break;
 
   /* Guarded assertions. */
-  case IR_LT: asm_comp(as, ir, CC_GE, CC_AE, VCC_PS); break;
-  case IR_GE: asm_comp(as, ir, CC_L, CC_B, 0); break;
-  case IR_LE: asm_comp(as, ir, CC_G, CC_A, VCC_PS); break;
-  case IR_GT: asm_comp(as, ir, CC_LE, CC_BE, 0); break;
-  case IR_ULT: asm_comp(as, ir, CC_AE, CC_AE, VCC_U); break;
-  case IR_UGE: asm_comp(as, ir, CC_B, CC_B, VCC_U|VCC_PS); break;
-  case IR_ULE: asm_comp(as, ir, CC_A, CC_A, VCC_U); break;
-  case IR_ABC:
-  case IR_UGT: asm_comp(as, ir, CC_BE, CC_BE, VCC_U|VCC_PS); break;
-  case IR_EQ: asm_comp(as, ir, CC_NE, CC_NE, VCC_P); break;
-  case IR_NE: asm_comp(as, ir, CC_E, CC_E, VCC_U|VCC_P); break;
+  case IR_LT: case IR_GE: case IR_LE: case IR_GT:
+  case IR_ULT: case IR_UGE: case IR_ULE: case IR_UGT:
+  case IR_EQ: case IR_NE: case IR_ABC:
+    asm_comp(as, ir, asm_compmap[ir->o]);
+    break;
 
   case IR_RETF: asm_retf(as, ir); break;
 
@@ -3744,7 +4004,15 @@ static void asm_ir(ASMState *as, IRIns *ir)
   case IR_FPMATH: case IR_ATAN2: case IR_LDEXP:
     asm_fpmath(as, ir);
     break;
-  case IR_POWI: asm_powi(as, ir); break;
+  case IR_POWI:
+#if LJ_64 && LJ_HASFFI
+    if (!irt_isnum(ir->t))
+      asm_arith64(as, ir, irt_isi64(ir->t) ? IRCALL_lj_carith_powi64 :
+                                             IRCALL_lj_carith_powu64);
+    else
+#endif
+    asm_powi(as, ir);
+    break;
 
   /* Overflow-checking arithmetic ops. Note: don't use LEA here! */
   case IR_ADDOV: asm_intarith(as, ir, XOg_ADD); break;
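With FFI enabled on x64, IR_POWI can now also carry 64 bit integer operands; those are not inlined but routed through the new asm_arith64 helper to the lj_carith_powi64/lj_carith_powu64 C functions, while the numeric case keeps using the existing asm_powi path. The register-setup change further down, which hints REGSP_HINT(RID_RET) instead of RID_XMM0 for the non-number case, is the matching allocation change.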
@@ -3801,6 +4069,7 @@ static void asm_trace(ASMState *as)
 {
   for (as->curins--; as->curins > as->stopins; as->curins--) {
     IRIns *ir = IR(as->curins);
+    lua_assert(!(LJ_32 && irt_isint64(ir->t)));  /* Handled by SPLIT. */
     if (!ra_used(ir) && !ir_sideeff(ir) && (as->flags & JIT_F_OPT_DCE))
       continue;  /* Dead-code elimination can be soooo easy. */
     if (irt_isguard(ir->t))
@@ -3864,11 +4133,10 @@ static void asm_setup_regsp(ASMState *as, GCtrace *T)
     case IR_CALLN: case IR_CALLL: case IR_CALLS: {
       const CCallInfo *ci = &lj_ir_callinfo[ir->op2];
 #if LJ_64
-      /* NYI: add stack slots for x64 calls with many args. */
       lua_assert(CCI_NARGS(ci) <= (LJ_ABI_WIN ? 4 : 6));
       ir->prev = REGSP_HINT(irt_isnum(ir->t) ? RID_FPRET : RID_RET);
 #else
-      /* NYI: not fastcall-aware, but doesn't matter (yet). */
+      lua_assert(!(ci->flags & CCI_FASTCALL) || CCI_NARGS(ci) <= 2);
       if (CCI_NARGS(ci) > (uint32_t)as->evenspill)  /* Leave room for args. */
         as->evenspill = (int32_t)CCI_NARGS(ci);
       ir->prev = REGSP_HINT(RID_RET);
@@ -3878,6 +4146,12 @@ static void asm_setup_regsp(ASMState *as, GCtrace *T)
                       (RSET_SCRATCH & ~RSET_FPR) : RSET_SCRATCH;
       continue;
       }
+#if LJ_32 && LJ_HASFFI
+    case IR_HIOP:
+      if ((ir-1)->o == IR_CALLN)
+        ir->prev = REGSP_HINT(RID_RETHI);
+      break;
+#endif
     /* C calls evict all scratch regs and return results in RID_RET. */
     case IR_SNEW: case IR_NEWREF:
 #if !LJ_64
@@ -3894,6 +4168,14 @@ static void asm_setup_regsp(ASMState *as, GCtrace *T)
         as->modset = RSET_SCRATCH;
       break;
     case IR_POWI:
+#if LJ_64 && LJ_HASFFI
+      if (!irt_isnum(ir->t)) {
+        ir->prev = REGSP_HINT(RID_RET);
+        if (inloop)
+          as->modset |= (RSET_SCRATCH & RSET_GPR);
+        continue;
+      }
+#endif
       ir->prev = REGSP_HINT(RID_XMM0);
       if (inloop)
         as->modset |= RSET_RANGE(RID_XMM0, RID_XMM1+1)|RID2RSET(RID_EAX);