-rw-r--r--  src/Makefile        |   2
-rw-r--r--  src/Makefile.dep    |  15
-rw-r--r--  src/lj_asm.c        | 444
-rw-r--r--  src/lj_carith.c     |   8
-rw-r--r--  src/lj_carith.h     |   3
-rw-r--r--  src/lj_crecord.c    |  20
-rw-r--r--  src/lj_ir.h         |  21
-rw-r--r--  src/lj_iropt.h      |   6
-rw-r--r--  src/lj_jit.h        |  12
-rw-r--r--  src/lj_opt_fold.c   |  25
-rw-r--r--  src/lj_opt_split.c  | 343
-rw-r--r--  src/lj_target_x86.h |   9
-rw-r--r--  src/lj_trace.c      |   2
-rw-r--r--  src/ljamalg.c       |   1
14 files changed, 795 insertions(+), 116 deletions(-)
diff --git a/src/Makefile b/src/Makefile
index a2be1a18..0150b049 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -331,7 +331,7 @@ LJCORE_O= lj_gc.o lj_err.o lj_char.o lj_bc.o lj_obj.o \
 	lj_state.o lj_dispatch.o lj_vmevent.o lj_api.o \
 	lj_lex.o lj_parse.o \
 	lj_ir.o lj_opt_mem.o lj_opt_fold.o lj_opt_narrow.o \
-	lj_opt_dce.o lj_opt_loop.o \
+	lj_opt_dce.o lj_opt_loop.o lj_opt_split.o \
 	lj_mcode.o lj_snap.o lj_record.o lj_crecord.o lj_ffrecord.o \
 	lj_asm.o lj_trace.o lj_gdbjit.o \
 	lj_ctype.o lj_cdata.o lj_cconv.o lj_ccall.o lj_carith.o lj_clib.o \
diff --git a/src/Makefile.dep b/src/Makefile.dep
index 3d0c4239..1534ac27 100644
--- a/src/Makefile.dep
+++ b/src/Makefile.dep
@@ -128,6 +128,8 @@ lj_opt_mem.o: lj_opt_mem.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \
 lj_opt_narrow.o: lj_opt_narrow.c lj_obj.h lua.h luaconf.h lj_def.h \
  lj_arch.h lj_str.h lj_bc.h lj_ir.h lj_jit.h lj_iropt.h lj_trace.h \
  lj_dispatch.h lj_traceerr.h
+lj_opt_split.o: lj_opt_split.c lj_obj.h lua.h luaconf.h lj_def.h \
+ lj_arch.h
 lj_parse.o: lj_parse.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \
  lj_gc.h lj_err.h lj_errmsg.h lj_str.h lj_tab.h lj_func.h lj_state.h \
  lj_bc.h lj_ctype.h lj_lex.h lj_parse.h lj_vm.h lj_vmevent.h
@@ -167,10 +169,11 @@ ljamalg.o: ljamalg.c lua.h luaconf.h lauxlib.h lj_gc.c lj_obj.h lj_def.h \
  lj_cconv.h lj_cconv.c lj_ccall.c lj_ccall.h lj_carith.c lj_carith.h \
  lj_clib.c lj_clib.h lj_cparse.c lj_cparse.h lj_lib.c lj_lib.h lj_ir.c \
  lj_iropt.h lj_opt_mem.c lj_opt_fold.c lj_folddef.h lj_opt_narrow.c \
- lj_opt_dce.c lj_opt_loop.c lj_snap.h lj_mcode.c lj_mcode.h lj_snap.c \
- lj_target.h lj_target_*.h lj_record.c lj_record.h lj_ffrecord.h \
- lj_crecord.c lj_crecord.h lj_ffrecord.c lj_recdef.h lj_asm.c lj_asm.h \
- lj_trace.c lj_gdbjit.h lj_gdbjit.c lj_alloc.c lib_aux.c lib_base.c \
- lj_libdef.h lib_math.c lib_string.c lib_table.c lib_io.c lib_os.c \
- lib_package.c lib_debug.c lib_bit.c lib_jit.c lib_ffi.c lib_init.c
+ lj_opt_dce.c lj_opt_loop.c lj_snap.h lj_opt_split.c lj_mcode.c \
+ lj_mcode.h lj_snap.c lj_target.h lj_target_*.h lj_record.c lj_record.h \
+ lj_ffrecord.h lj_crecord.c lj_crecord.h lj_ffrecord.c lj_recdef.h \
+ lj_asm.c lj_asm.h lj_trace.c lj_gdbjit.h lj_gdbjit.c lj_alloc.c \
+ lib_aux.c lib_base.c lj_libdef.h lib_math.c lib_string.c lib_table.c \
+ lib_io.c lib_os.c lib_package.c lib_debug.c lib_bit.c lib_jit.c \
+ lib_ffi.c lib_init.c
 luajit.o: luajit.c lua.h luaconf.h lauxlib.h lualib.h luajit.h lj_arch.h
diff --git a/src/lj_asm.c b/src/lj_asm.c
index cc2ae597..441700d4 100644
--- a/src/lj_asm.c
+++ b/src/lj_asm.c
@@ -347,6 +347,20 @@ static void emit_addptr(ASMState *as, Reg r, int32_t ofs)
   }
 }
 
+/* op rm/mrm, i */
+static void emit_gmrmi(ASMState *as, x86Group xg, Reg rb, int32_t i)
+{
+  x86Op xo;
+  if (checki8(i)) {
+    emit_i8(as, i);
+    xo = XG_TOXOi8(xg);
+  } else {
+    emit_i32(as, i);
+    xo = XG_TOXOi(xg);
+  }
+  emit_mrm(as, xo, (Reg)(xg & 7) | (rb & REX_64), (rb & ~REX_64));
+}
+
 /* -- Emit moves ---------------------------------------------------------- */
 
 /* mov [base+ofs], i */
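The new emit_gmrmi helper folds the choice between the two immediate
encodings of the x86 group-1 ALU instructions into one place. A standalone
sketch of the same decision (illustrative only, not part of the patch;
0x83/0x81 are the standard grp1 opcodes):

#include <stdint.h>

/* The short form sign-extends an 8 bit immediate, so it is valid
** exactly when the immediate round-trips through int8_t. */
static int fits_i8(int32_t i) { return i == (int32_t)(int8_t)i; }

static uint8_t grp1_imm_opcode(int32_t imm)
{
  return fits_i8(imm) ? 0x83 : 0x81;  /* grp1 r/m32,imm8 vs. r/m32,imm32 */
}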
@@ -371,7 +385,10 @@ static void emit_movmroi(ASMState *as, Reg base, int32_t ofs, int32_t i)
 /* mov r, i / xor r, r */
 static void emit_loadi(ASMState *as, Reg r, int32_t i)
 {
-  if (i == 0) {
+  /* XOR r,r is shorter, but modifies the flags. This is bad for HIOP. */
+  if (i == 0 && !(LJ_32 && (IR(as->curins)->o == IR_HIOP ||
+			    (as->curins+1 < as->T->nins &&
+			     IR(as->curins+1)->o == IR_HIOP)))) {
     emit_rr(as, XO_ARITH(XOg_XOR), r, r);
   } else {
     MCode *p = as->mcp;
@@ -422,6 +439,19 @@ static void emit_loadn(ASMState *as, Reg r, cTValue *tv)
 /* Label for short jumps. */
 typedef MCode *MCLabel;
 
+#if LJ_32 && LJ_HASFFI
+/* jmp short target */
+static void emit_sjmp(ASMState *as, MCLabel target)
+{
+  MCode *p = as->mcp;
+  ptrdiff_t delta = target - p;
+  lua_assert(delta == (int8_t)delta);
+  p[-1] = (MCode)(int8_t)delta;
+  p[-2] = XI_JMPs;
+  as->mcp = p - 2;
+}
+#endif
+
 /* jcc short target */
 static void emit_sjcc(ASMState *as, int cc, MCLabel target)
 {
@@ -630,7 +660,7 @@ static Reg ra_rematk(ASMState *as, IRIns *ir)
   } else if (ir->o == IR_KPRI) {  /* REF_NIL stores ASMREF_L register. */
     lua_assert(irt_isnil(ir->t));
     emit_getgl(as, r, jit_L);
-#if LJ_64  /* NYI: 32 bit register pairs. */
+#if LJ_64
   } else if (ir->o == IR_KINT64) {
     emit_loadu64(as, r, ir_kint64(ir)->u64);
 #endif
@@ -681,8 +711,7 @@ static Reg ra_releasetmp(ASMState *as, IRRef ref)
 #if LJ_64
 #define REX_64IR(ir, r)	((r) + (irt_is64((ir)->t) ? REX_64 : 0))
 #else
-/* NYI: 32 bit register pairs. */
-#define REX_64IR(ir, r)	check_exp(!irt_is64((ir)->t), (r))
+#define REX_64IR(ir, r)	(r)
 #endif
 
 /* Generic move between two regs. */
@@ -939,7 +968,7 @@ static void ra_left(ASMState *as, Reg dest, IRRef lref)
       emit_loadn(as, dest, tv);
       return;
     }
-#if LJ_64  /* NYI: 32 bit register pairs. */
+#if LJ_64
   } else if (ir->o == IR_KINT64) {
     emit_loadu64(as, dest, ir_kint64(ir)->u64);
     return;
@@ -1463,7 +1492,7 @@ static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args)
 #endif
     if (r) {  /* Argument is in a register. */
       if (r < RID_MAX_GPR && ref < ASMREF_TMP1) {
-#if LJ_64  /* NYI: 32 bit register pairs. */
+#if LJ_64
 	if (ir->o == IR_KINT64)
 	  emit_loadu64(as, r, ir_kint64(ir)->u64);
 	else
@@ -1519,7 +1548,7 @@ static void asm_setupresult(ASMState *as, IRIns *ir, const CCallInfo *ci)
   ra_evictset(as, drop);  /* Evictions must be performed first. */
   if (ra_used(ir)) {
     if (irt_isfp(ir->t)) {
       int32_t ofs = sps_scale(ir->s);  /* Use spill slot or temp slots. */
 #if LJ_64
       if ((ci->flags & CCI_CASTU64)) {
 	Reg dest = ir->r;
@@ -1632,19 +1661,24 @@ static void asm_conv(ASMState *as, IRIns *ir)
   int stfp = (st == IRT_NUM || st == IRT_FLOAT);
   IRRef lref = ir->op1;
   lua_assert(irt_type(ir->t) != st);
+  lua_assert(!(LJ_32 && (irt_isint64(ir->t) || st64)));  /* Handled by SPLIT. */
   if (irt_isfp(ir->t)) {
     Reg dest = ra_dest(as, ir, RSET_FPR);
     if (stfp) {  /* FP to FP conversion. */
       Reg left = asm_fuseload(as, lref, RSET_FPR);
       emit_mrm(as, st == IRT_NUM ? XO_CVTSD2SS : XO_CVTSS2SD, dest, left);
       if (left == dest) return;  /* Avoid the XO_XORPS. */
-#if LJ_32
-    } else if (st >= IRT_U32) {
-      /* NYI: 64 bit integer or uint32_t to number conversion. */
-      setintV(&as->J->errinfo, ir->o);
-      lj_trace_err_info(as->J, LJ_TRERR_NYIIR);
+    } else if (LJ_32 && st == IRT_U32) {  /* U32 to FP conversion on x86. */
+      /* number = (2^52+2^51 .. u32) - (2^52+2^51) */
+      cTValue *k = lj_ir_k64_find(as->J, U64x(43380000,00000000));
+      Reg bias = ra_scratch(as, rset_exclude(RSET_FPR, dest));
+      if (irt_isfloat(ir->t))
+	emit_rr(as, XO_CVTSD2SS, dest, dest);
+      emit_rr(as, XO_SUBSD, dest, bias);  /* Subtract 2^52+2^51 bias. */
+      emit_rr(as, XO_XORPS, dest, bias);  /* Merge bias and integer. */
+      emit_loadn(as, bias, k);
+      emit_mrm(as, XO_MOVD, dest, asm_fuseload(as, lref, RSET_GPR));
       return;
-#endif
     } else {  /* Integer to FP conversion. */
       Reg left = (LJ_64 && (st == IRT_U32 || st == IRT_U64)) ?
 		 ra_alloc1(as, lref, RSET_GPR) :
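The constant U64x(43380000,00000000) is the double 2^52+2^51. Because a u32
is smaller than 2^51, ORing it into the mantissa of that bias produces
exactly the double 2^52+2^51+u, so a single subtraction recovers u as a
double. A standalone sketch of the trick (illustrative only, not part of
the patch):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

static double u32_to_double(uint32_t u)
{
  uint64_t bits = 0x4338000000000000ull | u;  /* Merge bias and integer. */
  double d;
  memcpy(&d, &bits, sizeof(d));
  return d - 6755399441055744.0;  /* Subtract the 2^52+2^51 bias. */
}

int main(void)
{
  printf("%.1f\n", u32_to_double(0xffffffffu));  /* 4294967295.0 */
  return 0;
}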
@@ -1663,41 +1697,47 @@ static void asm_conv(ASMState *as, IRIns *ir)
       emit_rr(as, XO_XORPS, dest, dest);  /* Avoid partial register stall. */
   } else if (stfp) {  /* FP to integer conversion. */
     if (irt_isguard(ir->t)) {
-      lua_assert(!irt_is64(ir->t));  /* No support for checked 64 bit conv. */
+      /* Checked conversions are only supported from number to int. */
+      lua_assert(irt_isint(ir->t) && st == IRT_NUM);
       asm_tointg(as, ir, ra_alloc1(as, lref, RSET_FPR));
-#if LJ_32
-    } else if (irt_isi64(ir->t) || irt_isu64(ir->t) || irt_isu32(ir->t)) {
-      /* NYI: number to 64 bit integer or uint32_t conversion. */
-      setintV(&as->J->errinfo, ir->o);
-      lj_trace_err_info(as->J, LJ_TRERR_NYIIR);
-#endif
     } else {
       Reg dest = ra_dest(as, ir, RSET_GPR);
       x86Op op = st == IRT_NUM ?
 		 ((ir->op2 & IRCONV_TRUNC) ? XO_CVTTSD2SI : XO_CVTSD2SI) :
 		 ((ir->op2 & IRCONV_TRUNC) ? XO_CVTTSS2SI : XO_CVTSS2SI);
-      if (LJ_64 && irt_isu64(ir->t)) {
-	const void *k = lj_ir_k64_find(as->J, U64x(c3f00000,00000000));
-	MCLabel l_end = emit_label(as);
-	Reg left = IR(lref)->r;
+      if (LJ_32 && irt_isu32(ir->t)) {  /* FP to U32 conversion on x86. */
+	/* u32 = (int32_t)(number - 2^31) + 2^31 */
+	Reg tmp = ra_noreg(IR(lref)->r) ? ra_alloc1(as, lref, RSET_FPR) :
+					  ra_scratch(as, RSET_FPR);
+	emit_gri(as, XG_ARITHi(XOg_ADD), dest, (int32_t)0x80000000);
+	emit_rr(as, op, dest, tmp);
+	if (st == IRT_NUM)
+	  emit_rma(as, XO_ADDSD, tmp,
+		   lj_ir_k64_find(as->J, U64x(c1e00000,00000000)));
+	else
+	  emit_rma(as, XO_ADDSS, tmp,
+		   lj_ir_k64_find(as->J, U64x(00000000,cf000000)));
+	ra_left(as, tmp, lref);
+      } else if (LJ_64 && irt_isu64(ir->t)) {
 	/* For inputs in [2^63,2^64-1] add -2^64 and convert again. */
-	if (ra_hasreg(left)) {
-	  Reg tmpn = ra_scratch(as, rset_exclude(RSET_FPR, left));
-	  emit_rr(as, op, dest|REX_64, tmpn);
-	  emit_rr(as, st == IRT_NUM ? XO_ADDSD : XO_ADDSS, tmpn, left);
-	  emit_rma(as, st == IRT_NUM ? XMM_MOVRM(as) : XO_MOVSS, tmpn, k);
-	} else {
-	  left = ra_allocref(as, lref, RSET_FPR);
-	  emit_rr(as, op, dest|REX_64, left);
-	  emit_rma(as, st == IRT_NUM ? XO_ADDSD : XO_ADDSS, left, k);
-	}
+	Reg tmp = ra_noreg(IR(lref)->r) ? ra_alloc1(as, lref, RSET_FPR) :
+					  ra_scratch(as, RSET_FPR);
+	MCLabel l_end = emit_label(as);
+	emit_rr(as, op, dest|REX_64, tmp);
+	if (st == IRT_NUM)
+	  emit_rma(as, XO_ADDSD, tmp,
+		   lj_ir_k64_find(as->J, U64x(c3f00000,00000000)));
+	else
+	  emit_rma(as, XO_ADDSS, tmp,
+		   lj_ir_k64_find(as->J, U64x(00000000,df800000)));
 	emit_sjcc(as, CC_NS, l_end);
 	emit_rr(as, XO_TEST, dest|REX_64, dest);  /* Check if dest < 2^63. */
-	emit_rr(as, op, dest|REX_64, left);
+	emit_rr(as, op, dest|REX_64, tmp);
+	ra_left(as, tmp, lref);
       } else {
 	Reg left = asm_fuseload(as, lref, RSET_FPR);
 	if (LJ_64 && irt_isu32(ir->t))
-	  emit_rr(as, XO_MOV, dest, dest);  /* Zero upper 32 bits. */
+	  emit_rr(as, XO_MOV, dest, dest);  /* Zero hiword. */
 	emit_mrm(as, op,
 		 dest|((LJ_64 &&
 			(irt_is64(ir->t) || irt_isu32(ir->t))) ? REX_64 : 0),
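The emitted sequence computes u32 = (int32_t)(number - 2^31) + 2^31: the
constants U64x(c1e00000,00000000) and U64x(00000000,cf000000) are -2^31 as
a double and as a float, so the signed cvttsd2si/cvttss2si result lands in
int32 range and the integer add of 0x80000000 shifts it back. A standalone
sketch (illustrative only, not part of the patch):

#include <stdint.h>
#include <stdio.h>

static uint32_t double_to_u32(double n)
{
  return (uint32_t)(int32_t)(n - 2147483648.0) + 0x80000000u;
}

int main(void)
{
  printf("%u\n", double_to_u32(4000000000.0));  /* 4000000000 */
  return 0;
}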
@@ -1728,12 +1768,10 @@ static void asm_conv(ASMState *as, IRIns *ir)
       emit_mrm(as, op, dest, left);
     }
   } else {  /* 32/64 bit integer conversions. */
-    if (irt_is64(ir->t)) {
-#if LJ_32
-      /* NYI: conversion to 64 bit integers. */
-      setintV(&as->J->errinfo, ir->o);
-      lj_trace_err_info(as->J, LJ_TRERR_NYIIR);
-#else
+    if (LJ_32) {  /* Only need to handle 32/32 bit no-op (cast) on x86. */
+      Reg dest = ra_dest(as, ir, RSET_GPR);
+      ra_left(as, dest, lref);  /* Do nothing, but may need to move regs. */
+    } else if (irt_is64(ir->t)) {
       Reg dest = ra_dest(as, ir, RSET_GPR);
       if (st64 || !(ir->op2 & IRCONV_SEXT)) {
 	/* 64/64 bit no-op (cast) or 32 to 64 bit zero extension. */
@@ -1742,21 +1780,14 @@ static void asm_conv(ASMState *as, IRIns *ir)
 	Reg left = asm_fuseload(as, lref, RSET_GPR);
 	emit_mrm(as, XO_MOVSXd, dest|REX_64, left);
       }
-#endif
     } else {
       Reg dest = ra_dest(as, ir, RSET_GPR);
       if (st64) {
-#if LJ_32
-	/* NYI: conversion from 64 bit integers. */
-	setintV(&as->J->errinfo, ir->o);
-	lj_trace_err_info(as->J, LJ_TRERR_NYIIR);
-#else
 	Reg left = asm_fuseload(as, lref, RSET_GPR);
-	/* This is either a 32 bit reg/reg mov which zeroes the hi-32 bits
-	** or a load of the lower 32 bits from a 64 bit address.
+	/* This is either a 32 bit reg/reg mov which zeroes the hiword
+	** or a load of the loword from a 64 bit address.
 	*/
 	emit_mrm(as, XO_MOV, dest, left);
-#endif
       } else {  /* 32/32 bit no-op (cast). */
 	ra_left(as, dest, lref);  /* Do nothing, but may need to move regs. */
       }
@@ -1764,6 +1795,93 @@ static void asm_conv(ASMState *as, IRIns *ir)
   }
 }
 
+#if LJ_32 && LJ_HASFFI
+/* No SSE conversions to/from 64 bit on x86, so resort to ugly x87 code. */
+
+/* 64 bit integer to FP conversion in 32 bit mode. */
+static void asm_conv_fp_int64(ASMState *as, IRIns *ir)
+{
+  Reg hi = ra_alloc1(as, ir->op1, RSET_GPR);
+  Reg lo = ra_alloc1(as, (ir-1)->op1, rset_exclude(RSET_GPR, hi));
+  int32_t ofs = sps_scale(ir->s);  /* Use spill slot or temp slots. */
+  Reg dest = ir->r;
+  if (ra_hasreg(dest)) {
+    ra_free(as, dest);
+    ra_modified(as, dest);
+    emit_rmro(as, irt_isnum(ir->t) ? XMM_MOVRM(as) : XO_MOVSS,
+	      dest, RID_ESP, ofs);
+  }
+  emit_rmro(as, irt_isnum(ir->t) ? XO_FSTPq : XO_FSTPd,
+	    irt_isnum(ir->t) ? XOg_FSTPq : XOg_FSTPd, RID_ESP, ofs);
+  if (((ir-1)->op2 & IRCONV_SRCMASK) == IRT_U64) {
+    /* For inputs in [2^63,2^64-1] add 2^64 to compensate. */
+    MCLabel l_end = emit_label(as);
+    emit_rma(as, XO_FADDq, XOg_FADDq,
+	     lj_ir_k64_find(as->J, U64x(43f00000,00000000)));
+    emit_sjcc(as, CC_NS, l_end);
+    emit_rr(as, XO_TEST, hi, hi);  /* Check if u64 >= 2^63. */
+  } else {
+    lua_assert(((ir-1)->op2 & IRCONV_SRCMASK) == IRT_I64);
+  }
+  emit_rmro(as, XO_FILDq, XOg_FILDq, RID_ESP, 0);
+  /* NYI: Avoid narrow-to-wide store-to-load forwarding stall. */
+  emit_rmro(as, XO_MOVto, hi, RID_ESP, 4);
+  emit_rmro(as, XO_MOVto, lo, RID_ESP, 0);
+}
+
+/* FP to 64 bit integer conversion in 32 bit mode. */
+static void asm_conv_int64_fp(ASMState *as, IRIns *ir)
+{
+  IRType st = (IRType)((ir-1)->op2 & IRCONV_SRCMASK);
+  IRType dt = (((ir-1)->op2 & IRCONV_DSTMASK) >> IRCONV_DSH);
+  Reg lo, hi;
+  lua_assert(st == IRT_NUM || st == IRT_FLOAT);
+  lua_assert(dt == IRT_I64 || dt == IRT_U64);
+  lua_assert(((ir-1)->op2 & IRCONV_TRUNC));
+  hi = ra_dest(as, ir, RSET_GPR);
+  lo = ra_dest(as, ir-1, rset_exclude(RSET_GPR, hi));
+  if (ra_used(ir-1)) emit_rmro(as, XO_MOV, lo, RID_ESP, 0);
+  /* NYI: Avoid wide-to-narrow store-to-load forwarding stall. */
+  if (!(as->flags & JIT_F_SSE3)) {  /* Set FPU rounding mode to default. */
+    emit_rmro(as, XO_FLDCW, XOg_FLDCW, RID_ESP, 4);
+    emit_rmro(as, XO_MOVto, lo, RID_ESP, 4);
+    emit_gri(as, XG_ARITHi(XOg_AND), lo, 0xf3ff);
+  }
+  if (dt == IRT_U64) {
+    /* For inputs in [2^63,2^64-1] add -2^64 and convert again. */
+    MCLabel l_pop, l_end = emit_label(as);
+    emit_x87op(as, XI_FPOP);
+    l_pop = emit_label(as);
+    emit_sjmp(as, l_end);
+    emit_rmro(as, XO_MOV, hi, RID_ESP, 4);
+    if ((as->flags & JIT_F_SSE3))
+      emit_rmro(as, XO_FISTTPq, XOg_FISTTPq, RID_ESP, 0);
+    else
+      emit_rmro(as, XO_FISTPq, XOg_FISTPq, RID_ESP, 0);
+    emit_rma(as, XO_FADDq, XOg_FADDq,
+	     lj_ir_k64_find(as->J, U64x(c3f00000,00000000)));
+    emit_sjcc(as, CC_NS, l_pop);
+    emit_rr(as, XO_TEST, hi, hi);  /* Check if out-of-range (2^63). */
+  }
+  emit_rmro(as, XO_MOV, hi, RID_ESP, 4);
+  if ((as->flags & JIT_F_SSE3)) {  /* Truncation is easy with SSE3. */
+    emit_rmro(as, XO_FISTTPq, XOg_FISTTPq, RID_ESP, 0);
+  } else {  /* Otherwise set FPU rounding mode to truncate before the store. */
+    emit_rmro(as, XO_FISTPq, XOg_FISTPq, RID_ESP, 0);
+    emit_rmro(as, XO_FLDCW, XOg_FLDCW, RID_ESP, 0);
+    emit_rmro(as, XO_MOVtow, lo, RID_ESP, 0);
+    emit_rmro(as, XO_ARITHw(XOg_OR), lo, RID_ESP, 0);
+    emit_loadi(as, lo, 0xc00);
+    emit_rmro(as, XO_FNSTCW, XOg_FNSTCW, RID_ESP, 0);
+  }
+  if (dt == IRT_U64)
+    emit_x87op(as, XI_FDUP);
+  emit_mrm(as, st == IRT_NUM ? XO_FLDq : XO_FLDd,
+	   st == IRT_NUM ? XOg_FLDq: XOg_FLDd,
+	   asm_fuseload(as, ir->op1, RSET_EMPTY));
+}
+#endif
+
 static void asm_strto(ASMState *as, IRIns *ir)
 {
   /* Force a spill slot for the destination register (if any). */
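The TEST hi,hi / CC_NS pattern in both helpers exists because x87 FILD only
loads signed integers: a u64 with the top bit set is read 2^64 too small and
needs a compensating FADD of 2^64 (and the reverse compensation applies when
converting to u64). The same adjustment as a standalone C sketch
(illustrative only, not part of the patch):

#include <stdint.h>
#include <stdio.h>

static double u64_to_double(uint64_t u)
{
  double d = (double)(int64_t)u;    /* What FILDq computes. */
  if ((int64_t)u < 0)               /* Same test as TEST hi,hi. */
    d += 18446744073709551616.0;    /* Add 2^64 to compensate. */
  return d;
}

int main(void)
{
  printf("%.1f\n", u64_to_double(1ULL << 63));  /* 9223372036854775808.0 */
  return 0;
}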
@@ -2644,6 +2762,18 @@ static void asm_powi(ASMState *as, IRIns *ir)
   ra_left(as, RID_EAX, ir->op2);
 }
 
+#if LJ_64 && LJ_HASFFI
+static void asm_arith64(ASMState *as, IRIns *ir, IRCallID id)
+{
+  const CCallInfo *ci = &lj_ir_callinfo[id];
+  IRRef args[2];
+  args[0] = ir->op1;
+  args[1] = ir->op2;
+  asm_setupresult(as, ir, ci);
+  asm_gencall(as, ci, args);
+}
+#endif
+
 /* Find out whether swapping operands might be beneficial. */
 static int swapops(ASMState *as, IRIns *ir)
 {
@@ -2877,12 +3007,30 @@ static void asm_bitshift(ASMState *as, IRIns *ir, x86Shift xs)
 /* -- Comparisons --------------------------------------------------------- */
 
 /* Virtual flags for unordered FP comparisons. */
-#define VCC_U	0x100		/* Unordered. */
-#define VCC_P	0x200		/* Needs extra CC_P branch. */
-#define VCC_S	0x400		/* Swap avoids CC_P branch. */
+#define VCC_U	0x1000		/* Unordered. */
+#define VCC_P	0x2000		/* Needs extra CC_P branch. */
+#define VCC_S	0x4000		/* Swap avoids CC_P branch. */
 #define VCC_PS	(VCC_P|VCC_S)
 
-static void asm_comp_(ASMState *as, IRIns *ir, int cc)
+/* Map of comparisons to flags. ORDER IR. */
+#define COMPFLAGS(ci, cin, cu, cf)	((ci)+((cu)<<4)+((cin)<<8)+(cf))
+static const uint16_t asm_compmap[IR_ABC+1] = {
+  /*                 signed non-eq unsigned flags */
+  /* LT  */ COMPFLAGS(CC_GE, CC_G,  CC_AE, VCC_PS),
+  /* GE  */ COMPFLAGS(CC_L,  CC_L,  CC_B,  0),
+  /* LE  */ COMPFLAGS(CC_G,  CC_G,  CC_A,  VCC_PS),
+  /* GT  */ COMPFLAGS(CC_LE, CC_L,  CC_BE, 0),
+  /* ULT */ COMPFLAGS(CC_AE, CC_A,  CC_AE, VCC_U),
+  /* UGE */ COMPFLAGS(CC_B,  CC_B,  CC_B,  VCC_U|VCC_PS),
+  /* ULE */ COMPFLAGS(CC_A,  CC_A,  CC_A,  VCC_U),
+  /* UGT */ COMPFLAGS(CC_BE, CC_B,  CC_BE, VCC_U|VCC_PS),
+  /* EQ  */ COMPFLAGS(CC_NE, CC_NE, CC_NE, VCC_P),
+  /* NE  */ COMPFLAGS(CC_E,  CC_E,  CC_E,  VCC_U|VCC_P),
+  /* ABC */ COMPFLAGS(CC_BE, CC_B,  CC_BE, VCC_U|VCC_PS)  /* Same as UGT. */
+};
+
+/* FP and integer comparisons. */
+static void asm_comp(ASMState *as, IRIns *ir, uint32_t cc)
 {
   if (irt_isnum(ir->t)) {
     IRRef lref = ir->op1;
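Each asm_compmap entry packs four fields into 16 bits, which is why the
VCC_* flags had to move up to 0x1000/0x2000/0x4000: bits 0-3 hold the
negated condition for signed compares, bits 4-7 the negated unsigned
condition, bits 8-11 the hiword condition without the equality check, and
bits 12-14 the VCC_* flags. A sketch of how a consumer unpacks an entry
(illustrative only; asm_comp and asm_comp_int64 below do this inline with
shifts and masks):

uint32_t cc = asm_compmap[ir->o];
int cc_signed   = cc & 15;         /* Inverted cond for signed compares. */
int cc_unsigned = (cc >> 4) & 15;  /* Inverted cond for unsigned compares. */
int cc_hiword   = (cc >> 8) & 15;  /* Hiword cond without equality check. */
uint32_t vcc    = cc & (VCC_U|VCC_P|VCC_S);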
@@ -3008,15 +3156,7 @@ static void asm_comp_(ASMState *as, IRIns *ir, int cc)
 	if (irl+1 == ir)  /* Referencing previous ins? */
 	  as->testmcp = as->mcp;  /* Set flag to drop test r,r if possible. */
       } else {
-	x86Op xo;
-	if (checki8(imm)) {
-	  emit_i8(as, imm);
-	  xo = XO_ARITHi8;
-	} else {
-	  emit_i32(as, imm);
-	  xo = XO_ARITHi;
-	}
-	emit_mrm(as, xo, r64 + XOg_CMP, left);
+	emit_gmrmi(as, XG_ARITHi(XOg_CMP), r64 + left, imm);
       }
     }
   } else {
@@ -3028,8 +3168,133 @@ static void asm_comp_(ASMState *as, IRIns *ir, int cc)
   }
 }
 
-#define asm_comp(as, ir, ci, cf, cu) \
-  asm_comp_(as, ir, (ci)+((cf)<<4)+(cu))
+#if LJ_32 && LJ_HASFFI
+/* 64 bit integer comparisons in 32 bit mode. */
+static void asm_comp_int64(ASMState *as, IRIns *ir)
+{
+  uint32_t cc = asm_compmap[(ir-1)->o];
+  RegSet allow = RSET_GPR;
+  Reg lefthi = RID_NONE, leftlo = RID_NONE;
+  Reg righthi = RID_NONE, rightlo = RID_NONE;
+  MCLabel l_around;
+  x86ModRM mrm;
+
+  as->curins--;  /* Skip loword ins. Avoids failing in noconflict(), too. */
+
+  /* Allocate/fuse hiword operands. */
+  if (irref_isk(ir->op2)) {
+    lefthi = asm_fuseload(as, ir->op1, allow);
+  } else {
+    lefthi = ra_alloc1(as, ir->op1, allow);
+    righthi = asm_fuseload(as, ir->op2, allow);
+    if (righthi == RID_MRM) {
+      if (as->mrm.base != RID_NONE) rset_clear(allow, as->mrm.base);
+      if (as->mrm.idx != RID_NONE) rset_clear(allow, as->mrm.idx);
+    } else {
+      rset_clear(allow, righthi);
+    }
+  }
+  mrm = as->mrm;  /* Save state for hiword instruction. */
+
+  /* Allocate/fuse loword operands. */
+  if (irref_isk((ir-1)->op2)) {
+    leftlo = asm_fuseload(as, (ir-1)->op1, allow);
+  } else {
+    leftlo = ra_alloc1(as, (ir-1)->op1, allow);
+    rightlo = asm_fuseload(as, (ir-1)->op2, allow);
+    if (rightlo == RID_MRM) {
+      if (as->mrm.base != RID_NONE) rset_clear(allow, as->mrm.base);
+      if (as->mrm.idx != RID_NONE) rset_clear(allow, as->mrm.idx);
+    } else {
+      rset_clear(allow, rightlo);
+    }
+  }
+
+  /* All register allocations must be performed _before_ this point. */
+  l_around = emit_label(as);
+  as->invmcp = as->testmcp = NULL;  /* Cannot use these optimizations. */
+
+  /* Loword comparison and branch. */
+  asm_guardcc(as, cc >> 4);  /* Always use unsigned compare for loword. */
+  if (ra_noreg(rightlo)) {
+    int32_t imm = IR((ir-1)->op2)->i;
+    if (imm == 0 && ((cc >> 4) & 0xa) != 0x2 && leftlo != RID_MRM)
+      emit_rr(as, XO_TEST, leftlo, leftlo);
+    else
+      emit_gmrmi(as, XG_ARITHi(XOg_CMP), leftlo, imm);
+  } else {
+    emit_mrm(as, XO_CMP, leftlo, rightlo);
+  }
+
+  /* Hiword comparison and branches. */
+  if ((cc & 15) != CC_NE)
+    emit_sjcc(as, CC_NE, l_around);  /* Hiword unequal: skip loword compare. */
+  if ((cc & 15) != CC_E)
+    asm_guardcc(as, cc >> 8);  /* Hiword compare without equality check. */
+  as->mrm = mrm;  /* Restore state. */
+  if (ra_noreg(righthi)) {
+    int32_t imm = IR(ir->op2)->i;
+    if (imm == 0 && (cc & 0xa) != 0x2 && lefthi != RID_MRM)
+      emit_rr(as, XO_TEST, lefthi, lefthi);
+    else
+      emit_gmrmi(as, XG_ARITHi(XOg_CMP), lefthi, imm);
+  } else {
+    emit_mrm(as, XO_CMP, lefthi, righthi);
+  }
+}
+#endif
+
+/* -- Support for 64 bit ops in 32 bit mode ------------------------------- */
+
+/* Hiword op of a split 64 bit op. Previous op must be the loword op. */
+static void asm_hiop(ASMState *as, IRIns *ir)
+{
+#if LJ_32 && LJ_HASFFI
+  /* HIOP is marked as a store because it needs its own DCE logic. */
+  int uselo = ra_used(ir-1), usehi = ra_used(ir);  /* Loword/hiword used? */
+  if (LJ_UNLIKELY(!(as->flags & JIT_F_OPT_DCE))) uselo = usehi = 1;
+  if ((ir-1)->o == IR_CONV) {  /* Conversions to/from 64 bit. */
+    if (usehi || uselo) {
+      if (irt_isfp(ir->t))
+	asm_conv_fp_int64(as, ir);
+      else
+	asm_conv_int64_fp(as, ir);
+    }
+    as->curins--;  /* Always skip the CONV. */
+    return;
+  } else if ((ir-1)->o <= IR_NE) {  /* 64 bit integer comparisons. ORDER IR. */
+    asm_comp_int64(as, ir);
+    return;
+  }
+  if (!usehi) return;  /* Skip unused hiword op for all remaining ops. */
+  switch ((ir-1)->o) {
+  case IR_ADD:
+    asm_intarith(as, ir, uselo ? XOg_ADC : XOg_ADD);
+    break;
+  case IR_SUB:
+    asm_intarith(as, ir, uselo ? XOg_SBB : XOg_SUB);
+    break;
+  case IR_NEG: {
+    Reg dest = ra_dest(as, ir, RSET_GPR);
+    emit_rr(as, XO_GROUP3, XOg_NEG, dest);
+    if (uselo) {
+      emit_i8(as, 0);
+      emit_rr(as, XO_ARITHi8, XOg_ADC, dest);
+    }
+    ra_left(as, dest, ir->op1);
+    break;
+  }
+  case IR_CALLN:
+    ra_destreg(as, ir, RID_RETHI);
+    if (!uselo)
+      ra_allocref(as, ir->op1, RID2RSET(RID_RET));  /* Mark call as used. */
+    break;
+  default: lua_assert(0); break;
+  }
+#else
+  UNUSED(as); UNUSED(ir); lua_assert(0);  /* Unused on x64 or without FFI. */
+#endif
+}
 
 /* -- Stack handling ------------------------------------------------------ */
 
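The control flow asm_comp_int64 emits runs the hiword compare first, guards
on the strict (non-equal) condition, branches around the loword compare when
the hiwords differ, and otherwise falls through to an unsigned loword
compare. The same logic as a standalone C sketch for a signed 64 bit
less-than (illustrative only, not part of the patch):

#include <stdint.h>

static int lt_i64(int32_t ahi, uint32_t alo, int32_t bhi, uint32_t blo)
{
  if (ahi != bhi)
    return ahi < bhi;  /* Signed hiword compare decides. */
  return alo < blo;    /* Hiwords equal: unsigned loword compare. */
}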
@@ -3682,21 +3947,16 @@ static void asm_ir(ASMState *as, IRIns *ir)
   switch ((IROp)ir->o) {
   /* Miscellaneous ops. */
   case IR_LOOP: asm_loop(as); break;
-  case IR_NOP: break;
+  case IR_NOP: lua_assert(!ra_used(ir)); break;
   case IR_PHI: asm_phi(as, ir); break;
+  case IR_HIOP: asm_hiop(as, ir); break;
 
   /* Guarded assertions. */
-  case IR_LT: asm_comp(as, ir, CC_GE, CC_AE, VCC_PS); break;
-  case IR_GE: asm_comp(as, ir, CC_L, CC_B, 0); break;
-  case IR_LE: asm_comp(as, ir, CC_G, CC_A, VCC_PS); break;
-  case IR_GT: asm_comp(as, ir, CC_LE, CC_BE, 0); break;
-  case IR_ULT: asm_comp(as, ir, CC_AE, CC_AE, VCC_U); break;
-  case IR_UGE: asm_comp(as, ir, CC_B, CC_B, VCC_U|VCC_PS); break;
-  case IR_ULE: asm_comp(as, ir, CC_A, CC_A, VCC_U); break;
-  case IR_ABC:
-  case IR_UGT: asm_comp(as, ir, CC_BE, CC_BE, VCC_U|VCC_PS); break;
-  case IR_EQ: asm_comp(as, ir, CC_NE, CC_NE, VCC_P); break;
-  case IR_NE: asm_comp(as, ir, CC_E, CC_E, VCC_U|VCC_P); break;
+  case IR_LT: case IR_GE: case IR_LE: case IR_GT:
+  case IR_ULT: case IR_UGE: case IR_ULE: case IR_UGT:
+  case IR_EQ: case IR_NE: case IR_ABC:
+    asm_comp(as, ir, asm_compmap[ir->o]);
+    break;
 
   case IR_RETF: asm_retf(as, ir); break;
@@ -3744,7 +4004,15 @@ static void asm_ir(ASMState *as, IRIns *ir)
   case IR_FPMATH: case IR_ATAN2: case IR_LDEXP:
     asm_fpmath(as, ir);
     break;
-  case IR_POWI: asm_powi(as, ir); break;
+  case IR_POWI:
+#if LJ_64 && LJ_HASFFI
+    if (!irt_isnum(ir->t))
+      asm_arith64(as, ir, irt_isi64(ir->t) ? IRCALL_lj_carith_powi64 :
+					     IRCALL_lj_carith_powu64);
+    else
+#endif
+      asm_powi(as, ir);
+    break;
 
   /* Overflow-checking arithmetic ops. Note: don't use LEA here! */
   case IR_ADDOV: asm_intarith(as, ir, XOg_ADD); break;
@@ -3801,6 +4069,7 @@ static void asm_trace(ASMState *as)
 {
   for (as->curins--; as->curins > as->stopins; as->curins--) {
     IRIns *ir = IR(as->curins);
+    lua_assert(!(LJ_32 && irt_isint64(ir->t)));  /* Handled by SPLIT. */
     if (!ra_used(ir) && !ir_sideeff(ir) && (as->flags & JIT_F_OPT_DCE))
       continue;  /* Dead-code elimination can be soooo easy. */
     if (irt_isguard(ir->t))
@@ -3864,11 +4133,10 @@ static void asm_setup_regsp(ASMState *as, GCtrace *T)
     case IR_CALLN: case IR_CALLL: case IR_CALLS: {
       const CCallInfo *ci = &lj_ir_callinfo[ir->op2];
 #if LJ_64
-      /* NYI: add stack slots for x64 calls with many args. */
       lua_assert(CCI_NARGS(ci) <= (LJ_ABI_WIN ? 4 : 6));
       ir->prev = REGSP_HINT(irt_isnum(ir->t) ? RID_FPRET : RID_RET);
 #else
-      /* NYI: not fastcall-aware, but doesn't matter (yet). */
+      lua_assert(!(ci->flags & CCI_FASTCALL) || CCI_NARGS(ci) <= 2);
       if (CCI_NARGS(ci) > (uint32_t)as->evenspill)  /* Leave room for args. */
 	as->evenspill = (int32_t)CCI_NARGS(ci);
       ir->prev = REGSP_HINT(RID_RET);
@@ -3878,6 +4146,12 @@ static void asm_setup_regsp(ASMState *as, GCtrace *T)
 	  (RSET_SCRATCH & ~RSET_FPR) : RSET_SCRATCH;
       continue;
     }
+#if LJ_32 && LJ_HASFFI
+    case IR_HIOP:
+      if ((ir-1)->o == IR_CALLN)
+	ir->prev = REGSP_HINT(RID_RETHI);
+      break;
+#endif
     /* C calls evict all scratch regs and return results in RID_RET. */
     case IR_SNEW: case IR_NEWREF:
 #if !LJ_64
@@ -3894,6 +4168,14 @@ static void asm_setup_regsp(ASMState *as, GCtrace *T)
       as->modset = RSET_SCRATCH;
       break;
     case IR_POWI:
+#if LJ_64 && LJ_HASFFI
+      if (!irt_isnum(ir->t)) {
+	ir->prev = REGSP_HINT(RID_RET);
+	if (inloop)
+	  as->modset |= (RSET_SCRATCH & RSET_GPR);
+	continue;
+      }
+#endif
       ir->prev = REGSP_HINT(RID_XMM0);
       if (inloop)
 	as->modset |= RSET_RANGE(RID_XMM0, RID_XMM1+1)|RID2RSET(RID_EAX);
diff --git a/src/lj_carith.c b/src/lj_carith.c
index 46f07be7..134a61fb 100644
--- a/src/lj_carith.c
+++ b/src/lj_carith.c
@@ -230,6 +230,14 @@ int lj_carith_op(lua_State *L, MMS mm)
 
 /* -- 64 bit integer arithmetic helpers ----------------------------------- */
 
+#if LJ_32
+/* Signed/unsigned 64 bit multiply. */
+int64_t lj_carith_mul64(int64_t a, int64_t b)
+{
+  return a * b;
+}
+#endif
+
 /* Unsigned 64 bit x^k. */
 uint64_t lj_carith_powu64(uint64_t x, uint64_t k)
 {
diff --git a/src/lj_carith.h b/src/lj_carith.h
index 6870172b..14073603 100644
--- a/src/lj_carith.h
+++ b/src/lj_carith.h
@@ -12,6 +12,9 @@
 
 LJ_FUNC int lj_carith_op(lua_State *L, MMS mm);
 
+#if LJ_32
+LJ_FUNC int64_t lj_carith_mul64(int64_t x, int64_t k);
+#endif
 LJ_FUNC uint64_t lj_carith_powu64(uint64_t x, uint64_t k);
 LJ_FUNC int64_t lj_carith_powi64(int64_t x, int64_t k);
 
diff --git a/src/lj_crecord.c b/src/lj_crecord.c
index 61210907..5eafa3a7 100644
--- a/src/lj_crecord.c
+++ b/src/lj_crecord.c
@@ -189,6 +189,7 @@ static void crec_ct_ct(jit_State *J, CType *d, CType *s, TRef dp, TRef sp,
       sp = emitconv(sp, dsize < 4 ? IRT_INT : dt, st, 0);
 #endif
   xstore:
+    if (dt == IRT_I64 || dt == IRT_U64) lj_needsplit(J);
     emitir(IRT(IR_XSTORE, dt), dp, sp);
     break;
   case CCX(I, C):
@@ -311,6 +312,7 @@ static TRef crec_tv_ct(jit_State *J, CType *s, CTypeID sid, TRef sp)
       TRef ptr = emitir(IRT(IR_ADD, IRT_PTR), dp,
 			lj_ir_kintp(J, sizeof(GCcdata)));
       emitir(IRT(IR_XSTORE, t), ptr, tr);
+      lj_needsplit(J);
       return dp;
     } else if ((sinfo & CTF_BOOL)) {
       /* Assume not equal to zero. Fixup and emit pending guard later. */
@@ -406,7 +408,10 @@ static void crec_ct_tv(jit_State *J, CType *d, TRef dp, TRef sp, TValue *sval)
     if (ctype_isenum(s->info)) s = ctype_child(cts, s);
     if (ctype_isnum(s->info)) {  /* Load number value. */
       IRType t = crec_ct2irt(s);
-      if (t != IRT_CDATA) sp = emitir(IRT(IR_XLOAD, t), sp, 0);
+      if (t != IRT_CDATA) {
+	sp = emitir(IRT(IR_XLOAD, t), sp, 0);
+	if (t == IRT_I64 || t == IRT_U64) lj_needsplit(J);
+      }
     }
     goto doconv;
   }
@@ -499,8 +504,10 @@ void LJ_FASTCALL recff_cdata_index(jit_State *J, RecordFFData *rd)
     if (ctype_isinteger(ctk->info) && (t = crec_ct2irt(ctk)) != IRT_CDATA) {
       idx = emitir(IRT(IR_ADD, IRT_PTR), idx, lj_ir_kintp(J, sizeof(GCcdata)));
       idx = emitir(IRT(IR_XLOAD, t), idx, 0);
-      if (!LJ_64 && (t == IRT_I64 || t == IRT_U64))
+      if (!LJ_64 && (t == IRT_I64 || t == IRT_U64)) {
 	idx = emitconv(idx, IRT_INT, t, 0);
+	lj_needsplit(J);
+      }
       goto integer_key;
     }
   } else if (tref_isstr(idx)) {
@@ -664,6 +671,7 @@ static TRef crec_arith_int64(jit_State *J, TRef *sp, CType **s, MMS mm)
   CTypeID id;
   TRef tr, dp, ptr;
   MSize i;
+  lj_needsplit(J);
   if (((s[0]->info & CTF_UNSIGNED) && s[0]->size == 8) ||
       ((s[1]->info & CTF_UNSIGNED) && s[1]->size == 8)) {
     dt = IRT_U64; id = CTID_UINT64;
@@ -691,9 +699,6 @@ static TRef crec_arith_int64(jit_State *J, TRef *sp, CType **s, MMS mm)
     lj_ir_set(J, IRTG(op, dt), sp[0], sp[1]);
     J->postproc = LJ_POST_FIXGUARD;
     return TREF_TRUE;
-  } else if (mm == MM_pow) {
-    tr = lj_ir_call(J, dt == IRT_I64 ? IRCALL_lj_carith_powi64 :
-				       IRCALL_lj_carith_powu64, sp[0], sp[1]);
   } else {
     if (mm == MM_div || mm == MM_mod)
       return 0;  /* NYI: integer div, mod. */
@@ -754,10 +759,11 @@ static TRef crec_arith_ptr(jit_State *J, TRef *sp, CType **s, MMS mm)
     tr = emitconv(tr, IRT_INTP, IRT_INT,
 		  ((t - IRT_I8) & 1) ? 0 : IRCONV_SEXT);
 #else
-    if (!tref_typerange(sp[1], IRT_I8, IRT_U32))
+    if (!tref_typerange(sp[1], IRT_I8, IRT_U32)) {
       tr = emitconv(tr, IRT_INTP, t,
 		    (t == IRT_NUM || t == IRT_FLOAT) ?
 		    IRCONV_TRUNC|IRCONV_ANY : 0);
+    }
 #endif
     tr = emitir(IRT(IR_MUL, IRT_INTP), tr, lj_ir_kintp(J, sz));
     tr = emitir(IRT(IR_ADD, IRT_PTR), sp[0], tr);
@@ -790,6 +796,7 @@ void LJ_FASTCALL recff_cdata_arith(jit_State *J, RecordFFData *rd)
   if (ctype_isnum(ct->info)) {
     IRType t = crec_ct2irt(ct);
     if (t == IRT_CDATA) goto err_type;
+    if (t == IRT_I64 || t == IRT_U64) lj_needsplit(J);
     tr = emitir(IRT(IR_XLOAD, t), tr, 0);
   } else if (!(ctype_isptr(ct->info) || ctype_isrefarray(ct->info))) {
     goto err_type;
@@ -842,6 +849,7 @@ void LJ_FASTCALL lj_crecord_tonumber(jit_State *J, RecordFFData *rd)
     IRType t = crec_ct2irt(s);
     if (t != IRT_CDATA) {
       TRef tr = emitir(IRT(IR_XLOAD, t), sp, 0);  /* Load number value. */
+      if (t == IRT_I64 || t == IRT_U64) lj_needsplit(J);
       if (t == IRT_FLOAT || t == IRT_U32 || t == IRT_I64 || t == IRT_U64)
 	tr = emitconv(tr, IRT_NUM, t, 0);
       J->base[0] = tr;
diff --git a/src/lj_ir.h b/src/lj_ir.h
index 1cb3566e..286eb219 100644
--- a/src/lj_ir.h
+++ b/src/lj_ir.h
@@ -33,6 +33,7 @@
   /* Miscellaneous ops. */ \
   _(NOP,	N , ___, ___) \
   _(BASE,	N , lit, lit) \
+  _(HIOP,	S , ref, ref) \
   _(LOOP,	S , ___, ___) \
   _(PHI,	S , ref, ref) \
   _(RENAME,	S , ref, lit) \
@@ -212,8 +213,9 @@ IRFLDEF(FLENUM)
 /* CONV mode, stored in op2. */
 #define IRCONV_SRCMASK	0x001f	/* Source IRType. */
 #define IRCONV_DSTMASK	0x03e0	/* Dest. IRType (also in ir->t). */
-#define IRCONV_NUM_INT	((IRT_NUM<<5)|IRT_INT)
-#define IRCONV_INT_NUM	((IRT_INT<<5)|IRT_NUM)
+#define IRCONV_DSH	5
+#define IRCONV_NUM_INT	((IRT_NUM<<IRCONV_DSH)|IRT_INT)
+#define IRCONV_INT_NUM	((IRT_INT<<IRCONV_DSH)|IRT_NUM)
 #define IRCONV_TRUNC	0x0400	/* Truncate number to integer. */
 #define IRCONV_SEXT	0x0800	/* Sign-extend integer to integer. */
 #define IRCONV_MODEMASK	0x0fff
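Naming the shift lets both halves of a CONV mode word be decoded
symmetrically; the new asm_conv_int64_fp in lj_asm.c does exactly this:

  IRType st = (IRType)(op2 & IRCONV_SRCMASK);
  IRType dt = (IRType)((op2 & IRCONV_DSTMASK) >> IRCONV_DSH);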
@@ -251,13 +253,21 @@ typedef struct CCallInfo {
 #define CCI_CASTU64	0x0200	/* Cast u64 result to number. */
 #define CCI_NOFPRCLOBBER 0x0400	/* Does not clobber any FPRs. */
 #define CCI_FASTCALL	0x0800	/* Fastcall convention. */
-#define CCI_STACK64	0x1000	/* Needs 64 bits per argument. */
 
 /* Function definitions for CALL* instructions. */
 #if LJ_HASFFI
+#if LJ_32
+#define ARG2_64		4	/* Treat as 4 32 bit arguments. */
+#define IRCALLDEF_FFI32(_) \
+  _(lj_carith_mul64,	ARG2_64, N, I64, CCI_NOFPRCLOBBER)
+#else
+#define ARG2_64		2
+#define IRCALLDEF_FFI32(_)
+#endif
 #define IRCALLDEF_FFI(_) \
-  _(lj_carith_powi64,	2, N, I64, CCI_STACK64|CCI_NOFPRCLOBBER) \
-  _(lj_carith_powu64,	2, N, U64, CCI_STACK64|CCI_NOFPRCLOBBER)
+  IRCALLDEF_FFI32(_) \
+  _(lj_carith_powi64,	ARG2_64, N, I64, CCI_NOFPRCLOBBER) \
+  _(lj_carith_powu64,	ARG2_64, N, U64, CCI_NOFPRCLOBBER)
 #else
 #define IRCALLDEF_FFI(_)
 #endif
@@ -402,6 +412,7 @@ typedef struct IRType1 { uint8_t irt; } IRType1;
 #define irt_isinteger(t)	(irt_typerange((t), IRT_I8, IRT_INT))
 #define irt_isgcv(t)		(irt_typerange((t), IRT_STR, IRT_UDATA))
 #define irt_isaddr(t)		(irt_typerange((t), IRT_LIGHTUD, IRT_UDATA))
+#define irt_isint64(t)		(irt_typerange((t), IRT_I64, IRT_U64))
 
 #if LJ_64
 #define IRT_IS64 \
diff --git a/src/lj_iropt.h b/src/lj_iropt.h
index 43c414c1..db99c118 100644
--- a/src/lj_iropt.h
+++ b/src/lj_iropt.h
@@ -141,6 +141,12 @@ LJ_FUNC IRType lj_opt_narrow_forl(cTValue *forbase);
 /* Optimization passes. */
 LJ_FUNC void lj_opt_dce(jit_State *J);
 LJ_FUNC int lj_opt_loop(jit_State *J);
+#if LJ_HASFFI && LJ_32
+LJ_FUNC void lj_opt_split(jit_State *J);
+#else
+#define lj_opt_split(J)	UNUSED(J)
+#endif
+
 #endif
 
 #endif
diff --git a/src/lj_jit.h b/src/lj_jit.h
index a8be1a97..38970fc7 100644
--- a/src/lj_jit.h
+++ b/src/lj_jit.h
@@ -240,6 +240,15 @@ enum {
 #define LJ_KSIMD(J, n) \
   ((TValue *)(((intptr_t)&J->ksimd[2*(n)] + 15) & ~(intptr_t)15))
 
+/* Set/reset flag to activate the SPLIT pass for the current trace. */
+#if LJ_32 && LJ_HASFFI
+#define lj_needsplit(J)		(J->needsplit = 1)
+#define lj_resetsplit(J)	(J->needsplit = 0)
+#else
+#define lj_needsplit(J)		UNUSED(J)
+#define lj_resetsplit(J)	UNUSED(J)
+#endif
+
 /* Fold state is used to fold instructions on-the-fly. */
 typedef struct FoldState {
   IRIns ins;		/* Currently emitted instruction. */
@@ -293,6 +302,9 @@ typedef struct jit_State {
   MSize sizesnapmap;	/* Size of temp. snapshot map buffer. */
 
   PostProc postproc;	/* Required post-processing after execution. */
+#if LJ_32 && LJ_HASFFI
+  int needsplit;	/* Need SPLIT pass. */
+#endif
 
   GCRef *trace;		/* Array of traces. */
   TraceNo freetrace;	/* Start of scan for next free trace. */
diff --git a/src/lj_opt_fold.c b/src/lj_opt_fold.c
index 2d08e187..03caf80d 100644
--- a/src/lj_opt_fold.c
+++ b/src/lj_opt_fold.c
@@ -538,6 +538,13 @@ LJFOLDF(kfold_conv_knum_int_num)
   }
 }
 
+LJFOLD(CONV KNUM IRCONV_U32_NUM)
+LJFOLDF(kfold_conv_knum_u32_num)
+{
+  lua_assert((fins->op2 & IRCONV_TRUNC));
+  return INTFOLD((int32_t)(uint32_t)knumleft);
+}
+
 LJFOLD(CONV KNUM IRCONV_I64_NUM)
 LJFOLDF(kfold_conv_knum_i64_num)
 {
@@ -805,6 +812,7 @@ LJFOLDF(simplify_conv_u32_num)
 }
 
 LJFOLD(CONV CONV IRCONV_I64_NUM)  /* _INT or _U32 */
+LJFOLD(CONV CONV IRCONV_U64_NUM)  /* _INT or _U32 */
 LJFOLDF(simplify_conv_i64_num)
 {
   PHIBARRIER(fleft);
@@ -826,23 +834,6 @@ LJFOLDF(simplify_conv_i64_num)
   return NEXTFOLD;
 }
 
-LJFOLD(CONV CONV IRCONV_U64_NUM)  /* _U32 */
-LJFOLDF(simplify_conv_u64_num)
-{
-  PHIBARRIER(fleft);
-  if ((fleft->op2 & IRCONV_SRCMASK) == IRT_U32) {
-#if LJ_TARGET_X64
-    return fleft->op1;
-#else
-    /* Reduce to a zero-extension. */
-    fins->op1 = fleft->op1;
-    fins->op2 = (IRT_U64<<5)|IRT_U32;
-    return RETRYFOLD;
-#endif
-  }
-  return NEXTFOLD;
-}
-
 /* Shortcut TOBIT + IRT_NUM <- IRT_INT/IRT_U32 conversion. */
 LJFOLD(TOBIT CONV KNUM)
 LJFOLDF(simplify_tobit_conv)
diff --git a/src/lj_opt_split.c b/src/lj_opt_split.c
new file mode 100644
index 00000000..3cb30514
--- /dev/null
+++ b/src/lj_opt_split.c
@@ -0,0 +1,343 @@
+/*
+** SPLIT: Split 64 bit IR instructions into 32 bit IR instructions.
+** Copyright (C) 2005-2011 Mike Pall. See Copyright Notice in luajit.h
+*/
+
+#define lj_opt_split_c
+#define LUA_CORE
+
+#include "lj_obj.h"
+
+#if LJ_HASJIT && LJ_HASFFI && LJ_32
+
+#include "lj_err.h"
+#include "lj_str.h"
+#include "lj_ir.h"
+#include "lj_jit.h"
+#include "lj_iropt.h"
+#include "lj_vm.h"
+
+/* SPLIT pass:
+**
+** This pass splits up 64 bit IR instructions into multiple 32 bit IR
+** instructions. It's only active for 32 bit CPUs which lack native 64 bit
+** operations. The FFI is currently the only emitter for 64 bit
+** instructions, so this pass is disabled if the FFI is disabled.
+**
+** Splitting the IR in a separate pass keeps each 32 bit IR assembler
+** backend simple. Only a small amount of extra functionality needs to be
+** implemented. This is much easier than adding support for allocating
+** register pairs to each backend (believe me, I tried). A few simple, but
+** important optimizations can be performed by the SPLIT pass, which would
+** be tedious to do in the backend.
+**
+** The basic idea is to replace each 64 bit IR instruction with its 32 bit
+** equivalent plus an extra HIOP instruction. The split IR is not passed
+** through FOLD or any other optimizations, so each HIOP is guaranteed to
+** immediately follow its counterpart. The actual functionality of HIOP is
+** inferred from the previous instruction.
+**
+** The operands of HIOP hold the hiword input references. The output of HIOP
+** is the hiword output reference, which is also used to hold the hiword
+** register or spill slot information. The register allocator treats this
+** instruction independently of any other instruction, which improves code
+** quality compared to using fixed register pairs.
+**
+** It's easier to split up some instructions into two regular 32 bit
+** instructions. E.g. XLOAD is split up into two XLOADs with two different
+** addresses. Obviously 64 bit constants need to be split up into two 32 bit
+** constants, too. Some hiword instructions can be entirely omitted, e.g.
+** when zero-extending a 32 bit value to 64 bits.
+**
+** Here's the IR and x64 machine code for 'x.b = x.a + 1' for a struct with
+** two int64_t fields:
+**
+** 0100    p32 ADD    base  +8
+** 0101    i64 XLOAD  0100
+** 0102    i64 ADD    0101  +1
+** 0103    p32 ADD    base  +16
+** 0104    i64 XSTORE 0103  0102
+**
+** mov rax, [esi+0x8]
+** add rax, +0x01
+** mov [esi+0x10], rax
+**
+** Here's the transformed IR and the x86 machine code after the SPLIT pass:
+**
+** 0100    p32 ADD    base  +8
+** 0101    int XLOAD  0100
+** 0102    p32 ADD    base  +12
+** 0103    int XLOAD  0102
+** 0104    int ADD    0101  +1
+** 0105    int HIOP   0103  +0
+** 0106    p32 ADD    base  +16
+** 0107    int XSTORE 0106  0104
+** 0108    p32 ADD    base  +20
+** 0109    int XSTORE 0108  0105
+**
+** mov eax, [esi+0x8]
+** mov ecx, [esi+0xc]
+** add eax, +0x01
+** adc ecx, +0x00
+** mov [esi+0x10], eax
+** mov [esi+0x14], ecx
+**
+** You may notice the reassociated hiword address computation, which is
+** later fused into the mov operands by the assembler.
+*/
+
+/* Some local macros to save typing. Undef'd at the end. */
+#define IR(ref)		(&J->cur.ir[(ref)])
+
+/* Directly emit the transformed IR without updating chains etc. */
+static IRRef split_emit(jit_State *J, uint16_t ot, IRRef1 op1, IRRef1 op2)
+{
+  IRRef nref = lj_ir_nextins(J);
+  IRIns *ir = IR(nref);
+  ir->ot = ot;
+  ir->op1 = op1;
+  ir->op2 = op2;
+  return nref;
+}
+
+/* Emit a CALLN with two split 64 bit arguments. */
+static IRRef split_call64(jit_State *J, IRRef1 *hisubst, IRIns *oir,
+			  IRIns *ir, IRCallID id)
+{
+  IRRef tmp, op1 = ir->op1, op2 = ir->op2;
+  J->cur.nins--;
+#if LJ_LE
+  tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), oir[op1].prev, hisubst[op1]);
+  tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), tmp, oir[op2].prev);
+  tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), tmp, hisubst[op2]);
+#else
+  tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), hisubst[op1], oir[op1].prev);
+  tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), tmp, hisubst[op2]);
+  tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), tmp, oir[op2].prev);
+#endif
+  ir->prev = tmp = split_emit(J, IRTI(IR_CALLN), tmp, id);
+  return split_emit(J, IRTI(IR_HIOP), tmp, tmp);
+}
+
+/* Get a pointer to the other 32 bit word (LE: hiword, BE: loword). */
+static IRRef split_ptr(jit_State *J, IRRef ref)
+{
+  IRIns *ir = IR(ref);
+  int32_t ofs = 4;
+  if (ir->o == IR_ADD && irref_isk(ir->op2)) {  /* Reassociate address. */
+    ofs += IR(ir->op2)->i;
+    ref = ir->op1;
+    if (ofs == 0) return ref;
+  }
+  return split_emit(J, IRTI(IR_ADD), ref, lj_ir_kint(J, ofs));
+}
+
+/* Transform the old IR to the new IR. */
+static void split_ir(jit_State *J)
+{
+  IRRef nins = J->cur.nins, nk = J->cur.nk;
+  MSize irlen = nins - nk;
+  MSize need = (irlen+1)*(sizeof(IRIns) + sizeof(IRRef1));
+  IRIns *oir = (IRIns *)lj_str_needbuf(J->L, &G(J->L)->tmpbuf, need);
+  IRRef1 *hisubst;
+  IRRef ref;
+
+  /* Copy old IR to buffer. */
+  memcpy(oir, IR(nk), irlen*sizeof(IRIns));
+  /* Bias hiword substitution table and old IR. Loword kept in field prev. */
+  hisubst = (IRRef1 *)&oir[irlen] - nk;
+  oir -= nk;
+
+  /* Remove all IR instructions, but retain IR constants. */
+  J->cur.nins = REF_FIRST;
+
+  /* Process constants and fixed references. */
+  for (ref = nk; ref <= REF_BASE; ref++) {
+    IRIns *ir = &oir[ref];
+    if (ir->o == IR_KINT64) {  /* Split up 64 bit constant. */
+      TValue tv = *ir_k64(ir);
+      ir->prev = lj_ir_kint(J, (int32_t)tv.u32.lo);
+      hisubst[ref] = lj_ir_kint(J, (int32_t)tv.u32.hi);
+    } else {
+      ir->prev = (IRRef1)ref;  /* Identity substitution for loword. */
+    }
+  }
+
+  /* Process old IR instructions. */
+  for (ref = REF_FIRST; ref < nins; ref++) {
+    IRIns *ir = &oir[ref];
+    IRRef nref = lj_ir_nextins(J);
+    IRIns *nir = IR(nref);
+
+    /* Copy-substitute old instruction to new instruction. */
+    nir->op1 = ir->op1 < nk ? ir->op1 : oir[ir->op1].prev;
+    nir->op2 = ir->op2 < nk ? ir->op2 : oir[ir->op2].prev;
+    ir->prev = nref;  /* Loword substitution. */
+    nir->o = ir->o;
+    nir->t.irt = ir->t.irt & ~(IRT_MARK|IRT_ISPHI);
+
+    /* Split 64 bit instructions. */
+    if (irt_isint64(ir->t)) {
+      IRRef hi = hisubst[ir->op1];
+      nir->t.irt = IRT_INT | (nir->t.irt & IRT_GUARD);  /* Turn into INT op. */
+      switch (ir->o) {
+      case IR_ADD:
+      case IR_SUB:
+	/* Use plain op for hiword if loword cannot produce a carry/borrow. */
+	if (irref_isk(nir->op2) && IR(nir->op2)->i == 0) {
+	  ir->prev = nir->op1;  /* Pass through loword. */
+	  nir->op1 = hi; nir->op2 = hisubst[ir->op2];
+	  hi = nref;
+	  break;
+	}
+	/* fallthrough */
+      case IR_NEG:
+	hi = split_emit(J, IRTI(IR_HIOP), hi, hisubst[ir->op2]);
+	break;
+      case IR_MUL:
+	hi = split_call64(J, hisubst, oir, ir, IRCALL_lj_carith_mul64);
+	break;
+      case IR_POWI:
+	hi = split_call64(J, hisubst, oir, ir,
+			  irt_isi64(ir->t) ? IRCALL_lj_carith_powi64 :
+					     IRCALL_lj_carith_powu64);
+	break;
+      case IR_XLOAD:
+	hi = split_emit(J, IRTI(IR_XLOAD), split_ptr(J, nir->op1), ir->op2);
+#if LJ_BE
+	ir->prev = hi; hi = nref;
+#endif
+	break;
+      case IR_XSTORE:
+#if LJ_LE
+	hi = hisubst[ir->op2];
+#else
+	hi = nir->op2; nir->op2 = hisubst[ir->op2];
+#endif
+	split_emit(J, IRTI(IR_XSTORE), split_ptr(J, nir->op1), hi);
+	continue;
+      case IR_CONV: {  /* Conversion to 64 bit integer. Others handled below. */
+	IRType st = (IRType)(ir->op2 & IRCONV_SRCMASK);
+	if (st == IRT_NUM || st == IRT_FLOAT) {  /* FP to 64 bit int conv. */
+	  hi = split_emit(J, IRTI(IR_HIOP), nir->op1, nref);
+	} else if (st == IRT_I64 || st == IRT_U64) {  /* 64/64 bit cast. */
+	  /* Drop cast, since assembler doesn't care. */
+	  hisubst[ref] = hi;
+	  goto fwdlo;
+	} else if ((ir->op2 & IRCONV_SEXT)) {  /* Sign-extend to 64 bit. */
+	  IRRef k31 = lj_ir_kint(J, 31);
+	  nir = IR(nref);  /* May have been reallocated. */
+	  ir->prev = nir->op1;  /* Pass through loword. */
+	  nir->o = IR_BSAR;  /* hi = bsar(lo, 31). */
+	  nir->op2 = k31;
+	  hi = nref;
+	} else {  /* Zero-extend to 64 bit. */
+	  hisubst[ref] = lj_ir_kint(J, 0);
+	  goto fwdlo;
+	}
+	break;
+	}
+      case IR_PHI: {
+	IRRef hi2;
+	if ((irref_isk(nir->op1) && irref_isk(nir->op2)) ||
+	    nir->op1 == nir->op2)
+	  J->cur.nins--;  /* Drop useless PHIs. */
+	hi2 = hisubst[ir->op2];
+	if (!((irref_isk(hi) && irref_isk(hi2)) || hi == hi2))
+	  split_emit(J, IRTI(IR_PHI), hi, hi2);
+	continue;
+	}
+      default:
+	lua_assert(ir->o <= IR_NE);
+	split_emit(J, IRTGI(IR_HIOP), hi, hisubst[ir->op2]);  /* Comparisons. */
+	continue;
+      }
+      hisubst[ref] = hi;  /* Store hiword substitution. */
+    } else if (ir->o == IR_CONV) {  /* See above, too. */
+      IRType st = (IRType)(ir->op2 & IRCONV_SRCMASK);
+      if (st == IRT_I64 || st == IRT_U64) {  /* Conversion from 64 bit int. */
+	if (irt_isfp(ir->t)) {  /* 64 bit integer to FP conversion. */
+	  ir->prev = split_emit(J, IRT(IR_HIOP, irt_type(ir->t)),
+				hisubst[ir->op1], nref);
+	} else {  /* Truncate to lower 32 bits. */
+	fwdlo:
+	  ir->prev = nir->op1;  /* Forward loword. */
+	  /* Replace with NOP to avoid messing up the snapshot logic. */
+	  nir->ot = IRT(IR_NOP, IRT_NIL);
+	  nir->op1 = nir->op2 = 0;
+	}
+      }
+    } else if (ir->o == IR_LOOP) {
+      J->loopref = nref;  /* Needed by assembler. */
+    }
+  }
+
+  /* Add PHI marks. */
+  for (ref = J->cur.nins-1; ref >= REF_FIRST; ref--) {
+    IRIns *ir = IR(ref);
+    if (ir->o != IR_PHI) break;
+    if (!irref_isk(ir->op1)) irt_setphi(IR(ir->op1)->t);
+    if (ir->op2 > J->loopref) irt_setphi(IR(ir->op2)->t);
+  }
+
+  /* Substitute snapshot maps. */
+  oir[nins].prev = J->cur.nins;  /* Substitution for last snapshot. */
+  {
+    SnapNo i, nsnap = J->cur.nsnap;
+    for (i = 0; i < nsnap; i++) {
+      SnapShot *snap = &J->cur.snap[i];
+      SnapEntry *map = &J->cur.snapmap[snap->mapofs];
+      MSize n, nent = snap->nent;
+      snap->ref = oir[snap->ref].prev;
+      for (n = 0; n < nent; n++) {
+	SnapEntry sn = map[n];
+	map[n] = ((sn & 0xffff0000) | oir[snap_ref(sn)].prev);
+      }
+    }
+  }
+}
+
+/* Protected callback for split pass. */
+static TValue *cpsplit(lua_State *L, lua_CFunction dummy, void *ud)
+{
+  jit_State *J = (jit_State *)ud;
+  split_ir(J);
+  UNUSED(L); UNUSED(dummy);
+  return NULL;
+}
+
+#ifdef LUA_USE_ASSERT
+/* Slow, but sure way to check whether a SPLIT pass is needed. */
+static int split_needsplit(jit_State *J)
+{
+  IRIns *ir, *irend;
+  IRRef ref;
+  for (ir = IR(REF_FIRST), irend = IR(J->cur.nins); ir < irend; ir++)
+    if (irt_isint64(ir->t))
+      return 1;
+  for (ref = J->chain[IR_CONV]; ref; ref = IR(ref)->prev)
+    if ((IR(ref)->op2 & IRCONV_SRCMASK) == IRT_I64 ||
+	(IR(ref)->op2 & IRCONV_SRCMASK) == IRT_U64)
+      return 1;
+  return 0;  /* Nope. */
+}
+#endif
+
+/* SPLIT pass. */
+void lj_opt_split(jit_State *J)
+{
+  lua_assert(J->needsplit >= split_needsplit(J));  /* Verify flag. */
+  if (J->needsplit) {
+    int errcode = lj_vm_cpcall(J->L, NULL, J, cpsplit);
+    if (errcode) {
+      /* Completely reset the trace to avoid inconsistent dump on abort. */
+      J->cur.nins = J->cur.nk = REF_BASE;
+      J->cur.nsnap = 0;
+      lj_err_throw(J->L, errcode);  /* Propagate errors. */
+    }
+  }
+}
+
+#undef IR
+
+#endif
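Together with the ADD/SUB cases in asm_hiop, this pass reproduces the
classic add/adc pattern shown in the header comment above. The same
arithmetic as a standalone C sketch using only 32 bit operations
(illustrative only, not part of the patch):

#include <stdint.h>

static void add64_sketch(uint32_t *lo, uint32_t *hi,
                         uint32_t blo, uint32_t bhi)
{
  uint32_t oldlo = *lo;
  *lo += blo;                  /* add lo, blo */
  *hi += bhi + (*lo < oldlo);  /* adc hi, bhi: carry from the loword add. */
}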
diff --git a/src/lj_target_x86.h b/src/lj_target_x86.h
index 94ab3c32..37c68f4b 100644
--- a/src/lj_target_x86.h
+++ b/src/lj_target_x86.h
@@ -193,6 +193,7 @@ typedef enum {
   XI_FLD1 =	0xe8d9,
   XI_FLDLG2 =	0xecd9,
   XI_FLDLN2 =	0xedd9,
+  XI_FDUP =	0xc0d9,		/* Really fld st0. */
   XI_FPOP =	0xd8dd,		/* Really fstp st0. */
   XI_FPOP1 =	0xd9dd,		/* Really fstp st1. */
   XI_FRNDINT =	0xfcd9,
@@ -263,10 +264,17 @@ typedef enum {
   XO_MOVD =	XO_660f(6e),
   XO_MOVDto =	XO_660f(7e),
 
+  XO_FLDd =	XO_(d9), XOg_FLDd = 0,
   XO_FLDq =	XO_(dd), XOg_FLDq = 0,
   XO_FILDd =	XO_(db), XOg_FILDd = 0,
+  XO_FILDq =	XO_(df), XOg_FILDq = 5,
+  XO_FSTPd =	XO_(d9), XOg_FSTPd = 3,
   XO_FSTPq =	XO_(dd), XOg_FSTPq = 3,
   XO_FISTPq =	XO_(df), XOg_FISTPq = 7,
+  XO_FISTTPq =	XO_(dd), XOg_FISTTPq = 1,
+  XO_FADDq =	XO_(dc), XOg_FADDq = 0,
+  XO_FLDCW =	XO_(d9), XOg_FLDCW = 5,
+  XO_FNSTCW =	XO_(d9), XOg_FNSTCW = 7
 } x86Op;
 
@@ -278,6 +286,7 @@ typedef uint32_t x86Group;
 #define XG_TOXOi8(xg)	((x86Op)(0x000000fe + (((xg)<<8) & 0xff000000)))
 
 #define XO_ARITH(a)	((x86Op)(0x030000fe + ((a)<<27)))
+#define XO_ARITHw(a)	((x86Op)(0x036600fd + ((a)<<27)))
 
 typedef enum {
   XOg_ADD, XOg_OR, XOg_ADC, XOg_SBB, XOg_AND, XOg_SUB, XOg_XOR, XOg_CMP,
diff --git a/src/lj_trace.c b/src/lj_trace.c
index da20f991..b67e8f75 100644
--- a/src/lj_trace.c
+++ b/src/lj_trace.c
@@ -394,6 +394,7 @@ static void trace_start(jit_State *J)
   J->bcskip = 0;
   J->guardemit.irt = 0;
   J->postproc = LJ_POST_NONE;
+  lj_resetsplit(J);
   setgcref(J->cur.startpt, obj2gco(J->pt));
 
   L = J->L;
@@ -592,6 +593,7 @@ static TValue *trace_state(lua_State *L, lua_CFunction dummy, void *ud)
     }
     J->loopref = J->chain[IR_LOOP];  /* Needed by assembler. */
   }
+  lj_opt_split(J);
   J->state = LJ_TRACE_ASM;
   break;
 
diff --git a/src/ljamalg.c b/src/ljamalg.c
index 4d5f7600..5d90c002 100644
--- a/src/ljamalg.c
+++ b/src/ljamalg.c
@@ -58,6 +58,7 @@
 #include "lj_opt_narrow.c"
 #include "lj_opt_dce.c"
 #include "lj_opt_loop.c"
+#include "lj_opt_split.c"
 #include "lj_mcode.c"
 #include "lj_snap.c"
 #include "lj_record.c"